1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Copyright (C) 2011, 2012 STRATO. All rights reserved.
4 */
5
6#include <linux/blkdev.h>
7#include <linux/ratelimit.h>
8#include <linux/sched/mm.h>
9#include <crypto/hash.h>
10#include "ctree.h"
11#include "discard.h"
12#include "volumes.h"
13#include "disk-io.h"
14#include "ordered-data.h"
15#include "transaction.h"
16#include "backref.h"
17#include "extent_io.h"
18#include "dev-replace.h"
19#include "check-integrity.h"
20#include "rcu-string.h"
21#include "raid56.h"
22#include "block-group.h"
23#include "zoned.h"
24
25/*
26 * This is only the first step towards a full-featured scrub. It reads all
27 * extents and super blocks and verifies the checksums. In case a bad checksum
28 * is found or the extent cannot be read, good data will be written back if
29 * any can be found.
30 *
31 * Future enhancements:
32 * - In case an unrepairable extent is encountered, track which files are
33 * affected and report them
34 * - track and record media errors, throw out bad devices
35 * - add a mode to also read unallocated space
36 */
37
38struct scrub_block;
39struct scrub_ctx;
40
41/*
42 * The following two values only influence the performance.
43 *
44 * The first one configures an upper limit for the number of (dynamically
45 * allocated) sectors that are added to a bio. The second one configures the
46 * number of parallel and outstanding I/O operations per device.
47 */
48#define SCRUB_SECTORS_PER_BIO 32 /* 128KiB per bio for 4KiB pages */
49#define SCRUB_BIOS_PER_SCTX 64 /* 8MiB per device in flight for 4KiB pages */
50
51/*
52 * The following value times 4KiB needs to be large enough to match the
53 * largest node/leaf/sector size that shall be supported.
54 */
55#define SCRUB_MAX_SECTORS_PER_BLOCK (BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K)
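/*
 * A rough worked example: with BTRFS_MAX_METADATA_BLOCKSIZE currently at
 * 64KiB, this evaluates to 64KiB / 4KiB = 16 sectors at most per scrub_block.
 */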
56
57struct scrub_recover {
58 refcount_t refs;
59 struct btrfs_io_context *bioc;
60 u64 map_length;
61};
62
63struct scrub_sector {
64 struct scrub_block *sblock;
65 struct page *page;
66 struct btrfs_device *dev;
67 struct list_head list;
68 u64 flags; /* extent flags */
69 u64 generation;
70 u64 logical;
71 u64 physical;
72 u64 physical_for_dev_replace;
73 atomic_t refs;
74 u8 mirror_num;
75 unsigned int have_csum:1;
76 unsigned int io_error:1;
77 u8 csum[BTRFS_CSUM_SIZE];
78
79 struct scrub_recover *recover;
80};
81
82struct scrub_bio {
83 int index;
84 struct scrub_ctx *sctx;
85 struct btrfs_device *dev;
86 struct bio *bio;
87 blk_status_t status;
88 u64 logical;
89 u64 physical;
90 struct scrub_sector *sectors[SCRUB_SECTORS_PER_BIO];
91 int sector_count;
92 int next_free;
93 struct work_struct work;
94};
95
96struct scrub_block {
97 struct scrub_sector *sectors[SCRUB_MAX_SECTORS_PER_BLOCK];
98 int sector_count;
99 atomic_t outstanding_sectors;
100 refcount_t refs; /* free mem on transition to zero */
101 struct scrub_ctx *sctx;
102 struct scrub_parity *sparity;
103 struct {
104 unsigned int header_error:1;
105 unsigned int checksum_error:1;
106 unsigned int no_io_error_seen:1;
107 unsigned int generation_error:1; /* also sets header_error */
108
109 /* The following is for the data used to check parity */
110 /* It is for the data with checksum */
111 unsigned int data_corrected:1;
112 };
113 struct work_struct work;
114};
115
116/* Used for the chunks with parity stripe such as RAID5/6 */
117struct scrub_parity {
118 struct scrub_ctx *sctx;
119
120 struct btrfs_device *scrub_dev;
121
122 u64 logic_start;
123
124 u64 logic_end;
125
126 int nsectors;
127
128 u32 stripe_len;
129
130 refcount_t refs;
131
132 struct list_head sectors_list;
133
134 /* Work of parity check and repair */
135 struct work_struct work;
136
137 /* Mark the parity blocks which have data */
138 unsigned long dbitmap;
139
140 /*
141 * Mark the parity blocks which have data, but errors happened when
142 * reading or checking that data
143 */
144 unsigned long ebitmap;
145};
146
147struct scrub_ctx {
148 struct scrub_bio *bios[SCRUB_BIOS_PER_SCTX];
149 struct btrfs_fs_info *fs_info;
150 int first_free;
151 int curr;
152 atomic_t bios_in_flight;
153 atomic_t workers_pending;
154 spinlock_t list_lock;
155 wait_queue_head_t list_wait;
156 struct list_head csum_list;
157 atomic_t cancel_req;
158 int readonly;
159 int sectors_per_bio;
160
161 /* State of IO submission throttling affecting the associated device */
162 ktime_t throttle_deadline;
163 u64 throttle_sent;
164
165 int is_dev_replace;
166 u64 write_pointer;
167
168 struct scrub_bio *wr_curr_bio;
169 struct mutex wr_lock;
170 struct btrfs_device *wr_tgtdev;
171 bool flush_all_writes;
172
173 /*
174 * statistics
175 */
176 struct btrfs_scrub_progress stat;
177 spinlock_t stat_lock;
178
179 /*
180 * Use a ref counter to avoid use-after-free issues. Scrub workers
181 * decrement bios_in_flight and workers_pending and then do a wakeup
182 * on the list_wait wait queue. We must ensure the main scrub task
183 * doesn't free the scrub context before or while the workers are
184 * doing the wakeup() call.
185 */
186 refcount_t refs;
187};
188
189struct scrub_warning {
190 struct btrfs_path *path;
191 u64 extent_item_size;
192 const char *errstr;
193 u64 physical;
194 u64 logical;
195 struct btrfs_device *dev;
196};
197
198struct full_stripe_lock {
199 struct rb_node node;
200 u64 logical;
201 u64 refs;
202 struct mutex mutex;
203};
204
205static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
206 struct scrub_block *sblocks_for_recheck);
207static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
208 struct scrub_block *sblock,
209 int retry_failed_mirror);
210static void scrub_recheck_block_checksum(struct scrub_block *sblock);
211static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
212 struct scrub_block *sblock_good);
213static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad,
214 struct scrub_block *sblock_good,
215 int sector_num, int force_write);
216static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
217static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock,
218 int sector_num);
219static int scrub_checksum_data(struct scrub_block *sblock);
220static int scrub_checksum_tree_block(struct scrub_block *sblock);
221static int scrub_checksum_super(struct scrub_block *sblock);
222static void scrub_block_put(struct scrub_block *sblock);
223static void scrub_sector_get(struct scrub_sector *sector);
224static void scrub_sector_put(struct scrub_sector *sector);
225static void scrub_parity_get(struct scrub_parity *sparity);
226static void scrub_parity_put(struct scrub_parity *sparity);
227static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len,
228 u64 physical, struct btrfs_device *dev, u64 flags,
229 u64 gen, int mirror_num, u8 *csum,
230 u64 physical_for_dev_replace);
231static void scrub_bio_end_io(struct bio *bio);
232static void scrub_bio_end_io_worker(struct work_struct *work);
233static void scrub_block_complete(struct scrub_block *sblock);
234static void scrub_find_good_copy(struct btrfs_fs_info *fs_info,
235 u64 extent_logical, u32 extent_len,
236 u64 *extent_physical,
237 struct btrfs_device **extent_dev,
238 int *extent_mirror_num);
239static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx,
240 struct scrub_sector *sector);
241static void scrub_wr_submit(struct scrub_ctx *sctx);
242static void scrub_wr_bio_end_io(struct bio *bio);
243static void scrub_wr_bio_end_io_worker(struct work_struct *work);
244static void scrub_put_ctx(struct scrub_ctx *sctx);
245
246static inline int scrub_is_page_on_raid56(struct scrub_sector *sector)
247{
248 return sector->recover &&
249 (sector->recover->bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
250}
251
252static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
253{
254 refcount_inc(&sctx->refs);
255 atomic_inc(&sctx->bios_in_flight);
256}
257
258static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
259{
260 atomic_dec(&sctx->bios_in_flight);
261 wake_up(&sctx->list_wait);
262 scrub_put_ctx(sctx);
263}
264
265static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
266{
267 while (atomic_read(&fs_info->scrub_pause_req)) {
268 mutex_unlock(&fs_info->scrub_lock);
269 wait_event(fs_info->scrub_pause_wait,
270 atomic_read(&fs_info->scrub_pause_req) == 0);
271 mutex_lock(&fs_info->scrub_lock);
272 }
273}
274
275static void scrub_pause_on(struct btrfs_fs_info *fs_info)
276{
277 atomic_inc(&fs_info->scrubs_paused);
278 wake_up(&fs_info->scrub_pause_wait);
279}
280
281static void scrub_pause_off(struct btrfs_fs_info *fs_info)
282{
283 mutex_lock(&fs_info->scrub_lock);
284 __scrub_blocked_if_needed(fs_info);
285 atomic_dec(&fs_info->scrubs_paused);
286 mutex_unlock(&fs_info->scrub_lock);
287
288 wake_up(&fs_info->scrub_pause_wait);
289}
290
291static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
292{
293 scrub_pause_on(fs_info);
294 scrub_pause_off(fs_info);
295}
296
297/*
298 * Insert new full stripe lock into full stripe locks tree
299 *
300 * Return pointer to existing or newly inserted full_stripe_lock structure if
301 * everything works well.
302 * Return ERR_PTR(-ENOMEM) if we failed to allocate memory
303 *
304 * NOTE: caller must hold full_stripe_locks_root->lock before calling this
305 * function
306 */
307static struct full_stripe_lock *insert_full_stripe_lock(
308 struct btrfs_full_stripe_locks_tree *locks_root,
309 u64 fstripe_logical)
310{
311 struct rb_node **p;
312 struct rb_node *parent = NULL;
313 struct full_stripe_lock *entry;
314 struct full_stripe_lock *ret;
315
316 lockdep_assert_held(&locks_root->lock);
317
318 p = &locks_root->root.rb_node;
319 while (*p) {
320 parent = *p;
321 entry = rb_entry(parent, struct full_stripe_lock, node);
322 if (fstripe_logical < entry->logical) {
323 p = &(*p)->rb_left;
324 } else if (fstripe_logical > entry->logical) {
325 p = &(*p)->rb_right;
326 } else {
327 entry->refs++;
328 return entry;
329 }
330 }
331
332 /*
333 * Insert new lock.
334 */
335 ret = kmalloc(sizeof(*ret), GFP_KERNEL);
336 if (!ret)
337 return ERR_PTR(-ENOMEM);
338 ret->logical = fstripe_logical;
339 ret->refs = 1;
340 mutex_init(&ret->mutex);
341
342 rb_link_node(&ret->node, parent, p);
343 rb_insert_color(&ret->node, &locks_root->root);
344 return ret;
345}
346
347/*
348 * Search for a full stripe lock of a block group
349 *
350 * Return pointer to existing full stripe lock if found
351 * Return NULL if not found
352 */
353static struct full_stripe_lock *search_full_stripe_lock(
354 struct btrfs_full_stripe_locks_tree *locks_root,
355 u64 fstripe_logical)
356{
357 struct rb_node *node;
358 struct full_stripe_lock *entry;
359
360 lockdep_assert_held(&locks_root->lock);
361
362 node = locks_root->root.rb_node;
363 while (node) {
364 entry = rb_entry(node, struct full_stripe_lock, node);
365 if (fstripe_logical < entry->logical)
366 node = node->rb_left;
367 else if (fstripe_logical > entry->logical)
368 node = node->rb_right;
369 else
370 return entry;
371 }
372 return NULL;
373}
374
375/*
376 * Helper to get full stripe logical from a normal bytenr.
377 *
378 * Caller must ensure @cache is a RAID56 block group.
379 */
380static u64 get_full_stripe_logical(struct btrfs_block_group *cache, u64 bytenr)
381{
382 u64 ret;
383
384 /*
385 * Due to chunk item size limit, full stripe length should not be
386 * larger than U32_MAX. Just a sanity check here.
387 */
388 WARN_ON_ONCE(cache->full_stripe_len >= U32_MAX);
389
390 /*
391 * round_down() can only handle power of 2, while RAID56 full
392 * stripe length can be 64KiB * n, so we need to manually round down.
393 */
394 ret = div64_u64(bytenr - cache->start, cache->full_stripe_len) *
395 cache->full_stripe_len + cache->start;
396 return ret;
397}
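/*
 * A worked example with hypothetical numbers: for a block group starting at
 * 1GiB with a full stripe length of 128KiB (e.g. two 64KiB data stripes on a
 * 3-device RAID5), a bytenr of 1GiB + 300KiB yields
 * div64_u64(300KiB, 128KiB) = 2, so the returned full stripe start is
 * 1GiB + 2 * 128KiB = 1GiB + 256KiB.
 */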
398
399/*
400 * Lock a full stripe to avoid concurrency of recovery and read
401 *
402 * It's only used for profiles with parities (RAID5/6), for other profiles it
403 * does nothing.
404 *
405 * Return 0 if we locked full stripe covering @bytenr, with a mutex held.
406 * The caller must then call unlock_full_stripe() in the same context.
407 *
408 * Return <0 on error.
409 */
410static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
411 bool *locked_ret)
412{
413 struct btrfs_block_group *bg_cache;
414 struct btrfs_full_stripe_locks_tree *locks_root;
415 struct full_stripe_lock *existing;
416 u64 fstripe_start;
417 int ret = 0;
418
419 *locked_ret = false;
420 bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
421 if (!bg_cache) {
422 ASSERT(0);
423 return -ENOENT;
424 }
425
426 /* Profiles not based on parity don't need full stripe lock */
427 if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
428 goto out;
429 locks_root = &bg_cache->full_stripe_locks_root;
430
431 fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
432
433 /* Now insert the full stripe lock */
434 mutex_lock(&locks_root->lock);
435 existing = insert_full_stripe_lock(locks_root, fstripe_start);
436 mutex_unlock(&locks_root->lock);
437 if (IS_ERR(existing)) {
438 ret = PTR_ERR(existing);
439 goto out;
440 }
441 mutex_lock(&existing->mutex);
442 *locked_ret = true;
443out:
444 btrfs_put_block_group(bg_cache);
445 return ret;
446}
447
448/*
449 * Unlock a full stripe.
450 *
451 * NOTE: Caller must ensure it's the same context calling corresponding
452 * lock_full_stripe().
453 *
454 * Return 0 if we unlocked the full stripe without problem.
455 * Return <0 on error
456 */
457static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
458 bool locked)
459{
460 struct btrfs_block_group *bg_cache;
461 struct btrfs_full_stripe_locks_tree *locks_root;
462 struct full_stripe_lock *fstripe_lock;
463 u64 fstripe_start;
464 bool freeit = false;
465 int ret = 0;
466
467 /* If we didn't acquire full stripe lock, no need to continue */
468 if (!locked)
469 return 0;
470
471 bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
472 if (!bg_cache) {
473 ASSERT(0);
474 return -ENOENT;
475 }
476 if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
477 goto out;
478
479 locks_root = &bg_cache->full_stripe_locks_root;
480 fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
481
482 mutex_lock(&locks_root->lock);
483 fstripe_lock = search_full_stripe_lock(locks_root, fstripe_start);
484 /* Unpaired unlock_full_stripe() detected */
485 if (!fstripe_lock) {
486 WARN_ON(1);
487 ret = -ENOENT;
488 mutex_unlock(&locks_root->lock);
489 goto out;
490 }
491
492 if (fstripe_lock->refs == 0) {
493 WARN_ON(1);
494 btrfs_warn(fs_info, "full stripe lock at %llu refcount underflow",
495 fstripe_lock->logical);
496 } else {
497 fstripe_lock->refs--;
498 }
499
500 if (fstripe_lock->refs == 0) {
501 rb_erase(&fstripe_lock->node, &locks_root->root);
502 freeit = true;
503 }
504 mutex_unlock(&locks_root->lock);
505
506 mutex_unlock(&fstripe_lock->mutex);
507 if (freeit)
508 kfree(fstripe_lock);
509out:
510 btrfs_put_block_group(bg_cache);
511 return ret;
512}
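/*
 * A minimal sketch of how the pair above is meant to be used by a caller.
 * The function name below is made up for illustration only and the error
 * handling is trimmed; scrub_handle_errored_block() further down follows
 * this pattern.
 */
#if 0
static int scrub_repair_one_full_stripe(struct btrfs_fs_info *fs_info,
					u64 logical)
{
	bool locked = false;
	int ret;

	ret = lock_full_stripe(fs_info, logical, &locked);
	if (ret < 0)
		return ret;

	/* ... recheck and repair the sectors covered by this full stripe ... */

	return unlock_full_stripe(fs_info, logical, locked);
}
#endif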
513
514static void scrub_free_csums(struct scrub_ctx *sctx)
515{
516 while (!list_empty(&sctx->csum_list)) {
517 struct btrfs_ordered_sum *sum;
518 sum = list_first_entry(&sctx->csum_list,
519 struct btrfs_ordered_sum, list);
520 list_del(&sum->list);
521 kfree(sum);
522 }
523}
524
525static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
526{
527 int i;
528
529 if (!sctx)
530 return;
531
532 /* this can happen when scrub is cancelled */
533 if (sctx->curr != -1) {
534 struct scrub_bio *sbio = sctx->bios[sctx->curr];
535
536 for (i = 0; i < sbio->sector_count; i++) {
537 WARN_ON(!sbio->sectors[i]->page);
538 scrub_block_put(sbio->sectors[i]->sblock);
539 }
540 bio_put(sbio->bio);
541 }
542
543 for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
544 struct scrub_bio *sbio = sctx->bios[i];
545
546 if (!sbio)
547 break;
548 kfree(sbio);
549 }
550
551 kfree(sctx->wr_curr_bio);
552 scrub_free_csums(sctx);
553 kfree(sctx);
554}
555
556static void scrub_put_ctx(struct scrub_ctx *sctx)
557{
558 if (refcount_dec_and_test(&sctx->refs))
559 scrub_free_ctx(sctx);
560}
561
562static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
563 struct btrfs_fs_info *fs_info, int is_dev_replace)
564{
565 struct scrub_ctx *sctx;
566 int i;
567
568 sctx = kzalloc(sizeof(*sctx), GFP_KERNEL);
569 if (!sctx)
570 goto nomem;
571 refcount_set(&sctx->refs, 1);
572 sctx->is_dev_replace = is_dev_replace;
573 sctx->sectors_per_bio = SCRUB_SECTORS_PER_BIO;
574 sctx->curr = -1;
575 sctx->fs_info = fs_info;
576 INIT_LIST_HEAD(&sctx->csum_list);
577 for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
578 struct scrub_bio *sbio;
579
580 sbio = kzalloc(sizeof(*sbio), GFP_KERNEL);
581 if (!sbio)
582 goto nomem;
583 sctx->bios[i] = sbio;
584
585 sbio->index = i;
586 sbio->sctx = sctx;
587 sbio->sector_count = 0;
588 INIT_WORK(&sbio->work, scrub_bio_end_io_worker);
589
590 if (i != SCRUB_BIOS_PER_SCTX - 1)
591 sctx->bios[i]->next_free = i + 1;
592 else
593 sctx->bios[i]->next_free = -1;
594 }
595 sctx->first_free = 0;
596 atomic_set(&sctx->bios_in_flight, 0);
597 atomic_set(&sctx->workers_pending, 0);
598 atomic_set(&sctx->cancel_req, 0);
599
600 spin_lock_init(&sctx->list_lock);
601 spin_lock_init(&sctx->stat_lock);
602 init_waitqueue_head(&sctx->list_wait);
603 sctx->throttle_deadline = 0;
604
605 WARN_ON(sctx->wr_curr_bio != NULL);
606 mutex_init(&sctx->wr_lock);
607 sctx->wr_curr_bio = NULL;
608 if (is_dev_replace) {
609 WARN_ON(!fs_info->dev_replace.tgtdev);
610 sctx->wr_tgtdev = fs_info->dev_replace.tgtdev;
611 sctx->flush_all_writes = false;
612 }
613
614 return sctx;
615
616nomem:
617 scrub_free_ctx(sctx);
618 return ERR_PTR(-ENOMEM);
619}
620
621static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
622 void *warn_ctx)
623{
624 u32 nlink;
625 int ret;
626 int i;
627 unsigned nofs_flag;
628 struct extent_buffer *eb;
629 struct btrfs_inode_item *inode_item;
630 struct scrub_warning *swarn = warn_ctx;
631 struct btrfs_fs_info *fs_info = swarn->dev->fs_info;
632 struct inode_fs_paths *ipath = NULL;
633 struct btrfs_root *local_root;
634 struct btrfs_key key;
635
636 local_root = btrfs_get_fs_root(fs_info, root, true);
637 if (IS_ERR(local_root)) {
638 ret = PTR_ERR(local_root);
639 goto err;
640 }
641
642 /*
643 * this makes the path point to (inum INODE_ITEM ioff)
644 */
645 key.objectid = inum;
646 key.type = BTRFS_INODE_ITEM_KEY;
647 key.offset = 0;
648
649 ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
650 if (ret) {
651 btrfs_put_root(local_root);
652 btrfs_release_path(swarn->path);
653 goto err;
654 }
655
656 eb = swarn->path->nodes[0];
657 inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
658 struct btrfs_inode_item);
659 nlink = btrfs_inode_nlink(eb, inode_item);
660 btrfs_release_path(swarn->path);
661
662 /*
663 * init_ipath() might indirectly call vmalloc, or use GFP_KERNEL. Scrub
664 * uses GFP_NOFS in this context, so we keep it consistent but it does
665 * not seem to be strictly necessary.
666 */
667 nofs_flag = memalloc_nofs_save();
668 ipath = init_ipath(4096, local_root, swarn->path);
669 memalloc_nofs_restore(nofs_flag);
670 if (IS_ERR(ipath)) {
671 btrfs_put_root(local_root);
672 ret = PTR_ERR(ipath);
673 ipath = NULL;
674 goto err;
675 }
676 ret = paths_from_inode(inum, ipath);
677
678 if (ret < 0)
679 goto err;
680
681 /*
682 * we deliberately ignore the fact that ipath might have been too small
683 * to hold all of the paths here
684 */
685 for (i = 0; i < ipath->fspath->elem_cnt; ++i)
686 btrfs_warn_in_rcu(fs_info,
687"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %u, links %u (path: %s)",
688 swarn->errstr, swarn->logical,
689 rcu_str_deref(swarn->dev->name),
690 swarn->physical,
691 root, inum, offset,
692 fs_info->sectorsize, nlink,
693 (char *)(unsigned long)ipath->fspath->val[i]);
694
695 btrfs_put_root(local_root);
696 free_ipath(ipath);
697 return 0;
698
699err:
700 btrfs_warn_in_rcu(fs_info,
701 "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d",
702 swarn->errstr, swarn->logical,
703 rcu_str_deref(swarn->dev->name),
704 swarn->physical,
705 root, inum, offset, ret);
706
707 free_ipath(ipath);
708 return 0;
709}
710
711static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
712{
713 struct btrfs_device *dev;
714 struct btrfs_fs_info *fs_info;
715 struct btrfs_path *path;
716 struct btrfs_key found_key;
717 struct extent_buffer *eb;
718 struct btrfs_extent_item *ei;
719 struct scrub_warning swarn;
720 unsigned long ptr = 0;
721 u64 extent_item_pos;
722 u64 flags = 0;
723 u64 ref_root;
724 u32 item_size;
725 u8 ref_level = 0;
726 int ret;
727
728 WARN_ON(sblock->sector_count < 1);
729 dev = sblock->sectors[0]->dev;
730 fs_info = sblock->sctx->fs_info;
731
732 path = btrfs_alloc_path();
733 if (!path)
734 return;
735
736 swarn.physical = sblock->sectors[0]->physical;
737 swarn.logical = sblock->sectors[0]->logical;
738 swarn.errstr = errstr;
739 swarn.dev = NULL;
740
741 ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
742 &flags);
743 if (ret < 0)
744 goto out;
745
746 extent_item_pos = swarn.logical - found_key.objectid;
747 swarn.extent_item_size = found_key.offset;
748
749 eb = path->nodes[0];
750 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
751 item_size = btrfs_item_size(eb, path->slots[0]);
752
753 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
754 do {
755 ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
756 item_size, &ref_root,
757 &ref_level);
758 btrfs_warn_in_rcu(fs_info,
759"%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu",
760 errstr, swarn.logical,
761 rcu_str_deref(dev->name),
762 swarn.physical,
763 ref_level ? "node" : "leaf",
764 ret < 0 ? -1 : ref_level,
765 ret < 0 ? -1 : ref_root);
766 } while (ret != 1);
767 btrfs_release_path(path);
768 } else {
769 btrfs_release_path(path);
770 swarn.path = path;
771 swarn.dev = dev;
772 iterate_extent_inodes(fs_info, found_key.objectid,
773 extent_item_pos, 1,
774 scrub_print_warning_inode, &swarn, false);
775 }
776
777out:
778 btrfs_free_path(path);
779}
780
781static inline void scrub_get_recover(struct scrub_recover *recover)
782{
783 refcount_inc(&recover->refs);
784}
785
786static inline void scrub_put_recover(struct btrfs_fs_info *fs_info,
787 struct scrub_recover *recover)
788{
789 if (refcount_dec_and_test(&recover->refs)) {
790 btrfs_bio_counter_dec(fs_info);
791 btrfs_put_bioc(recover->bioc);
792 kfree(recover);
793 }
794}
795
796/*
797 * scrub_handle_errored_block gets called when either verification of the
798 * sectors failed or the bio failed to read, e.g. with EIO. In the latter
799 * case, this function handles all sectors in the bio, even though only one
800 * may be bad.
801 * The goal of this function is to repair the errored block by using the
802 * contents of one of the mirrors.
803 */
804static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
805{
806 struct scrub_ctx *sctx = sblock_to_check->sctx;
807 struct btrfs_device *dev;
808 struct btrfs_fs_info *fs_info;
809 u64 logical;
810 unsigned int failed_mirror_index;
811 unsigned int is_metadata;
812 unsigned int have_csum;
813 struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
814 struct scrub_block *sblock_bad;
815 int ret;
816 int mirror_index;
817 int sector_num;
818 int success;
819 bool full_stripe_locked;
820 unsigned int nofs_flag;
821 static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
822 DEFAULT_RATELIMIT_BURST);
823
824 BUG_ON(sblock_to_check->sector_count < 1);
825 fs_info = sctx->fs_info;
826 if (sblock_to_check->sectors[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
827 /*
828 * if we find an error in a super block, we just report it.
829 * They will get written with the next transaction commit
830 * anyway
831 */
832 spin_lock(&sctx->stat_lock);
833 ++sctx->stat.super_errors;
834 spin_unlock(&sctx->stat_lock);
835 return 0;
836 }
837 logical = sblock_to_check->sectors[0]->logical;
838 BUG_ON(sblock_to_check->sectors[0]->mirror_num < 1);
839 failed_mirror_index = sblock_to_check->sectors[0]->mirror_num - 1;
840 is_metadata = !(sblock_to_check->sectors[0]->flags &
841 BTRFS_EXTENT_FLAG_DATA);
842 have_csum = sblock_to_check->sectors[0]->have_csum;
843 dev = sblock_to_check->sectors[0]->dev;
844
845 if (!sctx->is_dev_replace && btrfs_repair_one_zone(fs_info, logical))
846 return 0;
847
848 /*
849 * We must use GFP_NOFS because the scrub task might be waiting for a
850 * worker task executing this function and in turn a transaction commit
851 * might be waiting for the scrub task to pause (which needs to wait for all
852 * the worker tasks to complete before pausing).
853 * We do allocations in the workers through insert_full_stripe_lock()
854 * and scrub_add_sector_to_wr_bio(), which happens down the call chain of
855 * this function.
856 */
857 nofs_flag = memalloc_nofs_save();
858 /*
859 * For RAID5/6, a race can happen between scrub threads of different
860 * devices. On data corruption, the parity and data threads will both
861 * try to recover the data.
862 * The race can lead to doubly added csum errors, or even unrecoverable
863 * errors.
864 */
865 ret = lock_full_stripe(fs_info, logical, &full_stripe_locked);
866 if (ret < 0) {
867 memalloc_nofs_restore(nofs_flag);
868 spin_lock(&sctx->stat_lock);
869 if (ret == -ENOMEM)
870 sctx->stat.malloc_errors++;
871 sctx->stat.read_errors++;
872 sctx->stat.uncorrectable_errors++;
873 spin_unlock(&sctx->stat_lock);
874 return ret;
875 }
876
877 /*
877 * Read all mirrors one after the other. This includes re-reading
878 * the extent or metadata block that failed (that was
880 * the cause that this fixup code is called) another time,
881 * sector by sector this time in order to know which sectors
882 * caused I/O errors and which ones are good (for all mirrors).
883 * It is the goal to handle the situation when more than one
884 * mirror contains I/O errors, but the errors do not
885 * overlap, i.e. the data can be repaired by selecting the
886 * sectors from those mirrors without I/O error on the
887 * particular sectors. One example (with blocks >= 2 * sectorsize)
888 * would be that mirror #1 has an I/O error on the first sector,
889 * the second sector is good, and mirror #2 has an I/O error on
890 * the second sector, but the first sector is good.
891 * Then the first sector of the first mirror can be repaired by
892 * taking the first sector of the second mirror, and the
893 * second sector of the second mirror can be repaired by
894 * copying the contents of the 2nd sector of the 1st mirror.
895 * One more note: if the sectors of one mirror contain I/O
896 * errors, the checksum cannot be verified. In order to get
897 * the best data for repairing, the first attempt is to find
898 * a mirror without I/O errors and with a validated checksum.
899 * Only if this is not possible, the sectors are picked from
900 * mirrors with I/O errors without considering the checksum.
901 * If the latter is the case, at the end, the checksum of the
902 * repaired area is verified in order to correctly maintain
903 * the statistics.
904 */
905
906 sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS,
907 sizeof(*sblocks_for_recheck), GFP_KERNEL);
908 if (!sblocks_for_recheck) {
909 spin_lock(&sctx->stat_lock);
910 sctx->stat.malloc_errors++;
911 sctx->stat.read_errors++;
912 sctx->stat.uncorrectable_errors++;
913 spin_unlock(&sctx->stat_lock);
914 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
915 goto out;
916 }
917
918 /* Setup the context, map the logical blocks and alloc the sectors */
919 ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck);
920 if (ret) {
921 spin_lock(&sctx->stat_lock);
922 sctx->stat.read_errors++;
923 sctx->stat.uncorrectable_errors++;
924 spin_unlock(&sctx->stat_lock);
925 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
926 goto out;
927 }
928 BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
929 sblock_bad = sblocks_for_recheck + failed_mirror_index;
930
931 /* build and submit the bios for the failed mirror, check checksums */
932 scrub_recheck_block(fs_info, sblock_bad, 1);
933
934 if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
935 sblock_bad->no_io_error_seen) {
936 /*
937 * The error disappeared after reading sector by sector, or
938 * the area was part of a huge bio and other parts of the
939 * bio caused I/O errors, or the block layer merged several
940 * read requests into one and the error is caused by a
941 * different bio (usually one of the two latter cases is
942 * the cause)
943 */
944 spin_lock(&sctx->stat_lock);
945 sctx->stat.unverified_errors++;
946 sblock_to_check->data_corrected = 1;
947 spin_unlock(&sctx->stat_lock);
948
949 if (sctx->is_dev_replace)
950 scrub_write_block_to_dev_replace(sblock_bad);
951 goto out;
952 }
953
954 if (!sblock_bad->no_io_error_seen) {
955 spin_lock(&sctx->stat_lock);
956 sctx->stat.read_errors++;
957 spin_unlock(&sctx->stat_lock);
958 if (__ratelimit(&rs))
959 scrub_print_warning("i/o error", sblock_to_check);
960 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
961 } else if (sblock_bad->checksum_error) {
962 spin_lock(&sctx->stat_lock);
963 sctx->stat.csum_errors++;
964 spin_unlock(&sctx->stat_lock);
965 if (__ratelimit(&rs))
966 scrub_print_warning("checksum error", sblock_to_check);
967 btrfs_dev_stat_inc_and_print(dev,
968 BTRFS_DEV_STAT_CORRUPTION_ERRS);
969 } else if (sblock_bad->header_error) {
970 spin_lock(&sctx->stat_lock);
971 sctx->stat.verify_errors++;
972 spin_unlock(&sctx->stat_lock);
973 if (__ratelimit(&rs))
974 scrub_print_warning("checksum/header error",
975 sblock_to_check);
976 if (sblock_bad->generation_error)
977 btrfs_dev_stat_inc_and_print(dev,
978 BTRFS_DEV_STAT_GENERATION_ERRS);
979 else
980 btrfs_dev_stat_inc_and_print(dev,
981 BTRFS_DEV_STAT_CORRUPTION_ERRS);
982 }
983
984 if (sctx->readonly) {
985 ASSERT(!sctx->is_dev_replace);
986 goto out;
987 }
988
989 /*
990 * now build and submit the bios for the other mirrors, check
991 * checksums.
992 * First try to pick the mirror which is completely without I/O
993 * errors and also does not have a checksum error.
994 * If one is found, and if a checksum is present, the full block
995 * that is known to contain an error is rewritten. Afterwards
996 * the block is known to be corrected.
997 * If a mirror is found which is completely correct, and no
998 * checksum is present, only those sectors are rewritten that had
999 * an I/O error in the block to be repaired, since it cannot be
1000 * determined, which copy of the other sectors is better (and it
1001 * could happen otherwise that a correct sector would be
1002 * overwritten by a bad one).
1003 */
1004 for (mirror_index = 0; ;mirror_index++) {
1005 struct scrub_block *sblock_other;
1006
1007 if (mirror_index == failed_mirror_index)
1008 continue;
1009
1010 /* raid56's mirror count can be more than BTRFS_MAX_MIRRORS */
1011 if (!scrub_is_page_on_raid56(sblock_bad->sectors[0])) {
1012 if (mirror_index >= BTRFS_MAX_MIRRORS)
1013 break;
1014 if (!sblocks_for_recheck[mirror_index].sector_count)
1015 break;
1016
1017 sblock_other = sblocks_for_recheck + mirror_index;
1018 } else {
1019 struct scrub_recover *r = sblock_bad->sectors[0]->recover;
1020 int max_allowed = r->bioc->num_stripes - r->bioc->num_tgtdevs;
1021
1022 if (mirror_index >= max_allowed)
1023 break;
1024 if (!sblocks_for_recheck[1].sector_count)
1025 break;
1026
1027 ASSERT(failed_mirror_index == 0);
1028 sblock_other = sblocks_for_recheck + 1;
1029 sblock_other->sectors[0]->mirror_num = 1 + mirror_index;
1030 }
1031
1032 /* build and submit the bios, check checksums */
1033 scrub_recheck_block(fs_info, sblock_other, 0);
1034
1035 if (!sblock_other->header_error &&
1036 !sblock_other->checksum_error &&
1037 sblock_other->no_io_error_seen) {
1038 if (sctx->is_dev_replace) {
1039 scrub_write_block_to_dev_replace(sblock_other);
1040 goto corrected_error;
1041 } else {
1042 ret = scrub_repair_block_from_good_copy(
1043 sblock_bad, sblock_other);
1044 if (!ret)
1045 goto corrected_error;
1046 }
1047 }
1048 }
1049
1050 if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace)
1051 goto did_not_correct_error;
1052
1053 /*
1054 * In case of I/O errors in the area that is supposed to be
1055 * repaired, continue by picking good copies of those sectors.
1056 * Select the good sectors from mirrors to rewrite bad sectors from
1057 * the area to fix. Afterwards verify the checksum of the block
1058 * that is supposed to be repaired. This verification step is
1059 * only done for the purpose of statistic counting and for the
1060 * final scrub report on whether errors remain.
1061 * A perfect algorithm could make use of the checksum and try
1062 * all possible combinations of sectors from the different mirrors
1063 * until the checksum verification succeeds. For example, when
1064 * the 2nd sector of mirror #1 faces I/O errors, and the 2nd sector
1065 * of mirror #2 is readable but the final checksum test fails,
1066 * then the 2nd sector of mirror #3 could be tried, to see whether
1067 * the final checksum then succeeds. But this would be a rare
1068 * exception and is therefore not implemented. At least we avoid
1069 * overwriting the good copy.
1070 * A more useful improvement would be to pick the sectors
1071 * without I/O error based on sector sizes (512 bytes on legacy
1072 * disks) instead of on sectorsize. Then maybe 512 byte of one
1073 * mirror could be repaired by taking 512 byte of a different
1074 * mirror, even if other 512 byte sectors in the same sectorsize
1075 * area are unreadable.
1076 */
1077 success = 1;
1078 for (sector_num = 0; sector_num < sblock_bad->sector_count;
1079 sector_num++) {
1080 struct scrub_sector *sector_bad = sblock_bad->sectors[sector_num];
1081 struct scrub_block *sblock_other = NULL;
1082
1083 /* Skip no-io-error sectors in scrub */
1084 if (!sector_bad->io_error && !sctx->is_dev_replace)
1085 continue;
1086
1087 if (scrub_is_page_on_raid56(sblock_bad->sectors[0])) {
1088 /*
1089 * In case of dev replace, if raid56 rebuild process
1090 * didn't work out correct data, then copy the content
1091 * in sblock_bad to make sure target device is identical
1092 * to source device, instead of writing garbage data in
1093 * sblock_for_recheck array to target device.
1094 */
1095 sblock_other = NULL;
1096 } else if (sector_bad->io_error) {
1097 /* Try to find no-io-error sector in mirrors */
1098 for (mirror_index = 0;
1099 mirror_index < BTRFS_MAX_MIRRORS &&
1100 sblocks_for_recheck[mirror_index].sector_count > 0;
1101 mirror_index++) {
1102 if (!sblocks_for_recheck[mirror_index].
1103 sectors[sector_num]->io_error) {
1104 sblock_other = sblocks_for_recheck +
1105 mirror_index;
1106 break;
1107 }
1108 }
1109 if (!sblock_other)
1110 success = 0;
1111 }
1112
1113 if (sctx->is_dev_replace) {
1114 /*
1115 * Did not find a mirror to fetch the sector from.
1116 * scrub_write_sector_to_dev_replace() handles this
1117 * case (sector->io_error), by filling the block with
1118 * zeros before submitting the write request
1119 */
1120 if (!sblock_other)
1121 sblock_other = sblock_bad;
1122
1123 if (scrub_write_sector_to_dev_replace(sblock_other,
1124 sector_num) != 0) {
1125 atomic64_inc(
1126 &fs_info->dev_replace.num_write_errors);
1127 success = 0;
1128 }
1129 } else if (sblock_other) {
1130 ret = scrub_repair_sector_from_good_copy(sblock_bad,
1131 sblock_other,
1132 sector_num, 0);
1133 if (ret == 0)
1134 sector_bad->io_error = 0;
1135 else
1136 success = 0;
1137 }
1138 }
1139
1140 if (success && !sctx->is_dev_replace) {
1141 if (is_metadata || have_csum) {
1142 /*
1143 * need to verify the checksum now that all
1144 * sectors on disk are repaired (the write
1145 * request for data to be repaired is on its way).
1146 * Just be lazy and use scrub_recheck_block()
1147 * which re-reads the data before the checksum
1148 * is verified, but most likely the data comes out
1149 * of the page cache.
1150 */
1151 scrub_recheck_block(fs_info, sblock_bad, 1);
1152 if (!sblock_bad->header_error &&
1153 !sblock_bad->checksum_error &&
1154 sblock_bad->no_io_error_seen)
1155 goto corrected_error;
1156 else
1157 goto did_not_correct_error;
1158 } else {
1159corrected_error:
1160 spin_lock(&sctx->stat_lock);
1161 sctx->stat.corrected_errors++;
1162 sblock_to_check->data_corrected = 1;
1163 spin_unlock(&sctx->stat_lock);
1164 btrfs_err_rl_in_rcu(fs_info,
1165 "fixed up error at logical %llu on dev %s",
1166 logical, rcu_str_deref(dev->name));
1167 }
1168 } else {
1169did_not_correct_error:
1170 spin_lock(&sctx->stat_lock);
1171 sctx->stat.uncorrectable_errors++;
1172 spin_unlock(&sctx->stat_lock);
1173 btrfs_err_rl_in_rcu(fs_info,
1174 "unable to fixup (regular) error at logical %llu on dev %s",
1175 logical, rcu_str_deref(dev->name));
1176 }
1177
1178out:
1179 if (sblocks_for_recheck) {
1180 for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
1181 mirror_index++) {
1182 struct scrub_block *sblock = sblocks_for_recheck +
1183 mirror_index;
1184 struct scrub_recover *recover;
1185 int i;
1186
1187 for (i = 0; i < sblock->sector_count; i++) {
1188 sblock->sectors[i]->sblock = NULL;
1189 recover = sblock->sectors[i]->recover;
1190 if (recover) {
1191 scrub_put_recover(fs_info, recover);
1192 sblock->sectors[i]->recover = NULL;
1193 }
1194 scrub_sector_put(sblock->sectors[i]);
1195 }
1196 }
1197 kfree(sblocks_for_recheck);
1198 }
1199
1200 ret = unlock_full_stripe(fs_info, logical, full_stripe_locked);
1201 memalloc_nofs_restore(nofs_flag);
1202 if (ret < 0)
1203 return ret;
1204 return 0;
1205}
1206
1207static inline int scrub_nr_raid_mirrors(struct btrfs_io_context *bioc)
1208{
1209 if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5)
1210 return 2;
1211 else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6)
1212 return 3;
1213 else
1214 return (int)bioc->num_stripes;
1215}
1216
1217static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
1218 u64 *raid_map,
1219 int nstripes, int mirror,
1220 int *stripe_index,
1221 u64 *stripe_offset)
1222{
1223 int i;
1224
1225 if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
1226 /* RAID5/6 */
1227 for (i = 0; i < nstripes; i++) {
1228 if (raid_map[i] == RAID6_Q_STRIPE ||
1229 raid_map[i] == RAID5_P_STRIPE)
1230 continue;
1231
1232 if (logical >= raid_map[i] &&
1233 logical < raid_map[i] + BTRFS_STRIPE_LEN)
1234 break;
1235 }
1236
1237 *stripe_index = i;
1238 *stripe_offset = logical - raid_map[i];
1239 } else {
1240 /* The other RAID type */
1241 *stripe_index = mirror;
1242 *stripe_offset = 0;
1243 }
1244}
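/*
 * A worked example with hypothetical values (assuming the 64KiB
 * BTRFS_STRIPE_LEN): for a 3-device RAID5 full stripe with
 * raid_map = { X, X + 64KiB, RAID5_P_STRIPE } and logical = X + 80KiB,
 * the loop stops at i = 1 (the second data stripe), so *stripe_index = 1
 * and *stripe_offset = 16KiB.
 */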
1245
1246static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
1247 struct scrub_block *sblocks_for_recheck)
1248{
1249 struct scrub_ctx *sctx = original_sblock->sctx;
1250 struct btrfs_fs_info *fs_info = sctx->fs_info;
1251 u64 length = original_sblock->sector_count << fs_info->sectorsize_bits;
1252 u64 logical = original_sblock->sectors[0]->logical;
1253 u64 generation = original_sblock->sectors[0]->generation;
1254 u64 flags = original_sblock->sectors[0]->flags;
1255 u64 have_csum = original_sblock->sectors[0]->have_csum;
1256 struct scrub_recover *recover;
1257 struct btrfs_io_context *bioc;
1258 u64 sublen;
1259 u64 mapped_length;
1260 u64 stripe_offset;
1261 int stripe_index;
1262 int sector_index = 0;
1263 int mirror_index;
1264 int nmirrors;
1265 int ret;
1266
1267 /*
1268 * Note: the two members refs and outstanding_sectors are not used (and
1269 * not set) in the blocks that are used for the recheck procedure.
1270 */
1271
1272 while (length > 0) {
1273 sublen = min_t(u64, length, fs_info->sectorsize);
1274 mapped_length = sublen;
1275 bioc = NULL;
1276
1277 /*
1278 * With a length of sectorsize, each returned stripe represents
1279 * one mirror
1280 */
1281 btrfs_bio_counter_inc_blocked(fs_info);
1282 ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
1283 logical, &mapped_length, &bioc);
1284 if (ret || !bioc || mapped_length < sublen) {
1285 btrfs_put_bioc(bioc);
1286 btrfs_bio_counter_dec(fs_info);
1287 return -EIO;
1288 }
1289
1290 recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
1291 if (!recover) {
1292 btrfs_put_bioc(bioc);
1293 btrfs_bio_counter_dec(fs_info);
1294 return -ENOMEM;
1295 }
1296
1297 refcount_set(&recover->refs, 1);
1298 recover->bioc = bioc;
1299 recover->map_length = mapped_length;
1300
1301 ASSERT(sector_index < SCRUB_MAX_SECTORS_PER_BLOCK);
1302
1303 nmirrors = min(scrub_nr_raid_mirrors(bioc), BTRFS_MAX_MIRRORS);
1304
1305 for (mirror_index = 0; mirror_index < nmirrors;
1306 mirror_index++) {
1307 struct scrub_block *sblock;
1308 struct scrub_sector *sector;
1309
1310 sblock = sblocks_for_recheck + mirror_index;
1311 sblock->sctx = sctx;
1312
1313 sector = kzalloc(sizeof(*sector), GFP_NOFS);
1314 if (!sector) {
1315leave_nomem:
1316 spin_lock(&sctx->stat_lock);
1317 sctx->stat.malloc_errors++;
1318 spin_unlock(&sctx->stat_lock);
1319 scrub_put_recover(fs_info, recover);
1320 return -ENOMEM;
1321 }
1322 scrub_sector_get(sector);
1323 sblock->sectors[sector_index] = sector;
1324 sector->sblock = sblock;
1325 sector->flags = flags;
1326 sector->generation = generation;
1327 sector->logical = logical;
1328 sector->have_csum = have_csum;
1329 if (have_csum)
1330 memcpy(sector->csum,
1331 original_sblock->sectors[0]->csum,
1332 sctx->fs_info->csum_size);
1333
1334 scrub_stripe_index_and_offset(logical,
1335 bioc->map_type,
1336 bioc->raid_map,
1337 bioc->num_stripes -
1338 bioc->num_tgtdevs,
1339 mirror_index,
1340 &stripe_index,
1341 &stripe_offset);
1342 sector->physical = bioc->stripes[stripe_index].physical +
1343 stripe_offset;
1344 sector->dev = bioc->stripes[stripe_index].dev;
1345
1346 BUG_ON(sector_index >= original_sblock->sector_count);
1347 sector->physical_for_dev_replace =
1348 original_sblock->sectors[sector_index]->
1349 physical_for_dev_replace;
1350 /* For missing devices, dev->bdev is NULL */
1351 sector->mirror_num = mirror_index + 1;
1352 sblock->sector_count++;
1353 sector->page = alloc_page(GFP_NOFS);
1354 if (!sector->page)
1355 goto leave_nomem;
1356
1357 scrub_get_recover(recover);
1358 sector->recover = recover;
1359 }
1360 scrub_put_recover(fs_info, recover);
1361 length -= sublen;
1362 logical += sublen;
1363 sector_index++;
1364 }
1365
1366 return 0;
1367}
1368
1369static void scrub_bio_wait_endio(struct bio *bio)
1370{
1371 complete(bio->bi_private);
1372}
1373
1374static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
1375 struct bio *bio,
1376 struct scrub_sector *sector)
1377{
1378 DECLARE_COMPLETION_ONSTACK(done);
1379
1380 bio->bi_iter.bi_sector = sector->logical >> 9;
1381 bio->bi_private = &done;
1382 bio->bi_end_io = scrub_bio_wait_endio;
1383 raid56_parity_recover(bio, sector->recover->bioc,
1384 sector->sblock->sectors[0]->mirror_num, false);
1385
1386 wait_for_completion_io(&done);
1387 return blk_status_to_errno(bio->bi_status);
1388}
1389
1390static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info,
1391 struct scrub_block *sblock)
1392{
1393 struct scrub_sector *first_sector = sblock->sectors[0];
1394 struct bio *bio;
1395 int i;
1396
1397 /* All sectors in sblock belong to the same stripe on the same device. */
1398 ASSERT(first_sector->dev);
1399 if (!first_sector->dev->bdev)
1400 goto out;
1401
1402 bio = bio_alloc(first_sector->dev->bdev, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
1403
1404 for (i = 0; i < sblock->sector_count; i++) {
1405 struct scrub_sector *sector = sblock->sectors[i];
1406
1407 WARN_ON(!sector->page);
1408 bio_add_page(bio, sector->page, PAGE_SIZE, 0);
1409 }
1410
1411 if (scrub_submit_raid56_bio_wait(fs_info, bio, first_sector)) {
1412 bio_put(bio);
1413 goto out;
1414 }
1415
1416 bio_put(bio);
1417
1418 scrub_recheck_block_checksum(sblock);
1419
1420 return;
1421out:
1422 for (i = 0; i < sblock->sector_count; i++)
1423 sblock->sectors[i]->io_error = 1;
1424
1425 sblock->no_io_error_seen = 0;
1426}
1427
1428/*
1429 * This function will check the on disk data for checksum errors, header errors
1430 * and read I/O errors. If any I/O errors happen, the exact sectors which are
1431 * errored are marked as being bad. The goal is to enable scrub to take those
1432 * sectors that are not errored from all the mirrors so that the sectors that
1433 * are errored in the just handled mirror can be repaired.
1434 */
1435static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1436 struct scrub_block *sblock,
1437 int retry_failed_mirror)
1438{
1439 int i;
1440
1441 sblock->no_io_error_seen = 1;
1442
1443 /* short cut for raid56 */
1444 if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->sectors[0]))
1445 return scrub_recheck_block_on_raid56(fs_info, sblock);
1446
1447 for (i = 0; i < sblock->sector_count; i++) {
1448 struct scrub_sector *sector = sblock->sectors[i];
1449 struct bio bio;
1450 struct bio_vec bvec;
1451
1452 if (sector->dev->bdev == NULL) {
1453 sector->io_error = 1;
1454 sblock->no_io_error_seen = 0;
1455 continue;
1456 }
1457
1458 WARN_ON(!sector->page);
1459 bio_init(&bio, sector->dev->bdev, &bvec, 1, REQ_OP_READ);
1460 bio_add_page(&bio, sector->page, fs_info->sectorsize, 0);
1461 bio.bi_iter.bi_sector = sector->physical >> 9;
1462
1463 btrfsic_check_bio(&bio);
1464 if (submit_bio_wait(&bio)) {
1465 sector->io_error = 1;
1466 sblock->no_io_error_seen = 0;
1467 }
1468
1469 bio_uninit(&bio);
1470 }
1471
1472 if (sblock->no_io_error_seen)
1473 scrub_recheck_block_checksum(sblock);
1474}
1475
1476static inline int scrub_check_fsid(u8 fsid[], struct scrub_sector *sector)
1477{
1478 struct btrfs_fs_devices *fs_devices = sector->dev->fs_devices;
1479 int ret;
1480
1481 ret = memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1482 return !ret;
1483}
1484
1485static void scrub_recheck_block_checksum(struct scrub_block *sblock)
1486{
1487 sblock->header_error = 0;
1488 sblock->checksum_error = 0;
1489 sblock->generation_error = 0;
1490
1491 if (sblock->sectors[0]->flags & BTRFS_EXTENT_FLAG_DATA)
1492 scrub_checksum_data(sblock);
1493 else
1494 scrub_checksum_tree_block(sblock);
1495}
1496
1497static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
1498 struct scrub_block *sblock_good)
1499{
1500 int i;
1501 int ret = 0;
1502
1503 for (i = 0; i < sblock_bad->sector_count; i++) {
1504 int ret_sub;
1505
1506 ret_sub = scrub_repair_sector_from_good_copy(sblock_bad,
1507 sblock_good, i, 1);
1508 if (ret_sub)
1509 ret = ret_sub;
1510 }
1511
1512 return ret;
1513}
1514
1515static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad,
1516 struct scrub_block *sblock_good,
1517 int sector_num, int force_write)
1518{
1519 struct scrub_sector *sector_bad = sblock_bad->sectors[sector_num];
1520 struct scrub_sector *sector_good = sblock_good->sectors[sector_num];
1521 struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info;
1522 const u32 sectorsize = fs_info->sectorsize;
1523
1524 BUG_ON(sector_bad->page == NULL);
1525 BUG_ON(sector_good->page == NULL);
1526 if (force_write || sblock_bad->header_error ||
1527 sblock_bad->checksum_error || sector_bad->io_error) {
1528 struct bio bio;
1529 struct bio_vec bvec;
1530 int ret;
1531
1532 if (!sector_bad->dev->bdev) {
1533 btrfs_warn_rl(fs_info,
1534 "scrub_repair_page_from_good_copy(bdev == NULL) is unexpected");
1535 return -EIO;
1536 }
1537
1538 bio_init(&bio, sector_bad->dev->bdev, &bvec, 1, REQ_OP_WRITE);
1539 bio.bi_iter.bi_sector = sector_bad->physical >> 9;
1540 __bio_add_page(&bio, sector_good->page, sectorsize, 0);
1541
1542 btrfsic_check_bio(&bio);
1543 ret = submit_bio_wait(&bio);
1544 bio_uninit(&bio);
1545
1546 if (ret) {
1547 btrfs_dev_stat_inc_and_print(sector_bad->dev,
1548 BTRFS_DEV_STAT_WRITE_ERRS);
1549 atomic64_inc(&fs_info->dev_replace.num_write_errors);
1550 return -EIO;
1551 }
1552 }
1553
1554 return 0;
1555}
1556
1557static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
1558{
1559 struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
1560 int i;
1561
1562 /*
1563 * This block is used for the check of the parity on the source device,
1564 * so the data needn't be written into the destination device.
1565 */
1566 if (sblock->sparity)
1567 return;
1568
1569 for (i = 0; i < sblock->sector_count; i++) {
1570 int ret;
1571
1572 ret = scrub_write_sector_to_dev_replace(sblock, i);
1573 if (ret)
1574 atomic64_inc(&fs_info->dev_replace.num_write_errors);
1575 }
1576}
1577
1578static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock, int sector_num)
1579{
1580 struct scrub_sector *sector = sblock->sectors[sector_num];
1581
1582 BUG_ON(sector->page == NULL);
1583 if (sector->io_error)
1584 clear_page(page_address(sector->page));
1585
1586 return scrub_add_sector_to_wr_bio(sblock->sctx, sector);
1587}
1588
1589static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical)
1590{
1591 int ret = 0;
1592 u64 length;
1593
1594 if (!btrfs_is_zoned(sctx->fs_info))
1595 return 0;
1596
1597 if (!btrfs_dev_is_sequential(sctx->wr_tgtdev, physical))
1598 return 0;
1599
1600 if (sctx->write_pointer < physical) {
1601 length = physical - sctx->write_pointer;
1602
1603 ret = btrfs_zoned_issue_zeroout(sctx->wr_tgtdev,
1604 sctx->write_pointer, length);
1605 if (!ret)
1606 sctx->write_pointer = physical;
1607 }
1608 return ret;
1609}
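/*
 * A hypothetical example: with the target zone's write pointer cached at
 * physical offset 1MiB and the next copied sector destined for 1MiB + 192KiB,
 * the 192KiB gap is zeroed out first so the zone's sequential-write
 * constraint is respected, and sctx->write_pointer advances to 1MiB + 192KiB.
 */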
1610
1611static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx,
1612 struct scrub_sector *sector)
1613{
1614 struct scrub_bio *sbio;
1615 int ret;
1616 const u32 sectorsize = sctx->fs_info->sectorsize;
1617
1618 mutex_lock(&sctx->wr_lock);
1619again:
1620 if (!sctx->wr_curr_bio) {
1621 sctx->wr_curr_bio = kzalloc(sizeof(*sctx->wr_curr_bio),
1622 GFP_KERNEL);
1623 if (!sctx->wr_curr_bio) {
1624 mutex_unlock(&sctx->wr_lock);
1625 return -ENOMEM;
1626 }
1627 sctx->wr_curr_bio->sctx = sctx;
1628 sctx->wr_curr_bio->sector_count = 0;
1629 }
1630 sbio = sctx->wr_curr_bio;
1631 if (sbio->sector_count == 0) {
1632 ret = fill_writer_pointer_gap(sctx, sector->physical_for_dev_replace);
1633 if (ret) {
1634 mutex_unlock(&sctx->wr_lock);
1635 return ret;
1636 }
1637
1638 sbio->physical = sector->physical_for_dev_replace;
1639 sbio->logical = sector->logical;
1640 sbio->dev = sctx->wr_tgtdev;
1641 if (!sbio->bio) {
1642 sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio,
1643 REQ_OP_WRITE, GFP_NOFS);
1644 }
1645 sbio->bio->bi_private = sbio;
1646 sbio->bio->bi_end_io = scrub_wr_bio_end_io;
1647 sbio->bio->bi_iter.bi_sector = sbio->physical >> 9;
1648 sbio->status = 0;
1649 } else if (sbio->physical + sbio->sector_count * sectorsize !=
1650 sector->physical_for_dev_replace ||
1651 sbio->logical + sbio->sector_count * sectorsize !=
1652 sector->logical) {
1653 scrub_wr_submit(sctx);
1654 goto again;
1655 }
1656
1657 ret = bio_add_page(sbio->bio, sector->page, sectorsize, 0);
1658 if (ret != sectorsize) {
1659 if (sbio->sector_count < 1) {
1660 bio_put(sbio->bio);
1661 sbio->bio = NULL;
1662 mutex_unlock(&sctx->wr_lock);
1663 return -EIO;
1664 }
1665 scrub_wr_submit(sctx);
1666 goto again;
1667 }
1668
1669 sbio->sectors[sbio->sector_count] = sector;
1670 scrub_sector_get(sector);
1671 sbio->sector_count++;
1672 if (sbio->sector_count == sctx->sectors_per_bio)
1673 scrub_wr_submit(sctx);
1674 mutex_unlock(&sctx->wr_lock);
1675
1676 return 0;
1677}
1678
1679static void scrub_wr_submit(struct scrub_ctx *sctx)
1680{
1681 struct scrub_bio *sbio;
1682
1683 if (!sctx->wr_curr_bio)
1684 return;
1685
1686 sbio = sctx->wr_curr_bio;
1687 sctx->wr_curr_bio = NULL;
1688 scrub_pending_bio_inc(sctx);
1689 /* Process all writes in a single worker thread. Then the block layer
1690 * orders the requests before sending them to the driver, which
1691 * doubled the write performance on spinning disks when measured
1692 * with Linux 3.5. */
1693 btrfsic_check_bio(sbio->bio);
1694 submit_bio(sbio->bio);
1695
1696 if (btrfs_is_zoned(sctx->fs_info))
1697 sctx->write_pointer = sbio->physical + sbio->sector_count *
1698 sctx->fs_info->sectorsize;
1699}
1700
1701static void scrub_wr_bio_end_io(struct bio *bio)
1702{
1703 struct scrub_bio *sbio = bio->bi_private;
1704 struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
1705
1706 sbio->status = bio->bi_status;
1707 sbio->bio = bio;
1708
1709 INIT_WORK(&sbio->work, scrub_wr_bio_end_io_worker);
1710 queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
1711}
1712
1713static void scrub_wr_bio_end_io_worker(struct work_struct *work)
1714{
1715 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
1716 struct scrub_ctx *sctx = sbio->sctx;
1717 int i;
1718
1719 ASSERT(sbio->sector_count <= SCRUB_SECTORS_PER_BIO);
1720 if (sbio->status) {
1721 struct btrfs_dev_replace *dev_replace =
1722 &sbio->sctx->fs_info->dev_replace;
1723
1724 for (i = 0; i < sbio->sector_count; i++) {
1725 struct scrub_sector *sector = sbio->sectors[i];
1726
1727 sector->io_error = 1;
1728 atomic64_inc(&dev_replace->num_write_errors);
1729 }
1730 }
1731
1732 for (i = 0; i < sbio->sector_count; i++)
1733 scrub_sector_put(sbio->sectors[i]);
1734
1735 bio_put(sbio->bio);
1736 kfree(sbio);
1737 scrub_pending_bio_dec(sctx);
1738}
1739
1740static int scrub_checksum(struct scrub_block *sblock)
1741{
1742 u64 flags;
1743 int ret;
1744
1745 /*
1746 * No need to initialize these stats currently,
1747 * because this function only uses the return value
1748 * instead of these stats values.
1749 *
1750 * Todo:
1751 * always use stats
1752 */
1753 sblock->header_error = 0;
1754 sblock->generation_error = 0;
1755 sblock->checksum_error = 0;
1756
1757 WARN_ON(sblock->sector_count < 1);
1758 flags = sblock->sectors[0]->flags;
1759 ret = 0;
1760 if (flags & BTRFS_EXTENT_FLAG_DATA)
1761 ret = scrub_checksum_data(sblock);
1762 else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
1763 ret = scrub_checksum_tree_block(sblock);
1764 else if (flags & BTRFS_EXTENT_FLAG_SUPER)
1765 (void)scrub_checksum_super(sblock);
1766 else
1767 WARN_ON(1);
1768 if (ret)
1769 scrub_handle_errored_block(sblock);
1770
1771 return ret;
1772}
1773
1774static int scrub_checksum_data(struct scrub_block *sblock)
1775{
1776 struct scrub_ctx *sctx = sblock->sctx;
1777 struct btrfs_fs_info *fs_info = sctx->fs_info;
1778 SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1779 u8 csum[BTRFS_CSUM_SIZE];
1780 struct scrub_sector *sector;
1781 char *kaddr;
1782
1783 BUG_ON(sblock->sector_count < 1);
1784 sector = sblock->sectors[0];
1785 if (!sector->have_csum)
1786 return 0;
1787
1788 kaddr = page_address(sector->page);
1789
1790 shash->tfm = fs_info->csum_shash;
1791 crypto_shash_init(shash);
1792
1793 /*
1794 * In scrub_sectors() and scrub_sectors_for_parity() we ensure each sector's
1795 * page only contains one sector of data.
1796 */
1797 crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum);
1798
1799 if (memcmp(csum, sector->csum, fs_info->csum_size))
1800 sblock->checksum_error = 1;
1801 return sblock->checksum_error;
1802}
1803
1804static int scrub_checksum_tree_block(struct scrub_block *sblock)
1805{
1806 struct scrub_ctx *sctx = sblock->sctx;
1807 struct btrfs_header *h;
1808 struct btrfs_fs_info *fs_info = sctx->fs_info;
1809 SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1810 u8 calculated_csum[BTRFS_CSUM_SIZE];
1811 u8 on_disk_csum[BTRFS_CSUM_SIZE];
1812 /*
1813 * This is done in sectorsize steps even for metadata as there's a
1814 * constraint for nodesize to be aligned to sectorsize. This will need
1815 * to change so we don't misuse data and metadata units like that.
1816 */
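	/*
	 * For example, a 16KiB nodesize with a 4KiB sectorsize gives four
	 * sectors per tree block: the first one contributes sectorsize -
	 * BTRFS_CSUM_SIZE bytes to the checksum (the on-disk csum area itself
	 * is skipped) and the remaining three contribute full sectors.
	 */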
1817 const u32 sectorsize = sctx->fs_info->sectorsize;
1818 const int num_sectors = fs_info->nodesize >> fs_info->sectorsize_bits;
1819 int i;
1820 struct scrub_sector *sector;
1821 char *kaddr;
1822
1823 BUG_ON(sblock->sector_count < 1);
1824
1825 /* Each member in sectors is just one sector */
1826 ASSERT(sblock->sector_count == num_sectors);
1827
1828 sector = sblock->sectors[0];
1829 kaddr = page_address(sector->page);
1830 h = (struct btrfs_header *)kaddr;
1831 memcpy(on_disk_csum, h->csum, sctx->fs_info->csum_size);
1832
1833 /*
1834 * we don't use the getter functions here, as we
1835 * a) don't have an extent buffer and
1836 * b) the page is already kmapped
1837 */
1838 if (sector->logical != btrfs_stack_header_bytenr(h))
1839 sblock->header_error = 1;
1840
1841 if (sector->generation != btrfs_stack_header_generation(h)) {
1842 sblock->header_error = 1;
1843 sblock->generation_error = 1;
1844 }
1845
1846 if (!scrub_check_fsid(h->fsid, sector))
1847 sblock->header_error = 1;
1848
1849 if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1850 BTRFS_UUID_SIZE))
1851 sblock->header_error = 1;
1852
1853 shash->tfm = fs_info->csum_shash;
1854 crypto_shash_init(shash);
1855 crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
1856 sectorsize - BTRFS_CSUM_SIZE);
1857
1858 for (i = 1; i < num_sectors; i++) {
1859 kaddr = page_address(sblock->sectors[i]->page);
1860 crypto_shash_update(shash, kaddr, sectorsize);
1861 }
1862
1863 crypto_shash_final(shash, calculated_csum);
1864 if (memcmp(calculated_csum, on_disk_csum, sctx->fs_info->csum_size))
1865 sblock->checksum_error = 1;
1866
1867 return sblock->header_error || sblock->checksum_error;
1868}
1869
1870static int scrub_checksum_super(struct scrub_block *sblock)
1871{
1872 struct btrfs_super_block *s;
1873 struct scrub_ctx *sctx = sblock->sctx;
1874 struct btrfs_fs_info *fs_info = sctx->fs_info;
1875 SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1876 u8 calculated_csum[BTRFS_CSUM_SIZE];
1877 struct scrub_sector *sector;
1878 char *kaddr;
1879 int fail_gen = 0;
1880 int fail_cor = 0;
1881
1882 BUG_ON(sblock->sector_count < 1);
1883 sector = sblock->sectors[0];
1884 kaddr = page_address(sector->page);
1885 s = (struct btrfs_super_block *)kaddr;
1886
1887 if (sector->logical != btrfs_super_bytenr(s))
1888 ++fail_cor;
1889
1890 if (sector->generation != btrfs_super_generation(s))
1891 ++fail_gen;
1892
1893 if (!scrub_check_fsid(s->fsid, sector))
1894 ++fail_cor;
1895
1896 shash->tfm = fs_info->csum_shash;
1897 crypto_shash_init(shash);
1898 crypto_shash_digest(shash, kaddr + BTRFS_CSUM_SIZE,
1899 BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, calculated_csum);
1900
1901 if (memcmp(calculated_csum, s->csum, sctx->fs_info->csum_size))
1902 ++fail_cor;
1903
1904 if (fail_cor + fail_gen) {
1905 /*
1906	 * If we find an error in a super block, we just report it.
1907	 * Super blocks get rewritten with the next transaction commit
1908	 * anyway.
1909 */
1910 spin_lock(&sctx->stat_lock);
1911 ++sctx->stat.super_errors;
1912 spin_unlock(&sctx->stat_lock);
1913 if (fail_cor)
1914 btrfs_dev_stat_inc_and_print(sector->dev,
1915 BTRFS_DEV_STAT_CORRUPTION_ERRS);
1916 else
1917 btrfs_dev_stat_inc_and_print(sector->dev,
1918 BTRFS_DEV_STAT_GENERATION_ERRS);
1919 }
1920
1921 return fail_cor + fail_gen;
1922}
1923
1924static void scrub_block_get(struct scrub_block *sblock)
1925{
1926 refcount_inc(&sblock->refs);
1927}
1928
1929static void scrub_block_put(struct scrub_block *sblock)
1930{
1931 if (refcount_dec_and_test(&sblock->refs)) {
1932 int i;
1933
1934 if (sblock->sparity)
1935 scrub_parity_put(sblock->sparity);
1936
1937 for (i = 0; i < sblock->sector_count; i++)
1938 scrub_sector_put(sblock->sectors[i]);
1939 kfree(sblock);
1940 }
1941}
1942
1943static void scrub_sector_get(struct scrub_sector *sector)
1944{
1945	atomic_inc(&sector->refs);
1946}
1947
1948static void scrub_sector_put(struct scrub_sector *sector)
1949{
1950	if (atomic_dec_and_test(&sector->refs)) {
1951 if (sector->page)
1952 __free_page(sector->page);
1953 kfree(sector);
1954 }
1955}
1956
1957/*
1958 * Throttling of IO submission, bandwidth-limit based; the time slice is 1
1959 * second. The limit can be set via /sys/fs/btrfs/UUID/devinfo/devid/scrub_speed_max.
1960 */
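/*
 * Worked example (assuming scrub_speed_max = 100MiB/s): div below becomes
 * min(64, max(1, 100MiB / 16MiB)) = 6, so the deadline is set time_slice / div
 * (~166ms) ahead and roughly bwlimit / div (~16.6MiB) of bio bytes may be
 * submitted per epoch before the thread sleeps until the deadline.
 */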
1961static void scrub_throttle(struct scrub_ctx *sctx)
1962{
1963 const int time_slice = 1000;
1964 struct scrub_bio *sbio;
1965 struct btrfs_device *device;
1966 s64 delta;
1967 ktime_t now;
1968 u32 div;
1969 u64 bwlimit;
1970
1971 sbio = sctx->bios[sctx->curr];
1972 device = sbio->dev;
1973 bwlimit = READ_ONCE(device->scrub_speed_max);
1974 if (bwlimit == 0)
1975 return;
1976
1977 /*
1978	 * The slice is divided into intervals as the IO is submitted; the number
1979	 * of intervals is derived from bwlimit and capped at a maximum of 64.
1980 */
1981 div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024)));
1982 div = min_t(u32, 64, div);
1983
1984 /* Start new epoch, set deadline */
1985 now = ktime_get();
1986 if (sctx->throttle_deadline == 0) {
1987 sctx->throttle_deadline = ktime_add_ms(now, time_slice / div);
1988 sctx->throttle_sent = 0;
1989 }
1990
1991 /* Still in the time to send? */
1992 if (ktime_before(now, sctx->throttle_deadline)) {
1993 /* If current bio is within the limit, send it */
1994 sctx->throttle_sent += sbio->bio->bi_iter.bi_size;
1995 if (sctx->throttle_sent <= div_u64(bwlimit, div))
1996 return;
1997
1998 /* We're over the limit, sleep until the rest of the slice */
1999 delta = ktime_ms_delta(sctx->throttle_deadline, now);
2000 } else {
2001 /* New request after deadline, start new epoch */
2002 delta = 0;
2003 }
2004
2005 if (delta) {
2006 long timeout;
2007
2008 timeout = div_u64(delta * HZ, 1000);
2009 schedule_timeout_interruptible(timeout);
2010 }
2011
2012 /* Next call will start the deadline period */
2013 sctx->throttle_deadline = 0;
2014}
2015
2016static void scrub_submit(struct scrub_ctx *sctx)
2017{
2018 struct scrub_bio *sbio;
2019
2020 if (sctx->curr == -1)
2021 return;
2022
2023 scrub_throttle(sctx);
2024
2025 sbio = sctx->bios[sctx->curr];
2026 sctx->curr = -1;
2027 scrub_pending_bio_inc(sctx);
2028 btrfsic_check_bio(sbio->bio);
2029 submit_bio(sbio->bio);
2030}
2031
2032static int scrub_add_sector_to_rd_bio(struct scrub_ctx *sctx,
2033 struct scrub_sector *sector)
2034{
2035 struct scrub_block *sblock = sector->sblock;
2036 struct scrub_bio *sbio;
2037 const u32 sectorsize = sctx->fs_info->sectorsize;
2038 int ret;
2039
2040again:
2041 /*
2042 * grab a fresh bio or wait for one to become available
2043 */
2044 while (sctx->curr == -1) {
2045 spin_lock(&sctx->list_lock);
2046 sctx->curr = sctx->first_free;
2047 if (sctx->curr != -1) {
2048 sctx->first_free = sctx->bios[sctx->curr]->next_free;
2049 sctx->bios[sctx->curr]->next_free = -1;
2050 sctx->bios[sctx->curr]->sector_count = 0;
2051 spin_unlock(&sctx->list_lock);
2052 } else {
2053 spin_unlock(&sctx->list_lock);
2054 wait_event(sctx->list_wait, sctx->first_free != -1);
2055 }
2056 }
2057 sbio = sctx->bios[sctx->curr];
2058 if (sbio->sector_count == 0) {
2059 sbio->physical = sector->physical;
2060 sbio->logical = sector->logical;
2061 sbio->dev = sector->dev;
2062 if (!sbio->bio) {
2063 sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio,
2064 REQ_OP_READ, GFP_NOFS);
2065 }
2066 sbio->bio->bi_private = sbio;
2067 sbio->bio->bi_end_io = scrub_bio_end_io;
2068 sbio->bio->bi_iter.bi_sector = sbio->physical >> 9;
2069 sbio->status = 0;
2070 } else if (sbio->physical + sbio->sector_count * sectorsize !=
2071 sector->physical ||
2072 sbio->logical + sbio->sector_count * sectorsize !=
2073 sector->logical ||
2074 sbio->dev != sector->dev) {
2075 scrub_submit(sctx);
2076 goto again;
2077 }
2078
2079 sbio->sectors[sbio->sector_count] = sector;
2080 ret = bio_add_page(sbio->bio, sector->page, sectorsize, 0);
2081 if (ret != sectorsize) {
2082 if (sbio->sector_count < 1) {
2083 bio_put(sbio->bio);
2084 sbio->bio = NULL;
2085 return -EIO;
2086 }
2087 scrub_submit(sctx);
2088 goto again;
2089 }
2090
2091 scrub_block_get(sblock); /* one for the page added to the bio */
2092 atomic_inc(&sblock->outstanding_sectors);
2093 sbio->sector_count++;
2094 if (sbio->sector_count == sctx->sectors_per_bio)
2095 scrub_submit(sctx);
2096
2097 return 0;
2098}
2099
2100static void scrub_missing_raid56_end_io(struct bio *bio)
2101{
2102 struct scrub_block *sblock = bio->bi_private;
2103 struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
2104
2105 if (bio->bi_status)
2106 sblock->no_io_error_seen = 0;
2107
2108 bio_put(bio);
2109
2110 queue_work(fs_info->scrub_workers, &sblock->work);
2111}
2112
2113static void scrub_missing_raid56_worker(struct work_struct *work)
2114{
2115 struct scrub_block *sblock = container_of(work, struct scrub_block, work);
2116 struct scrub_ctx *sctx = sblock->sctx;
2117 struct btrfs_fs_info *fs_info = sctx->fs_info;
2118 u64 logical;
2119 struct btrfs_device *dev;
2120
2121 logical = sblock->sectors[0]->logical;
2122 dev = sblock->sectors[0]->dev;
2123
2124 if (sblock->no_io_error_seen)
2125 scrub_recheck_block_checksum(sblock);
2126
2127 if (!sblock->no_io_error_seen) {
2128 spin_lock(&sctx->stat_lock);
2129 sctx->stat.read_errors++;
2130 spin_unlock(&sctx->stat_lock);
2131 btrfs_err_rl_in_rcu(fs_info,
2132 "IO error rebuilding logical %llu for dev %s",
2133 logical, rcu_str_deref(dev->name));
2134 } else if (sblock->header_error || sblock->checksum_error) {
2135 spin_lock(&sctx->stat_lock);
2136 sctx->stat.uncorrectable_errors++;
2137 spin_unlock(&sctx->stat_lock);
2138 btrfs_err_rl_in_rcu(fs_info,
2139 "failed to rebuild valid logical %llu for dev %s",
2140 logical, rcu_str_deref(dev->name));
2141 } else {
2142 scrub_write_block_to_dev_replace(sblock);
2143 }
2144
2145 if (sctx->is_dev_replace && sctx->flush_all_writes) {
2146 mutex_lock(&sctx->wr_lock);
2147 scrub_wr_submit(sctx);
2148 mutex_unlock(&sctx->wr_lock);
2149 }
2150
2151 scrub_block_put(sblock);
2152 scrub_pending_bio_dec(sctx);
2153}
2154
2155static void scrub_missing_raid56_pages(struct scrub_block *sblock)
2156{
2157 struct scrub_ctx *sctx = sblock->sctx;
2158 struct btrfs_fs_info *fs_info = sctx->fs_info;
2159 u64 length = sblock->sector_count << fs_info->sectorsize_bits;
2160 u64 logical = sblock->sectors[0]->logical;
2161 struct btrfs_io_context *bioc = NULL;
2162 struct bio *bio;
2163 struct btrfs_raid_bio *rbio;
2164 int ret;
2165 int i;
2166
2167 btrfs_bio_counter_inc_blocked(fs_info);
2168 ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
2169 &length, &bioc);
2170 if (ret || !bioc || !bioc->raid_map)
2171 goto bioc_out;
2172
2173 if (WARN_ON(!sctx->is_dev_replace ||
2174 !(bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) {
2175 /*
2176 * We shouldn't be scrubbing a missing device. Even for dev
2177 * replace, we should only get here for RAID 5/6. We either
2178 * managed to mount something with no mirrors remaining or
2179 * there's a bug in scrub_find_good_copy()/btrfs_map_block().
2180 */
2181 goto bioc_out;
2182 }
2183
2184 bio = bio_alloc(NULL, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
2185 bio->bi_iter.bi_sector = logical >> 9;
2186 bio->bi_private = sblock;
2187 bio->bi_end_io = scrub_missing_raid56_end_io;
2188
2189 rbio = raid56_alloc_missing_rbio(bio, bioc);
2190 if (!rbio)
2191 goto rbio_out;
2192
2193 for (i = 0; i < sblock->sector_count; i++) {
2194 struct scrub_sector *sector = sblock->sectors[i];
2195
2196 /*
2197 * For now, our scrub is still one page per sector, so pgoff
2198 * is always 0.
2199 */
2200 raid56_add_scrub_pages(rbio, sector->page, 0, sector->logical);
2201 }
2202
2203 INIT_WORK(&sblock->work, scrub_missing_raid56_worker);
2204 scrub_block_get(sblock);
2205 scrub_pending_bio_inc(sctx);
2206 raid56_submit_missing_rbio(rbio);
2207 return;
2208
2209rbio_out:
2210 bio_put(bio);
2211bioc_out:
2212 btrfs_bio_counter_dec(fs_info);
2213 btrfs_put_bioc(bioc);
2214 spin_lock(&sctx->stat_lock);
2215 sctx->stat.malloc_errors++;
2216 spin_unlock(&sctx->stat_lock);
2217}
2218
2219static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len,
2220 u64 physical, struct btrfs_device *dev, u64 flags,
2221 u64 gen, int mirror_num, u8 *csum,
2222 u64 physical_for_dev_replace)
2223{
2224 struct scrub_block *sblock;
2225 const u32 sectorsize = sctx->fs_info->sectorsize;
2226 int index;
2227
2228 sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
2229 if (!sblock) {
2230 spin_lock(&sctx->stat_lock);
2231 sctx->stat.malloc_errors++;
2232 spin_unlock(&sctx->stat_lock);
2233 return -ENOMEM;
2234 }
2235
2236 /* one ref inside this function, plus one for each page added to
2237 * a bio later on */
2238 refcount_set(&sblock->refs, 1);
2239 sblock->sctx = sctx;
2240 sblock->no_io_error_seen = 1;
2241
2242 for (index = 0; len > 0; index++) {
2243 struct scrub_sector *sector;
2244 /*
2245 * Here we will allocate one page for one sector to scrub.
2246 * This is fine if PAGE_SIZE == sectorsize, but will cost
2247		 * more memory in the PAGE_SIZE > sectorsize case.
2248 */
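		/*
		 * E.g. on an assumed 64KiB PAGE_SIZE system with a 4KiB
		 * sectorsize, every 4KiB sector still gets its own 64KiB
		 * page, a 16x memory overhead compared to 4KiB pages.
		 */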
2249 u32 l = min(sectorsize, len);
2250
2251 sector = kzalloc(sizeof(*sector), GFP_KERNEL);
2252 if (!sector) {
2253leave_nomem:
2254 spin_lock(&sctx->stat_lock);
2255 sctx->stat.malloc_errors++;
2256 spin_unlock(&sctx->stat_lock);
2257 scrub_block_put(sblock);
2258 return -ENOMEM;
2259 }
2260 ASSERT(index < SCRUB_MAX_SECTORS_PER_BLOCK);
2261 scrub_sector_get(sector);
2262 sblock->sectors[index] = sector;
2263 sector->sblock = sblock;
2264 sector->dev = dev;
2265 sector->flags = flags;
2266 sector->generation = gen;
2267 sector->logical = logical;
2268 sector->physical = physical;
2269 sector->physical_for_dev_replace = physical_for_dev_replace;
2270 sector->mirror_num = mirror_num;
2271 if (csum) {
2272 sector->have_csum = 1;
2273 memcpy(sector->csum, csum, sctx->fs_info->csum_size);
2274 } else {
2275 sector->have_csum = 0;
2276 }
2277 sblock->sector_count++;
2278 sector->page = alloc_page(GFP_KERNEL);
2279 if (!sector->page)
2280 goto leave_nomem;
2281 len -= l;
2282 logical += l;
2283 physical += l;
2284 physical_for_dev_replace += l;
2285 }
2286
2287 WARN_ON(sblock->sector_count == 0);
2288 if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
2289 /*
2290 * This case should only be hit for RAID 5/6 device replace. See
2291 * the comment in scrub_missing_raid56_pages() for details.
2292 */
2293 scrub_missing_raid56_pages(sblock);
2294 } else {
2295 for (index = 0; index < sblock->sector_count; index++) {
2296 struct scrub_sector *sector = sblock->sectors[index];
2297 int ret;
2298
2299 ret = scrub_add_sector_to_rd_bio(sctx, sector);
2300 if (ret) {
2301 scrub_block_put(sblock);
2302 return ret;
2303 }
2304 }
2305
2306 if (flags & BTRFS_EXTENT_FLAG_SUPER)
2307 scrub_submit(sctx);
2308 }
2309
2310 /* last one frees, either here or in bio completion for last page */
2311 scrub_block_put(sblock);
2312 return 0;
2313}
2314
2315static void scrub_bio_end_io(struct bio *bio)
2316{
2317 struct scrub_bio *sbio = bio->bi_private;
2318 struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
2319
2320 sbio->status = bio->bi_status;
2321 sbio->bio = bio;
2322
2323 queue_work(fs_info->scrub_workers, &sbio->work);
2324}
2325
2326static void scrub_bio_end_io_worker(struct work_struct *work)
2327{
2328 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
2329 struct scrub_ctx *sctx = sbio->sctx;
2330 int i;
2331
2332 ASSERT(sbio->sector_count <= SCRUB_SECTORS_PER_BIO);
2333 if (sbio->status) {
2334 for (i = 0; i < sbio->sector_count; i++) {
2335 struct scrub_sector *sector = sbio->sectors[i];
2336
2337 sector->io_error = 1;
2338 sector->sblock->no_io_error_seen = 0;
2339 }
2340 }
2341
2342 /* Now complete the scrub_block items that have all pages completed */
2343 for (i = 0; i < sbio->sector_count; i++) {
2344 struct scrub_sector *sector = sbio->sectors[i];
2345 struct scrub_block *sblock = sector->sblock;
2346
2347 if (atomic_dec_and_test(&sblock->outstanding_sectors))
2348 scrub_block_complete(sblock);
2349 scrub_block_put(sblock);
2350 }
2351
2352 bio_put(sbio->bio);
2353 sbio->bio = NULL;
2354 spin_lock(&sctx->list_lock);
2355 sbio->next_free = sctx->first_free;
2356 sctx->first_free = sbio->index;
2357 spin_unlock(&sctx->list_lock);
2358
2359 if (sctx->is_dev_replace && sctx->flush_all_writes) {
2360 mutex_lock(&sctx->wr_lock);
2361 scrub_wr_submit(sctx);
2362 mutex_unlock(&sctx->wr_lock);
2363 }
2364
2365 scrub_pending_bio_dec(sctx);
2366}
2367
2368static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
2369 unsigned long *bitmap,
2370 u64 start, u32 len)
2371{
2372 u64 offset;
2373 u32 nsectors;
2374 u32 sectorsize_bits = sparity->sctx->fs_info->sectorsize_bits;
2375
2376 if (len >= sparity->stripe_len) {
2377 bitmap_set(bitmap, 0, sparity->nsectors);
2378 return;
2379 }
2380
2381 start -= sparity->logic_start;
2382 start = div64_u64_rem(start, sparity->stripe_len, &offset);
2383 offset = offset >> sectorsize_bits;
2384 nsectors = len >> sectorsize_bits;
2385
2386 if (offset + nsectors <= sparity->nsectors) {
2387 bitmap_set(bitmap, offset, nsectors);
2388 return;
2389 }
2390
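	/*
	 * The range wraps past the end of the full stripe.  E.g. with an
	 * assumed sparity->nsectors = 16, offset = 12 and a range of 8
	 * sectors, bits 12-15 and bits 0-3 get set below.
	 */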
2391 bitmap_set(bitmap, offset, sparity->nsectors - offset);
2392 bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset));
2393}
2394
2395static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity,
2396 u64 start, u32 len)
2397{
2398 __scrub_mark_bitmap(sparity, &sparity->ebitmap, start, len);
2399}
2400
2401static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity,
2402 u64 start, u32 len)
2403{
2404 __scrub_mark_bitmap(sparity, &sparity->dbitmap, start, len);
2405}
2406
2407static void scrub_block_complete(struct scrub_block *sblock)
2408{
2409 int corrupted = 0;
2410
2411 if (!sblock->no_io_error_seen) {
2412 corrupted = 1;
2413 scrub_handle_errored_block(sblock);
2414 } else {
2415 /*
2416		 * In the dev-replace case, a block with a checksum error is
2417		 * written to the target via the repair mechanism; otherwise
2418		 * it is written here.
2419 */
2420 corrupted = scrub_checksum(sblock);
2421 if (!corrupted && sblock->sctx->is_dev_replace)
2422 scrub_write_block_to_dev_replace(sblock);
2423 }
2424
2425 if (sblock->sparity && corrupted && !sblock->data_corrected) {
2426 u64 start = sblock->sectors[0]->logical;
2427 u64 end = sblock->sectors[sblock->sector_count - 1]->logical +
2428 sblock->sctx->fs_info->sectorsize;
2429
2430 ASSERT(end - start <= U32_MAX);
2431 scrub_parity_mark_sectors_error(sblock->sparity,
2432 start, end - start);
2433 }
2434}
2435
2436static void drop_csum_range(struct scrub_ctx *sctx, struct btrfs_ordered_sum *sum)
2437{
2438 sctx->stat.csum_discards += sum->len >> sctx->fs_info->sectorsize_bits;
2439 list_del(&sum->list);
2440 kfree(sum);
2441}
2442
2443/*
2444 * Find the desired csum for range [logical, logical + sectorsize), and store
2445 * the csum into @csum.
2446 *
2447 * The search source is sctx->csum_list, which is a pre-populated list
2448 * storing bytenr ordered csum ranges. We're responsible for cleaning up any
2449 * range that is before @logical.
2450 *
2451 * Return 0 if there is no csum for the range.
2452 * Return 1 if there is a csum for the range, and it is copied to @csum.
2453 */
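/*
 * Worked example (assumed numbers): with a 4KiB sectorsize and a csum range
 * starting at bytenr 1MiB covering 64KiB, a lookup at logical 1MiB + 8KiB
 * copies the csum at index (8KiB >> 12) = 2 out of num_sectors = 16.
 */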
2454static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum)
2455{
2456 bool found = false;
2457
2458 while (!list_empty(&sctx->csum_list)) {
2459 struct btrfs_ordered_sum *sum = NULL;
2460 unsigned long index;
2461 unsigned long num_sectors;
2462
2463 sum = list_first_entry(&sctx->csum_list,
2464 struct btrfs_ordered_sum, list);
2465 /* The current csum range is beyond our range, no csum found */
2466 if (sum->bytenr > logical)
2467 break;
2468
2469 /*
2470		 * The current sum is before our bytenr. Since scrub is always
2471		 * done in bytenr order, the csum will never be used anymore;
2472		 * clean it up so that later calls won't bother with the range,
2473		 * and continue searching the next range.
2474 */
2475 if (sum->bytenr + sum->len <= logical) {
2476 drop_csum_range(sctx, sum);
2477 continue;
2478 }
2479
2480 /* Now the csum range covers our bytenr, copy the csum */
2481 found = true;
2482 index = (logical - sum->bytenr) >> sctx->fs_info->sectorsize_bits;
2483 num_sectors = sum->len >> sctx->fs_info->sectorsize_bits;
2484
2485 memcpy(csum, sum->sums + index * sctx->fs_info->csum_size,
2486 sctx->fs_info->csum_size);
2487
2488 /* Cleanup the range if we're at the end of the csum range */
2489 if (index == num_sectors - 1)
2490 drop_csum_range(sctx, sum);
2491 break;
2492 }
2493 if (!found)
2494 return 0;
2495 return 1;
2496}
2497
2498/* scrub extent tries to collect up to 64 kB for each bio */
2499static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
2500 u64 logical, u32 len,
2501 u64 physical, struct btrfs_device *dev, u64 flags,
2502 u64 gen, int mirror_num)
2503{
2504 struct btrfs_device *src_dev = dev;
2505 u64 src_physical = physical;
2506 int src_mirror = mirror_num;
2507 int ret;
2508 u8 csum[BTRFS_CSUM_SIZE];
2509 u32 blocksize;
2510
2511 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2512 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
2513 blocksize = map->stripe_len;
2514 else
2515 blocksize = sctx->fs_info->sectorsize;
2516 spin_lock(&sctx->stat_lock);
2517 sctx->stat.data_extents_scrubbed++;
2518 sctx->stat.data_bytes_scrubbed += len;
2519 spin_unlock(&sctx->stat_lock);
2520 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2521 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
2522 blocksize = map->stripe_len;
2523 else
2524 blocksize = sctx->fs_info->nodesize;
2525 spin_lock(&sctx->stat_lock);
2526 sctx->stat.tree_extents_scrubbed++;
2527 sctx->stat.tree_bytes_scrubbed += len;
2528 spin_unlock(&sctx->stat_lock);
2529 } else {
2530 blocksize = sctx->fs_info->sectorsize;
2531 WARN_ON(1);
2532 }
2533
2534 /*
2535	 * For the dev-replace case, @dev can be a missing device.
2536	 * Regular scrub avoids running on a missing device at all,
2537	 * as that would trigger tons of read errors.
2538	 *
2539	 * Reading from a missing device would cause the read error counts to
2540	 * increase unnecessarily.
2541	 * So here we change the read source to a good mirror.
2542 */
2543 if (sctx->is_dev_replace && !dev->bdev)
2544 scrub_find_good_copy(sctx->fs_info, logical, len, &src_physical,
2545 &src_dev, &src_mirror);
2546 while (len) {
2547 u32 l = min(len, blocksize);
2548 int have_csum = 0;
2549
2550 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2551 /* push csums to sbio */
2552 have_csum = scrub_find_csum(sctx, logical, csum);
2553 if (have_csum == 0)
2554 ++sctx->stat.no_csum;
2555 }
2556 ret = scrub_sectors(sctx, logical, l, src_physical, src_dev,
2557 flags, gen, src_mirror,
2558 have_csum ? csum : NULL, physical);
2559 if (ret)
2560 return ret;
2561 len -= l;
2562 logical += l;
2563 physical += l;
2564 src_physical += l;
2565 }
2566 return 0;
2567}
2568
2569static int scrub_sectors_for_parity(struct scrub_parity *sparity,
2570 u64 logical, u32 len,
2571 u64 physical, struct btrfs_device *dev,
2572 u64 flags, u64 gen, int mirror_num, u8 *csum)
2573{
2574 struct scrub_ctx *sctx = sparity->sctx;
2575 struct scrub_block *sblock;
2576 const u32 sectorsize = sctx->fs_info->sectorsize;
2577 int index;
2578
2579 ASSERT(IS_ALIGNED(len, sectorsize));
2580
2581 sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
2582 if (!sblock) {
2583 spin_lock(&sctx->stat_lock);
2584 sctx->stat.malloc_errors++;
2585 spin_unlock(&sctx->stat_lock);
2586 return -ENOMEM;
2587 }
2588
2589 /* one ref inside this function, plus one for each page added to
2590 * a bio later on */
2591 refcount_set(&sblock->refs, 1);
2592 sblock->sctx = sctx;
2593 sblock->no_io_error_seen = 1;
2594 sblock->sparity = sparity;
2595 scrub_parity_get(sparity);
2596
2597 for (index = 0; len > 0; index++) {
2598 struct scrub_sector *sector;
2599
2600 sector = kzalloc(sizeof(*sector), GFP_KERNEL);
2601 if (!sector) {
2602leave_nomem:
2603 spin_lock(&sctx->stat_lock);
2604 sctx->stat.malloc_errors++;
2605 spin_unlock(&sctx->stat_lock);
2606 scrub_block_put(sblock);
2607 return -ENOMEM;
2608 }
2609 ASSERT(index < SCRUB_MAX_SECTORS_PER_BLOCK);
2610 /* For scrub block */
2611 scrub_sector_get(sector);
2612 sblock->sectors[index] = sector;
2613 /* For scrub parity */
2614 scrub_sector_get(sector);
2615		list_add_tail(&sector->list, &sparity->sectors_list);
2616 sector->sblock = sblock;
2617 sector->dev = dev;
2618 sector->flags = flags;
2619 sector->generation = gen;
2620 sector->logical = logical;
2621 sector->physical = physical;
2622 sector->mirror_num = mirror_num;
2623 if (csum) {
2624 sector->have_csum = 1;
2625 memcpy(sector->csum, csum, sctx->fs_info->csum_size);
2626 } else {
2627 sector->have_csum = 0;
2628 }
2629 sblock->sector_count++;
2630 sector->page = alloc_page(GFP_KERNEL);
2631 if (!sector->page)
2632 goto leave_nomem;
2633
2634
2635 /* Iterate over the stripe range in sectorsize steps */
2636 len -= sectorsize;
2637 logical += sectorsize;
2638 physical += sectorsize;
2639 }
2640
2641 WARN_ON(sblock->sector_count == 0);
2642 for (index = 0; index < sblock->sector_count; index++) {
2643 struct scrub_sector *sector = sblock->sectors[index];
2644 int ret;
2645
2646 ret = scrub_add_sector_to_rd_bio(sctx, sector);
2647 if (ret) {
2648 scrub_block_put(sblock);
2649 return ret;
2650 }
2651 }
2652
2653 /* Last one frees, either here or in bio completion for last sector */
2654 scrub_block_put(sblock);
2655 return 0;
2656}
2657
2658static int scrub_extent_for_parity(struct scrub_parity *sparity,
2659 u64 logical, u32 len,
2660 u64 physical, struct btrfs_device *dev,
2661 u64 flags, u64 gen, int mirror_num)
2662{
2663 struct scrub_ctx *sctx = sparity->sctx;
2664 int ret;
2665 u8 csum[BTRFS_CSUM_SIZE];
2666 u32 blocksize;
2667
2668 if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
2669 scrub_parity_mark_sectors_error(sparity, logical, len);
2670 return 0;
2671 }
2672
2673 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2674 blocksize = sparity->stripe_len;
2675 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2676 blocksize = sparity->stripe_len;
2677 } else {
2678 blocksize = sctx->fs_info->sectorsize;
2679 WARN_ON(1);
2680 }
2681
2682 while (len) {
2683 u32 l = min(len, blocksize);
2684 int have_csum = 0;
2685
2686 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2687 /* push csums to sbio */
2688 have_csum = scrub_find_csum(sctx, logical, csum);
2689 if (have_csum == 0)
2690 goto skip;
2691 }
2692 ret = scrub_sectors_for_parity(sparity, logical, l, physical, dev,
2693 flags, gen, mirror_num,
2694 have_csum ? csum : NULL);
2695 if (ret)
2696 return ret;
2697skip:
2698 len -= l;
2699 logical += l;
2700 physical += l;
2701 }
2702 return 0;
2703}
2704
2705/*
2706 * Given a physical address, this will calculate its
2707 * logical offset. If this is a parity stripe, it will return
2708 * the left-most data stripe's logical offset.
2709 *
2710 * Return 0 if it is a data stripe, 1 means parity stripe.
2711 */
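/*
 * Worked example (assuming RAID5 with num_stripes = 3, i.e. 2 data stripes,
 * and a 64KiB stripe_len): for device num = 0 at 64KiB into its dev extent,
 * last_offset = 64KiB * 2 = 128KiB.  Full stripe 1 rotates parity onto
 * device 0, so no data stripe matches and the function returns 1 with
 * *offset = 128KiB, the left-most data stripe of that full stripe.
 */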
2712static int get_raid56_logic_offset(u64 physical, int num,
2713 struct map_lookup *map, u64 *offset,
2714 u64 *stripe_start)
2715{
2716 int i;
2717 int j = 0;
2718 u64 stripe_nr;
2719 u64 last_offset;
2720 u32 stripe_index;
2721 u32 rot;
2722 const int data_stripes = nr_data_stripes(map);
2723
2724 last_offset = (physical - map->stripes[num].physical) * data_stripes;
2725 if (stripe_start)
2726 *stripe_start = last_offset;
2727
2728 *offset = last_offset;
2729 for (i = 0; i < data_stripes; i++) {
2730 *offset = last_offset + i * map->stripe_len;
2731
2732 stripe_nr = div64_u64(*offset, map->stripe_len);
2733 stripe_nr = div_u64(stripe_nr, data_stripes);
2734
2735 /* Work out the disk rotation on this stripe-set */
2736 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot);
2737		/* Calculate which stripe this data is located on */
2738 rot += i;
2739 stripe_index = rot % map->num_stripes;
2740 if (stripe_index == num)
2741 return 0;
2742 if (stripe_index < num)
2743 j++;
2744 }
2745 *offset = last_offset + j * map->stripe_len;
2746 return 1;
2747}
2748
2749static void scrub_free_parity(struct scrub_parity *sparity)
2750{
2751 struct scrub_ctx *sctx = sparity->sctx;
2752 struct scrub_sector *curr, *next;
2753 int nbits;
2754
2755 nbits = bitmap_weight(&sparity->ebitmap, sparity->nsectors);
2756 if (nbits) {
2757 spin_lock(&sctx->stat_lock);
2758 sctx->stat.read_errors += nbits;
2759 sctx->stat.uncorrectable_errors += nbits;
2760 spin_unlock(&sctx->stat_lock);
2761 }
2762
2763 list_for_each_entry_safe(curr, next, &sparity->sectors_list, list) {
2764 list_del_init(&curr->list);
2765 scrub_sector_put(curr);
2766 }
2767
2768 kfree(sparity);
2769}
2770
2771static void scrub_parity_bio_endio_worker(struct work_struct *work)
2772{
2773 struct scrub_parity *sparity = container_of(work, struct scrub_parity,
2774 work);
2775 struct scrub_ctx *sctx = sparity->sctx;
2776
2777 scrub_free_parity(sparity);
2778 scrub_pending_bio_dec(sctx);
2779}
2780
2781static void scrub_parity_bio_endio(struct bio *bio)
2782{
2783 struct scrub_parity *sparity = bio->bi_private;
2784 struct btrfs_fs_info *fs_info = sparity->sctx->fs_info;
2785
2786 if (bio->bi_status)
2787 bitmap_or(&sparity->ebitmap, &sparity->ebitmap,
2788 &sparity->dbitmap, sparity->nsectors);
2789
2790 bio_put(bio);
2791
2792 INIT_WORK(&sparity->work, scrub_parity_bio_endio_worker);
2793 queue_work(fs_info->scrub_parity_workers, &sparity->work);
2794}
2795
2796static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
2797{
2798 struct scrub_ctx *sctx = sparity->sctx;
2799 struct btrfs_fs_info *fs_info = sctx->fs_info;
2800 struct bio *bio;
2801 struct btrfs_raid_bio *rbio;
2802 struct btrfs_io_context *bioc = NULL;
2803 u64 length;
2804 int ret;
2805
2806 if (!bitmap_andnot(&sparity->dbitmap, &sparity->dbitmap,
2807 &sparity->ebitmap, sparity->nsectors))
2808 goto out;
2809
2810 length = sparity->logic_end - sparity->logic_start;
2811
2812 btrfs_bio_counter_inc_blocked(fs_info);
2813 ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start,
2814 &length, &bioc);
2815 if (ret || !bioc || !bioc->raid_map)
2816 goto bioc_out;
2817
2818 bio = bio_alloc(NULL, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
2819 bio->bi_iter.bi_sector = sparity->logic_start >> 9;
2820 bio->bi_private = sparity;
2821 bio->bi_end_io = scrub_parity_bio_endio;
2822
2823 rbio = raid56_parity_alloc_scrub_rbio(bio, bioc,
2824 sparity->scrub_dev,
2825 &sparity->dbitmap,
2826 sparity->nsectors);
2827 if (!rbio)
2828 goto rbio_out;
2829
2830 scrub_pending_bio_inc(sctx);
2831 raid56_parity_submit_scrub_rbio(rbio);
2832 return;
2833
2834rbio_out:
2835 bio_put(bio);
2836bioc_out:
2837 btrfs_bio_counter_dec(fs_info);
2838 btrfs_put_bioc(bioc);
2839 bitmap_or(&sparity->ebitmap, &sparity->ebitmap, &sparity->dbitmap,
2840 sparity->nsectors);
2841 spin_lock(&sctx->stat_lock);
2842 sctx->stat.malloc_errors++;
2843 spin_unlock(&sctx->stat_lock);
2844out:
2845 scrub_free_parity(sparity);
2846}
2847
2848static void scrub_parity_get(struct scrub_parity *sparity)
2849{
2850 refcount_inc(&sparity->refs);
2851}
2852
2853static void scrub_parity_put(struct scrub_parity *sparity)
2854{
2855 if (!refcount_dec_and_test(&sparity->refs))
2856 return;
2857
2858 scrub_parity_check_and_repair(sparity);
2859}
2860
2861/*
2862 * Return 0 if the extent item range covers any byte of the range.
2863 * Return <0 if the extent item is before @search_start.
2864 * Return >0 if the extent item is after @search_start + @search_len.
2865 */
2866static int compare_extent_item_range(struct btrfs_path *path,
2867 u64 search_start, u64 search_len)
2868{
2869 struct btrfs_fs_info *fs_info = path->nodes[0]->fs_info;
2870 u64 len;
2871 struct btrfs_key key;
2872
2873 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2874 ASSERT(key.type == BTRFS_EXTENT_ITEM_KEY ||
2875 key.type == BTRFS_METADATA_ITEM_KEY);
2876 if (key.type == BTRFS_METADATA_ITEM_KEY)
2877 len = fs_info->nodesize;
2878 else
2879 len = key.offset;
2880
2881 if (key.objectid + len <= search_start)
2882 return -1;
2883 if (key.objectid >= search_start + search_len)
2884 return 1;
2885 return 0;
2886}
2887
2888/*
2889 * Locate one extent item which covers any byte in range
2890 * [@search_start, @search_start + @search_length)
2891 *
2892 * If the path is not initialized, we will initialize the search by doing
2893 * a btrfs_search_slot().
2894 * If the path is already initialized, we will use the path as the initial
2895 * slot, to avoid duplicated btrfs_search_slot() calls.
2896 *
2897 * NOTE: If an extent item starts before @search_start, we will still
2898 * return the extent item. This is for data extents crossing stripe boundaries.
2899 *
2900 * Return 0 if we found such extent item, and @path will point to the extent item.
2901 * Return >0 if no such extent item can be found, and @path will be released.
2902 * Return <0 if we hit a fatal error, and @path will be released.
2903 */
2904static int find_first_extent_item(struct btrfs_root *extent_root,
2905 struct btrfs_path *path,
2906 u64 search_start, u64 search_len)
2907{
2908 struct btrfs_fs_info *fs_info = extent_root->fs_info;
2909 struct btrfs_key key;
2910 int ret;
2911
2912 /* Continue using the existing path */
2913 if (path->nodes[0])
2914 goto search_forward;
2915
2916 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2917 key.type = BTRFS_METADATA_ITEM_KEY;
2918 else
2919 key.type = BTRFS_EXTENT_ITEM_KEY;
2920 key.objectid = search_start;
2921 key.offset = (u64)-1;
2922
2923 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2924 if (ret < 0)
2925 return ret;
2926
2927 ASSERT(ret > 0);
2928 /*
2929 * Here we intentionally pass 0 as @min_objectid, as there could be
2930 * an extent item starting before @search_start.
2931 */
2932 ret = btrfs_previous_extent_item(extent_root, path, 0);
2933 if (ret < 0)
2934 return ret;
2935 /*
2936 * No matter whether we have found an extent item, the next loop will
2937 * properly do every check on the key.
2938 */
2939search_forward:
2940 while (true) {
2941 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2942 if (key.objectid >= search_start + search_len)
2943 break;
2944 if (key.type != BTRFS_METADATA_ITEM_KEY &&
2945 key.type != BTRFS_EXTENT_ITEM_KEY)
2946 goto next;
2947
2948 ret = compare_extent_item_range(path, search_start, search_len);
2949 if (ret == 0)
2950 return ret;
2951 if (ret > 0)
2952 break;
2953next:
2954 path->slots[0]++;
2955 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
2956 ret = btrfs_next_leaf(extent_root, path);
2957 if (ret) {
2958 /* Either no more item or fatal error */
2959 btrfs_release_path(path);
2960 return ret;
2961 }
2962 }
2963 }
2964 btrfs_release_path(path);
2965 return 1;
2966}
2967
2968static void get_extent_info(struct btrfs_path *path, u64 *extent_start_ret,
2969 u64 *size_ret, u64 *flags_ret, u64 *generation_ret)
2970{
2971 struct btrfs_key key;
2972 struct btrfs_extent_item *ei;
2973
2974 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2975 ASSERT(key.type == BTRFS_METADATA_ITEM_KEY ||
2976 key.type == BTRFS_EXTENT_ITEM_KEY);
2977 *extent_start_ret = key.objectid;
2978 if (key.type == BTRFS_METADATA_ITEM_KEY)
2979 *size_ret = path->nodes[0]->fs_info->nodesize;
2980 else
2981 *size_ret = key.offset;
2982 ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_extent_item);
2983 *flags_ret = btrfs_extent_flags(path->nodes[0], ei);
2984 *generation_ret = btrfs_extent_generation(path->nodes[0], ei);
2985}
2986
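/*
 * E.g. (assumed numbers) with a boundary range of [64KiB, 128KiB): an extent
 * [32KiB, 96KiB) crosses the start, an extent [96KiB, 160KiB) crosses the
 * end, and an extent fully inside [64KiB, 128KiB) crosses neither.
 */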
2987static bool does_range_cross_boundary(u64 extent_start, u64 extent_len,
2988				      u64 boundary_start, u64 boundary_len)
2989{
2990 return (extent_start < boundary_start &&
2991 extent_start + extent_len > boundary_start) ||
2992	       (extent_start < boundary_start + boundary_len &&
2993		extent_start + extent_len > boundary_start + boundary_len);
2994}
2995
2996static int scrub_raid56_data_stripe_for_parity(struct scrub_ctx *sctx,
2997 struct scrub_parity *sparity,
2998 struct map_lookup *map,
2999 struct btrfs_device *sdev,
3000 struct btrfs_path *path,
3001 u64 logical)
3002{
3003 struct btrfs_fs_info *fs_info = sctx->fs_info;
3004 struct btrfs_root *extent_root = btrfs_extent_root(fs_info, logical);
3005 struct btrfs_root *csum_root = btrfs_csum_root(fs_info, logical);
3006 u64 cur_logical = logical;
3007 int ret;
3008
3009 ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK);
3010
3011 /* Path must not be populated */
3012 ASSERT(!path->nodes[0]);
3013
3014 while (cur_logical < logical + map->stripe_len) {
3015 struct btrfs_io_context *bioc = NULL;
3016 struct btrfs_device *extent_dev;
3017 u64 extent_start;
3018 u64 extent_size;
3019 u64 mapped_length;
3020 u64 extent_flags;
3021 u64 extent_gen;
3022 u64 extent_physical;
3023 u64 extent_mirror_num;
3024
3025 ret = find_first_extent_item(extent_root, path, cur_logical,
3026 logical + map->stripe_len - cur_logical);
3027 /* No more extent item in this data stripe */
3028 if (ret > 0) {
3029 ret = 0;
3030 break;
3031 }
3032 if (ret < 0)
3033 break;
3034 get_extent_info(path, &extent_start, &extent_size, &extent_flags,
3035 &extent_gen);
3036
3037 /* Metadata should not cross stripe boundaries */
3038 if ((extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
3039 does_range_cross_boundary(extent_start, extent_size,
3040 logical, map->stripe_len)) {
3041 btrfs_err(fs_info,
3042 "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
3043 extent_start, logical);
3044 spin_lock(&sctx->stat_lock);
3045 sctx->stat.uncorrectable_errors++;
3046 spin_unlock(&sctx->stat_lock);
3047 cur_logical += extent_size;
3048 continue;
3049 }
3050
3051 /* Skip hole range which doesn't have any extent */
3052 cur_logical = max(extent_start, cur_logical);
3053
3054 /* Truncate the range inside this data stripe */
3055 extent_size = min(extent_start + extent_size,
3056 logical + map->stripe_len) - cur_logical;
3057 extent_start = cur_logical;
3058 ASSERT(extent_size <= U32_MAX);
3059
3060 scrub_parity_mark_sectors_data(sparity, extent_start, extent_size);
3061
3062 mapped_length = extent_size;
3063 ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_start,
3064 &mapped_length, &bioc, 0);
3065 if (!ret && (!bioc || mapped_length < extent_size))
3066 ret = -EIO;
3067 if (ret) {
3068 btrfs_put_bioc(bioc);
3069 scrub_parity_mark_sectors_error(sparity, extent_start,
3070 extent_size);
3071 break;
3072 }
3073 extent_physical = bioc->stripes[0].physical;
3074 extent_mirror_num = bioc->mirror_num;
3075 extent_dev = bioc->stripes[0].dev;
3076 btrfs_put_bioc(bioc);
3077
3078 ret = btrfs_lookup_csums_range(csum_root, extent_start,
3079 extent_start + extent_size - 1,
3080 &sctx->csum_list, 1);
3081 if (ret) {
3082 scrub_parity_mark_sectors_error(sparity, extent_start,
3083 extent_size);
3084 break;
3085 }
3086
3087 ret = scrub_extent_for_parity(sparity, extent_start,
3088 extent_size, extent_physical,
3089 extent_dev, extent_flags,
3090 extent_gen, extent_mirror_num);
3091 scrub_free_csums(sctx);
3092
3093 if (ret) {
3094 scrub_parity_mark_sectors_error(sparity, extent_start,
3095 extent_size);
3096 break;
3097 }
3098
3099 cond_resched();
3100 cur_logical += extent_size;
3101 }
3102 btrfs_release_path(path);
3103 return ret;
3104}
3105
3106static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
3107 struct map_lookup *map,
3108 struct btrfs_device *sdev,
3109 u64 logic_start,
3110 u64 logic_end)
3111{
3112 struct btrfs_fs_info *fs_info = sctx->fs_info;
3113 struct btrfs_path *path;
3114 u64 cur_logical;
3115 int ret;
3116 struct scrub_parity *sparity;
3117 int nsectors;
3118
3119 path = btrfs_alloc_path();
3120 if (!path) {
3121 spin_lock(&sctx->stat_lock);
3122 sctx->stat.malloc_errors++;
3123 spin_unlock(&sctx->stat_lock);
3124 return -ENOMEM;
3125 }
3126 path->search_commit_root = 1;
3127 path->skip_locking = 1;
3128
3129 ASSERT(map->stripe_len <= U32_MAX);
3130 nsectors = map->stripe_len >> fs_info->sectorsize_bits;
3131 ASSERT(nsectors <= BITS_PER_LONG);
3132 sparity = kzalloc(sizeof(struct scrub_parity), GFP_NOFS);
3133 if (!sparity) {
3134 spin_lock(&sctx->stat_lock);
3135 sctx->stat.malloc_errors++;
3136 spin_unlock(&sctx->stat_lock);
3137 btrfs_free_path(path);
3138 return -ENOMEM;
3139 }
3140
3141 ASSERT(map->stripe_len <= U32_MAX);
3142 sparity->stripe_len = map->stripe_len;
3143 sparity->nsectors = nsectors;
3144 sparity->sctx = sctx;
3145 sparity->scrub_dev = sdev;
3146 sparity->logic_start = logic_start;
3147 sparity->logic_end = logic_end;
3148 refcount_set(&sparity->refs, 1);
3149 INIT_LIST_HEAD(&sparity->sectors_list);
3150
3151 ret = 0;
3152 for (cur_logical = logic_start; cur_logical < logic_end;
3153 cur_logical += map->stripe_len) {
3154 ret = scrub_raid56_data_stripe_for_parity(sctx, sparity, map,
3155 sdev, path, cur_logical);
3156 if (ret < 0)
3157 break;
3158 }
3159
3160 scrub_parity_put(sparity);
3161 scrub_submit(sctx);
3162 mutex_lock(&sctx->wr_lock);
3163 scrub_wr_submit(sctx);
3164 mutex_unlock(&sctx->wr_lock);
3165
3166 btrfs_free_path(path);
3167 return ret < 0 ? ret : 0;
3168}
3169
3170static void sync_replace_for_zoned(struct scrub_ctx *sctx)
3171{
3172 if (!btrfs_is_zoned(sctx->fs_info))
3173 return;
3174
3175 sctx->flush_all_writes = true;
3176 scrub_submit(sctx);
3177 mutex_lock(&sctx->wr_lock);
3178 scrub_wr_submit(sctx);
3179 mutex_unlock(&sctx->wr_lock);
3180
3181 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
3182}
3183
3184static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical,
3185 u64 physical, u64 physical_end)
3186{
3187 struct btrfs_fs_info *fs_info = sctx->fs_info;
3188 int ret = 0;
3189
3190 if (!btrfs_is_zoned(fs_info))
3191 return 0;
3192
3193 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
3194
3195 mutex_lock(&sctx->wr_lock);
3196 if (sctx->write_pointer < physical_end) {
3197 ret = btrfs_sync_zone_write_pointer(sctx->wr_tgtdev, logical,
3198 physical,
3199 sctx->write_pointer);
3200 if (ret)
3201 btrfs_err(fs_info,
3202 "zoned: failed to recover write pointer");
3203 }
3204 mutex_unlock(&sctx->wr_lock);
3205 btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical);
3206
3207 return ret;
3208}
3209
3210/*
3211 * Scrub one range which can only have a simple mirror based profile
3212 * (including all ranges in SINGLE/DUP/RAID1/RAID1C*, and each stripe in
3213 * RAID0/RAID10).
3214 *
3215 * Since we may need to handle a subset of a block group, we need the
3216 * @logical_start and @logical_length parameters.
3217 */
3218static int scrub_simple_mirror(struct scrub_ctx *sctx,
3219 struct btrfs_root *extent_root,
3220 struct btrfs_root *csum_root,
3221 struct btrfs_block_group *bg,
3222 struct map_lookup *map,
3223 u64 logical_start, u64 logical_length,
3224 struct btrfs_device *device,
3225 u64 physical, int mirror_num)
3226{
3227 struct btrfs_fs_info *fs_info = sctx->fs_info;
3228 const u64 logical_end = logical_start + logical_length;
3229	/* An artificial limit, inherited from old scrub behavior */
3230 const u32 max_length = SZ_64K;
3231 struct btrfs_path path = { 0 };
3232 u64 cur_logical = logical_start;
3233 int ret;
3234
3235 /* The range must be inside the bg */
3236 ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length);
3237
3238 path.search_commit_root = 1;
3239 path.skip_locking = 1;
3240	/* Go through each extent item inside the logical range */
3241 while (cur_logical < logical_end) {
3242 u64 extent_start;
3243 u64 extent_len;
3244 u64 extent_flags;
3245 u64 extent_gen;
3246 u64 scrub_len;
3247
3248 /* Canceled? */
3249 if (atomic_read(&fs_info->scrub_cancel_req) ||
3250 atomic_read(&sctx->cancel_req)) {
3251 ret = -ECANCELED;
3252 break;
3253 }
3254 /* Paused? */
3255 if (atomic_read(&fs_info->scrub_pause_req)) {
3256 /* Push queued extents */
3257 sctx->flush_all_writes = true;
3258 scrub_submit(sctx);
3259 mutex_lock(&sctx->wr_lock);
3260 scrub_wr_submit(sctx);
3261 mutex_unlock(&sctx->wr_lock);
3262 wait_event(sctx->list_wait,
3263 atomic_read(&sctx->bios_in_flight) == 0);
3264 sctx->flush_all_writes = false;
3265 scrub_blocked_if_needed(fs_info);
3266 }
3267 /* Block group removed? */
3268 spin_lock(&bg->lock);
3269 if (bg->removed) {
3270 spin_unlock(&bg->lock);
3271 ret = 0;
3272 break;
3273 }
3274 spin_unlock(&bg->lock);
3275
3276 ret = find_first_extent_item(extent_root, &path, cur_logical,
3277 logical_end - cur_logical);
3278 if (ret > 0) {
3279 /* No more extent, just update the accounting */
3280 sctx->stat.last_physical = physical + logical_length;
3281 ret = 0;
3282 break;
3283 }
3284 if (ret < 0)
3285 break;
3286 get_extent_info(&path, &extent_start, &extent_len,
3287 &extent_flags, &extent_gen);
3288 /* Skip hole range which doesn't have any extent */
3289 cur_logical = max(extent_start, cur_logical);
3290
3291 /*
3292 * Scrub len has three limits:
3293 * - Extent size limit
3294 * - Scrub range limit
3295		 *   This is especially important for RAID0/RAID10, which reuse
3296		 *   this function
3297 * - Max scrub size limit
3298 */
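		/*
		 * E.g. (assumed numbers) a 1MiB extent starting at cur_logical
		 * is capped to max_length here, so it gets scrubbed in 64KiB
		 * pieces over successive loop iterations.
		 */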
3299 scrub_len = min(min(extent_start + extent_len,
3300 logical_end), cur_logical + max_length) -
3301 cur_logical;
3302
3303 if (extent_flags & BTRFS_EXTENT_FLAG_DATA) {
3304 ret = btrfs_lookup_csums_range(csum_root, cur_logical,
3305 cur_logical + scrub_len - 1,
3306 &sctx->csum_list, 1);
3307 if (ret)
3308 break;
3309 }
3310 if ((extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
3311 does_range_cross_boundary(extent_start, extent_len,
3312 logical_start, logical_length)) {
3313 btrfs_err(fs_info,
3314"scrub: tree block %llu spanning boundaries, ignored. boundary=[%llu, %llu)",
3315 extent_start, logical_start, logical_end);
3316 spin_lock(&sctx->stat_lock);
3317 sctx->stat.uncorrectable_errors++;
3318 spin_unlock(&sctx->stat_lock);
3319 cur_logical += scrub_len;
3320 continue;
3321 }
3322 ret = scrub_extent(sctx, map, cur_logical, scrub_len,
3323 cur_logical - logical_start + physical,
3324 device, extent_flags, extent_gen,
3325 mirror_num);
3326 scrub_free_csums(sctx);
3327 if (ret)
3328 break;
3329 if (sctx->is_dev_replace)
3330 sync_replace_for_zoned(sctx);
3331 cur_logical += scrub_len;
3332		/* Don't hold the CPU for too long */
3333 cond_resched();
3334 }
3335 btrfs_release_path(&path);
3336 return ret;
3337}
3338
3339/* Calculate the full stripe length for simple stripe based profiles */
3340static u64 simple_stripe_full_stripe_len(const struct map_lookup *map)
3341{
3342 ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
3343 BTRFS_BLOCK_GROUP_RAID10));
3344
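	/*
	 * E.g. (assumed layouts) RAID0 with 3 stripes gives 3 * stripe_len,
	 * while RAID10 with 4 stripes and sub_stripes = 2 gives 2 * stripe_len.
	 */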
3345 return map->num_stripes / map->sub_stripes * map->stripe_len;
3346}
3347
3348/* Get the logical bytenr for the stripe */
3349static u64 simple_stripe_get_logical(struct map_lookup *map,
3350 struct btrfs_block_group *bg,
3351 int stripe_index)
3352{
3353 ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
3354 BTRFS_BLOCK_GROUP_RAID10));
3355 ASSERT(stripe_index < map->num_stripes);
3356
3357 /*
3358 * (stripe_index / sub_stripes) gives how many data stripes we need to
3359 * skip.
3360 */
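	/*
	 * E.g. for an assumed RAID10 layout with sub_stripes = 2, stripe
	 * indexes 2 and 3 are the two mirrors of the second device stripe,
	 * so both map to bg->start + 1 * stripe_len.
	 */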
3361 return (stripe_index / map->sub_stripes) * map->stripe_len + bg->start;
3362}
3363
3364/* Get the mirror number for the stripe */
3365static int simple_stripe_mirror_num(struct map_lookup *map, int stripe_index)
3366{
3367 ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
3368 BTRFS_BLOCK_GROUP_RAID10));
3369 ASSERT(stripe_index < map->num_stripes);
3370
3371	/* For RAID0 mirror_num is fixed to 1, for RAID10 it alternates 1,2,1,2... */
3372 return stripe_index % map->sub_stripes + 1;
3373}
3374
3375static int scrub_simple_stripe(struct scrub_ctx *sctx,
3376 struct btrfs_root *extent_root,
3377 struct btrfs_root *csum_root,
3378 struct btrfs_block_group *bg,
3379 struct map_lookup *map,
3380 struct btrfs_device *device,
3381 int stripe_index)
3382{
3383 const u64 logical_increment = simple_stripe_full_stripe_len(map);
3384 const u64 orig_logical = simple_stripe_get_logical(map, bg, stripe_index);
3385 const u64 orig_physical = map->stripes[stripe_index].physical;
3386 const int mirror_num = simple_stripe_mirror_num(map, stripe_index);
3387 u64 cur_logical = orig_logical;
3388 u64 cur_physical = orig_physical;
3389 int ret = 0;
3390
3391 while (cur_logical < bg->start + bg->length) {
3392 /*
3393 * Inside each stripe, RAID0 is just SINGLE, and RAID10 is
3394 * just RAID1, so we can reuse scrub_simple_mirror() to scrub
3395 * this stripe.
3396 */
3397 ret = scrub_simple_mirror(sctx, extent_root, csum_root, bg, map,
3398 cur_logical, map->stripe_len, device,
3399 cur_physical, mirror_num);
3400 if (ret)
3401 return ret;
3402 /* Skip to next stripe which belongs to the target device */
3403 cur_logical += logical_increment;
3404 /* For physical offset, we just go to next stripe */
3405 cur_physical += map->stripe_len;
3406 }
3407 return ret;
3408}
3409
3410static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
3411 struct btrfs_block_group *bg,
3412 struct extent_map *em,
3413 struct btrfs_device *scrub_dev,
3414 int stripe_index)
3415{
3416 struct btrfs_path *path;
3417 struct btrfs_fs_info *fs_info = sctx->fs_info;
3418 struct btrfs_root *root;
3419 struct btrfs_root *csum_root;
3420 struct blk_plug plug;
3421 struct map_lookup *map = em->map_lookup;
3422 const u64 profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
3423 const u64 chunk_logical = bg->start;
3424 int ret;
3425 u64 physical = map->stripes[stripe_index].physical;
3426 const u64 dev_stripe_len = btrfs_calc_stripe_length(em);
3427 const u64 physical_end = physical + dev_stripe_len;
3428 u64 logical;
3429 u64 logic_end;
3430 /* The logical increment after finishing one stripe */
3431 u64 increment;
3432 /* Offset inside the chunk */
3433 u64 offset;
3434 u64 stripe_logical;
3435 u64 stripe_end;
3436 int stop_loop = 0;
3437
3438 path = btrfs_alloc_path();
3439 if (!path)
3440 return -ENOMEM;
3441
3442 /*
3443	 * Work on the commit root. The related disk blocks are static as
3444	 * long as COW is applied. This means it is safe to rewrite
3445	 * them to repair disk errors without any race conditions
3446 */
3447 path->search_commit_root = 1;
3448 path->skip_locking = 1;
3449 path->reada = READA_FORWARD;
3450
3451 wait_event(sctx->list_wait,
3452 atomic_read(&sctx->bios_in_flight) == 0);
3453 scrub_blocked_if_needed(fs_info);
3454
3455 root = btrfs_extent_root(fs_info, bg->start);
3456 csum_root = btrfs_csum_root(fs_info, bg->start);
3457
3458 /*
3459	 * Collect all data csums for the stripe to avoid seeking during
3460	 * the scrub. This might currently (crc32) end up being about 1MB
3461 */
3462 blk_start_plug(&plug);
3463
3464 if (sctx->is_dev_replace &&
3465 btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) {
3466 mutex_lock(&sctx->wr_lock);
3467 sctx->write_pointer = physical;
3468 mutex_unlock(&sctx->wr_lock);
3469 sctx->flush_all_writes = true;
3470 }
3471
3472 /*
3473 * There used to be a big double loop to handle all profiles using the
3474	 * same routine, which grew larger and more gross over time.
3475	 *
3476	 * So here we handle each profile differently, so that simpler profiles
3477	 * have a simpler scrubbing function.
3478 */
3479 if (!(profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10 |
3480 BTRFS_BLOCK_GROUP_RAID56_MASK))) {
3481 /*
3482		 * The above check rules out all complex profiles; the remaining
3483		 * profiles are SINGLE|DUP|RAID1|RAID1C*, which use simple
3484		 * mirrored duplication without striping.
3485		 *
3486		 * Only @physical and @mirror_num need to be calculated using
3487		 * @stripe_index.
3488 */
3489 ret = scrub_simple_mirror(sctx, root, csum_root, bg, map,
3490 bg->start, bg->length, scrub_dev,
3491 map->stripes[stripe_index].physical,
3492 stripe_index + 1);
3493 offset = 0;
3494 goto out;
3495 }
3496 if (profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
3497 ret = scrub_simple_stripe(sctx, root, csum_root, bg, map,
3498 scrub_dev, stripe_index);
3499 offset = map->stripe_len * (stripe_index / map->sub_stripes);
3500 goto out;
3501 }
3502
3503 /* Only RAID56 goes through the old code */
3504 ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK);
3505 ret = 0;
3506
3507 /* Calculate the logical end of the stripe */
3508 get_raid56_logic_offset(physical_end, stripe_index,
3509 map, &logic_end, NULL);
3510 logic_end += chunk_logical;
3511
3512 /* Initialize @offset in case we need to go to out: label */
3513 get_raid56_logic_offset(physical, stripe_index, map, &offset, NULL);
3514 increment = map->stripe_len * nr_data_stripes(map);
3515
3516 /*
3517	 * Due to the rotation, for RAID56 it's better to iterate the stripes
3518	 * using their physical offsets.
3519 */
3520 while (physical < physical_end) {
3521 ret = get_raid56_logic_offset(physical, stripe_index, map,
3522 &logical, &stripe_logical);
3523 logical += chunk_logical;
3524 if (ret) {
3525			/* it is a parity stripe */
3526 stripe_logical += chunk_logical;
3527 stripe_end = stripe_logical + increment;
3528 ret = scrub_raid56_parity(sctx, map, scrub_dev,
3529 stripe_logical,
3530 stripe_end);
3531 if (ret)
3532 goto out;
3533 goto next;
3534 }
3535
3536 /*
3537		 * Now we're at a data stripe, scrub each extent in the range.
3538 *
3539 * At this stage, if we ignore the repair part, inside each data
3540 * stripe it is no different than SINGLE profile.
3541 * We can reuse scrub_simple_mirror() here, as the repair part
3542 * is still based on @mirror_num.
3543 */
3544 ret = scrub_simple_mirror(sctx, root, csum_root, bg, map,
3545 logical, map->stripe_len,
3546 scrub_dev, physical, 1);
3547 if (ret < 0)
3548 goto out;
3549next:
3550 logical += increment;
3551 physical += map->stripe_len;
3552 spin_lock(&sctx->stat_lock);
3553 if (stop_loop)
3554 sctx->stat.last_physical =
3555 map->stripes[stripe_index].physical + dev_stripe_len;
3556 else
3557 sctx->stat.last_physical = physical;
3558 spin_unlock(&sctx->stat_lock);
3559 if (stop_loop)
3560 break;
3561 }
3562out:
3563 /* push queued extents */
3564 scrub_submit(sctx);
3565 mutex_lock(&sctx->wr_lock);
3566 scrub_wr_submit(sctx);
3567 mutex_unlock(&sctx->wr_lock);
3568
3569 blk_finish_plug(&plug);
3570 btrfs_free_path(path);
3571
3572 if (sctx->is_dev_replace && ret >= 0) {
3573 int ret2;
3574
3575 ret2 = sync_write_pointer_for_zoned(sctx,
3576 chunk_logical + offset,
3577 map->stripes[stripe_index].physical,
3578 physical_end);
3579 if (ret2)
3580 ret = ret2;
3581 }
3582
3583 return ret < 0 ? ret : 0;
3584}
3585
3586static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
3587 struct btrfs_block_group *bg,
3588 struct btrfs_device *scrub_dev,
3589 u64 dev_offset,
3590 u64 dev_extent_len)
3591{
3592 struct btrfs_fs_info *fs_info = sctx->fs_info;
3593 struct extent_map_tree *map_tree = &fs_info->mapping_tree;
3594 struct map_lookup *map;
3595 struct extent_map *em;
3596 int i;
3597 int ret = 0;
3598
3599 read_lock(&map_tree->lock);
3600 em = lookup_extent_mapping(map_tree, bg->start, bg->length);
3601 read_unlock(&map_tree->lock);
3602
3603 if (!em) {
3604 /*
3605 * Might have been an unused block group deleted by the cleaner
3606 * kthread or relocation.
3607 */
3608 spin_lock(&bg->lock);
3609 if (!bg->removed)
3610 ret = -EINVAL;
3611 spin_unlock(&bg->lock);
3612
3613 return ret;
3614 }
3615 if (em->start != bg->start)
3616 goto out;
3617 if (em->len < dev_extent_len)
3618 goto out;
3619
3620 map = em->map_lookup;
3621 for (i = 0; i < map->num_stripes; ++i) {
3622 if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
3623 map->stripes[i].physical == dev_offset) {
3624 ret = scrub_stripe(sctx, bg, em, scrub_dev, i);
3625 if (ret)
3626 goto out;
3627 }
3628 }
3629out:
3630 free_extent_map(em);
3631
3632 return ret;
3633}
3634
3635static int finish_extent_writes_for_zoned(struct btrfs_root *root,
3636 struct btrfs_block_group *cache)
3637{
3638 struct btrfs_fs_info *fs_info = cache->fs_info;
3639 struct btrfs_trans_handle *trans;
3640
3641 if (!btrfs_is_zoned(fs_info))
3642 return 0;
3643
3644 btrfs_wait_block_group_reservations(cache);
3645 btrfs_wait_nocow_writers(cache);
3646 btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start, cache->length);
3647
3648 trans = btrfs_join_transaction(root);
3649 if (IS_ERR(trans))
3650 return PTR_ERR(trans);
3651 return btrfs_commit_transaction(trans);
3652}
3653
3654static noinline_for_stack
3655int scrub_enumerate_chunks(struct scrub_ctx *sctx,
3656 struct btrfs_device *scrub_dev, u64 start, u64 end)
3657{
3658 struct btrfs_dev_extent *dev_extent = NULL;
3659 struct btrfs_path *path;
3660 struct btrfs_fs_info *fs_info = sctx->fs_info;
3661 struct btrfs_root *root = fs_info->dev_root;
3662 u64 chunk_offset;
3663 int ret = 0;
3664 int ro_set;
3665 int slot;
3666 struct extent_buffer *l;
3667 struct btrfs_key key;
3668 struct btrfs_key found_key;
3669 struct btrfs_block_group *cache;
3670 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
3671
3672 path = btrfs_alloc_path();
3673 if (!path)
3674 return -ENOMEM;
3675
3676 path->reada = READA_FORWARD;
3677 path->search_commit_root = 1;
3678 path->skip_locking = 1;
3679
3680 key.objectid = scrub_dev->devid;
3681 key.offset = 0ull;
3682 key.type = BTRFS_DEV_EXTENT_KEY;
3683
3684 while (1) {
3685 u64 dev_extent_len;
3686
3687 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3688 if (ret < 0)
3689 break;
3690 if (ret > 0) {
3691 if (path->slots[0] >=
3692 btrfs_header_nritems(path->nodes[0])) {
3693 ret = btrfs_next_leaf(root, path);
3694 if (ret < 0)
3695 break;
3696 if (ret > 0) {
3697 ret = 0;
3698 break;
3699 }
3700 } else {
3701 ret = 0;
3702 }
3703 }
3704
3705 l = path->nodes[0];
3706 slot = path->slots[0];
3707
3708 btrfs_item_key_to_cpu(l, &found_key, slot);
3709
3710 if (found_key.objectid != scrub_dev->devid)
3711 break;
3712
3713 if (found_key.type != BTRFS_DEV_EXTENT_KEY)
3714 break;
3715
3716 if (found_key.offset >= end)
3717 break;
3718
3719 if (found_key.offset < key.offset)
3720 break;
3721
3722 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
3723 dev_extent_len = btrfs_dev_extent_length(l, dev_extent);
3724
3725 if (found_key.offset + dev_extent_len <= start)
3726 goto skip;
3727
3728 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
3729
3730 /*
3731 * get a reference on the corresponding block group to prevent
3732 * the chunk from going away while we scrub it
3733 */
3734 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3735
3736 /* some chunks are removed but not committed to disk yet,
3737 * continue scrubbing */
3738 if (!cache)
3739 goto skip;
3740
3741 ASSERT(cache->start <= chunk_offset);
3742 /*
3743 * We are using the commit root to search for device extents, so
3744 * that means we could have found a device extent item from a
3745 * block group that was deleted in the current transaction. The
3746 * logical start offset of the deleted block group, stored at
3747 * @chunk_offset, might be part of the logical address range of
3748 * a new block group (which uses different physical extents).
3749 * In this case btrfs_lookup_block_group() has returned the new
3750 * block group, and its start address is less than @chunk_offset.
3751 *
3752 * We skip such new block groups, because it's pointless to
3753 * process them, as we won't find their extents because we search
3754 * for them using the commit root of the extent tree. For a device
3755 * replace it's also fine to skip it, we won't miss copying them
3756 * to the target device because we have the write duplication
3757 * setup through the regular write path (by btrfs_map_block()),
3758 * and we have committed a transaction when we started the device
3759 * replace, right after setting up the device replace state.
3760 */
3761 if (cache->start < chunk_offset) {
3762 btrfs_put_block_group(cache);
3763 goto skip;
3764 }
3765
3766 if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) {
3767 spin_lock(&cache->lock);
3768 if (!cache->to_copy) {
3769 spin_unlock(&cache->lock);
3770 btrfs_put_block_group(cache);
3771 goto skip;
3772 }
3773 spin_unlock(&cache->lock);
3774 }
3775
3776 /*
3777 * Make sure that while we are scrubbing the corresponding block
3778 * group doesn't get its logical address and its device extents
3779 * reused for another block group, which can possibly be of a
3780 * different type and different profile. We do this to prevent
3781 * false error detections and crashes due to bogus attempts to
3782 * repair extents.
3783 */
3784 spin_lock(&cache->lock);
3785 if (cache->removed) {
3786 spin_unlock(&cache->lock);
3787 btrfs_put_block_group(cache);
3788 goto skip;
3789 }
3790 btrfs_freeze_block_group(cache);
3791 spin_unlock(&cache->lock);
3792
3793 /*
3794		 * we need to call btrfs_inc_block_group_ro() with scrubs_paused,
3795 * to avoid deadlock caused by:
3796 * btrfs_inc_block_group_ro()
3797 * -> btrfs_wait_for_commit()
3798 * -> btrfs_commit_transaction()
3799 * -> btrfs_scrub_pause()
3800 */
3801 scrub_pause_on(fs_info);
3802
3803 /*
3804 * Don't do chunk preallocation for scrub.
3805 *
3806 * This is especially important for SYSTEM bgs, or we can hit
3807 * -EFBIG from btrfs_finish_chunk_alloc() like:
3808 * 1. The only SYSTEM bg is marked RO.
3809 * Since SYSTEM bg is small, that's pretty common.
3810		 * 2. A new SYSTEM bg will be allocated,
3811		 *    because the regular path (which preallocates chunks) allocates a new chunk.
3812		 * 3. The new SYSTEM bg is empty and will get cleaned up.
3813		 *    Before the cleanup really happens, it's marked RO again.
3814		 * 4. The empty SYSTEM bg gets scrubbed.
3815		 *    We go back to 2.
3816 *
3817		 * This can easily boost the number of SYSTEM chunks if the cleaner
3818		 * thread can't be triggered fast enough, and use up all the space
3819		 * of btrfs_super_block::sys_chunk_array.
3820 *
3821 * While for dev replace, we need to try our best to mark block
3822 * group RO, to prevent race between:
3823 * - Write duplication
3824 * Contains latest data
3825 * - Scrub copy
3826 * Contains data from commit tree
3827 *
3828		 * If the target block group is not marked RO, nocow writes can
3829		 * be overwritten by the scrub copy, causing data corruption.
3830 * So for dev-replace, it's not allowed to continue if a block
3831 * group is not RO.
3832 */
3833 ret = btrfs_inc_block_group_ro(cache, sctx->is_dev_replace);
3834 if (!ret && sctx->is_dev_replace) {
3835 ret = finish_extent_writes_for_zoned(root, cache);
3836 if (ret) {
3837 btrfs_dec_block_group_ro(cache);
3838 scrub_pause_off(fs_info);
3839 btrfs_put_block_group(cache);
3840 break;
3841 }
3842 }
3843
3844 if (ret == 0) {
3845 ro_set = 1;
3846 } else if (ret == -ENOSPC && !sctx->is_dev_replace) {
3847 /*
3848			 * btrfs_inc_block_group_ro() returns -ENOSPC when it
3849			 * fails to create a new chunk for metadata.
3850			 * This is not a problem for scrub, because
3851			 * metadata is always COWed, and our scrub has paused
3852			 * transaction commits.
3853 */
3854 ro_set = 0;
3855 } else if (ret == -ETXTBSY) {
3856 btrfs_warn(fs_info,
3857 "skipping scrub of block group %llu due to active swapfile",
3858 cache->start);
3859 scrub_pause_off(fs_info);
3860 ret = 0;
3861 goto skip_unfreeze;
3862 } else {
3863 btrfs_warn(fs_info,
3864 "failed setting block group ro: %d", ret);
3865 btrfs_unfreeze_block_group(cache);
3866 btrfs_put_block_group(cache);
3867 scrub_pause_off(fs_info);
3868 break;
3869 }
3870
3871 /*
3872		 * Now the target block group is marked RO, wait for nocow writes to
3873		 * finish before dev-replace.
3874		 * COW is fine, as COW never overwrites extents in the commit tree.
3875 */
3876 if (sctx->is_dev_replace) {
3877 btrfs_wait_nocow_writers(cache);
3878 btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start,
3879 cache->length);
3880 }
3881
3882 scrub_pause_off(fs_info);
3883 down_write(&dev_replace->rwsem);
3884 dev_replace->cursor_right = found_key.offset + dev_extent_len;
3885 dev_replace->cursor_left = found_key.offset;
3886 dev_replace->item_needs_writeback = 1;
3887 up_write(&dev_replace->rwsem);
3888
3889 ret = scrub_chunk(sctx, cache, scrub_dev, found_key.offset,
3890 dev_extent_len);
3891
3892 /*
3893		 * Flush and submit all pending read and write bios, then
3894		 * wait for them.
3895 * Note that in the dev replace case, a read request causes
3896 * write requests that are submitted in the read completion
3897 * worker. Therefore in the current situation, it is required
3898 * that all write requests are flushed, so that all read and
3899 * write requests are really completed when bios_in_flight
3900 * changes to 0.
3901 */
3902 sctx->flush_all_writes = true;
3903 scrub_submit(sctx);
3904 mutex_lock(&sctx->wr_lock);
3905 scrub_wr_submit(sctx);
3906 mutex_unlock(&sctx->wr_lock);
3907
3908 wait_event(sctx->list_wait,
3909 atomic_read(&sctx->bios_in_flight) == 0);
3910
3911 scrub_pause_on(fs_info);
3912
3913 /*
3914		 * This must be done before we decrease @scrub_paused, to
3915		 * make sure we don't block transaction commit while we
3916		 * are waiting for the pending workers to finish.
3917 */
3918 wait_event(sctx->list_wait,
3919 atomic_read(&sctx->workers_pending) == 0);
3920 sctx->flush_all_writes = false;
3921
3922 scrub_pause_off(fs_info);
3923
3924 if (sctx->is_dev_replace &&
3925 !btrfs_finish_block_group_to_copy(dev_replace->srcdev,
3926 cache, found_key.offset))
3927 ro_set = 0;
3928
3929 down_write(&dev_replace->rwsem);
3930 dev_replace->cursor_left = dev_replace->cursor_right;
3931 dev_replace->item_needs_writeback = 1;
3932 up_write(&dev_replace->rwsem);
3933
3934 if (ro_set)
3935 btrfs_dec_block_group_ro(cache);
3936
3937 /*
3938 * We might have prevented the cleaner kthread from deleting
3939 * this block group if it was already unused because we raced
3940 * and set it to RO mode first. So add it back to the unused
3941 * list, otherwise it might not ever be deleted unless a manual
3942 * balance is triggered or it becomes used and unused again.
3943 */
3944 spin_lock(&cache->lock);
3945 if (!cache->removed && !cache->ro && cache->reserved == 0 &&
3946 cache->used == 0) {
3947 spin_unlock(&cache->lock);
3948 if (btrfs_test_opt(fs_info, DISCARD_ASYNC))
3949 btrfs_discard_queue_work(&fs_info->discard_ctl,
3950 cache);
3951 else
3952 btrfs_mark_bg_unused(cache);
3953 } else {
3954 spin_unlock(&cache->lock);
3955 }
3956skip_unfreeze:
3957 btrfs_unfreeze_block_group(cache);
3958 btrfs_put_block_group(cache);
3959 if (ret)
3960 break;
3961 if (sctx->is_dev_replace &&
3962 atomic64_read(&dev_replace->num_write_errors) > 0) {
3963 ret = -EIO;
3964 break;
3965 }
3966 if (sctx->stat.malloc_errors > 0) {
3967 ret = -ENOMEM;
3968 break;
3969 }
3970skip:
3971 key.offset = found_key.offset + dev_extent_len;
3972 btrfs_release_path(path);
3973 }
3974
3975 btrfs_free_path(path);
3976
3977 return ret;
3978}
3979
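/*
 * Scrub all copies of the super block on @scrub_dev that fall within the
 * committed device size.  The expected generation is the last committed
 * transaction, except for seed devices, which carry their own generation.
 */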
3980static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
3981 struct btrfs_device *scrub_dev)
3982{
3983 int i;
3984 u64 bytenr;
3985 u64 gen;
3986 int ret;
3987 struct btrfs_fs_info *fs_info = sctx->fs_info;
3988
3989 if (BTRFS_FS_ERROR(fs_info))
3990 return -EROFS;
3991
3992	/* Seed devices of a new filesystem have their own generation. */
3993 if (scrub_dev->fs_devices != fs_info->fs_devices)
3994 gen = scrub_dev->generation;
3995 else
3996 gen = fs_info->last_trans_committed;
3997
3998 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
3999 bytenr = btrfs_sb_offset(i);
4000 if (bytenr + BTRFS_SUPER_INFO_SIZE >
4001 scrub_dev->commit_total_bytes)
4002 break;
4003 if (!btrfs_check_super_location(scrub_dev, bytenr))
4004 continue;
4005
4006 ret = scrub_sectors(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
4007 scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
4008 NULL, bytenr);
4009 if (ret)
4010 return ret;
4011 }
4012 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
4013
4014 return 0;
4015}
4016
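/*
 * Drop one reference on the scrub workqueues (pairs with
 * scrub_workers_get()).  The last reference detaches the workqueue
 * pointers under scrub_lock and destroys the workqueues after the lock is
 * released, so destroy_workqueue() is never called with scrub_lock held.
 */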
4017static void scrub_workers_put(struct btrfs_fs_info *fs_info)
4018{
4019 if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt,
4020 &fs_info->scrub_lock)) {
4021 struct workqueue_struct *scrub_workers = fs_info->scrub_workers;
4022 struct workqueue_struct *scrub_wr_comp =
4023 fs_info->scrub_wr_completion_workers;
4024 struct workqueue_struct *scrub_parity =
4025 fs_info->scrub_parity_workers;
4026
4027 fs_info->scrub_workers = NULL;
4028 fs_info->scrub_wr_completion_workers = NULL;
4029 fs_info->scrub_parity_workers = NULL;
4030 mutex_unlock(&fs_info->scrub_lock);
4031
4032 if (scrub_workers)
4033 destroy_workqueue(scrub_workers);
4034 if (scrub_wr_comp)
4035 destroy_workqueue(scrub_wr_comp);
4036 if (scrub_parity)
4037 destroy_workqueue(scrub_parity);
4038 }
4039}
4040
4041/*
4042 * Get a reference count on fs_info->scrub_workers. Start the workers if necessary.
4043 */
4044static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
4045 int is_dev_replace)
4046{
4047 struct workqueue_struct *scrub_workers = NULL;
4048 struct workqueue_struct *scrub_wr_comp = NULL;
4049 struct workqueue_struct *scrub_parity = NULL;
4050 unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
4051 int max_active = fs_info->thread_pool_size;
4052 int ret = -ENOMEM;
4053
4054 if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt))
4055 return 0;
4056
4057 scrub_workers = alloc_workqueue("btrfs-scrub", flags,
4058 is_dev_replace ? 1 : max_active);
4059 if (!scrub_workers)
4060 goto fail_scrub_workers;
4061
4062 scrub_wr_comp = alloc_workqueue("btrfs-scrubwrc", flags, max_active);
4063 if (!scrub_wr_comp)
4064 goto fail_scrub_wr_completion_workers;
4065
4066 scrub_parity = alloc_workqueue("btrfs-scrubparity", flags, max_active);
4067 if (!scrub_parity)
4068 goto fail_scrub_parity_workers;
4069
4070 mutex_lock(&fs_info->scrub_lock);
4071 if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
4072 ASSERT(fs_info->scrub_workers == NULL &&
4073 fs_info->scrub_wr_completion_workers == NULL &&
4074 fs_info->scrub_parity_workers == NULL);
4075 fs_info->scrub_workers = scrub_workers;
4076 fs_info->scrub_wr_completion_workers = scrub_wr_comp;
4077 fs_info->scrub_parity_workers = scrub_parity;
4078 refcount_set(&fs_info->scrub_workers_refcnt, 1);
4079 mutex_unlock(&fs_info->scrub_lock);
4080 return 0;
4081 }
4082 /* Other thread raced in and created the workers for us */
4083 refcount_inc(&fs_info->scrub_workers_refcnt);
4084 mutex_unlock(&fs_info->scrub_lock);
4085
4086 ret = 0;
4087 destroy_workqueue(scrub_parity);
4088fail_scrub_parity_workers:
4089 destroy_workqueue(scrub_wr_comp);
4090fail_scrub_wr_completion_workers:
4091 destroy_workqueue(scrub_workers);
4092fail_scrub_workers:
4093 return ret;
4094}
4095
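/*
 * Entry point for scrub and device replace: scrub the byte range
 * [start, end) of the device @devid.  On return the current statistics are
 * copied to @progress.  Returns -EAGAIN if the filesystem is closing,
 * -EINVAL for unsupported node/sector sizes, -ENODEV if the device cannot
 * be found, -EROFS for a read-write scrub of a non-writable device,
 * -EINPROGRESS if a scrub or device replace is already running on the
 * device, or the scrub result otherwise.
 *
 * A rough sketch of a plain scrub invocation (the real caller is the scrub
 * ioctl handler; the btrfs_ioctl_scrub_args field names below come from
 * the uapi header and only serve as an illustration):
 *
 *	ret = btrfs_scrub_dev(fs_info, sa->devid, sa->start, sa->end,
 *			      &sa->progress,
 *			      !!(sa->flags & BTRFS_SCRUB_READONLY), 0);
 */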
4096int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
4097 u64 end, struct btrfs_scrub_progress *progress,
4098 int readonly, int is_dev_replace)
4099{
4100 struct btrfs_dev_lookup_args args = { .devid = devid };
4101 struct scrub_ctx *sctx;
4102 int ret;
4103 struct btrfs_device *dev;
4104 unsigned int nofs_flag;
4105
4106 if (btrfs_fs_closing(fs_info))
4107 return -EAGAIN;
4108
4109 if (fs_info->nodesize > BTRFS_STRIPE_LEN) {
4110 /*
4111		 * In this case scrub is unable to calculate the checksum
4112		 * the way it is implemented. Do not handle this
4113		 * situation at all because it will never happen.
4114 */
4115 btrfs_err(fs_info,
4116 "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails",
4117 fs_info->nodesize,
4118 BTRFS_STRIPE_LEN);
4119 return -EINVAL;
4120 }
4121
4122 if (fs_info->nodesize >
4123 SCRUB_MAX_SECTORS_PER_BLOCK << fs_info->sectorsize_bits ||
4124 fs_info->sectorsize > PAGE_SIZE * SCRUB_MAX_SECTORS_PER_BLOCK) {
4125 /*
4126		 * Would exhaust the array bounds of the sectors member in
4127		 * struct scrub_block.
4128 */
4129 btrfs_err(fs_info,
4130"scrub: nodesize and sectorsize <= SCRUB_MAX_SECTORS_PER_BLOCK (%d <= %d && %d <= %d) fails",
4131 fs_info->nodesize, SCRUB_MAX_SECTORS_PER_BLOCK,
4132 fs_info->sectorsize, SCRUB_MAX_SECTORS_PER_BLOCK);
4133 return -EINVAL;
4134 }
4135
4136 /* Allocate outside of device_list_mutex */
4137 sctx = scrub_setup_ctx(fs_info, is_dev_replace);
4138 if (IS_ERR(sctx))
4139 return PTR_ERR(sctx);
4140
4141 ret = scrub_workers_get(fs_info, is_dev_replace);
4142 if (ret)
4143 goto out_free_ctx;
4144
4145 mutex_lock(&fs_info->fs_devices->device_list_mutex);
4146 dev = btrfs_find_device(fs_info->fs_devices, &args);
4147 if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
4148 !is_dev_replace)) {
4149 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4150 ret = -ENODEV;
4151 goto out;
4152 }
4153
4154 if (!is_dev_replace && !readonly &&
4155 !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
4156 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4157 btrfs_err_in_rcu(fs_info,
4158 "scrub on devid %llu: filesystem on %s is not writable",
4159 devid, rcu_str_deref(dev->name));
4160 ret = -EROFS;
4161 goto out;
4162 }
4163
4164 mutex_lock(&fs_info->scrub_lock);
4165 if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
4166 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) {
4167 mutex_unlock(&fs_info->scrub_lock);
4168 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4169 ret = -EIO;
4170 goto out;
4171 }
4172
4173 down_read(&fs_info->dev_replace.rwsem);
4174 if (dev->scrub_ctx ||
4175 (!is_dev_replace &&
4176 btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
4177 up_read(&fs_info->dev_replace.rwsem);
4178 mutex_unlock(&fs_info->scrub_lock);
4179 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4180 ret = -EINPROGRESS;
4181 goto out;
4182 }
4183 up_read(&fs_info->dev_replace.rwsem);
4184
4185 sctx->readonly = readonly;
4186 dev->scrub_ctx = sctx;
4187 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4188
4189 /*
4190	 * By checking @scrub_pause_req here, we can avoid a
4191	 * race between transaction commit and scrubbing.
4192 */
4193 __scrub_blocked_if_needed(fs_info);
4194 atomic_inc(&fs_info->scrubs_running);
4195 mutex_unlock(&fs_info->scrub_lock);
4196
4197 /*
4198 * In order to avoid deadlock with reclaim when there is a transaction
4199 * trying to pause scrub, make sure we use GFP_NOFS for all the
4200	 * allocations done by scrub_sectors() and scrub_sectors_for_parity()
4201 * invoked by our callees. The pausing request is done when the
4202 * transaction commit starts, and it blocks the transaction until scrub
4203	 * is paused (done at specific points in scrub_stripe() or right above,
4204	 * before incrementing fs_info->scrubs_running).
4205 */
4206 nofs_flag = memalloc_nofs_save();
4207 if (!is_dev_replace) {
4208 btrfs_info(fs_info, "scrub: started on devid %llu", devid);
4209 /*
4210		 * By holding the device list mutex, we stay serialized against
4211		 * the super block writes kicked off by log tree sync.
4212 */
4213 mutex_lock(&fs_info->fs_devices->device_list_mutex);
4214 ret = scrub_supers(sctx, dev);
4215 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4216 }
4217
4218 if (!ret)
4219 ret = scrub_enumerate_chunks(sctx, dev, start, end);
4220 memalloc_nofs_restore(nofs_flag);
4221
4222 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
4223 atomic_dec(&fs_info->scrubs_running);
4224 wake_up(&fs_info->scrub_pause_wait);
4225
4226 wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
4227
4228 if (progress)
4229 memcpy(progress, &sctx->stat, sizeof(*progress));
4230
4231 if (!is_dev_replace)
4232 btrfs_info(fs_info, "scrub: %s on devid %llu with status: %d",
4233 ret ? "not finished" : "finished", devid, ret);
4234
4235 mutex_lock(&fs_info->scrub_lock);
4236 dev->scrub_ctx = NULL;
4237 mutex_unlock(&fs_info->scrub_lock);
4238
4239 scrub_workers_put(fs_info);
4240 scrub_put_ctx(sctx);
4241
4242 return ret;
4243out:
4244 scrub_workers_put(fs_info);
4245out_free_ctx:
4246 scrub_free_ctx(sctx);
4247
4248 return ret;
4249}
4250
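/*
 * Ask all running scrubs to pause and wait until every one of them has
 * actually reached its pause point.  Pairs with btrfs_scrub_continue().
 * The typical caller is the transaction commit path, roughly:
 *
 *	btrfs_scrub_pause(fs_info);
 *	(critical section that must not race with scrub)
 *	btrfs_scrub_continue(fs_info);
 */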
4251void btrfs_scrub_pause(struct btrfs_fs_info *fs_info)
4252{
4253 mutex_lock(&fs_info->scrub_lock);
4254 atomic_inc(&fs_info->scrub_pause_req);
4255 while (atomic_read(&fs_info->scrubs_paused) !=
4256 atomic_read(&fs_info->scrubs_running)) {
4257 mutex_unlock(&fs_info->scrub_lock);
4258 wait_event(fs_info->scrub_pause_wait,
4259 atomic_read(&fs_info->scrubs_paused) ==
4260 atomic_read(&fs_info->scrubs_running));
4261 mutex_lock(&fs_info->scrub_lock);
4262 }
4263 mutex_unlock(&fs_info->scrub_lock);
4264}
4265
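/* Resume scrubs previously paused by btrfs_scrub_pause(). */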
4266void btrfs_scrub_continue(struct btrfs_fs_info *fs_info)
4267{
4268 atomic_dec(&fs_info->scrub_pause_req);
4269 wake_up(&fs_info->scrub_pause_wait);
4270}
4271
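/*
 * Cancel all running scrubs on the filesystem and wait for them to finish.
 * Returns -ENOTCONN if no scrub was running.
 */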
4272int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
4273{
4274 mutex_lock(&fs_info->scrub_lock);
4275 if (!atomic_read(&fs_info->scrubs_running)) {
4276 mutex_unlock(&fs_info->scrub_lock);
4277 return -ENOTCONN;
4278 }
4279
4280 atomic_inc(&fs_info->scrub_cancel_req);
4281 while (atomic_read(&fs_info->scrubs_running)) {
4282 mutex_unlock(&fs_info->scrub_lock);
4283 wait_event(fs_info->scrub_pause_wait,
4284 atomic_read(&fs_info->scrubs_running) == 0);
4285 mutex_lock(&fs_info->scrub_lock);
4286 }
4287 atomic_dec(&fs_info->scrub_cancel_req);
4288 mutex_unlock(&fs_info->scrub_lock);
4289
4290 return 0;
4291}
4292
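/*
 * Cancel the scrub running on @dev, if any, and wait until its scrub
 * context has been detached.  Returns -ENOTCONN if no scrub was running
 * on the device.
 */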
4293int btrfs_scrub_cancel_dev(struct btrfs_device *dev)
4294{
4295 struct btrfs_fs_info *fs_info = dev->fs_info;
4296 struct scrub_ctx *sctx;
4297
4298 mutex_lock(&fs_info->scrub_lock);
4299 sctx = dev->scrub_ctx;
4300 if (!sctx) {
4301 mutex_unlock(&fs_info->scrub_lock);
4302 return -ENOTCONN;
4303 }
4304 atomic_inc(&sctx->cancel_req);
4305 while (dev->scrub_ctx) {
4306 mutex_unlock(&fs_info->scrub_lock);
4307 wait_event(fs_info->scrub_pause_wait,
4308 dev->scrub_ctx == NULL);
4309 mutex_lock(&fs_info->scrub_lock);
4310 }
4311 mutex_unlock(&fs_info->scrub_lock);
4312
4313 return 0;
4314}
4315
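/*
 * Copy the statistics of the scrub currently running on @devid into
 * @progress.  Returns -ENODEV if the device does not exist and -ENOTCONN
 * if no scrub is running on it.
 */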
4316int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
4317 struct btrfs_scrub_progress *progress)
4318{
4319 struct btrfs_dev_lookup_args args = { .devid = devid };
4320 struct btrfs_device *dev;
4321 struct scrub_ctx *sctx = NULL;
4322
4323 mutex_lock(&fs_info->fs_devices->device_list_mutex);
4324 dev = btrfs_find_device(fs_info->fs_devices, &args);
4325 if (dev)
4326 sctx = dev->scrub_ctx;
4327 if (sctx)
4328 memcpy(progress, &sctx->stat, sizeof(*progress));
4329 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4330
4331 return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
4332}
4333
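/*
 * Map @extent_logical and report the physical offset, device and mirror
 * number of the first stripe of a usable copy.  If the mapping fails or
 * the stripe's device has no bdev, the output parameters are left
 * untouched (callers presumably rely on their initial values).
 */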
4334static void scrub_find_good_copy(struct btrfs_fs_info *fs_info,
4335 u64 extent_logical, u32 extent_len,
4336 u64 *extent_physical,
4337 struct btrfs_device **extent_dev,
4338 int *extent_mirror_num)
4339{
4340 u64 mapped_length;
4341 struct btrfs_io_context *bioc = NULL;
4342 int ret;
4343
4344 mapped_length = extent_len;
4345 ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_logical,
4346 &mapped_length, &bioc, 0);
4347 if (ret || !bioc || mapped_length < extent_len ||
4348 !bioc->stripes[0].dev->bdev) {
4349 btrfs_put_bioc(bioc);
4350 return;
4351 }
4352
4353 *extent_physical = bioc->stripes[0].physical;
4354 *extent_mirror_num = bioc->mirror_num;
4355 *extent_dev = bioc->stripes[0].dev;
4356 btrfs_put_bioc(bioc);
4357}