// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2012 Fusion-io All rights reserved.
 * Copyright (C) 2012 Intel Corp. All rights reserved.
 */

#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/raid/pq.h>
#include <linux/hash.h>
#include <linux/list_sort.h>
#include <linux/raid/xor.h>
#include <linux/mm.h>
#include "messages.h"
#include "ctree.h"
#include "disk-io.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "file-item.h"
#include "btrfs_inode.h"

/* set when additional merges to this rbio are not allowed */
#define RBIO_RMW_LOCKED_BIT	1

/*
 * set when this rbio is sitting in the hash, but it is just a cache
 * of past RMW
 */
#define RBIO_CACHE_BIT		2

/*
 * set when it is safe to trust the stripe_pages for caching
 */
#define RBIO_CACHE_READY_BIT	3

#define RBIO_CACHE_SIZE 1024

#define BTRFS_STRIPE_HASH_TABLE_BITS	11

static void dump_bioc(const struct btrfs_fs_info *fs_info, const struct btrfs_io_context *bioc)
{
	if (unlikely(!bioc)) {
		btrfs_crit(fs_info, "bioc=NULL");
		return;
	}
	btrfs_crit(fs_info,
"bioc logical=%llu full_stripe=%llu size=%llu map_type=0x%llx mirror=%u replace_nr_stripes=%u replace_stripe_src=%d num_stripes=%u",
		bioc->logical, bioc->full_stripe_logical, bioc->size,
		bioc->map_type, bioc->mirror_num, bioc->replace_nr_stripes,
		bioc->replace_stripe_src, bioc->num_stripes);
	for (int i = 0; i < bioc->num_stripes; i++) {
		btrfs_crit(fs_info, " nr=%d devid=%llu physical=%llu",
			   i, bioc->stripes[i].dev->devid,
			   bioc->stripes[i].physical);
	}
}

static void btrfs_dump_rbio(const struct btrfs_fs_info *fs_info,
			    const struct btrfs_raid_bio *rbio)
{
	if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
		return;

	dump_bioc(fs_info, rbio->bioc);
	btrfs_crit(fs_info,
"rbio flags=0x%lx nr_sectors=%u nr_data=%u real_stripes=%u stripe_nsectors=%u sector_nsteps=%u scrubp=%u dbitmap=0x%lx",
		rbio->flags, rbio->nr_sectors, rbio->nr_data,
		rbio->real_stripes, rbio->stripe_nsectors,
		rbio->sector_nsteps, rbio->scrubp, rbio->dbitmap);
}

#define ASSERT_RBIO(expr, rbio)						\
({									\
	if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) {	\
		const struct btrfs_fs_info *__fs_info = (rbio)->bioc ?	\
			(rbio)->bioc->fs_info : NULL;			\
									\
		btrfs_dump_rbio(__fs_info, (rbio));			\
	}								\
	ASSERT((expr));							\
})

#define ASSERT_RBIO_STRIPE(expr, rbio, stripe_nr)			\
({									\
	if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) {	\
		const struct btrfs_fs_info *__fs_info = (rbio)->bioc ?	\
			(rbio)->bioc->fs_info : NULL;			\
									\
		btrfs_dump_rbio(__fs_info, (rbio));			\
		btrfs_crit(__fs_info, "stripe_nr=%d", (stripe_nr));	\
	}								\
	ASSERT((expr));							\
})

#define ASSERT_RBIO_SECTOR(expr, rbio, sector_nr)			\
({									\
	if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) {	\
		const struct btrfs_fs_info *__fs_info = (rbio)->bioc ?	\
			(rbio)->bioc->fs_info : NULL;			\
									\
		btrfs_dump_rbio(__fs_info, (rbio));			\
		btrfs_crit(__fs_info, "sector_nr=%d", (sector_nr));	\
	}								\
	ASSERT((expr));							\
})

#define ASSERT_RBIO_LOGICAL(expr, rbio, logical)			\
({									\
	if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) {	\
		const struct btrfs_fs_info *__fs_info = (rbio)->bioc ?	\
			(rbio)->bioc->fs_info : NULL;			\
									\
		btrfs_dump_rbio(__fs_info, (rbio));			\
		btrfs_crit(__fs_info, "logical=%llu", (logical));	\
	}								\
	ASSERT((expr));							\
})

/* Used by the raid56 code to lock stripes for read/modify/write */
struct btrfs_stripe_hash {
	struct list_head hash_list;
	spinlock_t lock;
};

/* Used by the raid56 code to lock stripes for read/modify/write */
struct btrfs_stripe_hash_table {
	struct list_head stripe_cache;
	spinlock_t cache_lock;
	int cache_size;
	struct btrfs_stripe_hash table[];
};

/*
 * The PFN may still be valid, but our paddrs should always be block size
 * aligned, thus such -1 paddr is definitely not a valid one.
 */
#define INVALID_PADDR	(~(phys_addr_t)0)

static void rmw_rbio_work(struct work_struct *work);
static void rmw_rbio_work_locked(struct work_struct *work);
static void index_rbio_pages(struct btrfs_raid_bio *rbio);
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);

static int finish_parity_scrub(struct btrfs_raid_bio *rbio);
static void scrub_rbio_work_locked(struct work_struct *work);

static void free_raid_bio_pointers(struct btrfs_raid_bio *rbio)
{
	bitmap_free(rbio->error_bitmap);
	bitmap_free(rbio->stripe_uptodate_bitmap);
	kfree(rbio->stripe_pages);
	kfree(rbio->bio_paddrs);
	kfree(rbio->stripe_paddrs);
	kfree(rbio->finish_pointers);
}

static void free_raid_bio(struct btrfs_raid_bio *rbio)
{
	int i;

	if (!refcount_dec_and_test(&rbio->refs))
		return;

	WARN_ON(!list_empty(&rbio->stripe_cache));
	WARN_ON(!list_empty(&rbio->hash_list));
	WARN_ON(!bio_list_empty(&rbio->bio_list));

	for (i = 0; i < rbio->nr_pages; i++) {
		if (rbio->stripe_pages[i]) {
			__free_page(rbio->stripe_pages[i]);
			rbio->stripe_pages[i] = NULL;
		}
	}

	btrfs_put_bioc(rbio->bioc);
	free_raid_bio_pointers(rbio);
	kfree(rbio);
}

static void start_async_work(struct btrfs_raid_bio *rbio, work_func_t work_func)
{
	INIT_WORK(&rbio->work, work_func);
	queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work);
}

/*
 * the stripe hash table is used for locking, and to collect
 * bios in hopes of making a full stripe
 */
int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
{
	struct btrfs_stripe_hash_table *table;
	struct btrfs_stripe_hash_table *x;
	struct btrfs_stripe_hash *cur;
	struct btrfs_stripe_hash *h;
	unsigned int num_entries = 1U << BTRFS_STRIPE_HASH_TABLE_BITS;

	if (info->stripe_hash_table)
		return 0;

	/*
	 * The table is large, starting with order 4 and can go as high as
	 * order 7 in case lock debugging is turned on.
	 *
	 * Try harder to allocate and fallback to vmalloc to lower the chance
	 * of a failing mount.
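	 *
	 * kvzalloc() provides exactly that behaviour: it first attempts a
	 * regular kmalloc() and transparently falls back to vmalloc() when
	 * the physically contiguous allocation fails.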
	 */
	table = kvzalloc(struct_size(table, table, num_entries), GFP_KERNEL);
	if (!table)
		return -ENOMEM;

	spin_lock_init(&table->cache_lock);
	INIT_LIST_HEAD(&table->stripe_cache);

	h = table->table;

	for (unsigned int i = 0; i < num_entries; i++) {
		cur = h + i;
		INIT_LIST_HEAD(&cur->hash_list);
		spin_lock_init(&cur->lock);
	}

	x = cmpxchg(&info->stripe_hash_table, NULL, table);
	kvfree(x);
	return 0;
}

static void memcpy_from_bio_to_stripe(struct btrfs_raid_bio *rbio, unsigned int sector_nr)
{
	const u32 step = min(rbio->bioc->fs_info->sectorsize, PAGE_SIZE);

	ASSERT(sector_nr < rbio->nr_sectors);
	for (int i = 0; i < rbio->sector_nsteps; i++) {
		unsigned int index = sector_nr * rbio->sector_nsteps + i;
		phys_addr_t dst = rbio->stripe_paddrs[index];
		phys_addr_t src = rbio->bio_paddrs[index];

		ASSERT(dst != INVALID_PADDR);
		ASSERT(src != INVALID_PADDR);

		memcpy_page(phys_to_page(dst), offset_in_page(dst),
			    phys_to_page(src), offset_in_page(src), step);
	}
}

/*
 * caching an rbio means to copy anything from the
 * bio_sectors array into the stripe_pages array.  We
 * use the page uptodate bit in the stripe cache array
 * to indicate if it has valid data
 *
 * once the caching is done, we set the cache ready
 * bit.
 */
static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
{
	int i;
	int ret;

	ret = alloc_rbio_pages(rbio);
	if (ret)
		return;

	for (i = 0; i < rbio->nr_sectors; i++) {
		/* Some range not covered by bio (partial write), skip it */
		if (rbio->bio_paddrs[i * rbio->sector_nsteps] == INVALID_PADDR) {
			/*
			 * Even if the sector is not covered by bio, if it is
			 * a data sector it should still be uptodate as it is
			 * read from disk.
			 */
			if (i < rbio->nr_data * rbio->stripe_nsectors)
				ASSERT(test_bit(i, rbio->stripe_uptodate_bitmap));
			continue;
		}

		memcpy_from_bio_to_stripe(rbio, i);
		set_bit(i, rbio->stripe_uptodate_bitmap);
	}
	set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
}

/*
 * we hash on the first logical address of the stripe
 */
static int rbio_bucket(struct btrfs_raid_bio *rbio)
{
	u64 num = rbio->bioc->full_stripe_logical;

	/*
	 * we shift down quite a bit.  We're using byte
	 * addressing, and most of the lower bits are zeros.
	 * This tends to upset hash_64, and it consistently
	 * returns just one or two different values.
	 *
	 * shifting off the lower bits fixes things.
	 */
	return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
}

/* Get the sector number of the first sector covered by @page_nr. */
static u32 page_nr_to_sector_nr(struct btrfs_raid_bio *rbio, unsigned int page_nr)
{
	u32 sector_nr;

	ASSERT(page_nr < rbio->nr_pages);

	sector_nr = (page_nr << PAGE_SHIFT) >> rbio->bioc->fs_info->sectorsize_bits;
	ASSERT(sector_nr < rbio->nr_sectors);
	return sector_nr;
}

/*
 * Get the number of sectors covered by @page_nr.
 *
 * For bs > ps cases, the result will always be 1.
 * For bs <= ps cases, the result will be ps / bs.
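 *
 * E.g. 4K blocks with 64K pages give 16 sectors per page, while 16K blocks
 * with 4K pages give 1 (a single block spans several pages).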
 */
static u32 page_nr_to_num_sectors(struct btrfs_raid_bio *rbio, unsigned int page_nr)
{
	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
	u32 nr_sectors;

	ASSERT(page_nr < rbio->nr_pages);

	nr_sectors = round_up(PAGE_SIZE, fs_info->sectorsize) >> fs_info->sectorsize_bits;
	ASSERT(nr_sectors > 0);
	return nr_sectors;
}

static __maybe_unused bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio,
						      unsigned int page_nr)
{
	const u32 sector_nr = page_nr_to_sector_nr(rbio, page_nr);
	const u32 nr_bits = page_nr_to_num_sectors(rbio, page_nr);
	int i;

	ASSERT(page_nr < rbio->nr_pages);
	ASSERT(sector_nr + nr_bits < rbio->nr_sectors);

	for (i = sector_nr; i < sector_nr + nr_bits; i++) {
		if (!test_bit(i, rbio->stripe_uptodate_bitmap))
			return false;
	}
	return true;
}

/*
 * Update the stripe_paddrs[] array to use the correct page and offset.
 *
 * Should be called every time any page pointer in stripe_pages[] gets modified.
 */
static void index_stripe_sectors(struct btrfs_raid_bio *rbio)
{
	const u32 step = min(rbio->bioc->fs_info->sectorsize, PAGE_SIZE);
	u32 offset;
	int i;

	for (i = 0, offset = 0; i < rbio->nr_sectors * rbio->sector_nsteps;
	     i++, offset += step) {
		int page_index = offset >> PAGE_SHIFT;

		ASSERT(page_index < rbio->nr_pages);
		if (!rbio->stripe_pages[page_index])
			continue;

		rbio->stripe_paddrs[i] = page_to_phys(rbio->stripe_pages[page_index]) +
					 offset_in_page(offset);
	}
}

static void steal_rbio_page(struct btrfs_raid_bio *src,
			    struct btrfs_raid_bio *dest, int page_nr)
{
	const u32 sector_nr = page_nr_to_sector_nr(src, page_nr);
	const u32 nr_bits = page_nr_to_num_sectors(src, page_nr);

	ASSERT(page_nr < src->nr_pages);
	ASSERT(sector_nr + nr_bits < src->nr_sectors);

	if (dest->stripe_pages[page_nr])
		__free_page(dest->stripe_pages[page_nr]);
	dest->stripe_pages[page_nr] = src->stripe_pages[page_nr];
	src->stripe_pages[page_nr] = NULL;

	/* Also update the stripe_uptodate_bitmap bits. */
	bitmap_set(dest->stripe_uptodate_bitmap, sector_nr, nr_bits);
}

static bool is_data_stripe_page(struct btrfs_raid_bio *rbio, int page_nr)
{
	const int sector_nr = page_nr_to_sector_nr(rbio, page_nr);

	/*
	 * We have ensured PAGE_SIZE is aligned with sectorsize, thus
	 * we won't have a page which is half data half parity.
	 *
	 * Thus if the first sector of the page belongs to data stripes, then
	 * the full page belongs to data stripes.
	 */
	return (sector_nr < rbio->nr_data * rbio->stripe_nsectors);
}

/*
 * Stealing an rbio means taking all the uptodate pages from the stripe array
 * in the source rbio and putting them into the destination rbio.
 *
 * This will also update the involved stripe_paddrs[] which are referring to
 * the old pages.
 */
static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
{
	int i;

	if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
		return;

	for (i = 0; i < dest->nr_pages; i++) {
		struct page *p = src->stripe_pages[i];

		/*
		 * We don't need to steal P/Q pages as they will always be
		 * regenerated for RMW or full write anyway.
		 */
		if (!is_data_stripe_page(src, i))
			continue;

		/*
		 * If @src already has RBIO_CACHE_READY_BIT, it should have
		 * all data stripe pages present and uptodate.
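		 * (cache_rbio_pages() sets that bit only after every data
		 * sector has either been copied from the bio list or was
		 * already marked uptodate from a previous read.)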
		 */
		ASSERT(p);
		ASSERT(full_page_sectors_uptodate(src, i));
		steal_rbio_page(src, dest, i);
	}
	index_stripe_sectors(dest);
	index_stripe_sectors(src);
}

/*
 * merging means we take the bio_list from the victim and
 * splice it into the destination.  The victim should
 * be discarded afterwards.
 *
 * must be called with dest->bio_list_lock held
 */
static void merge_rbio(struct btrfs_raid_bio *dest,
		       struct btrfs_raid_bio *victim)
{
	bio_list_merge_init(&dest->bio_list, &victim->bio_list);
	dest->bio_list_bytes += victim->bio_list_bytes;
	/* Also inherit the bitmaps from @victim. */
	bitmap_or(&dest->dbitmap, &victim->dbitmap, &dest->dbitmap,
		  dest->stripe_nsectors);
}

/*
 * used to prune items that are in the cache.  The caller
 * must hold the hash table lock.
 */
static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
{
	int bucket = rbio_bucket(rbio);
	struct btrfs_stripe_hash_table *table;
	struct btrfs_stripe_hash *h;
	int freeit = 0;

	/*
	 * check the bit again under the hash table lock.
	 */
	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
		return;

	table = rbio->bioc->fs_info->stripe_hash_table;
	h = table->table + bucket;

	/* hold the lock for the bucket because we may be
	 * removing it from the hash table
	 */
	spin_lock(&h->lock);

	/*
	 * hold the lock for the bio list because we need
	 * to make sure the bio list is empty
	 */
	spin_lock(&rbio->bio_list_lock);

	if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
		list_del_init(&rbio->stripe_cache);
		table->cache_size -= 1;
		freeit = 1;

		/* if the bio list isn't empty, this rbio is
		 * still involved in an IO.  We take it out
		 * of the cache list, and drop the ref that
		 * was held for the list.
		 *
		 * If the bio_list was empty, we also remove
		 * the rbio from the hash_table, and drop
		 * the corresponding ref
		 */
		if (bio_list_empty(&rbio->bio_list)) {
			if (!list_empty(&rbio->hash_list)) {
				list_del_init(&rbio->hash_list);
				refcount_dec(&rbio->refs);
				BUG_ON(!list_empty(&rbio->plug_list));
			}
		}
	}

	spin_unlock(&rbio->bio_list_lock);
	spin_unlock(&h->lock);

	if (freeit)
		free_raid_bio(rbio);
}

/*
 * prune a given rbio from the cache
 */
static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
{
	struct btrfs_stripe_hash_table *table;

	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
		return;

	table = rbio->bioc->fs_info->stripe_hash_table;

	spin_lock(&table->cache_lock);
	__remove_rbio_from_cache(rbio);
	spin_unlock(&table->cache_lock);
}

/*
 * remove everything in the cache
 */
static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
{
	struct btrfs_stripe_hash_table *table;
	struct btrfs_raid_bio *rbio;

	table = info->stripe_hash_table;

	spin_lock(&table->cache_lock);
	while (!list_empty(&table->stripe_cache)) {
		rbio = list_first_entry(&table->stripe_cache,
					struct btrfs_raid_bio, stripe_cache);
		__remove_rbio_from_cache(rbio);
	}
	spin_unlock(&table->cache_lock);
}

/*
 * remove all cached entries and free the hash table
 * used by unmount
 */
void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
{
	if (!info->stripe_hash_table)
		return;
	btrfs_clear_rbio_cache(info);
	kvfree(info->stripe_hash_table);
	info->stripe_hash_table = NULL;
}

/*
 * insert an rbio into the stripe cache.  It
 * must have already been prepared by calling
 * cache_rbio_pages
 *
 * If this rbio was already cached, it gets
 * moved to the front of the lru.
 *
 * If the size of the rbio cache is too big, we
 * prune an item.
 */
static void cache_rbio(struct btrfs_raid_bio *rbio)
{
	struct btrfs_stripe_hash_table *table;

	if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
		return;

	table = rbio->bioc->fs_info->stripe_hash_table;

	spin_lock(&table->cache_lock);
	spin_lock(&rbio->bio_list_lock);

	/* bump our ref if we were not in the list before */
	if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
		refcount_inc(&rbio->refs);

	if (!list_empty(&rbio->stripe_cache)) {
		list_move(&rbio->stripe_cache, &table->stripe_cache);
	} else {
		list_add(&rbio->stripe_cache, &table->stripe_cache);
		table->cache_size += 1;
	}

	spin_unlock(&rbio->bio_list_lock);

	if (table->cache_size > RBIO_CACHE_SIZE) {
		struct btrfs_raid_bio *found;

		found = list_last_entry(&table->stripe_cache,
					struct btrfs_raid_bio,
					stripe_cache);

		if (found != rbio)
			__remove_rbio_from_cache(found);
	}

	spin_unlock(&table->cache_lock);
}

/*
 * helper function to run the xor_blocks api.  It is only
 * able to do MAX_XOR_BLOCKS at a time, so we need to
 * loop through.
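 *
 * The @pages array is laid out as [src0 .. src(src_cnt - 1), dest]; see the
 * callers in generate_pq_vertical_step() and recover_vertical_step().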
 */
static void run_xor(void **pages, int src_cnt, ssize_t len)
{
	int src_off = 0;
	int xor_src_cnt = 0;
	void *dest = pages[src_cnt];

	while (src_cnt > 0) {
		xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
		xor_blocks(xor_src_cnt, len, dest, pages + src_off);

		src_cnt -= xor_src_cnt;
		src_off += xor_src_cnt;
	}
}

/*
 * Returns true if the bio list inside this rbio covers an entire stripe (no
 * rmw required).
 */
static int rbio_is_full(struct btrfs_raid_bio *rbio)
{
	unsigned long size = rbio->bio_list_bytes;
	int ret = 1;

	spin_lock(&rbio->bio_list_lock);
	if (size != rbio->nr_data * BTRFS_STRIPE_LEN)
		ret = 0;
	BUG_ON(size > rbio->nr_data * BTRFS_STRIPE_LEN);
	spin_unlock(&rbio->bio_list_lock);

	return ret;
}

/*
 * returns 1 if it is safe to merge two rbios together.
 * The merging is safe if the two rbios correspond to
 * the same stripe and if they are both going in the same
 * direction (read vs write), and if neither one is
 * locked for final IO
 *
 * The caller is responsible for locking such that
 * rmw_locked is safe to test
 */
static int rbio_can_merge(struct btrfs_raid_bio *last,
			  struct btrfs_raid_bio *cur)
{
	if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
	    test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
		return 0;

	/*
	 * we can't merge with cached rbios, since the
	 * idea is that when we merge the destination
	 * rbio is going to run our IO for us.  We can
	 * steal from cached rbios though, other functions
	 * handle that.
	 */
	if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
	    test_bit(RBIO_CACHE_BIT, &cur->flags))
		return 0;

	if (last->bioc->full_stripe_logical != cur->bioc->full_stripe_logical)
		return 0;

	/* we can't merge with different operations */
	if (last->operation != cur->operation)
		return 0;
	/*
	 * We need to read the full stripe from the drive, check and repair
	 * the parity, and write the new results back.
	 *
	 * We're not allowed to add any new bios to the
	 * bio list here, anyone else that wants to
	 * change this stripe needs to do their own rmw.
	 */
	if (last->operation == BTRFS_RBIO_PARITY_SCRUB)
		return 0;

	if (last->operation == BTRFS_RBIO_READ_REBUILD)
		return 0;

	return 1;
}

/* Return the sector index for @stripe_nr and @sector_nr. */
static unsigned int rbio_sector_index(const struct btrfs_raid_bio *rbio,
				      unsigned int stripe_nr,
				      unsigned int sector_nr)
{
	unsigned int ret;

	ASSERT_RBIO_STRIPE(stripe_nr < rbio->real_stripes, rbio, stripe_nr);
	ASSERT_RBIO_SECTOR(sector_nr < rbio->stripe_nsectors, rbio, sector_nr);

	ret = stripe_nr * rbio->stripe_nsectors + sector_nr;
	ASSERT(ret < rbio->nr_sectors);
	return ret;
}

/*
 * Return the paddr array index for @stripe_nr, @sector_nr and @step_nr.
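 *
 * The resulting index is:
 *   (stripe_nr * stripe_nsectors + sector_nr) * sector_nsteps + step_nr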
 */
static unsigned int rbio_paddr_index(const struct btrfs_raid_bio *rbio,
				     unsigned int stripe_nr,
				     unsigned int sector_nr,
				     unsigned int step_nr)
{
	unsigned int ret;

	ASSERT_RBIO_SECTOR(step_nr < rbio->sector_nsteps, rbio, step_nr);

	ret = rbio_sector_index(rbio, stripe_nr, sector_nr) * rbio->sector_nsteps + step_nr;
	ASSERT(ret < rbio->nr_sectors * rbio->sector_nsteps);
	return ret;
}

static phys_addr_t rbio_stripe_paddr(const struct btrfs_raid_bio *rbio,
				     unsigned int stripe_nr, unsigned int sector_nr,
				     unsigned int step_nr)
{
	return rbio->stripe_paddrs[rbio_paddr_index(rbio, stripe_nr, sector_nr, step_nr)];
}

static phys_addr_t rbio_pstripe_paddr(const struct btrfs_raid_bio *rbio,
				      unsigned int sector_nr, unsigned int step_nr)
{
	return rbio_stripe_paddr(rbio, rbio->nr_data, sector_nr, step_nr);
}

static phys_addr_t rbio_qstripe_paddr(const struct btrfs_raid_bio *rbio,
				      unsigned int sector_nr, unsigned int step_nr)
{
	if (rbio->nr_data + 1 == rbio->real_stripes)
		return INVALID_PADDR;
	return rbio_stripe_paddr(rbio, rbio->nr_data + 1, sector_nr, step_nr);
}

/* Return a paddr pointer into the rbio::stripe_paddrs[] for the specified sector. */
static phys_addr_t *rbio_stripe_paddrs(const struct btrfs_raid_bio *rbio,
				       unsigned int stripe_nr, unsigned int sector_nr)
{
	return &rbio->stripe_paddrs[rbio_paddr_index(rbio, stripe_nr, sector_nr, 0)];
}

/*
 * The first stripe in the table for a logical address
 * has the lock.  rbios are added in one of three ways:
 *
 * 1) Nobody has the stripe locked yet.  The rbio is given
 * the lock and 0 is returned.  The caller must start the IO
 * themselves.
 *
 * 2) Someone has the stripe locked, but we're able to merge
 * with the lock owner.  The rbio is freed and the IO will
 * start automatically along with the existing rbio.  1 is returned.
 *
 * 3) Someone has the stripe locked, but we're not able to merge.
 * The rbio is added to the lock owner's plug list, or merged into
 * an rbio already on the plug list.  When the lock owner unlocks,
 * the next rbio on the list is run and the IO is started automatically.
 * 1 is returned
 *
 * If we return 0, the caller still owns the rbio and must continue with
 * IO submission.  If we return 1, the caller must assume the rbio has
 * already been freed.
 */
static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
{
	struct btrfs_stripe_hash *h;
	struct btrfs_raid_bio *cur;
	struct btrfs_raid_bio *pending;
	struct btrfs_raid_bio *freeit = NULL;
	struct btrfs_raid_bio *cache_drop = NULL;
	int ret = 0;

	h = rbio->bioc->fs_info->stripe_hash_table->table + rbio_bucket(rbio);

	spin_lock(&h->lock);
	list_for_each_entry(cur, &h->hash_list, hash_list) {
		if (cur->bioc->full_stripe_logical != rbio->bioc->full_stripe_logical)
			continue;

		spin_lock(&cur->bio_list_lock);

		/* Can we steal this cached rbio's pages? */
		if (bio_list_empty(&cur->bio_list) &&
		    list_empty(&cur->plug_list) &&
		    test_bit(RBIO_CACHE_BIT, &cur->flags) &&
		    !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
			list_del_init(&cur->hash_list);
			refcount_dec(&cur->refs);

			steal_rbio(cur, rbio);
			cache_drop = cur;
			spin_unlock(&cur->bio_list_lock);

			goto lockit;
		}

		/* Can we merge into the lock owner? */
		if (rbio_can_merge(cur, rbio)) {
			merge_rbio(cur, rbio);
			spin_unlock(&cur->bio_list_lock);
			freeit = rbio;
			ret = 1;
			goto out;
		}


		/*
		 * We couldn't merge with the running rbio, see if we can merge
		 * with the pending ones.  We don't have to check for rmw_locked
		 * because there is no way they are inside finish_rmw right now
		 */
		list_for_each_entry(pending, &cur->plug_list, plug_list) {
			if (rbio_can_merge(pending, rbio)) {
				merge_rbio(pending, rbio);
				spin_unlock(&cur->bio_list_lock);
				freeit = rbio;
				ret = 1;
				goto out;
			}
		}

		/*
		 * No merging, put us on the tail of the plug list, our rbio
		 * will be started when the currently running rbio unlocks
		 */
		list_add_tail(&rbio->plug_list, &cur->plug_list);
		spin_unlock(&cur->bio_list_lock);
		ret = 1;
		goto out;
	}
lockit:
	refcount_inc(&rbio->refs);
	list_add(&rbio->hash_list, &h->hash_list);
out:
	spin_unlock(&h->lock);
	if (cache_drop)
		remove_rbio_from_cache(cache_drop);
	if (freeit)
		free_raid_bio(freeit);
	return ret;
}

static void recover_rbio_work_locked(struct work_struct *work);

/*
 * called as rmw or parity rebuild is completed.  If the plug list has more
 * rbios waiting for this stripe, the next one on the list will be started
 */
static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
{
	int bucket;
	struct btrfs_stripe_hash *h;
	int keep_cache = 0;

	bucket = rbio_bucket(rbio);
	h = rbio->bioc->fs_info->stripe_hash_table->table + bucket;

	if (list_empty(&rbio->plug_list))
		cache_rbio(rbio);

	spin_lock(&h->lock);
	spin_lock(&rbio->bio_list_lock);

	if (!list_empty(&rbio->hash_list)) {
		/*
		 * if we're still cached and there is no other IO
		 * to perform, just leave this rbio here for others
		 * to steal from later
		 */
		if (list_empty(&rbio->plug_list) &&
		    test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
			keep_cache = 1;
			clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
			BUG_ON(!bio_list_empty(&rbio->bio_list));
			goto done;
		}

		list_del_init(&rbio->hash_list);
		refcount_dec(&rbio->refs);

		/*
		 * we use the plug list to hold all the rbios
		 * waiting for the chance to lock this stripe.
		 * hand the lock over to one of them.
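		 *
		 * The new lock owner is queued to the rmw workers instead of
		 * being run inline from this context.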
		 */
		if (!list_empty(&rbio->plug_list)) {
			struct btrfs_raid_bio *next;
			struct list_head *head = rbio->plug_list.next;

			next = list_entry(head, struct btrfs_raid_bio,
					  plug_list);

			list_del_init(&rbio->plug_list);

			list_add(&next->hash_list, &h->hash_list);
			refcount_inc(&next->refs);
			spin_unlock(&rbio->bio_list_lock);
			spin_unlock(&h->lock);

			if (next->operation == BTRFS_RBIO_READ_REBUILD) {
				start_async_work(next, recover_rbio_work_locked);
			} else if (next->operation == BTRFS_RBIO_WRITE) {
				steal_rbio(rbio, next);
				start_async_work(next, rmw_rbio_work_locked);
			} else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
				steal_rbio(rbio, next);
				start_async_work(next, scrub_rbio_work_locked);
			}

			goto done_nolock;
		}
	}
done:
	spin_unlock(&rbio->bio_list_lock);
	spin_unlock(&h->lock);

done_nolock:
	if (!keep_cache)
		remove_rbio_from_cache(rbio);
}

static void rbio_endio_bio_list(struct bio *cur, blk_status_t status)
{
	struct bio *next;

	while (cur) {
		next = cur->bi_next;
		cur->bi_next = NULL;
		cur->bi_status = status;
		bio_endio(cur);
		cur = next;
	}
}

/*
 * this frees the rbio and runs through all the bios in the
 * bio_list and calls end_io on them
 */
static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t status)
{
	struct bio *cur = bio_list_get(&rbio->bio_list);
	struct bio *extra;

	kfree(rbio->csum_buf);
	bitmap_free(rbio->csum_bitmap);
	rbio->csum_buf = NULL;
	rbio->csum_bitmap = NULL;

	/*
	 * Clear the data bitmap, as the rbio may be cached for later usage.
	 * Do this before unlock_stripe() so there will be no new bio
	 * for this bio.
	 */
	bitmap_clear(&rbio->dbitmap, 0, rbio->stripe_nsectors);

	/*
	 * At this moment, rbio->bio_list is empty, however since rbio does not
	 * always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the
	 * hash list, rbio may be merged with others so that rbio->bio_list
	 * becomes non-empty.
	 * Once unlock_stripe() is done, rbio->bio_list will not be updated any
	 * more and we can call bio_endio() on all queued bios.
	 */
	unlock_stripe(rbio);
	extra = bio_list_get(&rbio->bio_list);
	free_raid_bio(rbio);

	rbio_endio_bio_list(cur, status);
	if (extra)
		rbio_endio_bio_list(extra, status);
}

/*
 * Get paddr pointer for the sector specified by its @stripe_nr and @sector_nr.
 *
 * @rbio:		The raid bio
 * @stripe_nr:		Stripe number, valid range [0, real_stripe)
 * @sector_nr:		Sector number inside the stripe,
 *			valid range [0, stripe_nsectors)
 * @bio_list_only:	Whether to use sectors inside the bio list only.
 *
 * The read/modify/write code wants to reuse the original bio page as much
 * as possible, and only use stripe_paddrs[] as fallback.
 *
 * Return NULL if bio_list_only is set but the specified sector has no
 * corresponding bio.
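 *
 * If @bio_list_only is not set and the bio list does not cover the sector,
 * a pointer into the rbio's own stripe_paddrs[] is returned instead.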
 */
static phys_addr_t *sector_paddrs_in_rbio(struct btrfs_raid_bio *rbio,
					  int stripe_nr, int sector_nr,
					  bool bio_list_only)
{
	phys_addr_t *ret = NULL;
	const int index = rbio_paddr_index(rbio, stripe_nr, sector_nr, 0);

	ASSERT(index >= 0 && index < rbio->nr_sectors * rbio->sector_nsteps);

	scoped_guard(spinlock, &rbio->bio_list_lock) {
		if (rbio->bio_paddrs[index] != INVALID_PADDR || bio_list_only) {
			/* Don't return sector without a valid page pointer */
			if (rbio->bio_paddrs[index] != INVALID_PADDR)
				ret = &rbio->bio_paddrs[index];
			return ret;
		}
	}
	return &rbio->stripe_paddrs[index];
}

/*
 * Similar to sector_paddrs_in_rbio(), but with extra consideration for
 * bs > ps cases, where we can have multiple steps for a fs block.
 */
static phys_addr_t sector_paddr_in_rbio(struct btrfs_raid_bio *rbio,
					int stripe_nr, int sector_nr, int step_nr,
					bool bio_list_only)
{
	phys_addr_t ret = INVALID_PADDR;
	const int index = rbio_paddr_index(rbio, stripe_nr, sector_nr, step_nr);

	ASSERT(index >= 0 && index < rbio->nr_sectors * rbio->sector_nsteps);

	scoped_guard(spinlock, &rbio->bio_list_lock) {
		if (rbio->bio_paddrs[index] != INVALID_PADDR || bio_list_only) {
			/* Don't return sector without a valid page pointer */
			if (rbio->bio_paddrs[index] != INVALID_PADDR)
				ret = rbio->bio_paddrs[index];
			return ret;
		}
	}
	return rbio->stripe_paddrs[index];
}

/*
 * allocation and initial setup for the btrfs_raid_bio.  Note that this does
 * not allocate any pages for rbio->pages.
 */
static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
					 struct btrfs_io_context *bioc)
{
	const unsigned int real_stripes = bioc->num_stripes - bioc->replace_nr_stripes;
	const unsigned int stripe_npages = BTRFS_STRIPE_LEN >> PAGE_SHIFT;
	const unsigned int num_pages = stripe_npages * real_stripes;
	const unsigned int stripe_nsectors =
		BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits;
	const unsigned int num_sectors = stripe_nsectors * real_stripes;
	const unsigned int step = min(fs_info->sectorsize, PAGE_SIZE);
	const unsigned int sector_nsteps = fs_info->sectorsize / step;
	struct btrfs_raid_bio *rbio;

	/*
	 * For bs <= ps cases, ps must be aligned to bs.
	 * For bs > ps cases, bs must be aligned to ps.
	 */
	ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize) ||
	       IS_ALIGNED(fs_info->sectorsize, PAGE_SIZE));
	/*
	 * Our current stripe len should be fixed to 64k thus stripe_nsectors
	 * (at most 16) should be no larger than BITS_PER_LONG.
	 */
	ASSERT(stripe_nsectors <= BITS_PER_LONG);

	/*
	 * Real stripes must be between 2 (2 disks RAID5, aka RAID1) and 256
	 * (limited by u8).
	 */
	ASSERT(real_stripes >= 2);
	ASSERT(real_stripes <= U8_MAX);

	rbio = kzalloc(sizeof(*rbio), GFP_NOFS);
	if (!rbio)
		return ERR_PTR(-ENOMEM);
	rbio->stripe_pages = kcalloc(num_pages, sizeof(struct page *),
				     GFP_NOFS);
	rbio->bio_paddrs = kcalloc(num_sectors * sector_nsteps, sizeof(phys_addr_t), GFP_NOFS);
	rbio->stripe_paddrs = kcalloc(num_sectors * sector_nsteps, sizeof(phys_addr_t), GFP_NOFS);
	rbio->finish_pointers = kcalloc(real_stripes, sizeof(void *), GFP_NOFS);
	rbio->error_bitmap = bitmap_zalloc(num_sectors, GFP_NOFS);
	rbio->stripe_uptodate_bitmap = bitmap_zalloc(num_sectors, GFP_NOFS);

	if (!rbio->stripe_pages || !rbio->bio_paddrs || !rbio->stripe_paddrs ||
	    !rbio->finish_pointers || !rbio->error_bitmap || !rbio->stripe_uptodate_bitmap) {
		free_raid_bio_pointers(rbio);
		kfree(rbio);
		return ERR_PTR(-ENOMEM);
	}
	for (int i = 0; i < num_sectors * sector_nsteps; i++) {
		rbio->stripe_paddrs[i] = INVALID_PADDR;
		rbio->bio_paddrs[i] = INVALID_PADDR;
	}

	bio_list_init(&rbio->bio_list);
	init_waitqueue_head(&rbio->io_wait);
	INIT_LIST_HEAD(&rbio->plug_list);
	spin_lock_init(&rbio->bio_list_lock);
	INIT_LIST_HEAD(&rbio->stripe_cache);
	INIT_LIST_HEAD(&rbio->hash_list);
	btrfs_get_bioc(bioc);
	rbio->bioc = bioc;
	rbio->nr_pages = num_pages;
	rbio->nr_sectors = num_sectors;
	rbio->real_stripes = real_stripes;
	rbio->stripe_npages = stripe_npages;
	rbio->stripe_nsectors = stripe_nsectors;
	rbio->sector_nsteps = sector_nsteps;
	refcount_set(&rbio->refs, 1);
	atomic_set(&rbio->stripes_pending, 0);

	ASSERT(btrfs_nr_parity_stripes(bioc->map_type));
	rbio->nr_data = real_stripes - btrfs_nr_parity_stripes(bioc->map_type);
	ASSERT(rbio->nr_data > 0);

	return rbio;
}

/* allocate pages for all the stripes in the bio, including parity */
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
{
	int ret;

	ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages, false);
	if (ret < 0)
		return ret;
	/* Mapping all sectors */
	index_stripe_sectors(rbio);
	return 0;
}

/* only allocate pages for p/q stripes */
static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
{
	const int data_pages = rbio->nr_data * rbio->stripe_npages;
	int ret;

	ret = btrfs_alloc_page_array(rbio->nr_pages - data_pages,
				     rbio->stripe_pages + data_pages, false);
	if (ret < 0)
		return ret;

	index_stripe_sectors(rbio);
	return 0;
}

/*
 * Return the total number of errors found in the vertical stripe of @sector_nr.
 *
 * @faila and @failb will also be updated to the first and second stripe
 * number of the errors.
 */
static int get_rbio_vertical_errors(struct btrfs_raid_bio *rbio, int sector_nr,
				    int *faila, int *failb)
{
	int stripe_nr;
	int found_errors = 0;

	if (faila || failb) {
		/*
		 * Both @faila and @failb should be valid pointers if any of
		 * them is specified.
		 */
		ASSERT(faila && failb);
		*faila = -1;
		*failb = -1;
	}

	for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
		int total_sector_nr = stripe_nr * rbio->stripe_nsectors + sector_nr;

		if (test_bit(total_sector_nr, rbio->error_bitmap)) {
			found_errors++;
			if (faila) {
				/*
				 * Update faila and failb.
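				 * The first failed stripe number goes to
				 * @faila, the second one to @failb.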
				 */
				if (*faila < 0)
					*faila = stripe_nr;
				else if (*failb < 0)
					*failb = stripe_nr;
			}
		}
	}
	return found_errors;
}

static int bio_add_paddrs(struct bio *bio, phys_addr_t *paddrs, unsigned int nr_steps,
			  unsigned int step)
{
	int added = 0;
	int ret;

	for (int i = 0; i < nr_steps; i++) {
		ret = bio_add_page(bio, phys_to_page(paddrs[i]), step,
				   offset_in_page(paddrs[i]));
		if (ret != step)
			goto revert;
		added += ret;
	}
	return added;
revert:
	/*
	 * We don't need to revert the bvec, as the bio will be submitted
	 * immediately; as long as the size is reduced, the extra bvec will
	 * not be accessed.
	 */
	bio->bi_iter.bi_size -= added;
	return 0;
}

/*
 * Add a single sector @sector into our list of bios for IO.
 *
 * Return 0 if everything went well.
 * Return <0 for error, and no byte will be added to @rbio.
 */
static int rbio_add_io_paddrs(struct btrfs_raid_bio *rbio, struct bio_list *bio_list,
			      phys_addr_t *paddrs, unsigned int stripe_nr,
			      unsigned int sector_nr, enum req_op op)
{
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	const u32 step = min(sectorsize, PAGE_SIZE);
	struct bio *last = bio_list->tail;
	int ret;
	struct bio *bio;
	struct btrfs_io_stripe *stripe;
	u64 disk_start;

	/*
	 * Note: here stripe_nr has taken device replace into consideration,
	 * thus it can be larger than rbio->real_stripe.
	 * So here we check against bioc->num_stripes, not rbio->real_stripes.
	 */
	ASSERT_RBIO_STRIPE(stripe_nr >= 0 && stripe_nr < rbio->bioc->num_stripes,
			   rbio, stripe_nr);
	ASSERT_RBIO_SECTOR(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors,
			   rbio, sector_nr);
	ASSERT(paddrs != NULL);

	stripe = &rbio->bioc->stripes[stripe_nr];
	disk_start = stripe->physical + sector_nr * sectorsize;

	/* if the device is missing, just fail this stripe */
	if (!stripe->dev->bdev) {
		int found_errors;

		set_bit(stripe_nr * rbio->stripe_nsectors + sector_nr,
			rbio->error_bitmap);

		/*
		 * Check if we have reached tolerance early.
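		 *
		 * If this vertical stripe already has more errors than
		 * bioc->max_errors can tolerate, fail right away instead of
		 * queuing more IO.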
		 */
		found_errors = get_rbio_vertical_errors(rbio, sector_nr,
							NULL, NULL);
		if (unlikely(found_errors > rbio->bioc->max_errors))
			return -EIO;
		return 0;
	}

	/* see if we can add this page onto our existing bio */
	if (last) {
		u64 last_end = last->bi_iter.bi_sector << SECTOR_SHIFT;
		last_end += last->bi_iter.bi_size;

		/*
		 * we can't merge these if they are from different
		 * devices or if they are not contiguous
		 */
		if (last_end == disk_start && !last->bi_status &&
		    last->bi_bdev == stripe->dev->bdev) {
			ret = bio_add_paddrs(last, paddrs, rbio->sector_nsteps, step);
			if (ret == sectorsize)
				return 0;
		}
	}

	/* put a new bio on the list */
	bio = bio_alloc(stripe->dev->bdev,
			max(BTRFS_STRIPE_LEN >> PAGE_SHIFT, 1),
			op, GFP_NOFS);
	bio->bi_iter.bi_sector = disk_start >> SECTOR_SHIFT;
	bio->bi_private = rbio;

	ret = bio_add_paddrs(bio, paddrs, rbio->sector_nsteps, step);
	ASSERT(ret == sectorsize);
	bio_list_add(bio_list, bio);
	return 0;
}

static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio)
{
	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
	const u32 step = min(fs_info->sectorsize, PAGE_SIZE);
	const u32 step_bits = min(fs_info->sectorsize_bits, PAGE_SHIFT);
	struct bvec_iter iter = bio->bi_iter;
	phys_addr_t paddr;
	u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
		     rbio->bioc->full_stripe_logical;

	btrfs_bio_for_each_block(paddr, bio, &iter, step) {
		unsigned int index = (offset >> step_bits);

		rbio->bio_paddrs[index] = paddr;
		offset += step;
	}
}

/*
 * helper function to walk our bio list and populate the bio_pages array with
 * the result.  This seems expensive, but it is faster than constantly
 * searching through the bio list as we setup the IO in finish_rmw or stripe
 * reconstruction.
 *
 * This must be called before you trust the answers from page_in_rbio
 */
static void index_rbio_pages(struct btrfs_raid_bio *rbio)
{
	struct bio *bio;

	spin_lock(&rbio->bio_list_lock);
	bio_list_for_each(bio, &rbio->bio_list)
		index_one_bio(rbio, bio);

	spin_unlock(&rbio->bio_list_lock);
}

static void bio_get_trace_info(struct btrfs_raid_bio *rbio, struct bio *bio,
			       struct raid56_bio_trace_info *trace_info)
{
	const struct btrfs_io_context *bioc = rbio->bioc;
	int i;

	ASSERT(bioc);

	/* We rely on bio->bi_bdev to find the stripe number. */
	if (!bio->bi_bdev)
		goto not_found;

	for (i = 0; i < bioc->num_stripes; i++) {
		if (bio->bi_bdev != bioc->stripes[i].dev->bdev)
			continue;
		trace_info->stripe_nr = i;
		trace_info->devid = bioc->stripes[i].dev->devid;
		trace_info->offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
				     bioc->stripes[i].physical;
		return;
	}

not_found:
	trace_info->devid = -1;
	trace_info->offset = -1;
	trace_info->stripe_nr = -1;
}

static inline void bio_list_put(struct bio_list *bio_list)
{
	struct bio *bio;

	while ((bio = bio_list_pop(bio_list)))
		bio_put(bio);
}

static void assert_rbio(struct btrfs_raid_bio *rbio)
{
	if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
		return;

	/*
	 * At least two stripes (2 disks RAID5), and since real_stripes is U8,
	 * we won't go beyond 256 disks anyway.
	 */
	ASSERT_RBIO(rbio->real_stripes >= 2, rbio);
	ASSERT_RBIO(rbio->nr_data > 0, rbio);

	/*
	 * This is another check to make sure nr data stripes is smaller
	 * than total stripes.
	 */
	ASSERT_RBIO(rbio->nr_data < rbio->real_stripes, rbio);
}

static inline void *kmap_local_paddr(phys_addr_t paddr)
{
	/* The sector pointer must have a page mapped to it. */
	ASSERT(paddr != INVALID_PADDR);

	return kmap_local_page(phys_to_page(paddr)) + offset_in_page(paddr);
}

static void generate_pq_vertical_step(struct btrfs_raid_bio *rbio, unsigned int sector_nr,
				      unsigned int step_nr)
{
	void **pointers = rbio->finish_pointers;
	const u32 step = min(rbio->bioc->fs_info->sectorsize, PAGE_SIZE);
	int stripe;
	const bool has_qstripe = rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6;

	/* First collect one sector from each data stripe */
	for (stripe = 0; stripe < rbio->nr_data; stripe++)
		pointers[stripe] = kmap_local_paddr(
				sector_paddr_in_rbio(rbio, stripe, sector_nr, step_nr, 0));

	/* Then add the parity stripe */
	pointers[stripe++] = kmap_local_paddr(rbio_pstripe_paddr(rbio, sector_nr, step_nr));

	if (has_qstripe) {
		/*
		 * RAID6, add the qstripe and call the library function
		 * to fill in our p/q
		 */
		pointers[stripe++] = kmap_local_paddr(
				rbio_qstripe_paddr(rbio, sector_nr, step_nr));

		assert_rbio(rbio);
		raid6_call.gen_syndrome(rbio->real_stripes, step, pointers);
	} else {
		/* raid5 */
		memcpy(pointers[rbio->nr_data], pointers[0], step);
		run_xor(pointers + 1, rbio->nr_data - 1, step);
	}
	for (stripe = stripe - 1; stripe >= 0; stripe--)
		kunmap_local(pointers[stripe]);
}

/* Generate PQ for one vertical stripe. */
static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr)
{
	const bool has_qstripe = (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6);

	for (int i = 0; i < rbio->sector_nsteps; i++)
		generate_pq_vertical_step(rbio, sectornr, i);

	set_bit(rbio_sector_index(rbio, rbio->nr_data, sectornr),
		rbio->stripe_uptodate_bitmap);
	if (has_qstripe)
		set_bit(rbio_sector_index(rbio, rbio->nr_data + 1, sectornr),
			rbio->stripe_uptodate_bitmap);
}

static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio,
				   struct bio_list *bio_list)
{
	/* The total sector number inside the full stripe. */
	int total_sector_nr;
	int sectornr;
	int stripe;
	int ret;

	ASSERT(bio_list_size(bio_list) == 0);

	/* We should have at least one data sector. */
	ASSERT(bitmap_weight(&rbio->dbitmap, rbio->stripe_nsectors));

	/*
	 * Reset errors, as we may have errors inherited from a degraded
	 * write.
	 */
	bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);

	/*
	 * Start assembly.  Make bios for everything from the higher layers (the
	 * bio_list in our rbio) and our P/Q.  Ignore everything else.
	 */
	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
	     total_sector_nr++) {
		phys_addr_t *paddrs;

		stripe = total_sector_nr / rbio->stripe_nsectors;
		sectornr = total_sector_nr % rbio->stripe_nsectors;

		/* This vertical stripe has no data, skip it. */
		if (!test_bit(sectornr, &rbio->dbitmap))
			continue;

		if (stripe < rbio->nr_data) {
			paddrs = sector_paddrs_in_rbio(rbio, stripe, sectornr, 1);
			if (paddrs == NULL)
				continue;
		} else {
			paddrs = rbio_stripe_paddrs(rbio, stripe, sectornr);
		}

		ret = rbio_add_io_paddrs(rbio, bio_list, paddrs, stripe,
					 sectornr, REQ_OP_WRITE);
		if (ret)
			goto error;
	}

	if (likely(!rbio->bioc->replace_nr_stripes))
		return 0;

	/*
	 * Make a copy for the replace target device.
	 *
	 * Thus the source stripe number (in replace_stripe_src) should be valid.
	 */
	ASSERT(rbio->bioc->replace_stripe_src >= 0);

	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
	     total_sector_nr++) {
		phys_addr_t *paddrs;

		stripe = total_sector_nr / rbio->stripe_nsectors;
		sectornr = total_sector_nr % rbio->stripe_nsectors;

		/*
		 * For RAID56, there is only one device that can be replaced,
		 * and replace_stripe_src[0] indicates the stripe number we
		 * need to copy from.
		 */
		if (stripe != rbio->bioc->replace_stripe_src) {
			/*
			 * We can skip the whole stripe completely, note
			 * total_sector_nr will be increased by one anyway.
			 */
			ASSERT(sectornr == 0);
			total_sector_nr += rbio->stripe_nsectors - 1;
			continue;
		}

		/* This vertical stripe has no data, skip it. */
		if (!test_bit(sectornr, &rbio->dbitmap))
			continue;

		if (stripe < rbio->nr_data) {
			paddrs = sector_paddrs_in_rbio(rbio, stripe, sectornr, 1);
			if (paddrs == NULL)
				continue;
		} else {
			paddrs = rbio_stripe_paddrs(rbio, stripe, sectornr);
		}

		ret = rbio_add_io_paddrs(rbio, bio_list, paddrs,
					 rbio->real_stripes,
					 sectornr, REQ_OP_WRITE);
		if (ret)
			goto error;
	}

	return 0;
error:
	bio_list_put(bio_list);
	return -EIO;
}

static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio)
{
	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
	u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
		     rbio->bioc->full_stripe_logical;
	int total_nr_sector = offset >> fs_info->sectorsize_bits;

	ASSERT(total_nr_sector < rbio->nr_data * rbio->stripe_nsectors);

	bitmap_set(rbio->error_bitmap, total_nr_sector,
		   bio->bi_iter.bi_size >> fs_info->sectorsize_bits);

	/*
	 * Special handling for raid56_alloc_missing_rbio() used by
	 * scrub/replace.  Unlike call path in raid56_parity_recover(), they
	 * pass an empty bio here.
	 * Thus we have to find out the missing device and mark the stripe
	 * error instead.
	 */
	if (bio->bi_iter.bi_size == 0) {
		bool found_missing = false;
		int stripe_nr;

		for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
			if (!rbio->bioc->stripes[stripe_nr].dev->bdev) {
				found_missing = true;
				bitmap_set(rbio->error_bitmap,
					   stripe_nr * rbio->stripe_nsectors,
					   rbio->stripe_nsectors);
			}
		}
		ASSERT(found_missing);
	}
}

/*
 * Return the sector number whose stripe_paddrs[] entry matches @paddr.
 *
 * Return -1 if not found.
 */
static int find_stripe_sector_nr(struct btrfs_raid_bio *rbio, phys_addr_t paddr)
{
	for (int i = 0; i < rbio->nr_sectors; i++) {
		if (rbio->stripe_paddrs[i * rbio->sector_nsteps] == paddr)
			return i;
	}
	return -1;
}

/*
 * this sets each page in the bio uptodate.  It should only be used on private
 * rbio pages, nothing that comes in from the higher layers
 */
static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio)
{
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	const u32 step = min(sectorsize, PAGE_SIZE);
	u32 offset = 0;
	phys_addr_t paddr;

	ASSERT(!bio_flagged(bio, BIO_CLONED));

	btrfs_bio_for_each_block_all(paddr, bio, step) {
		/* Hitting the first step of a sector. */
		if (IS_ALIGNED(offset, sectorsize)) {
			int sector_nr = find_stripe_sector_nr(rbio, paddr);

			ASSERT(sector_nr >= 0);
			if (sector_nr >= 0)
				set_bit(sector_nr, rbio->stripe_uptodate_bitmap);
		}
		offset += step;
	}
}

static int get_bio_sector_nr(struct btrfs_raid_bio *rbio, struct bio *bio)
{
	phys_addr_t bvec_paddr = bvec_phys(bio_first_bvec_all(bio));
	int i;

	for (i = 0; i < rbio->nr_sectors; i++) {
		if (rbio->stripe_paddrs[i * rbio->sector_nsteps] == bvec_paddr)
			break;
		if (rbio->bio_paddrs[i * rbio->sector_nsteps] == bvec_paddr)
			break;
	}
	ASSERT(i < rbio->nr_sectors);
	return i;
}

static void rbio_update_error_bitmap(struct btrfs_raid_bio *rbio, struct bio *bio)
{
	int total_sector_nr = get_bio_sector_nr(rbio, bio);
	u32 bio_size = 0;
	struct bio_vec *bvec;
	int i;

	bio_for_each_bvec_all(bvec, bio, i)
		bio_size += bvec->bv_len;

	/*
	 * Since we can have multiple bios touching the error_bitmap, we cannot
	 * call bitmap_set() without protection.
	 *
	 * Instead use set_bit() for each bit, as set_bit() itself is atomic.
	 */
	for (i = total_sector_nr; i < total_sector_nr +
	     (bio_size >> rbio->bioc->fs_info->sectorsize_bits); i++)
		set_bit(i, rbio->error_bitmap);
}

/* Verify the data sectors at read time. */
static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio,
				    struct bio *bio)
{
	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
	const u32 step = min(fs_info->sectorsize, PAGE_SIZE);
	const u32 nr_steps = rbio->sector_nsteps;
	int total_sector_nr = get_bio_sector_nr(rbio, bio);
	u32 offset = 0;
	phys_addr_t paddrs[BTRFS_MAX_BLOCKSIZE / PAGE_SIZE];
	phys_addr_t paddr;

	/* No data csum for the whole stripe, no need to verify. */
	if (!rbio->csum_bitmap || !rbio->csum_buf)
		return;

	/* P/Q stripes, they have no data csum to verify against. */
	if (total_sector_nr >= rbio->nr_data * rbio->stripe_nsectors)
		return;

	btrfs_bio_for_each_block_all(paddr, bio, step) {
		u8 csum_buf[BTRFS_CSUM_SIZE];
		u8 *expected_csum;

		paddrs[(offset / step) % nr_steps] = paddr;
		offset += step;

		/* Not yet covering the full fs block, continue to the next step. */
		if (!IS_ALIGNED(offset, fs_info->sectorsize))
			continue;

		/* No csum for this sector, skip to the next sector. */
		if (!test_bit(total_sector_nr, rbio->csum_bitmap))
			continue;

		expected_csum = rbio->csum_buf + total_sector_nr * fs_info->csum_size;
		btrfs_calculate_block_csum_pages(fs_info, paddrs, csum_buf);
		if (unlikely(memcmp(csum_buf, expected_csum, fs_info->csum_size) != 0))
			set_bit(total_sector_nr, rbio->error_bitmap);
		total_sector_nr++;
	}
}

static void raid_wait_read_end_io(struct bio *bio)
{
	struct btrfs_raid_bio *rbio = bio->bi_private;

	if (bio->bi_status) {
		rbio_update_error_bitmap(rbio, bio);
	} else {
		set_bio_pages_uptodate(rbio, bio);
		verify_bio_data_sectors(rbio, bio);
	}

	bio_put(bio);
	if (atomic_dec_and_test(&rbio->stripes_pending))
		wake_up(&rbio->io_wait);
}

static void submit_read_wait_bio_list(struct btrfs_raid_bio *rbio,
				      struct bio_list *bio_list)
{
	struct bio *bio;

	atomic_set(&rbio->stripes_pending, bio_list_size(bio_list));
	while ((bio = bio_list_pop(bio_list))) {
		bio->bi_end_io = raid_wait_read_end_io;

		if (trace_raid56_read_enabled()) {
			struct raid56_bio_trace_info trace_info = { 0 };

			bio_get_trace_info(rbio, bio, &trace_info);
			trace_raid56_read(rbio, bio, &trace_info);
		}
		submit_bio(bio);
	}

	wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
}

static int alloc_rbio_data_pages(struct btrfs_raid_bio *rbio)
{
	const int data_pages = rbio->nr_data * rbio->stripe_npages;
	int ret;

	ret = btrfs_alloc_page_array(data_pages, rbio->stripe_pages, false);
	if (ret < 0)
		return ret;

	index_stripe_sectors(rbio);
	return 0;
}

/*
 * We use plugging callbacks to collect full stripes.
 * Any time we get a partial stripe write while plugged
 * we collect it into a list.  When the unplug comes down,
 * we sort the list by logical block number and merge
 * everything we can into the same rbios
 */
struct btrfs_plug_cb {
	struct blk_plug_cb cb;
	struct btrfs_fs_info *info;
	struct list_head rbio_list;
};

/*
 * rbios on the plug list are sorted for easier merging.
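 *
 * The sort key is the logical sector of the first bio queued on each rbio.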
 */
static int plug_cmp(void *priv, const struct list_head *a,
		    const struct list_head *b)
{
	const struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
						       plug_list);
	const struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
						       plug_list);
	u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
	u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;

	if (a_sector < b_sector)
		return -1;
	if (a_sector > b_sector)
		return 1;
	return 0;
}

static void raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
{
	struct btrfs_plug_cb *plug = container_of(cb, struct btrfs_plug_cb, cb);
	struct btrfs_raid_bio *cur;
	struct btrfs_raid_bio *last = NULL;

	list_sort(NULL, &plug->rbio_list, plug_cmp);

	while (!list_empty(&plug->rbio_list)) {
		cur = list_first_entry(&plug->rbio_list,
				       struct btrfs_raid_bio, plug_list);
		list_del_init(&cur->plug_list);

		if (rbio_is_full(cur)) {
			/* We have a full stripe, queue it down. */
			start_async_work(cur, rmw_rbio_work);
			continue;
		}
		if (last) {
			if (rbio_can_merge(last, cur)) {
				merge_rbio(last, cur);
				free_raid_bio(cur);
				continue;
			}
			start_async_work(last, rmw_rbio_work);
		}
		last = cur;
	}
	if (last)
		start_async_work(last, rmw_rbio_work);
	kfree(plug);
}

/* Add the original bio into rbio->bio_list, and update rbio::dbitmap. */
static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio)
{
	const struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
	const u64 orig_logical = orig_bio->bi_iter.bi_sector << SECTOR_SHIFT;
	const u64 full_stripe_start = rbio->bioc->full_stripe_logical;
	const u32 orig_len = orig_bio->bi_iter.bi_size;
	const u32 sectorsize = fs_info->sectorsize;
	u64 cur_logical;

	ASSERT_RBIO_LOGICAL(orig_logical >= full_stripe_start &&
			    orig_logical + orig_len <= full_stripe_start +
			    rbio->nr_data * BTRFS_STRIPE_LEN,
			    rbio, orig_logical);

	bio_list_add(&rbio->bio_list, orig_bio);
	rbio->bio_list_bytes += orig_bio->bi_iter.bi_size;

	/* Update the dbitmap. */
	for (cur_logical = orig_logical; cur_logical < orig_logical + orig_len;
	     cur_logical += sectorsize) {
		int bit = ((u32)(cur_logical - full_stripe_start) >>
			   fs_info->sectorsize_bits) % rbio->stripe_nsectors;

		set_bit(bit, &rbio->dbitmap);
	}
}

/*
 * our main entry point for writes from the rest of the FS.
1861 */ 1862void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) 1863{ 1864 struct btrfs_fs_info *fs_info = bioc->fs_info; 1865 struct btrfs_raid_bio *rbio; 1866 struct btrfs_plug_cb *plug = NULL; 1867 struct blk_plug_cb *cb; 1868 1869 rbio = alloc_rbio(fs_info, bioc); 1870 if (IS_ERR(rbio)) { 1871 bio->bi_status = errno_to_blk_status(PTR_ERR(rbio)); 1872 bio_endio(bio); 1873 return; 1874 } 1875 rbio->operation = BTRFS_RBIO_WRITE; 1876 rbio_add_bio(rbio, bio); 1877 1878 /* 1879 * Don't plug on full rbios, just get them out the door 1880 * as quickly as we can 1881 */ 1882 if (!rbio_is_full(rbio)) { 1883 cb = blk_check_plugged(raid_unplug, fs_info, sizeof(*plug)); 1884 if (cb) { 1885 plug = container_of(cb, struct btrfs_plug_cb, cb); 1886 if (!plug->info) { 1887 plug->info = fs_info; 1888 INIT_LIST_HEAD(&plug->rbio_list); 1889 } 1890 list_add_tail(&rbio->plug_list, &plug->rbio_list); 1891 return; 1892 } 1893 } 1894 1895 /* 1896 * Either we don't have any existing plug, or we're doing a full stripe, 1897 * queue the rmw work now. 1898 */ 1899 start_async_work(rbio, rmw_rbio_work); 1900} 1901 1902static int verify_one_sector(struct btrfs_raid_bio *rbio, 1903 int stripe_nr, int sector_nr) 1904{ 1905 struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; 1906 phys_addr_t *paddrs; 1907 u8 csum_buf[BTRFS_CSUM_SIZE]; 1908 u8 *csum_expected; 1909 1910 if (!rbio->csum_bitmap || !rbio->csum_buf) 1911 return 0; 1912 1913 /* No way to verify P/Q as they are not covered by data csum. */ 1914 if (stripe_nr >= rbio->nr_data) 1915 return 0; 1916 /* 1917 * If we're rebuilding a read, we have to use pages from the 1918 * bio list if possible. 1919 */ 1920 if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { 1921 paddrs = sector_paddrs_in_rbio(rbio, stripe_nr, sector_nr, 0); 1922 } else { 1923 paddrs = rbio_stripe_paddrs(rbio, stripe_nr, sector_nr); 1924 } 1925 1926 csum_expected = rbio->csum_buf + 1927 (stripe_nr * rbio->stripe_nsectors + sector_nr) * 1928 fs_info->csum_size; 1929 btrfs_calculate_block_csum_pages(fs_info, paddrs, csum_buf); 1930 if (unlikely(memcmp(csum_buf, csum_expected, fs_info->csum_size) != 0)) 1931 return -EIO; 1932 return 0; 1933} 1934 1935static void recover_vertical_step(struct btrfs_raid_bio *rbio, 1936 unsigned int sector_nr, 1937 unsigned int step_nr, 1938 int faila, int failb, 1939 void **pointers, void **unmap_array) 1940{ 1941 struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; 1942 const u32 step = min(fs_info->sectorsize, PAGE_SIZE); 1943 int stripe_nr; 1944 1945 ASSERT(step_nr < rbio->sector_nsteps); 1946 ASSERT(sector_nr < rbio->stripe_nsectors); 1947 1948 /* 1949 * Setup our array of pointers with sectors from each stripe 1950 * 1951 * NOTE: store a duplicate array of pointers to preserve the 1952 * pointer order. 1953 */ 1954 for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) { 1955 phys_addr_t paddr; 1956 1957 /* 1958 * If we're rebuilding a read, we have to use pages from the 1959 * bio list if possible. 
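		 * For any other operation the stripe cache pages are the
		 * authoritative copy, so read them via rbio_stripe_paddr().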
1960 */ 1961 if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { 1962 paddr = sector_paddr_in_rbio(rbio, stripe_nr, sector_nr, step_nr, 0); 1963 } else { 1964 paddr = rbio_stripe_paddr(rbio, stripe_nr, sector_nr, step_nr); 1965 } 1966 pointers[stripe_nr] = kmap_local_paddr(paddr); 1967 unmap_array[stripe_nr] = pointers[stripe_nr]; 1968 } 1969 1970 /* All raid6 handling here */ 1971 if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) { 1972 /* Single failure, rebuild from parity raid5 style */ 1973 if (failb < 0) { 1974 if (faila == rbio->nr_data) 1975 /* 1976 * Just the P stripe has failed, without 1977 * a bad data or Q stripe. 1978 * We have nothing to do, just skip the 1979 * recovery for this stripe. 1980 */ 1981 goto cleanup; 1982 /* 1983 * a single failure in raid6 is rebuilt 1984 * in the pstripe code below 1985 */ 1986 goto pstripe; 1987 } 1988 1989 /* 1990 * If the q stripe is failed, do a pstripe reconstruction from 1991 * the xors. 1992 * If both the q stripe and the P stripe are failed, we're 1993 * here due to a crc mismatch and we can't give them the 1994 * data they want. 1995 */ 1996 if (failb == rbio->real_stripes - 1) { 1997 if (faila == rbio->real_stripes - 2) 1998 /* 1999 * Only P and Q are corrupted. 2000 * We only care about data stripes recovery, 2001 * can skip this vertical stripe. 2002 */ 2003 goto cleanup; 2004 /* 2005 * Otherwise we have one bad data stripe and 2006 * a good P stripe. raid5! 2007 */ 2008 goto pstripe; 2009 } 2010 2011 if (failb == rbio->real_stripes - 2) { 2012 raid6_datap_recov(rbio->real_stripes, step, 2013 faila, pointers); 2014 } else { 2015 raid6_2data_recov(rbio->real_stripes, step, 2016 faila, failb, pointers); 2017 } 2018 } else { 2019 void *p; 2020 2021 /* Rebuild from P stripe here (raid5 or raid6). */ 2022 ASSERT(failb == -1); 2023pstripe: 2024 /* Copy parity block into failed block to start with */ 2025 memcpy(pointers[faila], pointers[rbio->nr_data], step); 2026 2027 /* Rearrange the pointer array */ 2028 p = pointers[faila]; 2029 for (stripe_nr = faila; stripe_nr < rbio->nr_data - 1; 2030 stripe_nr++) 2031 pointers[stripe_nr] = pointers[stripe_nr + 1]; 2032 pointers[rbio->nr_data - 1] = p; 2033 2034 /* Xor in the rest */ 2035 run_xor(pointers, rbio->nr_data - 1, step); 2036 } 2037 2038cleanup: 2039 for (stripe_nr = rbio->real_stripes - 1; stripe_nr >= 0; stripe_nr--) 2040 kunmap_local(unmap_array[stripe_nr]); 2041} 2042 2043/* 2044 * Recover a vertical stripe specified by @sector_nr. 2045 * @*pointers are the pre-allocated pointers by the caller, so we don't 2046 * need to allocate/free the pointers again and again. 2047 */ 2048static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, 2049 void **pointers, void **unmap_array) 2050{ 2051 int found_errors; 2052 int faila; 2053 int failb; 2054 int ret = 0; 2055 2056 /* 2057 * Now we just use bitmap to mark the horizontal stripes in 2058 * which we have data when doing parity scrub. 2059 */ 2060 if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB && 2061 !test_bit(sector_nr, &rbio->dbitmap)) 2062 return 0; 2063 2064 found_errors = get_rbio_vertical_errors(rbio, sector_nr, &faila, 2065 &failb); 2066 /* 2067 * No errors in the vertical stripe, skip it. Can happen for recovery 2068 * which only part of a stripe failed csum check. 
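	 * Only the vertical stripes that actually contain bad sectors need
	 * to be rebuilt here; the rest are left untouched.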
2069 */ 2070 if (!found_errors) 2071 return 0; 2072 2073 if (unlikely(found_errors > rbio->bioc->max_errors)) 2074 return -EIO; 2075 2076 for (int i = 0; i < rbio->sector_nsteps; i++) 2077 recover_vertical_step(rbio, sector_nr, i, faila, failb, 2078 pointers, unmap_array); 2079 if (faila >= 0) { 2080 ret = verify_one_sector(rbio, faila, sector_nr); 2081 if (ret < 0) 2082 return ret; 2083 2084 set_bit(rbio_sector_index(rbio, faila, sector_nr), 2085 rbio->stripe_uptodate_bitmap); 2086 } 2087 if (failb >= 0) { 2088 ret = verify_one_sector(rbio, failb, sector_nr); 2089 if (ret < 0) 2090 return ret; 2091 2092 set_bit(rbio_sector_index(rbio, failb, sector_nr), 2093 rbio->stripe_uptodate_bitmap); 2094 } 2095 return ret; 2096} 2097 2098static int recover_sectors(struct btrfs_raid_bio *rbio) 2099{ 2100 void **pointers = NULL; 2101 void **unmap_array = NULL; 2102 int sectornr; 2103 int ret = 0; 2104 2105 /* 2106 * @pointers array stores the pointer for each sector. 2107 * 2108 * @unmap_array stores copy of pointers that does not get reordered 2109 * during reconstruction so that kunmap_local works. 2110 */ 2111 pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); 2112 unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); 2113 if (!pointers || !unmap_array) { 2114 ret = -ENOMEM; 2115 goto out; 2116 } 2117 2118 if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { 2119 spin_lock(&rbio->bio_list_lock); 2120 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); 2121 spin_unlock(&rbio->bio_list_lock); 2122 } 2123 2124 index_rbio_pages(rbio); 2125 2126 for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { 2127 ret = recover_vertical(rbio, sectornr, pointers, unmap_array); 2128 if (ret < 0) 2129 break; 2130 } 2131 2132out: 2133 kfree(pointers); 2134 kfree(unmap_array); 2135 return ret; 2136} 2137 2138static void recover_rbio(struct btrfs_raid_bio *rbio) 2139{ 2140 struct bio_list bio_list = BIO_EMPTY_LIST; 2141 int total_sector_nr; 2142 int ret = 0; 2143 2144 /* 2145 * Either we're doing recover for a read failure or degraded write, 2146 * caller should have set error bitmap correctly. 2147 */ 2148 ASSERT(bitmap_weight(rbio->error_bitmap, rbio->nr_sectors)); 2149 2150 /* For recovery, we need to read all sectors including P/Q. */ 2151 ret = alloc_rbio_pages(rbio); 2152 if (ret < 0) 2153 goto out; 2154 2155 index_rbio_pages(rbio); 2156 2157 /* 2158 * Read everything that hasn't failed. However this time we will 2159 * not trust any cached sector. 2160 * As we may read out some stale data but higher layer is not reading 2161 * that stale part. 2162 * 2163 * So here we always re-read everything in recovery path. 2164 */ 2165 for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; 2166 total_sector_nr++) { 2167 int stripe = total_sector_nr / rbio->stripe_nsectors; 2168 int sectornr = total_sector_nr % rbio->stripe_nsectors; 2169 phys_addr_t *paddrs; 2170 2171 /* 2172 * Skip the range which has error. It can be a range which is 2173 * marked error (for csum mismatch), or it can be a missing 2174 * device. 2175 */ 2176 if (!rbio->bioc->stripes[stripe].dev->bdev || 2177 test_bit(total_sector_nr, rbio->error_bitmap)) { 2178 /* 2179 * Also set the error bit for missing device, which 2180 * may not yet have its error bit set. 
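			 * The rebuild below is driven purely by the error
			 * bitmap, so a missing device must be recorded here
			 * just like a failed read or a csum mismatch.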
2181 */ 2182 set_bit(total_sector_nr, rbio->error_bitmap); 2183 continue; 2184 } 2185 2186 paddrs = rbio_stripe_paddrs(rbio, stripe, sectornr); 2187 ret = rbio_add_io_paddrs(rbio, &bio_list, paddrs, stripe, 2188 sectornr, REQ_OP_READ); 2189 if (ret < 0) { 2190 bio_list_put(&bio_list); 2191 goto out; 2192 } 2193 } 2194 2195 submit_read_wait_bio_list(rbio, &bio_list); 2196 ret = recover_sectors(rbio); 2197out: 2198 rbio_orig_end_io(rbio, errno_to_blk_status(ret)); 2199} 2200 2201static void recover_rbio_work(struct work_struct *work) 2202{ 2203 struct btrfs_raid_bio *rbio; 2204 2205 rbio = container_of(work, struct btrfs_raid_bio, work); 2206 if (!lock_stripe_add(rbio)) 2207 recover_rbio(rbio); 2208} 2209 2210static void recover_rbio_work_locked(struct work_struct *work) 2211{ 2212 recover_rbio(container_of(work, struct btrfs_raid_bio, work)); 2213} 2214 2215static void set_rbio_raid6_extra_error(struct btrfs_raid_bio *rbio, int mirror_num) 2216{ 2217 bool found = false; 2218 int sector_nr; 2219 2220 /* 2221 * This is for RAID6 extra recovery tries, thus mirror number should 2222 * be large than 2. 2223 * Mirror 1 means read from data stripes. Mirror 2 means rebuild using 2224 * RAID5 methods. 2225 */ 2226 ASSERT(mirror_num > 2); 2227 for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) { 2228 int found_errors; 2229 int faila; 2230 int failb; 2231 2232 found_errors = get_rbio_vertical_errors(rbio, sector_nr, 2233 &faila, &failb); 2234 /* This vertical stripe doesn't have errors. */ 2235 if (!found_errors) 2236 continue; 2237 2238 /* 2239 * If we found errors, there should be only one error marked 2240 * by previous set_rbio_range_error(). 2241 */ 2242 ASSERT(found_errors == 1); 2243 found = true; 2244 2245 /* Now select another stripe to mark as error. */ 2246 failb = rbio->real_stripes - (mirror_num - 1); 2247 if (failb <= faila) 2248 failb--; 2249 2250 /* Set the extra bit in error bitmap. */ 2251 if (failb >= 0) 2252 set_bit(failb * rbio->stripe_nsectors + sector_nr, 2253 rbio->error_bitmap); 2254 } 2255 2256 /* We should found at least one vertical stripe with error.*/ 2257 ASSERT(found); 2258} 2259 2260/* 2261 * the main entry point for reads from the higher layers. This 2262 * is really only called when the normal read path had a failure, 2263 * so we assume the bio they send down corresponds to a failed part 2264 * of the drive. 2265 */ 2266void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, 2267 int mirror_num) 2268{ 2269 struct btrfs_fs_info *fs_info = bioc->fs_info; 2270 struct btrfs_raid_bio *rbio; 2271 2272 rbio = alloc_rbio(fs_info, bioc); 2273 if (IS_ERR(rbio)) { 2274 bio->bi_status = errno_to_blk_status(PTR_ERR(rbio)); 2275 bio_endio(bio); 2276 return; 2277 } 2278 2279 rbio->operation = BTRFS_RBIO_READ_REBUILD; 2280 rbio_add_bio(rbio, bio); 2281 2282 set_rbio_range_error(rbio, bio); 2283 2284 /* 2285 * Loop retry: 2286 * for 'mirror == 2', reconstruct from all other stripes. 2287 * for 'mirror_num > 2', select a stripe to fail on every retry. 
2288 */ 2289 if (mirror_num > 2) 2290 set_rbio_raid6_extra_error(rbio, mirror_num); 2291 2292 start_async_work(rbio, recover_rbio_work); 2293} 2294 2295static void fill_data_csums(struct btrfs_raid_bio *rbio) 2296{ 2297 struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; 2298 struct btrfs_root *csum_root = btrfs_csum_root(fs_info, 2299 rbio->bioc->full_stripe_logical); 2300 const u64 start = rbio->bioc->full_stripe_logical; 2301 const u32 len = (rbio->nr_data * rbio->stripe_nsectors) << 2302 fs_info->sectorsize_bits; 2303 int ret; 2304 2305 /* The rbio should not have its csum buffer initialized. */ 2306 ASSERT(!rbio->csum_buf && !rbio->csum_bitmap); 2307 2308 /* 2309 * Skip the csum search if: 2310 * 2311 * - The rbio doesn't belong to data block groups 2312 * Then we are doing IO for tree blocks, no need to search csums. 2313 * 2314 * - The rbio belongs to mixed block groups 2315 * This is to avoid deadlock, as we're already holding the full 2316 * stripe lock, if we trigger a metadata read, and it needs to do 2317 * raid56 recovery, we will deadlock. 2318 */ 2319 if (!(rbio->bioc->map_type & BTRFS_BLOCK_GROUP_DATA) || 2320 rbio->bioc->map_type & BTRFS_BLOCK_GROUP_METADATA) 2321 return; 2322 2323 rbio->csum_buf = kzalloc(rbio->nr_data * rbio->stripe_nsectors * 2324 fs_info->csum_size, GFP_NOFS); 2325 rbio->csum_bitmap = bitmap_zalloc(rbio->nr_data * rbio->stripe_nsectors, 2326 GFP_NOFS); 2327 if (!rbio->csum_buf || !rbio->csum_bitmap) { 2328 ret = -ENOMEM; 2329 goto error; 2330 } 2331 2332 ret = btrfs_lookup_csums_bitmap(csum_root, NULL, start, start + len - 1, 2333 rbio->csum_buf, rbio->csum_bitmap); 2334 if (ret < 0) 2335 goto error; 2336 if (bitmap_empty(rbio->csum_bitmap, len >> fs_info->sectorsize_bits)) 2337 goto no_csum; 2338 return; 2339 2340error: 2341 /* 2342 * We failed to allocate memory or grab the csum, but it's not fatal, 2343 * we can still continue. But better to warn users that RMW is no 2344 * longer safe for this particular sub-stripe write. 2345 */ 2346 btrfs_warn_rl(fs_info, 2347"sub-stripe write for full stripe %llu is not safe, failed to get csum: %d", 2348 rbio->bioc->full_stripe_logical, ret); 2349no_csum: 2350 kfree(rbio->csum_buf); 2351 bitmap_free(rbio->csum_bitmap); 2352 rbio->csum_buf = NULL; 2353 rbio->csum_bitmap = NULL; 2354} 2355 2356static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio) 2357{ 2358 struct bio_list bio_list = BIO_EMPTY_LIST; 2359 int total_sector_nr; 2360 int ret = 0; 2361 2362 /* 2363 * Fill the data csums we need for data verification. We need to fill 2364 * the csum_bitmap/csum_buf first, as our endio function will try to 2365 * verify the data sectors. 2366 */ 2367 fill_data_csums(rbio); 2368 2369 /* 2370 * Build a list of bios to read all sectors (including data and P/Q). 2371 * 2372 * This behavior is to compensate the later csum verification and recovery. 2373 */ 2374 for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; 2375 total_sector_nr++) { 2376 int stripe = total_sector_nr / rbio->stripe_nsectors; 2377 int sectornr = total_sector_nr % rbio->stripe_nsectors; 2378 phys_addr_t *paddrs; 2379 2380 paddrs = rbio_stripe_paddrs(rbio, stripe, sectornr); 2381 ret = rbio_add_io_paddrs(rbio, &bio_list, paddrs, stripe, 2382 sectornr, REQ_OP_READ); 2383 if (ret) { 2384 bio_list_put(&bio_list); 2385 return ret; 2386 } 2387 } 2388 2389 /* 2390 * We may or may not have any corrupted sectors (including missing dev 2391 * and csum mismatch), just let recover_sectors() to handle them all. 
2392 */ 2393 submit_read_wait_bio_list(rbio, &bio_list); 2394 return recover_sectors(rbio); 2395} 2396 2397static void raid_wait_write_end_io(struct bio *bio) 2398{ 2399 struct btrfs_raid_bio *rbio = bio->bi_private; 2400 2401 if (bio->bi_status) 2402 rbio_update_error_bitmap(rbio, bio); 2403 bio_put(bio); 2404 if (atomic_dec_and_test(&rbio->stripes_pending)) 2405 wake_up(&rbio->io_wait); 2406} 2407 2408static void submit_write_bios(struct btrfs_raid_bio *rbio, 2409 struct bio_list *bio_list) 2410{ 2411 struct bio *bio; 2412 2413 atomic_set(&rbio->stripes_pending, bio_list_size(bio_list)); 2414 while ((bio = bio_list_pop(bio_list))) { 2415 bio->bi_end_io = raid_wait_write_end_io; 2416 2417 if (trace_raid56_write_enabled()) { 2418 struct raid56_bio_trace_info trace_info = { 0 }; 2419 2420 bio_get_trace_info(rbio, bio, &trace_info); 2421 trace_raid56_write(rbio, bio, &trace_info); 2422 } 2423 submit_bio(bio); 2424 } 2425} 2426 2427/* 2428 * To determine if we need to read any sector from the disk. 2429 * Should only be utilized in RMW path, to skip cached rbio. 2430 */ 2431static bool need_read_stripe_sectors(struct btrfs_raid_bio *rbio) 2432{ 2433 int i; 2434 2435 for (i = 0; i < rbio->nr_data * rbio->stripe_nsectors; i++) { 2436 phys_addr_t paddr = rbio->stripe_paddrs[i * rbio->sector_nsteps]; 2437 2438 /* 2439 * We have a sector which doesn't have page nor uptodate, 2440 * thus this rbio can not be cached one, as cached one must 2441 * have all its data sectors present and uptodate. 2442 */ 2443 if (paddr == INVALID_PADDR || 2444 !test_bit(i, rbio->stripe_uptodate_bitmap)) 2445 return true; 2446 } 2447 return false; 2448} 2449 2450static void rmw_rbio(struct btrfs_raid_bio *rbio) 2451{ 2452 struct bio_list bio_list; 2453 int sectornr; 2454 int ret = 0; 2455 2456 /* 2457 * Allocate the pages for parity first, as P/Q pages will always be 2458 * needed for both full-stripe and sub-stripe writes. 2459 */ 2460 ret = alloc_rbio_parity_pages(rbio); 2461 if (ret < 0) 2462 goto out; 2463 2464 /* 2465 * Either full stripe write, or we have every data sector already 2466 * cached, can go to write path immediately. 2467 */ 2468 if (!rbio_is_full(rbio) && need_read_stripe_sectors(rbio)) { 2469 /* 2470 * Now we're doing sub-stripe write, also need all data stripes 2471 * to do the full RMW. 2472 */ 2473 ret = alloc_rbio_data_pages(rbio); 2474 if (ret < 0) 2475 goto out; 2476 2477 index_rbio_pages(rbio); 2478 2479 ret = rmw_read_wait_recover(rbio); 2480 if (ret < 0) 2481 goto out; 2482 } 2483 2484 /* 2485 * At this stage we're not allowed to add any new bios to the 2486 * bio list any more, anyone else that wants to change this stripe 2487 * needs to do their own rmw. 2488 */ 2489 spin_lock(&rbio->bio_list_lock); 2490 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); 2491 spin_unlock(&rbio->bio_list_lock); 2492 2493 bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); 2494 2495 index_rbio_pages(rbio); 2496 2497 /* 2498 * We don't cache full rbios because we're assuming 2499 * the higher layers are unlikely to use this area of 2500 * the disk again soon. If they do use it again, 2501 * hopefully they will send another full bio. 
2502 */ 2503 if (!rbio_is_full(rbio)) 2504 cache_rbio_pages(rbio); 2505 else 2506 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); 2507 2508 for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) 2509 generate_pq_vertical(rbio, sectornr); 2510 2511 bio_list_init(&bio_list); 2512 ret = rmw_assemble_write_bios(rbio, &bio_list); 2513 if (ret < 0) 2514 goto out; 2515 2516 /* We should have at least one bio assembled. */ 2517 ASSERT(bio_list_size(&bio_list)); 2518 submit_write_bios(rbio, &bio_list); 2519 wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); 2520 2521 /* We may have more errors than our tolerance during the read. */ 2522 for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { 2523 int found_errors; 2524 2525 found_errors = get_rbio_vertical_errors(rbio, sectornr, NULL, NULL); 2526 if (unlikely(found_errors > rbio->bioc->max_errors)) { 2527 ret = -EIO; 2528 break; 2529 } 2530 } 2531out: 2532 rbio_orig_end_io(rbio, errno_to_blk_status(ret)); 2533} 2534 2535static void rmw_rbio_work(struct work_struct *work) 2536{ 2537 struct btrfs_raid_bio *rbio; 2538 2539 rbio = container_of(work, struct btrfs_raid_bio, work); 2540 if (lock_stripe_add(rbio) == 0) 2541 rmw_rbio(rbio); 2542} 2543 2544static void rmw_rbio_work_locked(struct work_struct *work) 2545{ 2546 rmw_rbio(container_of(work, struct btrfs_raid_bio, work)); 2547} 2548 2549/* 2550 * The following code is used to scrub/replace the parity stripe 2551 * 2552 * Caller must have already increased bio_counter for getting @bioc. 2553 * 2554 * Note: We need make sure all the pages that add into the scrub/replace 2555 * raid bio are correct and not be changed during the scrub/replace. That 2556 * is those pages just hold metadata or file data with checksum. 2557 */ 2558 2559struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio, 2560 struct btrfs_io_context *bioc, 2561 struct btrfs_device *scrub_dev, 2562 unsigned long *dbitmap, int stripe_nsectors) 2563{ 2564 struct btrfs_fs_info *fs_info = bioc->fs_info; 2565 struct btrfs_raid_bio *rbio; 2566 int i; 2567 2568 rbio = alloc_rbio(fs_info, bioc); 2569 if (IS_ERR(rbio)) 2570 return NULL; 2571 bio_list_add(&rbio->bio_list, bio); 2572 /* 2573 * This is a special bio which is used to hold the completion handler 2574 * and make the scrub rbio is similar to the other types 2575 */ 2576 ASSERT(!bio->bi_iter.bi_size); 2577 rbio->operation = BTRFS_RBIO_PARITY_SCRUB; 2578 2579 /* 2580 * After mapping bioc with BTRFS_MAP_WRITE, parities have been sorted 2581 * to the end position, so this search can start from the first parity 2582 * stripe. 
2583 */ 2584 for (i = rbio->nr_data; i < rbio->real_stripes; i++) { 2585 if (bioc->stripes[i].dev == scrub_dev) { 2586 rbio->scrubp = i; 2587 break; 2588 } 2589 } 2590 ASSERT_RBIO_STRIPE(i < rbio->real_stripes, rbio, i); 2591 2592 bitmap_copy(&rbio->dbitmap, dbitmap, stripe_nsectors); 2593 return rbio; 2594} 2595 2596static int alloc_rbio_sector_pages(struct btrfs_raid_bio *rbio, 2597 int sector_nr) 2598{ 2599 const u32 step = min(PAGE_SIZE, rbio->bioc->fs_info->sectorsize); 2600 const u32 base = sector_nr * rbio->sector_nsteps; 2601 2602 for (int i = base; i < base + rbio->sector_nsteps; i++) { 2603 const unsigned int page_index = (i * step) >> PAGE_SHIFT; 2604 struct page *page; 2605 2606 if (rbio->stripe_pages[page_index]) 2607 continue; 2608 page = alloc_page(GFP_NOFS); 2609 if (!page) 2610 return -ENOMEM; 2611 rbio->stripe_pages[page_index] = page; 2612 } 2613 return 0; 2614} 2615 2616/* 2617 * We just scrub the parity that we have correct data on the same horizontal, 2618 * so we needn't allocate all pages for all the stripes. 2619 */ 2620static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio) 2621{ 2622 int total_sector_nr; 2623 2624 for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; 2625 total_sector_nr++) { 2626 int sectornr = total_sector_nr % rbio->stripe_nsectors; 2627 int ret; 2628 2629 if (!test_bit(sectornr, &rbio->dbitmap)) 2630 continue; 2631 ret = alloc_rbio_sector_pages(rbio, total_sector_nr); 2632 if (ret < 0) 2633 return ret; 2634 } 2635 index_stripe_sectors(rbio); 2636 return 0; 2637} 2638 2639/* Return true if the content of the step matches the caclulated one. */ 2640static bool verify_one_parity_step(struct btrfs_raid_bio *rbio, 2641 void *pointers[], unsigned int sector_nr, 2642 unsigned int step_nr) 2643{ 2644 const unsigned int nr_data = rbio->nr_data; 2645 const bool has_qstripe = (rbio->real_stripes - rbio->nr_data == 2); 2646 const u32 step = min(rbio->bioc->fs_info->sectorsize, PAGE_SIZE); 2647 void *parity; 2648 bool ret = false; 2649 2650 ASSERT(step_nr < rbio->sector_nsteps); 2651 2652 /* First collect one page from each data stripe. */ 2653 for (int stripe = 0; stripe < nr_data; stripe++) 2654 pointers[stripe] = kmap_local_paddr( 2655 sector_paddr_in_rbio(rbio, stripe, sector_nr, 2656 step_nr, 0)); 2657 2658 if (has_qstripe) { 2659 assert_rbio(rbio); 2660 /* RAID6, call the library function to fill in our P/Q. */ 2661 raid6_call.gen_syndrome(rbio->real_stripes, step, pointers); 2662 } else { 2663 /* RAID5. */ 2664 memcpy(pointers[nr_data], pointers[0], step); 2665 run_xor(pointers + 1, nr_data - 1, step); 2666 } 2667 2668 /* Check scrubbing parity and repair it. */ 2669 parity = kmap_local_paddr(rbio_stripe_paddr(rbio, rbio->scrubp, sector_nr, step_nr)); 2670 if (memcmp(parity, pointers[rbio->scrubp], step) != 0) 2671 memcpy(parity, pointers[rbio->scrubp], step); 2672 else 2673 ret = true; 2674 kunmap_local(parity); 2675 2676 for (int stripe = nr_data - 1; stripe >= 0; stripe--) 2677 kunmap_local(pointers[stripe]); 2678 return ret; 2679} 2680 2681/* 2682 * The @pointers array should have the P/Q parity already mapped. 
2683 */ 2684static void verify_one_parity_sector(struct btrfs_raid_bio *rbio, 2685 void *pointers[], unsigned int sector_nr) 2686{ 2687 bool found_error = false; 2688 2689 for (int step_nr = 0; step_nr < rbio->sector_nsteps; step_nr++) { 2690 bool match; 2691 2692 match = verify_one_parity_step(rbio, pointers, sector_nr, step_nr); 2693 if (!match) 2694 found_error = true; 2695 } 2696 if (!found_error) 2697 bitmap_clear(&rbio->dbitmap, sector_nr, 1); 2698} 2699 2700static int finish_parity_scrub(struct btrfs_raid_bio *rbio) 2701{ 2702 struct btrfs_io_context *bioc = rbio->bioc; 2703 void **pointers = rbio->finish_pointers; 2704 unsigned long *pbitmap = &rbio->finish_pbitmap; 2705 int nr_data = rbio->nr_data; 2706 int sectornr; 2707 bool has_qstripe; 2708 struct page *page; 2709 phys_addr_t p_paddr = INVALID_PADDR; 2710 phys_addr_t q_paddr = INVALID_PADDR; 2711 struct bio_list bio_list; 2712 int is_replace = 0; 2713 int ret; 2714 2715 bio_list_init(&bio_list); 2716 2717 if (rbio->real_stripes - rbio->nr_data == 1) 2718 has_qstripe = false; 2719 else if (rbio->real_stripes - rbio->nr_data == 2) 2720 has_qstripe = true; 2721 else 2722 BUG(); 2723 2724 /* 2725 * Replace is running and our P/Q stripe is being replaced, then we 2726 * need to duplicate the final write to replace target. 2727 */ 2728 if (bioc->replace_nr_stripes && bioc->replace_stripe_src == rbio->scrubp) { 2729 is_replace = 1; 2730 bitmap_copy(pbitmap, &rbio->dbitmap, rbio->stripe_nsectors); 2731 } 2732 2733 /* 2734 * Because the higher layers(scrubber) are unlikely to 2735 * use this area of the disk again soon, so don't cache 2736 * it. 2737 */ 2738 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); 2739 2740 page = alloc_page(GFP_NOFS); 2741 if (!page) 2742 return -ENOMEM; 2743 p_paddr = page_to_phys(page); 2744 page = NULL; 2745 pointers[nr_data] = kmap_local_paddr(p_paddr); 2746 2747 if (has_qstripe) { 2748 /* RAID6, allocate and map temp space for the Q stripe */ 2749 page = alloc_page(GFP_NOFS); 2750 if (!page) { 2751 __free_page(phys_to_page(p_paddr)); 2752 p_paddr = INVALID_PADDR; 2753 return -ENOMEM; 2754 } 2755 q_paddr = page_to_phys(page); 2756 page = NULL; 2757 pointers[rbio->real_stripes - 1] = kmap_local_paddr(q_paddr); 2758 } 2759 2760 bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); 2761 2762 /* Map the parity stripe just once */ 2763 2764 for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) 2765 verify_one_parity_sector(rbio, pointers, sectornr); 2766 2767 kunmap_local(pointers[nr_data]); 2768 __free_page(phys_to_page(p_paddr)); 2769 p_paddr = INVALID_PADDR; 2770 if (q_paddr != INVALID_PADDR) { 2771 __free_page(phys_to_page(q_paddr)); 2772 q_paddr = INVALID_PADDR; 2773 } 2774 2775 /* 2776 * time to start writing. Make bios for everything from the 2777 * higher layers (the bio_list in our rbio) and our p/q. Ignore 2778 * everything else. 2779 */ 2780 for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) { 2781 phys_addr_t *paddrs; 2782 2783 paddrs = rbio_stripe_paddrs(rbio, rbio->scrubp, sectornr); 2784 ret = rbio_add_io_paddrs(rbio, &bio_list, paddrs, rbio->scrubp, 2785 sectornr, REQ_OP_WRITE); 2786 if (ret) 2787 goto cleanup; 2788 } 2789 2790 if (!is_replace) 2791 goto submit_write; 2792 2793 /* 2794 * Replace is running and our parity stripe needs to be duplicated to 2795 * the target device. Check we have a valid source stripe number. 
2796 */ 2797 ASSERT_RBIO(rbio->bioc->replace_stripe_src >= 0, rbio); 2798 for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) { 2799 phys_addr_t *paddrs; 2800 2801 paddrs = rbio_stripe_paddrs(rbio, rbio->scrubp, sectornr); 2802 ret = rbio_add_io_paddrs(rbio, &bio_list, paddrs, rbio->real_stripes, 2803 sectornr, REQ_OP_WRITE); 2804 if (ret) 2805 goto cleanup; 2806 } 2807 2808submit_write: 2809 submit_write_bios(rbio, &bio_list); 2810 return 0; 2811 2812cleanup: 2813 bio_list_put(&bio_list); 2814 return ret; 2815} 2816 2817static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe) 2818{ 2819 if (stripe >= 0 && stripe < rbio->nr_data) 2820 return 1; 2821 return 0; 2822} 2823 2824static int recover_scrub_rbio(struct btrfs_raid_bio *rbio) 2825{ 2826 void **pointers = NULL; 2827 void **unmap_array = NULL; 2828 int sector_nr; 2829 int ret = 0; 2830 2831 /* 2832 * @pointers array stores the pointer for each sector. 2833 * 2834 * @unmap_array stores copy of pointers that does not get reordered 2835 * during reconstruction so that kunmap_local works. 2836 */ 2837 pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); 2838 unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); 2839 if (!pointers || !unmap_array) { 2840 ret = -ENOMEM; 2841 goto out; 2842 } 2843 2844 for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) { 2845 int dfail = 0, failp = -1; 2846 int faila; 2847 int failb; 2848 int found_errors; 2849 2850 found_errors = get_rbio_vertical_errors(rbio, sector_nr, 2851 &faila, &failb); 2852 if (unlikely(found_errors > rbio->bioc->max_errors)) { 2853 ret = -EIO; 2854 goto out; 2855 } 2856 if (found_errors == 0) 2857 continue; 2858 2859 /* We should have at least one error here. */ 2860 ASSERT(faila >= 0 || failb >= 0); 2861 2862 if (is_data_stripe(rbio, faila)) 2863 dfail++; 2864 else if (is_parity_stripe(faila)) 2865 failp = faila; 2866 2867 if (is_data_stripe(rbio, failb)) 2868 dfail++; 2869 else if (is_parity_stripe(failb)) 2870 failp = failb; 2871 /* 2872 * Because we can not use a scrubbing parity to repair the 2873 * data, so the capability of the repair is declined. (In the 2874 * case of RAID5, we can not repair anything.) 2875 */ 2876 if (unlikely(dfail > rbio->bioc->max_errors - 1)) { 2877 ret = -EIO; 2878 goto out; 2879 } 2880 /* 2881 * If all data is good, only parity is correctly, just repair 2882 * the parity, no need to recover data stripes. 2883 */ 2884 if (dfail == 0) 2885 continue; 2886 2887 /* 2888 * Here means we got one corrupted data stripe and one 2889 * corrupted parity on RAID6, if the corrupted parity is 2890 * scrubbing parity, luckily, use the other one to repair the 2891 * data, or we can not repair the data stripe. 2892 */ 2893 if (unlikely(failp != rbio->scrubp)) { 2894 ret = -EIO; 2895 goto out; 2896 } 2897 2898 ret = recover_vertical(rbio, sector_nr, pointers, unmap_array); 2899 if (ret < 0) 2900 goto out; 2901 } 2902out: 2903 kfree(pointers); 2904 kfree(unmap_array); 2905 return ret; 2906} 2907 2908static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio) 2909{ 2910 struct bio_list bio_list = BIO_EMPTY_LIST; 2911 int total_sector_nr; 2912 int ret = 0; 2913 2914 /* Build a list of bios to read all the missing parts. 
*/ 2915 for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; 2916 total_sector_nr++) { 2917 int sectornr = total_sector_nr % rbio->stripe_nsectors; 2918 int stripe = total_sector_nr / rbio->stripe_nsectors; 2919 phys_addr_t *paddrs; 2920 2921 /* No data in the vertical stripe, no need to read. */ 2922 if (!test_bit(sectornr, &rbio->dbitmap)) 2923 continue; 2924 2925 /* 2926 * We want to find all the sectors missing from the rbio and 2927 * read them from the disk. If sector_paddr_in_rbio() finds a sector 2928 * in the bio list we don't need to read it off the stripe. 2929 */ 2930 paddrs = sector_paddrs_in_rbio(rbio, stripe, sectornr, 1); 2931 if (paddrs == NULL) 2932 continue; 2933 2934 paddrs = rbio_stripe_paddrs(rbio, stripe, sectornr); 2935 /* 2936 * The bio cache may have handed us an uptodate sector. If so, 2937 * use it. 2938 */ 2939 if (test_bit(rbio_sector_index(rbio, stripe, sectornr), 2940 rbio->stripe_uptodate_bitmap)) 2941 continue; 2942 2943 ret = rbio_add_io_paddrs(rbio, &bio_list, paddrs, stripe, 2944 sectornr, REQ_OP_READ); 2945 if (ret) { 2946 bio_list_put(&bio_list); 2947 return ret; 2948 } 2949 } 2950 2951 submit_read_wait_bio_list(rbio, &bio_list); 2952 return 0; 2953} 2954 2955static void scrub_rbio(struct btrfs_raid_bio *rbio) 2956{ 2957 int sector_nr; 2958 int ret; 2959 2960 ret = alloc_rbio_essential_pages(rbio); 2961 if (ret) 2962 goto out; 2963 2964 bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); 2965 2966 ret = scrub_assemble_read_bios(rbio); 2967 if (ret < 0) 2968 goto out; 2969 2970 /* We may have some failures, recover the failed sectors first. */ 2971 ret = recover_scrub_rbio(rbio); 2972 if (ret < 0) 2973 goto out; 2974 2975 /* 2976 * We have every sector properly prepared. Can finish the scrub 2977 * and writeback the good content. 2978 */ 2979 ret = finish_parity_scrub(rbio); 2980 wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); 2981 for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) { 2982 int found_errors; 2983 2984 found_errors = get_rbio_vertical_errors(rbio, sector_nr, NULL, NULL); 2985 if (unlikely(found_errors > rbio->bioc->max_errors)) { 2986 ret = -EIO; 2987 break; 2988 } 2989 } 2990out: 2991 rbio_orig_end_io(rbio, errno_to_blk_status(ret)); 2992} 2993 2994static void scrub_rbio_work_locked(struct work_struct *work) 2995{ 2996 scrub_rbio(container_of(work, struct btrfs_raid_bio, work)); 2997} 2998 2999void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio) 3000{ 3001 if (!lock_stripe_add(rbio)) 3002 start_async_work(rbio, scrub_rbio_work_locked); 3003} 3004 3005/* 3006 * This is for scrub call sites where we already have correct data contents. 3007 * This allows us to avoid reading data stripes again. 3008 * 3009 * Unfortunately here we have to do folio copy, other than reusing the pages. 3010 * This is due to the fact rbio has its own page management for its cache. 3011 */ 3012void raid56_parity_cache_data_folios(struct btrfs_raid_bio *rbio, 3013 struct folio **data_folios, u64 data_logical) 3014{ 3015 struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; 3016 const u64 offset_in_full_stripe = data_logical - 3017 rbio->bioc->full_stripe_logical; 3018 unsigned int findex = 0; 3019 unsigned int foffset = 0; 3020 int ret; 3021 3022 /* 3023 * If we hit ENOMEM temporarily, but later at 3024 * raid56_parity_submit_scrub_rbio() time it succeeded, we just do 3025 * the extra read, not a big deal. 
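	 * (scrub_assemble_read_bios() will then simply read those sectors
	 * from disk.)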
3026 * 3027 * If we hit ENOMEM later at raid56_parity_submit_scrub_rbio() time, 3028 * the bio would got proper error number set. 3029 */ 3030 ret = alloc_rbio_data_pages(rbio); 3031 if (ret < 0) 3032 return; 3033 3034 /* data_logical must be at stripe boundary and inside the full stripe. */ 3035 ASSERT(IS_ALIGNED(offset_in_full_stripe, BTRFS_STRIPE_LEN)); 3036 ASSERT(offset_in_full_stripe < (rbio->nr_data << BTRFS_STRIPE_LEN_SHIFT)); 3037 3038 for (unsigned int cur_off = offset_in_full_stripe; 3039 cur_off < offset_in_full_stripe + BTRFS_STRIPE_LEN; 3040 cur_off += PAGE_SIZE) { 3041 const unsigned int pindex = cur_off >> PAGE_SHIFT; 3042 void *kaddr; 3043 3044 kaddr = kmap_local_page(rbio->stripe_pages[pindex]); 3045 memcpy_from_folio(kaddr, data_folios[findex], foffset, PAGE_SIZE); 3046 kunmap_local(kaddr); 3047 3048 foffset += PAGE_SIZE; 3049 ASSERT(foffset <= folio_size(data_folios[findex])); 3050 if (foffset == folio_size(data_folios[findex])) { 3051 findex++; 3052 foffset = 0; 3053 } 3054 } 3055 bitmap_set(rbio->stripe_uptodate_bitmap, 3056 offset_in_full_stripe >> fs_info->sectorsize_bits, 3057 BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits); 3058}