drivers/md/raid10.c at v2.6.30-rc2 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / drivers / md / raid10.c
at v2.6.30-rc2 2301 lines 63 kB view raw
   1/*
   2 * raid10.c : Multiple Devices driver for Linux
   3 *
   4 * Copyright (C) 2000-2004 Neil Brown
   5 *
   6 * RAID-10 support for md.
   7 *
   8 * Based on code in raid1.c.  See raid1.c for further copyright information.
   9 *
  10 *
  11 * This program is free software; you can redistribute it and/or modify
  12 * it under the terms of the GNU General Public License as published by
  13 * the Free Software Foundation; either version 2, or (at your option)
  14 * any later version.
  15 *
  16 * You should have received a copy of the GNU General Public License
  17 * (for example /usr/src/linux/COPYING); if not, write to the Free
  18 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  19 */
  20
  21#include <linux/delay.h>
  22#include <linux/blkdev.h>
  23#include <linux/seq_file.h>
  24#include "md.h"
  25#include "dm-bio-list.h"
  26#include "raid10.h"
  27#include "bitmap.h"
  28
  29/*
  30 * RAID10 provides a combination of RAID0 and RAID1 functionality.
  31 * The layout of data is defined by
  32 *    chunk_size
  33 *    raid_disks
  34 *    near_copies (stored in low byte of layout)
  35 *    far_copies (stored in second byte of layout)
  36 *    far_offset (stored in bit 16 of layout )
  37 *
  38 * The data to be stored is divided into chunks using chunksize.
  39 * Each device is divided into far_copies sections.
  40 * In each section, chunks are laid out in a style similar to raid0, but
  41 * near_copies copies of each chunk is stored (each on a different drive).
  42 * The starting device for each section is offset near_copies from the starting
  43 * device of the previous section.
  44 * Thus there are (near_copies*far_copies) copies of each chunk, and each is
  45 * on a different drive.
  46 * near_copies and far_copies must be at least one, and their product is at most
  47 * raid_disks.
  48 *
  49 * If far_offset is true, then the far_copies are handled a bit differently.
  50 * The copies are still in different stripes, but instead of being very far
  51 * apart on disk, they are in adjacent stripes.
  52 */
  53
  54/*
  55 * Number of guaranteed r10bios in case of extreme VM load:
  56 */
  57#define	NR_RAID10_BIOS 256
  58
  59static void unplug_slaves(mddev_t *mddev);
  60
  61static void allow_barrier(conf_t *conf);
  62static void lower_barrier(conf_t *conf);
  63
  64static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
  65{
  66	conf_t *conf = data;
  67	r10bio_t *r10_bio;
  68	int size = offsetof(struct r10bio_s, devs[conf->copies]);
  69
  70	/* allocate a r10bio with room for raid_disks entries in the bios array */
  71	r10_bio = kzalloc(size, gfp_flags);
  72	if (!r10_bio)
  73		unplug_slaves(conf->mddev);
  74
  75	return r10_bio;
  76}
  77
  78static void r10bio_pool_free(void *r10_bio, void *data)
  79{
  80	kfree(r10_bio);
  81}
  82
  83/* Maximum size of each resync request */
  84#define RESYNC_BLOCK_SIZE (64*1024)
  85#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
  86/* amount of memory to reserve for resync requests */
  87#define RESYNC_WINDOW (1024*1024)
  88/* maximum number of concurrent requests, memory permitting */
  89#define RESYNC_DEPTH (32*1024*1024/RESYNC_BLOCK_SIZE)
  90
  91/*
  92 * When performing a resync, we need to read and compare, so
  93 * we need as many pages are there are copies.
  94 * When performing a recovery, we need 2 bios, one for read,
  95 * one for write (we recover only one drive per r10buf)
  96 *
  97 */
  98static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
  99{
 100	conf_t *conf = data;
 101	struct page *page;
 102	r10bio_t *r10_bio;
 103	struct bio *bio;
 104	int i, j;
 105	int nalloc;
 106
 107	r10_bio = r10bio_pool_alloc(gfp_flags, conf);
 108	if (!r10_bio) {
 109		unplug_slaves(conf->mddev);
 110		return NULL;
 111	}
 112
 113	if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery))
 114		nalloc = conf->copies; /* resync */
 115	else
 116		nalloc = 2; /* recovery */
 117
 118	/*
 119	 * Allocate bios.
 120	 */
 121	for (j = nalloc ; j-- ; ) {
 122		bio = bio_alloc(gfp_flags, RESYNC_PAGES);
 123		if (!bio)
 124			goto out_free_bio;
 125		r10_bio->devs[j].bio = bio;
 126	}
 127	/*
 128	 * Allocate RESYNC_PAGES data pages and attach them
 129	 * where needed.
 130	 */
 131	for (j = 0 ; j < nalloc; j++) {
 132		bio = r10_bio->devs[j].bio;
 133		for (i = 0; i < RESYNC_PAGES; i++) {
 134			page = alloc_page(gfp_flags);
 135			if (unlikely(!page))
 136				goto out_free_pages;
 137
 138			bio->bi_io_vec[i].bv_page = page;
 139		}
 140	}
 141
 142	return r10_bio;
 143
 144out_free_pages:
 145	for ( ; i > 0 ; i--)
 146		safe_put_page(bio->bi_io_vec[i-1].bv_page);
 147	while (j--)
 148		for (i = 0; i < RESYNC_PAGES ; i++)
 149			safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page);
 150	j = -1;
 151out_free_bio:
 152	while ( ++j < nalloc )
 153		bio_put(r10_bio->devs[j].bio);
 154	r10bio_pool_free(r10_bio, conf);
 155	return NULL;
 156}
 157
 158static void r10buf_pool_free(void *__r10_bio, void *data)
 159{
 160	int i;
 161	conf_t *conf = data;
 162	r10bio_t *r10bio = __r10_bio;
 163	int j;
 164
 165	for (j=0; j < conf->copies; j++) {
 166		struct bio *bio = r10bio->devs[j].bio;
 167		if (bio) {
 168			for (i = 0; i < RESYNC_PAGES; i++) {
 169				safe_put_page(bio->bi_io_vec[i].bv_page);
 170				bio->bi_io_vec[i].bv_page = NULL;
 171			}
 172			bio_put(bio);
 173		}
 174	}
 175	r10bio_pool_free(r10bio, conf);
 176}
 177
 178static void put_all_bios(conf_t *conf, r10bio_t *r10_bio)
 179{
 180	int i;
 181
 182	for (i = 0; i < conf->copies; i++) {
 183		struct bio **bio = & r10_bio->devs[i].bio;
 184		if (*bio && *bio != IO_BLOCKED)
 185			bio_put(*bio);
 186		*bio = NULL;
 187	}
 188}
 189
 190static void free_r10bio(r10bio_t *r10_bio)
 191{
 192	conf_t *conf = mddev_to_conf(r10_bio->mddev);
 193
 194	/*
 195	 * Wake up any possible resync thread that waits for the device
 196	 * to go idle.
 197	 */
 198	allow_barrier(conf);
 199
 200	put_all_bios(conf, r10_bio);
 201	mempool_free(r10_bio, conf->r10bio_pool);
 202}
 203
 204static void put_buf(r10bio_t *r10_bio)
 205{
 206	conf_t *conf = mddev_to_conf(r10_bio->mddev);
 207
 208	mempool_free(r10_bio, conf->r10buf_pool);
 209
 210	lower_barrier(conf);
 211}
 212
 213static void reschedule_retry(r10bio_t *r10_bio)
 214{
 215	unsigned long flags;
 216	mddev_t *mddev = r10_bio->mddev;
 217	conf_t *conf = mddev_to_conf(mddev);
 218
 219	spin_lock_irqsave(&conf->device_lock, flags);
 220	list_add(&r10_bio->retry_list, &conf->retry_list);
 221	conf->nr_queued ++;
 222	spin_unlock_irqrestore(&conf->device_lock, flags);
 223
 224	/* wake up frozen array... */
 225	wake_up(&conf->wait_barrier);
 226
 227	md_wakeup_thread(mddev->thread);
 228}
 229
 230/*
 231 * raid_end_bio_io() is called when we have finished servicing a mirrored
 232 * operation and are ready to return a success/failure code to the buffer
 233 * cache layer.
 234 */
 235static void raid_end_bio_io(r10bio_t *r10_bio)
 236{
 237	struct bio *bio = r10_bio->master_bio;
 238
 239	bio_endio(bio,
 240		test_bit(R10BIO_Uptodate, &r10_bio->state) ? 0 : -EIO);
 241	free_r10bio(r10_bio);
 242}
 243
 244/*
 245 * Update disk head position estimator based on IRQ completion info.
 246 */
 247static inline void update_head_pos(int slot, r10bio_t *r10_bio)
 248{
 249	conf_t *conf = mddev_to_conf(r10_bio->mddev);
 250
 251	conf->mirrors[r10_bio->devs[slot].devnum].head_position =
 252		r10_bio->devs[slot].addr + (r10_bio->sectors);
 253}
 254
 255static void raid10_end_read_request(struct bio *bio, int error)
 256{
 257	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 258	r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private);
 259	int slot, dev;
 260	conf_t *conf = mddev_to_conf(r10_bio->mddev);
 261
 262
 263	slot = r10_bio->read_slot;
 264	dev = r10_bio->devs[slot].devnum;
 265	/*
 266	 * this branch is our 'one mirror IO has finished' event handler:
 267	 */
 268	update_head_pos(slot, r10_bio);
 269
 270	if (uptodate) {
 271		/*
 272		 * Set R10BIO_Uptodate in our master bio, so that
 273		 * we will return a good error code to the higher
 274		 * levels even if IO on some other mirrored buffer fails.
 275		 *
 276		 * The 'master' represents the composite IO operation to
 277		 * user-side. So if something waits for IO, then it will
 278		 * wait for the 'master' bio.
 279		 */
 280		set_bit(R10BIO_Uptodate, &r10_bio->state);
 281		raid_end_bio_io(r10_bio);
 282	} else {
 283		/*
 284		 * oops, read error:
 285		 */
 286		char b[BDEVNAME_SIZE];
 287		if (printk_ratelimit())
 288			printk(KERN_ERR "raid10: %s: rescheduling sector %llu\n",
 289			       bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector);
 290		reschedule_retry(r10_bio);
 291	}
 292
 293	rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
 294}
 295
 296static void raid10_end_write_request(struct bio *bio, int error)
 297{
 298	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 299	r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private);
 300	int slot, dev;
 301	conf_t *conf = mddev_to_conf(r10_bio->mddev);
 302
 303	for (slot = 0; slot < conf->copies; slot++)
 304		if (r10_bio->devs[slot].bio == bio)
 305			break;
 306	dev = r10_bio->devs[slot].devnum;
 307
 308	/*
 309	 * this branch is our 'one mirror IO has finished' event handler:
 310	 */
 311	if (!uptodate) {
 312		md_error(r10_bio->mddev, conf->mirrors[dev].rdev);
 313		/* an I/O failed, we can't clear the bitmap */
 314		set_bit(R10BIO_Degraded, &r10_bio->state);
 315	} else
 316		/*
 317		 * Set R10BIO_Uptodate in our master bio, so that
 318		 * we will return a good error code for to the higher
 319		 * levels even if IO on some other mirrored buffer fails.
 320		 *
 321		 * The 'master' represents the composite IO operation to
 322		 * user-side. So if something waits for IO, then it will
 323		 * wait for the 'master' bio.
 324		 */
 325		set_bit(R10BIO_Uptodate, &r10_bio->state);
 326
 327	update_head_pos(slot, r10_bio);
 328
 329	/*
 330	 *
 331	 * Let's see if all mirrored write operations have finished
 332	 * already.
 333	 */
 334	if (atomic_dec_and_test(&r10_bio->remaining)) {
 335		/* clear the bitmap if all writes complete successfully */
 336		bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
 337				r10_bio->sectors,
 338				!test_bit(R10BIO_Degraded, &r10_bio->state),
 339				0);
 340		md_write_end(r10_bio->mddev);
 341		raid_end_bio_io(r10_bio);
 342	}
 343
 344	rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
 345}
 346
 347
 348/*
 349 * RAID10 layout manager
 350 * As well as the chunksize and raid_disks count, there are two
 351 * parameters: near_copies and far_copies.
 352 * near_copies * far_copies must be <= raid_disks.
 353 * Normally one of these will be 1.
 354 * If both are 1, we get raid0.
 355 * If near_copies == raid_disks, we get raid1.
 356 *
 357 * Chunks are laid out in raid0 style with near_copies copies of the
 358 * first chunk, followed by near_copies copies of the next chunk and
 359 * so on.
 360 * If far_copies > 1, then after 1/far_copies of the array has been assigned
 361 * as described above, we start again with a device offset of near_copies.
 362 * So we effectively have another copy of the whole array further down all
 363 * the drives, but with blocks on different drives.
 364 * With this layout, any block is never stored twice on the one device.
 365 *
 366 * raid10_find_phys finds the sector offset of a given virtual sector
 367 * on each device that it is on.
 368 *
 369 * raid10_find_virt does the reverse mapping, from a device and a
 370 * sector offset to a virtual address
 371 */
 372
 373static void raid10_find_phys(conf_t *conf, r10bio_t *r10bio)
 374{
 375	int n,f;
 376	sector_t sector;
 377	sector_t chunk;
 378	sector_t stripe;
 379	int dev;
 380
 381	int slot = 0;
 382
 383	/* now calculate first sector/dev */
 384	chunk = r10bio->sector >> conf->chunk_shift;
 385	sector = r10bio->sector & conf->chunk_mask;
 386
 387	chunk *= conf->near_copies;
 388	stripe = chunk;
 389	dev = sector_div(stripe, conf->raid_disks);
 390	if (conf->far_offset)
 391		stripe *= conf->far_copies;
 392
 393	sector += stripe << conf->chunk_shift;
 394
 395	/* and calculate all the others */
 396	for (n=0; n < conf->near_copies; n++) {
 397		int d = dev;
 398		sector_t s = sector;
 399		r10bio->devs[slot].addr = sector;
 400		r10bio->devs[slot].devnum = d;
 401		slot++;
 402
 403		for (f = 1; f < conf->far_copies; f++) {
 404			d += conf->near_copies;
 405			if (d >= conf->raid_disks)
 406				d -= conf->raid_disks;
 407			s += conf->stride;
 408			r10bio->devs[slot].devnum = d;
 409			r10bio->devs[slot].addr = s;
 410			slot++;
 411		}
 412		dev++;
 413		if (dev >= conf->raid_disks) {
 414			dev = 0;
 415			sector += (conf->chunk_mask + 1);
 416		}
 417	}
 418	BUG_ON(slot != conf->copies);
 419}
 420
 421static sector_t raid10_find_virt(conf_t *conf, sector_t sector, int dev)
 422{
 423	sector_t offset, chunk, vchunk;
 424
 425	offset = sector & conf->chunk_mask;
 426	if (conf->far_offset) {
 427		int fc;
 428		chunk = sector >> conf->chunk_shift;
 429		fc = sector_div(chunk, conf->far_copies);
 430		dev -= fc * conf->near_copies;
 431		if (dev < 0)
 432			dev += conf->raid_disks;
 433	} else {
 434		while (sector >= conf->stride) {
 435			sector -= conf->stride;
 436			if (dev < conf->near_copies)
 437				dev += conf->raid_disks - conf->near_copies;
 438			else
 439				dev -= conf->near_copies;
 440		}
 441		chunk = sector >> conf->chunk_shift;
 442	}
 443	vchunk = chunk * conf->raid_disks + dev;
 444	sector_div(vchunk, conf->near_copies);
 445	return (vchunk << conf->chunk_shift) + offset;
 446}
 447
 448/**
 449 *	raid10_mergeable_bvec -- tell bio layer if a two requests can be merged
 450 *	@q: request queue
 451 *	@bvm: properties of new bio
 452 *	@biovec: the request that could be merged to it.
 453 *
 454 *	Return amount of bytes we can accept at this offset
 455 *      If near_copies == raid_disk, there are no striping issues,
 456 *      but in that case, the function isn't called at all.
 457 */
 458static int raid10_mergeable_bvec(struct request_queue *q,
 459				 struct bvec_merge_data *bvm,
 460				 struct bio_vec *biovec)
 461{
 462	mddev_t *mddev = q->queuedata;
 463	sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
 464	int max;
 465	unsigned int chunk_sectors = mddev->chunk_size >> 9;
 466	unsigned int bio_sectors = bvm->bi_size >> 9;
 467
 468	max =  (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
 469	if (max < 0) max = 0; /* bio_add cannot handle a negative return */
 470	if (max <= biovec->bv_len && bio_sectors == 0)
 471		return biovec->bv_len;
 472	else
 473		return max;
 474}
 475
 476/*
 477 * This routine returns the disk from which the requested read should
 478 * be done. There is a per-array 'next expected sequential IO' sector
 479 * number - if this matches on the next IO then we use the last disk.
 480 * There is also a per-disk 'last know head position' sector that is
 481 * maintained from IRQ contexts, both the normal and the resync IO
 482 * completion handlers update this position correctly. If there is no
 483 * perfect sequential match then we pick the disk whose head is closest.
 484 *
 485 * If there are 2 mirrors in the same 2 devices, performance degrades
 486 * because position is mirror, not device based.
 487 *
 488 * The rdev for the device selected will have nr_pending incremented.
 489 */
 490
 491/*
 492 * FIXME: possibly should rethink readbalancing and do it differently
 493 * depending on near_copies / far_copies geometry.
 494 */
 495static int read_balance(conf_t *conf, r10bio_t *r10_bio)
 496{
 497	const unsigned long this_sector = r10_bio->sector;
 498	int disk, slot, nslot;
 499	const int sectors = r10_bio->sectors;
 500	sector_t new_distance, current_distance;
 501	mdk_rdev_t *rdev;
 502
 503	raid10_find_phys(conf, r10_bio);
 504	rcu_read_lock();
 505	/*
 506	 * Check if we can balance. We can balance on the whole
 507	 * device if no resync is going on (recovery is ok), or below
 508	 * the resync window. We take the first readable disk when
 509	 * above the resync window.
 510	 */
 511	if (conf->mddev->recovery_cp < MaxSector
 512	    && (this_sector + sectors >= conf->next_resync)) {
 513		/* make sure that disk is operational */
 514		slot = 0;
 515		disk = r10_bio->devs[slot].devnum;
 516
 517		while ((rdev = rcu_dereference(conf->mirrors[disk].rdev)) == NULL ||
 518		       r10_bio->devs[slot].bio == IO_BLOCKED ||
 519		       !test_bit(In_sync, &rdev->flags)) {
 520			slot++;
 521			if (slot == conf->copies) {
 522				slot = 0;
 523				disk = -1;
 524				break;
 525			}
 526			disk = r10_bio->devs[slot].devnum;
 527		}
 528		goto rb_out;
 529	}
 530
 531
 532	/* make sure the disk is operational */
 533	slot = 0;
 534	disk = r10_bio->devs[slot].devnum;
 535	while ((rdev=rcu_dereference(conf->mirrors[disk].rdev)) == NULL ||
 536	       r10_bio->devs[slot].bio == IO_BLOCKED ||
 537	       !test_bit(In_sync, &rdev->flags)) {
 538		slot ++;
 539		if (slot == conf->copies) {
 540			disk = -1;
 541			goto rb_out;
 542		}
 543		disk = r10_bio->devs[slot].devnum;
 544	}
 545
 546
 547	current_distance = abs(r10_bio->devs[slot].addr -
 548			       conf->mirrors[disk].head_position);
 549
 550	/* Find the disk whose head is closest,
 551	 * or - for far > 1 - find the closest to partition beginning */
 552
 553	for (nslot = slot; nslot < conf->copies; nslot++) {
 554		int ndisk = r10_bio->devs[nslot].devnum;
 555
 556
 557		if ((rdev=rcu_dereference(conf->mirrors[ndisk].rdev)) == NULL ||
 558		    r10_bio->devs[nslot].bio == IO_BLOCKED ||
 559		    !test_bit(In_sync, &rdev->flags))
 560			continue;
 561
 562		/* This optimisation is debatable, and completely destroys
 563		 * sequential read speed for 'far copies' arrays.  So only
 564		 * keep it for 'near' arrays, and review those later.
 565		 */
 566		if (conf->near_copies > 1 && !atomic_read(&rdev->nr_pending)) {
 567			disk = ndisk;
 568			slot = nslot;
 569			break;
 570		}
 571
 572		/* for far > 1 always use the lowest address */
 573		if (conf->far_copies > 1)
 574			new_distance = r10_bio->devs[nslot].addr;
 575		else
 576			new_distance = abs(r10_bio->devs[nslot].addr -
 577					   conf->mirrors[ndisk].head_position);
 578		if (new_distance < current_distance) {
 579			current_distance = new_distance;
 580			disk = ndisk;
 581			slot = nslot;
 582		}
 583	}
 584
 585rb_out:
 586	r10_bio->read_slot = slot;
 587/*	conf->next_seq_sect = this_sector + sectors;*/
 588
 589	if (disk >= 0 && (rdev=rcu_dereference(conf->mirrors[disk].rdev))!= NULL)
 590		atomic_inc(&conf->mirrors[disk].rdev->nr_pending);
 591	else
 592		disk = -1;
 593	rcu_read_unlock();
 594
 595	return disk;
 596}
 597
 598static void unplug_slaves(mddev_t *mddev)
 599{
 600	conf_t *conf = mddev_to_conf(mddev);
 601	int i;
 602
 603	rcu_read_lock();
 604	for (i=0; i<mddev->raid_disks; i++) {
 605		mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
 606		if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
 607			struct request_queue *r_queue = bdev_get_queue(rdev->bdev);
 608
 609			atomic_inc(&rdev->nr_pending);
 610			rcu_read_unlock();
 611
 612			blk_unplug(r_queue);
 613
 614			rdev_dec_pending(rdev, mddev);
 615			rcu_read_lock();
 616		}
 617	}
 618	rcu_read_unlock();
 619}
 620
 621static void raid10_unplug(struct request_queue *q)
 622{
 623	mddev_t *mddev = q->queuedata;
 624
 625	unplug_slaves(q->queuedata);
 626	md_wakeup_thread(mddev->thread);
 627}
 628
 629static int raid10_congested(void *data, int bits)
 630{
 631	mddev_t *mddev = data;
 632	conf_t *conf = mddev_to_conf(mddev);
 633	int i, ret = 0;
 634
 635	rcu_read_lock();
 636	for (i = 0; i < mddev->raid_disks && ret == 0; i++) {
 637		mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
 638		if (rdev && !test_bit(Faulty, &rdev->flags)) {
 639			struct request_queue *q = bdev_get_queue(rdev->bdev);
 640
 641			ret |= bdi_congested(&q->backing_dev_info, bits);
 642		}
 643	}
 644	rcu_read_unlock();
 645	return ret;
 646}
 647
 648static int flush_pending_writes(conf_t *conf)
 649{
 650	/* Any writes that have been queued but are awaiting
 651	 * bitmap updates get flushed here.
 652	 * We return 1 if any requests were actually submitted.
 653	 */
 654	int rv = 0;
 655
 656	spin_lock_irq(&conf->device_lock);
 657
 658	if (conf->pending_bio_list.head) {
 659		struct bio *bio;
 660		bio = bio_list_get(&conf->pending_bio_list);
 661		blk_remove_plug(conf->mddev->queue);
 662		spin_unlock_irq(&conf->device_lock);
 663		/* flush any pending bitmap writes to disk
 664		 * before proceeding w/ I/O */
 665		bitmap_unplug(conf->mddev->bitmap);
 666
 667		while (bio) { /* submit pending writes */
 668			struct bio *next = bio->bi_next;
 669			bio->bi_next = NULL;
 670			generic_make_request(bio);
 671			bio = next;
 672		}
 673		rv = 1;
 674	} else
 675		spin_unlock_irq(&conf->device_lock);
 676	return rv;
 677}
 678/* Barriers....
 679 * Sometimes we need to suspend IO while we do something else,
 680 * either some resync/recovery, or reconfigure the array.
 681 * To do this we raise a 'barrier'.
 682 * The 'barrier' is a counter that can be raised multiple times
 683 * to count how many activities are happening which preclude
 684 * normal IO.
 685 * We can only raise the barrier if there is no pending IO.
 686 * i.e. if nr_pending == 0.
 687 * We choose only to raise the barrier if no-one is waiting for the
 688 * barrier to go down.  This means that as soon as an IO request
 689 * is ready, no other operations which require a barrier will start
 690 * until the IO request has had a chance.
 691 *
 692 * So: regular IO calls 'wait_barrier'.  When that returns there
 693 *    is no background IO happening.  It must arrange to call
 694 *    allow_barrier when it has finished its IO.
 695 * background IO calls must call raise_barrier.  Once that returns
 696 *    there is no normal IO happening.  It must arrange to call
 697 *    lower_barrier when the particular background IO completes.
 698 */
 699
 700static void raise_barrier(conf_t *conf, int force)
 701{
 702	BUG_ON(force && !conf->barrier);
 703	spin_lock_irq(&conf->resync_lock);
 704
 705	/* Wait until no block IO is waiting (unless 'force') */
 706	wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
 707			    conf->resync_lock,
 708			    raid10_unplug(conf->mddev->queue));
 709
 710	/* block any new IO from starting */
 711	conf->barrier++;
 712
 713	/* No wait for all pending IO to complete */
 714	wait_event_lock_irq(conf->wait_barrier,
 715			    !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
 716			    conf->resync_lock,
 717			    raid10_unplug(conf->mddev->queue));
 718
 719	spin_unlock_irq(&conf->resync_lock);
 720}
 721
 722static void lower_barrier(conf_t *conf)
 723{
 724	unsigned long flags;
 725	spin_lock_irqsave(&conf->resync_lock, flags);
 726	conf->barrier--;
 727	spin_unlock_irqrestore(&conf->resync_lock, flags);
 728	wake_up(&conf->wait_barrier);
 729}
 730
 731static void wait_barrier(conf_t *conf)
 732{
 733	spin_lock_irq(&conf->resync_lock);
 734	if (conf->barrier) {
 735		conf->nr_waiting++;
 736		wait_event_lock_irq(conf->wait_barrier, !conf->barrier,
 737				    conf->resync_lock,
 738				    raid10_unplug(conf->mddev->queue));
 739		conf->nr_waiting--;
 740	}
 741	conf->nr_pending++;
 742	spin_unlock_irq(&conf->resync_lock);
 743}
 744
 745static void allow_barrier(conf_t *conf)
 746{
 747	unsigned long flags;
 748	spin_lock_irqsave(&conf->resync_lock, flags);
 749	conf->nr_pending--;
 750	spin_unlock_irqrestore(&conf->resync_lock, flags);
 751	wake_up(&conf->wait_barrier);
 752}
 753
 754static void freeze_array(conf_t *conf)
 755{
 756	/* stop syncio and normal IO and wait for everything to
 757	 * go quiet.
 758	 * We increment barrier and nr_waiting, and then
 759	 * wait until nr_pending match nr_queued+1
 760	 * This is called in the context of one normal IO request
 761	 * that has failed. Thus any sync request that might be pending
 762	 * will be blocked by nr_pending, and we need to wait for
 763	 * pending IO requests to complete or be queued for re-try.
 764	 * Thus the number queued (nr_queued) plus this request (1)
 765	 * must match the number of pending IOs (nr_pending) before
 766	 * we continue.
 767	 */
 768	spin_lock_irq(&conf->resync_lock);
 769	conf->barrier++;
 770	conf->nr_waiting++;
 771	wait_event_lock_irq(conf->wait_barrier,
 772			    conf->nr_pending == conf->nr_queued+1,
 773			    conf->resync_lock,
 774			    ({ flush_pending_writes(conf);
 775			       raid10_unplug(conf->mddev->queue); }));
 776	spin_unlock_irq(&conf->resync_lock);
 777}
 778
 779static void unfreeze_array(conf_t *conf)
 780{
 781	/* reverse the effect of the freeze */
 782	spin_lock_irq(&conf->resync_lock);
 783	conf->barrier--;
 784	conf->nr_waiting--;
 785	wake_up(&conf->wait_barrier);
 786	spin_unlock_irq(&conf->resync_lock);
 787}
 788
 789static int make_request(struct request_queue *q, struct bio * bio)
 790{
 791	mddev_t *mddev = q->queuedata;
 792	conf_t *conf = mddev_to_conf(mddev);
 793	mirror_info_t *mirror;
 794	r10bio_t *r10_bio;
 795	struct bio *read_bio;
 796	int cpu;
 797	int i;
 798	int chunk_sects = conf->chunk_mask + 1;
 799	const int rw = bio_data_dir(bio);
 800	const int do_sync = bio_sync(bio);
 801	struct bio_list bl;
 802	unsigned long flags;
 803	mdk_rdev_t *blocked_rdev;
 804
 805	if (unlikely(bio_barrier(bio))) {
 806		bio_endio(bio, -EOPNOTSUPP);
 807		return 0;
 808	}
 809
 810	/* If this request crosses a chunk boundary, we need to
 811	 * split it.  This will only happen for 1 PAGE (or less) requests.
 812	 */
 813	if (unlikely( (bio->bi_sector & conf->chunk_mask) + (bio->bi_size >> 9)
 814		      > chunk_sects &&
 815		    conf->near_copies < conf->raid_disks)) {
 816		struct bio_pair *bp;
 817		/* Sanity check -- queue functions should prevent this happening */
 818		if (bio->bi_vcnt != 1 ||
 819		    bio->bi_idx != 0)
 820			goto bad_map;
 821		/* This is a one page bio that upper layers
 822		 * refuse to split for us, so we need to split it.
 823		 */
 824		bp = bio_split(bio,
 825			       chunk_sects - (bio->bi_sector & (chunk_sects - 1)) );
 826		if (make_request(q, &bp->bio1))
 827			generic_make_request(&bp->bio1);
 828		if (make_request(q, &bp->bio2))
 829			generic_make_request(&bp->bio2);
 830
 831		bio_pair_release(bp);
 832		return 0;
 833	bad_map:
 834		printk("raid10_make_request bug: can't convert block across chunks"
 835		       " or bigger than %dk %llu %d\n", chunk_sects/2,
 836		       (unsigned long long)bio->bi_sector, bio->bi_size >> 10);
 837
 838		bio_io_error(bio);
 839		return 0;
 840	}
 841
 842	md_write_start(mddev, bio);
 843
 844	/*
 845	 * Register the new request and wait if the reconstruction
 846	 * thread has put up a bar for new requests.
 847	 * Continue immediately if no resync is active currently.
 848	 */
 849	wait_barrier(conf);
 850
 851	cpu = part_stat_lock();
 852	part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
 853	part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw],
 854		      bio_sectors(bio));
 855	part_stat_unlock();
 856
 857	r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
 858
 859	r10_bio->master_bio = bio;
 860	r10_bio->sectors = bio->bi_size >> 9;
 861
 862	r10_bio->mddev = mddev;
 863	r10_bio->sector = bio->bi_sector;
 864	r10_bio->state = 0;
 865
 866	if (rw == READ) {
 867		/*
 868		 * read balancing logic:
 869		 */
 870		int disk = read_balance(conf, r10_bio);
 871		int slot = r10_bio->read_slot;
 872		if (disk < 0) {
 873			raid_end_bio_io(r10_bio);
 874			return 0;
 875		}
 876		mirror = conf->mirrors + disk;
 877
 878		read_bio = bio_clone(bio, GFP_NOIO);
 879
 880		r10_bio->devs[slot].bio = read_bio;
 881
 882		read_bio->bi_sector = r10_bio->devs[slot].addr +
 883			mirror->rdev->data_offset;
 884		read_bio->bi_bdev = mirror->rdev->bdev;
 885		read_bio->bi_end_io = raid10_end_read_request;
 886		read_bio->bi_rw = READ | do_sync;
 887		read_bio->bi_private = r10_bio;
 888
 889		generic_make_request(read_bio);
 890		return 0;
 891	}
 892
 893	/*
 894	 * WRITE:
 895	 */
 896	/* first select target devices under rcu_lock and
 897	 * inc refcount on their rdev.  Record them by setting
 898	 * bios[x] to bio
 899	 */
 900	raid10_find_phys(conf, r10_bio);
 901 retry_write:
 902	blocked_rdev = NULL;
 903	rcu_read_lock();
 904	for (i = 0;  i < conf->copies; i++) {
 905		int d = r10_bio->devs[i].devnum;
 906		mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[d].rdev);
 907		if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
 908			atomic_inc(&rdev->nr_pending);
 909			blocked_rdev = rdev;
 910			break;
 911		}
 912		if (rdev && !test_bit(Faulty, &rdev->flags)) {
 913			atomic_inc(&rdev->nr_pending);
 914			r10_bio->devs[i].bio = bio;
 915		} else {
 916			r10_bio->devs[i].bio = NULL;
 917			set_bit(R10BIO_Degraded, &r10_bio->state);
 918		}
 919	}
 920	rcu_read_unlock();
 921
 922	if (unlikely(blocked_rdev)) {
 923		/* Have to wait for this device to get unblocked, then retry */
 924		int j;
 925		int d;
 926
 927		for (j = 0; j < i; j++)
 928			if (r10_bio->devs[j].bio) {
 929				d = r10_bio->devs[j].devnum;
 930				rdev_dec_pending(conf->mirrors[d].rdev, mddev);
 931			}
 932		allow_barrier(conf);
 933		md_wait_for_blocked_rdev(blocked_rdev, mddev);
 934		wait_barrier(conf);
 935		goto retry_write;
 936	}
 937
 938	atomic_set(&r10_bio->remaining, 0);
 939
 940	bio_list_init(&bl);
 941	for (i = 0; i < conf->copies; i++) {
 942		struct bio *mbio;
 943		int d = r10_bio->devs[i].devnum;
 944		if (!r10_bio->devs[i].bio)
 945			continue;
 946
 947		mbio = bio_clone(bio, GFP_NOIO);
 948		r10_bio->devs[i].bio = mbio;
 949
 950		mbio->bi_sector	= r10_bio->devs[i].addr+
 951			conf->mirrors[d].rdev->data_offset;
 952		mbio->bi_bdev = conf->mirrors[d].rdev->bdev;
 953		mbio->bi_end_io	= raid10_end_write_request;
 954		mbio->bi_rw = WRITE | do_sync;
 955		mbio->bi_private = r10_bio;
 956
 957		atomic_inc(&r10_bio->remaining);
 958		bio_list_add(&bl, mbio);
 959	}
 960
 961	if (unlikely(!atomic_read(&r10_bio->remaining))) {
 962		/* the array is dead */
 963		md_write_end(mddev);
 964		raid_end_bio_io(r10_bio);
 965		return 0;
 966	}
 967
 968	bitmap_startwrite(mddev->bitmap, bio->bi_sector, r10_bio->sectors, 0);
 969	spin_lock_irqsave(&conf->device_lock, flags);
 970	bio_list_merge(&conf->pending_bio_list, &bl);
 971	blk_plug_device(mddev->queue);
 972	spin_unlock_irqrestore(&conf->device_lock, flags);
 973
 974	/* In case raid10d snuck in to freeze_array */
 975	wake_up(&conf->wait_barrier);
 976
 977	if (do_sync)
 978		md_wakeup_thread(mddev->thread);
 979
 980	return 0;
 981}
 982
 983static void status(struct seq_file *seq, mddev_t *mddev)
 984{
 985	conf_t *conf = mddev_to_conf(mddev);
 986	int i;
 987
 988	if (conf->near_copies < conf->raid_disks)
 989		seq_printf(seq, " %dK chunks", mddev->chunk_size/1024);
 990	if (conf->near_copies > 1)
 991		seq_printf(seq, " %d near-copies", conf->near_copies);
 992	if (conf->far_copies > 1) {
 993		if (conf->far_offset)
 994			seq_printf(seq, " %d offset-copies", conf->far_copies);
 995		else
 996			seq_printf(seq, " %d far-copies", conf->far_copies);
 997	}
 998	seq_printf(seq, " [%d/%d] [", conf->raid_disks,
 999					conf->raid_disks - mddev->degraded);
1000	for (i = 0; i < conf->raid_disks; i++)
1001		seq_printf(seq, "%s",
1002			      conf->mirrors[i].rdev &&
1003			      test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? "U" : "_");
1004	seq_printf(seq, "]");
1005}
1006
1007static void error(mddev_t *mddev, mdk_rdev_t *rdev)
1008{
1009	char b[BDEVNAME_SIZE];
1010	conf_t *conf = mddev_to_conf(mddev);
1011
1012	/*
1013	 * If it is not operational, then we have already marked it as dead
1014	 * else if it is the last working disks, ignore the error, let the
1015	 * next level up know.
1016	 * else mark the drive as failed
1017	 */
1018	if (test_bit(In_sync, &rdev->flags)
1019	    && conf->raid_disks-mddev->degraded == 1)
1020		/*
1021		 * Don't fail the drive, just return an IO error.
1022		 * The test should really be more sophisticated than
1023		 * "working_disks == 1", but it isn't critical, and
1024		 * can wait until we do more sophisticated "is the drive
1025		 * really dead" tests...
1026		 */
1027		return;
1028	if (test_and_clear_bit(In_sync, &rdev->flags)) {
1029		unsigned long flags;
1030		spin_lock_irqsave(&conf->device_lock, flags);
1031		mddev->degraded++;
1032		spin_unlock_irqrestore(&conf->device_lock, flags);
1033		/*
1034		 * if recovery is running, make sure it aborts.
1035		 */
1036		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1037	}
1038	set_bit(Faulty, &rdev->flags);
1039	set_bit(MD_CHANGE_DEVS, &mddev->flags);
1040	printk(KERN_ALERT "raid10: Disk failure on %s, disabling device.\n"
1041		"raid10: Operation continuing on %d devices.\n",
1042		bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded);
1043}
1044
1045static void print_conf(conf_t *conf)
1046{
1047	int i;
1048	mirror_info_t *tmp;
1049
1050	printk("RAID10 conf printout:\n");
1051	if (!conf) {
1052		printk("(!conf)\n");
1053		return;
1054	}
1055	printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
1056		conf->raid_disks);
1057
1058	for (i = 0; i < conf->raid_disks; i++) {
1059		char b[BDEVNAME_SIZE];
1060		tmp = conf->mirrors + i;
1061		if (tmp->rdev)
1062			printk(" disk %d, wo:%d, o:%d, dev:%s\n",
1063				i, !test_bit(In_sync, &tmp->rdev->flags),
1064			        !test_bit(Faulty, &tmp->rdev->flags),
1065				bdevname(tmp->rdev->bdev,b));
1066	}
1067}
1068
1069static void close_sync(conf_t *conf)
1070{
1071	wait_barrier(conf);
1072	allow_barrier(conf);
1073
1074	mempool_destroy(conf->r10buf_pool);
1075	conf->r10buf_pool = NULL;
1076}
1077
1078/* check if there are enough drives for
1079 * every block to appear on atleast one
1080 */
1081static int enough(conf_t *conf)
1082{
1083	int first = 0;
1084
1085	do {
1086		int n = conf->copies;
1087		int cnt = 0;
1088		while (n--) {
1089			if (conf->mirrors[first].rdev)
1090				cnt++;
1091			first = (first+1) % conf->raid_disks;
1092		}
1093		if (cnt == 0)
1094			return 0;
1095	} while (first != 0);
1096	return 1;
1097}
1098
1099static int raid10_spare_active(mddev_t *mddev)
1100{
1101	int i;
1102	conf_t *conf = mddev->private;
1103	mirror_info_t *tmp;
1104
1105	/*
1106	 * Find all non-in_sync disks within the RAID10 configuration
1107	 * and mark them in_sync
1108	 */
1109	for (i = 0; i < conf->raid_disks; i++) {
1110		tmp = conf->mirrors + i;
1111		if (tmp->rdev
1112		    && !test_bit(Faulty, &tmp->rdev->flags)
1113		    && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
1114			unsigned long flags;
1115			spin_lock_irqsave(&conf->device_lock, flags);
1116			mddev->degraded--;
1117			spin_unlock_irqrestore(&conf->device_lock, flags);
1118		}
1119	}
1120
1121	print_conf(conf);
1122	return 0;
1123}
1124
1125
1126static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
1127{
1128	conf_t *conf = mddev->private;
1129	int err = -EEXIST;
1130	int mirror;
1131	mirror_info_t *p;
1132	int first = 0;
1133	int last = mddev->raid_disks - 1;
1134
1135	if (mddev->recovery_cp < MaxSector)
1136		/* only hot-add to in-sync arrays, as recovery is
1137		 * very different from resync
1138		 */
1139		return -EBUSY;
1140	if (!enough(conf))
1141		return -EINVAL;
1142
1143	if (rdev->raid_disk >= 0)
1144		first = last = rdev->raid_disk;
1145
1146	if (rdev->saved_raid_disk >= 0 &&
1147	    rdev->saved_raid_disk >= first &&
1148	    conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
1149		mirror = rdev->saved_raid_disk;
1150	else
1151		mirror = first;
1152	for ( ; mirror <= last ; mirror++)
1153		if ( !(p=conf->mirrors+mirror)->rdev) {
1154
1155			blk_queue_stack_limits(mddev->queue,
1156					       rdev->bdev->bd_disk->queue);
1157			/* as we don't honour merge_bvec_fn, we must never risk
1158			 * violating it, so limit ->max_sector to one PAGE, as
1159			 * a one page request is never in violation.
1160			 */
1161			if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
1162			    mddev->queue->max_sectors > (PAGE_SIZE>>9))
1163				mddev->queue->max_sectors = (PAGE_SIZE>>9);
1164
1165			p->head_position = 0;
1166			rdev->raid_disk = mirror;
1167			err = 0;
1168			if (rdev->saved_raid_disk != mirror)
1169				conf->fullsync = 1;
1170			rcu_assign_pointer(p->rdev, rdev);
1171			break;
1172		}
1173
1174	print_conf(conf);
1175	return err;
1176}
1177
1178static int raid10_remove_disk(mddev_t *mddev, int number)
1179{
1180	conf_t *conf = mddev->private;
1181	int err = 0;
1182	mdk_rdev_t *rdev;
1183	mirror_info_t *p = conf->mirrors+ number;
1184
1185	print_conf(conf);
1186	rdev = p->rdev;
1187	if (rdev) {
1188		if (test_bit(In_sync, &rdev->flags) ||
1189		    atomic_read(&rdev->nr_pending)) {
1190			err = -EBUSY;
1191			goto abort;
1192		}
1193		/* Only remove faulty devices in recovery
1194		 * is not possible.
1195		 */
1196		if (!test_bit(Faulty, &rdev->flags) &&
1197		    enough(conf)) {
1198			err = -EBUSY;
1199			goto abort;
1200		}
1201		p->rdev = NULL;
1202		synchronize_rcu();
1203		if (atomic_read(&rdev->nr_pending)) {
1204			/* lost the race, try later */
1205			err = -EBUSY;
1206			p->rdev = rdev;
1207		}
1208	}
1209abort:
1210
1211	print_conf(conf);
1212	return err;
1213}
1214
1215
1216static void end_sync_read(struct bio *bio, int error)
1217{
1218	r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private);
1219	conf_t *conf = mddev_to_conf(r10_bio->mddev);
1220	int i,d;
1221
1222	for (i=0; i<conf->copies; i++)
1223		if (r10_bio->devs[i].bio == bio)
1224			break;
1225	BUG_ON(i == conf->copies);
1226	update_head_pos(i, r10_bio);
1227	d = r10_bio->devs[i].devnum;
1228
1229	if (test_bit(BIO_UPTODATE, &bio->bi_flags))
1230		set_bit(R10BIO_Uptodate, &r10_bio->state);
1231	else {
1232		atomic_add(r10_bio->sectors,
1233			   &conf->mirrors[d].rdev->corrected_errors);
1234		if (!test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery))
1235			md_error(r10_bio->mddev,
1236				 conf->mirrors[d].rdev);
1237	}
1238
1239	/* for reconstruct, we always reschedule after a read.
1240	 * for resync, only after all reads
1241	 */
1242	rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev);
1243	if (test_bit(R10BIO_IsRecover, &r10_bio->state) ||
1244	    atomic_dec_and_test(&r10_bio->remaining)) {
1245		/* we have read all the blocks,
1246		 * do the comparison in process context in raid10d
1247		 */
1248		reschedule_retry(r10_bio);
1249	}
1250}
1251
1252static void end_sync_write(struct bio *bio, int error)
1253{
1254	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1255	r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private);
1256	mddev_t *mddev = r10_bio->mddev;
1257	conf_t *conf = mddev_to_conf(mddev);
1258	int i,d;
1259
1260	for (i = 0; i < conf->copies; i++)
1261		if (r10_bio->devs[i].bio == bio)
1262			break;
1263	d = r10_bio->devs[i].devnum;
1264
1265	if (!uptodate)
1266		md_error(mddev, conf->mirrors[d].rdev);
1267
1268	update_head_pos(i, r10_bio);
1269
1270	rdev_dec_pending(conf->mirrors[d].rdev, mddev);
1271	while (atomic_dec_and_test(&r10_bio->remaining)) {
1272		if (r10_bio->master_bio == NULL) {
1273			/* the primary of several recovery bios */
1274			sector_t s = r10_bio->sectors;
1275			put_buf(r10_bio);
1276			md_done_sync(mddev, s, 1);
1277			break;
1278		} else {
1279			r10bio_t *r10_bio2 = (r10bio_t *)r10_bio->master_bio;
1280			put_buf(r10_bio);
1281			r10_bio = r10_bio2;
1282		}
1283	}
1284}
1285
1286/*
 1287 * Note: sync and recover are handled very differently for raid10
1288 * This code is for resync.
1289 * For resync, we read through virtual addresses and read all blocks.
1290 * If there is any error, we schedule a write.  The lowest numbered
1291 * drive is authoritative.
1292 * However requests come for physical address, so we need to map.
1293 * For every physical address there are raid_disks/copies virtual addresses,
 1294 * which is always at least one, but is not necessarily an integer.
1295 * This means that a physical address can span multiple chunks, so we may
1296 * have to submit multiple io requests for a single sync request.
1297 */
1298/*
1299 * We check if all blocks are in-sync and only write to blocks that
1300 * aren't in sync
1301 */
1302static void sync_request_write(mddev_t *mddev, r10bio_t *r10_bio)
1303{
1304	conf_t *conf = mddev_to_conf(mddev);
1305	int i, first;
1306	struct bio *tbio, *fbio;
1307
1308	atomic_set(&r10_bio->remaining, 1);
1309
1310	/* find the first device with a block */
1311	for (i=0; i<conf->copies; i++)
1312		if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags))
1313			break;
1314
1315	if (i == conf->copies)
1316		goto done;
1317
1318	first = i;
1319	fbio = r10_bio->devs[i].bio;
1320
1321	/* now find blocks with errors */
1322	for (i=0 ; i < conf->copies ; i++) {
1323		int  j, d;
1324		int vcnt = r10_bio->sectors >> (PAGE_SHIFT-9);
1325
1326		tbio = r10_bio->devs[i].bio;
1327
1328		if (tbio->bi_end_io != end_sync_read)
1329			continue;
1330		if (i == first)
1331			continue;
1332		if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags)) {
1333			/* We know that the bi_io_vec layout is the same for
1334			 * both 'first' and 'i', so we just compare them.
1335			 * All vec entries are PAGE_SIZE;
1336			 */
1337			for (j = 0; j < vcnt; j++)
1338				if (memcmp(page_address(fbio->bi_io_vec[j].bv_page),
1339					   page_address(tbio->bi_io_vec[j].bv_page),
1340					   PAGE_SIZE))
1341					break;
1342			if (j == vcnt)
1343				continue;
1344			mddev->resync_mismatches += r10_bio->sectors;
1345		}
1346		if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
1347			/* Don't fix anything. */
1348			continue;
1349		/* Ok, we need to write this bio
1350		 * First we need to fixup bv_offset, bv_len and
1351		 * bi_vecs, as the read request might have corrupted these
1352		 */
1353		tbio->bi_vcnt = vcnt;
1354		tbio->bi_size = r10_bio->sectors << 9;
1355		tbio->bi_idx = 0;
1356		tbio->bi_phys_segments = 0;
1357		tbio->bi_flags &= ~(BIO_POOL_MASK - 1);
1358		tbio->bi_flags |= 1 << BIO_UPTODATE;
1359		tbio->bi_next = NULL;
1360		tbio->bi_rw = WRITE;
1361		tbio->bi_private = r10_bio;
1362		tbio->bi_sector = r10_bio->devs[i].addr;
1363
1364		for (j=0; j < vcnt ; j++) {
1365			tbio->bi_io_vec[j].bv_offset = 0;
1366			tbio->bi_io_vec[j].bv_len = PAGE_SIZE;
1367
1368			memcpy(page_address(tbio->bi_io_vec[j].bv_page),
1369			       page_address(fbio->bi_io_vec[j].bv_page),
1370			       PAGE_SIZE);
1371		}
1372		tbio->bi_end_io = end_sync_write;
1373
1374		d = r10_bio->devs[i].devnum;
1375		atomic_inc(&conf->mirrors[d].rdev->nr_pending);
1376		atomic_inc(&r10_bio->remaining);
1377		md_sync_acct(conf->mirrors[d].rdev->bdev, tbio->bi_size >> 9);
1378
1379		tbio->bi_sector += conf->mirrors[d].rdev->data_offset;
1380		tbio->bi_bdev = conf->mirrors[d].rdev->bdev;
1381		generic_make_request(tbio);
1382	}
1383
1384done:
1385	if (atomic_dec_and_test(&r10_bio->remaining)) {
1386		md_done_sync(mddev, r10_bio->sectors, 1);
1387		put_buf(r10_bio);
1388	}
1389}
1390
1391/*
1392 * Now for the recovery code.
1393 * Recovery happens across physical sectors.
1394 * We recover all non-is_sync drives by finding the virtual address of
1395 * each, and then choose a working drive that also has that virt address.
1396 * There is a separate r10_bio for each non-in_sync drive.
1397 * Only the first two slots are in use. The first for reading,
1398 * The second for writing.
1399 *
1400 */
1401
1402static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio)
1403{
1404	conf_t *conf = mddev_to_conf(mddev);
1405	int i, d;
1406	struct bio *bio, *wbio;
1407
1408
1409	/* move the pages across to the second bio
1410	 * and submit the write request
1411	 */
1412	bio = r10_bio->devs[0].bio;
1413	wbio = r10_bio->devs[1].bio;
1414	for (i=0; i < wbio->bi_vcnt; i++) {
1415		struct page *p = bio->bi_io_vec[i].bv_page;
1416		bio->bi_io_vec[i].bv_page = wbio->bi_io_vec[i].bv_page;
1417		wbio->bi_io_vec[i].bv_page = p;
1418	}
1419	d = r10_bio->devs[1].devnum;
1420
1421	atomic_inc(&conf->mirrors[d].rdev->nr_pending);
1422	md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9);
1423	if (test_bit(R10BIO_Uptodate, &r10_bio->state))
1424		generic_make_request(wbio);
1425	else
1426		bio_endio(wbio, -EIO);
1427}
1428
1429
1430/*
1431 * This is a kernel thread which:
1432 *
1433 *	1.	Retries failed read operations on working mirrors.
 1434 *	2.	Updates the raid superblock when problems are encountered.
1435 *	3.	Performs writes following reads for array synchronising.
1436 */
1437
1438static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
1439{
1440	int sect = 0; /* Offset from r10_bio->sector */
1441	int sectors = r10_bio->sectors;
1442	mdk_rdev_t*rdev;
1443	while(sectors) {
1444		int s = sectors;
1445		int sl = r10_bio->read_slot;
1446		int success = 0;
1447		int start;
1448
1449		if (s > (PAGE_SIZE>>9))
1450			s = PAGE_SIZE >> 9;
1451
1452		rcu_read_lock();
1453		do {
1454			int d = r10_bio->devs[sl].devnum;
1455			rdev = rcu_dereference(conf->mirrors[d].rdev);
1456			if (rdev &&
1457			    test_bit(In_sync, &rdev->flags)) {
1458				atomic_inc(&rdev->nr_pending);
1459				rcu_read_unlock();
1460				success = sync_page_io(rdev->bdev,
1461						       r10_bio->devs[sl].addr +
1462						       sect + rdev->data_offset,
1463						       s<<9,
1464						       conf->tmppage, READ);
1465				rdev_dec_pending(rdev, mddev);
1466				rcu_read_lock();
1467				if (success)
1468					break;
1469			}
1470			sl++;
1471			if (sl == conf->copies)
1472				sl = 0;
1473		} while (!success && sl != r10_bio->read_slot);
1474		rcu_read_unlock();
1475
1476		if (!success) {
1477			/* Cannot read from anywhere -- bye bye array */
1478			int dn = r10_bio->devs[r10_bio->read_slot].devnum;
1479			md_error(mddev, conf->mirrors[dn].rdev);
1480			break;
1481		}
1482
1483		start = sl;
1484		/* write it back and re-read */
1485		rcu_read_lock();
1486		while (sl != r10_bio->read_slot) {
1487			int d;
1488			if (sl==0)
1489				sl = conf->copies;
1490			sl--;
1491			d = r10_bio->devs[sl].devnum;
1492			rdev = rcu_dereference(conf->mirrors[d].rdev);
1493			if (rdev &&
1494			    test_bit(In_sync, &rdev->flags)) {
1495				atomic_inc(&rdev->nr_pending);
1496				rcu_read_unlock();
1497				atomic_add(s, &rdev->corrected_errors);
1498				if (sync_page_io(rdev->bdev,
1499						 r10_bio->devs[sl].addr +
1500						 sect + rdev->data_offset,
1501						 s<<9, conf->tmppage, WRITE)
1502				    == 0)
1503					/* Well, this device is dead */
1504					md_error(mddev, rdev);
1505				rdev_dec_pending(rdev, mddev);
1506				rcu_read_lock();
1507			}
1508		}
1509		sl = start;
1510		while (sl != r10_bio->read_slot) {
1511			int d;
1512			if (sl==0)
1513				sl = conf->copies;
1514			sl--;
1515			d = r10_bio->devs[sl].devnum;
1516			rdev = rcu_dereference(conf->mirrors[d].rdev);
1517			if (rdev &&
1518			    test_bit(In_sync, &rdev->flags)) {
1519				char b[BDEVNAME_SIZE];
1520				atomic_inc(&rdev->nr_pending);
1521				rcu_read_unlock();
1522				if (sync_page_io(rdev->bdev,
1523						 r10_bio->devs[sl].addr +
1524						 sect + rdev->data_offset,
1525						 s<<9, conf->tmppage, READ) == 0)
1526					/* Well, this device is dead */
1527					md_error(mddev, rdev);
1528				else
1529					printk(KERN_INFO
1530					       "raid10:%s: read error corrected"
1531					       " (%d sectors at %llu on %s)\n",
1532					       mdname(mddev), s,
1533					       (unsigned long long)(sect+
1534					            rdev->data_offset),
1535					       bdevname(rdev->bdev, b));
1536
1537				rdev_dec_pending(rdev, mddev);
1538				rcu_read_lock();
1539			}
1540		}
1541		rcu_read_unlock();
1542
1543		sectors -= s;
1544		sect += s;
1545	}
1546}
1547
1548static void raid10d(mddev_t *mddev)
1549{
1550	r10bio_t *r10_bio;
1551	struct bio *bio;
1552	unsigned long flags;
1553	conf_t *conf = mddev_to_conf(mddev);
1554	struct list_head *head = &conf->retry_list;
1555	int unplug=0;
1556	mdk_rdev_t *rdev;
1557
1558	md_check_recovery(mddev);
1559
1560	for (;;) {
1561		char b[BDEVNAME_SIZE];
1562
1563		unplug += flush_pending_writes(conf);
1564
1565		spin_lock_irqsave(&conf->device_lock, flags);
1566		if (list_empty(head)) {
1567			spin_unlock_irqrestore(&conf->device_lock, flags);
1568			break;
1569		}
1570		r10_bio = list_entry(head->prev, r10bio_t, retry_list);
1571		list_del(head->prev);
1572		conf->nr_queued--;
1573		spin_unlock_irqrestore(&conf->device_lock, flags);
1574
1575		mddev = r10_bio->mddev;
1576		conf = mddev_to_conf(mddev);
1577		if (test_bit(R10BIO_IsSync, &r10_bio->state)) {
1578			sync_request_write(mddev, r10_bio);
1579			unplug = 1;
1580		} else 	if (test_bit(R10BIO_IsRecover, &r10_bio->state)) {
1581			recovery_request_write(mddev, r10_bio);
1582			unplug = 1;
1583		} else {
1584			int mirror;
1585			/* we got a read error. Maybe the drive is bad.  Maybe just
1586			 * the block and we can fix it.
1587			 * We freeze all other IO, and try reading the block from
1588			 * other devices.  When we find one, we re-write
1589			 * and check it that fixes the read error.
1590			 * This is all done synchronously while the array is
1591			 * frozen.
1592			 */
1593			if (mddev->ro == 0) {
1594				freeze_array(conf);
1595				fix_read_error(conf, mddev, r10_bio);
1596				unfreeze_array(conf);
1597			}
1598
1599			bio = r10_bio->devs[r10_bio->read_slot].bio;
1600			r10_bio->devs[r10_bio->read_slot].bio =
1601				mddev->ro ? IO_BLOCKED : NULL;
1602			mirror = read_balance(conf, r10_bio);
1603			if (mirror == -1) {
1604				printk(KERN_ALERT "raid10: %s: unrecoverable I/O"
1605				       " read error for block %llu\n",
1606				       bdevname(bio->bi_bdev,b),
1607				       (unsigned long long)r10_bio->sector);
1608				raid_end_bio_io(r10_bio);
1609				bio_put(bio);
1610			} else {
1611				const int do_sync = bio_sync(r10_bio->master_bio);
1612				bio_put(bio);
1613				rdev = conf->mirrors[mirror].rdev;
1614				if (printk_ratelimit())
1615					printk(KERN_ERR "raid10: %s: redirecting sector %llu to"
1616					       " another mirror\n",
1617					       bdevname(rdev->bdev,b),
1618					       (unsigned long long)r10_bio->sector);
1619				bio = bio_clone(r10_bio->master_bio, GFP_NOIO);
1620				r10_bio->devs[r10_bio->read_slot].bio = bio;
1621				bio->bi_sector = r10_bio->devs[r10_bio->read_slot].addr
1622					+ rdev->data_offset;
1623				bio->bi_bdev = rdev->bdev;
1624				bio->bi_rw = READ | do_sync;
1625				bio->bi_private = r10_bio;
1626				bio->bi_end_io = raid10_end_read_request;
1627				unplug = 1;
1628				generic_make_request(bio);
1629			}
1630		}
1631	}
1632	if (unplug)
1633		unplug_slaves(mddev);
1634}
1635
1636
1637static int init_resync(conf_t *conf)
1638{
1639	int buffs;
1640
1641	buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
1642	BUG_ON(conf->r10buf_pool);
1643	conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf);
1644	if (!conf->r10buf_pool)
1645		return -ENOMEM;
1646	conf->next_resync = 0;
1647	return 0;
1648}
1649
1650/*
1651 * perform a "sync" on one "block"
1652 *
1653 * We need to make sure that no normal I/O request - particularly write
1654 * requests - conflict with active sync requests.
1655 *
1656 * This is achieved by tracking pending requests and a 'barrier' concept
1657 * that can be installed to exclude normal IO requests.
1658 *
1659 * Resync and recovery are handled very differently.
1660 * We differentiate by looking at MD_RECOVERY_SYNC in mddev->recovery.
1661 *
1662 * For resync, we iterate over virtual addresses, read all copies,
1663 * and update if there are differences.  If only one copy is live,
1664 * skip it.
1665 * For recovery, we iterate over physical addresses, read a good
1666 * value for each non-in_sync drive, and over-write.
1667 *
1668 * So, for recovery we may have several outstanding complex requests for a
1669 * given address, one for each out-of-sync device.  We model this by allocating
1670 * a number of r10_bio structures, one for each out-of-sync device.
1671 * As we setup these structures, we collect all bio's together into a list
1672 * which we then process collectively to add pages, and then process again
1673 * to pass to generic_make_request.
1674 *
1675 * The r10_bio structures are linked using a borrowed master_bio pointer.
1676 * This link is counted in ->remaining.  When the r10_bio that points to NULL
1677 * has its remaining count decremented to 0, the whole complex operation
1678 * is complete.
1679 *
1680 */
1681
1682static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
1683{
1684	conf_t *conf = mddev_to_conf(mddev);
1685	r10bio_t *r10_bio;
1686	struct bio *biolist = NULL, *bio;
1687	sector_t max_sector, nr_sectors;
1688	int disk;
1689	int i;
1690	int max_sync;
1691	int sync_blocks;
1692
1693	sector_t sectors_skipped = 0;
1694	int chunks_skipped = 0;
1695
1696	if (!conf->r10buf_pool)
1697		if (init_resync(conf))
1698			return 0;
1699
1700 skipped:
1701	max_sector = mddev->dev_sectors;
1702	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
1703		max_sector = mddev->resync_max_sectors;
1704	if (sector_nr >= max_sector) {
1705		/* If we aborted, we need to abort the
1706		 * sync on the 'current' bitmap chucks (there can
1707		 * be several when recovering multiple devices).
1708		 * as we may have started syncing it but not finished.
1709		 * We can find the current address in
1710		 * mddev->curr_resync, but for recovery,
1711		 * we need to convert that to several
1712		 * virtual addresses.
1713		 */
1714		if (mddev->curr_resync < max_sector) { /* aborted */
1715			if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
1716				bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
1717						&sync_blocks, 1);
1718			else for (i=0; i<conf->raid_disks; i++) {
1719				sector_t sect =
1720					raid10_find_virt(conf, mddev->curr_resync, i);
1721				bitmap_end_sync(mddev->bitmap, sect,
1722						&sync_blocks, 1);
1723			}
1724		} else /* completed sync */
1725			conf->fullsync = 0;
1726
1727		bitmap_close_sync(mddev->bitmap);
1728		close_sync(conf);
1729		*skipped = 1;
1730		return sectors_skipped;
1731	}
1732	if (chunks_skipped >= conf->raid_disks) {
1733		/* if there has been nothing to do on any drive,
1734		 * then there is nothing to do at all..
1735		 */
1736		*skipped = 1;
1737		return (max_sector - sector_nr) + sectors_skipped;
1738	}
1739
1740	if (max_sector > mddev->resync_max)
1741		max_sector = mddev->resync_max; /* Don't do IO beyond here */
1742
1743	/* make sure whole request will fit in a chunk - if chunks
1744	 * are meaningful
1745	 */
1746	if (conf->near_copies < conf->raid_disks &&
1747	    max_sector > (sector_nr | conf->chunk_mask))
1748		max_sector = (sector_nr | conf->chunk_mask) + 1;
1749	/*
1750	 * If there is non-resync activity waiting for us then
1751	 * put in a delay to throttle resync.
1752	 */
1753	if (!go_faster && conf->nr_waiting)
1754		msleep_interruptible(1000);
1755
1756	/* Again, very different code for resync and recovery.
1757	 * Both must result in an r10bio with a list of bios that
1758	 * have bi_end_io, bi_sector, bi_bdev set,
1759	 * and bi_private set to the r10bio.
1760	 * For recovery, we may actually create several r10bios
1761	 * with 2 bios in each, that correspond to the bios in the main one.
1762	 * In this case, the subordinate r10bios link back through a
1763	 * borrowed master_bio pointer, and the counter in the master
1764	 * includes a ref from each subordinate.
1765	 */
1766	/* First, we decide what to do and set ->bi_end_io
1767	 * To end_sync_read if we want to read, and
1768	 * end_sync_write if we will want to write.
1769	 */
1770
1771	max_sync = RESYNC_PAGES << (PAGE_SHIFT-9);
1772	if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
1773		/* recovery... the complicated one */
1774		int i, j, k;
1775		r10_bio = NULL;
1776
1777		for (i=0 ; i<conf->raid_disks; i++)
1778			if (conf->mirrors[i].rdev &&
1779			    !test_bit(In_sync, &conf->mirrors[i].rdev->flags)) {
1780				int still_degraded = 0;
1781				/* want to reconstruct this device */
1782				r10bio_t *rb2 = r10_bio;
1783				sector_t sect = raid10_find_virt(conf, sector_nr, i);
1784				int must_sync;
1785				/* Unless we are doing a full sync, we only need
1786				 * to recover the block if it is set in the bitmap
1787				 */
1788				must_sync = bitmap_start_sync(mddev->bitmap, sect,
1789							      &sync_blocks, 1);
1790				if (sync_blocks < max_sync)
1791					max_sync = sync_blocks;
1792				if (!must_sync &&
1793				    !conf->fullsync) {
1794					/* yep, skip the sync_blocks here, but don't assume
1795					 * that there will never be anything to do here
1796					 */
1797					chunks_skipped = -1;
1798					continue;
1799				}
1800
1801				r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
1802				raise_barrier(conf, rb2 != NULL);
1803				atomic_set(&r10_bio->remaining, 0);
1804
1805				r10_bio->master_bio = (struct bio*)rb2;
1806				if (rb2)
1807					atomic_inc(&rb2->remaining);
1808				r10_bio->mddev = mddev;
1809				set_bit(R10BIO_IsRecover, &r10_bio->state);
1810				r10_bio->sector = sect;
1811
1812				raid10_find_phys(conf, r10_bio);
1813				/* Need to check if this section will still be
1814				 * degraded
1815				 */
1816				for (j=0; j<conf->copies;j++) {
1817					int d = r10_bio->devs[j].devnum;
1818					if (conf->mirrors[d].rdev == NULL ||
1819					    test_bit(Faulty, &conf->mirrors[d].rdev->flags)) {
1820						still_degraded = 1;
1821						break;
1822					}
1823				}
1824				must_sync = bitmap_start_sync(mddev->bitmap, sect,
1825							      &sync_blocks, still_degraded);
1826
1827				for (j=0; j<conf->copies;j++) {
1828					int d = r10_bio->devs[j].devnum;
1829					if (conf->mirrors[d].rdev &&
1830					    test_bit(In_sync, &conf->mirrors[d].rdev->flags)) {
1831						/* This is where we read from */
1832						bio = r10_bio->devs[0].bio;
1833						bio->bi_next = biolist;
1834						biolist = bio;
1835						bio->bi_private = r10_bio;
1836						bio->bi_end_io = end_sync_read;
1837						bio->bi_rw = READ;
1838						bio->bi_sector = r10_bio->devs[j].addr +
1839							conf->mirrors[d].rdev->data_offset;
1840						bio->bi_bdev = conf->mirrors[d].rdev->bdev;
1841						atomic_inc(&conf->mirrors[d].rdev->nr_pending);
1842						atomic_inc(&r10_bio->remaining);
1843						/* and we write to 'i' */
1844
1845						for (k=0; k<conf->copies; k++)
1846							if (r10_bio->devs[k].devnum == i)
1847								break;
1848						BUG_ON(k == conf->copies);
1849						bio = r10_bio->devs[1].bio;
1850						bio->bi_next = biolist;
1851						biolist = bio;
1852						bio->bi_private = r10_bio;
1853						bio->bi_end_io = end_sync_write;
1854						bio->bi_rw = WRITE;
1855						bio->bi_sector = r10_bio->devs[k].addr +
1856							conf->mirrors[i].rdev->data_offset;
1857						bio->bi_bdev = conf->mirrors[i].rdev->bdev;
1858
1859						r10_bio->devs[0].devnum = d;
1860						r10_bio->devs[1].devnum = i;
1861
1862						break;
1863					}
1864				}
1865				if (j == conf->copies) {
1866					/* Cannot recover, so abort the recovery */
1867					put_buf(r10_bio);
1868					if (rb2)
1869						atomic_dec(&rb2->remaining);
1870					r10_bio = rb2;
1871					if (!test_and_set_bit(MD_RECOVERY_INTR,
1872							      &mddev->recovery))
1873						printk(KERN_INFO "raid10: %s: insufficient working devices for recovery.\n",
1874						       mdname(mddev));
1875					break;
1876				}
1877			}
1878		if (biolist == NULL) {
1879			while (r10_bio) {
1880				r10bio_t *rb2 = r10_bio;
1881				r10_bio = (r10bio_t*) rb2->master_bio;
1882				rb2->master_bio = NULL;
1883				put_buf(rb2);
1884			}
1885			goto giveup;
1886		}
1887	} else {
1888		/* resync. Schedule a read for every block at this virt offset */
1889		int count = 0;
1890
1891		bitmap_cond_end_sync(mddev->bitmap, sector_nr);
1892
1893		if (!bitmap_start_sync(mddev->bitmap, sector_nr,
1894				       &sync_blocks, mddev->degraded) &&
1895		    !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
1896			/* We can skip this block */
1897			*skipped = 1;
1898			return sync_blocks + sectors_skipped;
1899		}
1900		if (sync_blocks < max_sync)
1901			max_sync = sync_blocks;
1902		r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
1903
1904		r10_bio->mddev = mddev;
1905		atomic_set(&r10_bio->remaining, 0);
1906		raise_barrier(conf, 0);
1907		conf->next_resync = sector_nr;
1908
1909		r10_bio->master_bio = NULL;
1910		r10_bio->sector = sector_nr;
1911		set_bit(R10BIO_IsSync, &r10_bio->state);
1912		raid10_find_phys(conf, r10_bio);
1913		r10_bio->sectors = (sector_nr | conf->chunk_mask) - sector_nr +1;
1914
1915		for (i=0; i<conf->copies; i++) {
1916			int d = r10_bio->devs[i].devnum;
1917			bio = r10_bio->devs[i].bio;
1918			bio->bi_end_io = NULL;
1919			clear_bit(BIO_UPTODATE, &bio->bi_flags);
1920			if (conf->mirrors[d].rdev == NULL ||
1921			    test_bit(Faulty, &conf->mirrors[d].rdev->flags))
1922				continue;
1923			atomic_inc(&conf->mirrors[d].rdev->nr_pending);
1924			atomic_inc(&r10_bio->remaining);
1925			bio->bi_next = biolist;
1926			biolist = bio;
1927			bio->bi_private = r10_bio;
1928			bio->bi_end_io = end_sync_read;
1929			bio->bi_rw = READ;
1930			bio->bi_sector = r10_bio->devs[i].addr +
1931				conf->mirrors[d].rdev->data_offset;
1932			bio->bi_bdev = conf->mirrors[d].rdev->bdev;
1933			count++;
1934		}
1935
1936		if (count < 2) {
1937			for (i=0; i<conf->copies; i++) {
1938				int d = r10_bio->devs[i].devnum;
1939				if (r10_bio->devs[i].bio->bi_end_io)
1940					rdev_dec_pending(conf->mirrors[d].rdev, mddev);
1941			}
1942			put_buf(r10_bio);
1943			biolist = NULL;
1944			goto giveup;
1945		}
1946	}
1947
1948	for (bio = biolist; bio ; bio=bio->bi_next) {
1949
1950		bio->bi_flags &= ~(BIO_POOL_MASK - 1);
1951		if (bio->bi_end_io)
1952			bio->bi_flags |= 1 << BIO_UPTODATE;
1953		bio->bi_vcnt = 0;
1954		bio->bi_idx = 0;
1955		bio->bi_phys_segments = 0;
1956		bio->bi_size = 0;
1957	}
1958
1959	nr_sectors = 0;
1960	if (sector_nr + max_sync < max_sector)
1961		max_sector = sector_nr + max_sync;
1962	do {
1963		struct page *page;
1964		int len = PAGE_SIZE;
1965		disk = 0;
1966		if (sector_nr + (len>>9) > max_sector)
1967			len = (max_sector - sector_nr) << 9;
1968		if (len == 0)
1969			break;
1970		for (bio= biolist ; bio ; bio=bio->bi_next) {
1971			page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
1972			if (bio_add_page(bio, page, len, 0) == 0) {
1973				/* stop here */
1974				struct bio *bio2;
1975				bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
1976				for (bio2 = biolist; bio2 && bio2 != bio; bio2 = bio2->bi_next) {
1977					/* remove last page from this bio */
1978					bio2->bi_vcnt--;
1979					bio2->bi_size -= len;
1980					bio2->bi_flags &= ~(1<< BIO_SEG_VALID);
1981				}
1982				goto bio_full;
1983			}
1984			disk = i;
1985		}
1986		nr_sectors += len>>9;
1987		sector_nr += len>>9;
1988	} while (biolist->bi_vcnt < RESYNC_PAGES);
1989 bio_full:
1990	r10_bio->sectors = nr_sectors;
1991
1992	while (biolist) {
1993		bio = biolist;
1994		biolist = biolist->bi_next;
1995
1996		bio->bi_next = NULL;
1997		r10_bio = bio->bi_private;
1998		r10_bio->sectors = nr_sectors;
1999
2000		if (bio->bi_end_io == end_sync_read) {
2001			md_sync_acct(bio->bi_bdev, nr_sectors);
2002			generic_make_request(bio);
2003		}
2004	}
2005
2006	if (sectors_skipped)
2007		/* pretend they weren't skipped, it makes
2008		 * no important difference in this case
2009		 */
2010		md_done_sync(mddev, sectors_skipped, 1);
2011
2012	return sectors_skipped + nr_sectors;
2013 giveup:
2014	/* There is nowhere to write, so all non-sync
2015	 * drives must be failed, so try the next chunk...
2016	 */
2017	if (sector_nr + max_sync < max_sector)
2018		max_sector = sector_nr + max_sync;
2019
2020	sectors_skipped += (max_sector - sector_nr);
2021	chunks_skipped ++;
2022	sector_nr = max_sector;
2023	goto skipped;
2024}
2025
2026static sector_t
2027raid10_size(mddev_t *mddev, sector_t sectors, int raid_disks)
2028{
2029	sector_t size;
2030	conf_t *conf = mddev_to_conf(mddev);
2031
2032	if (!raid_disks)
2033		raid_disks = mddev->raid_disks;
2034	if (!sectors)
2035		sectors = mddev->dev_sectors;
2036
2037	size = sectors >> conf->chunk_shift;
2038	sector_div(size, conf->far_copies);
2039	size = size * raid_disks;
2040	sector_div(size, conf->near_copies);
2041
2042	return size << conf->chunk_shift;
2043}
2044
2045static int run(mddev_t *mddev)
2046{
2047	conf_t *conf;
2048	int i, disk_idx;
2049	mirror_info_t *disk;
2050	mdk_rdev_t *rdev;
2051	int nc, fc, fo;
2052	sector_t stride, size;
2053
2054	if (mddev->chunk_size < PAGE_SIZE) {
2055		printk(KERN_ERR "md/raid10: chunk size must be "
2056		       "at least PAGE_SIZE(%ld).\n", PAGE_SIZE);
2057		return -EINVAL;
2058	}
2059
2060	nc = mddev->layout & 255;
2061	fc = (mddev->layout >> 8) & 255;
2062	fo = mddev->layout & (1<<16);
2063	if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks ||
2064	    (mddev->layout >> 17)) {
2065		printk(KERN_ERR "raid10: %s: unsupported raid10 layout: 0x%8x\n",
2066		       mdname(mddev), mddev->layout);
2067		goto out;
2068	}
2069	/*
2070	 * copy the already verified devices into our private RAID10
2071	 * bookkeeping area. [whatever we allocate in run(),
2072	 * should be freed in stop()]
2073	 */
2074	conf = kzalloc(sizeof(conf_t), GFP_KERNEL);
2075	mddev->private = conf;
2076	if (!conf) {
2077		printk(KERN_ERR "raid10: couldn't allocate memory for %s\n",
2078			mdname(mddev));
2079		goto out;
2080	}
2081	conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks,
2082				 GFP_KERNEL);
2083	if (!conf->mirrors) {
2084		printk(KERN_ERR "raid10: couldn't allocate memory for %s\n",
2085		       mdname(mddev));
2086		goto out_free_conf;
2087	}
2088
2089	conf->tmppage = alloc_page(GFP_KERNEL);
2090	if (!conf->tmppage)
2091		goto out_free_conf;
2092
2093	conf->mddev = mddev;
2094	conf->raid_disks = mddev->raid_disks;
2095	conf->near_copies = nc;
2096	conf->far_copies = fc;
2097	conf->copies = nc*fc;
2098	conf->far_offset = fo;
2099	conf->chunk_mask = (sector_t)(mddev->chunk_size>>9)-1;
2100	conf->chunk_shift = ffz(~mddev->chunk_size) - 9;
2101	size = mddev->dev_sectors >> conf->chunk_shift;
2102	sector_div(size, fc);
2103	size = size * conf->raid_disks;
2104	sector_div(size, nc);
2105	/* 'size' is now the number of chunks in the array */
2106	/* calculate "used chunks per device" in 'stride' */
2107	stride = size * conf->copies;
2108
2109	/* We need to round up when dividing by raid_disks to
2110	 * get the stride size.
2111	 */
2112	stride += conf->raid_disks - 1;
2113	sector_div(stride, conf->raid_disks);
2114	mddev->dev_sectors = stride << conf->chunk_shift;
2115
2116	if (fo)
2117		stride = 1;
2118	else
2119		sector_div(stride, fc);
2120	conf->stride = stride << conf->chunk_shift;
2121
2122	conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc,
2123						r10bio_pool_free, conf);
2124	if (!conf->r10bio_pool) {
2125		printk(KERN_ERR "raid10: couldn't allocate memory for %s\n",
2126			mdname(mddev));
2127		goto out_free_conf;
2128	}
2129
2130	spin_lock_init(&conf->device_lock);
2131	mddev->queue->queue_lock = &conf->device_lock;
2132
2133	list_for_each_entry(rdev, &mddev->disks, same_set) {
2134		disk_idx = rdev->raid_disk;
2135		if (disk_idx >= mddev->raid_disks
2136		    || disk_idx < 0)
2137			continue;
2138		disk = conf->mirrors + disk_idx;
2139
2140		disk->rdev = rdev;
2141
2142		blk_queue_stack_limits(mddev->queue,
2143				       rdev->bdev->bd_disk->queue);
2144		/* as we don't honour merge_bvec_fn, we must never risk
2145		 * violating it, so limit ->max_sector to one PAGE, as
2146		 * a one page request is never in violation.
2147		 */
2148		if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
2149		    mddev->queue->max_sectors > (PAGE_SIZE>>9))
2150			mddev->queue->max_sectors = (PAGE_SIZE>>9);
2151
2152		disk->head_position = 0;
2153	}
2154	INIT_LIST_HEAD(&conf->retry_list);
2155
2156	spin_lock_init(&conf->resync_lock);
2157	init_waitqueue_head(&conf->wait_barrier);
2158
2159	/* need to check that every block has at least one working mirror */
2160	if (!enough(conf)) {
2161		printk(KERN_ERR "raid10: not enough operational mirrors for %s\n",
2162		       mdname(mddev));
2163		goto out_free_conf;
2164	}
2165
2166	mddev->degraded = 0;
2167	for (i = 0; i < conf->raid_disks; i++) {
2168
2169		disk = conf->mirrors + i;
2170
2171		if (!disk->rdev ||
2172		    !test_bit(In_sync, &disk->rdev->flags)) {
2173			disk->head_position = 0;
2174			mddev->degraded++;
2175			if (disk->rdev)
2176				conf->fullsync = 1;
2177		}
2178	}
2179
2180
2181	mddev->thread = md_register_thread(raid10d, mddev, "%s_raid10");
2182	if (!mddev->thread) {
2183		printk(KERN_ERR
2184		       "raid10: couldn't allocate thread for %s\n",
2185		       mdname(mddev));
2186		goto out_free_conf;
2187	}
2188
2189	printk(KERN_INFO
2190		"raid10: raid set %s active with %d out of %d devices\n",
2191		mdname(mddev), mddev->raid_disks - mddev->degraded,
2192		mddev->raid_disks);
2193	/*
2194	 * Ok, everything is just fine now
2195	 */
2196	md_set_array_sectors(mddev, raid10_size(mddev, 0, 0));
2197	mddev->resync_max_sectors = raid10_size(mddev, 0, 0);
2198
2199	mddev->queue->unplug_fn = raid10_unplug;
2200	mddev->queue->backing_dev_info.congested_fn = raid10_congested;
2201	mddev->queue->backing_dev_info.congested_data = mddev;
2202
2203	/* Calculate max read-ahead size.
2204	 * We need to readahead at least twice a whole stripe....
2205	 * maybe...
2206	 */
2207	{
2208		int stripe = conf->raid_disks * (mddev->chunk_size / PAGE_SIZE);
2209		stripe /= conf->near_copies;
2210		if (mddev->queue->backing_dev_info.ra_pages < 2* stripe)
2211			mddev->queue->backing_dev_info.ra_pages = 2* stripe;
2212	}
2213
2214	if (conf->near_copies < mddev->raid_disks)
2215		blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
2216	return 0;
2217
2218out_free_conf:
2219	if (conf->r10bio_pool)
2220		mempool_destroy(conf->r10bio_pool);
2221	safe_put_page(conf->tmppage);
2222	kfree(conf->mirrors);
2223	kfree(conf);
2224	mddev->private = NULL;
2225out:
2226	return -EIO;
2227}
2228
2229static int stop(mddev_t *mddev)
2230{
2231	conf_t *conf = mddev_to_conf(mddev);
2232
2233	raise_barrier(conf, 0);
2234	lower_barrier(conf);
2235
2236	md_unregister_thread(mddev->thread);
2237	mddev->thread = NULL;
2238	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
2239	if (conf->r10bio_pool)
2240		mempool_destroy(conf->r10bio_pool);
2241	kfree(conf->mirrors);
2242	kfree(conf);
2243	mddev->private = NULL;
2244	return 0;
2245}
2246
2247static void raid10_quiesce(mddev_t *mddev, int state)
2248{
2249	conf_t *conf = mddev_to_conf(mddev);
2250
2251	switch(state) {
2252	case 1:
2253		raise_barrier(conf, 0);
2254		break;
2255	case 0:
2256		lower_barrier(conf);
2257		break;
2258	}
2259	if (mddev->thread) {
2260		if (mddev->bitmap)
2261			mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
2262		else
2263			mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
2264		md_wakeup_thread(mddev->thread);
2265	}
2266}
2267
2268static struct mdk_personality raid10_personality =
2269{
2270	.name		= "raid10",
2271	.level		= 10,
2272	.owner		= THIS_MODULE,
2273	.make_request	= make_request,
2274	.run		= run,
2275	.stop		= stop,
2276	.status		= status,
2277	.error_handler	= error,
2278	.hot_add_disk	= raid10_add_disk,
2279	.hot_remove_disk= raid10_remove_disk,
2280	.spare_active	= raid10_spare_active,
2281	.sync_request	= sync_request,
2282	.quiesce	= raid10_quiesce,
2283	.size		= raid10_size,
2284};
2285
2286static int __init raid_init(void)
2287{
2288	return register_md_personality(&raid10_personality);
2289}
2290
/*
 * Module exit point: withdraw the raid10 personality that raid_init()
 * registered with the md core.
 */
static void raid_exit(void)
{
	unregister_md_personality(&raid10_personality);
}
2295
/* Hook raid_init()/raid_exit() up as the module's load and unload handlers. */
module_init(raid_init);
module_exit(raid_exit);
MODULE_LICENSE("GPL");
/*
 * Aliases under which this module can be requested.
 * NOTE(review): "md-personality-9" looks like a legacy numeric personality
 * id for RAID10 - confirm against the md core before relying on it.
 */
MODULE_ALIAS("md-personality-9"); /* RAID10 */
MODULE_ALIAS("md-raid10");
MODULE_ALIAS("md-level-10");