raid10.c from the Linux kernel mirror (for testing), git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git, at v2.6.15-rc2
/*
 * raid10.c : Multiple Devices driver for Linux
 *
 * Copyright (C) 2000-2004 Neil Brown
 *
 * RAID-10 support for md.
 *
 * Based on code in raid1.c.  See raid1.c for further copyright information.
 *
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

#include <linux/raid/raid10.h>

/*
 * RAID10 provides a combination of RAID0 and RAID1 functionality.
 * The layout of data is defined by
 *    chunk_size
 *    raid_disks
 *    near_copies (stored in low byte of layout)
 *    far_copies (stored in second byte of layout)
 *
 * The data to be stored is divided into chunks using chunksize.
 * Each device is divided into far_copies sections.
 * In each section, chunks are laid out in a style similar to raid0, but
 * near_copies copies of each chunk are stored (each on a different drive).
 * The starting device for each section is offset near_copies from the starting
 * device of the previous section.
 * Thus there are (near_copies*far_copies) copies of each chunk, and each is on
 * a different drive.
 * near_copies and far_copies must be at least one, and their product is at most
 * raid_disks.
 */
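/*
 * An illustrative, user-space sketch of the mapping just described (an
 * editorial example, not driver code): it prints every (device, sector)
 * pair holding one virtual sector.  The parameter names and the fixed
 * stride value below are assumptions chosen for the example.
 */
#if 0
#include <stdio.h>

static void find_copies(long long vsect, int raid_disks, int nc, int fc,
			int chunk, long long stride)
{
	long long c = vsect / chunk;		/* virtual chunk number   */
	long long off = vsect % chunk;		/* offset within a chunk  */
	long long stripe;
	int dev, n, f;

	c *= nc;				/* expand for near copies */
	stripe = c / raid_disks;
	dev = c % raid_disks;
	off += stripe * chunk;

	for (n = 0; n < nc; n++) {
		int d = dev;
		long long s = off;
		for (f = 0; f < fc; f++) {
			printf("copy on dev %d at sector %lld\n", d, s);
			d = (d + nc) % raid_disks;	/* next far section */
			s += stride;
		}
		if (++dev == raid_disks) {
			dev = 0;
			off += chunk;
		}
	}
}

int main(void)
{
	/* 4 disks, 2 near and 2 far copies, 64-sector chunks: sector 100
	 * lands on all four drives, at two distinct device offsets. */
	find_copies(100, 4, 2, 2, 64, 1024);
	return 0;
}
#endif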
/*
 * Number of guaranteed r10bios in case of extreme VM load:
 */
#define NR_RAID10_BIOS 256

static void unplug_slaves(mddev_t *mddev);

static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
{
	conf_t *conf = data;
	r10bio_t *r10_bio;
	int size = offsetof(struct r10bio_s, devs[conf->copies]);

	/* allocate a r10bio with room for raid_disks entries in the bios array */
	r10_bio = kmalloc(size, gfp_flags);
	if (r10_bio)
		memset(r10_bio, 0, size);
	else
		unplug_slaves(conf->mddev);

	return r10_bio;
}

static void r10bio_pool_free(void *r10_bio, void *data)
{
	kfree(r10_bio);
}

#define RESYNC_BLOCK_SIZE (64*1024)
//#define RESYNC_BLOCK_SIZE PAGE_SIZE
#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
#define RESYNC_WINDOW (2048*1024)

/*
 * When performing a resync, we need to read and compare, so
 * we need as many pages as there are copies.
 * When performing a recovery, we need 2 bios, one for read,
 * one for write (we recover only one drive per r10buf)
 *
 */
static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
{
	conf_t *conf = data;
	struct page *page;
	r10bio_t *r10_bio;
	struct bio *bio;
	int i, j;
	int nalloc;

	r10_bio = r10bio_pool_alloc(gfp_flags, conf);
	if (!r10_bio) {
		unplug_slaves(conf->mddev);
		return NULL;
	}

	if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery))
		nalloc = conf->copies; /* resync */
	else
		nalloc = 2; /* recovery */

	/*
	 * Allocate bios.
	 */
	for (j = nalloc ; j-- ; ) {
		bio = bio_alloc(gfp_flags, RESYNC_PAGES);
		if (!bio)
			goto out_free_bio;
		r10_bio->devs[j].bio = bio;
	}
	/*
	 * Allocate RESYNC_PAGES data pages and attach them
	 * where needed.
	 */
	for (j = 0 ; j < nalloc; j++) {
		bio = r10_bio->devs[j].bio;
		for (i = 0; i < RESYNC_PAGES; i++) {
			page = alloc_page(gfp_flags);
			if (unlikely(!page))
				goto out_free_pages;

			bio->bi_io_vec[i].bv_page = page;
		}
	}

	return r10_bio;

out_free_pages:
	for ( ; i > 0 ; i--)
		__free_page(bio->bi_io_vec[i-1].bv_page);
	while (j--)
		for (i = 0; i < RESYNC_PAGES ; i++)
			__free_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page);
	j = -1;
out_free_bio:
	while ( ++j < nalloc )
		bio_put(r10_bio->devs[j].bio);
	r10bio_pool_free(r10_bio, conf);
	return NULL;
}

static void r10buf_pool_free(void *__r10_bio, void *data)
{
	int i;
	conf_t *conf = data;
	r10bio_t *r10bio = __r10_bio;
	int j;

	for (j=0; j < conf->copies; j++) {
		struct bio *bio = r10bio->devs[j].bio;
		if (bio) {
			for (i = 0; i < RESYNC_PAGES; i++) {
				__free_page(bio->bi_io_vec[i].bv_page);
				bio->bi_io_vec[i].bv_page = NULL;
			}
			bio_put(bio);
		}
	}
	r10bio_pool_free(r10bio, conf);
}

static void put_all_bios(conf_t *conf, r10bio_t *r10_bio)
{
	int i;

	for (i = 0; i < conf->copies; i++) {
		struct bio **bio = & r10_bio->devs[i].bio;
		if (*bio)
			bio_put(*bio);
		*bio = NULL;
	}
}

static inline void free_r10bio(r10bio_t *r10_bio)
{
	unsigned long flags;

	conf_t *conf = mddev_to_conf(r10_bio->mddev);

	/*
	 * Wake up any possible resync thread that waits for the device
	 * to go idle.
	 */
	spin_lock_irqsave(&conf->resync_lock, flags);
	if (!--conf->nr_pending) {
		wake_up(&conf->wait_idle);
		wake_up(&conf->wait_resume);
	}
	spin_unlock_irqrestore(&conf->resync_lock, flags);

	put_all_bios(conf, r10_bio);
	mempool_free(r10_bio, conf->r10bio_pool);
}

static inline void put_buf(r10bio_t *r10_bio)
{
	conf_t *conf = mddev_to_conf(r10_bio->mddev);
	unsigned long flags;

	mempool_free(r10_bio, conf->r10buf_pool);

	spin_lock_irqsave(&conf->resync_lock, flags);
	if (!conf->barrier)
		BUG();
	--conf->barrier;
	wake_up(&conf->wait_resume);
	wake_up(&conf->wait_idle);

	if (!--conf->nr_pending) {
		wake_up(&conf->wait_idle);
		wake_up(&conf->wait_resume);
	}
	spin_unlock_irqrestore(&conf->resync_lock, flags);
}

static void reschedule_retry(r10bio_t *r10_bio)
{
	unsigned long flags;
	mddev_t *mddev = r10_bio->mddev;
	conf_t *conf = mddev_to_conf(mddev);

	spin_lock_irqsave(&conf->device_lock, flags);
	list_add(&r10_bio->retry_list, &conf->retry_list);
	spin_unlock_irqrestore(&conf->device_lock, flags);

	md_wakeup_thread(mddev->thread);
}

/*
 * raid_end_bio_io() is called when we have finished servicing a mirrored
 * operation and are ready to return a success/failure code to the buffer
 * cache layer.
 */
static void raid_end_bio_io(r10bio_t *r10_bio)
{
	struct bio *bio = r10_bio->master_bio;

	bio_endio(bio, bio->bi_size,
		test_bit(R10BIO_Uptodate, &r10_bio->state) ? 0 : -EIO);
	free_r10bio(r10_bio);
}

/*
 * Update disk head position estimator based on IRQ completion info.
 */
static inline void update_head_pos(int slot, r10bio_t *r10_bio)
{
	conf_t *conf = mddev_to_conf(r10_bio->mddev);

	conf->mirrors[r10_bio->devs[slot].devnum].head_position =
		r10_bio->devs[slot].addr + (r10_bio->sectors);
}

static int raid10_end_read_request(struct bio *bio, unsigned int bytes_done, int error)
{
	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private);
	int slot, dev;
	conf_t *conf = mddev_to_conf(r10_bio->mddev);

	if (bio->bi_size)
		return 1;

	slot = r10_bio->read_slot;
	dev = r10_bio->devs[slot].devnum;
	/*
	 * this branch is our 'one mirror IO has finished' event handler:
	 */
	if (!uptodate)
		md_error(r10_bio->mddev, conf->mirrors[dev].rdev);
	else
		/*
		 * Set R10BIO_Uptodate in our master bio, so that
		 * we will return a good error code to the higher
		 * levels even if IO on some other mirrored buffer fails.
		 *
		 * The 'master' represents the composite IO operation to
		 * user-side. So if something waits for IO, then it will
		 * wait for the 'master' bio.
		 */
		set_bit(R10BIO_Uptodate, &r10_bio->state);

	update_head_pos(slot, r10_bio);

	/*
	 * we have only one bio on the read side
	 */
	if (uptodate)
		raid_end_bio_io(r10_bio);
	else {
		/*
		 * oops, read error:
		 */
		char b[BDEVNAME_SIZE];
		if (printk_ratelimit())
			printk(KERN_ERR "raid10: %s: rescheduling sector %llu\n",
			       bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector);
		reschedule_retry(r10_bio);
	}

	rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
	return 0;
}

static int raid10_end_write_request(struct bio *bio, unsigned int bytes_done, int error)
{
	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private);
	int slot, dev;
	conf_t *conf = mddev_to_conf(r10_bio->mddev);

	if (bio->bi_size)
		return 1;

	for (slot = 0; slot < conf->copies; slot++)
		if (r10_bio->devs[slot].bio == bio)
			break;
	dev = r10_bio->devs[slot].devnum;

	/*
	 * this branch is our 'one mirror IO has finished' event handler:
	 */
	if (!uptodate)
		md_error(r10_bio->mddev, conf->mirrors[dev].rdev);
	else
		/*
		 * Set R10BIO_Uptodate in our master bio, so that
		 * we will return a good error code to the higher
		 * levels even if IO on some other mirrored buffer fails.
		 *
		 * The 'master' represents the composite IO operation to
		 * user-side. So if something waits for IO, then it will
		 * wait for the 'master' bio.
		 */
		set_bit(R10BIO_Uptodate, &r10_bio->state);

	update_head_pos(slot, r10_bio);

	/*
	 * Let's see if all mirrored write operations have finished
	 * already.
	 */
	if (atomic_dec_and_test(&r10_bio->remaining)) {
		md_write_end(r10_bio->mddev);
		raid_end_bio_io(r10_bio);
	}

	rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
	return 0;
}


/*
 * RAID10 layout manager
 * As well as the chunksize and raid_disks count, there are two
 * parameters: near_copies and far_copies.
 * near_copies * far_copies must be <= raid_disks.
 * Normally one of these will be 1.
 * If both are 1, we get raid0.
 * If near_copies == raid_disks, we get raid1.
364 * 365 * Chunks are layed out in raid0 style with near_copies copies of the 366 * first chunk, followed by near_copies copies of the next chunk and 367 * so on. 368 * If far_copies > 1, then after 1/far_copies of the array has been assigned 369 * as described above, we start again with a device offset of near_copies. 370 * So we effectively have another copy of the whole array further down all 371 * the drives, but with blocks on different drives. 372 * With this layout, and block is never stored twice on the one device. 373 * 374 * raid10_find_phys finds the sector offset of a given virtual sector 375 * on each device that it is on. If a block isn't on a device, 376 * that entry in the array is set to MaxSector. 377 * 378 * raid10_find_virt does the reverse mapping, from a device and a 379 * sector offset to a virtual address 380 */ 381 382static void raid10_find_phys(conf_t *conf, r10bio_t *r10bio) 383{ 384 int n,f; 385 sector_t sector; 386 sector_t chunk; 387 sector_t stripe; 388 int dev; 389 390 int slot = 0; 391 392 /* now calculate first sector/dev */ 393 chunk = r10bio->sector >> conf->chunk_shift; 394 sector = r10bio->sector & conf->chunk_mask; 395 396 chunk *= conf->near_copies; 397 stripe = chunk; 398 dev = sector_div(stripe, conf->raid_disks); 399 400 sector += stripe << conf->chunk_shift; 401 402 /* and calculate all the others */ 403 for (n=0; n < conf->near_copies; n++) { 404 int d = dev; 405 sector_t s = sector; 406 r10bio->devs[slot].addr = sector; 407 r10bio->devs[slot].devnum = d; 408 slot++; 409 410 for (f = 1; f < conf->far_copies; f++) { 411 d += conf->near_copies; 412 if (d >= conf->raid_disks) 413 d -= conf->raid_disks; 414 s += conf->stride; 415 r10bio->devs[slot].devnum = d; 416 r10bio->devs[slot].addr = s; 417 slot++; 418 } 419 dev++; 420 if (dev >= conf->raid_disks) { 421 dev = 0; 422 sector += (conf->chunk_mask + 1); 423 } 424 } 425 BUG_ON(slot != conf->copies); 426} 427 428static sector_t raid10_find_virt(conf_t *conf, sector_t sector, int dev) 429{ 430 sector_t offset, chunk, vchunk; 431 432 while (sector > conf->stride) { 433 sector -= conf->stride; 434 if (dev < conf->near_copies) 435 dev += conf->raid_disks - conf->near_copies; 436 else 437 dev -= conf->near_copies; 438 } 439 440 offset = sector & conf->chunk_mask; 441 chunk = sector >> conf->chunk_shift; 442 vchunk = chunk * conf->raid_disks + dev; 443 sector_div(vchunk, conf->near_copies); 444 return (vchunk << conf->chunk_shift) + offset; 445} 446 447/** 448 * raid10_mergeable_bvec -- tell bio layer if a two requests can be merged 449 * @q: request queue 450 * @bio: the buffer head that's been built up so far 451 * @biovec: the request that could be merged to it. 452 * 453 * Return amount of bytes we can accept at this offset 454 * If near_copies == raid_disk, there are no striping issues, 455 * but in that case, the function isn't called at all. 
456 */ 457static int raid10_mergeable_bvec(request_queue_t *q, struct bio *bio, 458 struct bio_vec *bio_vec) 459{ 460 mddev_t *mddev = q->queuedata; 461 sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); 462 int max; 463 unsigned int chunk_sectors = mddev->chunk_size >> 9; 464 unsigned int bio_sectors = bio->bi_size >> 9; 465 466 max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; 467 if (max < 0) max = 0; /* bio_add cannot handle a negative return */ 468 if (max <= bio_vec->bv_len && bio_sectors == 0) 469 return bio_vec->bv_len; 470 else 471 return max; 472} 473 474/* 475 * This routine returns the disk from which the requested read should 476 * be done. There is a per-array 'next expected sequential IO' sector 477 * number - if this matches on the next IO then we use the last disk. 478 * There is also a per-disk 'last know head position' sector that is 479 * maintained from IRQ contexts, both the normal and the resync IO 480 * completion handlers update this position correctly. If there is no 481 * perfect sequential match then we pick the disk whose head is closest. 482 * 483 * If there are 2 mirrors in the same 2 devices, performance degrades 484 * because position is mirror, not device based. 485 * 486 * The rdev for the device selected will have nr_pending incremented. 487 */ 488 489/* 490 * FIXME: possibly should rethink readbalancing and do it differently 491 * depending on near_copies / far_copies geometry. 492 */ 493static int read_balance(conf_t *conf, r10bio_t *r10_bio) 494{ 495 const unsigned long this_sector = r10_bio->sector; 496 int disk, slot, nslot; 497 const int sectors = r10_bio->sectors; 498 sector_t new_distance, current_distance; 499 mdk_rdev_t *rdev; 500 501 raid10_find_phys(conf, r10_bio); 502 rcu_read_lock(); 503 /* 504 * Check if we can balance. We can balance on the whole 505 * device if no resync is going on, or below the resync window. 506 * We take the first readable disk when above the resync window. 
507 */ 508 if (conf->mddev->recovery_cp < MaxSector 509 && (this_sector + sectors >= conf->next_resync)) { 510 /* make sure that disk is operational */ 511 slot = 0; 512 disk = r10_bio->devs[slot].devnum; 513 514 while ((rdev = rcu_dereference(conf->mirrors[disk].rdev)) == NULL || 515 !test_bit(In_sync, &rdev->flags)) { 516 slot++; 517 if (slot == conf->copies) { 518 slot = 0; 519 disk = -1; 520 break; 521 } 522 disk = r10_bio->devs[slot].devnum; 523 } 524 goto rb_out; 525 } 526 527 528 /* make sure the disk is operational */ 529 slot = 0; 530 disk = r10_bio->devs[slot].devnum; 531 while ((rdev=rcu_dereference(conf->mirrors[disk].rdev)) == NULL || 532 !test_bit(In_sync, &rdev->flags)) { 533 slot ++; 534 if (slot == conf->copies) { 535 disk = -1; 536 goto rb_out; 537 } 538 disk = r10_bio->devs[slot].devnum; 539 } 540 541 542 current_distance = abs(r10_bio->devs[slot].addr - 543 conf->mirrors[disk].head_position); 544 545 /* Find the disk whose head is closest */ 546 547 for (nslot = slot; nslot < conf->copies; nslot++) { 548 int ndisk = r10_bio->devs[nslot].devnum; 549 550 551 if ((rdev=rcu_dereference(conf->mirrors[ndisk].rdev)) == NULL || 552 !test_bit(In_sync, &rdev->flags)) 553 continue; 554 555 if (!atomic_read(&rdev->nr_pending)) { 556 disk = ndisk; 557 slot = nslot; 558 break; 559 } 560 new_distance = abs(r10_bio->devs[nslot].addr - 561 conf->mirrors[ndisk].head_position); 562 if (new_distance < current_distance) { 563 current_distance = new_distance; 564 disk = ndisk; 565 slot = nslot; 566 } 567 } 568 569rb_out: 570 r10_bio->read_slot = slot; 571/* conf->next_seq_sect = this_sector + sectors;*/ 572 573 if (disk >= 0 && (rdev=rcu_dereference(conf->mirrors[disk].rdev))!= NULL) 574 atomic_inc(&conf->mirrors[disk].rdev->nr_pending); 575 rcu_read_unlock(); 576 577 return disk; 578} 579 580static void unplug_slaves(mddev_t *mddev) 581{ 582 conf_t *conf = mddev_to_conf(mddev); 583 int i; 584 585 rcu_read_lock(); 586 for (i=0; i<mddev->raid_disks; i++) { 587 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); 588 if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) { 589 request_queue_t *r_queue = bdev_get_queue(rdev->bdev); 590 591 atomic_inc(&rdev->nr_pending); 592 rcu_read_unlock(); 593 594 if (r_queue->unplug_fn) 595 r_queue->unplug_fn(r_queue); 596 597 rdev_dec_pending(rdev, mddev); 598 rcu_read_lock(); 599 } 600 } 601 rcu_read_unlock(); 602} 603 604static void raid10_unplug(request_queue_t *q) 605{ 606 unplug_slaves(q->queuedata); 607} 608 609static int raid10_issue_flush(request_queue_t *q, struct gendisk *disk, 610 sector_t *error_sector) 611{ 612 mddev_t *mddev = q->queuedata; 613 conf_t *conf = mddev_to_conf(mddev); 614 int i, ret = 0; 615 616 rcu_read_lock(); 617 for (i=0; i<mddev->raid_disks && ret == 0; i++) { 618 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); 619 if (rdev && !test_bit(Faulty, &rdev->flags)) { 620 struct block_device *bdev = rdev->bdev; 621 request_queue_t *r_queue = bdev_get_queue(bdev); 622 623 if (!r_queue->issue_flush_fn) 624 ret = -EOPNOTSUPP; 625 else { 626 atomic_inc(&rdev->nr_pending); 627 rcu_read_unlock(); 628 ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, 629 error_sector); 630 rdev_dec_pending(rdev, mddev); 631 rcu_read_lock(); 632 } 633 } 634 } 635 rcu_read_unlock(); 636 return ret; 637} 638 639/* 640 * Throttle resync depth, so that we can both get proper overlapping of 641 * requests, but are still able to handle normal requests quickly. 
642 */ 643#define RESYNC_DEPTH 32 644 645static void device_barrier(conf_t *conf, sector_t sect) 646{ 647 spin_lock_irq(&conf->resync_lock); 648 wait_event_lock_irq(conf->wait_idle, !waitqueue_active(&conf->wait_resume), 649 conf->resync_lock, unplug_slaves(conf->mddev)); 650 651 if (!conf->barrier++) { 652 wait_event_lock_irq(conf->wait_idle, !conf->nr_pending, 653 conf->resync_lock, unplug_slaves(conf->mddev)); 654 if (conf->nr_pending) 655 BUG(); 656 } 657 wait_event_lock_irq(conf->wait_resume, conf->barrier < RESYNC_DEPTH, 658 conf->resync_lock, unplug_slaves(conf->mddev)); 659 conf->next_resync = sect; 660 spin_unlock_irq(&conf->resync_lock); 661} 662 663static int make_request(request_queue_t *q, struct bio * bio) 664{ 665 mddev_t *mddev = q->queuedata; 666 conf_t *conf = mddev_to_conf(mddev); 667 mirror_info_t *mirror; 668 r10bio_t *r10_bio; 669 struct bio *read_bio; 670 int i; 671 int chunk_sects = conf->chunk_mask + 1; 672 const int rw = bio_data_dir(bio); 673 674 if (unlikely(bio_barrier(bio))) { 675 bio_endio(bio, bio->bi_size, -EOPNOTSUPP); 676 return 0; 677 } 678 679 /* If this request crosses a chunk boundary, we need to 680 * split it. This will only happen for 1 PAGE (or less) requests. 681 */ 682 if (unlikely( (bio->bi_sector & conf->chunk_mask) + (bio->bi_size >> 9) 683 > chunk_sects && 684 conf->near_copies < conf->raid_disks)) { 685 struct bio_pair *bp; 686 /* Sanity check -- queue functions should prevent this happening */ 687 if (bio->bi_vcnt != 1 || 688 bio->bi_idx != 0) 689 goto bad_map; 690 /* This is a one page bio that upper layers 691 * refuse to split for us, so we need to split it. 692 */ 693 bp = bio_split(bio, bio_split_pool, 694 chunk_sects - (bio->bi_sector & (chunk_sects - 1)) ); 695 if (make_request(q, &bp->bio1)) 696 generic_make_request(&bp->bio1); 697 if (make_request(q, &bp->bio2)) 698 generic_make_request(&bp->bio2); 699 700 bio_pair_release(bp); 701 return 0; 702 bad_map: 703 printk("raid10_make_request bug: can't convert block across chunks" 704 " or bigger than %dk %llu %d\n", chunk_sects/2, 705 (unsigned long long)bio->bi_sector, bio->bi_size >> 10); 706 707 bio_io_error(bio, bio->bi_size); 708 return 0; 709 } 710 711 md_write_start(mddev, bio); 712 713 /* 714 * Register the new request and wait if the reconstruction 715 * thread has put up a bar for new requests. 716 * Continue immediately if no resync is active currently. 
717 */ 718 spin_lock_irq(&conf->resync_lock); 719 wait_event_lock_irq(conf->wait_resume, !conf->barrier, conf->resync_lock, ); 720 conf->nr_pending++; 721 spin_unlock_irq(&conf->resync_lock); 722 723 disk_stat_inc(mddev->gendisk, ios[rw]); 724 disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bio)); 725 726 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); 727 728 r10_bio->master_bio = bio; 729 r10_bio->sectors = bio->bi_size >> 9; 730 731 r10_bio->mddev = mddev; 732 r10_bio->sector = bio->bi_sector; 733 734 if (rw == READ) { 735 /* 736 * read balancing logic: 737 */ 738 int disk = read_balance(conf, r10_bio); 739 int slot = r10_bio->read_slot; 740 if (disk < 0) { 741 raid_end_bio_io(r10_bio); 742 return 0; 743 } 744 mirror = conf->mirrors + disk; 745 746 read_bio = bio_clone(bio, GFP_NOIO); 747 748 r10_bio->devs[slot].bio = read_bio; 749 750 read_bio->bi_sector = r10_bio->devs[slot].addr + 751 mirror->rdev->data_offset; 752 read_bio->bi_bdev = mirror->rdev->bdev; 753 read_bio->bi_end_io = raid10_end_read_request; 754 read_bio->bi_rw = READ; 755 read_bio->bi_private = r10_bio; 756 757 generic_make_request(read_bio); 758 return 0; 759 } 760 761 /* 762 * WRITE: 763 */ 764 /* first select target devices under spinlock and 765 * inc refcount on their rdev. Record them by setting 766 * bios[x] to bio 767 */ 768 raid10_find_phys(conf, r10_bio); 769 rcu_read_lock(); 770 for (i = 0; i < conf->copies; i++) { 771 int d = r10_bio->devs[i].devnum; 772 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[d].rdev); 773 if (rdev && 774 !test_bit(Faulty, &rdev->flags)) { 775 atomic_inc(&rdev->nr_pending); 776 r10_bio->devs[i].bio = bio; 777 } else 778 r10_bio->devs[i].bio = NULL; 779 } 780 rcu_read_unlock(); 781 782 atomic_set(&r10_bio->remaining, 1); 783 784 for (i = 0; i < conf->copies; i++) { 785 struct bio *mbio; 786 int d = r10_bio->devs[i].devnum; 787 if (!r10_bio->devs[i].bio) 788 continue; 789 790 mbio = bio_clone(bio, GFP_NOIO); 791 r10_bio->devs[i].bio = mbio; 792 793 mbio->bi_sector = r10_bio->devs[i].addr+ 794 conf->mirrors[d].rdev->data_offset; 795 mbio->bi_bdev = conf->mirrors[d].rdev->bdev; 796 mbio->bi_end_io = raid10_end_write_request; 797 mbio->bi_rw = WRITE; 798 mbio->bi_private = r10_bio; 799 800 atomic_inc(&r10_bio->remaining); 801 generic_make_request(mbio); 802 } 803 804 if (atomic_dec_and_test(&r10_bio->remaining)) { 805 md_write_end(mddev); 806 raid_end_bio_io(r10_bio); 807 } 808 809 return 0; 810} 811 812static void status(struct seq_file *seq, mddev_t *mddev) 813{ 814 conf_t *conf = mddev_to_conf(mddev); 815 int i; 816 817 if (conf->near_copies < conf->raid_disks) 818 seq_printf(seq, " %dK chunks", mddev->chunk_size/1024); 819 if (conf->near_copies > 1) 820 seq_printf(seq, " %d near-copies", conf->near_copies); 821 if (conf->far_copies > 1) 822 seq_printf(seq, " %d far-copies", conf->far_copies); 823 824 seq_printf(seq, " [%d/%d] [", conf->raid_disks, 825 conf->working_disks); 826 for (i = 0; i < conf->raid_disks; i++) 827 seq_printf(seq, "%s", 828 conf->mirrors[i].rdev && 829 test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? "U" : "_"); 830 seq_printf(seq, "]"); 831} 832 833static void error(mddev_t *mddev, mdk_rdev_t *rdev) 834{ 835 char b[BDEVNAME_SIZE]; 836 conf_t *conf = mddev_to_conf(mddev); 837 838 /* 839 * If it is not operational, then we have already marked it as dead 840 * else if it is the last working disks, ignore the error, let the 841 * next level up know. 
	 * else mark the drive as failed
	 */
	if (test_bit(In_sync, &rdev->flags)
	    && conf->working_disks == 1)
		/*
		 * Don't fail the drive, just return an IO error.
		 * The test should really be more sophisticated than
		 * "working_disks == 1", but it isn't critical, and
		 * can wait until we do more sophisticated "is the drive
		 * really dead" tests...
		 */
		return;
	if (test_bit(In_sync, &rdev->flags)) {
		mddev->degraded++;
		conf->working_disks--;
		/*
		 * if recovery is running, make sure it aborts.
		 */
		set_bit(MD_RECOVERY_ERR, &mddev->recovery);
	}
	clear_bit(In_sync, &rdev->flags);
	set_bit(Faulty, &rdev->flags);
	mddev->sb_dirty = 1;
	printk(KERN_ALERT "raid10: Disk failure on %s, disabling device.\n"
		"	Operation continuing on %d devices\n",
		bdevname(rdev->bdev,b), conf->working_disks);
}

static void print_conf(conf_t *conf)
{
	int i;
	mirror_info_t *tmp;

	printk("RAID10 conf printout:\n");
	if (!conf) {
		printk("(!conf)\n");
		return;
	}
	printk(" --- wd:%d rd:%d\n", conf->working_disks,
		conf->raid_disks);

	for (i = 0; i < conf->raid_disks; i++) {
		char b[BDEVNAME_SIZE];
		tmp = conf->mirrors + i;
		if (tmp->rdev)
			printk(" disk %d, wo:%d, o:%d, dev:%s\n",
				i, !test_bit(In_sync, &tmp->rdev->flags),
			        !test_bit(Faulty, &tmp->rdev->flags),
			        bdevname(tmp->rdev->bdev,b));
	}
}

static void close_sync(conf_t *conf)
{
	spin_lock_irq(&conf->resync_lock);
	wait_event_lock_irq(conf->wait_resume, !conf->barrier,
			    conf->resync_lock, unplug_slaves(conf->mddev));
	spin_unlock_irq(&conf->resync_lock);

	if (conf->barrier) BUG();
	if (waitqueue_active(&conf->wait_idle)) BUG();

	mempool_destroy(conf->r10buf_pool);
	conf->r10buf_pool = NULL;
}

/* check if there are enough drives for
 * every block to appear on at least one
 */
static int enough(conf_t *conf)
{
	int first = 0;

	do {
		int n = conf->copies;
		int cnt = 0;
		while (n--) {
			if (conf->mirrors[first].rdev)
				cnt++;
			first = (first+1) % conf->raid_disks;
		}
		if (cnt == 0)
			return 0;
	} while (first != 0);
	return 1;
}
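/*
 * Illustrative sketch of the check above (editorial, not driver code):
 * with 4 disks and copies = 2, losing disks 0 and 1 removes every copy
 * of the blocks they share, while losing disks 0 and 2 still leaves one
 * copy of every block.
 */
#if 0
#include <assert.h>

static int enough_ex(const int present[], int raid_disks, int copies)
{
	int first = 0;
	do {
		int n = copies, cnt = 0;
		while (n--) {
			if (present[first])
				cnt++;
			first = (first + 1) % raid_disks;
		}
		if (cnt == 0)
			return 0;
	} while (first != 0);
	return 1;
}

int main(void)
{
	int lost_adjacent[4] = { 0, 0, 1, 1 };	/* disks 0 and 1 gone */
	int lost_opposite[4] = { 0, 1, 0, 1 };	/* disks 0 and 2 gone */

	assert(enough_ex(lost_adjacent, 4, 2) == 0);
	assert(enough_ex(lost_opposite, 4, 2) == 1);
	return 0;
}
#endif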
978 */ 979 if (rdev->bdev->bd_disk->queue->merge_bvec_fn && 980 mddev->queue->max_sectors > (PAGE_SIZE>>9)) 981 mddev->queue->max_sectors = (PAGE_SIZE>>9); 982 983 p->head_position = 0; 984 rdev->raid_disk = mirror; 985 found = 1; 986 rcu_assign_pointer(p->rdev, rdev); 987 break; 988 } 989 990 print_conf(conf); 991 return found; 992} 993 994static int raid10_remove_disk(mddev_t *mddev, int number) 995{ 996 conf_t *conf = mddev->private; 997 int err = 0; 998 mdk_rdev_t *rdev; 999 mirror_info_t *p = conf->mirrors+ number; 1000 1001 print_conf(conf); 1002 rdev = p->rdev; 1003 if (rdev) { 1004 if (test_bit(In_sync, &rdev->flags) || 1005 atomic_read(&rdev->nr_pending)) { 1006 err = -EBUSY; 1007 goto abort; 1008 } 1009 p->rdev = NULL; 1010 synchronize_rcu(); 1011 if (atomic_read(&rdev->nr_pending)) { 1012 /* lost the race, try later */ 1013 err = -EBUSY; 1014 p->rdev = rdev; 1015 } 1016 } 1017abort: 1018 1019 print_conf(conf); 1020 return err; 1021} 1022 1023 1024static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error) 1025{ 1026 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 1027 r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); 1028 conf_t *conf = mddev_to_conf(r10_bio->mddev); 1029 int i,d; 1030 1031 if (bio->bi_size) 1032 return 1; 1033 1034 for (i=0; i<conf->copies; i++) 1035 if (r10_bio->devs[i].bio == bio) 1036 break; 1037 if (i == conf->copies) 1038 BUG(); 1039 update_head_pos(i, r10_bio); 1040 d = r10_bio->devs[i].devnum; 1041 if (!uptodate) 1042 md_error(r10_bio->mddev, 1043 conf->mirrors[d].rdev); 1044 1045 /* for reconstruct, we always reschedule after a read. 1046 * for resync, only after all reads 1047 */ 1048 if (test_bit(R10BIO_IsRecover, &r10_bio->state) || 1049 atomic_dec_and_test(&r10_bio->remaining)) { 1050 /* we have read all the blocks, 1051 * do the comparison in process context in raid10d 1052 */ 1053 reschedule_retry(r10_bio); 1054 } 1055 rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev); 1056 return 0; 1057} 1058 1059static int end_sync_write(struct bio *bio, unsigned int bytes_done, int error) 1060{ 1061 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 1062 r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); 1063 mddev_t *mddev = r10_bio->mddev; 1064 conf_t *conf = mddev_to_conf(mddev); 1065 int i,d; 1066 1067 if (bio->bi_size) 1068 return 1; 1069 1070 for (i = 0; i < conf->copies; i++) 1071 if (r10_bio->devs[i].bio == bio) 1072 break; 1073 d = r10_bio->devs[i].devnum; 1074 1075 if (!uptodate) 1076 md_error(mddev, conf->mirrors[d].rdev); 1077 update_head_pos(i, r10_bio); 1078 1079 while (atomic_dec_and_test(&r10_bio->remaining)) { 1080 if (r10_bio->master_bio == NULL) { 1081 /* the primary of several recovery bios */ 1082 md_done_sync(mddev, r10_bio->sectors, 1); 1083 put_buf(r10_bio); 1084 break; 1085 } else { 1086 r10bio_t *r10_bio2 = (r10bio_t *)r10_bio->master_bio; 1087 put_buf(r10_bio); 1088 r10_bio = r10_bio2; 1089 } 1090 } 1091 rdev_dec_pending(conf->mirrors[d].rdev, mddev); 1092 return 0; 1093} 1094 1095/* 1096 * Note: sync and recover and handled very differently for raid10 1097 * This code is for resync. 1098 * For resync, we read through virtual addresses and read all blocks. 1099 * If there is any error, we schedule a write. The lowest numbered 1100 * drive is authoritative. 1101 * However requests come for physical address, so we need to map. 1102 * For every physical address there are raid_disks/copies virtual addresses, 1103 * which is always are least one, but is not necessarly an integer. 
 * This means that a physical address can span multiple chunks, so we may
 * have to submit multiple io requests for a single sync request.
 */
/*
 * We check if all blocks are in-sync and only write to blocks that
 * aren't in sync
 */
static void sync_request_write(mddev_t *mddev, r10bio_t *r10_bio)
{
	conf_t *conf = mddev_to_conf(mddev);
	int i, first;
	struct bio *tbio, *fbio;

	atomic_set(&r10_bio->remaining, 1);

	/* find the first device with a block */
	for (i=0; i<conf->copies; i++)
		if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags))
			break;

	if (i == conf->copies)
		goto done;

	first = i;
	fbio = r10_bio->devs[i].bio;

	/* now find blocks with errors */
	for (i=first+1 ; i < conf->copies ; i++) {
		int vcnt, j, d;

		if (!test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags))
			continue;
		/* We know that the bi_io_vec layout is the same for
		 * both 'first' and 'i', so we just compare them.
		 * All vec entries are PAGE_SIZE.
		 */
		tbio = r10_bio->devs[i].bio;
		vcnt = r10_bio->sectors >> (PAGE_SHIFT-9);
		for (j = 0; j < vcnt; j++)
			if (memcmp(page_address(fbio->bi_io_vec[j].bv_page),
				   page_address(tbio->bi_io_vec[j].bv_page),
				   PAGE_SIZE))
				break;
		if (j == vcnt)
			continue;
		/* Ok, we need to write this bio
		 * First we need to fixup bv_offset, bv_len and
		 * bi_vecs, as the read request might have corrupted these
		 */
		tbio->bi_vcnt = vcnt;
		tbio->bi_size = r10_bio->sectors << 9;
		tbio->bi_idx = 0;
		tbio->bi_phys_segments = 0;
		tbio->bi_hw_segments = 0;
		tbio->bi_hw_front_size = 0;
		tbio->bi_hw_back_size = 0;
		tbio->bi_flags &= ~(BIO_POOL_MASK - 1);
		tbio->bi_flags |= 1 << BIO_UPTODATE;
		tbio->bi_next = NULL;
		tbio->bi_rw = WRITE;
		tbio->bi_private = r10_bio;
		tbio->bi_sector = r10_bio->devs[i].addr;

		for (j=0; j < vcnt ; j++) {
			tbio->bi_io_vec[j].bv_offset = 0;
			tbio->bi_io_vec[j].bv_len = PAGE_SIZE;

			memcpy(page_address(tbio->bi_io_vec[j].bv_page),
			       page_address(fbio->bi_io_vec[j].bv_page),
			       PAGE_SIZE);
		}
		tbio->bi_end_io = end_sync_write;

		d = r10_bio->devs[i].devnum;
		atomic_inc(&conf->mirrors[d].rdev->nr_pending);
		atomic_inc(&r10_bio->remaining);
		md_sync_acct(conf->mirrors[d].rdev->bdev, tbio->bi_size >> 9);

		tbio->bi_sector += conf->mirrors[d].rdev->data_offset;
		tbio->bi_bdev = conf->mirrors[d].rdev->bdev;
		generic_make_request(tbio);
	}

done:
	if (atomic_dec_and_test(&r10_bio->remaining)) {
		md_done_sync(mddev, r10_bio->sectors, 1);
		put_buf(r10_bio);
	}
}
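/*
 * Illustrative sketch of the comparison loop above (editorial, not
 * driver code): the first up-to-date copy is authoritative, and copies
 * whose pages differ are rewritten from it.  Plain buffers stand in
 * for the bio page vectors.
 */
#if 0
#include <stdio.h>
#include <string.h>

#define COPIES	3
#define PAGES	4
#define PAGE_SZ	4096

int main(void)
{
	static char page[COPIES][PAGES][PAGE_SZ];
	int first = 0, i, j;

	page[2][1][17] = 1;		/* corrupt one page of copy 2 */

	for (i = first + 1; i < COPIES; i++) {
		for (j = 0; j < PAGES; j++)
			if (memcmp(page[first][j], page[i][j], PAGE_SZ))
				break;
		if (j == PAGES)
			continue;	/* all pages identical */
		memcpy(page[i], page[first], sizeof(page[i]));
		printf("copy %d rewritten from copy %d\n", i, first);
	}
	return 0;
}
#endif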
1202 * 1203 */ 1204 1205static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio) 1206{ 1207 conf_t *conf = mddev_to_conf(mddev); 1208 int i, d; 1209 struct bio *bio, *wbio; 1210 1211 1212 /* move the pages across to the second bio 1213 * and submit the write request 1214 */ 1215 bio = r10_bio->devs[0].bio; 1216 wbio = r10_bio->devs[1].bio; 1217 for (i=0; i < wbio->bi_vcnt; i++) { 1218 struct page *p = bio->bi_io_vec[i].bv_page; 1219 bio->bi_io_vec[i].bv_page = wbio->bi_io_vec[i].bv_page; 1220 wbio->bi_io_vec[i].bv_page = p; 1221 } 1222 d = r10_bio->devs[1].devnum; 1223 1224 atomic_inc(&conf->mirrors[d].rdev->nr_pending); 1225 md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9); 1226 generic_make_request(wbio); 1227} 1228 1229 1230/* 1231 * This is a kernel thread which: 1232 * 1233 * 1. Retries failed read operations on working mirrors. 1234 * 2. Updates the raid superblock when problems encounter. 1235 * 3. Performs writes following reads for array syncronising. 1236 */ 1237 1238static void raid10d(mddev_t *mddev) 1239{ 1240 r10bio_t *r10_bio; 1241 struct bio *bio; 1242 unsigned long flags; 1243 conf_t *conf = mddev_to_conf(mddev); 1244 struct list_head *head = &conf->retry_list; 1245 int unplug=0; 1246 mdk_rdev_t *rdev; 1247 1248 md_check_recovery(mddev); 1249 1250 for (;;) { 1251 char b[BDEVNAME_SIZE]; 1252 spin_lock_irqsave(&conf->device_lock, flags); 1253 if (list_empty(head)) 1254 break; 1255 r10_bio = list_entry(head->prev, r10bio_t, retry_list); 1256 list_del(head->prev); 1257 spin_unlock_irqrestore(&conf->device_lock, flags); 1258 1259 mddev = r10_bio->mddev; 1260 conf = mddev_to_conf(mddev); 1261 if (test_bit(R10BIO_IsSync, &r10_bio->state)) { 1262 sync_request_write(mddev, r10_bio); 1263 unplug = 1; 1264 } else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) { 1265 recovery_request_write(mddev, r10_bio); 1266 unplug = 1; 1267 } else { 1268 int mirror; 1269 bio = r10_bio->devs[r10_bio->read_slot].bio; 1270 r10_bio->devs[r10_bio->read_slot].bio = NULL; 1271 bio_put(bio); 1272 mirror = read_balance(conf, r10_bio); 1273 if (mirror == -1) { 1274 printk(KERN_ALERT "raid10: %s: unrecoverable I/O" 1275 " read error for block %llu\n", 1276 bdevname(bio->bi_bdev,b), 1277 (unsigned long long)r10_bio->sector); 1278 raid_end_bio_io(r10_bio); 1279 } else { 1280 rdev = conf->mirrors[mirror].rdev; 1281 if (printk_ratelimit()) 1282 printk(KERN_ERR "raid10: %s: redirecting sector %llu to" 1283 " another mirror\n", 1284 bdevname(rdev->bdev,b), 1285 (unsigned long long)r10_bio->sector); 1286 bio = bio_clone(r10_bio->master_bio, GFP_NOIO); 1287 r10_bio->devs[r10_bio->read_slot].bio = bio; 1288 bio->bi_sector = r10_bio->devs[r10_bio->read_slot].addr 1289 + rdev->data_offset; 1290 bio->bi_bdev = rdev->bdev; 1291 bio->bi_rw = READ; 1292 bio->bi_private = r10_bio; 1293 bio->bi_end_io = raid10_end_read_request; 1294 unplug = 1; 1295 generic_make_request(bio); 1296 } 1297 } 1298 } 1299 spin_unlock_irqrestore(&conf->device_lock, flags); 1300 if (unplug) 1301 unplug_slaves(mddev); 1302} 1303 1304 1305static int init_resync(conf_t *conf) 1306{ 1307 int buffs; 1308 1309 buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE; 1310 if (conf->r10buf_pool) 1311 BUG(); 1312 conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf); 1313 if (!conf->r10buf_pool) 1314 return -ENOMEM; 1315 conf->next_resync = 0; 1316 return 0; 1317} 1318 1319/* 1320 * perform a "sync" on one "block" 1321 * 1322 * We need to make sure that no normal I/O request - particularly write 1323 * requests 
 * - conflict with active sync requests.
 *
 * This is achieved by tracking pending requests and a 'barrier' concept
 * that can be installed to exclude normal IO requests.
 *
 * Resync and recovery are handled very differently.
 * We differentiate by looking at MD_RECOVERY_SYNC in mddev->recovery.
 *
 * For resync, we iterate over virtual addresses, read all copies,
 * and update if there are differences.  If only one copy is live,
 * skip it.
 * For recovery, we iterate over physical addresses, read a good
 * value for each non-in_sync drive, and over-write.
 *
 * So, for recovery we may have several outstanding complex requests for a
 * given address, one for each out-of-sync device.  We model this by allocating
 * a number of r10_bio structures, one for each out-of-sync device.
 * As we set up these structures, we collect all bio's together into a list
 * which we then process collectively to add pages, and then process again
 * to pass to generic_make_request.
 *
 * The r10_bio structures are linked using a borrowed master_bio pointer.
 * This link is counted in ->remaining.  When the r10_bio that points to NULL
 * has its remaining count decremented to 0, the whole complex operation
 * is complete.
 *
 */
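/*
 * Illustrative sketch of the borrowed-pointer chaining just described
 * (editorial, not driver code): each record's master points at the
 * previous record and each link adds one to remaining; completion walks
 * up the chain as counts reach zero.
 */
#if 0
#include <stddef.h>

struct rec {
	struct rec *master;	/* borrowed master_bio pointer       */
	int remaining;		/* outstanding I/O plus chain links */
};

static void put_rec(struct rec *r)
{
	while (r && --r->remaining == 0) {
		struct rec *up = r->master;
		/* release r's buffers here; the whole complex
		 * operation is done once up == NULL */
		r = up;
	}
}

int main(void)
{
	struct rec a = { NULL, 1 };	/* primary: master == NULL */
	struct rec b = { &a, 1 };	/* subordinate links to a  */
	a.remaining++;			/* count the link from b   */
	put_rec(&b);			/* completes b, drops link */
	put_rec(&a);			/* a's own I/O completes   */
	return 0;
}
#endif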
static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
{
	conf_t *conf = mddev_to_conf(mddev);
	r10bio_t *r10_bio;
	struct bio *biolist = NULL, *bio;
	sector_t max_sector, nr_sectors;
	int disk;
	int i;

	sector_t sectors_skipped = 0;
	int chunks_skipped = 0;

	if (!conf->r10buf_pool)
		if (init_resync(conf))
			return 0;

 skipped:
	max_sector = mddev->size << 1;
	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
		max_sector = mddev->resync_max_sectors;
	if (sector_nr >= max_sector) {
		close_sync(conf);
		*skipped = 1;
		return sectors_skipped;
	}
	if (chunks_skipped >= conf->raid_disks) {
		/* if there has been nothing to do on any drive,
		 * then there is nothing to do at all..
		 */
		*skipped = 1;
		return (max_sector - sector_nr) + sectors_skipped;
	}

	/* make sure whole request will fit in a chunk - if chunks
	 * are meaningful
	 */
	if (conf->near_copies < conf->raid_disks &&
	    max_sector > (sector_nr | conf->chunk_mask))
		max_sector = (sector_nr | conf->chunk_mask) + 1;
	/*
	 * If there is non-resync activity waiting for us then
	 * put in a delay to throttle resync.
	 */
	if (!go_faster && waitqueue_active(&conf->wait_resume))
		msleep_interruptible(1000);
	device_barrier(conf, sector_nr + RESYNC_SECTORS);

	/* Again, very different code for resync and recovery.
	 * Both must result in an r10bio with a list of bios that
	 * have bi_end_io, bi_sector, bi_bdev set,
	 * and bi_private set to the r10bio.
	 * For recovery, we may actually create several r10bios
	 * with 2 bios in each, that correspond to the bios in the main one.
	 * In this case, the subordinate r10bios link back through a
	 * borrowed master_bio pointer, and the counter in the master
	 * includes a ref from each subordinate.
	 */
	/* First, we decide what to do and set ->bi_end_io
	 * To end_sync_read if we want to read, and
	 * end_sync_write if we will want to write.
	 */

	if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
		/* recovery... the complicated one */
		int i, j, k;
		r10_bio = NULL;

		for (i=0 ; i<conf->raid_disks; i++)
			if (conf->mirrors[i].rdev &&
			    !test_bit(In_sync, &conf->mirrors[i].rdev->flags)) {
				/* want to reconstruct this device */
				r10bio_t *rb2 = r10_bio;

				r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
				spin_lock_irq(&conf->resync_lock);
				conf->nr_pending++;
				if (rb2) conf->barrier++;
				spin_unlock_irq(&conf->resync_lock);
				atomic_set(&r10_bio->remaining, 0);

				r10_bio->master_bio = (struct bio*)rb2;
				if (rb2)
					atomic_inc(&rb2->remaining);
				r10_bio->mddev = mddev;
				set_bit(R10BIO_IsRecover, &r10_bio->state);
				r10_bio->sector = raid10_find_virt(conf, sector_nr, i);
				raid10_find_phys(conf, r10_bio);
				for (j=0; j<conf->copies;j++) {
					int d = r10_bio->devs[j].devnum;
					if (conf->mirrors[d].rdev &&
					    test_bit(In_sync, &conf->mirrors[d].rdev->flags)) {
						/* This is where we read from */
						bio = r10_bio->devs[0].bio;
						bio->bi_next = biolist;
						biolist = bio;
						bio->bi_private = r10_bio;
						bio->bi_end_io = end_sync_read;
						bio->bi_rw = 0;
						bio->bi_sector = r10_bio->devs[j].addr +
							conf->mirrors[d].rdev->data_offset;
						bio->bi_bdev = conf->mirrors[d].rdev->bdev;
						atomic_inc(&conf->mirrors[d].rdev->nr_pending);
						atomic_inc(&r10_bio->remaining);
						/* and we write to 'i' */

						for (k=0; k<conf->copies; k++)
							if (r10_bio->devs[k].devnum == i)
								break;
						bio = r10_bio->devs[1].bio;
						bio->bi_next = biolist;
						biolist = bio;
						bio->bi_private = r10_bio;
						bio->bi_end_io = end_sync_write;
						bio->bi_rw = 1;
						bio->bi_sector = r10_bio->devs[k].addr +
							conf->mirrors[i].rdev->data_offset;
						bio->bi_bdev = conf->mirrors[i].rdev->bdev;

						r10_bio->devs[0].devnum = d;
						r10_bio->devs[1].devnum = i;

						break;
					}
				}
				if (j == conf->copies) {
					/* Cannot recover, so abort the recovery */
					put_buf(r10_bio);
					r10_bio = rb2;
					if (!test_and_set_bit(MD_RECOVERY_ERR, &mddev->recovery))
						printk(KERN_INFO "raid10: %s: insufficient working devices for recovery.\n",
						       mdname(mddev));
					break;
				}
			}
		if (biolist == NULL) {
			while (r10_bio) {
				r10bio_t *rb2 = r10_bio;
				r10_bio = (r10bio_t*) rb2->master_bio;
				rb2->master_bio = NULL;
				put_buf(rb2);
			}
			goto giveup;
		}
	} else {
		/* resync. Schedule a read for every block at this virt offset */
		int count = 0;
		r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);

		spin_lock_irq(&conf->resync_lock);
		conf->nr_pending++;
		spin_unlock_irq(&conf->resync_lock);

		r10_bio->mddev = mddev;
		atomic_set(&r10_bio->remaining, 0);

		r10_bio->master_bio = NULL;
		r10_bio->sector = sector_nr;
		set_bit(R10BIO_IsSync, &r10_bio->state);
		raid10_find_phys(conf, r10_bio);
		r10_bio->sectors = (sector_nr | conf->chunk_mask) - sector_nr +1;

		for (i=0; i<conf->copies; i++) {
			int d = r10_bio->devs[i].devnum;
			bio = r10_bio->devs[i].bio;
			bio->bi_end_io = NULL;
			if (conf->mirrors[d].rdev == NULL ||
			    test_bit(Faulty, &conf->mirrors[d].rdev->flags))
				continue;
			atomic_inc(&conf->mirrors[d].rdev->nr_pending);
			atomic_inc(&r10_bio->remaining);
			bio->bi_next = biolist;
			biolist = bio;
			bio->bi_private = r10_bio;
			bio->bi_end_io = end_sync_read;
			bio->bi_rw = 0;
			bio->bi_sector = r10_bio->devs[i].addr +
				conf->mirrors[d].rdev->data_offset;
			bio->bi_bdev = conf->mirrors[d].rdev->bdev;
			count++;
		}

		if (count < 2) {
			for (i=0; i<conf->copies; i++) {
				int d = r10_bio->devs[i].devnum;
				if (r10_bio->devs[i].bio->bi_end_io)
					rdev_dec_pending(conf->mirrors[d].rdev, mddev);
			}
			put_buf(r10_bio);
			biolist = NULL;
			goto giveup;
		}
	}

	for (bio = biolist; bio ; bio=bio->bi_next) {

		bio->bi_flags &= ~(BIO_POOL_MASK - 1);
		if (bio->bi_end_io)
			bio->bi_flags |= 1 << BIO_UPTODATE;
		bio->bi_vcnt = 0;
		bio->bi_idx = 0;
		bio->bi_phys_segments = 0;
		bio->bi_hw_segments = 0;
		bio->bi_size = 0;
	}

	nr_sectors = 0;
	do {
		struct page *page;
		int len = PAGE_SIZE;
		disk = 0;
		if (sector_nr + (len>>9) > max_sector)
			len = (max_sector - sector_nr) << 9;
		if (len == 0)
			break;
		for (bio= biolist ; bio ; bio=bio->bi_next) {
			page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
			if (bio_add_page(bio, page, len, 0) == 0) {
				/* stop here */
				struct bio *bio2;
				bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
				for (bio2 = biolist; bio2 && bio2 != bio; bio2 = bio2->bi_next) {
					/* remove last page from this bio */
					bio2->bi_vcnt--;
					bio2->bi_size -= len;
					bio2->bi_flags &= ~(1<< BIO_SEG_VALID);
				}
				goto bio_full;
			}
			disk = i;
		}
		nr_sectors += len>>9;
		sector_nr += len>>9;
	} while (biolist->bi_vcnt < RESYNC_PAGES);
 bio_full:
	r10_bio->sectors = nr_sectors;

	while (biolist) {
		bio = biolist;
		biolist = biolist->bi_next;

		bio->bi_next = NULL;
		r10_bio = bio->bi_private;
		r10_bio->sectors = nr_sectors;

		if (bio->bi_end_io == end_sync_read) {
			md_sync_acct(bio->bi_bdev, nr_sectors);
			generic_make_request(bio);
		}
	}

	if (sectors_skipped)
		/* pretend they weren't skipped, it makes
		 * no important difference in this case
		 */
		md_done_sync(mddev, sectors_skipped, 1);

	return sectors_skipped + nr_sectors;
 giveup:
	/* There is nowhere to write, so all non-sync
	 * drives must be failed, so try the next chunk...
	 */
	{
	sector_t sec = max_sector - sector_nr;
	sectors_skipped += sec;
	chunks_skipped ++;
	sector_nr = max_sector;
	goto skipped;
	}
}

static int run(mddev_t *mddev)
{
	conf_t *conf;
	int i, disk_idx;
	mirror_info_t *disk;
	mdk_rdev_t *rdev;
	struct list_head *tmp;
	int nc, fc;
	sector_t stride, size;

	if (mddev->level != 10) {
		printk(KERN_ERR "raid10: %s: raid level not set correctly... (%d)\n",
		       mdname(mddev), mddev->level);
		goto out;
	}
	nc = mddev->layout & 255;
	fc = (mddev->layout >> 8) & 255;
	if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks ||
	    (mddev->layout >> 16)) {
		printk(KERN_ERR "raid10: %s: unsupported raid10 layout: 0x%8x\n",
		       mdname(mddev), mddev->layout);
		goto out;
	}
	/*
	 * copy the already verified devices into our private RAID10
	 * bookkeeping area. [whatever we allocate in run(),
	 * should be freed in stop()]
	 */
	conf = kmalloc(sizeof(conf_t), GFP_KERNEL);
	mddev->private = conf;
	if (!conf) {
		printk(KERN_ERR "raid10: couldn't allocate memory for %s\n",
			mdname(mddev));
		goto out;
	}
	memset(conf, 0, sizeof(*conf));
	conf->mirrors = kmalloc(sizeof(struct mirror_info)*mddev->raid_disks,
				 GFP_KERNEL);
	if (!conf->mirrors) {
		printk(KERN_ERR "raid10: couldn't allocate memory for %s\n",
		       mdname(mddev));
		goto out_free_conf;
	}
	memset(conf->mirrors, 0, sizeof(struct mirror_info)*mddev->raid_disks);

	conf->near_copies = nc;
	conf->far_copies = fc;
	conf->copies = nc*fc;
	conf->chunk_mask = (sector_t)(mddev->chunk_size>>9)-1;
	conf->chunk_shift = ffz(~mddev->chunk_size) - 9;
	stride = mddev->size >> (conf->chunk_shift-1);
	sector_div(stride, fc);
	conf->stride = stride << conf->chunk_shift;

	conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc,
						r10bio_pool_free, conf);
	if (!conf->r10bio_pool) {
		printk(KERN_ERR "raid10: couldn't allocate memory for %s\n",
			mdname(mddev));
		goto out_free_conf;
	}

	ITERATE_RDEV(mddev, rdev, tmp) {
		disk_idx = rdev->raid_disk;
		if (disk_idx >= mddev->raid_disks
		    || disk_idx < 0)
			continue;
		disk = conf->mirrors + disk_idx;

		disk->rdev = rdev;

		blk_queue_stack_limits(mddev->queue,
				       rdev->bdev->bd_disk->queue);
		/* as we don't honour merge_bvec_fn, we must never risk
		 * violating it, so limit ->max_sector to one PAGE, as
		 * a one page request is never in violation.
		 */
		if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
		    mddev->queue->max_sectors > (PAGE_SIZE>>9))
			mddev->queue->max_sectors = (PAGE_SIZE>>9);

		disk->head_position = 0;
		if (!test_bit(Faulty, &rdev->flags) && test_bit(In_sync, &rdev->flags))
			conf->working_disks++;
	}
	conf->raid_disks = mddev->raid_disks;
	conf->mddev = mddev;
	spin_lock_init(&conf->device_lock);
	INIT_LIST_HEAD(&conf->retry_list);

	spin_lock_init(&conf->resync_lock);
	init_waitqueue_head(&conf->wait_idle);
	init_waitqueue_head(&conf->wait_resume);

	/* need to check that every block has at least one working mirror */
	if (!enough(conf)) {
		printk(KERN_ERR "raid10: not enough operational mirrors for %s\n",
		       mdname(mddev));
		goto out_free_conf;
	}

	mddev->degraded = 0;
	for (i = 0; i < conf->raid_disks; i++) {

		disk = conf->mirrors + i;

		if (!disk->rdev) {
			disk->head_position = 0;
			mddev->degraded++;
		}
	}


	mddev->thread = md_register_thread(raid10d, mddev, "%s_raid10");
	if (!mddev->thread) {
		printk(KERN_ERR
		       "raid10: couldn't allocate thread for %s\n",
		       mdname(mddev));
		goto out_free_conf;
	}

	printk(KERN_INFO
		"raid10: raid set %s active with %d out of %d devices\n",
		mdname(mddev), mddev->raid_disks - mddev->degraded,
		mddev->raid_disks);
	/*
	 * Ok, everything is just fine now
	 */
	size = conf->stride * conf->raid_disks;
	sector_div(size, conf->near_copies);
	mddev->array_size = size/2;
	mddev->resync_max_sectors = size;

	mddev->queue->unplug_fn = raid10_unplug;
	mddev->queue->issue_flush_fn = raid10_issue_flush;

	/* Calculate max read-ahead size.
	 * We need to readahead at least twice a whole stripe....
	 * maybe...
	 */
	{
		int stripe = conf->raid_disks * mddev->chunk_size / PAGE_CACHE_SIZE;
		stripe /= conf->near_copies;
		if (mddev->queue->backing_dev_info.ra_pages < 2* stripe)
			mddev->queue->backing_dev_info.ra_pages = 2* stripe;
	}

	if (conf->near_copies < mddev->raid_disks)
		blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
	return 0;

out_free_conf:
	if (conf->r10bio_pool)
		mempool_destroy(conf->r10bio_pool);
	kfree(conf->mirrors);
	kfree(conf);
	mddev->private = NULL;
out:
	return -EIO;
}

static int stop(mddev_t *mddev)
{
	conf_t *conf = mddev_to_conf(mddev);

	md_unregister_thread(mddev->thread);
	mddev->thread = NULL;
	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
	if (conf->r10bio_pool)
		mempool_destroy(conf->r10bio_pool);
	kfree(conf->mirrors);
	kfree(conf);
	mddev->private = NULL;
	return 0;
}


static mdk_personality_t raid10_personality =
{
	.name		= "raid10",
	.owner		= THIS_MODULE,
	.make_request	= make_request,
	.run		= run,
	.stop		= stop,
	.status		= status,
	.error_handler	= error,
	.hot_add_disk	= raid10_add_disk,
	.hot_remove_disk= raid10_remove_disk,
	.spare_active	= raid10_spare_active,
	.sync_request	= sync_request,
};

static int __init raid_init(void)
{
	return register_md_personality(RAID10, &raid10_personality);
}

static void raid_exit(void)
{
	unregister_md_personality(RAID10);
}

module_init(raid_init);
module_exit(raid_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS("md-personality-9"); /* RAID10 */
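/*
 * Editorial footnote with an illustrative check (not driver code): the
 * layout word parsed in run() packs near_copies in the low byte and
 * far_copies in the next byte, so mdadm's common 'n2' layout is 0x102.
 */
#if 0
#include <assert.h>

int main(void)
{
	int layout = 0x102;
	assert((layout & 255) == 2);		/* near_copies */
	assert(((layout >> 8) & 255) == 1);	/* far_copies  */
	return 0;
}
#endif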