Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
at v2.6.19 5657 lines 142 kB view raw
1/* 2 md.c : Multiple Devices driver for Linux 3 Copyright (C) 1998, 1999, 2000 Ingo Molnar 4 5 completely rewritten, based on the MD driver code from Marc Zyngier 6 7 Changes: 8 9 - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar 10 - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com> 11 - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net> 12 - kerneld support by Boris Tobotras <boris@xtalk.msk.su> 13 - kmod support by: Cyrus Durgin 14 - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com> 15 - Devfs support by Richard Gooch <rgooch@atnf.csiro.au> 16 17 - lots of fixes and improvements to the RAID1/RAID5 and generic 18 RAID code (such as request based resynchronization): 19 20 Neil Brown <neilb@cse.unsw.edu.au>. 21 22 - persistent bitmap code 23 Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc. 24 25 This program is free software; you can redistribute it and/or modify 26 it under the terms of the GNU General Public License as published by 27 the Free Software Foundation; either version 2, or (at your option) 28 any later version. 29 30 You should have received a copy of the GNU General Public License 31 (for example /usr/src/linux/COPYING); if not, write to the Free 32 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
33*/ 34 35#include <linux/module.h> 36#include <linux/kthread.h> 37#include <linux/linkage.h> 38#include <linux/raid/md.h> 39#include <linux/raid/bitmap.h> 40#include <linux/sysctl.h> 41#include <linux/buffer_head.h> /* for invalidate_bdev */ 42#include <linux/suspend.h> 43#include <linux/poll.h> 44#include <linux/mutex.h> 45#include <linux/ctype.h> 46 47#include <linux/init.h> 48 49#include <linux/file.h> 50 51#ifdef CONFIG_KMOD 52#include <linux/kmod.h> 53#endif 54 55#include <asm/unaligned.h> 56 57#define MAJOR_NR MD_MAJOR 58#define MD_DRIVER 59 60/* 63 partitions with the alternate major number (mdp) */ 61#define MdpMinorShift 6 62 63#define DEBUG 0 64#define dprintk(x...) ((void)(DEBUG && printk(x))) 65 66 67#ifndef MODULE 68static void autostart_arrays (int part); 69#endif 70 71static LIST_HEAD(pers_list); 72static DEFINE_SPINLOCK(pers_lock); 73 74static void md_print_devices(void); 75 76#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); } 77 78/* 79 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' 80 * is 1000 KB/sec, so the extra system load does not show up that much. 81 * Increase it if you want to have more _guaranteed_ speed. Note that 82 * the RAID driver will use the maximum available bandwidth if the IO 83 * subsystem is idle. There is also an 'absolute maximum' reconstruction 84 * speed limit - in case reconstruction slows down your system despite 85 * idle IO detection. 86 * 87 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max. 88 * or /sys/block/mdX/md/sync_speed_{min,max} 89 */ 90 91static int sysctl_speed_limit_min = 1000; 92static int sysctl_speed_limit_max = 200000; 93static inline int speed_min(mddev_t *mddev) 94{ 95 return mddev->sync_speed_min ? 96 mddev->sync_speed_min : sysctl_speed_limit_min; 97} 98 99static inline int speed_max(mddev_t *mddev) 100{ 101 return mddev->sync_speed_max ? 
102 mddev->sync_speed_max : sysctl_speed_limit_max; 103} 104 105static struct ctl_table_header *raid_table_header; 106 107static ctl_table raid_table[] = { 108 { 109 .ctl_name = DEV_RAID_SPEED_LIMIT_MIN, 110 .procname = "speed_limit_min", 111 .data = &sysctl_speed_limit_min, 112 .maxlen = sizeof(int), 113 .mode = S_IRUGO|S_IWUSR, 114 .proc_handler = &proc_dointvec, 115 }, 116 { 117 .ctl_name = DEV_RAID_SPEED_LIMIT_MAX, 118 .procname = "speed_limit_max", 119 .data = &sysctl_speed_limit_max, 120 .maxlen = sizeof(int), 121 .mode = S_IRUGO|S_IWUSR, 122 .proc_handler = &proc_dointvec, 123 }, 124 { .ctl_name = 0 } 125}; 126 127static ctl_table raid_dir_table[] = { 128 { 129 .ctl_name = DEV_RAID, 130 .procname = "raid", 131 .maxlen = 0, 132 .mode = S_IRUGO|S_IXUGO, 133 .child = raid_table, 134 }, 135 { .ctl_name = 0 } 136}; 137 138static ctl_table raid_root_table[] = { 139 { 140 .ctl_name = CTL_DEV, 141 .procname = "dev", 142 .maxlen = 0, 143 .mode = 0555, 144 .child = raid_dir_table, 145 }, 146 { .ctl_name = 0 } 147}; 148 149static struct block_device_operations md_fops; 150 151static int start_readonly; 152 153/* 154 * We have a system wide 'event count' that is incremented 155 * on any 'interesting' event, and readers of /proc/mdstat 156 * can use 'poll' or 'select' to find out when the event 157 * count increases. 158 * 159 * Events are: 160 * start array, stop array, error, add device, remove device, 161 * start build, activate spare 162 */ 163static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters); 164static atomic_t md_event_count; 165void md_new_event(mddev_t *mddev) 166{ 167 atomic_inc(&md_event_count); 168 wake_up(&md_event_waiters); 169 sysfs_notify(&mddev->kobj, NULL, "sync_action"); 170} 171EXPORT_SYMBOL_GPL(md_new_event); 172 173/* Alternate version that can be called from interrupts 174 * when calling sysfs_notify isn't needed. 
175 */ 176static void md_new_event_inintr(mddev_t *mddev) 177{ 178 atomic_inc(&md_event_count); 179 wake_up(&md_event_waiters); 180} 181 182/* 183 * Enables to iterate over all existing md arrays 184 * all_mddevs_lock protects this list. 185 */ 186static LIST_HEAD(all_mddevs); 187static DEFINE_SPINLOCK(all_mddevs_lock); 188 189 190/* 191 * iterates through all used mddevs in the system. 192 * We take care to grab the all_mddevs_lock whenever navigating 193 * the list, and to always hold a refcount when unlocked. 194 * Any code which breaks out of this loop while own 195 * a reference to the current mddev and must mddev_put it. 196 */ 197#define ITERATE_MDDEV(mddev,tmp) \ 198 \ 199 for (({ spin_lock(&all_mddevs_lock); \ 200 tmp = all_mddevs.next; \ 201 mddev = NULL;}); \ 202 ({ if (tmp != &all_mddevs) \ 203 mddev_get(list_entry(tmp, mddev_t, all_mddevs));\ 204 spin_unlock(&all_mddevs_lock); \ 205 if (mddev) mddev_put(mddev); \ 206 mddev = list_entry(tmp, mddev_t, all_mddevs); \ 207 tmp != &all_mddevs;}); \ 208 ({ spin_lock(&all_mddevs_lock); \ 209 tmp = tmp->next;}) \ 210 ) 211 212 213static int md_fail_request (request_queue_t *q, struct bio *bio) 214{ 215 bio_io_error(bio, bio->bi_size); 216 return 0; 217} 218 219static inline mddev_t *mddev_get(mddev_t *mddev) 220{ 221 atomic_inc(&mddev->active); 222 return mddev; 223} 224 225static void mddev_put(mddev_t *mddev) 226{ 227 if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) 228 return; 229 if (!mddev->raid_disks && list_empty(&mddev->disks)) { 230 list_del(&mddev->all_mddevs); 231 spin_unlock(&all_mddevs_lock); 232 blk_cleanup_queue(mddev->queue); 233 kobject_unregister(&mddev->kobj); 234 } else 235 spin_unlock(&all_mddevs_lock); 236} 237 238static mddev_t * mddev_find(dev_t unit) 239{ 240 mddev_t *mddev, *new = NULL; 241 242 retry: 243 spin_lock(&all_mddevs_lock); 244 list_for_each_entry(mddev, &all_mddevs, all_mddevs) 245 if (mddev->unit == unit) { 246 mddev_get(mddev); 247 spin_unlock(&all_mddevs_lock); 
248 kfree(new); 249 return mddev; 250 } 251 252 if (new) { 253 list_add(&new->all_mddevs, &all_mddevs); 254 spin_unlock(&all_mddevs_lock); 255 return new; 256 } 257 spin_unlock(&all_mddevs_lock); 258 259 new = kzalloc(sizeof(*new), GFP_KERNEL); 260 if (!new) 261 return NULL; 262 263 new->unit = unit; 264 if (MAJOR(unit) == MD_MAJOR) 265 new->md_minor = MINOR(unit); 266 else 267 new->md_minor = MINOR(unit) >> MdpMinorShift; 268 269 mutex_init(&new->reconfig_mutex); 270 INIT_LIST_HEAD(&new->disks); 271 INIT_LIST_HEAD(&new->all_mddevs); 272 init_timer(&new->safemode_timer); 273 atomic_set(&new->active, 1); 274 spin_lock_init(&new->write_lock); 275 init_waitqueue_head(&new->sb_wait); 276 277 new->queue = blk_alloc_queue(GFP_KERNEL); 278 if (!new->queue) { 279 kfree(new); 280 return NULL; 281 } 282 set_bit(QUEUE_FLAG_CLUSTER, &new->queue->queue_flags); 283 284 blk_queue_make_request(new->queue, md_fail_request); 285 286 goto retry; 287} 288 289static inline int mddev_lock(mddev_t * mddev) 290{ 291 return mutex_lock_interruptible(&mddev->reconfig_mutex); 292} 293 294static inline int mddev_trylock(mddev_t * mddev) 295{ 296 return mutex_trylock(&mddev->reconfig_mutex); 297} 298 299static inline void mddev_unlock(mddev_t * mddev) 300{ 301 mutex_unlock(&mddev->reconfig_mutex); 302 303 md_wakeup_thread(mddev->thread); 304} 305 306static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr) 307{ 308 mdk_rdev_t * rdev; 309 struct list_head *tmp; 310 311 ITERATE_RDEV(mddev,rdev,tmp) { 312 if (rdev->desc_nr == nr) 313 return rdev; 314 } 315 return NULL; 316} 317 318static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev) 319{ 320 struct list_head *tmp; 321 mdk_rdev_t *rdev; 322 323 ITERATE_RDEV(mddev,rdev,tmp) { 324 if (rdev->bdev->bd_dev == dev) 325 return rdev; 326 } 327 return NULL; 328} 329 330static struct mdk_personality *find_pers(int level, char *clevel) 331{ 332 struct mdk_personality *pers; 333 list_for_each_entry(pers, &pers_list, list) { 334 if (level != LEVEL_NONE && 
pers->level == level) 335 return pers; 336 if (strcmp(pers->name, clevel)==0) 337 return pers; 338 } 339 return NULL; 340} 341 342static inline sector_t calc_dev_sboffset(struct block_device *bdev) 343{ 344 sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 345 return MD_NEW_SIZE_BLOCKS(size); 346} 347 348static sector_t calc_dev_size(mdk_rdev_t *rdev, unsigned chunk_size) 349{ 350 sector_t size; 351 352 size = rdev->sb_offset; 353 354 if (chunk_size) 355 size &= ~((sector_t)chunk_size/1024 - 1); 356 return size; 357} 358 359static int alloc_disk_sb(mdk_rdev_t * rdev) 360{ 361 if (rdev->sb_page) 362 MD_BUG(); 363 364 rdev->sb_page = alloc_page(GFP_KERNEL); 365 if (!rdev->sb_page) { 366 printk(KERN_ALERT "md: out of memory.\n"); 367 return -EINVAL; 368 } 369 370 return 0; 371} 372 373static void free_disk_sb(mdk_rdev_t * rdev) 374{ 375 if (rdev->sb_page) { 376 put_page(rdev->sb_page); 377 rdev->sb_loaded = 0; 378 rdev->sb_page = NULL; 379 rdev->sb_offset = 0; 380 rdev->size = 0; 381 } 382} 383 384 385static int super_written(struct bio *bio, unsigned int bytes_done, int error) 386{ 387 mdk_rdev_t *rdev = bio->bi_private; 388 mddev_t *mddev = rdev->mddev; 389 if (bio->bi_size) 390 return 1; 391 392 if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) { 393 printk("md: super_written gets error=%d, uptodate=%d\n", 394 error, test_bit(BIO_UPTODATE, &bio->bi_flags)); 395 WARN_ON(test_bit(BIO_UPTODATE, &bio->bi_flags)); 396 md_error(mddev, rdev); 397 } 398 399 if (atomic_dec_and_test(&mddev->pending_writes)) 400 wake_up(&mddev->sb_wait); 401 bio_put(bio); 402 return 0; 403} 404 405static int super_written_barrier(struct bio *bio, unsigned int bytes_done, int error) 406{ 407 struct bio *bio2 = bio->bi_private; 408 mdk_rdev_t *rdev = bio2->bi_private; 409 mddev_t *mddev = rdev->mddev; 410 if (bio->bi_size) 411 return 1; 412 413 if (!test_bit(BIO_UPTODATE, &bio->bi_flags) && 414 error == -EOPNOTSUPP) { 415 unsigned long flags; 416 /* barriers don't appear to be 
supported :-( */ 417 set_bit(BarriersNotsupp, &rdev->flags); 418 mddev->barriers_work = 0; 419 spin_lock_irqsave(&mddev->write_lock, flags); 420 bio2->bi_next = mddev->biolist; 421 mddev->biolist = bio2; 422 spin_unlock_irqrestore(&mddev->write_lock, flags); 423 wake_up(&mddev->sb_wait); 424 bio_put(bio); 425 return 0; 426 } 427 bio_put(bio2); 428 bio->bi_private = rdev; 429 return super_written(bio, bytes_done, error); 430} 431 432void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, 433 sector_t sector, int size, struct page *page) 434{ 435 /* write first size bytes of page to sector of rdev 436 * Increment mddev->pending_writes before returning 437 * and decrement it on completion, waking up sb_wait 438 * if zero is reached. 439 * If an error occurred, call md_error 440 * 441 * As we might need to resubmit the request if BIO_RW_BARRIER 442 * causes ENOTSUPP, we allocate a spare bio... 443 */ 444 struct bio *bio = bio_alloc(GFP_NOIO, 1); 445 int rw = (1<<BIO_RW) | (1<<BIO_RW_SYNC); 446 447 bio->bi_bdev = rdev->bdev; 448 bio->bi_sector = sector; 449 bio_add_page(bio, page, size, 0); 450 bio->bi_private = rdev; 451 bio->bi_end_io = super_written; 452 bio->bi_rw = rw; 453 454 atomic_inc(&mddev->pending_writes); 455 if (!test_bit(BarriersNotsupp, &rdev->flags)) { 456 struct bio *rbio; 457 rw |= (1<<BIO_RW_BARRIER); 458 rbio = bio_clone(bio, GFP_NOIO); 459 rbio->bi_private = bio; 460 rbio->bi_end_io = super_written_barrier; 461 submit_bio(rw, rbio); 462 } else 463 submit_bio(rw, bio); 464} 465 466void md_super_wait(mddev_t *mddev) 467{ 468 /* wait for all superblock writes that were scheduled to complete. 
469 * if any had to be retried (due to BARRIER problems), retry them 470 */ 471 DEFINE_WAIT(wq); 472 for(;;) { 473 prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE); 474 if (atomic_read(&mddev->pending_writes)==0) 475 break; 476 while (mddev->biolist) { 477 struct bio *bio; 478 spin_lock_irq(&mddev->write_lock); 479 bio = mddev->biolist; 480 mddev->biolist = bio->bi_next ; 481 bio->bi_next = NULL; 482 spin_unlock_irq(&mddev->write_lock); 483 submit_bio(bio->bi_rw, bio); 484 } 485 schedule(); 486 } 487 finish_wait(&mddev->sb_wait, &wq); 488} 489 490static int bi_complete(struct bio *bio, unsigned int bytes_done, int error) 491{ 492 if (bio->bi_size) 493 return 1; 494 495 complete((struct completion*)bio->bi_private); 496 return 0; 497} 498 499int sync_page_io(struct block_device *bdev, sector_t sector, int size, 500 struct page *page, int rw) 501{ 502 struct bio *bio = bio_alloc(GFP_NOIO, 1); 503 struct completion event; 504 int ret; 505 506 rw |= (1 << BIO_RW_SYNC); 507 508 bio->bi_bdev = bdev; 509 bio->bi_sector = sector; 510 bio_add_page(bio, page, size, 0); 511 init_completion(&event); 512 bio->bi_private = &event; 513 bio->bi_end_io = bi_complete; 514 submit_bio(rw, bio); 515 wait_for_completion(&event); 516 517 ret = test_bit(BIO_UPTODATE, &bio->bi_flags); 518 bio_put(bio); 519 return ret; 520} 521EXPORT_SYMBOL_GPL(sync_page_io); 522 523static int read_disk_sb(mdk_rdev_t * rdev, int size) 524{ 525 char b[BDEVNAME_SIZE]; 526 if (!rdev->sb_page) { 527 MD_BUG(); 528 return -EINVAL; 529 } 530 if (rdev->sb_loaded) 531 return 0; 532 533 534 if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, size, rdev->sb_page, READ)) 535 goto fail; 536 rdev->sb_loaded = 1; 537 return 0; 538 539fail: 540 printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n", 541 bdevname(rdev->bdev,b)); 542 return -EINVAL; 543} 544 545static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2) 546{ 547 if ( (sb1->set_uuid0 == sb2->set_uuid0) && 548 (sb1->set_uuid1 == 
sb2->set_uuid1) && 549 (sb1->set_uuid2 == sb2->set_uuid2) && 550 (sb1->set_uuid3 == sb2->set_uuid3)) 551 552 return 1; 553 554 return 0; 555} 556 557 558static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) 559{ 560 int ret; 561 mdp_super_t *tmp1, *tmp2; 562 563 tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL); 564 tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL); 565 566 if (!tmp1 || !tmp2) { 567 ret = 0; 568 printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n"); 569 goto abort; 570 } 571 572 *tmp1 = *sb1; 573 *tmp2 = *sb2; 574 575 /* 576 * nr_disks is not constant 577 */ 578 tmp1->nr_disks = 0; 579 tmp2->nr_disks = 0; 580 581 if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4)) 582 ret = 0; 583 else 584 ret = 1; 585 586abort: 587 kfree(tmp1); 588 kfree(tmp2); 589 return ret; 590} 591 592static unsigned int calc_sb_csum(mdp_super_t * sb) 593{ 594 unsigned int disk_csum, csum; 595 596 disk_csum = sb->sb_csum; 597 sb->sb_csum = 0; 598 csum = csum_partial((void *)sb, MD_SB_BYTES, 0); 599 sb->sb_csum = disk_csum; 600 return csum; 601} 602 603 604/* 605 * Handle superblock details. 606 * We want to be able to handle multiple superblock formats 607 * so we have a common interface to them all, and an array of 608 * different handlers. 609 * We rely on user-space to write the initial superblock, and support 610 * reading and updating of superblocks. 611 * Interface methods are: 612 * int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version) 613 * loads and validates a superblock on dev. 614 * if refdev != NULL, compare superblocks on both devices 615 * Return: 616 * 0 - dev has a superblock that is compatible with refdev 617 * 1 - dev has a superblock that is compatible and newer than refdev 618 * so dev should be used as the refdev in future 619 * -EINVAL superblock incompatible or invalid 620 * -othererror e.g. -EIO 621 * 622 * int validate_super(mddev_t *mddev, mdk_rdev_t *dev) 623 * Verify that dev is acceptable into mddev. 
624 * The first time, mddev->raid_disks will be 0, and data from 625 * dev should be merged in. Subsequent calls check that dev 626 * is new enough. Return 0 or -EINVAL 627 * 628 * void sync_super(mddev_t *mddev, mdk_rdev_t *dev) 629 * Update the superblock for rdev with data in mddev 630 * This does not write to disc. 631 * 632 */ 633 634struct super_type { 635 char *name; 636 struct module *owner; 637 int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version); 638 int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev); 639 void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev); 640}; 641 642/* 643 * load_super for 0.90.0 644 */ 645static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) 646{ 647 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 648 mdp_super_t *sb; 649 int ret; 650 sector_t sb_offset; 651 652 /* 653 * Calculate the position of the superblock, 654 * it's at the end of the disk. 655 * 656 * It also happens to be a multiple of 4Kb. 657 */ 658 sb_offset = calc_dev_sboffset(rdev->bdev); 659 rdev->sb_offset = sb_offset; 660 661 ret = read_disk_sb(rdev, MD_SB_BYTES); 662 if (ret) return ret; 663 664 ret = -EINVAL; 665 666 bdevname(rdev->bdev, b); 667 sb = (mdp_super_t*)page_address(rdev->sb_page); 668 669 if (sb->md_magic != MD_SB_MAGIC) { 670 printk(KERN_ERR "md: invalid raid superblock magic on %s\n", 671 b); 672 goto abort; 673 } 674 675 if (sb->major_version != 0 || 676 sb->minor_version < 90 || 677 sb->minor_version > 91) { 678 printk(KERN_WARNING "Bad version number %d.%d on %s\n", 679 sb->major_version, sb->minor_version, 680 b); 681 goto abort; 682 } 683 684 if (sb->raid_disks <= 0) 685 goto abort; 686 687 if (csum_fold(calc_sb_csum(sb)) != csum_fold(sb->sb_csum)) { 688 printk(KERN_WARNING "md: invalid superblock checksum on %s\n", 689 b); 690 goto abort; 691 } 692 693 rdev->preferred_minor = sb->md_minor; 694 rdev->data_offset = 0; 695 rdev->sb_size = MD_SB_BYTES; 696 697 if (sb->level == LEVEL_MULTIPATH) 698 
rdev->desc_nr = -1; 699 else 700 rdev->desc_nr = sb->this_disk.number; 701 702 if (refdev == 0) 703 ret = 1; 704 else { 705 __u64 ev1, ev2; 706 mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page); 707 if (!uuid_equal(refsb, sb)) { 708 printk(KERN_WARNING "md: %s has different UUID to %s\n", 709 b, bdevname(refdev->bdev,b2)); 710 goto abort; 711 } 712 if (!sb_equal(refsb, sb)) { 713 printk(KERN_WARNING "md: %s has same UUID" 714 " but different superblock to %s\n", 715 b, bdevname(refdev->bdev, b2)); 716 goto abort; 717 } 718 ev1 = md_event(sb); 719 ev2 = md_event(refsb); 720 if (ev1 > ev2) 721 ret = 1; 722 else 723 ret = 0; 724 } 725 rdev->size = calc_dev_size(rdev, sb->chunk_size); 726 727 if (rdev->size < sb->size && sb->level > 1) 728 /* "this cannot possibly happen" ... */ 729 ret = -EINVAL; 730 731 abort: 732 return ret; 733} 734 735/* 736 * validate_super for 0.90.0 737 */ 738static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) 739{ 740 mdp_disk_t *desc; 741 mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page); 742 __u64 ev1 = md_event(sb); 743 744 rdev->raid_disk = -1; 745 rdev->flags = 0; 746 if (mddev->raid_disks == 0) { 747 mddev->major_version = 0; 748 mddev->minor_version = sb->minor_version; 749 mddev->patch_version = sb->patch_version; 750 mddev->persistent = ! 
sb->not_persistent; 751 mddev->chunk_size = sb->chunk_size; 752 mddev->ctime = sb->ctime; 753 mddev->utime = sb->utime; 754 mddev->level = sb->level; 755 mddev->clevel[0] = 0; 756 mddev->layout = sb->layout; 757 mddev->raid_disks = sb->raid_disks; 758 mddev->size = sb->size; 759 mddev->events = ev1; 760 mddev->bitmap_offset = 0; 761 mddev->default_bitmap_offset = MD_SB_BYTES >> 9; 762 763 if (mddev->minor_version >= 91) { 764 mddev->reshape_position = sb->reshape_position; 765 mddev->delta_disks = sb->delta_disks; 766 mddev->new_level = sb->new_level; 767 mddev->new_layout = sb->new_layout; 768 mddev->new_chunk = sb->new_chunk; 769 } else { 770 mddev->reshape_position = MaxSector; 771 mddev->delta_disks = 0; 772 mddev->new_level = mddev->level; 773 mddev->new_layout = mddev->layout; 774 mddev->new_chunk = mddev->chunk_size; 775 } 776 777 if (sb->state & (1<<MD_SB_CLEAN)) 778 mddev->recovery_cp = MaxSector; 779 else { 780 if (sb->events_hi == sb->cp_events_hi && 781 sb->events_lo == sb->cp_events_lo) { 782 mddev->recovery_cp = sb->recovery_cp; 783 } else 784 mddev->recovery_cp = 0; 785 } 786 787 memcpy(mddev->uuid+0, &sb->set_uuid0, 4); 788 memcpy(mddev->uuid+4, &sb->set_uuid1, 4); 789 memcpy(mddev->uuid+8, &sb->set_uuid2, 4); 790 memcpy(mddev->uuid+12,&sb->set_uuid3, 4); 791 792 mddev->max_disks = MD_SB_DISKS; 793 794 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && 795 mddev->bitmap_file == NULL) { 796 if (mddev->level != 1 && mddev->level != 4 797 && mddev->level != 5 && mddev->level != 6 798 && mddev->level != 10) { 799 /* FIXME use a better test */ 800 printk(KERN_WARNING "md: bitmaps not supported for this level.\n"); 801 return -EINVAL; 802 } 803 mddev->bitmap_offset = mddev->default_bitmap_offset; 804 } 805 806 } else if (mddev->pers == NULL) { 807 /* Insist on good event counter while assembling */ 808 ++ev1; 809 if (ev1 < mddev->events) 810 return -EINVAL; 811 } else if (mddev->bitmap) { 812 /* if adding to array with a bitmap, then we can accept an 813 * 
older device ... but not too old. 814 */ 815 if (ev1 < mddev->bitmap->events_cleared) 816 return 0; 817 } else { 818 if (ev1 < mddev->events) 819 /* just a hot-add of a new device, leave raid_disk at -1 */ 820 return 0; 821 } 822 823 if (mddev->level != LEVEL_MULTIPATH) { 824 desc = sb->disks + rdev->desc_nr; 825 826 if (desc->state & (1<<MD_DISK_FAULTY)) 827 set_bit(Faulty, &rdev->flags); 828 else if (desc->state & (1<<MD_DISK_SYNC) /* && 829 desc->raid_disk < mddev->raid_disks */) { 830 set_bit(In_sync, &rdev->flags); 831 rdev->raid_disk = desc->raid_disk; 832 } 833 if (desc->state & (1<<MD_DISK_WRITEMOSTLY)) 834 set_bit(WriteMostly, &rdev->flags); 835 } else /* MULTIPATH are always insync */ 836 set_bit(In_sync, &rdev->flags); 837 return 0; 838} 839 840/* 841 * sync_super for 0.90.0 842 */ 843static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) 844{ 845 mdp_super_t *sb; 846 struct list_head *tmp; 847 mdk_rdev_t *rdev2; 848 int next_spare = mddev->raid_disks; 849 850 851 /* make rdev->sb match mddev data.. 852 * 853 * 1/ zero out disks 854 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare); 855 * 3/ any empty disks < next_spare become removed 856 * 857 * disks[0] gets initialised to REMOVED because 858 * we cannot be sure from other fields if it has 859 * been initialised or not. 
860 */ 861 int i; 862 int active=0, working=0,failed=0,spare=0,nr_disks=0; 863 864 rdev->sb_size = MD_SB_BYTES; 865 866 sb = (mdp_super_t*)page_address(rdev->sb_page); 867 868 memset(sb, 0, sizeof(*sb)); 869 870 sb->md_magic = MD_SB_MAGIC; 871 sb->major_version = mddev->major_version; 872 sb->patch_version = mddev->patch_version; 873 sb->gvalid_words = 0; /* ignored */ 874 memcpy(&sb->set_uuid0, mddev->uuid+0, 4); 875 memcpy(&sb->set_uuid1, mddev->uuid+4, 4); 876 memcpy(&sb->set_uuid2, mddev->uuid+8, 4); 877 memcpy(&sb->set_uuid3, mddev->uuid+12,4); 878 879 sb->ctime = mddev->ctime; 880 sb->level = mddev->level; 881 sb->size = mddev->size; 882 sb->raid_disks = mddev->raid_disks; 883 sb->md_minor = mddev->md_minor; 884 sb->not_persistent = !mddev->persistent; 885 sb->utime = mddev->utime; 886 sb->state = 0; 887 sb->events_hi = (mddev->events>>32); 888 sb->events_lo = (u32)mddev->events; 889 890 if (mddev->reshape_position == MaxSector) 891 sb->minor_version = 90; 892 else { 893 sb->minor_version = 91; 894 sb->reshape_position = mddev->reshape_position; 895 sb->new_level = mddev->new_level; 896 sb->delta_disks = mddev->delta_disks; 897 sb->new_layout = mddev->new_layout; 898 sb->new_chunk = mddev->new_chunk; 899 } 900 mddev->minor_version = sb->minor_version; 901 if (mddev->in_sync) 902 { 903 sb->recovery_cp = mddev->recovery_cp; 904 sb->cp_events_hi = (mddev->events>>32); 905 sb->cp_events_lo = (u32)mddev->events; 906 if (mddev->recovery_cp == MaxSector) 907 sb->state = (1<< MD_SB_CLEAN); 908 } else 909 sb->recovery_cp = 0; 910 911 sb->layout = mddev->layout; 912 sb->chunk_size = mddev->chunk_size; 913 914 if (mddev->bitmap && mddev->bitmap_file == NULL) 915 sb->state |= (1<<MD_SB_BITMAP_PRESENT); 916 917 sb->disks[0].state = (1<<MD_DISK_REMOVED); 918 ITERATE_RDEV(mddev,rdev2,tmp) { 919 mdp_disk_t *d; 920 int desc_nr; 921 if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags) 922 && !test_bit(Faulty, &rdev2->flags)) 923 desc_nr = rdev2->raid_disk; 924 else 
925 desc_nr = next_spare++; 926 rdev2->desc_nr = desc_nr; 927 d = &sb->disks[rdev2->desc_nr]; 928 nr_disks++; 929 d->number = rdev2->desc_nr; 930 d->major = MAJOR(rdev2->bdev->bd_dev); 931 d->minor = MINOR(rdev2->bdev->bd_dev); 932 if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags) 933 && !test_bit(Faulty, &rdev2->flags)) 934 d->raid_disk = rdev2->raid_disk; 935 else 936 d->raid_disk = rdev2->desc_nr; /* compatibility */ 937 if (test_bit(Faulty, &rdev2->flags)) 938 d->state = (1<<MD_DISK_FAULTY); 939 else if (test_bit(In_sync, &rdev2->flags)) { 940 d->state = (1<<MD_DISK_ACTIVE); 941 d->state |= (1<<MD_DISK_SYNC); 942 active++; 943 working++; 944 } else { 945 d->state = 0; 946 spare++; 947 working++; 948 } 949 if (test_bit(WriteMostly, &rdev2->flags)) 950 d->state |= (1<<MD_DISK_WRITEMOSTLY); 951 } 952 /* now set the "removed" and "faulty" bits on any missing devices */ 953 for (i=0 ; i < mddev->raid_disks ; i++) { 954 mdp_disk_t *d = &sb->disks[i]; 955 if (d->state == 0 && d->number == 0) { 956 d->number = i; 957 d->raid_disk = i; 958 d->state = (1<<MD_DISK_REMOVED); 959 d->state |= (1<<MD_DISK_FAULTY); 960 failed++; 961 } 962 } 963 sb->nr_disks = nr_disks; 964 sb->active_disks = active; 965 sb->working_disks = working; 966 sb->failed_disks = failed; 967 sb->spare_disks = spare; 968 969 sb->this_disk = sb->disks[rdev->desc_nr]; 970 sb->sb_csum = calc_sb_csum(sb); 971} 972 973/* 974 * version 1 superblock 975 */ 976 977static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb) 978{ 979 __le32 disk_csum; 980 u32 csum; 981 unsigned long long newcsum; 982 int size = 256 + le32_to_cpu(sb->max_dev)*2; 983 __le32 *isuper = (__le32*)sb; 984 int i; 985 986 disk_csum = sb->sb_csum; 987 sb->sb_csum = 0; 988 newcsum = 0; 989 for (i=0; size>=4; size -= 4 ) 990 newcsum += le32_to_cpu(*isuper++); 991 992 if (size == 2) 993 newcsum += le16_to_cpu(*(__le16*) isuper); 994 995 csum = (newcsum & 0xffffffff) + (newcsum >> 32); 996 sb->sb_csum = disk_csum; 997 return 
cpu_to_le32(csum); 998} 999 1000static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) 1001{ 1002 struct mdp_superblock_1 *sb; 1003 int ret; 1004 sector_t sb_offset; 1005 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 1006 int bmask; 1007 1008 /* 1009 * Calculate the position of the superblock. 1010 * It is always aligned to a 4K boundary and 1011 * depeding on minor_version, it can be: 1012 * 0: At least 8K, but less than 12K, from end of device 1013 * 1: At start of device 1014 * 2: 4K from start of device. 1015 */ 1016 switch(minor_version) { 1017 case 0: 1018 sb_offset = rdev->bdev->bd_inode->i_size >> 9; 1019 sb_offset -= 8*2; 1020 sb_offset &= ~(sector_t)(4*2-1); 1021 /* convert from sectors to K */ 1022 sb_offset /= 2; 1023 break; 1024 case 1: 1025 sb_offset = 0; 1026 break; 1027 case 2: 1028 sb_offset = 4; 1029 break; 1030 default: 1031 return -EINVAL; 1032 } 1033 rdev->sb_offset = sb_offset; 1034 1035 /* superblock is rarely larger than 1K, but it can be larger, 1036 * and it is safe to read 4k, so we do that 1037 */ 1038 ret = read_disk_sb(rdev, 4096); 1039 if (ret) return ret; 1040 1041 1042 sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 1043 1044 if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || 1045 sb->major_version != cpu_to_le32(1) || 1046 le32_to_cpu(sb->max_dev) > (4096-256)/2 || 1047 le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) || 1048 (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0) 1049 return -EINVAL; 1050 1051 if (calc_sb_1_csum(sb) != sb->sb_csum) { 1052 printk("md: invalid superblock checksum on %s\n", 1053 bdevname(rdev->bdev,b)); 1054 return -EINVAL; 1055 } 1056 if (le64_to_cpu(sb->data_size) < 10) { 1057 printk("md: data_size too small on %s\n", 1058 bdevname(rdev->bdev,b)); 1059 return -EINVAL; 1060 } 1061 rdev->preferred_minor = 0xffff; 1062 rdev->data_offset = le64_to_cpu(sb->data_offset); 1063 atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read)); 1064 1065 rdev->sb_size 
= le32_to_cpu(sb->max_dev) * 2 + 256; 1066 bmask = queue_hardsect_size(rdev->bdev->bd_disk->queue)-1; 1067 if (rdev->sb_size & bmask) 1068 rdev-> sb_size = (rdev->sb_size | bmask)+1; 1069 1070 if (sb->level == cpu_to_le32(LEVEL_MULTIPATH)) 1071 rdev->desc_nr = -1; 1072 else 1073 rdev->desc_nr = le32_to_cpu(sb->dev_number); 1074 1075 if (refdev == 0) 1076 ret = 1; 1077 else { 1078 __u64 ev1, ev2; 1079 struct mdp_superblock_1 *refsb = 1080 (struct mdp_superblock_1*)page_address(refdev->sb_page); 1081 1082 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 || 1083 sb->level != refsb->level || 1084 sb->layout != refsb->layout || 1085 sb->chunksize != refsb->chunksize) { 1086 printk(KERN_WARNING "md: %s has strangely different" 1087 " superblock to %s\n", 1088 bdevname(rdev->bdev,b), 1089 bdevname(refdev->bdev,b2)); 1090 return -EINVAL; 1091 } 1092 ev1 = le64_to_cpu(sb->events); 1093 ev2 = le64_to_cpu(refsb->events); 1094 1095 if (ev1 > ev2) 1096 ret = 1; 1097 else 1098 ret = 0; 1099 } 1100 if (minor_version) 1101 rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2; 1102 else 1103 rdev->size = rdev->sb_offset; 1104 if (rdev->size < le64_to_cpu(sb->data_size)/2) 1105 return -EINVAL; 1106 rdev->size = le64_to_cpu(sb->data_size)/2; 1107 if (le32_to_cpu(sb->chunksize)) 1108 rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1); 1109 1110 if (le64_to_cpu(sb->size) > rdev->size*2) 1111 return -EINVAL; 1112 return ret; 1113} 1114 1115static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) 1116{ 1117 struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 1118 __u64 ev1 = le64_to_cpu(sb->events); 1119 1120 rdev->raid_disk = -1; 1121 rdev->flags = 0; 1122 if (mddev->raid_disks == 0) { 1123 mddev->major_version = 1; 1124 mddev->patch_version = 0; 1125 mddev->persistent = 1; 1126 mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9; 1127 mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1); 1128 
mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1); 1129 mddev->level = le32_to_cpu(sb->level); 1130 mddev->clevel[0] = 0; 1131 mddev->layout = le32_to_cpu(sb->layout); 1132 mddev->raid_disks = le32_to_cpu(sb->raid_disks); 1133 mddev->size = le64_to_cpu(sb->size)/2; 1134 mddev->events = ev1; 1135 mddev->bitmap_offset = 0; 1136 mddev->default_bitmap_offset = 1024 >> 9; 1137 1138 mddev->recovery_cp = le64_to_cpu(sb->resync_offset); 1139 memcpy(mddev->uuid, sb->set_uuid, 16); 1140 1141 mddev->max_disks = (4096-256)/2; 1142 1143 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && 1144 mddev->bitmap_file == NULL ) { 1145 if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6 1146 && mddev->level != 10) { 1147 printk(KERN_WARNING "md: bitmaps not supported for this level.\n"); 1148 return -EINVAL; 1149 } 1150 mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset); 1151 } 1152 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { 1153 mddev->reshape_position = le64_to_cpu(sb->reshape_position); 1154 mddev->delta_disks = le32_to_cpu(sb->delta_disks); 1155 mddev->new_level = le32_to_cpu(sb->new_level); 1156 mddev->new_layout = le32_to_cpu(sb->new_layout); 1157 mddev->new_chunk = le32_to_cpu(sb->new_chunk)<<9; 1158 } else { 1159 mddev->reshape_position = MaxSector; 1160 mddev->delta_disks = 0; 1161 mddev->new_level = mddev->level; 1162 mddev->new_layout = mddev->layout; 1163 mddev->new_chunk = mddev->chunk_size; 1164 } 1165 1166 } else if (mddev->pers == NULL) { 1167 /* Insist of good event counter while assembling */ 1168 ++ev1; 1169 if (ev1 < mddev->events) 1170 return -EINVAL; 1171 } else if (mddev->bitmap) { 1172 /* If adding to array with a bitmap, then we can accept an 1173 * older device, but not too old. 
1174 */ 1175 if (ev1 < mddev->bitmap->events_cleared) 1176 return 0; 1177 } else { 1178 if (ev1 < mddev->events) 1179 /* just a hot-add of a new device, leave raid_disk at -1 */ 1180 return 0; 1181 } 1182 if (mddev->level != LEVEL_MULTIPATH) { 1183 int role; 1184 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); 1185 switch(role) { 1186 case 0xffff: /* spare */ 1187 break; 1188 case 0xfffe: /* faulty */ 1189 set_bit(Faulty, &rdev->flags); 1190 break; 1191 default: 1192 if ((le32_to_cpu(sb->feature_map) & 1193 MD_FEATURE_RECOVERY_OFFSET)) 1194 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); 1195 else 1196 set_bit(In_sync, &rdev->flags); 1197 rdev->raid_disk = role; 1198 break; 1199 } 1200 if (sb->devflags & WriteMostly1) 1201 set_bit(WriteMostly, &rdev->flags); 1202 } else /* MULTIPATH are always insync */ 1203 set_bit(In_sync, &rdev->flags); 1204 1205 return 0; 1206} 1207 1208static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) 1209{ 1210 struct mdp_superblock_1 *sb; 1211 struct list_head *tmp; 1212 mdk_rdev_t *rdev2; 1213 int max_dev, i; 1214 /* make rdev->sb match mddev and rdev data. 
*/ 1215 1216 sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 1217 1218 sb->feature_map = 0; 1219 sb->pad0 = 0; 1220 sb->recovery_offset = cpu_to_le64(0); 1221 memset(sb->pad1, 0, sizeof(sb->pad1)); 1222 memset(sb->pad2, 0, sizeof(sb->pad2)); 1223 memset(sb->pad3, 0, sizeof(sb->pad3)); 1224 1225 sb->utime = cpu_to_le64((__u64)mddev->utime); 1226 sb->events = cpu_to_le64(mddev->events); 1227 if (mddev->in_sync) 1228 sb->resync_offset = cpu_to_le64(mddev->recovery_cp); 1229 else 1230 sb->resync_offset = cpu_to_le64(0); 1231 1232 sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors)); 1233 1234 sb->raid_disks = cpu_to_le32(mddev->raid_disks); 1235 sb->size = cpu_to_le64(mddev->size<<1); 1236 1237 if (mddev->bitmap && mddev->bitmap_file == NULL) { 1238 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset); 1239 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); 1240 } 1241 1242 if (rdev->raid_disk >= 0 && 1243 !test_bit(In_sync, &rdev->flags) && 1244 rdev->recovery_offset > 0) { 1245 sb->feature_map |= cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); 1246 sb->recovery_offset = cpu_to_le64(rdev->recovery_offset); 1247 } 1248 1249 if (mddev->reshape_position != MaxSector) { 1250 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); 1251 sb->reshape_position = cpu_to_le64(mddev->reshape_position); 1252 sb->new_layout = cpu_to_le32(mddev->new_layout); 1253 sb->delta_disks = cpu_to_le32(mddev->delta_disks); 1254 sb->new_level = cpu_to_le32(mddev->new_level); 1255 sb->new_chunk = cpu_to_le32(mddev->new_chunk>>9); 1256 } 1257 1258 max_dev = 0; 1259 ITERATE_RDEV(mddev,rdev2,tmp) 1260 if (rdev2->desc_nr+1 > max_dev) 1261 max_dev = rdev2->desc_nr+1; 1262 1263 sb->max_dev = cpu_to_le32(max_dev); 1264 for (i=0; i<max_dev;i++) 1265 sb->dev_roles[i] = cpu_to_le16(0xfffe); 1266 1267 ITERATE_RDEV(mddev,rdev2,tmp) { 1268 i = rdev2->desc_nr; 1269 if (test_bit(Faulty, &rdev2->flags)) 1270 sb->dev_roles[i] = cpu_to_le16(0xfffe); 1271 else if 
(test_bit(In_sync, &rdev2->flags)) 1272 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 1273 else if (rdev2->raid_disk >= 0 && rdev2->recovery_offset > 0) 1274 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 1275 else 1276 sb->dev_roles[i] = cpu_to_le16(0xffff); 1277 } 1278 1279 sb->sb_csum = calc_sb_1_csum(sb); 1280} 1281 1282 1283static struct super_type super_types[] = { 1284 [0] = { 1285 .name = "0.90.0", 1286 .owner = THIS_MODULE, 1287 .load_super = super_90_load, 1288 .validate_super = super_90_validate, 1289 .sync_super = super_90_sync, 1290 }, 1291 [1] = { 1292 .name = "md-1", 1293 .owner = THIS_MODULE, 1294 .load_super = super_1_load, 1295 .validate_super = super_1_validate, 1296 .sync_super = super_1_sync, 1297 }, 1298}; 1299 1300static mdk_rdev_t * match_dev_unit(mddev_t *mddev, mdk_rdev_t *dev) 1301{ 1302 struct list_head *tmp; 1303 mdk_rdev_t *rdev; 1304 1305 ITERATE_RDEV(mddev,rdev,tmp) 1306 if (rdev->bdev->bd_contains == dev->bdev->bd_contains) 1307 return rdev; 1308 1309 return NULL; 1310} 1311 1312static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) 1313{ 1314 struct list_head *tmp; 1315 mdk_rdev_t *rdev; 1316 1317 ITERATE_RDEV(mddev1,rdev,tmp) 1318 if (match_dev_unit(mddev2, rdev)) 1319 return 1; 1320 1321 return 0; 1322} 1323 1324static LIST_HEAD(pending_raid_disks); 1325 1326static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) 1327{ 1328 mdk_rdev_t *same_pdev; 1329 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 1330 struct kobject *ko; 1331 char *s; 1332 1333 if (rdev->mddev) { 1334 MD_BUG(); 1335 return -EINVAL; 1336 } 1337 /* make sure rdev->size exceeds mddev->size */ 1338 if (rdev->size && (mddev->size == 0 || rdev->size < mddev->size)) { 1339 if (mddev->pers) 1340 /* Cannot change size, so fail */ 1341 return -ENOSPC; 1342 else 1343 mddev->size = rdev->size; 1344 } 1345 same_pdev = match_dev_unit(mddev, rdev); 1346 if (same_pdev) 1347 printk(KERN_WARNING 1348 "%s: WARNING: %s appears to be on the same physical" 1349 " 
disk as %s. True\n protection against single-disk" 1350 " failure might be compromised.\n", 1351 mdname(mddev), bdevname(rdev->bdev,b), 1352 bdevname(same_pdev->bdev,b2)); 1353 1354 /* Verify rdev->desc_nr is unique. 1355 * If it is -1, assign a free number, else 1356 * check number is not in use 1357 */ 1358 if (rdev->desc_nr < 0) { 1359 int choice = 0; 1360 if (mddev->pers) choice = mddev->raid_disks; 1361 while (find_rdev_nr(mddev, choice)) 1362 choice++; 1363 rdev->desc_nr = choice; 1364 } else { 1365 if (find_rdev_nr(mddev, rdev->desc_nr)) 1366 return -EBUSY; 1367 } 1368 bdevname(rdev->bdev,b); 1369 if (kobject_set_name(&rdev->kobj, "dev-%s", b) < 0) 1370 return -ENOMEM; 1371 while ( (s=strchr(rdev->kobj.k_name, '/')) != NULL) 1372 *s = '!'; 1373 1374 list_add(&rdev->same_set, &mddev->disks); 1375 rdev->mddev = mddev; 1376 printk(KERN_INFO "md: bind<%s>\n", b); 1377 1378 rdev->kobj.parent = &mddev->kobj; 1379 kobject_add(&rdev->kobj); 1380 1381 if (rdev->bdev->bd_part) 1382 ko = &rdev->bdev->bd_part->kobj; 1383 else 1384 ko = &rdev->bdev->bd_disk->kobj; 1385 sysfs_create_link(&rdev->kobj, ko, "block"); 1386 bd_claim_by_disk(rdev->bdev, rdev, mddev->gendisk); 1387 return 0; 1388} 1389 1390static void unbind_rdev_from_array(mdk_rdev_t * rdev) 1391{ 1392 char b[BDEVNAME_SIZE]; 1393 if (!rdev->mddev) { 1394 MD_BUG(); 1395 return; 1396 } 1397 bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk); 1398 list_del_init(&rdev->same_set); 1399 printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b)); 1400 rdev->mddev = NULL; 1401 sysfs_remove_link(&rdev->kobj, "block"); 1402 kobject_del(&rdev->kobj); 1403} 1404 1405/* 1406 * prevent the device from being mounted, repartitioned or 1407 * otherwise reused by a RAID array (or any other kernel 1408 * subsystem), by bd_claiming the device. 
1409 */ 1410static int lock_rdev(mdk_rdev_t *rdev, dev_t dev) 1411{ 1412 int err = 0; 1413 struct block_device *bdev; 1414 char b[BDEVNAME_SIZE]; 1415 1416 bdev = open_partition_by_devnum(dev, FMODE_READ|FMODE_WRITE); 1417 if (IS_ERR(bdev)) { 1418 printk(KERN_ERR "md: could not open %s.\n", 1419 __bdevname(dev, b)); 1420 return PTR_ERR(bdev); 1421 } 1422 err = bd_claim(bdev, rdev); 1423 if (err) { 1424 printk(KERN_ERR "md: could not bd_claim %s.\n", 1425 bdevname(bdev, b)); 1426 blkdev_put_partition(bdev); 1427 return err; 1428 } 1429 rdev->bdev = bdev; 1430 return err; 1431} 1432 1433static void unlock_rdev(mdk_rdev_t *rdev) 1434{ 1435 struct block_device *bdev = rdev->bdev; 1436 rdev->bdev = NULL; 1437 if (!bdev) 1438 MD_BUG(); 1439 bd_release(bdev); 1440 blkdev_put_partition(bdev); 1441} 1442 1443void md_autodetect_dev(dev_t dev); 1444 1445static void export_rdev(mdk_rdev_t * rdev) 1446{ 1447 char b[BDEVNAME_SIZE]; 1448 printk(KERN_INFO "md: export_rdev(%s)\n", 1449 bdevname(rdev->bdev,b)); 1450 if (rdev->mddev) 1451 MD_BUG(); 1452 free_disk_sb(rdev); 1453 list_del_init(&rdev->same_set); 1454#ifndef MODULE 1455 md_autodetect_dev(rdev->bdev->bd_dev); 1456#endif 1457 unlock_rdev(rdev); 1458 kobject_put(&rdev->kobj); 1459} 1460 1461static void kick_rdev_from_array(mdk_rdev_t * rdev) 1462{ 1463 unbind_rdev_from_array(rdev); 1464 export_rdev(rdev); 1465} 1466 1467static void export_array(mddev_t *mddev) 1468{ 1469 struct list_head *tmp; 1470 mdk_rdev_t *rdev; 1471 1472 ITERATE_RDEV(mddev,rdev,tmp) { 1473 if (!rdev->mddev) { 1474 MD_BUG(); 1475 continue; 1476 } 1477 kick_rdev_from_array(rdev); 1478 } 1479 if (!list_empty(&mddev->disks)) 1480 MD_BUG(); 1481 mddev->raid_disks = 0; 1482 mddev->major_version = 0; 1483} 1484 1485static void print_desc(mdp_disk_t *desc) 1486{ 1487 printk(" DISK<N:%d,(%d,%d),R:%d,S:%d>\n", desc->number, 1488 desc->major,desc->minor,desc->raid_disk,desc->state); 1489} 1490 1491static void print_sb(mdp_super_t *sb) 1492{ 1493 int i; 1494 1495 
printk(KERN_INFO 1496 "md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n", 1497 sb->major_version, sb->minor_version, sb->patch_version, 1498 sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3, 1499 sb->ctime); 1500 printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", 1501 sb->level, sb->size, sb->nr_disks, sb->raid_disks, 1502 sb->md_minor, sb->layout, sb->chunk_size); 1503 printk(KERN_INFO "md: UT:%08x ST:%d AD:%d WD:%d" 1504 " FD:%d SD:%d CSUM:%08x E:%08lx\n", 1505 sb->utime, sb->state, sb->active_disks, sb->working_disks, 1506 sb->failed_disks, sb->spare_disks, 1507 sb->sb_csum, (unsigned long)sb->events_lo); 1508 1509 printk(KERN_INFO); 1510 for (i = 0; i < MD_SB_DISKS; i++) { 1511 mdp_disk_t *desc; 1512 1513 desc = sb->disks + i; 1514 if (desc->number || desc->major || desc->minor || 1515 desc->raid_disk || (desc->state && (desc->state != 4))) { 1516 printk(" D %2d: ", i); 1517 print_desc(desc); 1518 } 1519 } 1520 printk(KERN_INFO "md: THIS: "); 1521 print_desc(&sb->this_disk); 1522 1523} 1524 1525static void print_rdev(mdk_rdev_t *rdev) 1526{ 1527 char b[BDEVNAME_SIZE]; 1528 printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%u\n", 1529 bdevname(rdev->bdev,b), (unsigned long long)rdev->size, 1530 test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags), 1531 rdev->desc_nr); 1532 if (rdev->sb_loaded) { 1533 printk(KERN_INFO "md: rdev superblock:\n"); 1534 print_sb((mdp_super_t*)page_address(rdev->sb_page)); 1535 } else 1536 printk(KERN_INFO "md: no rdev superblock!\n"); 1537} 1538 1539static void md_print_devices(void) 1540{ 1541 struct list_head *tmp, *tmp2; 1542 mdk_rdev_t *rdev; 1543 mddev_t *mddev; 1544 char b[BDEVNAME_SIZE]; 1545 1546 printk("\n"); 1547 printk("md: **********************************\n"); 1548 printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n"); 1549 printk("md: **********************************\n"); 1550 ITERATE_MDDEV(mddev,tmp) { 1551 1552 if (mddev->bitmap) 1553 bitmap_print_sb(mddev->bitmap); 1554 
else 1555 printk("%s: ", mdname(mddev)); 1556 ITERATE_RDEV(mddev,rdev,tmp2) 1557 printk("<%s>", bdevname(rdev->bdev,b)); 1558 printk("\n"); 1559 1560 ITERATE_RDEV(mddev,rdev,tmp2) 1561 print_rdev(rdev); 1562 } 1563 printk("md: **********************************\n"); 1564 printk("\n"); 1565} 1566 1567 1568static void sync_sbs(mddev_t * mddev, int nospares) 1569{ 1570 /* Update each superblock (in-memory image), but 1571 * if we are allowed to, skip spares which already 1572 * have the right event counter, or have one earlier 1573 * (which would mean they aren't being marked as dirty 1574 * with the rest of the array) 1575 */ 1576 mdk_rdev_t *rdev; 1577 struct list_head *tmp; 1578 1579 ITERATE_RDEV(mddev,rdev,tmp) { 1580 if (rdev->sb_events == mddev->events || 1581 (nospares && 1582 rdev->raid_disk < 0 && 1583 (rdev->sb_events&1)==0 && 1584 rdev->sb_events+1 == mddev->events)) { 1585 /* Don't update this superblock */ 1586 rdev->sb_loaded = 2; 1587 } else { 1588 super_types[mddev->major_version]. 1589 sync_super(mddev, rdev); 1590 rdev->sb_loaded = 1; 1591 } 1592 } 1593} 1594 1595static void md_update_sb(mddev_t * mddev, int force_change) 1596{ 1597 int err; 1598 struct list_head *tmp; 1599 mdk_rdev_t *rdev; 1600 int sync_req; 1601 int nospares = 0; 1602 1603repeat: 1604 spin_lock_irq(&mddev->write_lock); 1605 1606 set_bit(MD_CHANGE_PENDING, &mddev->flags); 1607 if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags)) 1608 force_change = 1; 1609 if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags)) 1610 /* just a clean<-> dirty transition, possibly leave spares alone, 1611 * though if events isn't the right even/odd, we will have to do 1612 * spares after all 1613 */ 1614 nospares = 1; 1615 if (force_change) 1616 nospares = 0; 1617 if (mddev->degraded) 1618 /* If the array is degraded, then skipping spares is both 1619 * dangerous and fairly pointless. 
1620 * Dangerous because a device that was removed from the array 1621 * might have a event_count that still looks up-to-date, 1622 * so it can be re-added without a resync. 1623 * Pointless because if there are any spares to skip, 1624 * then a recovery will happen and soon that array won't 1625 * be degraded any more and the spare can go back to sleep then. 1626 */ 1627 nospares = 0; 1628 1629 sync_req = mddev->in_sync; 1630 mddev->utime = get_seconds(); 1631 1632 /* If this is just a dirty<->clean transition, and the array is clean 1633 * and 'events' is odd, we can roll back to the previous clean state */ 1634 if (nospares 1635 && (mddev->in_sync && mddev->recovery_cp == MaxSector) 1636 && (mddev->events & 1)) 1637 mddev->events--; 1638 else { 1639 /* otherwise we have to go forward and ... */ 1640 mddev->events ++; 1641 if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */ 1642 /* .. if the array isn't clean, insist on an odd 'events' */ 1643 if ((mddev->events&1)==0) { 1644 mddev->events++; 1645 nospares = 0; 1646 } 1647 } else { 1648 /* otherwise insist on an even 'events' (for clean states) */ 1649 if ((mddev->events&1)) { 1650 mddev->events++; 1651 nospares = 0; 1652 } 1653 } 1654 } 1655 1656 if (!mddev->events) { 1657 /* 1658 * oops, this 64-bit counter should never wrap. 
1659 * Either we are in around ~1 trillion A.C., assuming 1660 * 1 reboot per second, or we have a bug: 1661 */ 1662 MD_BUG(); 1663 mddev->events --; 1664 } 1665 sync_sbs(mddev, nospares); 1666 1667 /* 1668 * do not write anything to disk if using 1669 * nonpersistent superblocks 1670 */ 1671 if (!mddev->persistent) { 1672 clear_bit(MD_CHANGE_PENDING, &mddev->flags); 1673 spin_unlock_irq(&mddev->write_lock); 1674 wake_up(&mddev->sb_wait); 1675 return; 1676 } 1677 spin_unlock_irq(&mddev->write_lock); 1678 1679 dprintk(KERN_INFO 1680 "md: updating %s RAID superblock on device (in sync %d)\n", 1681 mdname(mddev),mddev->in_sync); 1682 1683 err = bitmap_update_sb(mddev->bitmap); 1684 ITERATE_RDEV(mddev,rdev,tmp) { 1685 char b[BDEVNAME_SIZE]; 1686 dprintk(KERN_INFO "md: "); 1687 if (rdev->sb_loaded != 1) 1688 continue; /* no noise on spare devices */ 1689 if (test_bit(Faulty, &rdev->flags)) 1690 dprintk("(skipping faulty "); 1691 1692 dprintk("%s ", bdevname(rdev->bdev,b)); 1693 if (!test_bit(Faulty, &rdev->flags)) { 1694 md_super_write(mddev,rdev, 1695 rdev->sb_offset<<1, rdev->sb_size, 1696 rdev->sb_page); 1697 dprintk(KERN_INFO "(write) %s's sb offset: %llu\n", 1698 bdevname(rdev->bdev,b), 1699 (unsigned long long)rdev->sb_offset); 1700 rdev->sb_events = mddev->events; 1701 1702 } else 1703 dprintk(")\n"); 1704 if (mddev->level == LEVEL_MULTIPATH) 1705 /* only need to write one superblock... */ 1706 break; 1707 } 1708 md_super_wait(mddev); 1709 /* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */ 1710 1711 spin_lock_irq(&mddev->write_lock); 1712 if (mddev->in_sync != sync_req || 1713 test_bit(MD_CHANGE_DEVS, &mddev->flags)) { 1714 /* have to write it out again */ 1715 spin_unlock_irq(&mddev->write_lock); 1716 goto repeat; 1717 } 1718 clear_bit(MD_CHANGE_PENDING, &mddev->flags); 1719 spin_unlock_irq(&mddev->write_lock); 1720 wake_up(&mddev->sb_wait); 1721 1722} 1723 1724/* words written to sysfs files may, or my not, be \n terminated. 
1725 * We want to accept with case. For this we use cmd_match. 1726 */ 1727static int cmd_match(const char *cmd, const char *str) 1728{ 1729 /* See if cmd, written into a sysfs file, matches 1730 * str. They must either be the same, or cmd can 1731 * have a trailing newline 1732 */ 1733 while (*cmd && *str && *cmd == *str) { 1734 cmd++; 1735 str++; 1736 } 1737 if (*cmd == '\n') 1738 cmd++; 1739 if (*str || *cmd) 1740 return 0; 1741 return 1; 1742} 1743 1744struct rdev_sysfs_entry { 1745 struct attribute attr; 1746 ssize_t (*show)(mdk_rdev_t *, char *); 1747 ssize_t (*store)(mdk_rdev_t *, const char *, size_t); 1748}; 1749 1750static ssize_t 1751state_show(mdk_rdev_t *rdev, char *page) 1752{ 1753 char *sep = ""; 1754 int len=0; 1755 1756 if (test_bit(Faulty, &rdev->flags)) { 1757 len+= sprintf(page+len, "%sfaulty",sep); 1758 sep = ","; 1759 } 1760 if (test_bit(In_sync, &rdev->flags)) { 1761 len += sprintf(page+len, "%sin_sync",sep); 1762 sep = ","; 1763 } 1764 if (test_bit(WriteMostly, &rdev->flags)) { 1765 len += sprintf(page+len, "%swrite_mostly",sep); 1766 sep = ","; 1767 } 1768 if (!test_bit(Faulty, &rdev->flags) && 1769 !test_bit(In_sync, &rdev->flags)) { 1770 len += sprintf(page+len, "%sspare", sep); 1771 sep = ","; 1772 } 1773 return len+sprintf(page+len, "\n"); 1774} 1775 1776static ssize_t 1777state_store(mdk_rdev_t *rdev, const char *buf, size_t len) 1778{ 1779 /* can write 1780 * faulty - simulates and error 1781 * remove - disconnects the device 1782 * writemostly - sets write_mostly 1783 * -writemostly - clears write_mostly 1784 */ 1785 int err = -EINVAL; 1786 if (cmd_match(buf, "faulty") && rdev->mddev->pers) { 1787 md_error(rdev->mddev, rdev); 1788 err = 0; 1789 } else if (cmd_match(buf, "remove")) { 1790 if (rdev->raid_disk >= 0) 1791 err = -EBUSY; 1792 else { 1793 mddev_t *mddev = rdev->mddev; 1794 kick_rdev_from_array(rdev); 1795 md_update_sb(mddev, 1); 1796 md_new_event(mddev); 1797 err = 0; 1798 } 1799 } else if (cmd_match(buf, "writemostly")) { 
1800 set_bit(WriteMostly, &rdev->flags); 1801 err = 0; 1802 } else if (cmd_match(buf, "-writemostly")) { 1803 clear_bit(WriteMostly, &rdev->flags); 1804 err = 0; 1805 } 1806 return err ? err : len; 1807} 1808static struct rdev_sysfs_entry rdev_state = 1809__ATTR(state, S_IRUGO|S_IWUSR, state_show, state_store); 1810 1811static ssize_t 1812super_show(mdk_rdev_t *rdev, char *page) 1813{ 1814 if (rdev->sb_loaded && rdev->sb_size) { 1815 memcpy(page, page_address(rdev->sb_page), rdev->sb_size); 1816 return rdev->sb_size; 1817 } else 1818 return 0; 1819} 1820static struct rdev_sysfs_entry rdev_super = __ATTR_RO(super); 1821 1822static ssize_t 1823errors_show(mdk_rdev_t *rdev, char *page) 1824{ 1825 return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors)); 1826} 1827 1828static ssize_t 1829errors_store(mdk_rdev_t *rdev, const char *buf, size_t len) 1830{ 1831 char *e; 1832 unsigned long n = simple_strtoul(buf, &e, 10); 1833 if (*buf && (*e == 0 || *e == '\n')) { 1834 atomic_set(&rdev->corrected_errors, n); 1835 return len; 1836 } 1837 return -EINVAL; 1838} 1839static struct rdev_sysfs_entry rdev_errors = 1840__ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store); 1841 1842static ssize_t 1843slot_show(mdk_rdev_t *rdev, char *page) 1844{ 1845 if (rdev->raid_disk < 0) 1846 return sprintf(page, "none\n"); 1847 else 1848 return sprintf(page, "%d\n", rdev->raid_disk); 1849} 1850 1851static ssize_t 1852slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) 1853{ 1854 char *e; 1855 int slot = simple_strtoul(buf, &e, 10); 1856 if (strncmp(buf, "none", 4)==0) 1857 slot = -1; 1858 else if (e==buf || (*e && *e!= '\n')) 1859 return -EINVAL; 1860 if (rdev->mddev->pers) 1861 /* Cannot set slot in active array (yet) */ 1862 return -EBUSY; 1863 if (slot >= rdev->mddev->raid_disks) 1864 return -ENOSPC; 1865 rdev->raid_disk = slot; 1866 /* assume it is working */ 1867 rdev->flags = 0; 1868 set_bit(In_sync, &rdev->flags); 1869 return len; 1870} 1871 1872 1873static struct 
rdev_sysfs_entry rdev_slot = 1874__ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store); 1875 1876static ssize_t 1877offset_show(mdk_rdev_t *rdev, char *page) 1878{ 1879 return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset); 1880} 1881 1882static ssize_t 1883offset_store(mdk_rdev_t *rdev, const char *buf, size_t len) 1884{ 1885 char *e; 1886 unsigned long long offset = simple_strtoull(buf, &e, 10); 1887 if (e==buf || (*e && *e != '\n')) 1888 return -EINVAL; 1889 if (rdev->mddev->pers) 1890 return -EBUSY; 1891 rdev->data_offset = offset; 1892 return len; 1893} 1894 1895static struct rdev_sysfs_entry rdev_offset = 1896__ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store); 1897 1898static ssize_t 1899rdev_size_show(mdk_rdev_t *rdev, char *page) 1900{ 1901 return sprintf(page, "%llu\n", (unsigned long long)rdev->size); 1902} 1903 1904static ssize_t 1905rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) 1906{ 1907 char *e; 1908 unsigned long long size = simple_strtoull(buf, &e, 10); 1909 if (e==buf || (*e && *e != '\n')) 1910 return -EINVAL; 1911 if (rdev->mddev->pers) 1912 return -EBUSY; 1913 rdev->size = size; 1914 if (size < rdev->mddev->size || rdev->mddev->size == 0) 1915 rdev->mddev->size = size; 1916 return len; 1917} 1918 1919static struct rdev_sysfs_entry rdev_size = 1920__ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store); 1921 1922static struct attribute *rdev_default_attrs[] = { 1923 &rdev_state.attr, 1924 &rdev_super.attr, 1925 &rdev_errors.attr, 1926 &rdev_slot.attr, 1927 &rdev_offset.attr, 1928 &rdev_size.attr, 1929 NULL, 1930}; 1931static ssize_t 1932rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 1933{ 1934 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 1935 mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj); 1936 1937 if (!entry->show) 1938 return -EIO; 1939 return entry->show(rdev, page); 1940} 1941 1942static ssize_t 1943rdev_attr_store(struct 
kobject *kobj, struct attribute *attr, 1944 const char *page, size_t length) 1945{ 1946 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 1947 mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj); 1948 1949 if (!entry->store) 1950 return -EIO; 1951 if (!capable(CAP_SYS_ADMIN)) 1952 return -EACCES; 1953 return entry->store(rdev, page, length); 1954} 1955 1956static void rdev_free(struct kobject *ko) 1957{ 1958 mdk_rdev_t *rdev = container_of(ko, mdk_rdev_t, kobj); 1959 kfree(rdev); 1960} 1961static struct sysfs_ops rdev_sysfs_ops = { 1962 .show = rdev_attr_show, 1963 .store = rdev_attr_store, 1964}; 1965static struct kobj_type rdev_ktype = { 1966 .release = rdev_free, 1967 .sysfs_ops = &rdev_sysfs_ops, 1968 .default_attrs = rdev_default_attrs, 1969}; 1970 1971/* 1972 * Import a device. If 'super_format' >= 0, then sanity check the superblock 1973 * 1974 * mark the device faulty if: 1975 * 1976 * - the device is nonexistent (zero size) 1977 * - the device has no valid superblock 1978 * 1979 * a faulty rdev _never_ has rdev->sb set. 
1980 */ 1981static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor) 1982{ 1983 char b[BDEVNAME_SIZE]; 1984 int err; 1985 mdk_rdev_t *rdev; 1986 sector_t size; 1987 1988 rdev = kzalloc(sizeof(*rdev), GFP_KERNEL); 1989 if (!rdev) { 1990 printk(KERN_ERR "md: could not alloc mem for new device!\n"); 1991 return ERR_PTR(-ENOMEM); 1992 } 1993 1994 if ((err = alloc_disk_sb(rdev))) 1995 goto abort_free; 1996 1997 err = lock_rdev(rdev, newdev); 1998 if (err) 1999 goto abort_free; 2000 2001 rdev->kobj.parent = NULL; 2002 rdev->kobj.ktype = &rdev_ktype; 2003 kobject_init(&rdev->kobj); 2004 2005 rdev->desc_nr = -1; 2006 rdev->saved_raid_disk = -1; 2007 rdev->flags = 0; 2008 rdev->data_offset = 0; 2009 rdev->sb_events = 0; 2010 atomic_set(&rdev->nr_pending, 0); 2011 atomic_set(&rdev->read_errors, 0); 2012 atomic_set(&rdev->corrected_errors, 0); 2013 2014 size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 2015 if (!size) { 2016 printk(KERN_WARNING 2017 "md: %s has zero or unknown size, marking faulty!\n", 2018 bdevname(rdev->bdev,b)); 2019 err = -EINVAL; 2020 goto abort_free; 2021 } 2022 2023 if (super_format >= 0) { 2024 err = super_types[super_format]. 
2025 load_super(rdev, NULL, super_minor); 2026 if (err == -EINVAL) { 2027 printk(KERN_WARNING 2028 "md: %s has invalid sb, not importing!\n", 2029 bdevname(rdev->bdev,b)); 2030 goto abort_free; 2031 } 2032 if (err < 0) { 2033 printk(KERN_WARNING 2034 "md: could not read %s's sb, not importing!\n", 2035 bdevname(rdev->bdev,b)); 2036 goto abort_free; 2037 } 2038 } 2039 INIT_LIST_HEAD(&rdev->same_set); 2040 2041 return rdev; 2042 2043abort_free: 2044 if (rdev->sb_page) { 2045 if (rdev->bdev) 2046 unlock_rdev(rdev); 2047 free_disk_sb(rdev); 2048 } 2049 kfree(rdev); 2050 return ERR_PTR(err); 2051} 2052 2053/* 2054 * Check a full RAID array for plausibility 2055 */ 2056 2057 2058static void analyze_sbs(mddev_t * mddev) 2059{ 2060 int i; 2061 struct list_head *tmp; 2062 mdk_rdev_t *rdev, *freshest; 2063 char b[BDEVNAME_SIZE]; 2064 2065 freshest = NULL; 2066 ITERATE_RDEV(mddev,rdev,tmp) 2067 switch (super_types[mddev->major_version]. 2068 load_super(rdev, freshest, mddev->minor_version)) { 2069 case 1: 2070 freshest = rdev; 2071 break; 2072 case 0: 2073 break; 2074 default: 2075 printk( KERN_ERR \ 2076 "md: fatal superblock inconsistency in %s" 2077 " -- removing from array\n", 2078 bdevname(rdev->bdev,b)); 2079 kick_rdev_from_array(rdev); 2080 } 2081 2082 2083 super_types[mddev->major_version]. 2084 validate_super(mddev, freshest); 2085 2086 i = 0; 2087 ITERATE_RDEV(mddev,rdev,tmp) { 2088 if (rdev != freshest) 2089 if (super_types[mddev->major_version]. 
2090 validate_super(mddev, rdev)) { 2091 printk(KERN_WARNING "md: kicking non-fresh %s" 2092 " from array!\n", 2093 bdevname(rdev->bdev,b)); 2094 kick_rdev_from_array(rdev); 2095 continue; 2096 } 2097 if (mddev->level == LEVEL_MULTIPATH) { 2098 rdev->desc_nr = i++; 2099 rdev->raid_disk = rdev->desc_nr; 2100 set_bit(In_sync, &rdev->flags); 2101 } 2102 } 2103 2104 2105 2106 if (mddev->recovery_cp != MaxSector && 2107 mddev->level >= 1) 2108 printk(KERN_ERR "md: %s: raid array is not clean" 2109 " -- starting background reconstruction\n", 2110 mdname(mddev)); 2111 2112} 2113 2114static ssize_t 2115safe_delay_show(mddev_t *mddev, char *page) 2116{ 2117 int msec = (mddev->safemode_delay*1000)/HZ; 2118 return sprintf(page, "%d.%03d\n", msec/1000, msec%1000); 2119} 2120static ssize_t 2121safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len) 2122{ 2123 int scale=1; 2124 int dot=0; 2125 int i; 2126 unsigned long msec; 2127 char buf[30]; 2128 char *e; 2129 /* remove a period, and count digits after it */ 2130 if (len >= sizeof(buf)) 2131 return -EINVAL; 2132 strlcpy(buf, cbuf, len); 2133 buf[len] = 0; 2134 for (i=0; i<len; i++) { 2135 if (dot) { 2136 if (isdigit(buf[i])) { 2137 buf[i-1] = buf[i]; 2138 scale *= 10; 2139 } 2140 buf[i] = 0; 2141 } else if (buf[i] == '.') { 2142 dot=1; 2143 buf[i] = 0; 2144 } 2145 } 2146 msec = simple_strtoul(buf, &e, 10); 2147 if (e == buf || (*e && *e != '\n')) 2148 return -EINVAL; 2149 msec = (msec * 1000) / scale; 2150 if (msec == 0) 2151 mddev->safemode_delay = 0; 2152 else { 2153 mddev->safemode_delay = (msec*HZ)/1000; 2154 if (mddev->safemode_delay == 0) 2155 mddev->safemode_delay = 1; 2156 } 2157 return len; 2158} 2159static struct md_sysfs_entry md_safe_delay = 2160__ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store); 2161 2162static ssize_t 2163level_show(mddev_t *mddev, char *page) 2164{ 2165 struct mdk_personality *p = mddev->pers; 2166 if (p) 2167 return sprintf(page, "%s\n", p->name); 2168 else if 
(mddev->clevel[0]) 2169 return sprintf(page, "%s\n", mddev->clevel); 2170 else if (mddev->level != LEVEL_NONE) 2171 return sprintf(page, "%d\n", mddev->level); 2172 else 2173 return 0; 2174} 2175 2176static ssize_t 2177level_store(mddev_t *mddev, const char *buf, size_t len) 2178{ 2179 int rv = len; 2180 if (mddev->pers) 2181 return -EBUSY; 2182 if (len == 0) 2183 return 0; 2184 if (len >= sizeof(mddev->clevel)) 2185 return -ENOSPC; 2186 strncpy(mddev->clevel, buf, len); 2187 if (mddev->clevel[len-1] == '\n') 2188 len--; 2189 mddev->clevel[len] = 0; 2190 mddev->level = LEVEL_NONE; 2191 return rv; 2192} 2193 2194static struct md_sysfs_entry md_level = 2195__ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store); 2196 2197 2198static ssize_t 2199layout_show(mddev_t *mddev, char *page) 2200{ 2201 /* just a number, not meaningful for all levels */ 2202 return sprintf(page, "%d\n", mddev->layout); 2203} 2204 2205static ssize_t 2206layout_store(mddev_t *mddev, const char *buf, size_t len) 2207{ 2208 char *e; 2209 unsigned long n = simple_strtoul(buf, &e, 10); 2210 if (mddev->pers) 2211 return -EBUSY; 2212 2213 if (!*buf || (*e && *e != '\n')) 2214 return -EINVAL; 2215 2216 mddev->layout = n; 2217 return len; 2218} 2219static struct md_sysfs_entry md_layout = 2220__ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store); 2221 2222 2223static ssize_t 2224raid_disks_show(mddev_t *mddev, char *page) 2225{ 2226 if (mddev->raid_disks == 0) 2227 return 0; 2228 return sprintf(page, "%d\n", mddev->raid_disks); 2229} 2230 2231static int update_raid_disks(mddev_t *mddev, int raid_disks); 2232 2233static ssize_t 2234raid_disks_store(mddev_t *mddev, const char *buf, size_t len) 2235{ 2236 /* can only set raid_disks if array is not yet active */ 2237 char *e; 2238 int rv = 0; 2239 unsigned long n = simple_strtoul(buf, &e, 10); 2240 2241 if (!*buf || (*e && *e != '\n')) 2242 return -EINVAL; 2243 2244 if (mddev->pers) 2245 rv = update_raid_disks(mddev, n); 2246 else 2247 
mddev->raid_disks = n; 2248 return rv ? rv : len; 2249} 2250static struct md_sysfs_entry md_raid_disks = 2251__ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store); 2252 2253static ssize_t 2254chunk_size_show(mddev_t *mddev, char *page) 2255{ 2256 return sprintf(page, "%d\n", mddev->chunk_size); 2257} 2258 2259static ssize_t 2260chunk_size_store(mddev_t *mddev, const char *buf, size_t len) 2261{ 2262 /* can only set chunk_size if array is not yet active */ 2263 char *e; 2264 unsigned long n = simple_strtoul(buf, &e, 10); 2265 2266 if (mddev->pers) 2267 return -EBUSY; 2268 if (!*buf || (*e && *e != '\n')) 2269 return -EINVAL; 2270 2271 mddev->chunk_size = n; 2272 return len; 2273} 2274static struct md_sysfs_entry md_chunk_size = 2275__ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store); 2276 2277static ssize_t 2278resync_start_show(mddev_t *mddev, char *page) 2279{ 2280 return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp); 2281} 2282 2283static ssize_t 2284resync_start_store(mddev_t *mddev, const char *buf, size_t len) 2285{ 2286 /* can only set chunk_size if array is not yet active */ 2287 char *e; 2288 unsigned long long n = simple_strtoull(buf, &e, 10); 2289 2290 if (mddev->pers) 2291 return -EBUSY; 2292 if (!*buf || (*e && *e != '\n')) 2293 return -EINVAL; 2294 2295 mddev->recovery_cp = n; 2296 return len; 2297} 2298static struct md_sysfs_entry md_resync_start = 2299__ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store); 2300 2301/* 2302 * The array state can be: 2303 * 2304 * clear 2305 * No devices, no size, no level 2306 * Equivalent to STOP_ARRAY ioctl 2307 * inactive 2308 * May have some settings, but array is not active 2309 * all IO results in error 2310 * When written, doesn't tear down array, but just stops it 2311 * suspended (not supported yet) 2312 * All IO requests will block. The array can be reconfigured. 
2313 * Writing this, if accepted, will block until array is quiessent 2314 * readonly 2315 * no resync can happen. no superblocks get written. 2316 * write requests fail 2317 * read-auto 2318 * like readonly, but behaves like 'clean' on a write request. 2319 * 2320 * clean - no pending writes, but otherwise active. 2321 * When written to inactive array, starts without resync 2322 * If a write request arrives then 2323 * if metadata is known, mark 'dirty' and switch to 'active'. 2324 * if not known, block and switch to write-pending 2325 * If written to an active array that has pending writes, then fails. 2326 * active 2327 * fully active: IO and resync can be happening. 2328 * When written to inactive array, starts with resync 2329 * 2330 * write-pending 2331 * clean, but writes are blocked waiting for 'active' to be written. 2332 * 2333 * active-idle 2334 * like active, but no writes have been seen for a while (100msec). 2335 * 2336 */ 2337enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active, 2338 write_pending, active_idle, bad_word}; 2339static char *array_states[] = { 2340 "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active", 2341 "write-pending", "active-idle", NULL }; 2342 2343static int match_word(const char *word, char **list) 2344{ 2345 int n; 2346 for (n=0; list[n]; n++) 2347 if (cmd_match(word, list[n])) 2348 break; 2349 return n; 2350} 2351 2352static ssize_t 2353array_state_show(mddev_t *mddev, char *page) 2354{ 2355 enum array_state st = inactive; 2356 2357 if (mddev->pers) 2358 switch(mddev->ro) { 2359 case 1: 2360 st = readonly; 2361 break; 2362 case 2: 2363 st = read_auto; 2364 break; 2365 case 0: 2366 if (mddev->in_sync) 2367 st = clean; 2368 else if (mddev->safemode) 2369 st = active_idle; 2370 else 2371 st = active; 2372 } 2373 else { 2374 if (list_empty(&mddev->disks) && 2375 mddev->raid_disks == 0 && 2376 mddev->size == 0) 2377 st = clear; 2378 else 2379 st = inactive; 2380 } 2381 return 
sprintf(page, "%s\n", array_states[st]); 2382} 2383 2384static int do_md_stop(mddev_t * mddev, int ro); 2385static int do_md_run(mddev_t * mddev); 2386static int restart_array(mddev_t *mddev); 2387 2388static ssize_t 2389array_state_store(mddev_t *mddev, const char *buf, size_t len) 2390{ 2391 int err = -EINVAL; 2392 enum array_state st = match_word(buf, array_states); 2393 switch(st) { 2394 case bad_word: 2395 break; 2396 case clear: 2397 /* stopping an active array */ 2398 if (mddev->pers) { 2399 if (atomic_read(&mddev->active) > 1) 2400 return -EBUSY; 2401 err = do_md_stop(mddev, 0); 2402 } 2403 break; 2404 case inactive: 2405 /* stopping an active array */ 2406 if (mddev->pers) { 2407 if (atomic_read(&mddev->active) > 1) 2408 return -EBUSY; 2409 err = do_md_stop(mddev, 2); 2410 } 2411 break; 2412 case suspended: 2413 break; /* not supported yet */ 2414 case readonly: 2415 if (mddev->pers) 2416 err = do_md_stop(mddev, 1); 2417 else { 2418 mddev->ro = 1; 2419 err = do_md_run(mddev); 2420 } 2421 break; 2422 case read_auto: 2423 /* stopping an active array */ 2424 if (mddev->pers) { 2425 err = do_md_stop(mddev, 1); 2426 if (err == 0) 2427 mddev->ro = 2; /* FIXME mark devices writable */ 2428 } else { 2429 mddev->ro = 2; 2430 err = do_md_run(mddev); 2431 } 2432 break; 2433 case clean: 2434 if (mddev->pers) { 2435 restart_array(mddev); 2436 spin_lock_irq(&mddev->write_lock); 2437 if (atomic_read(&mddev->writes_pending) == 0) { 2438 mddev->in_sync = 1; 2439 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 2440 } 2441 spin_unlock_irq(&mddev->write_lock); 2442 } else { 2443 mddev->ro = 0; 2444 mddev->recovery_cp = MaxSector; 2445 err = do_md_run(mddev); 2446 } 2447 break; 2448 case active: 2449 if (mddev->pers) { 2450 restart_array(mddev); 2451 clear_bit(MD_CHANGE_CLEAN, &mddev->flags); 2452 wake_up(&mddev->sb_wait); 2453 err = 0; 2454 } else { 2455 mddev->ro = 0; 2456 err = do_md_run(mddev); 2457 } 2458 break; 2459 case write_pending: 2460 case active_idle: 2461 /* these 
cannot be set */
		break;
	}
	if (err)
		return err;
	else
		return len;
}
static struct md_sysfs_entry md_array_state =
__ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);

/* 'show' method for write-only attributes: reading always fails. */
static ssize_t
null_show(mddev_t *mddev, char *page)
{
	return -EINVAL;
}

/*
 * sysfs 'new_dev' write: import the block device named by "major:minor"
 * and bind it to the array.
 *
 * Returns len on success; -EINVAL for a malformed string, -EOVERFLOW if
 * the numbers do not survive a round trip through MKDEV(), or the error
 * from md_import_device(), load_super() or bind_rdev_to_array().
 */
static ssize_t
new_dev_store(mddev_t *mddev, const char *buf, size_t len)
{
	/* buf must be %d:%d\n? giving major and minor numbers */
	/* The new device is added to the array.
	 * If the array has a persistent superblock, we read the
	 * superblock to initialise info and check validity.
	 * Otherwise, only checking done is that in bind_rdev_to_array,
	 * which mainly checks size.
	 */
	char *e;
	int major = simple_strtoul(buf, &e, 10);
	int minor;
	dev_t dev;
	mdk_rdev_t *rdev;
	int err;

	if (!*buf || *e != ':' || !e[1] || e[1] == '\n')
		return -EINVAL;
	minor = simple_strtoul(e+1, &e, 10);
	if (*e && *e != '\n')
		return -EINVAL;
	dev = MKDEV(major, minor);
	/* reject numbers that do not fit the dev_t encoding */
	if (major != MAJOR(dev) ||
	    minor != MINOR(dev))
		return -EOVERFLOW;


	if (mddev->persistent) {
		rdev = md_import_device(dev, mddev->major_version,
					mddev->minor_version);
		/* with disks already present, load the new superblock
		 * against the first existing member
		 */
		if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) {
			mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
						       mdk_rdev_t, same_set);
			err = super_types[mddev->major_version]
				.load_super(rdev, rdev0, mddev->minor_version);
			if (err < 0)
				goto out;
		}
	} else
		rdev = md_import_device(dev, -1, -1);

	if (IS_ERR(rdev))
		return PTR_ERR(rdev);
	err = bind_rdev_to_array(rdev, mddev);
 out:
	/* on any failure, release the device we imported */
	if (err)
		export_rdev(rdev);
	return err ?
err : len;
}

static struct md_sysfs_entry md_new_device =
__ATTR(new_dev, S_IWUSR, null_show, new_dev_store);

/*
 * sysfs 'bitmap_set_bits' write: mark chunks dirty in the array's bitmap.
 *
 * 'buf' is a whitespace-separated list of chunk numbers or
 * <first>-<last> ranges (base auto-detected by simple_strtoul).
 * Parsing stops silently at the first malformed item; items accepted
 * before that point stay applied.  Always returns len, including when
 * the array has no bitmap.
 */
static ssize_t
bitmap_store(mddev_t *mddev, const char *buf, size_t len)
{
	char *end;
	unsigned long chunk, end_chunk;

	if (!mddev->bitmap)
		goto out;
	/* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */
	while (*buf) {
		chunk = end_chunk = simple_strtoul(buf, &end, 0);
		if (buf == end) break;
		if (*end == '-') { /* range */
			buf = end + 1;
			end_chunk = simple_strtoul(buf, &end, 0);
			if (buf == end) break;
		}
		if (*end && !isspace(*end)) break;
		bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk);
		buf = end;
		while (isspace(*buf)) buf++;
	}
	bitmap_unplug(mddev->bitmap); /* flush the bits to disk */
out:
	return len;
}

static struct md_sysfs_entry md_bitmap =
__ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store);

/* sysfs 'component_size' read: print mddev->size in decimal. */
static ssize_t
size_show(mddev_t *mddev, char *page)
{
	return sprintf(page, "%llu\n", (unsigned long long)mddev->size);
}

static int update_size(mddev_t *mddev, unsigned long size);

/*
 * sysfs 'component_size' write.
 *
 * Returns len on success, -EINVAL for a malformed number, -ENOSPC when
 * an inactive array is asked to keep or grow a non-zero size, or the
 * error from update_size() for an active array.
 */
static ssize_t
size_store(mddev_t *mddev, const char *buf, size_t len)
{
	/* If array is inactive, we can reduce the component size, but
	 * not increase it (except from 0).
	 * If array is active, we can try an on-line resize
	 */
	char *e;
	int err = 0;
	unsigned long long size = simple_strtoull(buf, &e, 10);
	if (!*buf || *buf == '\n' ||
	    (*e && *e != '\n'))
		return -EINVAL;

	if (mddev->pers) {
		err = update_size(mddev, size);
		/* the superblock is rewritten whether or not the resize worked */
		md_update_sb(mddev, 1);
	} else {
		if (mddev->size == 0 ||
		    mddev->size > size)
			mddev->size = size;
		else
			err = -ENOSPC;
	}
	return err ?
err : len; 2595} 2596 2597static struct md_sysfs_entry md_size = 2598__ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store); 2599 2600 2601/* Metdata version. 2602 * This is either 'none' for arrays with externally managed metadata, 2603 * or N.M for internally known formats 2604 */ 2605static ssize_t 2606metadata_show(mddev_t *mddev, char *page) 2607{ 2608 if (mddev->persistent) 2609 return sprintf(page, "%d.%d\n", 2610 mddev->major_version, mddev->minor_version); 2611 else 2612 return sprintf(page, "none\n"); 2613} 2614 2615static ssize_t 2616metadata_store(mddev_t *mddev, const char *buf, size_t len) 2617{ 2618 int major, minor; 2619 char *e; 2620 if (!list_empty(&mddev->disks)) 2621 return -EBUSY; 2622 2623 if (cmd_match(buf, "none")) { 2624 mddev->persistent = 0; 2625 mddev->major_version = 0; 2626 mddev->minor_version = 90; 2627 return len; 2628 } 2629 major = simple_strtoul(buf, &e, 10); 2630 if (e==buf || *e != '.') 2631 return -EINVAL; 2632 buf = e+1; 2633 minor = simple_strtoul(buf, &e, 10); 2634 if (e==buf || *e != '\n') 2635 return -EINVAL; 2636 if (major >= sizeof(super_types)/sizeof(super_types[0]) || 2637 super_types[major].name == NULL) 2638 return -ENOENT; 2639 mddev->major_version = major; 2640 mddev->minor_version = minor; 2641 mddev->persistent = 1; 2642 return len; 2643} 2644 2645static struct md_sysfs_entry md_metadata = 2646__ATTR(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store); 2647 2648static ssize_t 2649action_show(mddev_t *mddev, char *page) 2650{ 2651 char *type = "idle"; 2652 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 2653 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) { 2654 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 2655 type = "reshape"; 2656 else if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 2657 if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 2658 type = "resync"; 2659 else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) 2660 type = "check"; 2661 else 2662 type = 
"repair"; 2663 } else 2664 type = "recover"; 2665 } 2666 return sprintf(page, "%s\n", type); 2667} 2668 2669static ssize_t 2670action_store(mddev_t *mddev, const char *page, size_t len) 2671{ 2672 if (!mddev->pers || !mddev->pers->sync_request) 2673 return -EINVAL; 2674 2675 if (cmd_match(page, "idle")) { 2676 if (mddev->sync_thread) { 2677 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 2678 md_unregister_thread(mddev->sync_thread); 2679 mddev->sync_thread = NULL; 2680 mddev->recovery = 0; 2681 } 2682 } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 2683 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) 2684 return -EBUSY; 2685 else if (cmd_match(page, "resync") || cmd_match(page, "recover")) 2686 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2687 else if (cmd_match(page, "reshape")) { 2688 int err; 2689 if (mddev->pers->start_reshape == NULL) 2690 return -EINVAL; 2691 err = mddev->pers->start_reshape(mddev); 2692 if (err) 2693 return err; 2694 } else { 2695 if (cmd_match(page, "check")) 2696 set_bit(MD_RECOVERY_CHECK, &mddev->recovery); 2697 else if (!cmd_match(page, "repair")) 2698 return -EINVAL; 2699 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 2700 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 2701 } 2702 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2703 md_wakeup_thread(mddev->thread); 2704 return len; 2705} 2706 2707static ssize_t 2708mismatch_cnt_show(mddev_t *mddev, char *page) 2709{ 2710 return sprintf(page, "%llu\n", 2711 (unsigned long long) mddev->resync_mismatches); 2712} 2713 2714static struct md_sysfs_entry md_scan_mode = 2715__ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); 2716 2717 2718static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt); 2719 2720static ssize_t 2721sync_min_show(mddev_t *mddev, char *page) 2722{ 2723 return sprintf(page, "%d (%s)\n", speed_min(mddev), 2724 mddev->sync_speed_min ? 
"local": "system"); 2725} 2726 2727static ssize_t 2728sync_min_store(mddev_t *mddev, const char *buf, size_t len) 2729{ 2730 int min; 2731 char *e; 2732 if (strncmp(buf, "system", 6)==0) { 2733 mddev->sync_speed_min = 0; 2734 return len; 2735 } 2736 min = simple_strtoul(buf, &e, 10); 2737 if (buf == e || (*e && *e != '\n') || min <= 0) 2738 return -EINVAL; 2739 mddev->sync_speed_min = min; 2740 return len; 2741} 2742 2743static struct md_sysfs_entry md_sync_min = 2744__ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store); 2745 2746static ssize_t 2747sync_max_show(mddev_t *mddev, char *page) 2748{ 2749 return sprintf(page, "%d (%s)\n", speed_max(mddev), 2750 mddev->sync_speed_max ? "local": "system"); 2751} 2752 2753static ssize_t 2754sync_max_store(mddev_t *mddev, const char *buf, size_t len) 2755{ 2756 int max; 2757 char *e; 2758 if (strncmp(buf, "system", 6)==0) { 2759 mddev->sync_speed_max = 0; 2760 return len; 2761 } 2762 max = simple_strtoul(buf, &e, 10); 2763 if (buf == e || (*e && *e != '\n') || max <= 0) 2764 return -EINVAL; 2765 mddev->sync_speed_max = max; 2766 return len; 2767} 2768 2769static struct md_sysfs_entry md_sync_max = 2770__ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store); 2771 2772 2773static ssize_t 2774sync_speed_show(mddev_t *mddev, char *page) 2775{ 2776 unsigned long resync, dt, db; 2777 resync = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active)); 2778 dt = ((jiffies - mddev->resync_mark) / HZ); 2779 if (!dt) dt++; 2780 db = resync - (mddev->resync_mark_cnt); 2781 return sprintf(page, "%ld\n", db/dt/2); /* K/sec */ 2782} 2783 2784static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed); 2785 2786static ssize_t 2787sync_completed_show(mddev_t *mddev, char *page) 2788{ 2789 unsigned long max_blocks, resync; 2790 2791 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 2792 max_blocks = mddev->resync_max_sectors; 2793 else 2794 max_blocks = mddev->size << 1; 2795 2796 resync = 
(mddev->curr_resync - atomic_read(&mddev->recovery_active)); 2797 return sprintf(page, "%lu / %lu\n", resync, max_blocks); 2798} 2799 2800static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed); 2801 2802static ssize_t 2803suspend_lo_show(mddev_t *mddev, char *page) 2804{ 2805 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo); 2806} 2807 2808static ssize_t 2809suspend_lo_store(mddev_t *mddev, const char *buf, size_t len) 2810{ 2811 char *e; 2812 unsigned long long new = simple_strtoull(buf, &e, 10); 2813 2814 if (mddev->pers->quiesce == NULL) 2815 return -EINVAL; 2816 if (buf == e || (*e && *e != '\n')) 2817 return -EINVAL; 2818 if (new >= mddev->suspend_hi || 2819 (new > mddev->suspend_lo && new < mddev->suspend_hi)) { 2820 mddev->suspend_lo = new; 2821 mddev->pers->quiesce(mddev, 2); 2822 return len; 2823 } else 2824 return -EINVAL; 2825} 2826static struct md_sysfs_entry md_suspend_lo = 2827__ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); 2828 2829 2830static ssize_t 2831suspend_hi_show(mddev_t *mddev, char *page) 2832{ 2833 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi); 2834} 2835 2836static ssize_t 2837suspend_hi_store(mddev_t *mddev, const char *buf, size_t len) 2838{ 2839 char *e; 2840 unsigned long long new = simple_strtoull(buf, &e, 10); 2841 2842 if (mddev->pers->quiesce == NULL) 2843 return -EINVAL; 2844 if (buf == e || (*e && *e != '\n')) 2845 return -EINVAL; 2846 if ((new <= mddev->suspend_lo && mddev->suspend_lo >= mddev->suspend_hi) || 2847 (new > mddev->suspend_lo && new > mddev->suspend_hi)) { 2848 mddev->suspend_hi = new; 2849 mddev->pers->quiesce(mddev, 1); 2850 mddev->pers->quiesce(mddev, 0); 2851 return len; 2852 } else 2853 return -EINVAL; 2854} 2855static struct md_sysfs_entry md_suspend_hi = 2856__ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); 2857 2858 2859static struct attribute *md_default_attrs[] = { 2860 &md_level.attr, 2861 
&md_layout.attr,
	&md_raid_disks.attr,
	&md_chunk_size.attr,
	&md_size.attr,
	&md_resync_start.attr,
	&md_metadata.attr,
	&md_new_device.attr,
	&md_safe_delay.attr,
	&md_array_state.attr,
	NULL,
};

/*
 * Extra attributes, registered as md_redundancy_group by do_md_run()
 * only when the personality provides a sync_request method, and removed
 * again by do_md_stop().
 */
static struct attribute *md_redundancy_attrs[] = {
	&md_scan_mode.attr,
	&md_mismatches.attr,
	&md_sync_min.attr,
	&md_sync_max.attr,
	&md_sync_speed.attr,
	&md_sync_completed.attr,
	&md_suspend_lo.attr,
	&md_suspend_hi.attr,
	&md_bitmap.attr,
	NULL,
};
static struct attribute_group md_redundancy_group = {
	.name = NULL,
	.attrs = md_redundancy_attrs,
};


/*
 * sysfs 'show' dispatcher: recover the md_sysfs_entry and the mddev from
 * the generic attribute/kobject and call the entry's show method with
 * the mddev locked.
 *
 * Returns the method's result, -EIO if the entry has no show method, or
 * the non-zero result of mddev_lock().
 */
static ssize_t
md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
{
	struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
	mddev_t *mddev = container_of(kobj, struct mddev_s, kobj);
	ssize_t rv;

	if (!entry->show)
		return -EIO;
	rv = mddev_lock(mddev);
	if (!rv) {
		rv = entry->show(mddev, page);
		mddev_unlock(mddev);
	}
	return rv;
}

/*
 * sysfs 'store' dispatcher: like md_attr_show(), but additionally
 * requires CAP_SYS_ADMIN (-EACCES otherwise) before calling the entry's
 * store method with the mddev locked.
 */
static ssize_t
md_attr_store(struct kobject *kobj, struct attribute *attr,
	      const char *page, size_t length)
{
	struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
	mddev_t *mddev = container_of(kobj, struct mddev_s, kobj);
	ssize_t rv;

	if (!entry->store)
		return -EIO;
	if (!capable(CAP_SYS_ADMIN))
		return -EACCES;
	rv = mddev_lock(mddev);
	if (!rv) {
		rv = entry->store(mddev, page, length);
		mddev_unlock(mddev);
	}
	return rv;
}

/* kobject release method: free the mddev that embeds the kobject. */
static void md_free(struct kobject *ko)
{
	mddev_t *mddev = container_of(ko, mddev_t, kobj);
	kfree(mddev);
}

static struct sysfs_ops md_sysfs_ops = {
	.show	= md_attr_show,
	.store	= md_attr_store,
};
static struct kobj_type md_ktype = {
	.release	= md_free,
	.sysfs_ops	=
&md_sysfs_ops, 2941 .default_attrs = md_default_attrs, 2942}; 2943 2944int mdp_major = 0; 2945 2946static struct kobject *md_probe(dev_t dev, int *part, void *data) 2947{ 2948 static DEFINE_MUTEX(disks_mutex); 2949 mddev_t *mddev = mddev_find(dev); 2950 struct gendisk *disk; 2951 int partitioned = (MAJOR(dev) != MD_MAJOR); 2952 int shift = partitioned ? MdpMinorShift : 0; 2953 int unit = MINOR(dev) >> shift; 2954 2955 if (!mddev) 2956 return NULL; 2957 2958 mutex_lock(&disks_mutex); 2959 if (mddev->gendisk) { 2960 mutex_unlock(&disks_mutex); 2961 mddev_put(mddev); 2962 return NULL; 2963 } 2964 disk = alloc_disk(1 << shift); 2965 if (!disk) { 2966 mutex_unlock(&disks_mutex); 2967 mddev_put(mddev); 2968 return NULL; 2969 } 2970 disk->major = MAJOR(dev); 2971 disk->first_minor = unit << shift; 2972 if (partitioned) 2973 sprintf(disk->disk_name, "md_d%d", unit); 2974 else 2975 sprintf(disk->disk_name, "md%d", unit); 2976 disk->fops = &md_fops; 2977 disk->private_data = mddev; 2978 disk->queue = mddev->queue; 2979 add_disk(disk); 2980 mddev->gendisk = disk; 2981 mutex_unlock(&disks_mutex); 2982 mddev->kobj.parent = &disk->kobj; 2983 mddev->kobj.k_name = NULL; 2984 snprintf(mddev->kobj.name, KOBJ_NAME_LEN, "%s", "md"); 2985 mddev->kobj.ktype = &md_ktype; 2986 kobject_register(&mddev->kobj); 2987 return NULL; 2988} 2989 2990static void md_safemode_timeout(unsigned long data) 2991{ 2992 mddev_t *mddev = (mddev_t *) data; 2993 2994 mddev->safemode = 1; 2995 md_wakeup_thread(mddev->thread); 2996} 2997 2998static int start_dirty_degraded; 2999 3000static int do_md_run(mddev_t * mddev) 3001{ 3002 int err; 3003 int chunk_size; 3004 struct list_head *tmp; 3005 mdk_rdev_t *rdev; 3006 struct gendisk *disk; 3007 struct mdk_personality *pers; 3008 char b[BDEVNAME_SIZE]; 3009 3010 if (list_empty(&mddev->disks)) 3011 /* cannot run an array with no devices.. 
*/ 3012 return -EINVAL; 3013 3014 if (mddev->pers) 3015 return -EBUSY; 3016 3017 /* 3018 * Analyze all RAID superblock(s) 3019 */ 3020 if (!mddev->raid_disks) 3021 analyze_sbs(mddev); 3022 3023 chunk_size = mddev->chunk_size; 3024 3025 if (chunk_size) { 3026 if (chunk_size > MAX_CHUNK_SIZE) { 3027 printk(KERN_ERR "too big chunk_size: %d > %d\n", 3028 chunk_size, MAX_CHUNK_SIZE); 3029 return -EINVAL; 3030 } 3031 /* 3032 * chunk-size has to be a power of 2 and multiples of PAGE_SIZE 3033 */ 3034 if ( (1 << ffz(~chunk_size)) != chunk_size) { 3035 printk(KERN_ERR "chunk_size of %d not valid\n", chunk_size); 3036 return -EINVAL; 3037 } 3038 if (chunk_size < PAGE_SIZE) { 3039 printk(KERN_ERR "too small chunk_size: %d < %ld\n", 3040 chunk_size, PAGE_SIZE); 3041 return -EINVAL; 3042 } 3043 3044 /* devices must have minimum size of one chunk */ 3045 ITERATE_RDEV(mddev,rdev,tmp) { 3046 if (test_bit(Faulty, &rdev->flags)) 3047 continue; 3048 if (rdev->size < chunk_size / 1024) { 3049 printk(KERN_WARNING 3050 "md: Dev %s smaller than chunk_size:" 3051 " %lluk < %dk\n", 3052 bdevname(rdev->bdev,b), 3053 (unsigned long long)rdev->size, 3054 chunk_size / 1024); 3055 return -EINVAL; 3056 } 3057 } 3058 } 3059 3060#ifdef CONFIG_KMOD 3061 if (mddev->level != LEVEL_NONE) 3062 request_module("md-level-%d", mddev->level); 3063 else if (mddev->clevel[0]) 3064 request_module("md-%s", mddev->clevel); 3065#endif 3066 3067 /* 3068 * Drop all container device buffers, from now on 3069 * the only valid external interface is through the md 3070 * device. 
3071 * Also find largest hardsector size 3072 */ 3073 ITERATE_RDEV(mddev,rdev,tmp) { 3074 if (test_bit(Faulty, &rdev->flags)) 3075 continue; 3076 sync_blockdev(rdev->bdev); 3077 invalidate_bdev(rdev->bdev, 0); 3078 } 3079 3080 md_probe(mddev->unit, NULL, NULL); 3081 disk = mddev->gendisk; 3082 if (!disk) 3083 return -ENOMEM; 3084 3085 spin_lock(&pers_lock); 3086 pers = find_pers(mddev->level, mddev->clevel); 3087 if (!pers || !try_module_get(pers->owner)) { 3088 spin_unlock(&pers_lock); 3089 if (mddev->level != LEVEL_NONE) 3090 printk(KERN_WARNING "md: personality for level %d is not loaded!\n", 3091 mddev->level); 3092 else 3093 printk(KERN_WARNING "md: personality for level %s is not loaded!\n", 3094 mddev->clevel); 3095 return -EINVAL; 3096 } 3097 mddev->pers = pers; 3098 spin_unlock(&pers_lock); 3099 mddev->level = pers->level; 3100 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); 3101 3102 if (mddev->reshape_position != MaxSector && 3103 pers->start_reshape == NULL) { 3104 /* This personality cannot handle reshaping... 
*/ 3105 mddev->pers = NULL; 3106 module_put(pers->owner); 3107 return -EINVAL; 3108 } 3109 3110 mddev->recovery = 0; 3111 mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */ 3112 mddev->barriers_work = 1; 3113 mddev->ok_start_degraded = start_dirty_degraded; 3114 3115 if (start_readonly) 3116 mddev->ro = 2; /* read-only, but switch on first write */ 3117 3118 err = mddev->pers->run(mddev); 3119 if (!err && mddev->pers->sync_request) { 3120 err = bitmap_create(mddev); 3121 if (err) { 3122 printk(KERN_ERR "%s: failed to create bitmap (%d)\n", 3123 mdname(mddev), err); 3124 mddev->pers->stop(mddev); 3125 } 3126 } 3127 if (err) { 3128 printk(KERN_ERR "md: pers->run() failed ...\n"); 3129 module_put(mddev->pers->owner); 3130 mddev->pers = NULL; 3131 bitmap_destroy(mddev); 3132 return err; 3133 } 3134 if (mddev->pers->sync_request) 3135 sysfs_create_group(&mddev->kobj, &md_redundancy_group); 3136 else if (mddev->ro == 2) /* auto-readonly not meaningful */ 3137 mddev->ro = 0; 3138 3139 atomic_set(&mddev->writes_pending,0); 3140 mddev->safemode = 0; 3141 mddev->safemode_timer.function = md_safemode_timeout; 3142 mddev->safemode_timer.data = (unsigned long) mddev; 3143 mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ 3144 mddev->in_sync = 1; 3145 3146 ITERATE_RDEV(mddev,rdev,tmp) 3147 if (rdev->raid_disk >= 0) { 3148 char nm[20]; 3149 sprintf(nm, "rd%d", rdev->raid_disk); 3150 sysfs_create_link(&mddev->kobj, &rdev->kobj, nm); 3151 } 3152 3153 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3154 3155 if (mddev->flags) 3156 md_update_sb(mddev, 0); 3157 3158 set_capacity(disk, mddev->array_size<<1); 3159 3160 /* If we call blk_queue_make_request here, it will 3161 * re-initialise max_sectors etc which may have been 3162 * refined inside -> run. So just set the bits we need to set. 3163 * Most initialisation happended when we called 3164 * blk_queue_make_request(..., md_fail_request) 3165 * earlier. 
3166 */ 3167 mddev->queue->queuedata = mddev; 3168 mddev->queue->make_request_fn = mddev->pers->make_request; 3169 3170 /* If there is a partially-recovered drive we need to 3171 * start recovery here. If we leave it to md_check_recovery, 3172 * it will remove the drives and not do the right thing 3173 */ 3174 if (mddev->degraded && !mddev->sync_thread) { 3175 struct list_head *rtmp; 3176 int spares = 0; 3177 ITERATE_RDEV(mddev,rdev,rtmp) 3178 if (rdev->raid_disk >= 0 && 3179 !test_bit(In_sync, &rdev->flags) && 3180 !test_bit(Faulty, &rdev->flags)) 3181 /* complete an interrupted recovery */ 3182 spares++; 3183 if (spares && mddev->pers->sync_request) { 3184 mddev->recovery = 0; 3185 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 3186 mddev->sync_thread = md_register_thread(md_do_sync, 3187 mddev, 3188 "%s_resync"); 3189 if (!mddev->sync_thread) { 3190 printk(KERN_ERR "%s: could not start resync" 3191 " thread...\n", 3192 mdname(mddev)); 3193 /* leave the spares where they are, it shouldn't hurt */ 3194 mddev->recovery = 0; 3195 } 3196 } 3197 } 3198 md_wakeup_thread(mddev->thread); 3199 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ 3200 3201 mddev->changed = 1; 3202 md_new_event(mddev); 3203 kobject_uevent(&mddev->gendisk->kobj, KOBJ_CHANGE); 3204 return 0; 3205} 3206 3207static int restart_array(mddev_t *mddev) 3208{ 3209 struct gendisk *disk = mddev->gendisk; 3210 int err; 3211 3212 /* 3213 * Complain if it has no devices 3214 */ 3215 err = -ENXIO; 3216 if (list_empty(&mddev->disks)) 3217 goto out; 3218 3219 if (mddev->pers) { 3220 err = -EBUSY; 3221 if (!mddev->ro) 3222 goto out; 3223 3224 mddev->safemode = 0; 3225 mddev->ro = 0; 3226 set_disk_ro(disk, 0); 3227 3228 printk(KERN_INFO "md: %s switched to read-write mode.\n", 3229 mdname(mddev)); 3230 /* 3231 * Kick recovery or resync if necessary 3232 */ 3233 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3234 md_wakeup_thread(mddev->thread); 3235 md_wakeup_thread(mddev->sync_thread); 
err = 0;
	} else
		err = -EINVAL;

out:
	return err;
}

/* similar to deny_write_access, but accounts for our holding a reference
 * to the file ourselves */
/*
 * Returns 0 after setting the inode's i_writecount to -1, or -ETXTBSY
 * if i_writecount is above 1.
 */
static int deny_bitmap_write_access(struct file * file)
{
	struct inode *inode = file->f_mapping->host;

	spin_lock(&inode->i_lock);
	if (atomic_read(&inode->i_writecount) > 1) {
		spin_unlock(&inode->i_lock);
		return -ETXTBSY;
	}
	atomic_set(&inode->i_writecount, -1);
	spin_unlock(&inode->i_lock);

	return 0;
}

/* Undo deny_bitmap_write_access(): put the inode's i_writecount back to 1. */
static void restore_bitmap_write_access(struct file *file)
{
	struct inode *inode = file->f_mapping->host;

	spin_lock(&inode->i_lock);
	atomic_set(&inode->i_writecount, 1);
	spin_unlock(&inode->i_lock);
}

/* mode:
 *   0 - completely stop and dis-assemble array
 *   1 - switch to readonly
 *   2 - stop but do not disassemble array
 *
 * Returns 0 on success, -EBUSY if a running array has more than two
 * active references, -ENXIO if asked to go read-only when it already is.
 */
static int do_md_stop(mddev_t * mddev, int mode)
{
	int err = 0;
	struct gendisk *disk = mddev->gendisk;

	if (mddev->pers) {
		if (atomic_read(&mddev->active)>2) {
			printk("md: %s still in use.\n",mdname(mddev));
			return -EBUSY;
		}

		/* interrupt and reap any running resync/recovery thread */
		if (mddev->sync_thread) {
			set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
			md_unregister_thread(mddev->sync_thread);
			mddev->sync_thread = NULL;
		}

		del_timer_sync(&mddev->safemode_timer);

		invalidate_partition(disk, 0);

		switch(mode) {
		case 1: /* readonly */
			err  = -ENXIO;
			if (mddev->ro==1)
				goto out;
			mddev->ro = 1;
			break;
		case 0: /* disassemble */
		case 2: /* stop */
			bitmap_flush(mddev);
			md_super_wait(mddev);
			if (mddev->ro)
				set_disk_ro(disk, 0);
			/* fail all further requests, then detach the personality */
			blk_queue_make_request(mddev->queue, md_fail_request);
			mddev->pers->stop(mddev);
			if (mddev->pers->sync_request)
				sysfs_remove_group(&mddev->kobj, &md_redundancy_group);

			module_put(mddev->pers->owner);
			mddev->pers = NULL;
			if (mddev->ro)
				mddev->ro = 0;
		}
		if (!mddev->in_sync || mddev->flags) {
			/* mark array as shutdown cleanly */
			mddev->in_sync = 1;
			md_update_sb(mddev, 1);
		}
		if (mode == 1)
			set_disk_ro(disk, 1);
		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
	}

	/*
	 * Free resources if final stop
	 */
	if (mode == 0) {
		mdk_rdev_t *rdev;
		struct list_head *tmp;
		/* NOTE(review): shadows the function-level 'disk'; it is
		 * re-read from mddev->gendisk below before use.
		 */
		struct gendisk *disk;
		printk(KERN_INFO "md: %s stopped.\n", mdname(mddev));

		bitmap_destroy(mddev);
		if (mddev->bitmap_file) {
			restore_bitmap_write_access(mddev->bitmap_file);
			fput(mddev->bitmap_file);
			mddev->bitmap_file = NULL;
		}
		mddev->bitmap_offset = 0;

		/* remove the "rd<N>" links created by do_md_run() */
		ITERATE_RDEV(mddev,rdev,tmp)
			if (rdev->raid_disk >= 0) {
				char nm[20];
				sprintf(nm, "rd%d", rdev->raid_disk);
				sysfs_remove_link(&mddev->kobj, nm);
			}

		export_array(mddev);

		mddev->array_size = 0;
		mddev->size = 0;
		mddev->raid_disks = 0;
		mddev->recovery_cp = 0;

		disk = mddev->gendisk;
		if (disk)
			set_capacity(disk, 0);
		mddev->changed = 1;
	} else if (mddev->pers)
		printk(KERN_INFO "md: %s switched to read-only mode.\n",
			mdname(mddev));
	err = 0;
	md_new_event(mddev);
out:
	return err;
}

/*
 * Log the member devices of 'mddev' and try to start it with
 * do_md_run(); if that fails the array is torn down again with
 * do_md_stop(mddev, 0).  Does nothing for an array without devices.
 */
static void autorun_array(mddev_t *mddev)
{
	mdk_rdev_t *rdev;
	struct list_head *tmp;
	int err;

	if (list_empty(&mddev->disks))
		return;

	printk(KERN_INFO "md: running: ");

	ITERATE_RDEV(mddev,rdev,tmp) {
		char b[BDEVNAME_SIZE];
		printk("<%s>", bdevname(rdev->bdev,b));
	}
	printk("\n");

	err = do_md_run (mddev);
	if (err) {
		printk(KERN_WARNING "md: do_md_run() returned %d\n", err);
		do_md_stop (mddev, 0);
	}
}

/*
3399 * lets try to run arrays based on all disks that have arrived 3400 * until now. (those are in pending_raid_disks) 3401 * 3402 * the method: pick the first pending disk, collect all disks with 3403 * the same UUID, remove all from the pending list and put them into 3404 * the 'same_array' list. Then order this list based on superblock 3405 * update time (freshest comes first), kick out 'old' disks and 3406 * compare superblocks. If everything's fine then run it. 3407 * 3408 * If "unit" is allocated, then bump its reference count 3409 */ 3410static void autorun_devices(int part) 3411{ 3412 struct list_head *tmp; 3413 mdk_rdev_t *rdev0, *rdev; 3414 mddev_t *mddev; 3415 char b[BDEVNAME_SIZE]; 3416 3417 printk(KERN_INFO "md: autorun ...\n"); 3418 while (!list_empty(&pending_raid_disks)) { 3419 int unit; 3420 dev_t dev; 3421 LIST_HEAD(candidates); 3422 rdev0 = list_entry(pending_raid_disks.next, 3423 mdk_rdev_t, same_set); 3424 3425 printk(KERN_INFO "md: considering %s ...\n", 3426 bdevname(rdev0->bdev,b)); 3427 INIT_LIST_HEAD(&candidates); 3428 ITERATE_RDEV_PENDING(rdev,tmp) 3429 if (super_90_load(rdev, rdev0, 0) >= 0) { 3430 printk(KERN_INFO "md: adding %s ...\n", 3431 bdevname(rdev->bdev,b)); 3432 list_move(&rdev->same_set, &candidates); 3433 } 3434 /* 3435 * now we have a set of devices, with all of them having 3436 * mostly sane superblocks. It's time to allocate the 3437 * mddev. 
3438 */ 3439 if (part) { 3440 dev = MKDEV(mdp_major, 3441 rdev0->preferred_minor << MdpMinorShift); 3442 unit = MINOR(dev) >> MdpMinorShift; 3443 } else { 3444 dev = MKDEV(MD_MAJOR, rdev0->preferred_minor); 3445 unit = MINOR(dev); 3446 } 3447 if (rdev0->preferred_minor != unit) { 3448 printk(KERN_INFO "md: unit number in %s is bad: %d\n", 3449 bdevname(rdev0->bdev, b), rdev0->preferred_minor); 3450 break; 3451 } 3452 3453 md_probe(dev, NULL, NULL); 3454 mddev = mddev_find(dev); 3455 if (!mddev) { 3456 printk(KERN_ERR 3457 "md: cannot allocate memory for md drive.\n"); 3458 break; 3459 } 3460 if (mddev_lock(mddev)) 3461 printk(KERN_WARNING "md: %s locked, cannot run\n", 3462 mdname(mddev)); 3463 else if (mddev->raid_disks || mddev->major_version 3464 || !list_empty(&mddev->disks)) { 3465 printk(KERN_WARNING 3466 "md: %s already running, cannot run %s\n", 3467 mdname(mddev), bdevname(rdev0->bdev,b)); 3468 mddev_unlock(mddev); 3469 } else { 3470 printk(KERN_INFO "md: created %s\n", mdname(mddev)); 3471 ITERATE_RDEV_GENERIC(candidates,rdev,tmp) { 3472 list_del_init(&rdev->same_set); 3473 if (bind_rdev_to_array(rdev, mddev)) 3474 export_rdev(rdev); 3475 } 3476 autorun_array(mddev); 3477 mddev_unlock(mddev); 3478 } 3479 /* on success, candidates will be empty, on error 3480 * it won't... 3481 */ 3482 ITERATE_RDEV_GENERIC(candidates,rdev,tmp) 3483 export_rdev(rdev); 3484 mddev_put(mddev); 3485 } 3486 printk(KERN_INFO "md: ... 
autorun DONE.\n"); 3487} 3488 3489static int get_version(void __user * arg) 3490{ 3491 mdu_version_t ver; 3492 3493 ver.major = MD_MAJOR_VERSION; 3494 ver.minor = MD_MINOR_VERSION; 3495 ver.patchlevel = MD_PATCHLEVEL_VERSION; 3496 3497 if (copy_to_user(arg, &ver, sizeof(ver))) 3498 return -EFAULT; 3499 3500 return 0; 3501} 3502 3503static int get_array_info(mddev_t * mddev, void __user * arg) 3504{ 3505 mdu_array_info_t info; 3506 int nr,working,active,failed,spare; 3507 mdk_rdev_t *rdev; 3508 struct list_head *tmp; 3509 3510 nr=working=active=failed=spare=0; 3511 ITERATE_RDEV(mddev,rdev,tmp) { 3512 nr++; 3513 if (test_bit(Faulty, &rdev->flags)) 3514 failed++; 3515 else { 3516 working++; 3517 if (test_bit(In_sync, &rdev->flags)) 3518 active++; 3519 else 3520 spare++; 3521 } 3522 } 3523 3524 info.major_version = mddev->major_version; 3525 info.minor_version = mddev->minor_version; 3526 info.patch_version = MD_PATCHLEVEL_VERSION; 3527 info.ctime = mddev->ctime; 3528 info.level = mddev->level; 3529 info.size = mddev->size; 3530 if (info.size != mddev->size) /* overflow */ 3531 info.size = -1; 3532 info.nr_disks = nr; 3533 info.raid_disks = mddev->raid_disks; 3534 info.md_minor = mddev->md_minor; 3535 info.not_persistent= !mddev->persistent; 3536 3537 info.utime = mddev->utime; 3538 info.state = 0; 3539 if (mddev->in_sync) 3540 info.state = (1<<MD_SB_CLEAN); 3541 if (mddev->bitmap && mddev->bitmap_offset) 3542 info.state = (1<<MD_SB_BITMAP_PRESENT); 3543 info.active_disks = active; 3544 info.working_disks = working; 3545 info.failed_disks = failed; 3546 info.spare_disks = spare; 3547 3548 info.layout = mddev->layout; 3549 info.chunk_size = mddev->chunk_size; 3550 3551 if (copy_to_user(arg, &info, sizeof(info))) 3552 return -EFAULT; 3553 3554 return 0; 3555} 3556 3557static int get_bitmap_file(mddev_t * mddev, void __user * arg) 3558{ 3559 mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ 3560 char *ptr, *buf = NULL; 3561 int err = -ENOMEM; 3562 3563 
file = kmalloc(sizeof(*file), GFP_KERNEL); 3564 if (!file) 3565 goto out; 3566 3567 /* bitmap disabled, zero the first byte and copy out */ 3568 if (!mddev->bitmap || !mddev->bitmap->file) { 3569 file->pathname[0] = '\0'; 3570 goto copy_out; 3571 } 3572 3573 buf = kmalloc(sizeof(file->pathname), GFP_KERNEL); 3574 if (!buf) 3575 goto out; 3576 3577 ptr = file_path(mddev->bitmap->file, buf, sizeof(file->pathname)); 3578 if (!ptr) 3579 goto out; 3580 3581 strcpy(file->pathname, ptr); 3582 3583copy_out: 3584 err = 0; 3585 if (copy_to_user(arg, file, sizeof(*file))) 3586 err = -EFAULT; 3587out: 3588 kfree(buf); 3589 kfree(file); 3590 return err; 3591} 3592 3593static int get_disk_info(mddev_t * mddev, void __user * arg) 3594{ 3595 mdu_disk_info_t info; 3596 unsigned int nr; 3597 mdk_rdev_t *rdev; 3598 3599 if (copy_from_user(&info, arg, sizeof(info))) 3600 return -EFAULT; 3601 3602 nr = info.number; 3603 3604 rdev = find_rdev_nr(mddev, nr); 3605 if (rdev) { 3606 info.major = MAJOR(rdev->bdev->bd_dev); 3607 info.minor = MINOR(rdev->bdev->bd_dev); 3608 info.raid_disk = rdev->raid_disk; 3609 info.state = 0; 3610 if (test_bit(Faulty, &rdev->flags)) 3611 info.state |= (1<<MD_DISK_FAULTY); 3612 else if (test_bit(In_sync, &rdev->flags)) { 3613 info.state |= (1<<MD_DISK_ACTIVE); 3614 info.state |= (1<<MD_DISK_SYNC); 3615 } 3616 if (test_bit(WriteMostly, &rdev->flags)) 3617 info.state |= (1<<MD_DISK_WRITEMOSTLY); 3618 } else { 3619 info.major = info.minor = 0; 3620 info.raid_disk = -1; 3621 info.state = (1<<MD_DISK_REMOVED); 3622 } 3623 3624 if (copy_to_user(arg, &info, sizeof(info))) 3625 return -EFAULT; 3626 3627 return 0; 3628} 3629 3630static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) 3631{ 3632 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 3633 mdk_rdev_t *rdev; 3634 dev_t dev = MKDEV(info->major,info->minor); 3635 3636 if (info->major != MAJOR(dev) || info->minor != MINOR(dev)) 3637 return -EOVERFLOW; 3638 3639 if (!mddev->raid_disks) { 3640 int err; 3641 /* 
expecting a device which has a superblock */ 3642 rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); 3643 if (IS_ERR(rdev)) { 3644 printk(KERN_WARNING 3645 "md: md_import_device returned %ld\n", 3646 PTR_ERR(rdev)); 3647 return PTR_ERR(rdev); 3648 } 3649 if (!list_empty(&mddev->disks)) { 3650 mdk_rdev_t *rdev0 = list_entry(mddev->disks.next, 3651 mdk_rdev_t, same_set); 3652 int err = super_types[mddev->major_version] 3653 .load_super(rdev, rdev0, mddev->minor_version); 3654 if (err < 0) { 3655 printk(KERN_WARNING 3656 "md: %s has different UUID to %s\n", 3657 bdevname(rdev->bdev,b), 3658 bdevname(rdev0->bdev,b2)); 3659 export_rdev(rdev); 3660 return -EINVAL; 3661 } 3662 } 3663 err = bind_rdev_to_array(rdev, mddev); 3664 if (err) 3665 export_rdev(rdev); 3666 return err; 3667 } 3668 3669 /* 3670 * add_new_disk can be used once the array is assembled 3671 * to add "hot spares". They must already have a superblock 3672 * written 3673 */ 3674 if (mddev->pers) { 3675 int err; 3676 if (!mddev->pers->hot_add_disk) { 3677 printk(KERN_WARNING 3678 "%s: personality does not support diskops!\n", 3679 mdname(mddev)); 3680 return -EINVAL; 3681 } 3682 if (mddev->persistent) 3683 rdev = md_import_device(dev, mddev->major_version, 3684 mddev->minor_version); 3685 else 3686 rdev = md_import_device(dev, -1, -1); 3687 if (IS_ERR(rdev)) { 3688 printk(KERN_WARNING 3689 "md: md_import_device returned %ld\n", 3690 PTR_ERR(rdev)); 3691 return PTR_ERR(rdev); 3692 } 3693 /* set save_raid_disk if appropriate */ 3694 if (!mddev->persistent) { 3695 if (info->state & (1<<MD_DISK_SYNC) && 3696 info->raid_disk < mddev->raid_disks) 3697 rdev->raid_disk = info->raid_disk; 3698 else 3699 rdev->raid_disk = -1; 3700 } else 3701 super_types[mddev->major_version]. 
3702 validate_super(mddev, rdev); 3703 rdev->saved_raid_disk = rdev->raid_disk; 3704 3705 clear_bit(In_sync, &rdev->flags); /* just to be sure */ 3706 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 3707 set_bit(WriteMostly, &rdev->flags); 3708 3709 rdev->raid_disk = -1; 3710 err = bind_rdev_to_array(rdev, mddev); 3711 if (!err && !mddev->pers->hot_remove_disk) { 3712 /* If there is hot_add_disk but no hot_remove_disk 3713 * then added disks for geometry changes, 3714 * and should be added immediately. 3715 */ 3716 super_types[mddev->major_version]. 3717 validate_super(mddev, rdev); 3718 err = mddev->pers->hot_add_disk(mddev, rdev); 3719 if (err) 3720 unbind_rdev_from_array(rdev); 3721 } 3722 if (err) 3723 export_rdev(rdev); 3724 3725 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3726 md_wakeup_thread(mddev->thread); 3727 return err; 3728 } 3729 3730 /* otherwise, add_new_disk is only allowed 3731 * for major_version==0 superblocks 3732 */ 3733 if (mddev->major_version != 0) { 3734 printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n", 3735 mdname(mddev)); 3736 return -EINVAL; 3737 } 3738 3739 if (!(info->state & (1<<MD_DISK_FAULTY))) { 3740 int err; 3741 rdev = md_import_device (dev, -1, 0); 3742 if (IS_ERR(rdev)) { 3743 printk(KERN_WARNING 3744 "md: error, md_import_device() returned %ld\n", 3745 PTR_ERR(rdev)); 3746 return PTR_ERR(rdev); 3747 } 3748 rdev->desc_nr = info->number; 3749 if (info->raid_disk < mddev->raid_disks) 3750 rdev->raid_disk = info->raid_disk; 3751 else 3752 rdev->raid_disk = -1; 3753 3754 rdev->flags = 0; 3755 3756 if (rdev->raid_disk < mddev->raid_disks) 3757 if (info->state & (1<<MD_DISK_SYNC)) 3758 set_bit(In_sync, &rdev->flags); 3759 3760 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 3761 set_bit(WriteMostly, &rdev->flags); 3762 3763 if (!mddev->persistent) { 3764 printk(KERN_INFO "md: nonpersistent superblock ...\n"); 3765 rdev->sb_offset = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 3766 } else 3767 rdev->sb_offset = 
calc_dev_sboffset(rdev->bdev); 3768 rdev->size = calc_dev_size(rdev, mddev->chunk_size); 3769 3770 err = bind_rdev_to_array(rdev, mddev); 3771 if (err) { 3772 export_rdev(rdev); 3773 return err; 3774 } 3775 } 3776 3777 return 0; 3778} 3779 3780static int hot_remove_disk(mddev_t * mddev, dev_t dev) 3781{ 3782 char b[BDEVNAME_SIZE]; 3783 mdk_rdev_t *rdev; 3784 3785 if (!mddev->pers) 3786 return -ENODEV; 3787 3788 rdev = find_rdev(mddev, dev); 3789 if (!rdev) 3790 return -ENXIO; 3791 3792 if (rdev->raid_disk >= 0) 3793 goto busy; 3794 3795 kick_rdev_from_array(rdev); 3796 md_update_sb(mddev, 1); 3797 md_new_event(mddev); 3798 3799 return 0; 3800busy: 3801 printk(KERN_WARNING "md: cannot remove active disk %s from %s ... \n", 3802 bdevname(rdev->bdev,b), mdname(mddev)); 3803 return -EBUSY; 3804} 3805 3806static int hot_add_disk(mddev_t * mddev, dev_t dev) 3807{ 3808 char b[BDEVNAME_SIZE]; 3809 int err; 3810 unsigned int size; 3811 mdk_rdev_t *rdev; 3812 3813 if (!mddev->pers) 3814 return -ENODEV; 3815 3816 if (mddev->major_version != 0) { 3817 printk(KERN_WARNING "%s: HOT_ADD may only be used with" 3818 " version-0 superblocks.\n", 3819 mdname(mddev)); 3820 return -EINVAL; 3821 } 3822 if (!mddev->pers->hot_add_disk) { 3823 printk(KERN_WARNING 3824 "%s: personality does not support diskops!\n", 3825 mdname(mddev)); 3826 return -EINVAL; 3827 } 3828 3829 rdev = md_import_device (dev, -1, 0); 3830 if (IS_ERR(rdev)) { 3831 printk(KERN_WARNING 3832 "md: error, md_import_device() returned %ld\n", 3833 PTR_ERR(rdev)); 3834 return -EINVAL; 3835 } 3836 3837 if (mddev->persistent) 3838 rdev->sb_offset = calc_dev_sboffset(rdev->bdev); 3839 else 3840 rdev->sb_offset = 3841 rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 3842 3843 size = calc_dev_size(rdev, mddev->chunk_size); 3844 rdev->size = size; 3845 3846 if (test_bit(Faulty, &rdev->flags)) { 3847 printk(KERN_WARNING 3848 "md: can not hot-add faulty %s disk to %s!\n", 3849 bdevname(rdev->bdev,b), mdname(mddev)); 3850 err = 
-EINVAL; 3851 goto abort_export; 3852 } 3853 clear_bit(In_sync, &rdev->flags); 3854 rdev->desc_nr = -1; 3855 rdev->saved_raid_disk = -1; 3856 err = bind_rdev_to_array(rdev, mddev); 3857 if (err) 3858 goto abort_export; 3859 3860 /* 3861 * The rest should better be atomic, we can have disk failures 3862 * noticed in interrupt contexts ... 3863 */ 3864 3865 if (rdev->desc_nr == mddev->max_disks) { 3866 printk(KERN_WARNING "%s: can not hot-add to full array!\n", 3867 mdname(mddev)); 3868 err = -EBUSY; 3869 goto abort_unbind_export; 3870 } 3871 3872 rdev->raid_disk = -1; 3873 3874 md_update_sb(mddev, 1); 3875 3876 /* 3877 * Kick recovery, maybe this spare has to be added to the 3878 * array immediately. 3879 */ 3880 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3881 md_wakeup_thread(mddev->thread); 3882 md_new_event(mddev); 3883 return 0; 3884 3885abort_unbind_export: 3886 unbind_rdev_from_array(rdev); 3887 3888abort_export: 3889 export_rdev(rdev); 3890 return err; 3891} 3892 3893static int set_bitmap_file(mddev_t *mddev, int fd) 3894{ 3895 int err; 3896 3897 if (mddev->pers) { 3898 if (!mddev->pers->quiesce) 3899 return -EBUSY; 3900 if (mddev->recovery || mddev->sync_thread) 3901 return -EBUSY; 3902 /* we should be able to change the bitmap.. 
*/ 3903 } 3904 3905 3906 if (fd >= 0) { 3907 if (mddev->bitmap) 3908 return -EEXIST; /* cannot add when bitmap is present */ 3909 mddev->bitmap_file = fget(fd); 3910 3911 if (mddev->bitmap_file == NULL) { 3912 printk(KERN_ERR "%s: error: failed to get bitmap file\n", 3913 mdname(mddev)); 3914 return -EBADF; 3915 } 3916 3917 err = deny_bitmap_write_access(mddev->bitmap_file); 3918 if (err) { 3919 printk(KERN_ERR "%s: error: bitmap file is already in use\n", 3920 mdname(mddev)); 3921 fput(mddev->bitmap_file); 3922 mddev->bitmap_file = NULL; 3923 return err; 3924 } 3925 mddev->bitmap_offset = 0; /* file overrides offset */ 3926 } else if (mddev->bitmap == NULL) 3927 return -ENOENT; /* cannot remove what isn't there */ 3928 err = 0; 3929 if (mddev->pers) { 3930 mddev->pers->quiesce(mddev, 1); 3931 if (fd >= 0) 3932 err = bitmap_create(mddev); 3933 if (fd < 0 || err) { 3934 bitmap_destroy(mddev); 3935 fd = -1; /* make sure to put the file */ 3936 } 3937 mddev->pers->quiesce(mddev, 0); 3938 } 3939 if (fd < 0) { 3940 if (mddev->bitmap_file) { 3941 restore_bitmap_write_access(mddev->bitmap_file); 3942 fput(mddev->bitmap_file); 3943 } 3944 mddev->bitmap_file = NULL; 3945 } 3946 3947 return err; 3948} 3949 3950/* 3951 * set_array_info is used two different ways 3952 * The original usage is when creating a new array. 3953 * In this usage, raid_disks is > 0 and it together with 3954 * level, size, not_persistent,layout,chunksize determine the 3955 * shape of the array. 3956 * This will always create an array with a type-0.90.0 superblock. 3957 * The newer usage is when assembling an array. 3958 * In this case raid_disks will be 0, and the major_version field is 3959 * use to determine which style super-blocks are to be found on the devices. 3960 * The minor and patch _version numbers are also kept incase the 3961 * super_block handler wishes to interpret them. 
3962 */ 3963static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) 3964{ 3965 3966 if (info->raid_disks == 0) { 3967 /* just setting version number for superblock loading */ 3968 if (info->major_version < 0 || 3969 info->major_version >= sizeof(super_types)/sizeof(super_types[0]) || 3970 super_types[info->major_version].name == NULL) { 3971 /* maybe try to auto-load a module? */ 3972 printk(KERN_INFO 3973 "md: superblock version %d not known\n", 3974 info->major_version); 3975 return -EINVAL; 3976 } 3977 mddev->major_version = info->major_version; 3978 mddev->minor_version = info->minor_version; 3979 mddev->patch_version = info->patch_version; 3980 return 0; 3981 } 3982 mddev->major_version = MD_MAJOR_VERSION; 3983 mddev->minor_version = MD_MINOR_VERSION; 3984 mddev->patch_version = MD_PATCHLEVEL_VERSION; 3985 mddev->ctime = get_seconds(); 3986 3987 mddev->level = info->level; 3988 mddev->clevel[0] = 0; 3989 mddev->size = info->size; 3990 mddev->raid_disks = info->raid_disks; 3991 /* don't set md_minor, it is determined by which /dev/md* was 3992 * openned 3993 */ 3994 if (info->state & (1<<MD_SB_CLEAN)) 3995 mddev->recovery_cp = MaxSector; 3996 else 3997 mddev->recovery_cp = 0; 3998 mddev->persistent = ! 
info->not_persistent; 3999 4000 mddev->layout = info->layout; 4001 mddev->chunk_size = info->chunk_size; 4002 4003 mddev->max_disks = MD_SB_DISKS; 4004 4005 mddev->flags = 0; 4006 set_bit(MD_CHANGE_DEVS, &mddev->flags); 4007 4008 mddev->default_bitmap_offset = MD_SB_BYTES >> 9; 4009 mddev->bitmap_offset = 0; 4010 4011 mddev->reshape_position = MaxSector; 4012 4013 /* 4014 * Generate a 128 bit UUID 4015 */ 4016 get_random_bytes(mddev->uuid, 16); 4017 4018 mddev->new_level = mddev->level; 4019 mddev->new_chunk = mddev->chunk_size; 4020 mddev->new_layout = mddev->layout; 4021 mddev->delta_disks = 0; 4022 4023 return 0; 4024} 4025 4026static int update_size(mddev_t *mddev, unsigned long size) 4027{ 4028 mdk_rdev_t * rdev; 4029 int rv; 4030 struct list_head *tmp; 4031 int fit = (size == 0); 4032 4033 if (mddev->pers->resize == NULL) 4034 return -EINVAL; 4035 /* The "size" is the amount of each device that is used. 4036 * This can only make sense for arrays with redundancy. 4037 * linear and raid0 always use whatever space is available 4038 * We can only consider changing the size if no resync 4039 * or reconstruction is happening, and if the new size 4040 * is acceptable. It must fit before the sb_offset or, 4041 * if that is <data_offset, it must fit before the 4042 * size of each device. 4043 * If size is zero, we find the largest size that fits. 
4044 */ 4045 if (mddev->sync_thread) 4046 return -EBUSY; 4047 ITERATE_RDEV(mddev,rdev,tmp) { 4048 sector_t avail; 4049 avail = rdev->size * 2; 4050 4051 if (fit && (size == 0 || size > avail/2)) 4052 size = avail/2; 4053 if (avail < ((sector_t)size << 1)) 4054 return -ENOSPC; 4055 } 4056 rv = mddev->pers->resize(mddev, (sector_t)size *2); 4057 if (!rv) { 4058 struct block_device *bdev; 4059 4060 bdev = bdget_disk(mddev->gendisk, 0); 4061 if (bdev) { 4062 mutex_lock(&bdev->bd_inode->i_mutex); 4063 i_size_write(bdev->bd_inode, (loff_t)mddev->array_size << 10); 4064 mutex_unlock(&bdev->bd_inode->i_mutex); 4065 bdput(bdev); 4066 } 4067 } 4068 return rv; 4069} 4070 4071static int update_raid_disks(mddev_t *mddev, int raid_disks) 4072{ 4073 int rv; 4074 /* change the number of raid disks */ 4075 if (mddev->pers->check_reshape == NULL) 4076 return -EINVAL; 4077 if (raid_disks <= 0 || 4078 raid_disks >= mddev->max_disks) 4079 return -EINVAL; 4080 if (mddev->sync_thread || mddev->reshape_position != MaxSector) 4081 return -EBUSY; 4082 mddev->delta_disks = raid_disks - mddev->raid_disks; 4083 4084 rv = mddev->pers->check_reshape(mddev); 4085 return rv; 4086} 4087 4088 4089/* 4090 * update_array_info is used to change the configuration of an 4091 * on-line array. 4092 * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size 4093 * fields in the info are checked against the array. 4094 * Any differences that cannot be handled will cause an error. 4095 * Normally, only one change can be managed at a time. 
4096 */ 4097static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) 4098{ 4099 int rv = 0; 4100 int cnt = 0; 4101 int state = 0; 4102 4103 /* calculate expected state,ignoring low bits */ 4104 if (mddev->bitmap && mddev->bitmap_offset) 4105 state |= (1 << MD_SB_BITMAP_PRESENT); 4106 4107 if (mddev->major_version != info->major_version || 4108 mddev->minor_version != info->minor_version || 4109/* mddev->patch_version != info->patch_version || */ 4110 mddev->ctime != info->ctime || 4111 mddev->level != info->level || 4112/* mddev->layout != info->layout || */ 4113 !mddev->persistent != info->not_persistent|| 4114 mddev->chunk_size != info->chunk_size || 4115 /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */ 4116 ((state^info->state) & 0xfffffe00) 4117 ) 4118 return -EINVAL; 4119 /* Check there is only one change */ 4120 if (info->size >= 0 && mddev->size != info->size) cnt++; 4121 if (mddev->raid_disks != info->raid_disks) cnt++; 4122 if (mddev->layout != info->layout) cnt++; 4123 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) cnt++; 4124 if (cnt == 0) return 0; 4125 if (cnt > 1) return -EINVAL; 4126 4127 if (mddev->layout != info->layout) { 4128 /* Change layout 4129 * we don't need to do anything at the md level, the 4130 * personality will take care of it all. 
4131 */ 4132 if (mddev->pers->reconfig == NULL) 4133 return -EINVAL; 4134 else 4135 return mddev->pers->reconfig(mddev, info->layout, -1); 4136 } 4137 if (info->size >= 0 && mddev->size != info->size) 4138 rv = update_size(mddev, info->size); 4139 4140 if (mddev->raid_disks != info->raid_disks) 4141 rv = update_raid_disks(mddev, info->raid_disks); 4142 4143 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) { 4144 if (mddev->pers->quiesce == NULL) 4145 return -EINVAL; 4146 if (mddev->recovery || mddev->sync_thread) 4147 return -EBUSY; 4148 if (info->state & (1<<MD_SB_BITMAP_PRESENT)) { 4149 /* add the bitmap */ 4150 if (mddev->bitmap) 4151 return -EEXIST; 4152 if (mddev->default_bitmap_offset == 0) 4153 return -EINVAL; 4154 mddev->bitmap_offset = mddev->default_bitmap_offset; 4155 mddev->pers->quiesce(mddev, 1); 4156 rv = bitmap_create(mddev); 4157 if (rv) 4158 bitmap_destroy(mddev); 4159 mddev->pers->quiesce(mddev, 0); 4160 } else { 4161 /* remove the bitmap */ 4162 if (!mddev->bitmap) 4163 return -ENOENT; 4164 if (mddev->bitmap->file) 4165 return -EINVAL; 4166 mddev->pers->quiesce(mddev, 1); 4167 bitmap_destroy(mddev); 4168 mddev->pers->quiesce(mddev, 0); 4169 mddev->bitmap_offset = 0; 4170 } 4171 } 4172 md_update_sb(mddev, 1); 4173 return rv; 4174} 4175 4176static int set_disk_faulty(mddev_t *mddev, dev_t dev) 4177{ 4178 mdk_rdev_t *rdev; 4179 4180 if (mddev->pers == NULL) 4181 return -ENODEV; 4182 4183 rdev = find_rdev(mddev, dev); 4184 if (!rdev) 4185 return -ENODEV; 4186 4187 md_error(mddev, rdev); 4188 return 0; 4189} 4190 4191static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) 4192{ 4193 mddev_t *mddev = bdev->bd_disk->private_data; 4194 4195 geo->heads = 2; 4196 geo->sectors = 4; 4197 geo->cylinders = get_capacity(mddev->gendisk) / 8; 4198 return 0; 4199} 4200 4201static int md_ioctl(struct inode *inode, struct file *file, 4202 unsigned int cmd, unsigned long arg) 4203{ 4204 int err = 0; 4205 void __user *argp = (void __user *)arg; 
4206 mddev_t *mddev = NULL; 4207 4208 if (!capable(CAP_SYS_ADMIN)) 4209 return -EACCES; 4210 4211 /* 4212 * Commands dealing with the RAID driver but not any 4213 * particular array: 4214 */ 4215 switch (cmd) 4216 { 4217 case RAID_VERSION: 4218 err = get_version(argp); 4219 goto done; 4220 4221 case PRINT_RAID_DEBUG: 4222 err = 0; 4223 md_print_devices(); 4224 goto done; 4225 4226#ifndef MODULE 4227 case RAID_AUTORUN: 4228 err = 0; 4229 autostart_arrays(arg); 4230 goto done; 4231#endif 4232 default:; 4233 } 4234 4235 /* 4236 * Commands creating/starting a new array: 4237 */ 4238 4239 mddev = inode->i_bdev->bd_disk->private_data; 4240 4241 if (!mddev) { 4242 BUG(); 4243 goto abort; 4244 } 4245 4246 err = mddev_lock(mddev); 4247 if (err) { 4248 printk(KERN_INFO 4249 "md: ioctl lock interrupted, reason %d, cmd %d\n", 4250 err, cmd); 4251 goto abort; 4252 } 4253 4254 switch (cmd) 4255 { 4256 case SET_ARRAY_INFO: 4257 { 4258 mdu_array_info_t info; 4259 if (!arg) 4260 memset(&info, 0, sizeof(info)); 4261 else if (copy_from_user(&info, argp, sizeof(info))) { 4262 err = -EFAULT; 4263 goto abort_unlock; 4264 } 4265 if (mddev->pers) { 4266 err = update_array_info(mddev, &info); 4267 if (err) { 4268 printk(KERN_WARNING "md: couldn't update" 4269 " array info. %d\n", err); 4270 goto abort_unlock; 4271 } 4272 goto done_unlock; 4273 } 4274 if (!list_empty(&mddev->disks)) { 4275 printk(KERN_WARNING 4276 "md: array %s already has disks!\n", 4277 mdname(mddev)); 4278 err = -EBUSY; 4279 goto abort_unlock; 4280 } 4281 if (mddev->raid_disks) { 4282 printk(KERN_WARNING 4283 "md: array %s already initialised!\n", 4284 mdname(mddev)); 4285 err = -EBUSY; 4286 goto abort_unlock; 4287 } 4288 err = set_array_info(mddev, &info); 4289 if (err) { 4290 printk(KERN_WARNING "md: couldn't set" 4291 " array info. 
%d\n", err); 4292 goto abort_unlock; 4293 } 4294 } 4295 goto done_unlock; 4296 4297 default:; 4298 } 4299 4300 /* 4301 * Commands querying/configuring an existing array: 4302 */ 4303 /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY, 4304 * RUN_ARRAY, and SET_BITMAP_FILE are allowed */ 4305 if (!mddev->raid_disks && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY 4306 && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE) { 4307 err = -ENODEV; 4308 goto abort_unlock; 4309 } 4310 4311 /* 4312 * Commands even a read-only array can execute: 4313 */ 4314 switch (cmd) 4315 { 4316 case GET_ARRAY_INFO: 4317 err = get_array_info(mddev, argp); 4318 goto done_unlock; 4319 4320 case GET_BITMAP_FILE: 4321 err = get_bitmap_file(mddev, argp); 4322 goto done_unlock; 4323 4324 case GET_DISK_INFO: 4325 err = get_disk_info(mddev, argp); 4326 goto done_unlock; 4327 4328 case RESTART_ARRAY_RW: 4329 err = restart_array(mddev); 4330 goto done_unlock; 4331 4332 case STOP_ARRAY: 4333 err = do_md_stop (mddev, 0); 4334 goto done_unlock; 4335 4336 case STOP_ARRAY_RO: 4337 err = do_md_stop (mddev, 1); 4338 goto done_unlock; 4339 4340 /* 4341 * We have a problem here : there is no easy way to give a CHS 4342 * virtual geometry. We currently pretend that we have a 2 heads 4343 * 4 sectors (with a BIG number of cylinders...). This drives 4344 * dosfs just mad... ;-) 4345 */ 4346 } 4347 4348 /* 4349 * The remaining ioctls are changing the state of the 4350 * superblock, so we do not allow them on read-only arrays. 4351 * However non-MD ioctls (e.g. get-size) will still come through 4352 * here and hit the 'default' below, so only disallow 4353 * 'md' ioctls, and switch to rw mode if started auto-readonly. 
4354 */ 4355 if (_IOC_TYPE(cmd) == MD_MAJOR && 4356 mddev->ro && mddev->pers) { 4357 if (mddev->ro == 2) { 4358 mddev->ro = 0; 4359 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4360 md_wakeup_thread(mddev->thread); 4361 4362 } else { 4363 err = -EROFS; 4364 goto abort_unlock; 4365 } 4366 } 4367 4368 switch (cmd) 4369 { 4370 case ADD_NEW_DISK: 4371 { 4372 mdu_disk_info_t info; 4373 if (copy_from_user(&info, argp, sizeof(info))) 4374 err = -EFAULT; 4375 else 4376 err = add_new_disk(mddev, &info); 4377 goto done_unlock; 4378 } 4379 4380 case HOT_REMOVE_DISK: 4381 err = hot_remove_disk(mddev, new_decode_dev(arg)); 4382 goto done_unlock; 4383 4384 case HOT_ADD_DISK: 4385 err = hot_add_disk(mddev, new_decode_dev(arg)); 4386 goto done_unlock; 4387 4388 case SET_DISK_FAULTY: 4389 err = set_disk_faulty(mddev, new_decode_dev(arg)); 4390 goto done_unlock; 4391 4392 case RUN_ARRAY: 4393 err = do_md_run (mddev); 4394 goto done_unlock; 4395 4396 case SET_BITMAP_FILE: 4397 err = set_bitmap_file(mddev, (int)arg); 4398 goto done_unlock; 4399 4400 default: 4401 err = -EINVAL; 4402 goto abort_unlock; 4403 } 4404 4405done_unlock: 4406abort_unlock: 4407 mddev_unlock(mddev); 4408 4409 return err; 4410done: 4411 if (err) 4412 MD_BUG(); 4413abort: 4414 return err; 4415} 4416 4417static int md_open(struct inode *inode, struct file *file) 4418{ 4419 /* 4420 * Succeed if we can lock the mddev, which confirms that 4421 * it isn't being stopped right now. 
4422 */ 4423 mddev_t *mddev = inode->i_bdev->bd_disk->private_data; 4424 int err; 4425 4426 if ((err = mddev_lock(mddev))) 4427 goto out; 4428 4429 err = 0; 4430 mddev_get(mddev); 4431 mddev_unlock(mddev); 4432 4433 check_disk_change(inode->i_bdev); 4434 out: 4435 return err; 4436} 4437 4438static int md_release(struct inode *inode, struct file * file) 4439{ 4440 mddev_t *mddev = inode->i_bdev->bd_disk->private_data; 4441 4442 BUG_ON(!mddev); 4443 mddev_put(mddev); 4444 4445 return 0; 4446} 4447 4448static int md_media_changed(struct gendisk *disk) 4449{ 4450 mddev_t *mddev = disk->private_data; 4451 4452 return mddev->changed; 4453} 4454 4455static int md_revalidate(struct gendisk *disk) 4456{ 4457 mddev_t *mddev = disk->private_data; 4458 4459 mddev->changed = 0; 4460 return 0; 4461} 4462static struct block_device_operations md_fops = 4463{ 4464 .owner = THIS_MODULE, 4465 .open = md_open, 4466 .release = md_release, 4467 .ioctl = md_ioctl, 4468 .getgeo = md_getgeo, 4469 .media_changed = md_media_changed, 4470 .revalidate_disk= md_revalidate, 4471}; 4472 4473static int md_thread(void * arg) 4474{ 4475 mdk_thread_t *thread = arg; 4476 4477 /* 4478 * md_thread is a 'system-thread', it's priority should be very 4479 * high. We avoid resource deadlocks individually in each 4480 * raid personality. (RAID5 does preallocation) We also use RR and 4481 * the very same RT priority as kswapd, thus we will never get 4482 * into a priority inversion deadlock. 4483 * 4484 * we definitely have to have equal or higher priority than 4485 * bdflush, otherwise bdflush will deadlock if there are too 4486 * many dirty RAID5 blocks. 4487 */ 4488 4489 current->flags |= PF_NOFREEZE; 4490 allow_signal(SIGKILL); 4491 while (!kthread_should_stop()) { 4492 4493 /* We need to wait INTERRUPTIBLE so that 4494 * we don't add to the load-average. 
4495 * That means we need to be sure no signals are 4496 * pending 4497 */ 4498 if (signal_pending(current)) 4499 flush_signals(current); 4500 4501 wait_event_interruptible_timeout 4502 (thread->wqueue, 4503 test_bit(THREAD_WAKEUP, &thread->flags) 4504 || kthread_should_stop(), 4505 thread->timeout); 4506 4507 clear_bit(THREAD_WAKEUP, &thread->flags); 4508 4509 thread->run(thread->mddev); 4510 } 4511 4512 return 0; 4513} 4514 4515void md_wakeup_thread(mdk_thread_t *thread) 4516{ 4517 if (thread) { 4518 dprintk("md: waking up MD thread %s.\n", thread->tsk->comm); 4519 set_bit(THREAD_WAKEUP, &thread->flags); 4520 wake_up(&thread->wqueue); 4521 } 4522} 4523 4524mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, 4525 const char *name) 4526{ 4527 mdk_thread_t *thread; 4528 4529 thread = kzalloc(sizeof(mdk_thread_t), GFP_KERNEL); 4530 if (!thread) 4531 return NULL; 4532 4533 init_waitqueue_head(&thread->wqueue); 4534 4535 thread->run = run; 4536 thread->mddev = mddev; 4537 thread->timeout = MAX_SCHEDULE_TIMEOUT; 4538 thread->tsk = kthread_run(md_thread, thread, name, mdname(thread->mddev)); 4539 if (IS_ERR(thread->tsk)) { 4540 kfree(thread); 4541 return NULL; 4542 } 4543 return thread; 4544} 4545 4546void md_unregister_thread(mdk_thread_t *thread) 4547{ 4548 dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid); 4549 4550 kthread_stop(thread->tsk); 4551 kfree(thread); 4552} 4553 4554void md_error(mddev_t *mddev, mdk_rdev_t *rdev) 4555{ 4556 if (!mddev) { 4557 MD_BUG(); 4558 return; 4559 } 4560 4561 if (!rdev || test_bit(Faulty, &rdev->flags)) 4562 return; 4563/* 4564 dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n", 4565 mdname(mddev), 4566 MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev), 4567 __builtin_return_address(0),__builtin_return_address(1), 4568 __builtin_return_address(2),__builtin_return_address(3)); 4569*/ 4570 if (!mddev->pers) 4571 return; 4572 if (!mddev->pers->error_handler) 4573 return; 4574 
mddev->pers->error_handler(mddev,rdev); 4575 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 4576 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4577 md_wakeup_thread(mddev->thread); 4578 md_new_event_inintr(mddev); 4579} 4580 4581/* seq_file implementation /proc/mdstat */ 4582 4583static void status_unused(struct seq_file *seq) 4584{ 4585 int i = 0; 4586 mdk_rdev_t *rdev; 4587 struct list_head *tmp; 4588 4589 seq_printf(seq, "unused devices: "); 4590 4591 ITERATE_RDEV_PENDING(rdev,tmp) { 4592 char b[BDEVNAME_SIZE]; 4593 i++; 4594 seq_printf(seq, "%s ", 4595 bdevname(rdev->bdev,b)); 4596 } 4597 if (!i) 4598 seq_printf(seq, "<none>"); 4599 4600 seq_printf(seq, "\n"); 4601} 4602 4603 4604static void status_resync(struct seq_file *seq, mddev_t * mddev) 4605{ 4606 sector_t max_blocks, resync, res; 4607 unsigned long dt, db, rt; 4608 int scale; 4609 unsigned int per_milli; 4610 4611 resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2; 4612 4613 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 4614 max_blocks = mddev->resync_max_sectors >> 1; 4615 else 4616 max_blocks = mddev->size; 4617 4618 /* 4619 * Should not happen. 4620 */ 4621 if (!max_blocks) { 4622 MD_BUG(); 4623 return; 4624 } 4625 /* Pick 'scale' such that (resync>>scale)*1000 will fit 4626 * in a sector_t, and (max_blocks>>scale) will fit in a 4627 * u32, as those are the requirements for sector_div. 
4628 * Thus 'scale' must be at least 10 4629 */ 4630 scale = 10; 4631 if (sizeof(sector_t) > sizeof(unsigned long)) { 4632 while ( max_blocks/2 > (1ULL<<(scale+32))) 4633 scale++; 4634 } 4635 res = (resync>>scale)*1000; 4636 sector_div(res, (u32)((max_blocks>>scale)+1)); 4637 4638 per_milli = res; 4639 { 4640 int i, x = per_milli/50, y = 20-x; 4641 seq_printf(seq, "["); 4642 for (i = 0; i < x; i++) 4643 seq_printf(seq, "="); 4644 seq_printf(seq, ">"); 4645 for (i = 0; i < y; i++) 4646 seq_printf(seq, "."); 4647 seq_printf(seq, "] "); 4648 } 4649 seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)", 4650 (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)? 4651 "reshape" : 4652 (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)? 4653 "check" : 4654 (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? 4655 "resync" : "recovery"))), 4656 per_milli/10, per_milli % 10, 4657 (unsigned long long) resync, 4658 (unsigned long long) max_blocks); 4659 4660 /* 4661 * We do not want to overflow, so the order of operands and 4662 * the * 100 / 100 trick are important. We do a +1 to be 4663 * safe against division by zero. We only estimate anyway. 
4664 * 4665 * dt: time from mark until now 4666 * db: blocks written from mark until now 4667 * rt: remaining time 4668 */ 4669 dt = ((jiffies - mddev->resync_mark) / HZ); 4670 if (!dt) dt++; 4671 db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active)) 4672 - mddev->resync_mark_cnt; 4673 rt = (dt * ((unsigned long)(max_blocks-resync) / (db/2/100+1)))/100; 4674 4675 seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6); 4676 4677 seq_printf(seq, " speed=%ldK/sec", db/2/dt); 4678} 4679 4680static void *md_seq_start(struct seq_file *seq, loff_t *pos) 4681{ 4682 struct list_head *tmp; 4683 loff_t l = *pos; 4684 mddev_t *mddev; 4685 4686 if (l >= 0x10000) 4687 return NULL; 4688 if (!l--) 4689 /* header */ 4690 return (void*)1; 4691 4692 spin_lock(&all_mddevs_lock); 4693 list_for_each(tmp,&all_mddevs) 4694 if (!l--) { 4695 mddev = list_entry(tmp, mddev_t, all_mddevs); 4696 mddev_get(mddev); 4697 spin_unlock(&all_mddevs_lock); 4698 return mddev; 4699 } 4700 spin_unlock(&all_mddevs_lock); 4701 if (!l--) 4702 return (void*)2;/* tail */ 4703 return NULL; 4704} 4705 4706static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) 4707{ 4708 struct list_head *tmp; 4709 mddev_t *next_mddev, *mddev = v; 4710 4711 ++*pos; 4712 if (v == (void*)2) 4713 return NULL; 4714 4715 spin_lock(&all_mddevs_lock); 4716 if (v == (void*)1) 4717 tmp = all_mddevs.next; 4718 else 4719 tmp = mddev->all_mddevs.next; 4720 if (tmp != &all_mddevs) 4721 next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs)); 4722 else { 4723 next_mddev = (void*)2; 4724 *pos = 0x10000; 4725 } 4726 spin_unlock(&all_mddevs_lock); 4727 4728 if (v != (void*)1) 4729 mddev_put(mddev); 4730 return next_mddev; 4731 4732} 4733 4734static void md_seq_stop(struct seq_file *seq, void *v) 4735{ 4736 mddev_t *mddev = v; 4737 4738 if (mddev && v != (void*)1 && v != (void*)2) 4739 mddev_put(mddev); 4740} 4741 4742struct mdstat_info { 4743 int event; 4744}; 4745 4746static int md_seq_show(struct seq_file 
*seq, void *v) 4747{ 4748 mddev_t *mddev = v; 4749 sector_t size; 4750 struct list_head *tmp2; 4751 mdk_rdev_t *rdev; 4752 struct mdstat_info *mi = seq->private; 4753 struct bitmap *bitmap; 4754 4755 if (v == (void*)1) { 4756 struct mdk_personality *pers; 4757 seq_printf(seq, "Personalities : "); 4758 spin_lock(&pers_lock); 4759 list_for_each_entry(pers, &pers_list, list) 4760 seq_printf(seq, "[%s] ", pers->name); 4761 4762 spin_unlock(&pers_lock); 4763 seq_printf(seq, "\n"); 4764 mi->event = atomic_read(&md_event_count); 4765 return 0; 4766 } 4767 if (v == (void*)2) { 4768 status_unused(seq); 4769 return 0; 4770 } 4771 4772 if (mddev_lock(mddev) < 0) 4773 return -EINTR; 4774 4775 if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { 4776 seq_printf(seq, "%s : %sactive", mdname(mddev), 4777 mddev->pers ? "" : "in"); 4778 if (mddev->pers) { 4779 if (mddev->ro==1) 4780 seq_printf(seq, " (read-only)"); 4781 if (mddev->ro==2) 4782 seq_printf(seq, "(auto-read-only)"); 4783 seq_printf(seq, " %s", mddev->pers->name); 4784 } 4785 4786 size = 0; 4787 ITERATE_RDEV(mddev,rdev,tmp2) { 4788 char b[BDEVNAME_SIZE]; 4789 seq_printf(seq, " %s[%d]", 4790 bdevname(rdev->bdev,b), rdev->desc_nr); 4791 if (test_bit(WriteMostly, &rdev->flags)) 4792 seq_printf(seq, "(W)"); 4793 if (test_bit(Faulty, &rdev->flags)) { 4794 seq_printf(seq, "(F)"); 4795 continue; 4796 } else if (rdev->raid_disk < 0) 4797 seq_printf(seq, "(S)"); /* spare */ 4798 size += rdev->size; 4799 } 4800 4801 if (!list_empty(&mddev->disks)) { 4802 if (mddev->pers) 4803 seq_printf(seq, "\n %llu blocks", 4804 (unsigned long long)mddev->array_size); 4805 else 4806 seq_printf(seq, "\n %llu blocks", 4807 (unsigned long long)size); 4808 } 4809 if (mddev->persistent) { 4810 if (mddev->major_version != 0 || 4811 mddev->minor_version != 90) { 4812 seq_printf(seq," super %d.%d", 4813 mddev->major_version, 4814 mddev->minor_version); 4815 } 4816 } else 4817 seq_printf(seq, " super non-persistent"); 4818 4819 if 
(mddev->pers) { 4820 mddev->pers->status (seq, mddev); 4821 seq_printf(seq, "\n "); 4822 if (mddev->pers->sync_request) { 4823 if (mddev->curr_resync > 2) { 4824 status_resync (seq, mddev); 4825 seq_printf(seq, "\n "); 4826 } else if (mddev->curr_resync == 1 || mddev->curr_resync == 2) 4827 seq_printf(seq, "\tresync=DELAYED\n "); 4828 else if (mddev->recovery_cp < MaxSector) 4829 seq_printf(seq, "\tresync=PENDING\n "); 4830 } 4831 } else 4832 seq_printf(seq, "\n "); 4833 4834 if ((bitmap = mddev->bitmap)) { 4835 unsigned long chunk_kb; 4836 unsigned long flags; 4837 spin_lock_irqsave(&bitmap->lock, flags); 4838 chunk_kb = bitmap->chunksize >> 10; 4839 seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], " 4840 "%lu%s chunk", 4841 bitmap->pages - bitmap->missing_pages, 4842 bitmap->pages, 4843 (bitmap->pages - bitmap->missing_pages) 4844 << (PAGE_SHIFT - 10), 4845 chunk_kb ? chunk_kb : bitmap->chunksize, 4846 chunk_kb ? "KB" : "B"); 4847 if (bitmap->file) { 4848 seq_printf(seq, ", file: "); 4849 seq_path(seq, bitmap->file->f_vfsmnt, 4850 bitmap->file->f_dentry," \t\n"); 4851 } 4852 4853 seq_printf(seq, "\n"); 4854 spin_unlock_irqrestore(&bitmap->lock, flags); 4855 } 4856 4857 seq_printf(seq, "\n"); 4858 } 4859 mddev_unlock(mddev); 4860 4861 return 0; 4862} 4863 4864static struct seq_operations md_seq_ops = { 4865 .start = md_seq_start, 4866 .next = md_seq_next, 4867 .stop = md_seq_stop, 4868 .show = md_seq_show, 4869}; 4870 4871static int md_seq_open(struct inode *inode, struct file *file) 4872{ 4873 int error; 4874 struct mdstat_info *mi = kmalloc(sizeof(*mi), GFP_KERNEL); 4875 if (mi == NULL) 4876 return -ENOMEM; 4877 4878 error = seq_open(file, &md_seq_ops); 4879 if (error) 4880 kfree(mi); 4881 else { 4882 struct seq_file *p = file->private_data; 4883 p->private = mi; 4884 mi->event = atomic_read(&md_event_count); 4885 } 4886 return error; 4887} 4888 4889static int md_seq_release(struct inode *inode, struct file *file) 4890{ 4891 struct seq_file *m = 
file->private_data; 4892 struct mdstat_info *mi = m->private; 4893 m->private = NULL; 4894 kfree(mi); 4895 return seq_release(inode, file); 4896} 4897 4898static unsigned int mdstat_poll(struct file *filp, poll_table *wait) 4899{ 4900 struct seq_file *m = filp->private_data; 4901 struct mdstat_info *mi = m->private; 4902 int mask; 4903 4904 poll_wait(filp, &md_event_waiters, wait); 4905 4906 /* always allow read */ 4907 mask = POLLIN | POLLRDNORM; 4908 4909 if (mi->event != atomic_read(&md_event_count)) 4910 mask |= POLLERR | POLLPRI; 4911 return mask; 4912} 4913 4914static struct file_operations md_seq_fops = { 4915 .owner = THIS_MODULE, 4916 .open = md_seq_open, 4917 .read = seq_read, 4918 .llseek = seq_lseek, 4919 .release = md_seq_release, 4920 .poll = mdstat_poll, 4921}; 4922 4923int register_md_personality(struct mdk_personality *p) 4924{ 4925 spin_lock(&pers_lock); 4926 list_add_tail(&p->list, &pers_list); 4927 printk(KERN_INFO "md: %s personality registered for level %d\n", p->name, p->level); 4928 spin_unlock(&pers_lock); 4929 return 0; 4930} 4931 4932int unregister_md_personality(struct mdk_personality *p) 4933{ 4934 printk(KERN_INFO "md: %s personality unregistered\n", p->name); 4935 spin_lock(&pers_lock); 4936 list_del_init(&p->list); 4937 spin_unlock(&pers_lock); 4938 return 0; 4939} 4940 4941static int is_mddev_idle(mddev_t *mddev) 4942{ 4943 mdk_rdev_t * rdev; 4944 struct list_head *tmp; 4945 int idle; 4946 unsigned long curr_events; 4947 4948 idle = 1; 4949 ITERATE_RDEV(mddev,rdev,tmp) { 4950 struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; 4951 curr_events = disk_stat_read(disk, sectors[0]) + 4952 disk_stat_read(disk, sectors[1]) - 4953 atomic_read(&disk->sync_io); 4954 /* The difference between curr_events and last_events 4955 * will be affected by any new non-sync IO (making 4956 * curr_events bigger) and any difference in the amount of 4957 * in-flight syncio (making current_events bigger or smaller) 4958 * The amount in-flight is 
currently limited to 4959 * 32*64K in raid1/10 and 256*PAGE_SIZE in raid5/6 4960 * which is at most 4096 sectors. 4961 * These numbers are fairly fragile and should be made 4962 * more robust, probably by enforcing the 4963 * 'window size' that md_do_sync sort-of uses. 4964 * 4965 * Note: the following is an unsigned comparison. 4966 */ 4967 if ((curr_events - rdev->last_events + 4096) > 8192) { 4968 rdev->last_events = curr_events; 4969 idle = 0; 4970 } 4971 } 4972 return idle; 4973} 4974 4975void md_done_sync(mddev_t *mddev, int blocks, int ok) 4976{ 4977 /* another "blocks" (512byte) blocks have been synced */ 4978 atomic_sub(blocks, &mddev->recovery_active); 4979 wake_up(&mddev->recovery_wait); 4980 if (!ok) { 4981 set_bit(MD_RECOVERY_ERR, &mddev->recovery); 4982 md_wakeup_thread(mddev->thread); 4983 // stop recovery, signal do_sync .... 4984 } 4985} 4986 4987 4988/* md_write_start(mddev, bi) 4989 * If we need to update some array metadata (e.g. 'active' flag 4990 * in superblock) before writing, schedule a superblock update 4991 * and wait for it to complete. 
4992 */ 4993void md_write_start(mddev_t *mddev, struct bio *bi) 4994{ 4995 if (bio_data_dir(bi) != WRITE) 4996 return; 4997 4998 BUG_ON(mddev->ro == 1); 4999 if (mddev->ro == 2) { 5000 /* need to switch to read/write */ 5001 mddev->ro = 0; 5002 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5003 md_wakeup_thread(mddev->thread); 5004 } 5005 atomic_inc(&mddev->writes_pending); 5006 if (mddev->in_sync) { 5007 spin_lock_irq(&mddev->write_lock); 5008 if (mddev->in_sync) { 5009 mddev->in_sync = 0; 5010 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 5011 md_wakeup_thread(mddev->thread); 5012 } 5013 spin_unlock_irq(&mddev->write_lock); 5014 } 5015 wait_event(mddev->sb_wait, mddev->flags==0); 5016} 5017 5018void md_write_end(mddev_t *mddev) 5019{ 5020 if (atomic_dec_and_test(&mddev->writes_pending)) { 5021 if (mddev->safemode == 2) 5022 md_wakeup_thread(mddev->thread); 5023 else if (mddev->safemode_delay) 5024 mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay); 5025 } 5026} 5027 5028static DECLARE_WAIT_QUEUE_HEAD(resync_wait); 5029 5030#define SYNC_MARKS 10 5031#define SYNC_MARK_STEP (3*HZ) 5032void md_do_sync(mddev_t *mddev) 5033{ 5034 mddev_t *mddev2; 5035 unsigned int currspeed = 0, 5036 window; 5037 sector_t max_sectors,j, io_sectors; 5038 unsigned long mark[SYNC_MARKS]; 5039 sector_t mark_cnt[SYNC_MARKS]; 5040 int last_mark,m; 5041 struct list_head *tmp; 5042 sector_t last_check; 5043 int skipped = 0; 5044 struct list_head *rtmp; 5045 mdk_rdev_t *rdev; 5046 char *desc; 5047 5048 /* just incase thread restarts... 
*/ 5049 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) 5050 return; 5051 if (mddev->ro) /* never try to sync a read-only array */ 5052 return; 5053 5054 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 5055 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) 5056 desc = "data-check"; 5057 else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 5058 desc = "requested-resync"; 5059 else 5060 desc = "resync"; 5061 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 5062 desc = "reshape"; 5063 else 5064 desc = "recovery"; 5065 5066 /* we overload curr_resync somewhat here. 5067 * 0 == not engaged in resync at all 5068 * 2 == checking that there is no conflict with another sync 5069 * 1 == like 2, but have yielded to allow conflicting resync to 5070 * commense 5071 * other == active in resync - this many blocks 5072 * 5073 * Before starting a resync we must have set curr_resync to 5074 * 2, and then checked that every "conflicting" array has curr_resync 5075 * less than ours. When we find one that is the same or higher 5076 * we wait on resync_wait. To avoid deadlock, we reduce curr_resync 5077 * to 1 if we choose to yield (based arbitrarily on address of mddev structure). 5078 * This will mean we have to start checking from the beginning again. 
5079 * 5080 */ 5081 5082 do { 5083 mddev->curr_resync = 2; 5084 5085 try_again: 5086 if (kthread_should_stop()) { 5087 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 5088 goto skip; 5089 } 5090 ITERATE_MDDEV(mddev2,tmp) { 5091 if (mddev2 == mddev) 5092 continue; 5093 if (mddev2->curr_resync && 5094 match_mddev_units(mddev,mddev2)) { 5095 DEFINE_WAIT(wq); 5096 if (mddev < mddev2 && mddev->curr_resync == 2) { 5097 /* arbitrarily yield */ 5098 mddev->curr_resync = 1; 5099 wake_up(&resync_wait); 5100 } 5101 if (mddev > mddev2 && mddev->curr_resync == 1) 5102 /* no need to wait here, we can wait the next 5103 * time 'round when curr_resync == 2 5104 */ 5105 continue; 5106 prepare_to_wait(&resync_wait, &wq, TASK_UNINTERRUPTIBLE); 5107 if (!kthread_should_stop() && 5108 mddev2->curr_resync >= mddev->curr_resync) { 5109 printk(KERN_INFO "md: delaying %s of %s" 5110 " until %s has finished (they" 5111 " share one or more physical units)\n", 5112 desc, mdname(mddev), mdname(mddev2)); 5113 mddev_put(mddev2); 5114 schedule(); 5115 finish_wait(&resync_wait, &wq); 5116 goto try_again; 5117 } 5118 finish_wait(&resync_wait, &wq); 5119 } 5120 } 5121 } while (mddev->curr_resync < 2); 5122 5123 j = 0; 5124 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 5125 /* resync follows the size requested by the personality, 5126 * which defaults to physical size, but can be virtual size 5127 */ 5128 max_sectors = mddev->resync_max_sectors; 5129 mddev->resync_mismatches = 0; 5130 /* we don't use the checkpoint if there's a bitmap */ 5131 if (!mddev->bitmap && 5132 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 5133 j = mddev->recovery_cp; 5134 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 5135 max_sectors = mddev->size << 1; 5136 else { 5137 /* recovery follows the physical size of devices */ 5138 max_sectors = mddev->size << 1; 5139 j = MaxSector; 5140 ITERATE_RDEV(mddev,rdev,rtmp) 5141 if (rdev->raid_disk >= 0 && 5142 !test_bit(Faulty, &rdev->flags) && 5143 
!test_bit(In_sync, &rdev->flags) && 5144 rdev->recovery_offset < j) 5145 j = rdev->recovery_offset; 5146 } 5147 5148 printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev)); 5149 printk(KERN_INFO "md: minimum _guaranteed_ speed:" 5150 " %d KB/sec/disk.\n", speed_min(mddev)); 5151 printk(KERN_INFO "md: using maximum available idle IO bandwidth " 5152 "(but not more than %d KB/sec) for %s.\n", 5153 speed_max(mddev), desc); 5154 5155 is_mddev_idle(mddev); /* this also initializes IO event counters */ 5156 5157 io_sectors = 0; 5158 for (m = 0; m < SYNC_MARKS; m++) { 5159 mark[m] = jiffies; 5160 mark_cnt[m] = io_sectors; 5161 } 5162 last_mark = 0; 5163 mddev->resync_mark = mark[last_mark]; 5164 mddev->resync_mark_cnt = mark_cnt[last_mark]; 5165 5166 /* 5167 * Tune reconstruction: 5168 */ 5169 window = 32*(PAGE_SIZE/512); 5170 printk(KERN_INFO "md: using %dk window, over a total of %llu blocks.\n", 5171 window/2,(unsigned long long) max_sectors/2); 5172 5173 atomic_set(&mddev->recovery_active, 0); 5174 init_waitqueue_head(&mddev->recovery_wait); 5175 last_check = 0; 5176 5177 if (j>2) { 5178 printk(KERN_INFO 5179 "md: resuming %s of %s from checkpoint.\n", 5180 desc, mdname(mddev)); 5181 mddev->curr_resync = j; 5182 } 5183 5184 while (j < max_sectors) { 5185 sector_t sectors; 5186 5187 skipped = 0; 5188 sectors = mddev->pers->sync_request(mddev, j, &skipped, 5189 currspeed < speed_min(mddev)); 5190 if (sectors == 0) { 5191 set_bit(MD_RECOVERY_ERR, &mddev->recovery); 5192 goto out; 5193 } 5194 5195 if (!skipped) { /* actual IO requested */ 5196 io_sectors += sectors; 5197 atomic_add(sectors, &mddev->recovery_active); 5198 } 5199 5200 j += sectors; 5201 if (j>1) mddev->curr_resync = j; 5202 mddev->curr_mark_cnt = io_sectors; 5203 if (last_check == 0) 5204 /* this is the earliers that rebuilt will be 5205 * visible in /proc/mdstat 5206 */ 5207 md_new_event(mddev); 5208 5209 if (last_check + window > io_sectors || j == max_sectors) 5210 continue; 5211 5212 
last_check = io_sectors; 5213 5214 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery) || 5215 test_bit(MD_RECOVERY_ERR, &mddev->recovery)) 5216 break; 5217 5218 repeat: 5219 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) { 5220 /* step marks */ 5221 int next = (last_mark+1) % SYNC_MARKS; 5222 5223 mddev->resync_mark = mark[next]; 5224 mddev->resync_mark_cnt = mark_cnt[next]; 5225 mark[next] = jiffies; 5226 mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active); 5227 last_mark = next; 5228 } 5229 5230 5231 if (kthread_should_stop()) { 5232 /* 5233 * got a signal, exit. 5234 */ 5235 printk(KERN_INFO 5236 "md: md_do_sync() got signal ... exiting\n"); 5237 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 5238 goto out; 5239 } 5240 5241 /* 5242 * this loop exits only if either when we are slower than 5243 * the 'hard' speed limit, or the system was IO-idle for 5244 * a jiffy. 5245 * the system might be non-idle CPU-wise, but we only care 5246 * about not overloading the IO subsystem. 
(things like an 5247 * e2fsck being done on the RAID array should execute fast) 5248 */ 5249 mddev->queue->unplug_fn(mddev->queue); 5250 cond_resched(); 5251 5252 currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2 5253 /((jiffies-mddev->resync_mark)/HZ +1) +1; 5254 5255 if (currspeed > speed_min(mddev)) { 5256 if ((currspeed > speed_max(mddev)) || 5257 !is_mddev_idle(mddev)) { 5258 msleep(500); 5259 goto repeat; 5260 } 5261 } 5262 } 5263 printk(KERN_INFO "md: %s: %s done.\n",mdname(mddev), desc); 5264 /* 5265 * this also signals 'finished resyncing' to md_stop 5266 */ 5267 out: 5268 mddev->queue->unplug_fn(mddev->queue); 5269 5270 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); 5271 5272 /* tell personality that we are finished */ 5273 mddev->pers->sync_request(mddev, max_sectors, &skipped, 1); 5274 5275 if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) && 5276 test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && 5277 !test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && 5278 mddev->curr_resync > 2) { 5279 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 5280 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 5281 if (mddev->curr_resync >= mddev->recovery_cp) { 5282 printk(KERN_INFO 5283 "md: checkpointing %s of %s.\n", 5284 desc, mdname(mddev)); 5285 mddev->recovery_cp = mddev->curr_resync; 5286 } 5287 } else 5288 mddev->recovery_cp = MaxSector; 5289 } else { 5290 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 5291 mddev->curr_resync = MaxSector; 5292 ITERATE_RDEV(mddev,rdev,rtmp) 5293 if (rdev->raid_disk >= 0 && 5294 !test_bit(Faulty, &rdev->flags) && 5295 !test_bit(In_sync, &rdev->flags) && 5296 rdev->recovery_offset < mddev->curr_resync) 5297 rdev->recovery_offset = mddev->curr_resync; 5298 } 5299 } 5300 5301 skip: 5302 mddev->curr_resync = 0; 5303 wake_up(&resync_wait); 5304 set_bit(MD_RECOVERY_DONE, &mddev->recovery); 5305 md_wakeup_thread(mddev->thread); 5306} 5307EXPORT_SYMBOL_GPL(md_do_sync); 5308 5309 5310/* 5311 * 
This routine is regularly called by all per-raid-array threads to 5312 * deal with generic issues like resync and super-block update. 5313 * Raid personalities that don't have a thread (linear/raid0) do not 5314 * need this as they never do any recovery or update the superblock. 5315 * 5316 * It does not do any resync itself, but rather "forks" off other threads 5317 * to do that as needed. 5318 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in 5319 * "->recovery" and create a thread at ->sync_thread. 5320 * When the thread finishes it sets MD_RECOVERY_DONE (and might set MD_RECOVERY_ERR) 5321 * and wakeups up this thread which will reap the thread and finish up. 5322 * This thread also removes any faulty devices (with nr_pending == 0). 5323 * 5324 * The overall approach is: 5325 * 1/ if the superblock needs updating, update it. 5326 * 2/ If a recovery thread is running, don't do anything else. 5327 * 3/ If recovery has finished, clean up, possibly marking spares active. 5328 * 4/ If there are any faulty devices, remove them. 5329 * 5/ If array is degraded, try to add spares devices 5330 * 6/ If array has spares or is not in-sync, start a resync thread. 5331 */ 5332void md_check_recovery(mddev_t *mddev) 5333{ 5334 mdk_rdev_t *rdev; 5335 struct list_head *rtmp; 5336 5337 5338 if (mddev->bitmap) 5339 bitmap_daemon_work(mddev->bitmap); 5340 5341 if (mddev->ro) 5342 return; 5343 5344 if (signal_pending(current)) { 5345 if (mddev->pers->sync_request) { 5346 printk(KERN_INFO "md: %s in immediate safe mode\n", 5347 mdname(mddev)); 5348 mddev->safemode = 2; 5349 } 5350 flush_signals(current); 5351 } 5352 5353 if ( ! ( 5354 mddev->flags || 5355 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || 5356 test_bit(MD_RECOVERY_DONE, &mddev->recovery) || 5357 (mddev->safemode == 1) || 5358 (mddev->safemode == 2 && ! 
atomic_read(&mddev->writes_pending) 5359 && !mddev->in_sync && mddev->recovery_cp == MaxSector) 5360 )) 5361 return; 5362 5363 if (mddev_trylock(mddev)) { 5364 int spares =0; 5365 5366 spin_lock_irq(&mddev->write_lock); 5367 if (mddev->safemode && !atomic_read(&mddev->writes_pending) && 5368 !mddev->in_sync && mddev->recovery_cp == MaxSector) { 5369 mddev->in_sync = 1; 5370 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 5371 } 5372 if (mddev->safemode == 1) 5373 mddev->safemode = 0; 5374 spin_unlock_irq(&mddev->write_lock); 5375 5376 if (mddev->flags) 5377 md_update_sb(mddev, 0); 5378 5379 5380 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && 5381 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { 5382 /* resync/recovery still happening */ 5383 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5384 goto unlock; 5385 } 5386 if (mddev->sync_thread) { 5387 /* resync has finished, collect result */ 5388 md_unregister_thread(mddev->sync_thread); 5389 mddev->sync_thread = NULL; 5390 if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) && 5391 !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 5392 /* success...*/ 5393 /* activate any spares */ 5394 mddev->pers->spare_active(mddev); 5395 } 5396 md_update_sb(mddev, 1); 5397 5398 /* if array is no-longer degraded, then any saved_raid_disk 5399 * information must be scrapped 5400 */ 5401 if (!mddev->degraded) 5402 ITERATE_RDEV(mddev,rdev,rtmp) 5403 rdev->saved_raid_disk = -1; 5404 5405 mddev->recovery = 0; 5406 /* flag recovery needed just to double check */ 5407 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5408 md_new_event(mddev); 5409 goto unlock; 5410 } 5411 /* Clear some bits that don't mean anything, but 5412 * might be left set 5413 */ 5414 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5415 clear_bit(MD_RECOVERY_ERR, &mddev->recovery); 5416 clear_bit(MD_RECOVERY_INTR, &mddev->recovery); 5417 clear_bit(MD_RECOVERY_DONE, &mddev->recovery); 5418 5419 if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) 5420 goto unlock; 5421 
/* no recovery is running. 5422 * remove any failed drives, then 5423 * add spares if possible. 5424 * Spare are also removed and re-added, to allow 5425 * the personality to fail the re-add. 5426 */ 5427 ITERATE_RDEV(mddev,rdev,rtmp) 5428 if (rdev->raid_disk >= 0 && 5429 (test_bit(Faulty, &rdev->flags) || ! test_bit(In_sync, &rdev->flags)) && 5430 atomic_read(&rdev->nr_pending)==0) { 5431 if (mddev->pers->hot_remove_disk(mddev, rdev->raid_disk)==0) { 5432 char nm[20]; 5433 sprintf(nm,"rd%d", rdev->raid_disk); 5434 sysfs_remove_link(&mddev->kobj, nm); 5435 rdev->raid_disk = -1; 5436 } 5437 } 5438 5439 if (mddev->degraded) { 5440 ITERATE_RDEV(mddev,rdev,rtmp) 5441 if (rdev->raid_disk < 0 5442 && !test_bit(Faulty, &rdev->flags)) { 5443 rdev->recovery_offset = 0; 5444 if (mddev->pers->hot_add_disk(mddev,rdev)) { 5445 char nm[20]; 5446 sprintf(nm, "rd%d", rdev->raid_disk); 5447 sysfs_create_link(&mddev->kobj, &rdev->kobj, nm); 5448 spares++; 5449 md_new_event(mddev); 5450 } else 5451 break; 5452 } 5453 } 5454 5455 if (spares) { 5456 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 5457 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 5458 } else if (mddev->recovery_cp < MaxSector) { 5459 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 5460 } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 5461 /* nothing to be done ... */ 5462 goto unlock; 5463 5464 if (mddev->pers->sync_request) { 5465 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 5466 if (spares && mddev->bitmap && ! mddev->bitmap->file) { 5467 /* We are adding a device or devices to an array 5468 * which has the bitmap stored on all devices. 
5469 * So make sure all bitmap pages get written 5470 */ 5471 bitmap_write_all(mddev->bitmap); 5472 } 5473 mddev->sync_thread = md_register_thread(md_do_sync, 5474 mddev, 5475 "%s_resync"); 5476 if (!mddev->sync_thread) { 5477 printk(KERN_ERR "%s: could not start resync" 5478 " thread...\n", 5479 mdname(mddev)); 5480 /* leave the spares where they are, it shouldn't hurt */ 5481 mddev->recovery = 0; 5482 } else 5483 md_wakeup_thread(mddev->sync_thread); 5484 md_new_event(mddev); 5485 } 5486 unlock: 5487 mddev_unlock(mddev); 5488 } 5489} 5490 5491static int md_notify_reboot(struct notifier_block *this, 5492 unsigned long code, void *x) 5493{ 5494 struct list_head *tmp; 5495 mddev_t *mddev; 5496 5497 if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) { 5498 5499 printk(KERN_INFO "md: stopping all md devices.\n"); 5500 5501 ITERATE_MDDEV(mddev,tmp) 5502 if (mddev_trylock(mddev)) { 5503 do_md_stop (mddev, 1); 5504 mddev_unlock(mddev); 5505 } 5506 /* 5507 * certain more exotic SCSI devices are known to be 5508 * volatile wrt too early system reboots. While the 5509 * right place to handle this issue is the given 5510 * driver, we do want to have a safe RAID driver ... 
5511 */ 5512 mdelay(1000*1); 5513 } 5514 return NOTIFY_DONE; 5515} 5516 5517static struct notifier_block md_notifier = { 5518 .notifier_call = md_notify_reboot, 5519 .next = NULL, 5520 .priority = INT_MAX, /* before any real devices */ 5521}; 5522 5523static void md_geninit(void) 5524{ 5525 struct proc_dir_entry *p; 5526 5527 dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); 5528 5529 p = create_proc_entry("mdstat", S_IRUGO, NULL); 5530 if (p) 5531 p->proc_fops = &md_seq_fops; 5532} 5533 5534static int __init md_init(void) 5535{ 5536 if (register_blkdev(MAJOR_NR, "md")) 5537 return -1; 5538 if ((mdp_major=register_blkdev(0, "mdp"))<=0) { 5539 unregister_blkdev(MAJOR_NR, "md"); 5540 return -1; 5541 } 5542 blk_register_region(MKDEV(MAJOR_NR, 0), 1UL<<MINORBITS, THIS_MODULE, 5543 md_probe, NULL, NULL); 5544 blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE, 5545 md_probe, NULL, NULL); 5546 5547 register_reboot_notifier(&md_notifier); 5548 raid_table_header = register_sysctl_table(raid_root_table, 1); 5549 5550 md_geninit(); 5551 return (0); 5552} 5553 5554 5555#ifndef MODULE 5556 5557/* 5558 * Searches all registered partitions for autorun RAID arrays 5559 * at boot time. 
5560 */ 5561static dev_t detected_devices[128]; 5562static int dev_cnt; 5563 5564void md_autodetect_dev(dev_t dev) 5565{ 5566 if (dev_cnt >= 0 && dev_cnt < 127) 5567 detected_devices[dev_cnt++] = dev; 5568} 5569 5570 5571static void autostart_arrays(int part) 5572{ 5573 mdk_rdev_t *rdev; 5574 int i; 5575 5576 printk(KERN_INFO "md: Autodetecting RAID arrays.\n"); 5577 5578 for (i = 0; i < dev_cnt; i++) { 5579 dev_t dev = detected_devices[i]; 5580 5581 rdev = md_import_device(dev,0, 0); 5582 if (IS_ERR(rdev)) 5583 continue; 5584 5585 if (test_bit(Faulty, &rdev->flags)) { 5586 MD_BUG(); 5587 continue; 5588 } 5589 list_add(&rdev->same_set, &pending_raid_disks); 5590 } 5591 dev_cnt = 0; 5592 5593 autorun_devices(part); 5594} 5595 5596#endif 5597 5598static __exit void md_exit(void) 5599{ 5600 mddev_t *mddev; 5601 struct list_head *tmp; 5602 5603 blk_unregister_region(MKDEV(MAJOR_NR,0), 1U << MINORBITS); 5604 blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS); 5605 5606 unregister_blkdev(MAJOR_NR,"md"); 5607 unregister_blkdev(mdp_major, "mdp"); 5608 unregister_reboot_notifier(&md_notifier); 5609 unregister_sysctl_table(raid_table_header); 5610 remove_proc_entry("mdstat", NULL); 5611 ITERATE_MDDEV(mddev,tmp) { 5612 struct gendisk *disk = mddev->gendisk; 5613 if (!disk) 5614 continue; 5615 export_array(mddev); 5616 del_gendisk(disk); 5617 put_disk(disk); 5618 mddev->gendisk = NULL; 5619 mddev_put(mddev); 5620 } 5621} 5622 5623module_init(md_init) 5624module_exit(md_exit) 5625 5626static int get_ro(char *buffer, struct kernel_param *kp) 5627{ 5628 return sprintf(buffer, "%d", start_readonly); 5629} 5630static int set_ro(const char *val, struct kernel_param *kp) 5631{ 5632 char *e; 5633 int num = simple_strtoul(val, &e, 10); 5634 if (*val && (*e == '\0' || *e == '\n')) { 5635 start_readonly = num; 5636 return 0; 5637 } 5638 return -EINVAL; 5639} 5640 5641module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR); 5642module_param(start_dirty_degraded, int, 
S_IRUGO|S_IWUSR); 5643 5644 5645EXPORT_SYMBOL(register_md_personality); 5646EXPORT_SYMBOL(unregister_md_personality); 5647EXPORT_SYMBOL(md_error); 5648EXPORT_SYMBOL(md_done_sync); 5649EXPORT_SYMBOL(md_write_start); 5650EXPORT_SYMBOL(md_write_end); 5651EXPORT_SYMBOL(md_register_thread); 5652EXPORT_SYMBOL(md_unregister_thread); 5653EXPORT_SYMBOL(md_wakeup_thread); 5654EXPORT_SYMBOL(md_check_recovery); 5655MODULE_LICENSE("GPL"); 5656MODULE_ALIAS("md"); 5657MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);