/*
   md.c : Multiple Devices driver for Linux
	  Copyright (C) 1998, 1999, 2000 Ingo Molnar

     completely rewritten, based on the MD driver code from Marc Zyngier

   Changes:

   - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
   - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
   - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
   - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
   - kmod support by: Cyrus Durgin
   - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
   - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>

   - lots of fixes and improvements to the RAID1/RAID5 and generic
     RAID code (such as request based resynchronization):

     Neil Brown <neilb@cse.unsw.edu.au>.

   - persistent bitmap code
     Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   You should have received a copy of the GNU General Public License
   (for example /usr/src/linux/COPYING); if not, write to the Free
   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/

#include <linux/module.h>
#include <linux/kthread.h>
#include <linux/linkage.h>
#include <linux/raid/md.h>
#include <linux/raid/bitmap.h>
#include <linux/sysctl.h>
#include <linux/buffer_head.h> /* for invalidate_bdev */
#include <linux/suspend.h>
#include <linux/poll.h>
#include <linux/mutex.h>
#include <linux/ctype.h>

#include <linux/init.h>

#include <linux/file.h>

#ifdef CONFIG_KMOD
#include <linux/kmod.h>
#endif

#include <asm/unaligned.h>

#define MAJOR_NR MD_MAJOR
#define MD_DRIVER

/* 63 partitions with the alternate major number (mdp) */
#define MdpMinorShift 6

#define DEBUG 0
#define dprintk(x...) ((void)(DEBUG && printk(x)))


#ifndef MODULE
static void autostart_arrays (int part);
#endif

static LIST_HEAD(pers_list);
static DEFINE_SPINLOCK(pers_lock);

static void md_print_devices(void);

#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }

/*
 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
 * is 1000 KB/sec, so the extra system load does not show up that much.
 * Increase it if you want to have more _guaranteed_ speed. Note that
 * the RAID driver will use the maximum available bandwidth if the IO
 * subsystem is idle. There is also an 'absolute maximum' reconstruction
 * speed limit - in case reconstruction slows down your system despite
 * idle IO detection.
 *
 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max,
 * or /sys/block/mdX/md/sync_speed_{min,max}
 */

static int sysctl_speed_limit_min = 1000;
static int sysctl_speed_limit_max = 200000;
static inline int speed_min(mddev_t *mddev)
{
	return mddev->sync_speed_min ?
		mddev->sync_speed_min : sysctl_speed_limit_min;
}

static inline int speed_max(mddev_t *mddev)
{
	return mddev->sync_speed_max ?
		mddev->sync_speed_max : sysctl_speed_limit_max;
}

static struct ctl_table_header *raid_table_header;

static ctl_table raid_table[] = {
	{
		.ctl_name	= DEV_RAID_SPEED_LIMIT_MIN,
		.procname	= "speed_limit_min",
		.data		= &sysctl_speed_limit_min,
		.maxlen		= sizeof(int),
		.mode		= S_IRUGO|S_IWUSR,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= DEV_RAID_SPEED_LIMIT_MAX,
		.procname	= "speed_limit_max",
		.data		= &sysctl_speed_limit_max,
		.maxlen		= sizeof(int),
		.mode		= S_IRUGO|S_IWUSR,
		.proc_handler	= &proc_dointvec,
	},
	{ .ctl_name = 0 }
};

static ctl_table raid_dir_table[] = {
	{
		.ctl_name	= DEV_RAID,
		.procname	= "raid",
		.maxlen		= 0,
		.mode		= S_IRUGO|S_IXUGO,
		.child		= raid_table,
	},
	{ .ctl_name = 0 }
};

static ctl_table raid_root_table[] = {
	{
		.ctl_name	= CTL_DEV,
		.procname	= "dev",
		.maxlen		= 0,
		.mode		= 0555,
		.child		= raid_dir_table,
	},
	{ .ctl_name = 0 }
};

static struct block_device_operations md_fops;

static int start_readonly;

/*
 * We have a system wide 'event count' that is incremented
 * on any 'interesting' event, and readers of /proc/mdstat
 * can use 'poll' or 'select' to find out when the event
 * count increases.
 *
 * Events are:
 *  start array, stop array, error, add device, remove device,
 *  start build, activate spare
 */
static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
static atomic_t md_event_count;
void md_new_event(mddev_t *mddev)
{
	atomic_inc(&md_event_count);
	wake_up(&md_event_waiters);
	sysfs_notify(&mddev->kobj, NULL, "sync_action");
}
EXPORT_SYMBOL_GPL(md_new_event);

/* Alternate version that can be called from interrupts
 * when calling sysfs_notify isn't needed.
 */
static void md_new_event_inintr(mddev_t *mddev)
{
	atomic_inc(&md_event_count);
	wake_up(&md_event_waiters);
}

/*
 * Enables iteration over all existing md arrays.
 * all_mddevs_lock protects this list.
 */
static LIST_HEAD(all_mddevs);
static DEFINE_SPINLOCK(all_mddevs_lock);


/*
 * iterates through all used mddevs in the system.
 * We take care to grab the all_mddevs_lock whenever navigating
 * the list, and to always hold a refcount when unlocked.
 * Any code which breaks out of this loop while owning
 * a reference to the current mddev must mddev_put it.
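 *
 * A minimal usage sketch (illustrative only; do_work() is a hypothetical
 * helper, not something defined in this file):
 *
 *	mddev_t *mddev;
 *	struct list_head *tmp;
 *	ITERATE_MDDEV(mddev,tmp)
 *		do_work(mddev);		/* a reference to mddev is held here */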
196 */ 197#define ITERATE_MDDEV(mddev,tmp) \ 198 \ 199 for (({ spin_lock(&all_mddevs_lock); \ 200 tmp = all_mddevs.next; \ 201 mddev = NULL;}); \ 202 ({ if (tmp != &all_mddevs) \ 203 mddev_get(list_entry(tmp, mddev_t, all_mddevs));\ 204 spin_unlock(&all_mddevs_lock); \ 205 if (mddev) mddev_put(mddev); \ 206 mddev = list_entry(tmp, mddev_t, all_mddevs); \ 207 tmp != &all_mddevs;}); \ 208 ({ spin_lock(&all_mddevs_lock); \ 209 tmp = tmp->next;}) \ 210 ) 211 212 213static int md_fail_request (request_queue_t *q, struct bio *bio) 214{ 215 bio_io_error(bio, bio->bi_size); 216 return 0; 217} 218 219static inline mddev_t *mddev_get(mddev_t *mddev) 220{ 221 atomic_inc(&mddev->active); 222 return mddev; 223} 224 225static void mddev_put(mddev_t *mddev) 226{ 227 if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) 228 return; 229 if (!mddev->raid_disks && list_empty(&mddev->disks)) { 230 list_del(&mddev->all_mddevs); 231 spin_unlock(&all_mddevs_lock); 232 blk_cleanup_queue(mddev->queue); 233 kobject_unregister(&mddev->kobj); 234 } else 235 spin_unlock(&all_mddevs_lock); 236} 237 238static mddev_t * mddev_find(dev_t unit) 239{ 240 mddev_t *mddev, *new = NULL; 241 242 retry: 243 spin_lock(&all_mddevs_lock); 244 list_for_each_entry(mddev, &all_mddevs, all_mddevs) 245 if (mddev->unit == unit) { 246 mddev_get(mddev); 247 spin_unlock(&all_mddevs_lock); 248 kfree(new); 249 return mddev; 250 } 251 252 if (new) { 253 list_add(&new->all_mddevs, &all_mddevs); 254 spin_unlock(&all_mddevs_lock); 255 return new; 256 } 257 spin_unlock(&all_mddevs_lock); 258 259 new = kzalloc(sizeof(*new), GFP_KERNEL); 260 if (!new) 261 return NULL; 262 263 new->unit = unit; 264 if (MAJOR(unit) == MD_MAJOR) 265 new->md_minor = MINOR(unit); 266 else 267 new->md_minor = MINOR(unit) >> MdpMinorShift; 268 269 mutex_init(&new->reconfig_mutex); 270 INIT_LIST_HEAD(&new->disks); 271 INIT_LIST_HEAD(&new->all_mddevs); 272 init_timer(&new->safemode_timer); 273 atomic_set(&new->active, 1); 274 spin_lock_init(&new->write_lock); 275 init_waitqueue_head(&new->sb_wait); 276 277 new->queue = blk_alloc_queue(GFP_KERNEL); 278 if (!new->queue) { 279 kfree(new); 280 return NULL; 281 } 282 set_bit(QUEUE_FLAG_CLUSTER, &new->queue->queue_flags); 283 284 blk_queue_make_request(new->queue, md_fail_request); 285 286 goto retry; 287} 288 289static inline int mddev_lock(mddev_t * mddev) 290{ 291 return mutex_lock_interruptible(&mddev->reconfig_mutex); 292} 293 294static inline int mddev_trylock(mddev_t * mddev) 295{ 296 return mutex_trylock(&mddev->reconfig_mutex); 297} 298 299static inline void mddev_unlock(mddev_t * mddev) 300{ 301 mutex_unlock(&mddev->reconfig_mutex); 302 303 md_wakeup_thread(mddev->thread); 304} 305 306static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr) 307{ 308 mdk_rdev_t * rdev; 309 struct list_head *tmp; 310 311 ITERATE_RDEV(mddev,rdev,tmp) { 312 if (rdev->desc_nr == nr) 313 return rdev; 314 } 315 return NULL; 316} 317 318static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev) 319{ 320 struct list_head *tmp; 321 mdk_rdev_t *rdev; 322 323 ITERATE_RDEV(mddev,rdev,tmp) { 324 if (rdev->bdev->bd_dev == dev) 325 return rdev; 326 } 327 return NULL; 328} 329 330static struct mdk_personality *find_pers(int level, char *clevel) 331{ 332 struct mdk_personality *pers; 333 list_for_each_entry(pers, &pers_list, list) { 334 if (level != LEVEL_NONE && pers->level == level) 335 return pers; 336 if (strcmp(pers->name, clevel)==0) 337 return pers; 338 } 339 return NULL; 340} 341 342static inline sector_t calc_dev_sboffset(struct block_device 
*bdev) 343{ 344 sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 345 return MD_NEW_SIZE_BLOCKS(size); 346} 347 348static sector_t calc_dev_size(mdk_rdev_t *rdev, unsigned chunk_size) 349{ 350 sector_t size; 351 352 size = rdev->sb_offset; 353 354 if (chunk_size) 355 size &= ~((sector_t)chunk_size/1024 - 1); 356 return size; 357} 358 359static int alloc_disk_sb(mdk_rdev_t * rdev) 360{ 361 if (rdev->sb_page) 362 MD_BUG(); 363 364 rdev->sb_page = alloc_page(GFP_KERNEL); 365 if (!rdev->sb_page) { 366 printk(KERN_ALERT "md: out of memory.\n"); 367 return -EINVAL; 368 } 369 370 return 0; 371} 372 373static void free_disk_sb(mdk_rdev_t * rdev) 374{ 375 if (rdev->sb_page) { 376 put_page(rdev->sb_page); 377 rdev->sb_loaded = 0; 378 rdev->sb_page = NULL; 379 rdev->sb_offset = 0; 380 rdev->size = 0; 381 } 382} 383 384 385static int super_written(struct bio *bio, unsigned int bytes_done, int error) 386{ 387 mdk_rdev_t *rdev = bio->bi_private; 388 mddev_t *mddev = rdev->mddev; 389 if (bio->bi_size) 390 return 1; 391 392 if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) { 393 printk("md: super_written gets error=%d, uptodate=%d\n", 394 error, test_bit(BIO_UPTODATE, &bio->bi_flags)); 395 WARN_ON(test_bit(BIO_UPTODATE, &bio->bi_flags)); 396 md_error(mddev, rdev); 397 } 398 399 if (atomic_dec_and_test(&mddev->pending_writes)) 400 wake_up(&mddev->sb_wait); 401 bio_put(bio); 402 return 0; 403} 404 405static int super_written_barrier(struct bio *bio, unsigned int bytes_done, int error) 406{ 407 struct bio *bio2 = bio->bi_private; 408 mdk_rdev_t *rdev = bio2->bi_private; 409 mddev_t *mddev = rdev->mddev; 410 if (bio->bi_size) 411 return 1; 412 413 if (!test_bit(BIO_UPTODATE, &bio->bi_flags) && 414 error == -EOPNOTSUPP) { 415 unsigned long flags; 416 /* barriers don't appear to be supported :-( */ 417 set_bit(BarriersNotsupp, &rdev->flags); 418 mddev->barriers_work = 0; 419 spin_lock_irqsave(&mddev->write_lock, flags); 420 bio2->bi_next = mddev->biolist; 421 mddev->biolist = bio2; 422 spin_unlock_irqrestore(&mddev->write_lock, flags); 423 wake_up(&mddev->sb_wait); 424 bio_put(bio); 425 return 0; 426 } 427 bio_put(bio2); 428 bio->bi_private = rdev; 429 return super_written(bio, bytes_done, error); 430} 431 432void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, 433 sector_t sector, int size, struct page *page) 434{ 435 /* write first size bytes of page to sector of rdev 436 * Increment mddev->pending_writes before returning 437 * and decrement it on completion, waking up sb_wait 438 * if zero is reached. 439 * If an error occurred, call md_error 440 * 441 * As we might need to resubmit the request if BIO_RW_BARRIER 442 * causes ENOTSUPP, we allocate a spare bio... 443 */ 444 struct bio *bio = bio_alloc(GFP_NOIO, 1); 445 int rw = (1<<BIO_RW) | (1<<BIO_RW_SYNC); 446 447 bio->bi_bdev = rdev->bdev; 448 bio->bi_sector = sector; 449 bio_add_page(bio, page, size, 0); 450 bio->bi_private = rdev; 451 bio->bi_end_io = super_written; 452 bio->bi_rw = rw; 453 454 atomic_inc(&mddev->pending_writes); 455 if (!test_bit(BarriersNotsupp, &rdev->flags)) { 456 struct bio *rbio; 457 rw |= (1<<BIO_RW_BARRIER); 458 rbio = bio_clone(bio, GFP_NOIO); 459 rbio->bi_private = bio; 460 rbio->bi_end_io = super_written_barrier; 461 submit_bio(rw, rbio); 462 } else 463 submit_bio(rw, bio); 464} 465 466void md_super_wait(mddev_t *mddev) 467{ 468 /* wait for all superblock writes that were scheduled to complete. 
469 * if any had to be retried (due to BARRIER problems), retry them 470 */ 471 DEFINE_WAIT(wq); 472 for(;;) { 473 prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE); 474 if (atomic_read(&mddev->pending_writes)==0) 475 break; 476 while (mddev->biolist) { 477 struct bio *bio; 478 spin_lock_irq(&mddev->write_lock); 479 bio = mddev->biolist; 480 mddev->biolist = bio->bi_next ; 481 bio->bi_next = NULL; 482 spin_unlock_irq(&mddev->write_lock); 483 submit_bio(bio->bi_rw, bio); 484 } 485 schedule(); 486 } 487 finish_wait(&mddev->sb_wait, &wq); 488} 489 490static int bi_complete(struct bio *bio, unsigned int bytes_done, int error) 491{ 492 if (bio->bi_size) 493 return 1; 494 495 complete((struct completion*)bio->bi_private); 496 return 0; 497} 498 499int sync_page_io(struct block_device *bdev, sector_t sector, int size, 500 struct page *page, int rw) 501{ 502 struct bio *bio = bio_alloc(GFP_NOIO, 1); 503 struct completion event; 504 int ret; 505 506 rw |= (1 << BIO_RW_SYNC); 507 508 bio->bi_bdev = bdev; 509 bio->bi_sector = sector; 510 bio_add_page(bio, page, size, 0); 511 init_completion(&event); 512 bio->bi_private = &event; 513 bio->bi_end_io = bi_complete; 514 submit_bio(rw, bio); 515 wait_for_completion(&event); 516 517 ret = test_bit(BIO_UPTODATE, &bio->bi_flags); 518 bio_put(bio); 519 return ret; 520} 521EXPORT_SYMBOL_GPL(sync_page_io); 522 523static int read_disk_sb(mdk_rdev_t * rdev, int size) 524{ 525 char b[BDEVNAME_SIZE]; 526 if (!rdev->sb_page) { 527 MD_BUG(); 528 return -EINVAL; 529 } 530 if (rdev->sb_loaded) 531 return 0; 532 533 534 if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, size, rdev->sb_page, READ)) 535 goto fail; 536 rdev->sb_loaded = 1; 537 return 0; 538 539fail: 540 printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n", 541 bdevname(rdev->bdev,b)); 542 return -EINVAL; 543} 544 545static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2) 546{ 547 if ( (sb1->set_uuid0 == sb2->set_uuid0) && 548 (sb1->set_uuid1 == sb2->set_uuid1) && 549 (sb1->set_uuid2 == sb2->set_uuid2) && 550 (sb1->set_uuid3 == sb2->set_uuid3)) 551 552 return 1; 553 554 return 0; 555} 556 557 558static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) 559{ 560 int ret; 561 mdp_super_t *tmp1, *tmp2; 562 563 tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL); 564 tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL); 565 566 if (!tmp1 || !tmp2) { 567 ret = 0; 568 printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n"); 569 goto abort; 570 } 571 572 *tmp1 = *sb1; 573 *tmp2 = *sb2; 574 575 /* 576 * nr_disks is not constant 577 */ 578 tmp1->nr_disks = 0; 579 tmp2->nr_disks = 0; 580 581 if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4)) 582 ret = 0; 583 else 584 ret = 1; 585 586abort: 587 kfree(tmp1); 588 kfree(tmp2); 589 return ret; 590} 591 592static unsigned int calc_sb_csum(mdp_super_t * sb) 593{ 594 unsigned int disk_csum, csum; 595 596 disk_csum = sb->sb_csum; 597 sb->sb_csum = 0; 598 csum = csum_partial((void *)sb, MD_SB_BYTES, 0); 599 sb->sb_csum = disk_csum; 600 return csum; 601} 602 603 604/* 605 * Handle superblock details. 606 * We want to be able to handle multiple superblock formats 607 * so we have a common interface to them all, and an array of 608 * different handlers. 609 * We rely on user-space to write the initial superblock, and support 610 * reading and updating of superblocks. 611 * Interface methods are: 612 * int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version) 613 * loads and validates a superblock on dev. 
614 * if refdev != NULL, compare superblocks on both devices 615 * Return: 616 * 0 - dev has a superblock that is compatible with refdev 617 * 1 - dev has a superblock that is compatible and newer than refdev 618 * so dev should be used as the refdev in future 619 * -EINVAL superblock incompatible or invalid 620 * -othererror e.g. -EIO 621 * 622 * int validate_super(mddev_t *mddev, mdk_rdev_t *dev) 623 * Verify that dev is acceptable into mddev. 624 * The first time, mddev->raid_disks will be 0, and data from 625 * dev should be merged in. Subsequent calls check that dev 626 * is new enough. Return 0 or -EINVAL 627 * 628 * void sync_super(mddev_t *mddev, mdk_rdev_t *dev) 629 * Update the superblock for rdev with data in mddev 630 * This does not write to disc. 631 * 632 */ 633 634struct super_type { 635 char *name; 636 struct module *owner; 637 int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version); 638 int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev); 639 void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev); 640}; 641 642/* 643 * load_super for 0.90.0 644 */ 645static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) 646{ 647 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 648 mdp_super_t *sb; 649 int ret; 650 sector_t sb_offset; 651 652 /* 653 * Calculate the position of the superblock, 654 * it's at the end of the disk. 655 * 656 * It also happens to be a multiple of 4Kb. 657 */ 658 sb_offset = calc_dev_sboffset(rdev->bdev); 659 rdev->sb_offset = sb_offset; 660 661 ret = read_disk_sb(rdev, MD_SB_BYTES); 662 if (ret) return ret; 663 664 ret = -EINVAL; 665 666 bdevname(rdev->bdev, b); 667 sb = (mdp_super_t*)page_address(rdev->sb_page); 668 669 if (sb->md_magic != MD_SB_MAGIC) { 670 printk(KERN_ERR "md: invalid raid superblock magic on %s\n", 671 b); 672 goto abort; 673 } 674 675 if (sb->major_version != 0 || 676 sb->minor_version < 90 || 677 sb->minor_version > 91) { 678 printk(KERN_WARNING "Bad version number %d.%d on %s\n", 679 sb->major_version, sb->minor_version, 680 b); 681 goto abort; 682 } 683 684 if (sb->raid_disks <= 0) 685 goto abort; 686 687 if (csum_fold(calc_sb_csum(sb)) != csum_fold(sb->sb_csum)) { 688 printk(KERN_WARNING "md: invalid superblock checksum on %s\n", 689 b); 690 goto abort; 691 } 692 693 rdev->preferred_minor = sb->md_minor; 694 rdev->data_offset = 0; 695 rdev->sb_size = MD_SB_BYTES; 696 697 if (sb->level == LEVEL_MULTIPATH) 698 rdev->desc_nr = -1; 699 else 700 rdev->desc_nr = sb->this_disk.number; 701 702 if (refdev == 0) 703 ret = 1; 704 else { 705 __u64 ev1, ev2; 706 mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page); 707 if (!uuid_equal(refsb, sb)) { 708 printk(KERN_WARNING "md: %s has different UUID to %s\n", 709 b, bdevname(refdev->bdev,b2)); 710 goto abort; 711 } 712 if (!sb_equal(refsb, sb)) { 713 printk(KERN_WARNING "md: %s has same UUID" 714 " but different superblock to %s\n", 715 b, bdevname(refdev->bdev, b2)); 716 goto abort; 717 } 718 ev1 = md_event(sb); 719 ev2 = md_event(refsb); 720 if (ev1 > ev2) 721 ret = 1; 722 else 723 ret = 0; 724 } 725 rdev->size = calc_dev_size(rdev, sb->chunk_size); 726 727 if (rdev->size < sb->size && sb->level > 1) 728 /* "this cannot possibly happen" ... 
*/ 729 ret = -EINVAL; 730 731 abort: 732 return ret; 733} 734 735/* 736 * validate_super for 0.90.0 737 */ 738static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) 739{ 740 mdp_disk_t *desc; 741 mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page); 742 __u64 ev1 = md_event(sb); 743 744 rdev->raid_disk = -1; 745 rdev->flags = 0; 746 if (mddev->raid_disks == 0) { 747 mddev->major_version = 0; 748 mddev->minor_version = sb->minor_version; 749 mddev->patch_version = sb->patch_version; 750 mddev->persistent = ! sb->not_persistent; 751 mddev->chunk_size = sb->chunk_size; 752 mddev->ctime = sb->ctime; 753 mddev->utime = sb->utime; 754 mddev->level = sb->level; 755 mddev->clevel[0] = 0; 756 mddev->layout = sb->layout; 757 mddev->raid_disks = sb->raid_disks; 758 mddev->size = sb->size; 759 mddev->events = ev1; 760 mddev->bitmap_offset = 0; 761 mddev->default_bitmap_offset = MD_SB_BYTES >> 9; 762 763 if (mddev->minor_version >= 91) { 764 mddev->reshape_position = sb->reshape_position; 765 mddev->delta_disks = sb->delta_disks; 766 mddev->new_level = sb->new_level; 767 mddev->new_layout = sb->new_layout; 768 mddev->new_chunk = sb->new_chunk; 769 } else { 770 mddev->reshape_position = MaxSector; 771 mddev->delta_disks = 0; 772 mddev->new_level = mddev->level; 773 mddev->new_layout = mddev->layout; 774 mddev->new_chunk = mddev->chunk_size; 775 } 776 777 if (sb->state & (1<<MD_SB_CLEAN)) 778 mddev->recovery_cp = MaxSector; 779 else { 780 if (sb->events_hi == sb->cp_events_hi && 781 sb->events_lo == sb->cp_events_lo) { 782 mddev->recovery_cp = sb->recovery_cp; 783 } else 784 mddev->recovery_cp = 0; 785 } 786 787 memcpy(mddev->uuid+0, &sb->set_uuid0, 4); 788 memcpy(mddev->uuid+4, &sb->set_uuid1, 4); 789 memcpy(mddev->uuid+8, &sb->set_uuid2, 4); 790 memcpy(mddev->uuid+12,&sb->set_uuid3, 4); 791 792 mddev->max_disks = MD_SB_DISKS; 793 794 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && 795 mddev->bitmap_file == NULL) { 796 if (mddev->level != 1 && mddev->level != 4 797 && mddev->level != 5 && mddev->level != 6 798 && mddev->level != 10) { 799 /* FIXME use a better test */ 800 printk(KERN_WARNING "md: bitmaps not supported for this level.\n"); 801 return -EINVAL; 802 } 803 mddev->bitmap_offset = mddev->default_bitmap_offset; 804 } 805 806 } else if (mddev->pers == NULL) { 807 /* Insist on good event counter while assembling */ 808 ++ev1; 809 if (ev1 < mddev->events) 810 return -EINVAL; 811 } else if (mddev->bitmap) { 812 /* if adding to array with a bitmap, then we can accept an 813 * older device ... but not too old. 814 */ 815 if (ev1 < mddev->bitmap->events_cleared) 816 return 0; 817 } else { 818 if (ev1 < mddev->events) 819 /* just a hot-add of a new device, leave raid_disk at -1 */ 820 return 0; 821 } 822 823 if (mddev->level != LEVEL_MULTIPATH) { 824 desc = sb->disks + rdev->desc_nr; 825 826 if (desc->state & (1<<MD_DISK_FAULTY)) 827 set_bit(Faulty, &rdev->flags); 828 else if (desc->state & (1<<MD_DISK_SYNC) /* && 829 desc->raid_disk < mddev->raid_disks */) { 830 set_bit(In_sync, &rdev->flags); 831 rdev->raid_disk = desc->raid_disk; 832 } 833 if (desc->state & (1<<MD_DISK_WRITEMOSTLY)) 834 set_bit(WriteMostly, &rdev->flags); 835 } else /* MULTIPATH are always insync */ 836 set_bit(In_sync, &rdev->flags); 837 return 0; 838} 839 840/* 841 * sync_super for 0.90.0 842 */ 843static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) 844{ 845 mdp_super_t *sb; 846 struct list_head *tmp; 847 mdk_rdev_t *rdev2; 848 int next_spare = mddev->raid_disks; 849 850 851 /* make rdev->sb match mddev data.. 
852 * 853 * 1/ zero out disks 854 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare); 855 * 3/ any empty disks < next_spare become removed 856 * 857 * disks[0] gets initialised to REMOVED because 858 * we cannot be sure from other fields if it has 859 * been initialised or not. 860 */ 861 int i; 862 int active=0, working=0,failed=0,spare=0,nr_disks=0; 863 864 rdev->sb_size = MD_SB_BYTES; 865 866 sb = (mdp_super_t*)page_address(rdev->sb_page); 867 868 memset(sb, 0, sizeof(*sb)); 869 870 sb->md_magic = MD_SB_MAGIC; 871 sb->major_version = mddev->major_version; 872 sb->patch_version = mddev->patch_version; 873 sb->gvalid_words = 0; /* ignored */ 874 memcpy(&sb->set_uuid0, mddev->uuid+0, 4); 875 memcpy(&sb->set_uuid1, mddev->uuid+4, 4); 876 memcpy(&sb->set_uuid2, mddev->uuid+8, 4); 877 memcpy(&sb->set_uuid3, mddev->uuid+12,4); 878 879 sb->ctime = mddev->ctime; 880 sb->level = mddev->level; 881 sb->size = mddev->size; 882 sb->raid_disks = mddev->raid_disks; 883 sb->md_minor = mddev->md_minor; 884 sb->not_persistent = !mddev->persistent; 885 sb->utime = mddev->utime; 886 sb->state = 0; 887 sb->events_hi = (mddev->events>>32); 888 sb->events_lo = (u32)mddev->events; 889 890 if (mddev->reshape_position == MaxSector) 891 sb->minor_version = 90; 892 else { 893 sb->minor_version = 91; 894 sb->reshape_position = mddev->reshape_position; 895 sb->new_level = mddev->new_level; 896 sb->delta_disks = mddev->delta_disks; 897 sb->new_layout = mddev->new_layout; 898 sb->new_chunk = mddev->new_chunk; 899 } 900 mddev->minor_version = sb->minor_version; 901 if (mddev->in_sync) 902 { 903 sb->recovery_cp = mddev->recovery_cp; 904 sb->cp_events_hi = (mddev->events>>32); 905 sb->cp_events_lo = (u32)mddev->events; 906 if (mddev->recovery_cp == MaxSector) 907 sb->state = (1<< MD_SB_CLEAN); 908 } else 909 sb->recovery_cp = 0; 910 911 sb->layout = mddev->layout; 912 sb->chunk_size = mddev->chunk_size; 913 914 if (mddev->bitmap && mddev->bitmap_file == NULL) 915 sb->state |= (1<<MD_SB_BITMAP_PRESENT); 916 917 sb->disks[0].state = (1<<MD_DISK_REMOVED); 918 ITERATE_RDEV(mddev,rdev2,tmp) { 919 mdp_disk_t *d; 920 int desc_nr; 921 if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags) 922 && !test_bit(Faulty, &rdev2->flags)) 923 desc_nr = rdev2->raid_disk; 924 else 925 desc_nr = next_spare++; 926 rdev2->desc_nr = desc_nr; 927 d = &sb->disks[rdev2->desc_nr]; 928 nr_disks++; 929 d->number = rdev2->desc_nr; 930 d->major = MAJOR(rdev2->bdev->bd_dev); 931 d->minor = MINOR(rdev2->bdev->bd_dev); 932 if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags) 933 && !test_bit(Faulty, &rdev2->flags)) 934 d->raid_disk = rdev2->raid_disk; 935 else 936 d->raid_disk = rdev2->desc_nr; /* compatibility */ 937 if (test_bit(Faulty, &rdev2->flags)) 938 d->state = (1<<MD_DISK_FAULTY); 939 else if (test_bit(In_sync, &rdev2->flags)) { 940 d->state = (1<<MD_DISK_ACTIVE); 941 d->state |= (1<<MD_DISK_SYNC); 942 active++; 943 working++; 944 } else { 945 d->state = 0; 946 spare++; 947 working++; 948 } 949 if (test_bit(WriteMostly, &rdev2->flags)) 950 d->state |= (1<<MD_DISK_WRITEMOSTLY); 951 } 952 /* now set the "removed" and "faulty" bits on any missing devices */ 953 for (i=0 ; i < mddev->raid_disks ; i++) { 954 mdp_disk_t *d = &sb->disks[i]; 955 if (d->state == 0 && d->number == 0) { 956 d->number = i; 957 d->raid_disk = i; 958 d->state = (1<<MD_DISK_REMOVED); 959 d->state |= (1<<MD_DISK_FAULTY); 960 failed++; 961 } 962 } 963 sb->nr_disks = nr_disks; 964 sb->active_disks = active; 965 sb->working_disks = working; 
966 sb->failed_disks = failed; 967 sb->spare_disks = spare; 968 969 sb->this_disk = sb->disks[rdev->desc_nr]; 970 sb->sb_csum = calc_sb_csum(sb); 971} 972 973/* 974 * version 1 superblock 975 */ 976 977static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb) 978{ 979 unsigned int disk_csum, csum; 980 unsigned long long newcsum; 981 int size = 256 + le32_to_cpu(sb->max_dev)*2; 982 unsigned int *isuper = (unsigned int*)sb; 983 int i; 984 985 disk_csum = sb->sb_csum; 986 sb->sb_csum = 0; 987 newcsum = 0; 988 for (i=0; size>=4; size -= 4 ) 989 newcsum += le32_to_cpu(*isuper++); 990 991 if (size == 2) 992 newcsum += le16_to_cpu(*(unsigned short*) isuper); 993 994 csum = (newcsum & 0xffffffff) + (newcsum >> 32); 995 sb->sb_csum = disk_csum; 996 return cpu_to_le32(csum); 997} 998 999static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) 1000{ 1001 struct mdp_superblock_1 *sb; 1002 int ret; 1003 sector_t sb_offset; 1004 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 1005 int bmask; 1006 1007 /* 1008 * Calculate the position of the superblock. 1009 * It is always aligned to a 4K boundary and 1010 * depeding on minor_version, it can be: 1011 * 0: At least 8K, but less than 12K, from end of device 1012 * 1: At start of device 1013 * 2: 4K from start of device. 1014 */ 1015 switch(minor_version) { 1016 case 0: 1017 sb_offset = rdev->bdev->bd_inode->i_size >> 9; 1018 sb_offset -= 8*2; 1019 sb_offset &= ~(sector_t)(4*2-1); 1020 /* convert from sectors to K */ 1021 sb_offset /= 2; 1022 break; 1023 case 1: 1024 sb_offset = 0; 1025 break; 1026 case 2: 1027 sb_offset = 4; 1028 break; 1029 default: 1030 return -EINVAL; 1031 } 1032 rdev->sb_offset = sb_offset; 1033 1034 /* superblock is rarely larger than 1K, but it can be larger, 1035 * and it is safe to read 4k, so we do that 1036 */ 1037 ret = read_disk_sb(rdev, 4096); 1038 if (ret) return ret; 1039 1040 1041 sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 1042 1043 if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || 1044 sb->major_version != cpu_to_le32(1) || 1045 le32_to_cpu(sb->max_dev) > (4096-256)/2 || 1046 le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) || 1047 (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0) 1048 return -EINVAL; 1049 1050 if (calc_sb_1_csum(sb) != sb->sb_csum) { 1051 printk("md: invalid superblock checksum on %s\n", 1052 bdevname(rdev->bdev,b)); 1053 return -EINVAL; 1054 } 1055 if (le64_to_cpu(sb->data_size) < 10) { 1056 printk("md: data_size too small on %s\n", 1057 bdevname(rdev->bdev,b)); 1058 return -EINVAL; 1059 } 1060 rdev->preferred_minor = 0xffff; 1061 rdev->data_offset = le64_to_cpu(sb->data_offset); 1062 atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read)); 1063 1064 rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; 1065 bmask = queue_hardsect_size(rdev->bdev->bd_disk->queue)-1; 1066 if (rdev->sb_size & bmask) 1067 rdev-> sb_size = (rdev->sb_size | bmask)+1; 1068 1069 if (sb->level == cpu_to_le32(LEVEL_MULTIPATH)) 1070 rdev->desc_nr = -1; 1071 else 1072 rdev->desc_nr = le32_to_cpu(sb->dev_number); 1073 1074 if (refdev == 0) 1075 ret = 1; 1076 else { 1077 __u64 ev1, ev2; 1078 struct mdp_superblock_1 *refsb = 1079 (struct mdp_superblock_1*)page_address(refdev->sb_page); 1080 1081 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 || 1082 sb->level != refsb->level || 1083 sb->layout != refsb->layout || 1084 sb->chunksize != refsb->chunksize) { 1085 printk(KERN_WARNING "md: %s has strangely different" 1086 " superblock to %s\n", 1087 bdevname(rdev->bdev,b), 
					bdevname(refdev->bdev,b2));
			return -EINVAL;
		}
		ev1 = le64_to_cpu(sb->events);
		ev2 = le64_to_cpu(refsb->events);

		if (ev1 > ev2)
			ret = 1;
		else
			ret = 0;
	}
	if (minor_version)
		rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2;
	else
		rdev->size = rdev->sb_offset;
	if (rdev->size < le64_to_cpu(sb->data_size)/2)
		return -EINVAL;
	rdev->size = le64_to_cpu(sb->data_size)/2;
	if (le32_to_cpu(sb->chunksize))
		rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1);

	if (le32_to_cpu(sb->size) > rdev->size*2)
		return -EINVAL;
	return ret;
}

static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
{
	struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
	__u64 ev1 = le64_to_cpu(sb->events);

	rdev->raid_disk = -1;
	rdev->flags = 0;
	if (mddev->raid_disks == 0) {
		mddev->major_version = 1;
		mddev->patch_version = 0;
		mddev->persistent = 1;
		mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9;
		mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
		mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
		mddev->level = le32_to_cpu(sb->level);
		mddev->clevel[0] = 0;
		mddev->layout = le32_to_cpu(sb->layout);
		mddev->raid_disks = le32_to_cpu(sb->raid_disks);
		mddev->size = le64_to_cpu(sb->size)/2;
		mddev->events = ev1;
		mddev->bitmap_offset = 0;
		mddev->default_bitmap_offset = 1024 >> 9;

		mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
		memcpy(mddev->uuid, sb->set_uuid, 16);

		mddev->max_disks = (4096-256)/2;

		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
		    mddev->bitmap_file == NULL ) {
			if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6
			    && mddev->level != 10) {
				printk(KERN_WARNING "md: bitmaps not supported for this level.\n");
				return -EINVAL;
			}
			mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset);
		}
		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
			mddev->reshape_position = le64_to_cpu(sb->reshape_position);
			mddev->delta_disks = le32_to_cpu(sb->delta_disks);
			mddev->new_level = le32_to_cpu(sb->new_level);
			mddev->new_layout = le32_to_cpu(sb->new_layout);
			mddev->new_chunk = le32_to_cpu(sb->new_chunk)<<9;
		} else {
			mddev->reshape_position = MaxSector;
			mddev->delta_disks = 0;
			mddev->new_level = mddev->level;
			mddev->new_layout = mddev->layout;
			mddev->new_chunk = mddev->chunk_size;
		}

	} else if (mddev->pers == NULL) {
		/* Insist on good event counter while assembling */
		++ev1;
		if (ev1 < mddev->events)
			return -EINVAL;
	} else if (mddev->bitmap) {
		/* If adding to array with a bitmap, then we can accept an
		 * older device, but not too old.
1173 */ 1174 if (ev1 < mddev->bitmap->events_cleared) 1175 return 0; 1176 } else { 1177 if (ev1 < mddev->events) 1178 /* just a hot-add of a new device, leave raid_disk at -1 */ 1179 return 0; 1180 } 1181 if (mddev->level != LEVEL_MULTIPATH) { 1182 int role; 1183 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); 1184 switch(role) { 1185 case 0xffff: /* spare */ 1186 break; 1187 case 0xfffe: /* faulty */ 1188 set_bit(Faulty, &rdev->flags); 1189 break; 1190 default: 1191 if ((le32_to_cpu(sb->feature_map) & 1192 MD_FEATURE_RECOVERY_OFFSET)) 1193 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); 1194 else 1195 set_bit(In_sync, &rdev->flags); 1196 rdev->raid_disk = role; 1197 break; 1198 } 1199 if (sb->devflags & WriteMostly1) 1200 set_bit(WriteMostly, &rdev->flags); 1201 } else /* MULTIPATH are always insync */ 1202 set_bit(In_sync, &rdev->flags); 1203 1204 return 0; 1205} 1206 1207static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) 1208{ 1209 struct mdp_superblock_1 *sb; 1210 struct list_head *tmp; 1211 mdk_rdev_t *rdev2; 1212 int max_dev, i; 1213 /* make rdev->sb match mddev and rdev data. */ 1214 1215 sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 1216 1217 sb->feature_map = 0; 1218 sb->pad0 = 0; 1219 sb->recovery_offset = cpu_to_le64(0); 1220 memset(sb->pad1, 0, sizeof(sb->pad1)); 1221 memset(sb->pad2, 0, sizeof(sb->pad2)); 1222 memset(sb->pad3, 0, sizeof(sb->pad3)); 1223 1224 sb->utime = cpu_to_le64((__u64)mddev->utime); 1225 sb->events = cpu_to_le64(mddev->events); 1226 if (mddev->in_sync) 1227 sb->resync_offset = cpu_to_le64(mddev->recovery_cp); 1228 else 1229 sb->resync_offset = cpu_to_le64(0); 1230 1231 sb->cnt_corrected_read = atomic_read(&rdev->corrected_errors); 1232 1233 sb->raid_disks = cpu_to_le32(mddev->raid_disks); 1234 sb->size = cpu_to_le64(mddev->size<<1); 1235 1236 if (mddev->bitmap && mddev->bitmap_file == NULL) { 1237 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset); 1238 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); 1239 } 1240 1241 if (rdev->raid_disk >= 0 && 1242 !test_bit(In_sync, &rdev->flags) && 1243 rdev->recovery_offset > 0) { 1244 sb->feature_map |= cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); 1245 sb->recovery_offset = cpu_to_le64(rdev->recovery_offset); 1246 } 1247 1248 if (mddev->reshape_position != MaxSector) { 1249 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); 1250 sb->reshape_position = cpu_to_le64(mddev->reshape_position); 1251 sb->new_layout = cpu_to_le32(mddev->new_layout); 1252 sb->delta_disks = cpu_to_le32(mddev->delta_disks); 1253 sb->new_level = cpu_to_le32(mddev->new_level); 1254 sb->new_chunk = cpu_to_le32(mddev->new_chunk>>9); 1255 } 1256 1257 max_dev = 0; 1258 ITERATE_RDEV(mddev,rdev2,tmp) 1259 if (rdev2->desc_nr+1 > max_dev) 1260 max_dev = rdev2->desc_nr+1; 1261 1262 sb->max_dev = cpu_to_le32(max_dev); 1263 for (i=0; i<max_dev;i++) 1264 sb->dev_roles[i] = cpu_to_le16(0xfffe); 1265 1266 ITERATE_RDEV(mddev,rdev2,tmp) { 1267 i = rdev2->desc_nr; 1268 if (test_bit(Faulty, &rdev2->flags)) 1269 sb->dev_roles[i] = cpu_to_le16(0xfffe); 1270 else if (test_bit(In_sync, &rdev2->flags)) 1271 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 1272 else if (rdev2->raid_disk >= 0 && rdev2->recovery_offset > 0) 1273 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 1274 else 1275 sb->dev_roles[i] = cpu_to_le16(0xffff); 1276 } 1277 1278 sb->sb_csum = calc_sb_1_csum(sb); 1279} 1280 1281 1282static struct super_type super_types[] = { 1283 [0] = { 1284 .name = "0.90.0", 1285 .owner = THIS_MODULE, 1286 
		.load_super	= super_90_load,
		.validate_super	= super_90_validate,
		.sync_super	= super_90_sync,
	},
	[1] = {
		.name	= "md-1",
		.owner	= THIS_MODULE,
		.load_super	= super_1_load,
		.validate_super	= super_1_validate,
		.sync_super	= super_1_sync,
	},
};

static mdk_rdev_t * match_dev_unit(mddev_t *mddev, mdk_rdev_t *dev)
{
	struct list_head *tmp;
	mdk_rdev_t *rdev;

	ITERATE_RDEV(mddev,rdev,tmp)
		if (rdev->bdev->bd_contains == dev->bdev->bd_contains)
			return rdev;

	return NULL;
}

static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
{
	struct list_head *tmp;
	mdk_rdev_t *rdev;

	ITERATE_RDEV(mddev1,rdev,tmp)
		if (match_dev_unit(mddev2, rdev))
			return 1;

	return 0;
}

static LIST_HEAD(pending_raid_disks);

static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
{
	mdk_rdev_t *same_pdev;
	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
	struct kobject *ko;
	char *s;

	if (rdev->mddev) {
		MD_BUG();
		return -EINVAL;
	}
	/* make sure rdev->size exceeds mddev->size */
	if (rdev->size && (mddev->size == 0 || rdev->size < mddev->size)) {
		if (mddev->pers)
			/* Cannot change size, so fail */
			return -ENOSPC;
		else
			mddev->size = rdev->size;
	}
	same_pdev = match_dev_unit(mddev, rdev);
	if (same_pdev)
		printk(KERN_WARNING
			"%s: WARNING: %s appears to be on the same physical"
			" disk as %s. True protection against single-disk"
			" failure might be compromised.\n",
			mdname(mddev), bdevname(rdev->bdev,b),
			bdevname(same_pdev->bdev,b2));

	/* Verify rdev->desc_nr is unique.
	 * If it is -1, assign a free number, else
	 * check number is not in use
	 */
	if (rdev->desc_nr < 0) {
		int choice = 0;
		if (mddev->pers) choice = mddev->raid_disks;
		while (find_rdev_nr(mddev, choice))
			choice++;
		rdev->desc_nr = choice;
	} else {
		if (find_rdev_nr(mddev, rdev->desc_nr))
			return -EBUSY;
	}
	bdevname(rdev->bdev,b);
	if (kobject_set_name(&rdev->kobj, "dev-%s", b) < 0)
		return -ENOMEM;
	while ( (s=strchr(rdev->kobj.k_name, '/')) != NULL)
		*s = '!';

	list_add(&rdev->same_set, &mddev->disks);
	rdev->mddev = mddev;
	printk(KERN_INFO "md: bind<%s>\n", b);

	rdev->kobj.parent = &mddev->kobj;
	kobject_add(&rdev->kobj);

	if (rdev->bdev->bd_part)
		ko = &rdev->bdev->bd_part->kobj;
	else
		ko = &rdev->bdev->bd_disk->kobj;
	sysfs_create_link(&rdev->kobj, ko, "block");
	bd_claim_by_disk(rdev->bdev, rdev, mddev->gendisk);
	return 0;
}

static void unbind_rdev_from_array(mdk_rdev_t * rdev)
{
	char b[BDEVNAME_SIZE];
	if (!rdev->mddev) {
		MD_BUG();
		return;
	}
	bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk);
	list_del_init(&rdev->same_set);
	printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
	rdev->mddev = NULL;
	sysfs_remove_link(&rdev->kobj, "block");
	kobject_del(&rdev->kobj);
}

/*
 * prevent the device from being mounted, repartitioned or
 * otherwise reused by a RAID array (or any other kernel
 * subsystem), by bd_claiming the device.
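 *
 * (Sketch of the mechanism, not a statement of the full locking rules:
 *  bd_claim() records the rdev as the exclusive holder of the block
 *  device, so later attempts to claim it - e.g. by mount - fail with
 *  -EBUSY until we bd_release() it in unlock_rdev() below.)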
1408 */ 1409static int lock_rdev(mdk_rdev_t *rdev, dev_t dev) 1410{ 1411 int err = 0; 1412 struct block_device *bdev; 1413 char b[BDEVNAME_SIZE]; 1414 1415 bdev = open_partition_by_devnum(dev, FMODE_READ|FMODE_WRITE); 1416 if (IS_ERR(bdev)) { 1417 printk(KERN_ERR "md: could not open %s.\n", 1418 __bdevname(dev, b)); 1419 return PTR_ERR(bdev); 1420 } 1421 err = bd_claim(bdev, rdev); 1422 if (err) { 1423 printk(KERN_ERR "md: could not bd_claim %s.\n", 1424 bdevname(bdev, b)); 1425 blkdev_put_partition(bdev); 1426 return err; 1427 } 1428 rdev->bdev = bdev; 1429 return err; 1430} 1431 1432static void unlock_rdev(mdk_rdev_t *rdev) 1433{ 1434 struct block_device *bdev = rdev->bdev; 1435 rdev->bdev = NULL; 1436 if (!bdev) 1437 MD_BUG(); 1438 bd_release(bdev); 1439 blkdev_put_partition(bdev); 1440} 1441 1442void md_autodetect_dev(dev_t dev); 1443 1444static void export_rdev(mdk_rdev_t * rdev) 1445{ 1446 char b[BDEVNAME_SIZE]; 1447 printk(KERN_INFO "md: export_rdev(%s)\n", 1448 bdevname(rdev->bdev,b)); 1449 if (rdev->mddev) 1450 MD_BUG(); 1451 free_disk_sb(rdev); 1452 list_del_init(&rdev->same_set); 1453#ifndef MODULE 1454 md_autodetect_dev(rdev->bdev->bd_dev); 1455#endif 1456 unlock_rdev(rdev); 1457 kobject_put(&rdev->kobj); 1458} 1459 1460static void kick_rdev_from_array(mdk_rdev_t * rdev) 1461{ 1462 unbind_rdev_from_array(rdev); 1463 export_rdev(rdev); 1464} 1465 1466static void export_array(mddev_t *mddev) 1467{ 1468 struct list_head *tmp; 1469 mdk_rdev_t *rdev; 1470 1471 ITERATE_RDEV(mddev,rdev,tmp) { 1472 if (!rdev->mddev) { 1473 MD_BUG(); 1474 continue; 1475 } 1476 kick_rdev_from_array(rdev); 1477 } 1478 if (!list_empty(&mddev->disks)) 1479 MD_BUG(); 1480 mddev->raid_disks = 0; 1481 mddev->major_version = 0; 1482} 1483 1484static void print_desc(mdp_disk_t *desc) 1485{ 1486 printk(" DISK<N:%d,(%d,%d),R:%d,S:%d>\n", desc->number, 1487 desc->major,desc->minor,desc->raid_disk,desc->state); 1488} 1489 1490static void print_sb(mdp_super_t *sb) 1491{ 1492 int i; 1493 1494 printk(KERN_INFO 1495 "md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n", 1496 sb->major_version, sb->minor_version, sb->patch_version, 1497 sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3, 1498 sb->ctime); 1499 printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", 1500 sb->level, sb->size, sb->nr_disks, sb->raid_disks, 1501 sb->md_minor, sb->layout, sb->chunk_size); 1502 printk(KERN_INFO "md: UT:%08x ST:%d AD:%d WD:%d" 1503 " FD:%d SD:%d CSUM:%08x E:%08lx\n", 1504 sb->utime, sb->state, sb->active_disks, sb->working_disks, 1505 sb->failed_disks, sb->spare_disks, 1506 sb->sb_csum, (unsigned long)sb->events_lo); 1507 1508 printk(KERN_INFO); 1509 for (i = 0; i < MD_SB_DISKS; i++) { 1510 mdp_disk_t *desc; 1511 1512 desc = sb->disks + i; 1513 if (desc->number || desc->major || desc->minor || 1514 desc->raid_disk || (desc->state && (desc->state != 4))) { 1515 printk(" D %2d: ", i); 1516 print_desc(desc); 1517 } 1518 } 1519 printk(KERN_INFO "md: THIS: "); 1520 print_desc(&sb->this_disk); 1521 1522} 1523 1524static void print_rdev(mdk_rdev_t *rdev) 1525{ 1526 char b[BDEVNAME_SIZE]; 1527 printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%u\n", 1528 bdevname(rdev->bdev,b), (unsigned long long)rdev->size, 1529 test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags), 1530 rdev->desc_nr); 1531 if (rdev->sb_loaded) { 1532 printk(KERN_INFO "md: rdev superblock:\n"); 1533 print_sb((mdp_super_t*)page_address(rdev->sb_page)); 1534 } else 1535 printk(KERN_INFO "md: no rdev superblock!\n"); 1536} 1537 
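
/*
 * Dump the state of every array and member device to the kernel log.
 * This is the helper invoked by the MD_BUG() macro near the top of this
 * file; it exists purely as a debugging aid.
 */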
1538static void md_print_devices(void) 1539{ 1540 struct list_head *tmp, *tmp2; 1541 mdk_rdev_t *rdev; 1542 mddev_t *mddev; 1543 char b[BDEVNAME_SIZE]; 1544 1545 printk("\n"); 1546 printk("md: **********************************\n"); 1547 printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n"); 1548 printk("md: **********************************\n"); 1549 ITERATE_MDDEV(mddev,tmp) { 1550 1551 if (mddev->bitmap) 1552 bitmap_print_sb(mddev->bitmap); 1553 else 1554 printk("%s: ", mdname(mddev)); 1555 ITERATE_RDEV(mddev,rdev,tmp2) 1556 printk("<%s>", bdevname(rdev->bdev,b)); 1557 printk("\n"); 1558 1559 ITERATE_RDEV(mddev,rdev,tmp2) 1560 print_rdev(rdev); 1561 } 1562 printk("md: **********************************\n"); 1563 printk("\n"); 1564} 1565 1566 1567static void sync_sbs(mddev_t * mddev, int nospares) 1568{ 1569 /* Update each superblock (in-memory image), but 1570 * if we are allowed to, skip spares which already 1571 * have the right event counter, or have one earlier 1572 * (which would mean they aren't being marked as dirty 1573 * with the rest of the array) 1574 */ 1575 mdk_rdev_t *rdev; 1576 struct list_head *tmp; 1577 1578 ITERATE_RDEV(mddev,rdev,tmp) { 1579 if (rdev->sb_events == mddev->events || 1580 (nospares && 1581 rdev->raid_disk < 0 && 1582 (rdev->sb_events&1)==0 && 1583 rdev->sb_events+1 == mddev->events)) { 1584 /* Don't update this superblock */ 1585 rdev->sb_loaded = 2; 1586 } else { 1587 super_types[mddev->major_version]. 1588 sync_super(mddev, rdev); 1589 rdev->sb_loaded = 1; 1590 } 1591 } 1592} 1593 1594static void md_update_sb(mddev_t * mddev, int force_change) 1595{ 1596 int err; 1597 struct list_head *tmp; 1598 mdk_rdev_t *rdev; 1599 int sync_req; 1600 int nospares = 0; 1601 1602repeat: 1603 spin_lock_irq(&mddev->write_lock); 1604 1605 set_bit(MD_CHANGE_PENDING, &mddev->flags); 1606 if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags)) 1607 force_change = 1; 1608 if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags)) 1609 /* just a clean<-> dirty transition, possibly leave spares alone, 1610 * though if events isn't the right even/odd, we will have to do 1611 * spares after all 1612 */ 1613 nospares = 1; 1614 if (force_change) 1615 nospares = 0; 1616 if (mddev->degraded) 1617 /* If the array is degraded, then skipping spares is both 1618 * dangerous and fairly pointless. 1619 * Dangerous because a device that was removed from the array 1620 * might have a event_count that still looks up-to-date, 1621 * so it can be re-added without a resync. 1622 * Pointless because if there are any spares to skip, 1623 * then a recovery will happen and soon that array won't 1624 * be degraded any more and the spare can go back to sleep then. 1625 */ 1626 nospares = 0; 1627 1628 sync_req = mddev->in_sync; 1629 mddev->utime = get_seconds(); 1630 1631 /* If this is just a dirty<->clean transition, and the array is clean 1632 * and 'events' is odd, we can roll back to the previous clean state */ 1633 if (nospares 1634 && (mddev->in_sync && mddev->recovery_cp == MaxSector) 1635 && (mddev->events & 1)) 1636 mddev->events--; 1637 else { 1638 /* otherwise we have to go forward and ... */ 1639 mddev->events ++; 1640 if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */ 1641 /* .. 
if the array isn't clean, insist on an odd 'events' */
			if ((mddev->events&1)==0) {
				mddev->events++;
				nospares = 0;
			}
		} else {
			/* otherwise insist on an even 'events' (for clean states) */
			if ((mddev->events&1)) {
				mddev->events++;
				nospares = 0;
			}
		}
	}

	if (!mddev->events) {
		/*
		 * oops, this 64-bit counter should never wrap.
		 * Either we are in around ~1 trillion A.C., assuming
		 * 1 reboot per second, or we have a bug:
		 */
		MD_BUG();
		mddev->events --;
	}
	sync_sbs(mddev, nospares);

	/*
	 * do not write anything to disk if using
	 * nonpersistent superblocks
	 */
	if (!mddev->persistent) {
		clear_bit(MD_CHANGE_PENDING, &mddev->flags);
		spin_unlock_irq(&mddev->write_lock);
		wake_up(&mddev->sb_wait);
		return;
	}
	spin_unlock_irq(&mddev->write_lock);

	dprintk(KERN_INFO
		"md: updating %s RAID superblock on device (in sync %d)\n",
		mdname(mddev),mddev->in_sync);

	err = bitmap_update_sb(mddev->bitmap);
	ITERATE_RDEV(mddev,rdev,tmp) {
		char b[BDEVNAME_SIZE];
		dprintk(KERN_INFO "md: ");
		if (rdev->sb_loaded != 1)
			continue; /* no noise on spare devices */
		if (test_bit(Faulty, &rdev->flags))
			dprintk("(skipping faulty ");

		dprintk("%s ", bdevname(rdev->bdev,b));
		if (!test_bit(Faulty, &rdev->flags)) {
			md_super_write(mddev,rdev,
				       rdev->sb_offset<<1, rdev->sb_size,
				       rdev->sb_page);
			dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
				bdevname(rdev->bdev,b),
				(unsigned long long)rdev->sb_offset);
			rdev->sb_events = mddev->events;

		} else
			dprintk(")\n");
		if (mddev->level == LEVEL_MULTIPATH)
			/* only need to write one superblock... */
			break;
	}
	md_super_wait(mddev);
	/* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */

	spin_lock_irq(&mddev->write_lock);
	if (mddev->in_sync != sync_req ||
	    test_bit(MD_CHANGE_DEVS, &mddev->flags)) {
		/* have to write it out again */
		spin_unlock_irq(&mddev->write_lock);
		goto repeat;
	}
	clear_bit(MD_CHANGE_PENDING, &mddev->flags);
	spin_unlock_irq(&mddev->write_lock);
	wake_up(&mddev->sb_wait);

}

/* words written to sysfs files may, or may not, be \n terminated.
 * We want to accept either case. For this we use cmd_match.
 */
static int cmd_match(const char *cmd, const char *str)
{
	/* See if cmd, written into a sysfs file, matches
	 * str.
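	 * (Illustrative examples: "faulty" and "faulty\n" both match the
	 * string "faulty", while "faulty2" does not.)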
	 * They must either be the same, or cmd can
	 * have a trailing newline
	 */
	while (*cmd && *str && *cmd == *str) {
		cmd++;
		str++;
	}
	if (*cmd == '\n')
		cmd++;
	if (*str || *cmd)
		return 0;
	return 1;
}

struct rdev_sysfs_entry {
	struct attribute attr;
	ssize_t (*show)(mdk_rdev_t *, char *);
	ssize_t (*store)(mdk_rdev_t *, const char *, size_t);
};

static ssize_t
state_show(mdk_rdev_t *rdev, char *page)
{
	char *sep = "";
	int len=0;

	if (test_bit(Faulty, &rdev->flags)) {
		len+= sprintf(page+len, "%sfaulty",sep);
		sep = ",";
	}
	if (test_bit(In_sync, &rdev->flags)) {
		len += sprintf(page+len, "%sin_sync",sep);
		sep = ",";
	}
	if (test_bit(WriteMostly, &rdev->flags)) {
		len += sprintf(page+len, "%swrite_mostly",sep);
		sep = ",";
	}
	if (!test_bit(Faulty, &rdev->flags) &&
	    !test_bit(In_sync, &rdev->flags)) {
		len += sprintf(page+len, "%sspare", sep);
		sep = ",";
	}
	return len+sprintf(page+len, "\n");
}

static ssize_t
state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
{
	/* can write
	 *  faulty - simulates an error
	 *  remove - disconnects the device
	 *  writemostly - sets write_mostly
	 *  -writemostly - clears write_mostly
	 */
	int err = -EINVAL;
	if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
		md_error(rdev->mddev, rdev);
		err = 0;
	} else if (cmd_match(buf, "remove")) {
		if (rdev->raid_disk >= 0)
			err = -EBUSY;
		else {
			mddev_t *mddev = rdev->mddev;
			kick_rdev_from_array(rdev);
			md_update_sb(mddev, 1);
			md_new_event(mddev);
			err = 0;
		}
	} else if (cmd_match(buf, "writemostly")) {
		set_bit(WriteMostly, &rdev->flags);
		err = 0;
	} else if (cmd_match(buf, "-writemostly")) {
		clear_bit(WriteMostly, &rdev->flags);
		err = 0;
	}
	return err ?
err : len; 1806} 1807static struct rdev_sysfs_entry rdev_state = 1808__ATTR(state, S_IRUGO|S_IWUSR, state_show, state_store); 1809 1810static ssize_t 1811super_show(mdk_rdev_t *rdev, char *page) 1812{ 1813 if (rdev->sb_loaded && rdev->sb_size) { 1814 memcpy(page, page_address(rdev->sb_page), rdev->sb_size); 1815 return rdev->sb_size; 1816 } else 1817 return 0; 1818} 1819static struct rdev_sysfs_entry rdev_super = __ATTR_RO(super); 1820 1821static ssize_t 1822errors_show(mdk_rdev_t *rdev, char *page) 1823{ 1824 return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors)); 1825} 1826 1827static ssize_t 1828errors_store(mdk_rdev_t *rdev, const char *buf, size_t len) 1829{ 1830 char *e; 1831 unsigned long n = simple_strtoul(buf, &e, 10); 1832 if (*buf && (*e == 0 || *e == '\n')) { 1833 atomic_set(&rdev->corrected_errors, n); 1834 return len; 1835 } 1836 return -EINVAL; 1837} 1838static struct rdev_sysfs_entry rdev_errors = 1839__ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store); 1840 1841static ssize_t 1842slot_show(mdk_rdev_t *rdev, char *page) 1843{ 1844 if (rdev->raid_disk < 0) 1845 return sprintf(page, "none\n"); 1846 else 1847 return sprintf(page, "%d\n", rdev->raid_disk); 1848} 1849 1850static ssize_t 1851slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) 1852{ 1853 char *e; 1854 int slot = simple_strtoul(buf, &e, 10); 1855 if (strncmp(buf, "none", 4)==0) 1856 slot = -1; 1857 else if (e==buf || (*e && *e!= '\n')) 1858 return -EINVAL; 1859 if (rdev->mddev->pers) 1860 /* Cannot set slot in active array (yet) */ 1861 return -EBUSY; 1862 if (slot >= rdev->mddev->raid_disks) 1863 return -ENOSPC; 1864 rdev->raid_disk = slot; 1865 /* assume it is working */ 1866 rdev->flags = 0; 1867 set_bit(In_sync, &rdev->flags); 1868 return len; 1869} 1870 1871 1872static struct rdev_sysfs_entry rdev_slot = 1873__ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store); 1874 1875static ssize_t 1876offset_show(mdk_rdev_t *rdev, char *page) 1877{ 1878 return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset); 1879} 1880 1881static ssize_t 1882offset_store(mdk_rdev_t *rdev, const char *buf, size_t len) 1883{ 1884 char *e; 1885 unsigned long long offset = simple_strtoull(buf, &e, 10); 1886 if (e==buf || (*e && *e != '\n')) 1887 return -EINVAL; 1888 if (rdev->mddev->pers) 1889 return -EBUSY; 1890 rdev->data_offset = offset; 1891 return len; 1892} 1893 1894static struct rdev_sysfs_entry rdev_offset = 1895__ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store); 1896 1897static ssize_t 1898rdev_size_show(mdk_rdev_t *rdev, char *page) 1899{ 1900 return sprintf(page, "%llu\n", (unsigned long long)rdev->size); 1901} 1902 1903static ssize_t 1904rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) 1905{ 1906 char *e; 1907 unsigned long long size = simple_strtoull(buf, &e, 10); 1908 if (e==buf || (*e && *e != '\n')) 1909 return -EINVAL; 1910 if (rdev->mddev->pers) 1911 return -EBUSY; 1912 rdev->size = size; 1913 if (size < rdev->mddev->size || rdev->mddev->size == 0) 1914 rdev->mddev->size = size; 1915 return len; 1916} 1917 1918static struct rdev_sysfs_entry rdev_size = 1919__ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store); 1920 1921static struct attribute *rdev_default_attrs[] = { 1922 &rdev_state.attr, 1923 &rdev_super.attr, 1924 &rdev_errors.attr, 1925 &rdev_slot.attr, 1926 &rdev_offset.attr, 1927 &rdev_size.attr, 1928 NULL, 1929}; 1930static ssize_t 1931rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 1932{ 1933 struct rdev_sysfs_entry *entry 
= container_of(attr, struct rdev_sysfs_entry, attr); 1934 mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj); 1935 1936 if (!entry->show) 1937 return -EIO; 1938 return entry->show(rdev, page); 1939} 1940 1941static ssize_t 1942rdev_attr_store(struct kobject *kobj, struct attribute *attr, 1943 const char *page, size_t length) 1944{ 1945 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 1946 mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj); 1947 1948 if (!entry->store) 1949 return -EIO; 1950 if (!capable(CAP_SYS_ADMIN)) 1951 return -EACCES; 1952 return entry->store(rdev, page, length); 1953} 1954 1955static void rdev_free(struct kobject *ko) 1956{ 1957 mdk_rdev_t *rdev = container_of(ko, mdk_rdev_t, kobj); 1958 kfree(rdev); 1959} 1960static struct sysfs_ops rdev_sysfs_ops = { 1961 .show = rdev_attr_show, 1962 .store = rdev_attr_store, 1963}; 1964static struct kobj_type rdev_ktype = { 1965 .release = rdev_free, 1966 .sysfs_ops = &rdev_sysfs_ops, 1967 .default_attrs = rdev_default_attrs, 1968}; 1969 1970/* 1971 * Import a device. If 'super_format' >= 0, then sanity check the superblock 1972 * 1973 * mark the device faulty if: 1974 * 1975 * - the device is nonexistent (zero size) 1976 * - the device has no valid superblock 1977 * 1978 * a faulty rdev _never_ has rdev->sb set. 1979 */ 1980static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor) 1981{ 1982 char b[BDEVNAME_SIZE]; 1983 int err; 1984 mdk_rdev_t *rdev; 1985 sector_t size; 1986 1987 rdev = kzalloc(sizeof(*rdev), GFP_KERNEL); 1988 if (!rdev) { 1989 printk(KERN_ERR "md: could not alloc mem for new device!\n"); 1990 return ERR_PTR(-ENOMEM); 1991 } 1992 1993 if ((err = alloc_disk_sb(rdev))) 1994 goto abort_free; 1995 1996 err = lock_rdev(rdev, newdev); 1997 if (err) 1998 goto abort_free; 1999 2000 rdev->kobj.parent = NULL; 2001 rdev->kobj.ktype = &rdev_ktype; 2002 kobject_init(&rdev->kobj); 2003 2004 rdev->desc_nr = -1; 2005 rdev->flags = 0; 2006 rdev->data_offset = 0; 2007 rdev->sb_events = 0; 2008 atomic_set(&rdev->nr_pending, 0); 2009 atomic_set(&rdev->read_errors, 0); 2010 atomic_set(&rdev->corrected_errors, 0); 2011 2012 size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 2013 if (!size) { 2014 printk(KERN_WARNING 2015 "md: %s has zero or unknown size, marking faulty!\n", 2016 bdevname(rdev->bdev,b)); 2017 err = -EINVAL; 2018 goto abort_free; 2019 } 2020 2021 if (super_format >= 0) { 2022 err = super_types[super_format]. 2023 load_super(rdev, NULL, super_minor); 2024 if (err == -EINVAL) { 2025 printk(KERN_WARNING 2026 "md: %s has invalid sb, not importing!\n", 2027 bdevname(rdev->bdev,b)); 2028 goto abort_free; 2029 } 2030 if (err < 0) { 2031 printk(KERN_WARNING 2032 "md: could not read %s's sb, not importing!\n", 2033 bdevname(rdev->bdev,b)); 2034 goto abort_free; 2035 } 2036 } 2037 INIT_LIST_HEAD(&rdev->same_set); 2038 2039 return rdev; 2040 2041abort_free: 2042 if (rdev->sb_page) { 2043 if (rdev->bdev) 2044 unlock_rdev(rdev); 2045 free_disk_sb(rdev); 2046 } 2047 kfree(rdev); 2048 return ERR_PTR(err); 2049} 2050 2051/* 2052 * Check a full RAID array for plausibility 2053 */ 2054 2055 2056static void analyze_sbs(mddev_t * mddev) 2057{ 2058 int i; 2059 struct list_head *tmp; 2060 mdk_rdev_t *rdev, *freshest; 2061 char b[BDEVNAME_SIZE]; 2062 2063 freshest = NULL; 2064 ITERATE_RDEV(mddev,rdev,tmp) 2065 switch (super_types[mddev->major_version]. 
2066 load_super(rdev, freshest, mddev->minor_version)) { 2067 case 1: 2068 freshest = rdev; 2069 break; 2070 case 0: 2071 break; 2072 default: 2073 printk( KERN_ERR \ 2074 "md: fatal superblock inconsistency in %s" 2075 " -- removing from array\n", 2076 bdevname(rdev->bdev,b)); 2077 kick_rdev_from_array(rdev); 2078 } 2079 2080 2081 super_types[mddev->major_version]. 2082 validate_super(mddev, freshest); 2083 2084 i = 0; 2085 ITERATE_RDEV(mddev,rdev,tmp) { 2086 if (rdev != freshest) 2087 if (super_types[mddev->major_version]. 2088 validate_super(mddev, rdev)) { 2089 printk(KERN_WARNING "md: kicking non-fresh %s" 2090 " from array!\n", 2091 bdevname(rdev->bdev,b)); 2092 kick_rdev_from_array(rdev); 2093 continue; 2094 } 2095 if (mddev->level == LEVEL_MULTIPATH) { 2096 rdev->desc_nr = i++; 2097 rdev->raid_disk = rdev->desc_nr; 2098 set_bit(In_sync, &rdev->flags); 2099 } 2100 } 2101 2102 2103 2104 if (mddev->recovery_cp != MaxSector && 2105 mddev->level >= 1) 2106 printk(KERN_ERR "md: %s: raid array is not clean" 2107 " -- starting background reconstruction\n", 2108 mdname(mddev)); 2109 2110} 2111 2112static ssize_t 2113safe_delay_show(mddev_t *mddev, char *page) 2114{ 2115 int msec = (mddev->safemode_delay*1000)/HZ; 2116 return sprintf(page, "%d.%03d\n", msec/1000, msec%1000); 2117} 2118static ssize_t 2119safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len) 2120{ 2121 int scale=1; 2122 int dot=0; 2123 int i; 2124 unsigned long msec; 2125 char buf[30]; 2126 char *e; 2127 /* remove a period, and count digits after it */ 2128 if (len >= sizeof(buf)) 2129 return -EINVAL; 2130 strlcpy(buf, cbuf, len); 2131 buf[len] = 0; 2132 for (i=0; i<len; i++) { 2133 if (dot) { 2134 if (isdigit(buf[i])) { 2135 buf[i-1] = buf[i]; 2136 scale *= 10; 2137 } 2138 buf[i] = 0; 2139 } else if (buf[i] == '.') { 2140 dot=1; 2141 buf[i] = 0; 2142 } 2143 } 2144 msec = simple_strtoul(buf, &e, 10); 2145 if (e == buf || (*e && *e != '\n')) 2146 return -EINVAL; 2147 msec = (msec * 1000) / scale; 2148 if (msec == 0) 2149 mddev->safemode_delay = 0; 2150 else { 2151 mddev->safemode_delay = (msec*HZ)/1000; 2152 if (mddev->safemode_delay == 0) 2153 mddev->safemode_delay = 1; 2154 } 2155 return len; 2156} 2157static struct md_sysfs_entry md_safe_delay = 2158__ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store); 2159 2160static ssize_t 2161level_show(mddev_t *mddev, char *page) 2162{ 2163 struct mdk_personality *p = mddev->pers; 2164 if (p) 2165 return sprintf(page, "%s\n", p->name); 2166 else if (mddev->clevel[0]) 2167 return sprintf(page, "%s\n", mddev->clevel); 2168 else if (mddev->level != LEVEL_NONE) 2169 return sprintf(page, "%d\n", mddev->level); 2170 else 2171 return 0; 2172} 2173 2174static ssize_t 2175level_store(mddev_t *mddev, const char *buf, size_t len) 2176{ 2177 int rv = len; 2178 if (mddev->pers) 2179 return -EBUSY; 2180 if (len == 0) 2181 return 0; 2182 if (len >= sizeof(mddev->clevel)) 2183 return -ENOSPC; 2184 strncpy(mddev->clevel, buf, len); 2185 if (mddev->clevel[len-1] == '\n') 2186 len--; 2187 mddev->clevel[len] = 0; 2188 mddev->level = LEVEL_NONE; 2189 return rv; 2190} 2191 2192static struct md_sysfs_entry md_level = 2193__ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store); 2194 2195 2196static ssize_t 2197layout_show(mddev_t *mddev, char *page) 2198{ 2199 /* just a number, not meaningful for all levels */ 2200 return sprintf(page, "%d\n", mddev->layout); 2201} 2202 2203static ssize_t 2204layout_store(mddev_t *mddev, const char *buf, size_t len) 2205{ 2206 char *e; 2207 
unsigned long n = simple_strtoul(buf, &e, 10); 2208 if (mddev->pers) 2209 return -EBUSY; 2210 2211 if (!*buf || (*e && *e != '\n')) 2212 return -EINVAL; 2213 2214 mddev->layout = n; 2215 return len; 2216} 2217static struct md_sysfs_entry md_layout = 2218__ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store); 2219 2220 2221static ssize_t 2222raid_disks_show(mddev_t *mddev, char *page) 2223{ 2224 if (mddev->raid_disks == 0) 2225 return 0; 2226 return sprintf(page, "%d\n", mddev->raid_disks); 2227} 2228 2229static int update_raid_disks(mddev_t *mddev, int raid_disks); 2230 2231static ssize_t 2232raid_disks_store(mddev_t *mddev, const char *buf, size_t len) 2233{ 2234 /* can only set raid_disks if array is not yet active */ 2235 char *e; 2236 int rv = 0; 2237 unsigned long n = simple_strtoul(buf, &e, 10); 2238 2239 if (!*buf || (*e && *e != '\n')) 2240 return -EINVAL; 2241 2242 if (mddev->pers) 2243 rv = update_raid_disks(mddev, n); 2244 else 2245 mddev->raid_disks = n; 2246 return rv ? rv : len; 2247} 2248static struct md_sysfs_entry md_raid_disks = 2249__ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store); 2250 2251static ssize_t 2252chunk_size_show(mddev_t *mddev, char *page) 2253{ 2254 return sprintf(page, "%d\n", mddev->chunk_size); 2255} 2256 2257static ssize_t 2258chunk_size_store(mddev_t *mddev, const char *buf, size_t len) 2259{ 2260 /* can only set chunk_size if array is not yet active */ 2261 char *e; 2262 unsigned long n = simple_strtoul(buf, &e, 10); 2263 2264 if (mddev->pers) 2265 return -EBUSY; 2266 if (!*buf || (*e && *e != '\n')) 2267 return -EINVAL; 2268 2269 mddev->chunk_size = n; 2270 return len; 2271} 2272static struct md_sysfs_entry md_chunk_size = 2273__ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store); 2274 2275static ssize_t 2276resync_start_show(mddev_t *mddev, char *page) 2277{ 2278 return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp); 2279} 2280 2281static ssize_t 2282resync_start_store(mddev_t *mddev, const char *buf, size_t len) 2283{ 2284 /* can only set chunk_size if array is not yet active */ 2285 char *e; 2286 unsigned long long n = simple_strtoull(buf, &e, 10); 2287 2288 if (mddev->pers) 2289 return -EBUSY; 2290 if (!*buf || (*e && *e != '\n')) 2291 return -EINVAL; 2292 2293 mddev->recovery_cp = n; 2294 return len; 2295} 2296static struct md_sysfs_entry md_resync_start = 2297__ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store); 2298 2299/* 2300 * The array state can be: 2301 * 2302 * clear 2303 * No devices, no size, no level 2304 * Equivalent to STOP_ARRAY ioctl 2305 * inactive 2306 * May have some settings, but array is not active 2307 * all IO results in error 2308 * When written, doesn't tear down array, but just stops it 2309 * suspended (not supported yet) 2310 * All IO requests will block. The array can be reconfigured. 2311 * Writing this, if accepted, will block until array is quiessent 2312 * readonly 2313 * no resync can happen. no superblocks get written. 2314 * write requests fail 2315 * read-auto 2316 * like readonly, but behaves like 'clean' on a write request. 2317 * 2318 * clean - no pending writes, but otherwise active. 2319 * When written to inactive array, starts without resync 2320 * If a write request arrives then 2321 * if metadata is known, mark 'dirty' and switch to 'active'. 2322 * if not known, block and switch to write-pending 2323 * If written to an active array that has pending writes, then fails. 
2324 * active 2325 * fully active: IO and resync can be happening. 2326 * When written to inactive array, starts with resync 2327 * 2328 * write-pending 2329 * clean, but writes are blocked waiting for 'active' to be written. 2330 * 2331 * active-idle 2332 * like active, but no writes have been seen for a while (100msec). 2333 * 2334 */ 2335enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active, 2336 write_pending, active_idle, bad_word}; 2337static char *array_states[] = { 2338 "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active", 2339 "write-pending", "active-idle", NULL }; 2340 2341static int match_word(const char *word, char **list) 2342{ 2343 int n; 2344 for (n=0; list[n]; n++) 2345 if (cmd_match(word, list[n])) 2346 break; 2347 return n; 2348} 2349 2350static ssize_t 2351array_state_show(mddev_t *mddev, char *page) 2352{ 2353 enum array_state st = inactive; 2354 2355 if (mddev->pers) 2356 switch(mddev->ro) { 2357 case 1: 2358 st = readonly; 2359 break; 2360 case 2: 2361 st = read_auto; 2362 break; 2363 case 0: 2364 if (mddev->in_sync) 2365 st = clean; 2366 else if (mddev->safemode) 2367 st = active_idle; 2368 else 2369 st = active; 2370 } 2371 else { 2372 if (list_empty(&mddev->disks) && 2373 mddev->raid_disks == 0 && 2374 mddev->size == 0) 2375 st = clear; 2376 else 2377 st = inactive; 2378 } 2379 return sprintf(page, "%s\n", array_states[st]); 2380} 2381 2382static int do_md_stop(mddev_t * mddev, int ro); 2383static int do_md_run(mddev_t * mddev); 2384static int restart_array(mddev_t *mddev); 2385 2386static ssize_t 2387array_state_store(mddev_t *mddev, const char *buf, size_t len) 2388{ 2389 int err = -EINVAL; 2390 enum array_state st = match_word(buf, array_states); 2391 switch(st) { 2392 case bad_word: 2393 break; 2394 case clear: 2395 /* stopping an active array */ 2396 if (mddev->pers) { 2397 if (atomic_read(&mddev->active) > 1) 2398 return -EBUSY; 2399 err = do_md_stop(mddev, 0); 2400 } 2401 break; 2402 case inactive: 2403 /* stopping an active array */ 2404 if (mddev->pers) { 2405 if (atomic_read(&mddev->active) > 1) 2406 return -EBUSY; 2407 err = do_md_stop(mddev, 2); 2408 } 2409 break; 2410 case suspended: 2411 break; /* not supported yet */ 2412 case readonly: 2413 if (mddev->pers) 2414 err = do_md_stop(mddev, 1); 2415 else { 2416 mddev->ro = 1; 2417 err = do_md_run(mddev); 2418 } 2419 break; 2420 case read_auto: 2421 /* stopping an active array */ 2422 if (mddev->pers) { 2423 err = do_md_stop(mddev, 1); 2424 if (err == 0) 2425 mddev->ro = 2; /* FIXME mark devices writable */ 2426 } else { 2427 mddev->ro = 2; 2428 err = do_md_run(mddev); 2429 } 2430 break; 2431 case clean: 2432 if (mddev->pers) { 2433 restart_array(mddev); 2434 spin_lock_irq(&mddev->write_lock); 2435 if (atomic_read(&mddev->writes_pending) == 0) { 2436 mddev->in_sync = 1; 2437 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 2438 } 2439 spin_unlock_irq(&mddev->write_lock); 2440 } else { 2441 mddev->ro = 0; 2442 mddev->recovery_cp = MaxSector; 2443 err = do_md_run(mddev); 2444 } 2445 break; 2446 case active: 2447 if (mddev->pers) { 2448 restart_array(mddev); 2449 clear_bit(MD_CHANGE_CLEAN, &mddev->flags); 2450 wake_up(&mddev->sb_wait); 2451 err = 0; 2452 } else { 2453 mddev->ro = 0; 2454 err = do_md_run(mddev); 2455 } 2456 break; 2457 case write_pending: 2458 case active_idle: 2459 /* these cannot be set */ 2460 break; 2461 } 2462 if (err) 2463 return err; 2464 else 2465 return len; 2466} 2467static struct md_sysfs_entry md_array_state = 2468__ATTR(array_state, 
S_IRUGO|S_IWUSR, array_state_show, array_state_store); 2469 2470static ssize_t 2471null_show(mddev_t *mddev, char *page) 2472{ 2473 return -EINVAL; 2474} 2475 2476static ssize_t 2477new_dev_store(mddev_t *mddev, const char *buf, size_t len) 2478{ 2479 /* buf must be %d:%d\n? giving major and minor numbers */ 2480 /* The new device is added to the array. 2481 * If the array has a persistent superblock, we read the 2482 * superblock to initialise info and check validity. 2483 * Otherwise, only checking done is that in bind_rdev_to_array, 2484 * which mainly checks size. 2485 */ 2486 char *e; 2487 int major = simple_strtoul(buf, &e, 10); 2488 int minor; 2489 dev_t dev; 2490 mdk_rdev_t *rdev; 2491 int err; 2492 2493 if (!*buf || *e != ':' || !e[1] || e[1] == '\n') 2494 return -EINVAL; 2495 minor = simple_strtoul(e+1, &e, 10); 2496 if (*e && *e != '\n') 2497 return -EINVAL; 2498 dev = MKDEV(major, minor); 2499 if (major != MAJOR(dev) || 2500 minor != MINOR(dev)) 2501 return -EOVERFLOW; 2502 2503 2504 if (mddev->persistent) { 2505 rdev = md_import_device(dev, mddev->major_version, 2506 mddev->minor_version); 2507 if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) { 2508 mdk_rdev_t *rdev0 = list_entry(mddev->disks.next, 2509 mdk_rdev_t, same_set); 2510 err = super_types[mddev->major_version] 2511 .load_super(rdev, rdev0, mddev->minor_version); 2512 if (err < 0) 2513 goto out; 2514 } 2515 } else 2516 rdev = md_import_device(dev, -1, -1); 2517 2518 if (IS_ERR(rdev)) 2519 return PTR_ERR(rdev); 2520 err = bind_rdev_to_array(rdev, mddev); 2521 out: 2522 if (err) 2523 export_rdev(rdev); 2524 return err ? err : len; 2525} 2526 2527static struct md_sysfs_entry md_new_device = 2528__ATTR(new_dev, S_IWUSR, null_show, new_dev_store); 2529 2530static ssize_t 2531bitmap_store(mddev_t *mddev, const char *buf, size_t len) 2532{ 2533 char *end; 2534 unsigned long chunk, end_chunk; 2535 2536 if (!mddev->bitmap) 2537 goto out; 2538 /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */ 2539 while (*buf) { 2540 chunk = end_chunk = simple_strtoul(buf, &end, 0); 2541 if (buf == end) break; 2542 if (*end == '-') { /* range */ 2543 buf = end + 1; 2544 end_chunk = simple_strtoul(buf, &end, 0); 2545 if (buf == end) break; 2546 } 2547 if (*end && !isspace(*end)) break; 2548 bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk); 2549 buf = end; 2550 while (isspace(*buf)) buf++; 2551 } 2552 bitmap_unplug(mddev->bitmap); /* flush the bits to disk */ 2553out: 2554 return len; 2555} 2556 2557static struct md_sysfs_entry md_bitmap = 2558__ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store); 2559 2560static ssize_t 2561size_show(mddev_t *mddev, char *page) 2562{ 2563 return sprintf(page, "%llu\n", (unsigned long long)mddev->size); 2564} 2565 2566static int update_size(mddev_t *mddev, unsigned long size); 2567 2568static ssize_t 2569size_store(mddev_t *mddev, const char *buf, size_t len) 2570{ 2571 /* If array is inactive, we can reduce the component size, but 2572 * not increase it (except from 0). 2573 * If array is active, we can try an on-line resize 2574 */ 2575 char *e; 2576 int err = 0; 2577 unsigned long long size = simple_strtoull(buf, &e, 10); 2578 if (!*buf || *buf == '\n' || 2579 (*e && *e != '\n')) 2580 return -EINVAL; 2581 2582 if (mddev->pers) { 2583 err = update_size(mddev, size); 2584 md_update_sb(mddev, 1); 2585 } else { 2586 if (mddev->size == 0 || 2587 mddev->size > size) 2588 mddev->size = size; 2589 else 2590 err = -ENOSPC; 2591 } 2592 return err ? 
err : len; 2593} 2594 2595static struct md_sysfs_entry md_size = 2596__ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store); 2597 2598 2599/* Metdata version. 2600 * This is either 'none' for arrays with externally managed metadata, 2601 * or N.M for internally known formats 2602 */ 2603static ssize_t 2604metadata_show(mddev_t *mddev, char *page) 2605{ 2606 if (mddev->persistent) 2607 return sprintf(page, "%d.%d\n", 2608 mddev->major_version, mddev->minor_version); 2609 else 2610 return sprintf(page, "none\n"); 2611} 2612 2613static ssize_t 2614metadata_store(mddev_t *mddev, const char *buf, size_t len) 2615{ 2616 int major, minor; 2617 char *e; 2618 if (!list_empty(&mddev->disks)) 2619 return -EBUSY; 2620 2621 if (cmd_match(buf, "none")) { 2622 mddev->persistent = 0; 2623 mddev->major_version = 0; 2624 mddev->minor_version = 90; 2625 return len; 2626 } 2627 major = simple_strtoul(buf, &e, 10); 2628 if (e==buf || *e != '.') 2629 return -EINVAL; 2630 buf = e+1; 2631 minor = simple_strtoul(buf, &e, 10); 2632 if (e==buf || *e != '\n') 2633 return -EINVAL; 2634 if (major >= sizeof(super_types)/sizeof(super_types[0]) || 2635 super_types[major].name == NULL) 2636 return -ENOENT; 2637 mddev->major_version = major; 2638 mddev->minor_version = minor; 2639 mddev->persistent = 1; 2640 return len; 2641} 2642 2643static struct md_sysfs_entry md_metadata = 2644__ATTR(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store); 2645 2646static ssize_t 2647action_show(mddev_t *mddev, char *page) 2648{ 2649 char *type = "idle"; 2650 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 2651 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) { 2652 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 2653 type = "reshape"; 2654 else if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 2655 if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 2656 type = "resync"; 2657 else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) 2658 type = "check"; 2659 else 2660 type = "repair"; 2661 } else 2662 type = "recover"; 2663 } 2664 return sprintf(page, "%s\n", type); 2665} 2666 2667static ssize_t 2668action_store(mddev_t *mddev, const char *page, size_t len) 2669{ 2670 if (!mddev->pers || !mddev->pers->sync_request) 2671 return -EINVAL; 2672 2673 if (cmd_match(page, "idle")) { 2674 if (mddev->sync_thread) { 2675 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 2676 md_unregister_thread(mddev->sync_thread); 2677 mddev->sync_thread = NULL; 2678 mddev->recovery = 0; 2679 } 2680 } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 2681 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) 2682 return -EBUSY; 2683 else if (cmd_match(page, "resync") || cmd_match(page, "recover")) 2684 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2685 else if (cmd_match(page, "reshape")) { 2686 int err; 2687 if (mddev->pers->start_reshape == NULL) 2688 return -EINVAL; 2689 err = mddev->pers->start_reshape(mddev); 2690 if (err) 2691 return err; 2692 } else { 2693 if (cmd_match(page, "check")) 2694 set_bit(MD_RECOVERY_CHECK, &mddev->recovery); 2695 else if (!cmd_match(page, "repair")) 2696 return -EINVAL; 2697 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 2698 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 2699 } 2700 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2701 md_wakeup_thread(mddev->thread); 2702 return len; 2703} 2704 2705static ssize_t 2706mismatch_cnt_show(mddev_t *mddev, char *page) 2707{ 2708 return sprintf(page, "%llu\n", 2709 (unsigned long long) mddev->resync_mismatches); 2710} 2711 2712static struct 
md_sysfs_entry md_scan_mode = 2713__ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); 2714 2715 2716static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt); 2717 2718static ssize_t 2719sync_min_show(mddev_t *mddev, char *page) 2720{ 2721 return sprintf(page, "%d (%s)\n", speed_min(mddev), 2722 mddev->sync_speed_min ? "local": "system"); 2723} 2724 2725static ssize_t 2726sync_min_store(mddev_t *mddev, const char *buf, size_t len) 2727{ 2728 int min; 2729 char *e; 2730 if (strncmp(buf, "system", 6)==0) { 2731 mddev->sync_speed_min = 0; 2732 return len; 2733 } 2734 min = simple_strtoul(buf, &e, 10); 2735 if (buf == e || (*e && *e != '\n') || min <= 0) 2736 return -EINVAL; 2737 mddev->sync_speed_min = min; 2738 return len; 2739} 2740 2741static struct md_sysfs_entry md_sync_min = 2742__ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store); 2743 2744static ssize_t 2745sync_max_show(mddev_t *mddev, char *page) 2746{ 2747 return sprintf(page, "%d (%s)\n", speed_max(mddev), 2748 mddev->sync_speed_max ? "local": "system"); 2749} 2750 2751static ssize_t 2752sync_max_store(mddev_t *mddev, const char *buf, size_t len) 2753{ 2754 int max; 2755 char *e; 2756 if (strncmp(buf, "system", 6)==0) { 2757 mddev->sync_speed_max = 0; 2758 return len; 2759 } 2760 max = simple_strtoul(buf, &e, 10); 2761 if (buf == e || (*e && *e != '\n') || max <= 0) 2762 return -EINVAL; 2763 mddev->sync_speed_max = max; 2764 return len; 2765} 2766 2767static struct md_sysfs_entry md_sync_max = 2768__ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store); 2769 2770 2771static ssize_t 2772sync_speed_show(mddev_t *mddev, char *page) 2773{ 2774 unsigned long resync, dt, db; 2775 resync = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active)); 2776 dt = ((jiffies - mddev->resync_mark) / HZ); 2777 if (!dt) dt++; 2778 db = resync - (mddev->resync_mark_cnt); 2779 return sprintf(page, "%ld\n", db/dt/2); /* K/sec */ 2780} 2781 2782static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed); 2783 2784static ssize_t 2785sync_completed_show(mddev_t *mddev, char *page) 2786{ 2787 unsigned long max_blocks, resync; 2788 2789 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 2790 max_blocks = mddev->resync_max_sectors; 2791 else 2792 max_blocks = mddev->size << 1; 2793 2794 resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active)); 2795 return sprintf(page, "%lu / %lu\n", resync, max_blocks); 2796} 2797 2798static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed); 2799 2800static ssize_t 2801suspend_lo_show(mddev_t *mddev, char *page) 2802{ 2803 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo); 2804} 2805 2806static ssize_t 2807suspend_lo_store(mddev_t *mddev, const char *buf, size_t len) 2808{ 2809 char *e; 2810 unsigned long long new = simple_strtoull(buf, &e, 10); 2811 2812 if (mddev->pers->quiesce == NULL) 2813 return -EINVAL; 2814 if (buf == e || (*e && *e != '\n')) 2815 return -EINVAL; 2816 if (new >= mddev->suspend_hi || 2817 (new > mddev->suspend_lo && new < mddev->suspend_hi)) { 2818 mddev->suspend_lo = new; 2819 mddev->pers->quiesce(mddev, 2); 2820 return len; 2821 } else 2822 return -EINVAL; 2823} 2824static struct md_sysfs_entry md_suspend_lo = 2825__ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); 2826 2827 2828static ssize_t 2829suspend_hi_show(mddev_t *mddev, char *page) 2830{ 2831 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi); 2832} 2833 2834static ssize_t 
2835suspend_hi_store(mddev_t *mddev, const char *buf, size_t len) 2836{ 2837 char *e; 2838 unsigned long long new = simple_strtoull(buf, &e, 10); 2839 2840 if (mddev->pers->quiesce == NULL) 2841 return -EINVAL; 2842 if (buf == e || (*e && *e != '\n')) 2843 return -EINVAL; 2844 if ((new <= mddev->suspend_lo && mddev->suspend_lo >= mddev->suspend_hi) || 2845 (new > mddev->suspend_lo && new > mddev->suspend_hi)) { 2846 mddev->suspend_hi = new; 2847 mddev->pers->quiesce(mddev, 1); 2848 mddev->pers->quiesce(mddev, 0); 2849 return len; 2850 } else 2851 return -EINVAL; 2852} 2853static struct md_sysfs_entry md_suspend_hi = 2854__ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); 2855 2856 2857static struct attribute *md_default_attrs[] = { 2858 &md_level.attr, 2859 &md_layout.attr, 2860 &md_raid_disks.attr, 2861 &md_chunk_size.attr, 2862 &md_size.attr, 2863 &md_resync_start.attr, 2864 &md_metadata.attr, 2865 &md_new_device.attr, 2866 &md_safe_delay.attr, 2867 &md_array_state.attr, 2868 NULL, 2869}; 2870 2871static struct attribute *md_redundancy_attrs[] = { 2872 &md_scan_mode.attr, 2873 &md_mismatches.attr, 2874 &md_sync_min.attr, 2875 &md_sync_max.attr, 2876 &md_sync_speed.attr, 2877 &md_sync_completed.attr, 2878 &md_suspend_lo.attr, 2879 &md_suspend_hi.attr, 2880 &md_bitmap.attr, 2881 NULL, 2882}; 2883static struct attribute_group md_redundancy_group = { 2884 .name = NULL, 2885 .attrs = md_redundancy_attrs, 2886}; 2887 2888 2889static ssize_t 2890md_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 2891{ 2892 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 2893 mddev_t *mddev = container_of(kobj, struct mddev_s, kobj); 2894 ssize_t rv; 2895 2896 if (!entry->show) 2897 return -EIO; 2898 rv = mddev_lock(mddev); 2899 if (!rv) { 2900 rv = entry->show(mddev, page); 2901 mddev_unlock(mddev); 2902 } 2903 return rv; 2904} 2905 2906static ssize_t 2907md_attr_store(struct kobject *kobj, struct attribute *attr, 2908 const char *page, size_t length) 2909{ 2910 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 2911 mddev_t *mddev = container_of(kobj, struct mddev_s, kobj); 2912 ssize_t rv; 2913 2914 if (!entry->store) 2915 return -EIO; 2916 if (!capable(CAP_SYS_ADMIN)) 2917 return -EACCES; 2918 rv = mddev_lock(mddev); 2919 if (!rv) { 2920 rv = entry->store(mddev, page, length); 2921 mddev_unlock(mddev); 2922 } 2923 return rv; 2924} 2925 2926static void md_free(struct kobject *ko) 2927{ 2928 mddev_t *mddev = container_of(ko, mddev_t, kobj); 2929 kfree(mddev); 2930} 2931 2932static struct sysfs_ops md_sysfs_ops = { 2933 .show = md_attr_show, 2934 .store = md_attr_store, 2935}; 2936static struct kobj_type md_ktype = { 2937 .release = md_free, 2938 .sysfs_ops = &md_sysfs_ops, 2939 .default_attrs = md_default_attrs, 2940}; 2941 2942int mdp_major = 0; 2943 2944static struct kobject *md_probe(dev_t dev, int *part, void *data) 2945{ 2946 static DEFINE_MUTEX(disks_mutex); 2947 mddev_t *mddev = mddev_find(dev); 2948 struct gendisk *disk; 2949 int partitioned = (MAJOR(dev) != MD_MAJOR); 2950 int shift = partitioned ? 
MdpMinorShift : 0; 2951 int unit = MINOR(dev) >> shift; 2952 2953 if (!mddev) 2954 return NULL; 2955 2956 mutex_lock(&disks_mutex); 2957 if (mddev->gendisk) { 2958 mutex_unlock(&disks_mutex); 2959 mddev_put(mddev); 2960 return NULL; 2961 } 2962 disk = alloc_disk(1 << shift); 2963 if (!disk) { 2964 mutex_unlock(&disks_mutex); 2965 mddev_put(mddev); 2966 return NULL; 2967 } 2968 disk->major = MAJOR(dev); 2969 disk->first_minor = unit << shift; 2970 if (partitioned) 2971 sprintf(disk->disk_name, "md_d%d", unit); 2972 else 2973 sprintf(disk->disk_name, "md%d", unit); 2974 disk->fops = &md_fops; 2975 disk->private_data = mddev; 2976 disk->queue = mddev->queue; 2977 add_disk(disk); 2978 mddev->gendisk = disk; 2979 mutex_unlock(&disks_mutex); 2980 mddev->kobj.parent = &disk->kobj; 2981 mddev->kobj.k_name = NULL; 2982 snprintf(mddev->kobj.name, KOBJ_NAME_LEN, "%s", "md"); 2983 mddev->kobj.ktype = &md_ktype; 2984 kobject_register(&mddev->kobj); 2985 return NULL; 2986} 2987 2988static void md_safemode_timeout(unsigned long data) 2989{ 2990 mddev_t *mddev = (mddev_t *) data; 2991 2992 mddev->safemode = 1; 2993 md_wakeup_thread(mddev->thread); 2994} 2995 2996static int start_dirty_degraded; 2997 2998static int do_md_run(mddev_t * mddev) 2999{ 3000 int err; 3001 int chunk_size; 3002 struct list_head *tmp; 3003 mdk_rdev_t *rdev; 3004 struct gendisk *disk; 3005 struct mdk_personality *pers; 3006 char b[BDEVNAME_SIZE]; 3007 3008 if (list_empty(&mddev->disks)) 3009 /* cannot run an array with no devices.. */ 3010 return -EINVAL; 3011 3012 if (mddev->pers) 3013 return -EBUSY; 3014 3015 /* 3016 * Analyze all RAID superblock(s) 3017 */ 3018 if (!mddev->raid_disks) 3019 analyze_sbs(mddev); 3020 3021 chunk_size = mddev->chunk_size; 3022 3023 if (chunk_size) { 3024 if (chunk_size > MAX_CHUNK_SIZE) { 3025 printk(KERN_ERR "too big chunk_size: %d > %d\n", 3026 chunk_size, MAX_CHUNK_SIZE); 3027 return -EINVAL; 3028 } 3029 /* 3030 * chunk-size has to be a power of 2 and multiples of PAGE_SIZE 3031 */ 3032 if ( (1 << ffz(~chunk_size)) != chunk_size) { 3033 printk(KERN_ERR "chunk_size of %d not valid\n", chunk_size); 3034 return -EINVAL; 3035 } 3036 if (chunk_size < PAGE_SIZE) { 3037 printk(KERN_ERR "too small chunk_size: %d < %ld\n", 3038 chunk_size, PAGE_SIZE); 3039 return -EINVAL; 3040 } 3041 3042 /* devices must have minimum size of one chunk */ 3043 ITERATE_RDEV(mddev,rdev,tmp) { 3044 if (test_bit(Faulty, &rdev->flags)) 3045 continue; 3046 if (rdev->size < chunk_size / 1024) { 3047 printk(KERN_WARNING 3048 "md: Dev %s smaller than chunk_size:" 3049 " %lluk < %dk\n", 3050 bdevname(rdev->bdev,b), 3051 (unsigned long long)rdev->size, 3052 chunk_size / 1024); 3053 return -EINVAL; 3054 } 3055 } 3056 } 3057 3058#ifdef CONFIG_KMOD 3059 if (mddev->level != LEVEL_NONE) 3060 request_module("md-level-%d", mddev->level); 3061 else if (mddev->clevel[0]) 3062 request_module("md-%s", mddev->clevel); 3063#endif 3064 3065 /* 3066 * Drop all container device buffers, from now on 3067 * the only valid external interface is through the md 3068 * device. 
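	 * (The loop below calls sync_blockdev() to flush any dirty pages that
	 * were written through the member block devices, then invalidate_bdev()
	 * to drop the now-stale cached copies from the page cache.)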
3069 * Also find largest hardsector size 3070 */ 3071 ITERATE_RDEV(mddev,rdev,tmp) { 3072 if (test_bit(Faulty, &rdev->flags)) 3073 continue; 3074 sync_blockdev(rdev->bdev); 3075 invalidate_bdev(rdev->bdev, 0); 3076 } 3077 3078 md_probe(mddev->unit, NULL, NULL); 3079 disk = mddev->gendisk; 3080 if (!disk) 3081 return -ENOMEM; 3082 3083 spin_lock(&pers_lock); 3084 pers = find_pers(mddev->level, mddev->clevel); 3085 if (!pers || !try_module_get(pers->owner)) { 3086 spin_unlock(&pers_lock); 3087 if (mddev->level != LEVEL_NONE) 3088 printk(KERN_WARNING "md: personality for level %d is not loaded!\n", 3089 mddev->level); 3090 else 3091 printk(KERN_WARNING "md: personality for level %s is not loaded!\n", 3092 mddev->clevel); 3093 return -EINVAL; 3094 } 3095 mddev->pers = pers; 3096 spin_unlock(&pers_lock); 3097 mddev->level = pers->level; 3098 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); 3099 3100 if (mddev->reshape_position != MaxSector && 3101 pers->start_reshape == NULL) { 3102 /* This personality cannot handle reshaping... */ 3103 mddev->pers = NULL; 3104 module_put(pers->owner); 3105 return -EINVAL; 3106 } 3107 3108 mddev->recovery = 0; 3109 mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */ 3110 mddev->barriers_work = 1; 3111 mddev->ok_start_degraded = start_dirty_degraded; 3112 3113 if (start_readonly) 3114 mddev->ro = 2; /* read-only, but switch on first write */ 3115 3116 err = mddev->pers->run(mddev); 3117 if (!err && mddev->pers->sync_request) { 3118 err = bitmap_create(mddev); 3119 if (err) { 3120 printk(KERN_ERR "%s: failed to create bitmap (%d)\n", 3121 mdname(mddev), err); 3122 mddev->pers->stop(mddev); 3123 } 3124 } 3125 if (err) { 3126 printk(KERN_ERR "md: pers->run() failed ...\n"); 3127 module_put(mddev->pers->owner); 3128 mddev->pers = NULL; 3129 bitmap_destroy(mddev); 3130 return err; 3131 } 3132 if (mddev->pers->sync_request) 3133 sysfs_create_group(&mddev->kobj, &md_redundancy_group); 3134 else if (mddev->ro == 2) /* auto-readonly not meaningful */ 3135 mddev->ro = 0; 3136 3137 atomic_set(&mddev->writes_pending,0); 3138 mddev->safemode = 0; 3139 mddev->safemode_timer.function = md_safemode_timeout; 3140 mddev->safemode_timer.data = (unsigned long) mddev; 3141 mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ 3142 mddev->in_sync = 1; 3143 3144 ITERATE_RDEV(mddev,rdev,tmp) 3145 if (rdev->raid_disk >= 0) { 3146 char nm[20]; 3147 sprintf(nm, "rd%d", rdev->raid_disk); 3148 sysfs_create_link(&mddev->kobj, &rdev->kobj, nm); 3149 } 3150 3151 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3152 3153 if (mddev->flags) 3154 md_update_sb(mddev, 0); 3155 3156 set_capacity(disk, mddev->array_size<<1); 3157 3158 /* If we call blk_queue_make_request here, it will 3159 * re-initialise max_sectors etc which may have been 3160 * refined inside -> run. So just set the bits we need to set. 3161 * Most initialisation happended when we called 3162 * blk_queue_make_request(..., md_fail_request) 3163 * earlier. 3164 */ 3165 mddev->queue->queuedata = mddev; 3166 mddev->queue->make_request_fn = mddev->pers->make_request; 3167 3168 /* If there is a partially-recovered drive we need to 3169 * start recovery here. 
If we leave it to md_check_recovery, 3170 * it will remove the drives and not do the right thing 3171 */ 3172 if (mddev->degraded && !mddev->sync_thread) { 3173 struct list_head *rtmp; 3174 int spares = 0; 3175 ITERATE_RDEV(mddev,rdev,rtmp) 3176 if (rdev->raid_disk >= 0 && 3177 !test_bit(In_sync, &rdev->flags) && 3178 !test_bit(Faulty, &rdev->flags)) 3179 /* complete an interrupted recovery */ 3180 spares++; 3181 if (spares && mddev->pers->sync_request) { 3182 mddev->recovery = 0; 3183 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 3184 mddev->sync_thread = md_register_thread(md_do_sync, 3185 mddev, 3186 "%s_resync"); 3187 if (!mddev->sync_thread) { 3188 printk(KERN_ERR "%s: could not start resync" 3189 " thread...\n", 3190 mdname(mddev)); 3191 /* leave the spares where they are, it shouldn't hurt */ 3192 mddev->recovery = 0; 3193 } 3194 } 3195 } 3196 md_wakeup_thread(mddev->thread); 3197 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ 3198 3199 mddev->changed = 1; 3200 md_new_event(mddev); 3201 return 0; 3202} 3203 3204static int restart_array(mddev_t *mddev) 3205{ 3206 struct gendisk *disk = mddev->gendisk; 3207 int err; 3208 3209 /* 3210 * Complain if it has no devices 3211 */ 3212 err = -ENXIO; 3213 if (list_empty(&mddev->disks)) 3214 goto out; 3215 3216 if (mddev->pers) { 3217 err = -EBUSY; 3218 if (!mddev->ro) 3219 goto out; 3220 3221 mddev->safemode = 0; 3222 mddev->ro = 0; 3223 set_disk_ro(disk, 0); 3224 3225 printk(KERN_INFO "md: %s switched to read-write mode.\n", 3226 mdname(mddev)); 3227 /* 3228 * Kick recovery or resync if necessary 3229 */ 3230 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3231 md_wakeup_thread(mddev->thread); 3232 md_wakeup_thread(mddev->sync_thread); 3233 err = 0; 3234 } else 3235 err = -EINVAL; 3236 3237out: 3238 return err; 3239} 3240 3241/* similar to deny_write_access, but accounts for our holding a reference 3242 * to the file ourselves */ 3243static int deny_bitmap_write_access(struct file * file) 3244{ 3245 struct inode *inode = file->f_mapping->host; 3246 3247 spin_lock(&inode->i_lock); 3248 if (atomic_read(&inode->i_writecount) > 1) { 3249 spin_unlock(&inode->i_lock); 3250 return -ETXTBSY; 3251 } 3252 atomic_set(&inode->i_writecount, -1); 3253 spin_unlock(&inode->i_lock); 3254 3255 return 0; 3256} 3257 3258static void restore_bitmap_write_access(struct file *file) 3259{ 3260 struct inode *inode = file->f_mapping->host; 3261 3262 spin_lock(&inode->i_lock); 3263 atomic_set(&inode->i_writecount, 1); 3264 spin_unlock(&inode->i_lock); 3265} 3266 3267/* mode: 3268 * 0 - completely stop and dis-assemble array 3269 * 1 - switch to readonly 3270 * 2 - stop but do not disassemble array 3271 */ 3272static int do_md_stop(mddev_t * mddev, int mode) 3273{ 3274 int err = 0; 3275 struct gendisk *disk = mddev->gendisk; 3276 3277 if (mddev->pers) { 3278 if (atomic_read(&mddev->active)>2) { 3279 printk("md: %s still in use.\n",mdname(mddev)); 3280 return -EBUSY; 3281 } 3282 3283 if (mddev->sync_thread) { 3284 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 3285 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 3286 md_unregister_thread(mddev->sync_thread); 3287 mddev->sync_thread = NULL; 3288 } 3289 3290 del_timer_sync(&mddev->safemode_timer); 3291 3292 invalidate_partition(disk, 0); 3293 3294 switch(mode) { 3295 case 1: /* readonly */ 3296 err = -ENXIO; 3297 if (mddev->ro==1) 3298 goto out; 3299 mddev->ro = 1; 3300 break; 3301 case 0: /* disassemble */ 3302 case 2: /* stop */ 3303 bitmap_flush(mddev); 3304 md_super_wait(mddev); 3305 if 
(mddev->ro) 3306 set_disk_ro(disk, 0); 3307 blk_queue_make_request(mddev->queue, md_fail_request); 3308 mddev->pers->stop(mddev); 3309 if (mddev->pers->sync_request) 3310 sysfs_remove_group(&mddev->kobj, &md_redundancy_group); 3311 3312 module_put(mddev->pers->owner); 3313 mddev->pers = NULL; 3314 if (mddev->ro) 3315 mddev->ro = 0; 3316 } 3317 if (!mddev->in_sync || mddev->flags) { 3318 /* mark array as shutdown cleanly */ 3319 mddev->in_sync = 1; 3320 md_update_sb(mddev, 1); 3321 } 3322 if (mode == 1) 3323 set_disk_ro(disk, 1); 3324 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 3325 } 3326 3327 /* 3328 * Free resources if final stop 3329 */ 3330 if (mode == 0) { 3331 mdk_rdev_t *rdev; 3332 struct list_head *tmp; 3333 struct gendisk *disk; 3334 printk(KERN_INFO "md: %s stopped.\n", mdname(mddev)); 3335 3336 bitmap_destroy(mddev); 3337 if (mddev->bitmap_file) { 3338 restore_bitmap_write_access(mddev->bitmap_file); 3339 fput(mddev->bitmap_file); 3340 mddev->bitmap_file = NULL; 3341 } 3342 mddev->bitmap_offset = 0; 3343 3344 ITERATE_RDEV(mddev,rdev,tmp) 3345 if (rdev->raid_disk >= 0) { 3346 char nm[20]; 3347 sprintf(nm, "rd%d", rdev->raid_disk); 3348 sysfs_remove_link(&mddev->kobj, nm); 3349 } 3350 3351 export_array(mddev); 3352 3353 mddev->array_size = 0; 3354 mddev->size = 0; 3355 mddev->raid_disks = 0; 3356 mddev->recovery_cp = 0; 3357 3358 disk = mddev->gendisk; 3359 if (disk) 3360 set_capacity(disk, 0); 3361 mddev->changed = 1; 3362 } else if (mddev->pers) 3363 printk(KERN_INFO "md: %s switched to read-only mode.\n", 3364 mdname(mddev)); 3365 err = 0; 3366 md_new_event(mddev); 3367out: 3368 return err; 3369} 3370 3371static void autorun_array(mddev_t *mddev) 3372{ 3373 mdk_rdev_t *rdev; 3374 struct list_head *tmp; 3375 int err; 3376 3377 if (list_empty(&mddev->disks)) 3378 return; 3379 3380 printk(KERN_INFO "md: running: "); 3381 3382 ITERATE_RDEV(mddev,rdev,tmp) { 3383 char b[BDEVNAME_SIZE]; 3384 printk("<%s>", bdevname(rdev->bdev,b)); 3385 } 3386 printk("\n"); 3387 3388 err = do_md_run (mddev); 3389 if (err) { 3390 printk(KERN_WARNING "md: do_md_run() returned %d\n", err); 3391 do_md_stop (mddev, 0); 3392 } 3393} 3394 3395/* 3396 * lets try to run arrays based on all disks that have arrived 3397 * until now. (those are in pending_raid_disks) 3398 * 3399 * the method: pick the first pending disk, collect all disks with 3400 * the same UUID, remove all from the pending list and put them into 3401 * the 'same_array' list. Then order this list based on superblock 3402 * update time (freshest comes first), kick out 'old' disks and 3403 * compare superblocks. If everything's fine then run it. 3404 * 3405 * If "unit" is allocated, then bump its reference count 3406 */ 3407static void autorun_devices(int part) 3408{ 3409 struct list_head *tmp; 3410 mdk_rdev_t *rdev0, *rdev; 3411 mddev_t *mddev; 3412 char b[BDEVNAME_SIZE]; 3413 3414 printk(KERN_INFO "md: autorun ...\n"); 3415 while (!list_empty(&pending_raid_disks)) { 3416 int unit; 3417 dev_t dev; 3418 LIST_HEAD(candidates); 3419 rdev0 = list_entry(pending_raid_disks.next, 3420 mdk_rdev_t, same_set); 3421 3422 printk(KERN_INFO "md: considering %s ...\n", 3423 bdevname(rdev0->bdev,b)); 3424 INIT_LIST_HEAD(&candidates); 3425 ITERATE_RDEV_PENDING(rdev,tmp) 3426 if (super_90_load(rdev, rdev0, 0) >= 0) { 3427 printk(KERN_INFO "md: adding %s ...\n", 3428 bdevname(rdev->bdev,b)); 3429 list_move(&rdev->same_set, &candidates); 3430 } 3431 /* 3432 * now we have a set of devices, with all of them having 3433 * mostly sane superblocks. 
It's time to allocate the 3434 * mddev. 3435 */ 3436 if (part) { 3437 dev = MKDEV(mdp_major, 3438 rdev0->preferred_minor << MdpMinorShift); 3439 unit = MINOR(dev) >> MdpMinorShift; 3440 } else { 3441 dev = MKDEV(MD_MAJOR, rdev0->preferred_minor); 3442 unit = MINOR(dev); 3443 } 3444 if (rdev0->preferred_minor != unit) { 3445 printk(KERN_INFO "md: unit number in %s is bad: %d\n", 3446 bdevname(rdev0->bdev, b), rdev0->preferred_minor); 3447 break; 3448 } 3449 3450 md_probe(dev, NULL, NULL); 3451 mddev = mddev_find(dev); 3452 if (!mddev) { 3453 printk(KERN_ERR 3454 "md: cannot allocate memory for md drive.\n"); 3455 break; 3456 } 3457 if (mddev_lock(mddev)) 3458 printk(KERN_WARNING "md: %s locked, cannot run\n", 3459 mdname(mddev)); 3460 else if (mddev->raid_disks || mddev->major_version 3461 || !list_empty(&mddev->disks)) { 3462 printk(KERN_WARNING 3463 "md: %s already running, cannot run %s\n", 3464 mdname(mddev), bdevname(rdev0->bdev,b)); 3465 mddev_unlock(mddev); 3466 } else { 3467 printk(KERN_INFO "md: created %s\n", mdname(mddev)); 3468 ITERATE_RDEV_GENERIC(candidates,rdev,tmp) { 3469 list_del_init(&rdev->same_set); 3470 if (bind_rdev_to_array(rdev, mddev)) 3471 export_rdev(rdev); 3472 } 3473 autorun_array(mddev); 3474 mddev_unlock(mddev); 3475 } 3476 /* on success, candidates will be empty, on error 3477 * it won't... 3478 */ 3479 ITERATE_RDEV_GENERIC(candidates,rdev,tmp) 3480 export_rdev(rdev); 3481 mddev_put(mddev); 3482 } 3483 printk(KERN_INFO "md: ... autorun DONE.\n"); 3484} 3485 3486static int get_version(void __user * arg) 3487{ 3488 mdu_version_t ver; 3489 3490 ver.major = MD_MAJOR_VERSION; 3491 ver.minor = MD_MINOR_VERSION; 3492 ver.patchlevel = MD_PATCHLEVEL_VERSION; 3493 3494 if (copy_to_user(arg, &ver, sizeof(ver))) 3495 return -EFAULT; 3496 3497 return 0; 3498} 3499 3500static int get_array_info(mddev_t * mddev, void __user * arg) 3501{ 3502 mdu_array_info_t info; 3503 int nr,working,active,failed,spare; 3504 mdk_rdev_t *rdev; 3505 struct list_head *tmp; 3506 3507 nr=working=active=failed=spare=0; 3508 ITERATE_RDEV(mddev,rdev,tmp) { 3509 nr++; 3510 if (test_bit(Faulty, &rdev->flags)) 3511 failed++; 3512 else { 3513 working++; 3514 if (test_bit(In_sync, &rdev->flags)) 3515 active++; 3516 else 3517 spare++; 3518 } 3519 } 3520 3521 info.major_version = mddev->major_version; 3522 info.minor_version = mddev->minor_version; 3523 info.patch_version = MD_PATCHLEVEL_VERSION; 3524 info.ctime = mddev->ctime; 3525 info.level = mddev->level; 3526 info.size = mddev->size; 3527 if (info.size != mddev->size) /* overflow */ 3528 info.size = -1; 3529 info.nr_disks = nr; 3530 info.raid_disks = mddev->raid_disks; 3531 info.md_minor = mddev->md_minor; 3532 info.not_persistent= !mddev->persistent; 3533 3534 info.utime = mddev->utime; 3535 info.state = 0; 3536 if (mddev->in_sync) 3537 info.state = (1<<MD_SB_CLEAN); 3538 if (mddev->bitmap && mddev->bitmap_offset) 3539 info.state = (1<<MD_SB_BITMAP_PRESENT); 3540 info.active_disks = active; 3541 info.working_disks = working; 3542 info.failed_disks = failed; 3543 info.spare_disks = spare; 3544 3545 info.layout = mddev->layout; 3546 info.chunk_size = mddev->chunk_size; 3547 3548 if (copy_to_user(arg, &info, sizeof(info))) 3549 return -EFAULT; 3550 3551 return 0; 3552} 3553 3554static int get_bitmap_file(mddev_t * mddev, void __user * arg) 3555{ 3556 mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ 3557 char *ptr, *buf = NULL; 3558 int err = -ENOMEM; 3559 3560 file = kmalloc(sizeof(*file), GFP_KERNEL); 3561 if (!file) 3562 goto out; 
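	/* The structure is copied back to user space even when there is no
	 * bitmap; an empty pathname tells the caller (the GET_BITMAP_FILE
	 * ioctl) that no file-backed bitmap is in use.
	 */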
3563 3564 /* bitmap disabled, zero the first byte and copy out */ 3565 if (!mddev->bitmap || !mddev->bitmap->file) { 3566 file->pathname[0] = '\0'; 3567 goto copy_out; 3568 } 3569 3570 buf = kmalloc(sizeof(file->pathname), GFP_KERNEL); 3571 if (!buf) 3572 goto out; 3573 3574 ptr = file_path(mddev->bitmap->file, buf, sizeof(file->pathname)); 3575 if (!ptr) 3576 goto out; 3577 3578 strcpy(file->pathname, ptr); 3579 3580copy_out: 3581 err = 0; 3582 if (copy_to_user(arg, file, sizeof(*file))) 3583 err = -EFAULT; 3584out: 3585 kfree(buf); 3586 kfree(file); 3587 return err; 3588} 3589 3590static int get_disk_info(mddev_t * mddev, void __user * arg) 3591{ 3592 mdu_disk_info_t info; 3593 unsigned int nr; 3594 mdk_rdev_t *rdev; 3595 3596 if (copy_from_user(&info, arg, sizeof(info))) 3597 return -EFAULT; 3598 3599 nr = info.number; 3600 3601 rdev = find_rdev_nr(mddev, nr); 3602 if (rdev) { 3603 info.major = MAJOR(rdev->bdev->bd_dev); 3604 info.minor = MINOR(rdev->bdev->bd_dev); 3605 info.raid_disk = rdev->raid_disk; 3606 info.state = 0; 3607 if (test_bit(Faulty, &rdev->flags)) 3608 info.state |= (1<<MD_DISK_FAULTY); 3609 else if (test_bit(In_sync, &rdev->flags)) { 3610 info.state |= (1<<MD_DISK_ACTIVE); 3611 info.state |= (1<<MD_DISK_SYNC); 3612 } 3613 if (test_bit(WriteMostly, &rdev->flags)) 3614 info.state |= (1<<MD_DISK_WRITEMOSTLY); 3615 } else { 3616 info.major = info.minor = 0; 3617 info.raid_disk = -1; 3618 info.state = (1<<MD_DISK_REMOVED); 3619 } 3620 3621 if (copy_to_user(arg, &info, sizeof(info))) 3622 return -EFAULT; 3623 3624 return 0; 3625} 3626 3627static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) 3628{ 3629 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 3630 mdk_rdev_t *rdev; 3631 dev_t dev = MKDEV(info->major,info->minor); 3632 3633 if (info->major != MAJOR(dev) || info->minor != MINOR(dev)) 3634 return -EOVERFLOW; 3635 3636 if (!mddev->raid_disks) { 3637 int err; 3638 /* expecting a device which has a superblock */ 3639 rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); 3640 if (IS_ERR(rdev)) { 3641 printk(KERN_WARNING 3642 "md: md_import_device returned %ld\n", 3643 PTR_ERR(rdev)); 3644 return PTR_ERR(rdev); 3645 } 3646 if (!list_empty(&mddev->disks)) { 3647 mdk_rdev_t *rdev0 = list_entry(mddev->disks.next, 3648 mdk_rdev_t, same_set); 3649 int err = super_types[mddev->major_version] 3650 .load_super(rdev, rdev0, mddev->minor_version); 3651 if (err < 0) { 3652 printk(KERN_WARNING 3653 "md: %s has different UUID to %s\n", 3654 bdevname(rdev->bdev,b), 3655 bdevname(rdev0->bdev,b2)); 3656 export_rdev(rdev); 3657 return -EINVAL; 3658 } 3659 } 3660 err = bind_rdev_to_array(rdev, mddev); 3661 if (err) 3662 export_rdev(rdev); 3663 return err; 3664 } 3665 3666 /* 3667 * add_new_disk can be used once the array is assembled 3668 * to add "hot spares". 
They must already have a superblock 3669 * written 3670 */ 3671 if (mddev->pers) { 3672 int err; 3673 if (!mddev->pers->hot_add_disk) { 3674 printk(KERN_WARNING 3675 "%s: personality does not support diskops!\n", 3676 mdname(mddev)); 3677 return -EINVAL; 3678 } 3679 if (mddev->persistent) 3680 rdev = md_import_device(dev, mddev->major_version, 3681 mddev->minor_version); 3682 else 3683 rdev = md_import_device(dev, -1, -1); 3684 if (IS_ERR(rdev)) { 3685 printk(KERN_WARNING 3686 "md: md_import_device returned %ld\n", 3687 PTR_ERR(rdev)); 3688 return PTR_ERR(rdev); 3689 } 3690 /* set save_raid_disk if appropriate */ 3691 if (!mddev->persistent) { 3692 if (info->state & (1<<MD_DISK_SYNC) && 3693 info->raid_disk < mddev->raid_disks) 3694 rdev->raid_disk = info->raid_disk; 3695 else 3696 rdev->raid_disk = -1; 3697 } else 3698 super_types[mddev->major_version]. 3699 validate_super(mddev, rdev); 3700 rdev->saved_raid_disk = rdev->raid_disk; 3701 3702 clear_bit(In_sync, &rdev->flags); /* just to be sure */ 3703 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 3704 set_bit(WriteMostly, &rdev->flags); 3705 3706 rdev->raid_disk = -1; 3707 err = bind_rdev_to_array(rdev, mddev); 3708 if (!err && !mddev->pers->hot_remove_disk) { 3709 /* If there is hot_add_disk but no hot_remove_disk 3710 * then added disks for geometry changes, 3711 * and should be added immediately. 3712 */ 3713 super_types[mddev->major_version]. 3714 validate_super(mddev, rdev); 3715 err = mddev->pers->hot_add_disk(mddev, rdev); 3716 if (err) 3717 unbind_rdev_from_array(rdev); 3718 } 3719 if (err) 3720 export_rdev(rdev); 3721 3722 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3723 md_wakeup_thread(mddev->thread); 3724 return err; 3725 } 3726 3727 /* otherwise, add_new_disk is only allowed 3728 * for major_version==0 superblocks 3729 */ 3730 if (mddev->major_version != 0) { 3731 printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n", 3732 mdname(mddev)); 3733 return -EINVAL; 3734 } 3735 3736 if (!(info->state & (1<<MD_DISK_FAULTY))) { 3737 int err; 3738 rdev = md_import_device (dev, -1, 0); 3739 if (IS_ERR(rdev)) { 3740 printk(KERN_WARNING 3741 "md: error, md_import_device() returned %ld\n", 3742 PTR_ERR(rdev)); 3743 return PTR_ERR(rdev); 3744 } 3745 rdev->desc_nr = info->number; 3746 if (info->raid_disk < mddev->raid_disks) 3747 rdev->raid_disk = info->raid_disk; 3748 else 3749 rdev->raid_disk = -1; 3750 3751 rdev->flags = 0; 3752 3753 if (rdev->raid_disk < mddev->raid_disks) 3754 if (info->state & (1<<MD_DISK_SYNC)) 3755 set_bit(In_sync, &rdev->flags); 3756 3757 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 3758 set_bit(WriteMostly, &rdev->flags); 3759 3760 if (!mddev->persistent) { 3761 printk(KERN_INFO "md: nonpersistent superblock ...\n"); 3762 rdev->sb_offset = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 3763 } else 3764 rdev->sb_offset = calc_dev_sboffset(rdev->bdev); 3765 rdev->size = calc_dev_size(rdev, mddev->chunk_size); 3766 3767 err = bind_rdev_to_array(rdev, mddev); 3768 if (err) { 3769 export_rdev(rdev); 3770 return err; 3771 } 3772 } 3773 3774 return 0; 3775} 3776 3777static int hot_remove_disk(mddev_t * mddev, dev_t dev) 3778{ 3779 char b[BDEVNAME_SIZE]; 3780 mdk_rdev_t *rdev; 3781 3782 if (!mddev->pers) 3783 return -ENODEV; 3784 3785 rdev = find_rdev(mddev, dev); 3786 if (!rdev) 3787 return -ENXIO; 3788 3789 if (rdev->raid_disk >= 0) 3790 goto busy; 3791 3792 kick_rdev_from_array(rdev); 3793 md_update_sb(mddev, 1); 3794 md_new_event(mddev); 3795 3796 return 0; 3797busy: 3798 printk(KERN_WARNING "md: cannot remove 
active disk %s from %s ... \n", 3799 bdevname(rdev->bdev,b), mdname(mddev)); 3800 return -EBUSY; 3801} 3802 3803static int hot_add_disk(mddev_t * mddev, dev_t dev) 3804{ 3805 char b[BDEVNAME_SIZE]; 3806 int err; 3807 unsigned int size; 3808 mdk_rdev_t *rdev; 3809 3810 if (!mddev->pers) 3811 return -ENODEV; 3812 3813 if (mddev->major_version != 0) { 3814 printk(KERN_WARNING "%s: HOT_ADD may only be used with" 3815 " version-0 superblocks.\n", 3816 mdname(mddev)); 3817 return -EINVAL; 3818 } 3819 if (!mddev->pers->hot_add_disk) { 3820 printk(KERN_WARNING 3821 "%s: personality does not support diskops!\n", 3822 mdname(mddev)); 3823 return -EINVAL; 3824 } 3825 3826 rdev = md_import_device (dev, -1, 0); 3827 if (IS_ERR(rdev)) { 3828 printk(KERN_WARNING 3829 "md: error, md_import_device() returned %ld\n", 3830 PTR_ERR(rdev)); 3831 return -EINVAL; 3832 } 3833 3834 if (mddev->persistent) 3835 rdev->sb_offset = calc_dev_sboffset(rdev->bdev); 3836 else 3837 rdev->sb_offset = 3838 rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 3839 3840 size = calc_dev_size(rdev, mddev->chunk_size); 3841 rdev->size = size; 3842 3843 if (test_bit(Faulty, &rdev->flags)) { 3844 printk(KERN_WARNING 3845 "md: can not hot-add faulty %s disk to %s!\n", 3846 bdevname(rdev->bdev,b), mdname(mddev)); 3847 err = -EINVAL; 3848 goto abort_export; 3849 } 3850 clear_bit(In_sync, &rdev->flags); 3851 rdev->desc_nr = -1; 3852 rdev->saved_raid_disk = -1; 3853 err = bind_rdev_to_array(rdev, mddev); 3854 if (err) 3855 goto abort_export; 3856 3857 /* 3858 * The rest should better be atomic, we can have disk failures 3859 * noticed in interrupt contexts ... 3860 */ 3861 3862 if (rdev->desc_nr == mddev->max_disks) { 3863 printk(KERN_WARNING "%s: can not hot-add to full array!\n", 3864 mdname(mddev)); 3865 err = -EBUSY; 3866 goto abort_unbind_export; 3867 } 3868 3869 rdev->raid_disk = -1; 3870 3871 md_update_sb(mddev, 1); 3872 3873 /* 3874 * Kick recovery, maybe this spare has to be added to the 3875 * array immediately. 3876 */ 3877 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3878 md_wakeup_thread(mddev->thread); 3879 md_new_event(mddev); 3880 return 0; 3881 3882abort_unbind_export: 3883 unbind_rdev_from_array(rdev); 3884 3885abort_export: 3886 export_rdev(rdev); 3887 return err; 3888} 3889 3890static int set_bitmap_file(mddev_t *mddev, int fd) 3891{ 3892 int err; 3893 3894 if (mddev->pers) { 3895 if (!mddev->pers->quiesce) 3896 return -EBUSY; 3897 if (mddev->recovery || mddev->sync_thread) 3898 return -EBUSY; 3899 /* we should be able to change the bitmap.. 
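		 * (the checks above ensure the personality can quiesce the array
		 * and that no resync or recovery is in flight, which the
		 * quiesce/bitmap_create/quiesce sequence further down relies on)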
*/ 3900 } 3901 3902 3903 if (fd >= 0) { 3904 if (mddev->bitmap) 3905 return -EEXIST; /* cannot add when bitmap is present */ 3906 mddev->bitmap_file = fget(fd); 3907 3908 if (mddev->bitmap_file == NULL) { 3909 printk(KERN_ERR "%s: error: failed to get bitmap file\n", 3910 mdname(mddev)); 3911 return -EBADF; 3912 } 3913 3914 err = deny_bitmap_write_access(mddev->bitmap_file); 3915 if (err) { 3916 printk(KERN_ERR "%s: error: bitmap file is already in use\n", 3917 mdname(mddev)); 3918 fput(mddev->bitmap_file); 3919 mddev->bitmap_file = NULL; 3920 return err; 3921 } 3922 mddev->bitmap_offset = 0; /* file overrides offset */ 3923 } else if (mddev->bitmap == NULL) 3924 return -ENOENT; /* cannot remove what isn't there */ 3925 err = 0; 3926 if (mddev->pers) { 3927 mddev->pers->quiesce(mddev, 1); 3928 if (fd >= 0) 3929 err = bitmap_create(mddev); 3930 if (fd < 0 || err) { 3931 bitmap_destroy(mddev); 3932 fd = -1; /* make sure to put the file */ 3933 } 3934 mddev->pers->quiesce(mddev, 0); 3935 } 3936 if (fd < 0) { 3937 if (mddev->bitmap_file) { 3938 restore_bitmap_write_access(mddev->bitmap_file); 3939 fput(mddev->bitmap_file); 3940 } 3941 mddev->bitmap_file = NULL; 3942 } 3943 3944 return err; 3945} 3946 3947/* 3948 * set_array_info is used two different ways 3949 * The original usage is when creating a new array. 3950 * In this usage, raid_disks is > 0 and it together with 3951 * level, size, not_persistent,layout,chunksize determine the 3952 * shape of the array. 3953 * This will always create an array with a type-0.90.0 superblock. 3954 * The newer usage is when assembling an array. 3955 * In this case raid_disks will be 0, and the major_version field is 3956 * use to determine which style super-blocks are to be found on the devices. 3957 * The minor and patch _version numbers are also kept incase the 3958 * super_block handler wishes to interpret them. 3959 */ 3960static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) 3961{ 3962 3963 if (info->raid_disks == 0) { 3964 /* just setting version number for superblock loading */ 3965 if (info->major_version < 0 || 3966 info->major_version >= sizeof(super_types)/sizeof(super_types[0]) || 3967 super_types[info->major_version].name == NULL) { 3968 /* maybe try to auto-load a module? */ 3969 printk(KERN_INFO 3970 "md: superblock version %d not known\n", 3971 info->major_version); 3972 return -EINVAL; 3973 } 3974 mddev->major_version = info->major_version; 3975 mddev->minor_version = info->minor_version; 3976 mddev->patch_version = info->patch_version; 3977 return 0; 3978 } 3979 mddev->major_version = MD_MAJOR_VERSION; 3980 mddev->minor_version = MD_MINOR_VERSION; 3981 mddev->patch_version = MD_PATCHLEVEL_VERSION; 3982 mddev->ctime = get_seconds(); 3983 3984 mddev->level = info->level; 3985 mddev->clevel[0] = 0; 3986 mddev->size = info->size; 3987 mddev->raid_disks = info->raid_disks; 3988 /* don't set md_minor, it is determined by which /dev/md* was 3989 * openned 3990 */ 3991 if (info->state & (1<<MD_SB_CLEAN)) 3992 mddev->recovery_cp = MaxSector; 3993 else 3994 mddev->recovery_cp = 0; 3995 mddev->persistent = ! 
info->not_persistent; 3996 3997 mddev->layout = info->layout; 3998 mddev->chunk_size = info->chunk_size; 3999 4000 mddev->max_disks = MD_SB_DISKS; 4001 4002 mddev->flags = 0; 4003 set_bit(MD_CHANGE_DEVS, &mddev->flags); 4004 4005 mddev->default_bitmap_offset = MD_SB_BYTES >> 9; 4006 mddev->bitmap_offset = 0; 4007 4008 mddev->reshape_position = MaxSector; 4009 4010 /* 4011 * Generate a 128 bit UUID 4012 */ 4013 get_random_bytes(mddev->uuid, 16); 4014 4015 mddev->new_level = mddev->level; 4016 mddev->new_chunk = mddev->chunk_size; 4017 mddev->new_layout = mddev->layout; 4018 mddev->delta_disks = 0; 4019 4020 return 0; 4021} 4022 4023static int update_size(mddev_t *mddev, unsigned long size) 4024{ 4025 mdk_rdev_t * rdev; 4026 int rv; 4027 struct list_head *tmp; 4028 int fit = (size == 0); 4029 4030 if (mddev->pers->resize == NULL) 4031 return -EINVAL; 4032 /* The "size" is the amount of each device that is used. 4033 * This can only make sense for arrays with redundancy. 4034 * linear and raid0 always use whatever space is available 4035 * We can only consider changing the size if no resync 4036 * or reconstruction is happening, and if the new size 4037 * is acceptable. It must fit before the sb_offset or, 4038 * if that is <data_offset, it must fit before the 4039 * size of each device. 4040 * If size is zero, we find the largest size that fits. 4041 */ 4042 if (mddev->sync_thread) 4043 return -EBUSY; 4044 ITERATE_RDEV(mddev,rdev,tmp) { 4045 sector_t avail; 4046 if (rdev->sb_offset > rdev->data_offset) 4047 avail = (rdev->sb_offset*2) - rdev->data_offset; 4048 else 4049 avail = get_capacity(rdev->bdev->bd_disk) 4050 - rdev->data_offset; 4051 if (fit && (size == 0 || size > avail/2)) 4052 size = avail/2; 4053 if (avail < ((sector_t)size << 1)) 4054 return -ENOSPC; 4055 } 4056 rv = mddev->pers->resize(mddev, (sector_t)size *2); 4057 if (!rv) { 4058 struct block_device *bdev; 4059 4060 bdev = bdget_disk(mddev->gendisk, 0); 4061 if (bdev) { 4062 mutex_lock(&bdev->bd_inode->i_mutex); 4063 i_size_write(bdev->bd_inode, (loff_t)mddev->array_size << 10); 4064 mutex_unlock(&bdev->bd_inode->i_mutex); 4065 bdput(bdev); 4066 } 4067 } 4068 return rv; 4069} 4070 4071static int update_raid_disks(mddev_t *mddev, int raid_disks) 4072{ 4073 int rv; 4074 /* change the number of raid disks */ 4075 if (mddev->pers->check_reshape == NULL) 4076 return -EINVAL; 4077 if (raid_disks <= 0 || 4078 raid_disks >= mddev->max_disks) 4079 return -EINVAL; 4080 if (mddev->sync_thread || mddev->reshape_position != MaxSector) 4081 return -EBUSY; 4082 mddev->delta_disks = raid_disks - mddev->raid_disks; 4083 4084 rv = mddev->pers->check_reshape(mddev); 4085 return rv; 4086} 4087 4088 4089/* 4090 * update_array_info is used to change the configuration of an 4091 * on-line array. 4092 * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size 4093 * fields in the info are checked against the array. 4094 * Any differences that cannot be handled will cause an error. 4095 * Normally, only one change can be managed at a time. 
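 *
 * For example, an info block that differs from the live array only in
 * 'size' is handed to update_size() below, while one that changes both
 * size and raid_disks trips the single-change check and returns -EINVAL.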
4096 */ 4097static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) 4098{ 4099 int rv = 0; 4100 int cnt = 0; 4101 int state = 0; 4102 4103 /* calculate expected state,ignoring low bits */ 4104 if (mddev->bitmap && mddev->bitmap_offset) 4105 state |= (1 << MD_SB_BITMAP_PRESENT); 4106 4107 if (mddev->major_version != info->major_version || 4108 mddev->minor_version != info->minor_version || 4109/* mddev->patch_version != info->patch_version || */ 4110 mddev->ctime != info->ctime || 4111 mddev->level != info->level || 4112/* mddev->layout != info->layout || */ 4113 !mddev->persistent != info->not_persistent|| 4114 mddev->chunk_size != info->chunk_size || 4115 /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */ 4116 ((state^info->state) & 0xfffffe00) 4117 ) 4118 return -EINVAL; 4119 /* Check there is only one change */ 4120 if (info->size >= 0 && mddev->size != info->size) cnt++; 4121 if (mddev->raid_disks != info->raid_disks) cnt++; 4122 if (mddev->layout != info->layout) cnt++; 4123 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) cnt++; 4124 if (cnt == 0) return 0; 4125 if (cnt > 1) return -EINVAL; 4126 4127 if (mddev->layout != info->layout) { 4128 /* Change layout 4129 * we don't need to do anything at the md level, the 4130 * personality will take care of it all. 4131 */ 4132 if (mddev->pers->reconfig == NULL) 4133 return -EINVAL; 4134 else 4135 return mddev->pers->reconfig(mddev, info->layout, -1); 4136 } 4137 if (info->size >= 0 && mddev->size != info->size) 4138 rv = update_size(mddev, info->size); 4139 4140 if (mddev->raid_disks != info->raid_disks) 4141 rv = update_raid_disks(mddev, info->raid_disks); 4142 4143 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) { 4144 if (mddev->pers->quiesce == NULL) 4145 return -EINVAL; 4146 if (mddev->recovery || mddev->sync_thread) 4147 return -EBUSY; 4148 if (info->state & (1<<MD_SB_BITMAP_PRESENT)) { 4149 /* add the bitmap */ 4150 if (mddev->bitmap) 4151 return -EEXIST; 4152 if (mddev->default_bitmap_offset == 0) 4153 return -EINVAL; 4154 mddev->bitmap_offset = mddev->default_bitmap_offset; 4155 mddev->pers->quiesce(mddev, 1); 4156 rv = bitmap_create(mddev); 4157 if (rv) 4158 bitmap_destroy(mddev); 4159 mddev->pers->quiesce(mddev, 0); 4160 } else { 4161 /* remove the bitmap */ 4162 if (!mddev->bitmap) 4163 return -ENOENT; 4164 if (mddev->bitmap->file) 4165 return -EINVAL; 4166 mddev->pers->quiesce(mddev, 1); 4167 bitmap_destroy(mddev); 4168 mddev->pers->quiesce(mddev, 0); 4169 mddev->bitmap_offset = 0; 4170 } 4171 } 4172 md_update_sb(mddev, 1); 4173 return rv; 4174} 4175 4176static int set_disk_faulty(mddev_t *mddev, dev_t dev) 4177{ 4178 mdk_rdev_t *rdev; 4179 4180 if (mddev->pers == NULL) 4181 return -ENODEV; 4182 4183 rdev = find_rdev(mddev, dev); 4184 if (!rdev) 4185 return -ENODEV; 4186 4187 md_error(mddev, rdev); 4188 return 0; 4189} 4190 4191static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) 4192{ 4193 mddev_t *mddev = bdev->bd_disk->private_data; 4194 4195 geo->heads = 2; 4196 geo->sectors = 4; 4197 geo->cylinders = get_capacity(mddev->gendisk) / 8; 4198 return 0; 4199} 4200 4201static int md_ioctl(struct inode *inode, struct file *file, 4202 unsigned int cmd, unsigned long arg) 4203{ 4204 int err = 0; 4205 void __user *argp = (void __user *)arg; 4206 mddev_t *mddev = NULL; 4207 4208 if (!capable(CAP_SYS_ADMIN)) 4209 return -EACCES; 4210 4211 /* 4212 * Commands dealing with the RAID driver but not any 4213 * particular array: 4214 */ 4215 switch (cmd) 4216 { 4217 case 

static int set_disk_faulty(mddev_t *mddev, dev_t dev)
{
	mdk_rdev_t *rdev;

	if (mddev->pers == NULL)
		return -ENODEV;

	rdev = find_rdev(mddev, dev);
	if (!rdev)
		return -ENODEV;

	md_error(mddev, rdev);
	return 0;
}

static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{
	mddev_t *mddev = bdev->bd_disk->private_data;

	geo->heads = 2;
	geo->sectors = 4;
	geo->cylinders = get_capacity(mddev->gendisk) / 8;
	return 0;
}

static int md_ioctl(struct inode *inode, struct file *file,
			unsigned int cmd, unsigned long arg)
{
	int err = 0;
	void __user *argp = (void __user *)arg;
	mddev_t *mddev = NULL;

	if (!capable(CAP_SYS_ADMIN))
		return -EACCES;

	/*
	 * Commands dealing with the RAID driver but not any
	 * particular array:
	 */
	switch (cmd)
	{
		case RAID_VERSION:
			err = get_version(argp);
			goto done;

		case PRINT_RAID_DEBUG:
			err = 0;
			md_print_devices();
			goto done;

#ifndef MODULE
		case RAID_AUTORUN:
			err = 0;
			autostart_arrays(arg);
			goto done;
#endif
		default:;
	}

	/*
	 * Commands creating/starting a new array:
	 */

	mddev = inode->i_bdev->bd_disk->private_data;

	if (!mddev) {
		BUG();
		goto abort;
	}

	err = mddev_lock(mddev);
	if (err) {
		printk(KERN_INFO
			"md: ioctl lock interrupted, reason %d, cmd %d\n",
			err, cmd);
		goto abort;
	}

	switch (cmd)
	{
		case SET_ARRAY_INFO:
		{
			mdu_array_info_t info;
			if (!arg)
				memset(&info, 0, sizeof(info));
			else if (copy_from_user(&info, argp, sizeof(info))) {
				err = -EFAULT;
				goto abort_unlock;
			}
			if (mddev->pers) {
				err = update_array_info(mddev, &info);
				if (err) {
					printk(KERN_WARNING "md: couldn't update"
					       " array info. %d\n", err);
					goto abort_unlock;
				}
				goto done_unlock;
			}
			if (!list_empty(&mddev->disks)) {
				printk(KERN_WARNING
				       "md: array %s already has disks!\n",
				       mdname(mddev));
				err = -EBUSY;
				goto abort_unlock;
			}
			if (mddev->raid_disks) {
				printk(KERN_WARNING
				       "md: array %s already initialised!\n",
				       mdname(mddev));
				err = -EBUSY;
				goto abort_unlock;
			}
			err = set_array_info(mddev, &info);
			if (err) {
				printk(KERN_WARNING "md: couldn't set"
				       " array info. %d\n", err);
				goto abort_unlock;
			}
		}
			goto done_unlock;

		default:;
	}

	/*
	 * Commands querying/configuring an existing array:
	 */
	/* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY,
	 * RUN_ARRAY, and SET_BITMAP_FILE are allowed */
	if (!mddev->raid_disks && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY
	    && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE) {
		err = -ENODEV;
		goto abort_unlock;
	}

	/*
	 * Commands even a read-only array can execute:
	 */
	switch (cmd)
	{
		case GET_ARRAY_INFO:
			err = get_array_info(mddev, argp);
			goto done_unlock;

		case GET_BITMAP_FILE:
			err = get_bitmap_file(mddev, argp);
			goto done_unlock;

		case GET_DISK_INFO:
			err = get_disk_info(mddev, argp);
			goto done_unlock;

		case RESTART_ARRAY_RW:
			err = restart_array(mddev);
			goto done_unlock;

		case STOP_ARRAY:
			err = do_md_stop(mddev, 0);
			goto done_unlock;

		case STOP_ARRAY_RO:
			err = do_md_stop(mddev, 1);
			goto done_unlock;

	/*
	 * We have a problem here : there is no easy way to give a CHS
	 * virtual geometry. We currently pretend that we have a 2 heads
	 * 4 sectors (with a BIG number of cylinders...). This drives
	 * dosfs just mad... ;-)
	 */
	}

	/*
	 * The remaining ioctls are changing the state of the
	 * superblock, so we do not allow them on read-only arrays.
	 * However non-MD ioctls (e.g. get-size) will still come through
	 * here and hit the 'default' below, so only disallow
	 * 'md' ioctls, and switch to rw mode if started auto-readonly.
	 */
	if (_IOC_TYPE(cmd) == MD_MAJOR &&
	    mddev->ro && mddev->pers) {
		if (mddev->ro == 2) {
			mddev->ro = 0;
			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
			md_wakeup_thread(mddev->thread);

		} else {
			err = -EROFS;
			goto abort_unlock;
		}
	}

	switch (cmd)
	{
		case ADD_NEW_DISK:
		{
			mdu_disk_info_t info;
			if (copy_from_user(&info, argp, sizeof(info)))
				err = -EFAULT;
			else
				err = add_new_disk(mddev, &info);
			goto done_unlock;
		}

		case HOT_REMOVE_DISK:
			err = hot_remove_disk(mddev, new_decode_dev(arg));
			goto done_unlock;

		case HOT_ADD_DISK:
			err = hot_add_disk(mddev, new_decode_dev(arg));
			goto done_unlock;

		case SET_DISK_FAULTY:
			err = set_disk_faulty(mddev, new_decode_dev(arg));
			goto done_unlock;

		case RUN_ARRAY:
			err = do_md_run(mddev);
			goto done_unlock;

		case SET_BITMAP_FILE:
			err = set_bitmap_file(mddev, (int)arg);
			goto done_unlock;

		default:
			err = -EINVAL;
			goto abort_unlock;
	}

done_unlock:
abort_unlock:
	mddev_unlock(mddev);

	return err;
done:
	if (err)
		MD_BUG();
abort:
	return err;
}
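
/*
 * Illustrative user-space sketch (not part of the driver): failing and then
 * hot-removing a member device through the ioctl interface above.  The array
 * path is an assumption for the example, and "member" is assumed to already
 * hold the member's device number in the encoding expected by
 * new_decode_dev(); real tools (e.g. mdadm) do the same with full error
 * handling.
 *
 *	unsigned long member = ...;              // encoded dev_t of the member
 *	int fd = open("/dev/md0", O_RDWR);
 *
 *	ioctl(fd, SET_DISK_FAULTY, member);      // -> set_disk_faulty() -> md_error()
 *	ioctl(fd, HOT_REMOVE_DISK, member);      // -> hot_remove_disk()
 *	close(fd);
 */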

static int md_open(struct inode *inode, struct file *file)
{
	/*
	 * Succeed if we can lock the mddev, which confirms that
	 * it isn't being stopped right now.
	 */
	mddev_t *mddev = inode->i_bdev->bd_disk->private_data;
	int err;

	if ((err = mddev_lock(mddev)))
		goto out;

	err = 0;
	mddev_get(mddev);
	mddev_unlock(mddev);

	check_disk_change(inode->i_bdev);
 out:
	return err;
}

static int md_release(struct inode *inode, struct file *file)
{
	mddev_t *mddev = inode->i_bdev->bd_disk->private_data;

	BUG_ON(!mddev);
	mddev_put(mddev);

	return 0;
}

static int md_media_changed(struct gendisk *disk)
{
	mddev_t *mddev = disk->private_data;

	return mddev->changed;
}

static int md_revalidate(struct gendisk *disk)
{
	mddev_t *mddev = disk->private_data;

	mddev->changed = 0;
	return 0;
}
static struct block_device_operations md_fops =
{
	.owner		= THIS_MODULE,
	.open		= md_open,
	.release	= md_release,
	.ioctl		= md_ioctl,
	.getgeo		= md_getgeo,
	.media_changed	= md_media_changed,
	.revalidate_disk= md_revalidate,
};

static int md_thread(void *arg)
{
	mdk_thread_t *thread = arg;

	/*
	 * md_thread is a 'system-thread', its priority should be very
	 * high. We avoid resource deadlocks individually in each
	 * raid personality. (RAID5 does preallocation) We also use RR and
	 * the very same RT priority as kswapd, thus we will never get
	 * into a priority inversion deadlock.
	 *
	 * we definitely have to have equal or higher priority than
	 * bdflush, otherwise bdflush will deadlock if there are too
	 * many dirty RAID5 blocks.
	 */

	allow_signal(SIGKILL);
	while (!kthread_should_stop()) {

		/* We need to wait INTERRUPTIBLE so that
		 * we don't add to the load-average.
		 * That means we need to be sure no signals are
		 * pending
		 */
		if (signal_pending(current))
			flush_signals(current);

		wait_event_interruptible_timeout
			(thread->wqueue,
			 test_bit(THREAD_WAKEUP, &thread->flags)
			 || kthread_should_stop(),
			 thread->timeout);
		try_to_freeze();

		clear_bit(THREAD_WAKEUP, &thread->flags);

		thread->run(thread->mddev);
	}

	return 0;
}

void md_wakeup_thread(mdk_thread_t *thread)
{
	if (thread) {
		dprintk("md: waking up MD thread %s.\n", thread->tsk->comm);
		set_bit(THREAD_WAKEUP, &thread->flags);
		wake_up(&thread->wqueue);
	}
}

mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
				 const char *name)
{
	mdk_thread_t *thread;

	thread = kzalloc(sizeof(mdk_thread_t), GFP_KERNEL);
	if (!thread)
		return NULL;

	init_waitqueue_head(&thread->wqueue);

	thread->run = run;
	thread->mddev = mddev;
	thread->timeout = MAX_SCHEDULE_TIMEOUT;
	thread->tsk = kthread_run(md_thread, thread, name, mdname(thread->mddev));
	if (IS_ERR(thread->tsk)) {
		kfree(thread);
		return NULL;
	}
	return thread;
}

void md_unregister_thread(mdk_thread_t *thread)
{
	dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid);

	kthread_stop(thread->tsk);
	kfree(thread);
}
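
/*
 * Minimal sketch of how a raid personality is expected to use the thread
 * helpers above: register a worker thread at start-up, poke it with
 * md_wakeup_thread() whenever there is work, and unregister it on shutdown.
 * The worker name "exampled" and the "%s_example" thread name are
 * hypothetical; only the helper calls themselves come from this file.
 *
 *	static void exampled(mddev_t *mddev)
 *	{
 *		// runs after every md_wakeup_thread() call, and at least
 *		// once every thread->timeout jiffies
 *	}
 *
 *	mddev->thread = md_register_thread(exampled, mddev, "%s_example");
 *	if (!mddev->thread)
 *		return -ENOMEM;
 *	...
 *	md_wakeup_thread(mddev->thread);     // new work is available
 *	...
 *	md_unregister_thread(mddev->thread); // when the array is stopped
 */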

void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
{
	if (!mddev) {
		MD_BUG();
		return;
	}

	if (!rdev || test_bit(Faulty, &rdev->flags))
		return;
/*
	dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
		mdname(mddev),
		MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev),
		__builtin_return_address(0),__builtin_return_address(1),
		__builtin_return_address(2),__builtin_return_address(3));
*/
	if (!mddev->pers)
		return;
	if (!mddev->pers->error_handler)
		return;
	mddev->pers->error_handler(mddev,rdev);
	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	md_wakeup_thread(mddev->thread);
	md_new_event_inintr(mddev);
}

/* seq_file implementation /proc/mdstat */

static void status_unused(struct seq_file *seq)
{
	int i = 0;
	mdk_rdev_t *rdev;
	struct list_head *tmp;

	seq_printf(seq, "unused devices: ");

	ITERATE_RDEV_PENDING(rdev,tmp) {
		char b[BDEVNAME_SIZE];
		i++;
		seq_printf(seq, "%s ",
			   bdevname(rdev->bdev,b));
	}
	if (!i)
		seq_printf(seq, "<none>");

	seq_printf(seq, "\n");
}


static void status_resync(struct seq_file *seq, mddev_t *mddev)
{
	sector_t max_blocks, resync, res;
	unsigned long dt, db, rt;
	int scale;
	unsigned int per_milli;

	resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2;

	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
		max_blocks = mddev->resync_max_sectors >> 1;
	else
		max_blocks = mddev->size;

	/*
	 * Should not happen.
	 */
	if (!max_blocks) {
		MD_BUG();
		return;
	}
	/* Pick 'scale' such that (resync>>scale)*1000 will fit
	 * in a sector_t, and (max_blocks>>scale) will fit in a
	 * u32, as those are the requirements for sector_div.
	 * Thus 'scale' must be at least 10
	 */
	scale = 10;
	if (sizeof(sector_t) > sizeof(unsigned long)) {
		while ( max_blocks/2 > (1ULL<<(scale+32)))
			scale++;
	}
	res = (resync>>scale)*1000;
	sector_div(res, (u32)((max_blocks>>scale)+1));

	per_milli = res;
	{
		int i, x = per_milli/50, y = 20-x;
		seq_printf(seq, "[");
		for (i = 0; i < x; i++)
			seq_printf(seq, "=");
		seq_printf(seq, ">");
		for (i = 0; i < y; i++)
			seq_printf(seq, ".");
		seq_printf(seq, "] ");
	}
	seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)",
		   (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)?
		    "reshape" :
		    (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)?
		     "check" :
		     (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
		      "resync" : "recovery"))),
		   per_milli/10, per_milli % 10,
		   (unsigned long long) resync,
		   (unsigned long long) max_blocks);

	/*
	 * We do not want to overflow, so the order of operands and
	 * the * 100 / 100 trick are important. We do a +1 to be
	 * safe against division by zero. We only estimate anyway.
	 *
	 * dt: time from mark until now
	 * db: blocks written from mark until now
	 * rt: remaining time
	 */
	dt = ((jiffies - mddev->resync_mark) / HZ);
	if (!dt) dt++;
	db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active))
		- mddev->resync_mark_cnt;
	rt = (dt * ((unsigned long)(max_blocks-resync) / (db/2/100+1)))/100;

	seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6);

	seq_printf(seq, " speed=%ldK/sec", db/2/dt);
}
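
/*
 * Worked example for the per-mille computation above (illustrative numbers):
 * with resync = 100000 1K-blocks done out of max_blocks = 1000000 and
 * scale = 10,
 *
 *	res = (100000 >> 10) * 1000          =  97 * 1000 = 97000
 *	res / ((1000000 >> 10) + 1)          =  97000 / 977 = 99
 *
 * so per_milli = 99 and the line is printed as " 9.9%"; the shift and the
 * "+1" guard against overflow and divide-by-zero at the cost of a slight
 * underestimate of the true 10.0%.
 */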

static void *md_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct list_head *tmp;
	loff_t l = *pos;
	mddev_t *mddev;

	if (l >= 0x10000)
		return NULL;
	if (!l--)
		/* header */
		return (void*)1;

	spin_lock(&all_mddevs_lock);
	list_for_each(tmp,&all_mddevs)
		if (!l--) {
			mddev = list_entry(tmp, mddev_t, all_mddevs);
			mddev_get(mddev);
			spin_unlock(&all_mddevs_lock);
			return mddev;
		}
	spin_unlock(&all_mddevs_lock);
	if (!l--)
		return (void*)2;/* tail */
	return NULL;
}

static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct list_head *tmp;
	mddev_t *next_mddev, *mddev = v;

	++*pos;
	if (v == (void*)2)
		return NULL;

	spin_lock(&all_mddevs_lock);
	if (v == (void*)1)
		tmp = all_mddevs.next;
	else
		tmp = mddev->all_mddevs.next;
	if (tmp != &all_mddevs)
		next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs));
	else {
		next_mddev = (void*)2;
		*pos = 0x10000;
	}
	spin_unlock(&all_mddevs_lock);

	if (v != (void*)1)
		mddev_put(mddev);
	return next_mddev;

}

static void md_seq_stop(struct seq_file *seq, void *v)
{
	mddev_t *mddev = v;

	if (mddev && v != (void*)1 && v != (void*)2)
		mddev_put(mddev);
}

struct mdstat_info {
	int event;
};

static int md_seq_show(struct seq_file *seq, void *v)
{
	mddev_t *mddev = v;
	sector_t size;
	struct list_head *tmp2;
	mdk_rdev_t *rdev;
	struct mdstat_info *mi = seq->private;
	struct bitmap *bitmap;

	if (v == (void*)1) {
		struct mdk_personality *pers;
		seq_printf(seq, "Personalities : ");
		spin_lock(&pers_lock);
		list_for_each_entry(pers, &pers_list, list)
			seq_printf(seq, "[%s] ", pers->name);

		spin_unlock(&pers_lock);
		seq_printf(seq, "\n");
		mi->event = atomic_read(&md_event_count);
		return 0;
	}
	if (v == (void*)2) {
		status_unused(seq);
		return 0;
	}

	if (mddev_lock(mddev) < 0)
		return -EINTR;

	if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
		seq_printf(seq, "%s : %sactive", mdname(mddev),
			   mddev->pers ? "" : "in");
		if (mddev->pers) {
			if (mddev->ro==1)
				seq_printf(seq, " (read-only)");
			if (mddev->ro==2)
				seq_printf(seq, "(auto-read-only)");
			seq_printf(seq, " %s", mddev->pers->name);
		}

		size = 0;
		ITERATE_RDEV(mddev,rdev,tmp2) {
			char b[BDEVNAME_SIZE];
			seq_printf(seq, " %s[%d]",
				   bdevname(rdev->bdev,b), rdev->desc_nr);
			if (test_bit(WriteMostly, &rdev->flags))
				seq_printf(seq, "(W)");
			if (test_bit(Faulty, &rdev->flags)) {
				seq_printf(seq, "(F)");
				continue;
			} else if (rdev->raid_disk < 0)
				seq_printf(seq, "(S)"); /* spare */
			size += rdev->size;
		}

		if (!list_empty(&mddev->disks)) {
			if (mddev->pers)
				seq_printf(seq, "\n %llu blocks",
					   (unsigned long long)mddev->array_size);
			else
				seq_printf(seq, "\n %llu blocks",
					   (unsigned long long)size);
		}
		if (mddev->persistent) {
			if (mddev->major_version != 0 ||
			    mddev->minor_version != 90) {
				seq_printf(seq," super %d.%d",
					   mddev->major_version,
					   mddev->minor_version);
			}
		} else
			seq_printf(seq, " super non-persistent");

		if (mddev->pers) {
			mddev->pers->status(seq, mddev);
			seq_printf(seq, "\n ");
			if (mddev->pers->sync_request) {
				if (mddev->curr_resync > 2) {
					status_resync(seq, mddev);
					seq_printf(seq, "\n ");
				} else if (mddev->curr_resync == 1 || mddev->curr_resync == 2)
					seq_printf(seq, "\tresync=DELAYED\n ");
				else if (mddev->recovery_cp < MaxSector)
					seq_printf(seq, "\tresync=PENDING\n ");
			}
		} else
			seq_printf(seq, "\n ");

		if ((bitmap = mddev->bitmap)) {
			unsigned long chunk_kb;
			unsigned long flags;
			spin_lock_irqsave(&bitmap->lock, flags);
			chunk_kb = bitmap->chunksize >> 10;
			seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], "
				   "%lu%s chunk",
				   bitmap->pages - bitmap->missing_pages,
				   bitmap->pages,
				   (bitmap->pages - bitmap->missing_pages)
				   << (PAGE_SHIFT - 10),
				   chunk_kb ? chunk_kb : bitmap->chunksize,
				   chunk_kb ? "KB" : "B");
			if (bitmap->file) {
				seq_printf(seq, ", file: ");
				seq_path(seq, bitmap->file->f_vfsmnt,
					 bitmap->file->f_dentry," \t\n");
			}

			seq_printf(seq, "\n");
			spin_unlock_irqrestore(&bitmap->lock, flags);
		}

		seq_printf(seq, "\n");
	}
	mddev_unlock(mddev);

	return 0;
}

static struct seq_operations md_seq_ops = {
	.start  = md_seq_start,
	.next   = md_seq_next,
	.stop   = md_seq_stop,
	.show   = md_seq_show,
};

static int md_seq_open(struct inode *inode, struct file *file)
{
	int error;
	struct mdstat_info *mi = kmalloc(sizeof(*mi), GFP_KERNEL);
	if (mi == NULL)
		return -ENOMEM;

	error = seq_open(file, &md_seq_ops);
	if (error)
		kfree(mi);
	else {
		struct seq_file *p = file->private_data;
		p->private = mi;
		mi->event = atomic_read(&md_event_count);
	}
	return error;
}

static int md_seq_release(struct inode *inode, struct file *file)
{
	struct seq_file *m = file->private_data;
	struct mdstat_info *mi = m->private;
	m->private = NULL;
	kfree(mi);
	return seq_release(inode, file);
}

static unsigned int mdstat_poll(struct file *filp, poll_table *wait)
{
	struct seq_file *m = filp->private_data;
	struct mdstat_info *mi = m->private;
	int mask;

	poll_wait(filp, &md_event_waiters, wait);

	/* always allow read */
	mask = POLLIN | POLLRDNORM;

	if (mi->event != atomic_read(&md_event_count))
		mask |= POLLERR | POLLPRI;
	return mask;
}

static struct file_operations md_seq_fops = {
	.open           = md_seq_open,
	.read           = seq_read,
	.llseek         = seq_lseek,
	.release        = md_seq_release,
	.poll           = mdstat_poll,
};

int register_md_personality(struct mdk_personality *p)
{
	spin_lock(&pers_lock);
	list_add_tail(&p->list, &pers_list);
	printk(KERN_INFO "md: %s personality registered for level %d\n", p->name, p->level);
	spin_unlock(&pers_lock);
	return 0;
}

int unregister_md_personality(struct mdk_personality *p)
{
	printk(KERN_INFO "md: %s personality unregistered\n", p->name);
	spin_lock(&pers_lock);
	list_del_init(&p->list);
	spin_unlock(&pers_lock);
	return 0;
}

static int is_mddev_idle(mddev_t *mddev)
{
	mdk_rdev_t *rdev;
	struct list_head *tmp;
	int idle;
	unsigned long curr_events;

	idle = 1;
	ITERATE_RDEV(mddev,rdev,tmp) {
		struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
		curr_events = disk_stat_read(disk, sectors[0]) +
				disk_stat_read(disk, sectors[1]) -
				atomic_read(&disk->sync_io);
		/* The difference between curr_events and last_events
		 * will be affected by any new non-sync IO (making
		 * curr_events bigger) and any difference in the amount of
		 * in-flight sync IO (making curr_events bigger or smaller).
		 * The amount in-flight is currently limited to
		 * 32*64K in raid1/10 and 256*PAGE_SIZE in raid5/6
		 * which is at most 4096 sectors.
		 * These numbers are fairly fragile and should be made
		 * more robust, probably by enforcing the
		 * 'window size' that md_do_sync sort-of uses.
		 *
		 * Note: the following is an unsigned comparison.
		 */
		if ((curr_events - rdev->last_events + 4096) > 8192) {
			rdev->last_events = curr_events;
			idle = 0;
		}
	}
	return idle;
}
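
/*
 * Worked example for the unsigned window test above (illustrative numbers),
 * with delta = curr_events - rdev->last_events computed in unsigned
 * arithmetic:
 *
 *	delta = +5000  ->  5000 + 4096 =  9096 > 8192      ->  not idle
 *	delta = +2000  ->  2000 + 4096 =  6096 <= 8192     ->  still idle
 *	delta = -6000  ->  wraps to a huge unsigned value  ->  not idle
 *
 * so both "lots of new non-sync IO" and "sync IO drained by much more than
 * the in-flight window" mark the array busy, while fluctuations inside the
 * +/-4096-sector window are ignored.
 */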

void md_done_sync(mddev_t *mddev, int blocks, int ok)
{
	/* another "blocks" (512byte) blocks have been synced */
	atomic_sub(blocks, &mddev->recovery_active);
	wake_up(&mddev->recovery_wait);
	if (!ok) {
		set_bit(MD_RECOVERY_ERR, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
		// stop recovery, signal do_sync ....
	}
}


/* md_write_start(mddev, bi)
 * If we need to update some array metadata (e.g. 'active' flag
 * in superblock) before writing, schedule a superblock update
 * and wait for it to complete.
 */
void md_write_start(mddev_t *mddev, struct bio *bi)
{
	if (bio_data_dir(bi) != WRITE)
		return;

	BUG_ON(mddev->ro == 1);
	if (mddev->ro == 2) {
		/* need to switch to read/write */
		mddev->ro = 0;
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
	}
	atomic_inc(&mddev->writes_pending);
	if (mddev->in_sync) {
		spin_lock_irq(&mddev->write_lock);
		if (mddev->in_sync) {
			mddev->in_sync = 0;
			set_bit(MD_CHANGE_CLEAN, &mddev->flags);
			md_wakeup_thread(mddev->thread);
		}
		spin_unlock_irq(&mddev->write_lock);
	}
	wait_event(mddev->sb_wait, mddev->flags==0);
}

void md_write_end(mddev_t *mddev)
{
	if (atomic_dec_and_test(&mddev->writes_pending)) {
		if (mddev->safemode == 2)
			md_wakeup_thread(mddev->thread);
		else if (mddev->safemode_delay)
			mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay);
	}
}
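
/*
 * Minimal sketch (hypothetical personality code, not from this file) of the
 * contract that md_write_start()/md_write_end() establish around every write
 * a personality handles: the names example_make_request and the completion
 * hook are assumptions for the example.
 *
 *	static int example_make_request(..., struct bio *bio)
 *	{
 *		md_write_start(mddev, bio);  // mark array active; may wait for
 *		                             // the superblock update to finish
 *		... queue the real IO ...
 *		return 0;
 *	}
 *
 *	// and once per started write, from the completion path:
 *	md_write_end(mddev);                 // last writer re-arms safemode
 */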

static DECLARE_WAIT_QUEUE_HEAD(resync_wait);

#define SYNC_MARKS	10
#define	SYNC_MARK_STEP	(3*HZ)
void md_do_sync(mddev_t *mddev)
{
	mddev_t *mddev2;
	unsigned int currspeed = 0,
		 window;
	sector_t max_sectors,j, io_sectors;
	unsigned long mark[SYNC_MARKS];
	sector_t mark_cnt[SYNC_MARKS];
	int last_mark,m;
	struct list_head *tmp;
	sector_t last_check;
	int skipped = 0;
	struct list_head *rtmp;
	mdk_rdev_t *rdev;
	char *desc;

	/* just in case thread restarts... */
	if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
		return;
	if (mddev->ro) /* never try to sync a read-only array */
		return;

	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
		if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
			desc = "data-check";
		else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
			desc = "requested-resync";
		else
			desc = "resync";
	} else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
		desc = "reshape";
	else
		desc = "recovery";

	/* we overload curr_resync somewhat here.
	 * 0 == not engaged in resync at all
	 * 2 == checking that there is no conflict with another sync
	 * 1 == like 2, but have yielded to allow conflicting resync to
	 *	commence
	 * other == active in resync - this many blocks
	 *
	 * Before starting a resync we must have set curr_resync to
	 * 2, and then checked that every "conflicting" array has curr_resync
	 * less than ours. When we find one that is the same or higher
	 * we wait on resync_wait. To avoid deadlock, we reduce curr_resync
	 * to 1 if we choose to yield (based arbitrarily on address of mddev structure).
	 * This will mean we have to start checking from the beginning again.
	 *
	 */

	do {
		mddev->curr_resync = 2;

	try_again:
		if (kthread_should_stop()) {
			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
			goto skip;
		}
		ITERATE_MDDEV(mddev2,tmp) {
			if (mddev2 == mddev)
				continue;
			if (mddev2->curr_resync &&
			    match_mddev_units(mddev,mddev2)) {
				DEFINE_WAIT(wq);
				if (mddev < mddev2 && mddev->curr_resync == 2) {
					/* arbitrarily yield */
					mddev->curr_resync = 1;
					wake_up(&resync_wait);
				}
				if (mddev > mddev2 && mddev->curr_resync == 1)
					/* no need to wait here, we can wait the next
					 * time 'round when curr_resync == 2
					 */
					continue;
				prepare_to_wait(&resync_wait, &wq, TASK_UNINTERRUPTIBLE);
				if (!kthread_should_stop() &&
				    mddev2->curr_resync >= mddev->curr_resync) {
					printk(KERN_INFO "md: delaying %s of %s"
					       " until %s has finished (they"
					       " share one or more physical units)\n",
					       desc, mdname(mddev), mdname(mddev2));
					mddev_put(mddev2);
					schedule();
					finish_wait(&resync_wait, &wq);
					goto try_again;
				}
				finish_wait(&resync_wait, &wq);
			}
		}
	} while (mddev->curr_resync < 2);

	j = 0;
	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
		/* resync follows the size requested by the personality,
		 * which defaults to physical size, but can be virtual size
		 */
		max_sectors = mddev->resync_max_sectors;
		mddev->resync_mismatches = 0;
		/* we don't use the checkpoint if there's a bitmap */
		if (!mddev->bitmap &&
		    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
			j = mddev->recovery_cp;
	} else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
		max_sectors = mddev->size << 1;
	else {
		/* recovery follows the physical size of devices */
		max_sectors = mddev->size << 1;
		j = MaxSector;
		ITERATE_RDEV(mddev,rdev,rtmp)
			if (rdev->raid_disk >= 0 &&
			    !test_bit(Faulty, &rdev->flags) &&
			    !test_bit(In_sync, &rdev->flags) &&
			    rdev->recovery_offset < j)
				j = rdev->recovery_offset;
	}

	printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev));
	printk(KERN_INFO "md: minimum _guaranteed_ speed:"
	       " %d KB/sec/disk.\n", speed_min(mddev));
	printk(KERN_INFO "md: using maximum available idle IO bandwidth "
	       "(but not more than %d KB/sec) for %s.\n",
	       speed_max(mddev), desc);

	is_mddev_idle(mddev); /* this also initializes IO event counters */

	io_sectors = 0;
	for (m = 0; m < SYNC_MARKS; m++) {
		mark[m] = jiffies;
		mark_cnt[m] = io_sectors;
	}
	last_mark = 0;
	mddev->resync_mark = mark[last_mark];
	mddev->resync_mark_cnt = mark_cnt[last_mark];

	/*
	 * Tune reconstruction:
	 */
	window = 32*(PAGE_SIZE/512);
	printk(KERN_INFO "md: using %dk window, over a total of %llu blocks.\n",
	       window/2,(unsigned long long) max_sectors/2);

	atomic_set(&mddev->recovery_active, 0);
	init_waitqueue_head(&mddev->recovery_wait);
	last_check = 0;

	if (j>2) {
		printk(KERN_INFO
		       "md: resuming %s of %s from checkpoint.\n",
		       desc, mdname(mddev));
		mddev->curr_resync = j;
	}

	while (j < max_sectors) {
		sector_t sectors;

		skipped = 0;
		sectors = mddev->pers->sync_request(mddev, j, &skipped,
						    currspeed < speed_min(mddev));
		if (sectors == 0) {
			set_bit(MD_RECOVERY_ERR, &mddev->recovery);
			goto out;
		}

		if (!skipped) { /* actual IO requested */
			io_sectors += sectors;
			atomic_add(sectors, &mddev->recovery_active);
		}

		j += sectors;
		if (j>1) mddev->curr_resync = j;
		mddev->curr_mark_cnt = io_sectors;
		if (last_check == 0)
			/* this is the earliest that rebuild will be
			 * visible in /proc/mdstat
			 */
			md_new_event(mddev);

		if (last_check + window > io_sectors || j == max_sectors)
			continue;

		last_check = io_sectors;

		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery) ||
		    test_bit(MD_RECOVERY_ERR, &mddev->recovery))
			break;

	repeat:
		if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) {
			/* step marks */
			int next = (last_mark+1) % SYNC_MARKS;

			mddev->resync_mark = mark[next];
			mddev->resync_mark_cnt = mark_cnt[next];
			mark[next] = jiffies;
			mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active);
			last_mark = next;
		}


		if (kthread_should_stop()) {
			/*
			 * got a signal, exit.
			 */
			printk(KERN_INFO
			       "md: md_do_sync() got signal ... exiting\n");
			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
			goto out;
		}

		/*
		 * this loop exits only when either we are slower than
		 * the 'hard' speed limit, or the system was IO-idle for
		 * a jiffy.
		 * the system might be non-idle CPU-wise, but we only care
		 * about not overloading the IO subsystem. (things like an
		 * e2fsck being done on the RAID array should execute fast)
		 */
		mddev->queue->unplug_fn(mddev->queue);
		cond_resched();

		currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2
			/((jiffies-mddev->resync_mark)/HZ +1) +1;

		if (currspeed > speed_min(mddev)) {
			if ((currspeed > speed_max(mddev)) ||
			    !is_mddev_idle(mddev)) {
				msleep(500);
				goto repeat;
			}
		}
	}
	printk(KERN_INFO "md: %s: %s done.\n",mdname(mddev), desc);
	/*
	 * this also signals 'finished resyncing' to md_stop
	 */
 out:
	mddev->queue->unplug_fn(mddev->queue);

	wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));

	/* tell personality that we are finished */
	mddev->pers->sync_request(mddev, max_sectors, &skipped, 1);

	if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
	    test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
	    !test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
	    mddev->curr_resync > 2) {
		if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
			if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
				if (mddev->curr_resync >= mddev->recovery_cp) {
					printk(KERN_INFO
					       "md: checkpointing %s of %s.\n",
					       desc, mdname(mddev));
					mddev->recovery_cp = mddev->curr_resync;
				}
			} else
				mddev->recovery_cp = MaxSector;
		} else {
			if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
				mddev->curr_resync = MaxSector;
			ITERATE_RDEV(mddev,rdev,rtmp)
				if (rdev->raid_disk >= 0 &&
				    !test_bit(Faulty, &rdev->flags) &&
				    !test_bit(In_sync, &rdev->flags) &&
				    rdev->recovery_offset < mddev->curr_resync)
					rdev->recovery_offset = mddev->curr_resync;
		}
	}

 skip:
	mddev->curr_resync = 0;
	wake_up(&resync_wait);
	set_bit(MD_RECOVERY_DONE, &mddev->recovery);
	md_wakeup_thread(mddev->thread);
}
EXPORT_SYMBOL_GPL(md_do_sync);

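
/*
 * Worked example of the throttle in md_do_sync() above (illustrative
 * numbers): if 120000 sectors have completed since the last resync mark,
 * taken 3 seconds ago, then
 *
 *	currspeed = (120000/2) / (3 + 1) + 1 = 15001 KB/sec
 *
 * While currspeed stays above speed_min(), the thread only keeps running at
 * full rate as long as it is below speed_max() and is_mddev_idle() sees no
 * competing IO; otherwise it sleeps 500ms and re-checks.
 */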
/*
 * This routine is regularly called by all per-raid-array threads to
 * deal with generic issues like resync and super-block update.
 * Raid personalities that don't have a thread (linear/raid0) do not
 * need this as they never do any recovery or update the superblock.
 *
 * It does not do any resync itself, but rather "forks" off other threads
 * to do that as needed.
 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
 * "->recovery" and create a thread at ->sync_thread.
 * When the thread finishes it sets MD_RECOVERY_DONE (and might set MD_RECOVERY_ERR)
 * and wakes up this thread which will reap the thread and finish up.
 * This thread also removes any faulty devices (with nr_pending == 0).
 *
 * The overall approach is:
 *  1/ if the superblock needs updating, update it.
 *  2/ If a recovery thread is running, don't do anything else.
 *  3/ If recovery has finished, clean up, possibly marking spares active.
 *  4/ If there are any faulty devices, remove them.
 *  5/ If array is degraded, try to add spare devices.
 *  6/ If array has spares or is not in-sync, start a resync thread.
 */
void md_check_recovery(mddev_t *mddev)
{
	mdk_rdev_t *rdev;
	struct list_head *rtmp;


	if (mddev->bitmap)
		bitmap_daemon_work(mddev->bitmap);

	if (mddev->ro)
		return;

	if (signal_pending(current)) {
		if (mddev->pers->sync_request) {
			printk(KERN_INFO "md: %s in immediate safe mode\n",
			       mdname(mddev));
			mddev->safemode = 2;
		}
		flush_signals(current);
	}

	if ( ! (
		mddev->flags ||
		test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
		test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
		(mddev->safemode == 1) ||
		(mddev->safemode == 2 && !atomic_read(&mddev->writes_pending)
		 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
		))
		return;

	if (mddev_trylock(mddev)) {
		int spares = 0;

		spin_lock_irq(&mddev->write_lock);
		if (mddev->safemode && !atomic_read(&mddev->writes_pending) &&
		    !mddev->in_sync && mddev->recovery_cp == MaxSector) {
			mddev->in_sync = 1;
			set_bit(MD_CHANGE_CLEAN, &mddev->flags);
		}
		if (mddev->safemode == 1)
			mddev->safemode = 0;
		spin_unlock_irq(&mddev->write_lock);

		if (mddev->flags)
			md_update_sb(mddev, 0);


		if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
		    !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
			/* resync/recovery still happening */
			clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
			goto unlock;
		}
		if (mddev->sync_thread) {
			/* resync has finished, collect result */
			md_unregister_thread(mddev->sync_thread);
			mddev->sync_thread = NULL;
			if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
			    !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
				/* success...*/
				/* activate any spares */
				mddev->pers->spare_active(mddev);
			}
			md_update_sb(mddev, 1);

			/* if array is no-longer degraded, then any saved_raid_disk
			 * information must be scrapped
			 */
			if (!mddev->degraded)
				ITERATE_RDEV(mddev,rdev,rtmp)
					rdev->saved_raid_disk = -1;

			mddev->recovery = 0;
			/* flag recovery needed just to double check */
			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
			md_new_event(mddev);
			goto unlock;
		}
		/* Clear some bits that don't mean anything, but
		 * might be left set
		 */
		clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		clear_bit(MD_RECOVERY_ERR, &mddev->recovery);
		clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
		clear_bit(MD_RECOVERY_DONE, &mddev->recovery);

		if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
			goto unlock;
		/* no recovery is running.
		 * remove any failed drives, then
		 * add spares if possible.
		 * Spares are also removed and re-added, to allow
		 * the personality to fail the re-add.
		 */
		ITERATE_RDEV(mddev,rdev,rtmp)
			if (rdev->raid_disk >= 0 &&
			    (test_bit(Faulty, &rdev->flags) || ! test_bit(In_sync, &rdev->flags)) &&
			    atomic_read(&rdev->nr_pending)==0) {
				if (mddev->pers->hot_remove_disk(mddev, rdev->raid_disk)==0) {
					char nm[20];
					sprintf(nm,"rd%d", rdev->raid_disk);
					sysfs_remove_link(&mddev->kobj, nm);
					rdev->raid_disk = -1;
				}
			}

		if (mddev->degraded) {
			ITERATE_RDEV(mddev,rdev,rtmp)
				if (rdev->raid_disk < 0
				    && !test_bit(Faulty, &rdev->flags)) {
					rdev->recovery_offset = 0;
					if (mddev->pers->hot_add_disk(mddev,rdev)) {
						char nm[20];
						sprintf(nm, "rd%d", rdev->raid_disk);
						sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
						spares++;
						md_new_event(mddev);
					} else
						break;
				}
		}

		if (spares) {
			clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
			clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
		} else if (mddev->recovery_cp < MaxSector) {
			set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
		} else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
			/* nothing to be done ... */
			goto unlock;

		if (mddev->pers->sync_request) {
			set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
			if (spares && mddev->bitmap && !mddev->bitmap->file) {
				/* We are adding a device or devices to an array
				 * which has the bitmap stored on all devices.
				 * So make sure all bitmap pages get written
				 */
				bitmap_write_all(mddev->bitmap);
			}
			mddev->sync_thread = md_register_thread(md_do_sync,
								mddev,
								"%s_resync");
			if (!mddev->sync_thread) {
				printk(KERN_ERR "%s: could not start resync"
					" thread...\n",
					mdname(mddev));
				/* leave the spares where they are, it shouldn't hurt */
				mddev->recovery = 0;
			} else
				md_wakeup_thread(mddev->sync_thread);
			md_new_event(mddev);
		}
	unlock:
		mddev_unlock(mddev);
	}
}

static int md_notify_reboot(struct notifier_block *this,
			    unsigned long code, void *x)
{
	struct list_head *tmp;
	mddev_t *mddev;

	if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) {

		printk(KERN_INFO "md: stopping all md devices.\n");

		ITERATE_MDDEV(mddev,tmp)
			if (mddev_trylock(mddev)) {
				do_md_stop(mddev, 1);
				mddev_unlock(mddev);
			}
		/*
		 * certain more exotic SCSI devices are known to be
		 * volatile wrt too early system reboots. While the
		 * right place to handle this issue is the given
		 * driver, we do want to have a safe RAID driver ...
		 */
		mdelay(1000*1);
	}
	return NOTIFY_DONE;
}

static struct notifier_block md_notifier = {
	.notifier_call	= md_notify_reboot,
	.next		= NULL,
	.priority	= INT_MAX, /* before any real devices */
};

static void md_geninit(void)
{
	struct proc_dir_entry *p;

	dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));

	p = create_proc_entry("mdstat", S_IRUGO, NULL);
	if (p)
		p->proc_fops = &md_seq_fops;
}

static int __init md_init(void)
{
	if (register_blkdev(MAJOR_NR, "md"))
		return -1;
	if ((mdp_major=register_blkdev(0, "mdp"))<=0) {
		unregister_blkdev(MAJOR_NR, "md");
		return -1;
	}
	blk_register_region(MKDEV(MAJOR_NR, 0), 1UL<<MINORBITS, THIS_MODULE,
			    md_probe, NULL, NULL);
	blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE,
			    md_probe, NULL, NULL);

	register_reboot_notifier(&md_notifier);
	raid_table_header = register_sysctl_table(raid_root_table, 1);

	md_geninit();
	return 0;
}


#ifndef MODULE

/*
 * Searches all registered partitions for autorun RAID arrays
 * at boot time.
 */
static dev_t detected_devices[128];
static int dev_cnt;

void md_autodetect_dev(dev_t dev)
{
	if (dev_cnt >= 0 && dev_cnt < 127)
		detected_devices[dev_cnt++] = dev;
}


static void autostart_arrays(int part)
{
	mdk_rdev_t *rdev;
	int i;

	printk(KERN_INFO "md: Autodetecting RAID arrays.\n");

	for (i = 0; i < dev_cnt; i++) {
		dev_t dev = detected_devices[i];

		rdev = md_import_device(dev,0, 0);
		if (IS_ERR(rdev))
			continue;

		if (test_bit(Faulty, &rdev->flags)) {
			MD_BUG();
			continue;
		}
		list_add(&rdev->same_set, &pending_raid_disks);
	}
	dev_cnt = 0;

	autorun_devices(part);
}

#endif

static __exit void md_exit(void)
{
	mddev_t *mddev;
	struct list_head *tmp;

	blk_unregister_region(MKDEV(MAJOR_NR,0), 1U << MINORBITS);
	blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS);

	unregister_blkdev(MAJOR_NR,"md");
	unregister_blkdev(mdp_major, "mdp");
	unregister_reboot_notifier(&md_notifier);
	unregister_sysctl_table(raid_table_header);
	remove_proc_entry("mdstat", NULL);
	ITERATE_MDDEV(mddev,tmp) {
		struct gendisk *disk = mddev->gendisk;
		if (!disk)
			continue;
		export_array(mddev);
		del_gendisk(disk);
		put_disk(disk);
		mddev->gendisk = NULL;
		mddev_put(mddev);
	}
}

module_init(md_init)
module_exit(md_exit)

static int get_ro(char *buffer, struct kernel_param *kp)
{
	return sprintf(buffer, "%d", start_readonly);
}
static int set_ro(const char *val, struct kernel_param *kp)
{
	char *e;
	int num = simple_strtoul(val, &e, 10);
	if (*val && (*e == '\0' || *e == '\n')) {
		start_readonly = num;
		return 0;
	}
	return -EINVAL;
}

module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);


EXPORT_SYMBOL(register_md_personality);
EXPORT_SYMBOL(unregister_md_personality);
EXPORT_SYMBOL(md_error);
EXPORT_SYMBOL(md_done_sync);
EXPORT_SYMBOL(md_write_start);
EXPORT_SYMBOL(md_write_end);
EXPORT_SYMBOL(md_register_thread);
EXPORT_SYMBOL(md_unregister_thread);
EXPORT_SYMBOL(md_wakeup_thread);
EXPORT_SYMBOL(md_check_recovery);
MODULE_LICENSE("GPL");
MODULE_ALIAS("md");
MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);
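
/*
 * Illustrative usage of the module parameters declared above (not part of
 * the driver).  The module name "md-mod" and the exact runtime sysfs path
 * are assumptions that depend on kernel configuration; the parameter names
 * themselves come from the module_param lines above.
 *
 *	# pass start_ro at load time:
 *	#   modprobe md-mod start_ro=1
 *
 *	# the writable parameters are also expected to appear at runtime
 *	# under /sys/module/md_mod/parameters/ (start_ro, start_dirty_degraded)
 */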