1/* 2 md.c : Multiple Devices driver for Linux 3 Copyright (C) 1998, 1999, 2000 Ingo Molnar 4 5 completely rewritten, based on the MD driver code from Marc Zyngier 6 7 Changes: 8 9 - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar 10 - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com> 11 - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net> 12 - kerneld support by Boris Tobotras <boris@xtalk.msk.su> 13 - kmod support by: Cyrus Durgin 14 - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com> 15 - Devfs support by Richard Gooch <rgooch@atnf.csiro.au> 16 17 - lots of fixes and improvements to the RAID1/RAID5 and generic 18 RAID code (such as request based resynchronization): 19 20 Neil Brown <neilb@cse.unsw.edu.au>. 21 22 - persistent bitmap code 23 Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc. 24 25 This program is free software; you can redistribute it and/or modify 26 it under the terms of the GNU General Public License as published by 27 the Free Software Foundation; either version 2, or (at your option) 28 any later version. 29 30 You should have received a copy of the GNU General Public License 31 (for example /usr/src/linux/COPYING); if not, write to the Free 32 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 33*/ 34 35#include <linux/module.h> 36#include <linux/kthread.h> 37#include <linux/linkage.h> 38#include <linux/raid/md.h> 39#include <linux/raid/bitmap.h> 40#include <linux/sysctl.h> 41#include <linux/buffer_head.h> /* for invalidate_bdev */ 42#include <linux/poll.h> 43#include <linux/mutex.h> 44#include <linux/ctype.h> 45#include <linux/freezer.h> 46 47#include <linux/init.h> 48 49#include <linux/file.h> 50 51#ifdef CONFIG_KMOD 52#include <linux/kmod.h> 53#endif 54 55#include <asm/unaligned.h> 56 57#define MAJOR_NR MD_MAJOR 58#define MD_DRIVER 59 60/* 63 partitions with the alternate major number (mdp) */ 61#define MdpMinorShift 6 62 63#define DEBUG 0 64#define dprintk(x...) ((void)(DEBUG && printk(x))) 65 66 67#ifndef MODULE 68static void autostart_arrays (int part); 69#endif 70 71static LIST_HEAD(pers_list); 72static DEFINE_SPINLOCK(pers_lock); 73 74static void md_print_devices(void); 75 76#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); } 77 78/* 79 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' 80 * is 1000 KB/sec, so the extra system load does not show up that much. 81 * Increase it if you want to have more _guaranteed_ speed. Note that 82 * the RAID driver will use the maximum available bandwidth if the IO 83 * subsystem is idle. There is also an 'absolute maximum' reconstruction 84 * speed limit - in case reconstruction slows down your system despite 85 * idle IO detection. 86 * 87 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max. 88 * or /sys/block/mdX/md/sync_speed_{min,max} 89 */ 90 91static int sysctl_speed_limit_min = 1000; 92static int sysctl_speed_limit_max = 200000; 93static inline int speed_min(mddev_t *mddev) 94{ 95 return mddev->sync_speed_min ? 96 mddev->sync_speed_min : sysctl_speed_limit_min; 97} 98 99static inline int speed_max(mddev_t *mddev) 100{ 101 return mddev->sync_speed_max ? 
102 mddev->sync_speed_max : sysctl_speed_limit_max; 103} 104 105static struct ctl_table_header *raid_table_header; 106 107static ctl_table raid_table[] = { 108 { 109 .ctl_name = DEV_RAID_SPEED_LIMIT_MIN, 110 .procname = "speed_limit_min", 111 .data = &sysctl_speed_limit_min, 112 .maxlen = sizeof(int), 113 .mode = S_IRUGO|S_IWUSR, 114 .proc_handler = &proc_dointvec, 115 }, 116 { 117 .ctl_name = DEV_RAID_SPEED_LIMIT_MAX, 118 .procname = "speed_limit_max", 119 .data = &sysctl_speed_limit_max, 120 .maxlen = sizeof(int), 121 .mode = S_IRUGO|S_IWUSR, 122 .proc_handler = &proc_dointvec, 123 }, 124 { .ctl_name = 0 } 125}; 126 127static ctl_table raid_dir_table[] = { 128 { 129 .ctl_name = DEV_RAID, 130 .procname = "raid", 131 .maxlen = 0, 132 .mode = S_IRUGO|S_IXUGO, 133 .child = raid_table, 134 }, 135 { .ctl_name = 0 } 136}; 137 138static ctl_table raid_root_table[] = { 139 { 140 .ctl_name = CTL_DEV, 141 .procname = "dev", 142 .maxlen = 0, 143 .mode = 0555, 144 .child = raid_dir_table, 145 }, 146 { .ctl_name = 0 } 147}; 148 149static struct block_device_operations md_fops; 150 151static int start_readonly; 152 153/* 154 * We have a system wide 'event count' that is incremented 155 * on any 'interesting' event, and readers of /proc/mdstat 156 * can use 'poll' or 'select' to find out when the event 157 * count increases. 158 * 159 * Events are: 160 * start array, stop array, error, add device, remove device, 161 * start build, activate spare 162 */ 163static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters); 164static atomic_t md_event_count; 165void md_new_event(mddev_t *mddev) 166{ 167 atomic_inc(&md_event_count); 168 wake_up(&md_event_waiters); 169 sysfs_notify(&mddev->kobj, NULL, "sync_action"); 170} 171EXPORT_SYMBOL_GPL(md_new_event); 172 173/* Alternate version that can be called from interrupts 174 * when calling sysfs_notify isn't needed. 175 */ 176static void md_new_event_inintr(mddev_t *mddev) 177{ 178 atomic_inc(&md_event_count); 179 wake_up(&md_event_waiters); 180} 181 182/* 183 * Enables to iterate over all existing md arrays 184 * all_mddevs_lock protects this list. 185 */ 186static LIST_HEAD(all_mddevs); 187static DEFINE_SPINLOCK(all_mddevs_lock); 188 189 190/* 191 * iterates through all used mddevs in the system. 192 * We take care to grab the all_mddevs_lock whenever navigating 193 * the list, and to always hold a refcount when unlocked. 194 * Any code which breaks out of this loop while own 195 * a reference to the current mddev and must mddev_put it. 
196 */ 197#define ITERATE_MDDEV(mddev,tmp) \ 198 \ 199 for (({ spin_lock(&all_mddevs_lock); \ 200 tmp = all_mddevs.next; \ 201 mddev = NULL;}); \ 202 ({ if (tmp != &all_mddevs) \ 203 mddev_get(list_entry(tmp, mddev_t, all_mddevs));\ 204 spin_unlock(&all_mddevs_lock); \ 205 if (mddev) mddev_put(mddev); \ 206 mddev = list_entry(tmp, mddev_t, all_mddevs); \ 207 tmp != &all_mddevs;}); \ 208 ({ spin_lock(&all_mddevs_lock); \ 209 tmp = tmp->next;}) \ 210 ) 211 212 213static int md_fail_request (request_queue_t *q, struct bio *bio) 214{ 215 bio_io_error(bio, bio->bi_size); 216 return 0; 217} 218 219static inline mddev_t *mddev_get(mddev_t *mddev) 220{ 221 atomic_inc(&mddev->active); 222 return mddev; 223} 224 225static void mddev_put(mddev_t *mddev) 226{ 227 if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) 228 return; 229 if (!mddev->raid_disks && list_empty(&mddev->disks)) { 230 list_del(&mddev->all_mddevs); 231 spin_unlock(&all_mddevs_lock); 232 blk_cleanup_queue(mddev->queue); 233 kobject_unregister(&mddev->kobj); 234 } else 235 spin_unlock(&all_mddevs_lock); 236} 237 238static mddev_t * mddev_find(dev_t unit) 239{ 240 mddev_t *mddev, *new = NULL; 241 242 retry: 243 spin_lock(&all_mddevs_lock); 244 list_for_each_entry(mddev, &all_mddevs, all_mddevs) 245 if (mddev->unit == unit) { 246 mddev_get(mddev); 247 spin_unlock(&all_mddevs_lock); 248 kfree(new); 249 return mddev; 250 } 251 252 if (new) { 253 list_add(&new->all_mddevs, &all_mddevs); 254 spin_unlock(&all_mddevs_lock); 255 return new; 256 } 257 spin_unlock(&all_mddevs_lock); 258 259 new = kzalloc(sizeof(*new), GFP_KERNEL); 260 if (!new) 261 return NULL; 262 263 new->unit = unit; 264 if (MAJOR(unit) == MD_MAJOR) 265 new->md_minor = MINOR(unit); 266 else 267 new->md_minor = MINOR(unit) >> MdpMinorShift; 268 269 mutex_init(&new->reconfig_mutex); 270 INIT_LIST_HEAD(&new->disks); 271 INIT_LIST_HEAD(&new->all_mddevs); 272 init_timer(&new->safemode_timer); 273 atomic_set(&new->active, 1); 274 spin_lock_init(&new->write_lock); 275 init_waitqueue_head(&new->sb_wait); 276 277 new->queue = blk_alloc_queue(GFP_KERNEL); 278 if (!new->queue) { 279 kfree(new); 280 return NULL; 281 } 282 set_bit(QUEUE_FLAG_CLUSTER, &new->queue->queue_flags); 283 284 blk_queue_make_request(new->queue, md_fail_request); 285 286 goto retry; 287} 288 289static inline int mddev_lock(mddev_t * mddev) 290{ 291 return mutex_lock_interruptible(&mddev->reconfig_mutex); 292} 293 294static inline int mddev_trylock(mddev_t * mddev) 295{ 296 return mutex_trylock(&mddev->reconfig_mutex); 297} 298 299static inline void mddev_unlock(mddev_t * mddev) 300{ 301 mutex_unlock(&mddev->reconfig_mutex); 302 303 md_wakeup_thread(mddev->thread); 304} 305 306static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr) 307{ 308 mdk_rdev_t * rdev; 309 struct list_head *tmp; 310 311 ITERATE_RDEV(mddev,rdev,tmp) { 312 if (rdev->desc_nr == nr) 313 return rdev; 314 } 315 return NULL; 316} 317 318static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev) 319{ 320 struct list_head *tmp; 321 mdk_rdev_t *rdev; 322 323 ITERATE_RDEV(mddev,rdev,tmp) { 324 if (rdev->bdev->bd_dev == dev) 325 return rdev; 326 } 327 return NULL; 328} 329 330static struct mdk_personality *find_pers(int level, char *clevel) 331{ 332 struct mdk_personality *pers; 333 list_for_each_entry(pers, &pers_list, list) { 334 if (level != LEVEL_NONE && pers->level == level) 335 return pers; 336 if (strcmp(pers->name, clevel)==0) 337 return pers; 338 } 339 return NULL; 340} 341 342static inline sector_t calc_dev_sboffset(struct block_device 
*bdev) 343{ 344 sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 345 return MD_NEW_SIZE_BLOCKS(size); 346} 347 348static sector_t calc_dev_size(mdk_rdev_t *rdev, unsigned chunk_size) 349{ 350 sector_t size; 351 352 size = rdev->sb_offset; 353 354 if (chunk_size) 355 size &= ~((sector_t)chunk_size/1024 - 1); 356 return size; 357} 358 359static int alloc_disk_sb(mdk_rdev_t * rdev) 360{ 361 if (rdev->sb_page) 362 MD_BUG(); 363 364 rdev->sb_page = alloc_page(GFP_KERNEL); 365 if (!rdev->sb_page) { 366 printk(KERN_ALERT "md: out of memory.\n"); 367 return -EINVAL; 368 } 369 370 return 0; 371} 372 373static void free_disk_sb(mdk_rdev_t * rdev) 374{ 375 if (rdev->sb_page) { 376 put_page(rdev->sb_page); 377 rdev->sb_loaded = 0; 378 rdev->sb_page = NULL; 379 rdev->sb_offset = 0; 380 rdev->size = 0; 381 } 382} 383 384 385static int super_written(struct bio *bio, unsigned int bytes_done, int error) 386{ 387 mdk_rdev_t *rdev = bio->bi_private; 388 mddev_t *mddev = rdev->mddev; 389 if (bio->bi_size) 390 return 1; 391 392 if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) { 393 printk("md: super_written gets error=%d, uptodate=%d\n", 394 error, test_bit(BIO_UPTODATE, &bio->bi_flags)); 395 WARN_ON(test_bit(BIO_UPTODATE, &bio->bi_flags)); 396 md_error(mddev, rdev); 397 } 398 399 if (atomic_dec_and_test(&mddev->pending_writes)) 400 wake_up(&mddev->sb_wait); 401 bio_put(bio); 402 return 0; 403} 404 405static int super_written_barrier(struct bio *bio, unsigned int bytes_done, int error) 406{ 407 struct bio *bio2 = bio->bi_private; 408 mdk_rdev_t *rdev = bio2->bi_private; 409 mddev_t *mddev = rdev->mddev; 410 if (bio->bi_size) 411 return 1; 412 413 if (!test_bit(BIO_UPTODATE, &bio->bi_flags) && 414 error == -EOPNOTSUPP) { 415 unsigned long flags; 416 /* barriers don't appear to be supported :-( */ 417 set_bit(BarriersNotsupp, &rdev->flags); 418 mddev->barriers_work = 0; 419 spin_lock_irqsave(&mddev->write_lock, flags); 420 bio2->bi_next = mddev->biolist; 421 mddev->biolist = bio2; 422 spin_unlock_irqrestore(&mddev->write_lock, flags); 423 wake_up(&mddev->sb_wait); 424 bio_put(bio); 425 return 0; 426 } 427 bio_put(bio2); 428 bio->bi_private = rdev; 429 return super_written(bio, bytes_done, error); 430} 431 432void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, 433 sector_t sector, int size, struct page *page) 434{ 435 /* write first size bytes of page to sector of rdev 436 * Increment mddev->pending_writes before returning 437 * and decrement it on completion, waking up sb_wait 438 * if zero is reached. 439 * If an error occurred, call md_error 440 * 441 * As we might need to resubmit the request if BIO_RW_BARRIER 442 * causes ENOTSUPP, we allocate a spare bio... 443 */ 444 struct bio *bio = bio_alloc(GFP_NOIO, 1); 445 int rw = (1<<BIO_RW) | (1<<BIO_RW_SYNC); 446 447 bio->bi_bdev = rdev->bdev; 448 bio->bi_sector = sector; 449 bio_add_page(bio, page, size, 0); 450 bio->bi_private = rdev; 451 bio->bi_end_io = super_written; 452 bio->bi_rw = rw; 453 454 atomic_inc(&mddev->pending_writes); 455 if (!test_bit(BarriersNotsupp, &rdev->flags)) { 456 struct bio *rbio; 457 rw |= (1<<BIO_RW_BARRIER); 458 rbio = bio_clone(bio, GFP_NOIO); 459 rbio->bi_private = bio; 460 rbio->bi_end_io = super_written_barrier; 461 submit_bio(rw, rbio); 462 } else 463 submit_bio(rw, bio); 464} 465 466void md_super_wait(mddev_t *mddev) 467{ 468 /* wait for all superblock writes that were scheduled to complete. 
469 * if any had to be retried (due to BARRIER problems), retry them 470 */ 471 DEFINE_WAIT(wq); 472 for(;;) { 473 prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE); 474 if (atomic_read(&mddev->pending_writes)==0) 475 break; 476 while (mddev->biolist) { 477 struct bio *bio; 478 spin_lock_irq(&mddev->write_lock); 479 bio = mddev->biolist; 480 mddev->biolist = bio->bi_next ; 481 bio->bi_next = NULL; 482 spin_unlock_irq(&mddev->write_lock); 483 submit_bio(bio->bi_rw, bio); 484 } 485 schedule(); 486 } 487 finish_wait(&mddev->sb_wait, &wq); 488} 489 490static int bi_complete(struct bio *bio, unsigned int bytes_done, int error) 491{ 492 if (bio->bi_size) 493 return 1; 494 495 complete((struct completion*)bio->bi_private); 496 return 0; 497} 498 499int sync_page_io(struct block_device *bdev, sector_t sector, int size, 500 struct page *page, int rw) 501{ 502 struct bio *bio = bio_alloc(GFP_NOIO, 1); 503 struct completion event; 504 int ret; 505 506 rw |= (1 << BIO_RW_SYNC); 507 508 bio->bi_bdev = bdev; 509 bio->bi_sector = sector; 510 bio_add_page(bio, page, size, 0); 511 init_completion(&event); 512 bio->bi_private = &event; 513 bio->bi_end_io = bi_complete; 514 submit_bio(rw, bio); 515 wait_for_completion(&event); 516 517 ret = test_bit(BIO_UPTODATE, &bio->bi_flags); 518 bio_put(bio); 519 return ret; 520} 521EXPORT_SYMBOL_GPL(sync_page_io); 522 523static int read_disk_sb(mdk_rdev_t * rdev, int size) 524{ 525 char b[BDEVNAME_SIZE]; 526 if (!rdev->sb_page) { 527 MD_BUG(); 528 return -EINVAL; 529 } 530 if (rdev->sb_loaded) 531 return 0; 532 533 534 if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, size, rdev->sb_page, READ)) 535 goto fail; 536 rdev->sb_loaded = 1; 537 return 0; 538 539fail: 540 printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n", 541 bdevname(rdev->bdev,b)); 542 return -EINVAL; 543} 544 545static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2) 546{ 547 if ( (sb1->set_uuid0 == sb2->set_uuid0) && 548 (sb1->set_uuid1 == sb2->set_uuid1) && 549 (sb1->set_uuid2 == sb2->set_uuid2) && 550 (sb1->set_uuid3 == sb2->set_uuid3)) 551 552 return 1; 553 554 return 0; 555} 556 557 558static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) 559{ 560 int ret; 561 mdp_super_t *tmp1, *tmp2; 562 563 tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL); 564 tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL); 565 566 if (!tmp1 || !tmp2) { 567 ret = 0; 568 printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n"); 569 goto abort; 570 } 571 572 *tmp1 = *sb1; 573 *tmp2 = *sb2; 574 575 /* 576 * nr_disks is not constant 577 */ 578 tmp1->nr_disks = 0; 579 tmp2->nr_disks = 0; 580 581 if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4)) 582 ret = 0; 583 else 584 ret = 1; 585 586abort: 587 kfree(tmp1); 588 kfree(tmp2); 589 return ret; 590} 591 592static unsigned int calc_sb_csum(mdp_super_t * sb) 593{ 594 unsigned int disk_csum, csum; 595 596 disk_csum = sb->sb_csum; 597 sb->sb_csum = 0; 598 csum = csum_partial((void *)sb, MD_SB_BYTES, 0); 599 sb->sb_csum = disk_csum; 600 return csum; 601} 602 603 604/* 605 * Handle superblock details. 606 * We want to be able to handle multiple superblock formats 607 * so we have a common interface to them all, and an array of 608 * different handlers. 609 * We rely on user-space to write the initial superblock, and support 610 * reading and updating of superblocks. 611 * Interface methods are: 612 * int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version) 613 * loads and validates a superblock on dev. 
614 * if refdev != NULL, compare superblocks on both devices 615 * Return: 616 * 0 - dev has a superblock that is compatible with refdev 617 * 1 - dev has a superblock that is compatible and newer than refdev 618 * so dev should be used as the refdev in future 619 * -EINVAL superblock incompatible or invalid 620 * -othererror e.g. -EIO 621 * 622 * int validate_super(mddev_t *mddev, mdk_rdev_t *dev) 623 * Verify that dev is acceptable into mddev. 624 * The first time, mddev->raid_disks will be 0, and data from 625 * dev should be merged in. Subsequent calls check that dev 626 * is new enough. Return 0 or -EINVAL 627 * 628 * void sync_super(mddev_t *mddev, mdk_rdev_t *dev) 629 * Update the superblock for rdev with data in mddev 630 * This does not write to disc. 631 * 632 */ 633 634struct super_type { 635 char *name; 636 struct module *owner; 637 int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version); 638 int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev); 639 void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev); 640}; 641 642/* 643 * load_super for 0.90.0 644 */ 645static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) 646{ 647 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 648 mdp_super_t *sb; 649 int ret; 650 sector_t sb_offset; 651 652 /* 653 * Calculate the position of the superblock, 654 * it's at the end of the disk. 655 * 656 * It also happens to be a multiple of 4Kb. 657 */ 658 sb_offset = calc_dev_sboffset(rdev->bdev); 659 rdev->sb_offset = sb_offset; 660 661 ret = read_disk_sb(rdev, MD_SB_BYTES); 662 if (ret) return ret; 663 664 ret = -EINVAL; 665 666 bdevname(rdev->bdev, b); 667 sb = (mdp_super_t*)page_address(rdev->sb_page); 668 669 if (sb->md_magic != MD_SB_MAGIC) { 670 printk(KERN_ERR "md: invalid raid superblock magic on %s\n", 671 b); 672 goto abort; 673 } 674 675 if (sb->major_version != 0 || 676 sb->minor_version < 90 || 677 sb->minor_version > 91) { 678 printk(KERN_WARNING "Bad version number %d.%d on %s\n", 679 sb->major_version, sb->minor_version, 680 b); 681 goto abort; 682 } 683 684 if (sb->raid_disks <= 0) 685 goto abort; 686 687 if (csum_fold(calc_sb_csum(sb)) != csum_fold(sb->sb_csum)) { 688 printk(KERN_WARNING "md: invalid superblock checksum on %s\n", 689 b); 690 goto abort; 691 } 692 693 rdev->preferred_minor = sb->md_minor; 694 rdev->data_offset = 0; 695 rdev->sb_size = MD_SB_BYTES; 696 697 if (sb->level == LEVEL_MULTIPATH) 698 rdev->desc_nr = -1; 699 else 700 rdev->desc_nr = sb->this_disk.number; 701 702 if (refdev == 0) 703 ret = 1; 704 else { 705 __u64 ev1, ev2; 706 mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page); 707 if (!uuid_equal(refsb, sb)) { 708 printk(KERN_WARNING "md: %s has different UUID to %s\n", 709 b, bdevname(refdev->bdev,b2)); 710 goto abort; 711 } 712 if (!sb_equal(refsb, sb)) { 713 printk(KERN_WARNING "md: %s has same UUID" 714 " but different superblock to %s\n", 715 b, bdevname(refdev->bdev, b2)); 716 goto abort; 717 } 718 ev1 = md_event(sb); 719 ev2 = md_event(refsb); 720 if (ev1 > ev2) 721 ret = 1; 722 else 723 ret = 0; 724 } 725 rdev->size = calc_dev_size(rdev, sb->chunk_size); 726 727 if (rdev->size < sb->size && sb->level > 1) 728 /* "this cannot possibly happen" ... 
*/ 729 ret = -EINVAL; 730 731 abort: 732 return ret; 733} 734 735/* 736 * validate_super for 0.90.0 737 */ 738static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) 739{ 740 mdp_disk_t *desc; 741 mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page); 742 __u64 ev1 = md_event(sb); 743 744 rdev->raid_disk = -1; 745 rdev->flags = 0; 746 if (mddev->raid_disks == 0) { 747 mddev->major_version = 0; 748 mddev->minor_version = sb->minor_version; 749 mddev->patch_version = sb->patch_version; 750 mddev->persistent = ! sb->not_persistent; 751 mddev->chunk_size = sb->chunk_size; 752 mddev->ctime = sb->ctime; 753 mddev->utime = sb->utime; 754 mddev->level = sb->level; 755 mddev->clevel[0] = 0; 756 mddev->layout = sb->layout; 757 mddev->raid_disks = sb->raid_disks; 758 mddev->size = sb->size; 759 mddev->events = ev1; 760 mddev->bitmap_offset = 0; 761 mddev->default_bitmap_offset = MD_SB_BYTES >> 9; 762 763 if (mddev->minor_version >= 91) { 764 mddev->reshape_position = sb->reshape_position; 765 mddev->delta_disks = sb->delta_disks; 766 mddev->new_level = sb->new_level; 767 mddev->new_layout = sb->new_layout; 768 mddev->new_chunk = sb->new_chunk; 769 } else { 770 mddev->reshape_position = MaxSector; 771 mddev->delta_disks = 0; 772 mddev->new_level = mddev->level; 773 mddev->new_layout = mddev->layout; 774 mddev->new_chunk = mddev->chunk_size; 775 } 776 777 if (sb->state & (1<<MD_SB_CLEAN)) 778 mddev->recovery_cp = MaxSector; 779 else { 780 if (sb->events_hi == sb->cp_events_hi && 781 sb->events_lo == sb->cp_events_lo) { 782 mddev->recovery_cp = sb->recovery_cp; 783 } else 784 mddev->recovery_cp = 0; 785 } 786 787 memcpy(mddev->uuid+0, &sb->set_uuid0, 4); 788 memcpy(mddev->uuid+4, &sb->set_uuid1, 4); 789 memcpy(mddev->uuid+8, &sb->set_uuid2, 4); 790 memcpy(mddev->uuid+12,&sb->set_uuid3, 4); 791 792 mddev->max_disks = MD_SB_DISKS; 793 794 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && 795 mddev->bitmap_file == NULL) { 796 if (mddev->level != 1 && mddev->level != 4 797 && mddev->level != 5 && mddev->level != 6 798 && mddev->level != 10) { 799 /* FIXME use a better test */ 800 printk(KERN_WARNING "md: bitmaps not supported for this level.\n"); 801 return -EINVAL; 802 } 803 mddev->bitmap_offset = mddev->default_bitmap_offset; 804 } 805 806 } else if (mddev->pers == NULL) { 807 /* Insist on good event counter while assembling */ 808 ++ev1; 809 if (ev1 < mddev->events) 810 return -EINVAL; 811 } else if (mddev->bitmap) { 812 /* if adding to array with a bitmap, then we can accept an 813 * older device ... but not too old. 814 */ 815 if (ev1 < mddev->bitmap->events_cleared) 816 return 0; 817 } else { 818 if (ev1 < mddev->events) 819 /* just a hot-add of a new device, leave raid_disk at -1 */ 820 return 0; 821 } 822 823 if (mddev->level != LEVEL_MULTIPATH) { 824 desc = sb->disks + rdev->desc_nr; 825 826 if (desc->state & (1<<MD_DISK_FAULTY)) 827 set_bit(Faulty, &rdev->flags); 828 else if (desc->state & (1<<MD_DISK_SYNC) /* && 829 desc->raid_disk < mddev->raid_disks */) { 830 set_bit(In_sync, &rdev->flags); 831 rdev->raid_disk = desc->raid_disk; 832 } 833 if (desc->state & (1<<MD_DISK_WRITEMOSTLY)) 834 set_bit(WriteMostly, &rdev->flags); 835 } else /* MULTIPATH are always insync */ 836 set_bit(In_sync, &rdev->flags); 837 return 0; 838} 839 840/* 841 * sync_super for 0.90.0 842 */ 843static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) 844{ 845 mdp_super_t *sb; 846 struct list_head *tmp; 847 mdk_rdev_t *rdev2; 848 int next_spare = mddev->raid_disks; 849 850 851 /* make rdev->sb match mddev data.. 
852 * 853 * 1/ zero out disks 854 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare); 855 * 3/ any empty disks < next_spare become removed 856 * 857 * disks[0] gets initialised to REMOVED because 858 * we cannot be sure from other fields if it has 859 * been initialised or not. 860 */ 861 int i; 862 int active=0, working=0,failed=0,spare=0,nr_disks=0; 863 864 rdev->sb_size = MD_SB_BYTES; 865 866 sb = (mdp_super_t*)page_address(rdev->sb_page); 867 868 memset(sb, 0, sizeof(*sb)); 869 870 sb->md_magic = MD_SB_MAGIC; 871 sb->major_version = mddev->major_version; 872 sb->patch_version = mddev->patch_version; 873 sb->gvalid_words = 0; /* ignored */ 874 memcpy(&sb->set_uuid0, mddev->uuid+0, 4); 875 memcpy(&sb->set_uuid1, mddev->uuid+4, 4); 876 memcpy(&sb->set_uuid2, mddev->uuid+8, 4); 877 memcpy(&sb->set_uuid3, mddev->uuid+12,4); 878 879 sb->ctime = mddev->ctime; 880 sb->level = mddev->level; 881 sb->size = mddev->size; 882 sb->raid_disks = mddev->raid_disks; 883 sb->md_minor = mddev->md_minor; 884 sb->not_persistent = !mddev->persistent; 885 sb->utime = mddev->utime; 886 sb->state = 0; 887 sb->events_hi = (mddev->events>>32); 888 sb->events_lo = (u32)mddev->events; 889 890 if (mddev->reshape_position == MaxSector) 891 sb->minor_version = 90; 892 else { 893 sb->minor_version = 91; 894 sb->reshape_position = mddev->reshape_position; 895 sb->new_level = mddev->new_level; 896 sb->delta_disks = mddev->delta_disks; 897 sb->new_layout = mddev->new_layout; 898 sb->new_chunk = mddev->new_chunk; 899 } 900 mddev->minor_version = sb->minor_version; 901 if (mddev->in_sync) 902 { 903 sb->recovery_cp = mddev->recovery_cp; 904 sb->cp_events_hi = (mddev->events>>32); 905 sb->cp_events_lo = (u32)mddev->events; 906 if (mddev->recovery_cp == MaxSector) 907 sb->state = (1<< MD_SB_CLEAN); 908 } else 909 sb->recovery_cp = 0; 910 911 sb->layout = mddev->layout; 912 sb->chunk_size = mddev->chunk_size; 913 914 if (mddev->bitmap && mddev->bitmap_file == NULL) 915 sb->state |= (1<<MD_SB_BITMAP_PRESENT); 916 917 sb->disks[0].state = (1<<MD_DISK_REMOVED); 918 ITERATE_RDEV(mddev,rdev2,tmp) { 919 mdp_disk_t *d; 920 int desc_nr; 921 if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags) 922 && !test_bit(Faulty, &rdev2->flags)) 923 desc_nr = rdev2->raid_disk; 924 else 925 desc_nr = next_spare++; 926 rdev2->desc_nr = desc_nr; 927 d = &sb->disks[rdev2->desc_nr]; 928 nr_disks++; 929 d->number = rdev2->desc_nr; 930 d->major = MAJOR(rdev2->bdev->bd_dev); 931 d->minor = MINOR(rdev2->bdev->bd_dev); 932 if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags) 933 && !test_bit(Faulty, &rdev2->flags)) 934 d->raid_disk = rdev2->raid_disk; 935 else 936 d->raid_disk = rdev2->desc_nr; /* compatibility */ 937 if (test_bit(Faulty, &rdev2->flags)) 938 d->state = (1<<MD_DISK_FAULTY); 939 else if (test_bit(In_sync, &rdev2->flags)) { 940 d->state = (1<<MD_DISK_ACTIVE); 941 d->state |= (1<<MD_DISK_SYNC); 942 active++; 943 working++; 944 } else { 945 d->state = 0; 946 spare++; 947 working++; 948 } 949 if (test_bit(WriteMostly, &rdev2->flags)) 950 d->state |= (1<<MD_DISK_WRITEMOSTLY); 951 } 952 /* now set the "removed" and "faulty" bits on any missing devices */ 953 for (i=0 ; i < mddev->raid_disks ; i++) { 954 mdp_disk_t *d = &sb->disks[i]; 955 if (d->state == 0 && d->number == 0) { 956 d->number = i; 957 d->raid_disk = i; 958 d->state = (1<<MD_DISK_REMOVED); 959 d->state |= (1<<MD_DISK_FAULTY); 960 failed++; 961 } 962 } 963 sb->nr_disks = nr_disks; 964 sb->active_disks = active; 965 sb->working_disks = working; 
966 sb->failed_disks = failed; 967 sb->spare_disks = spare; 968 969 sb->this_disk = sb->disks[rdev->desc_nr]; 970 sb->sb_csum = calc_sb_csum(sb); 971} 972 973/* 974 * version 1 superblock 975 */ 976 977static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb) 978{ 979 __le32 disk_csum; 980 u32 csum; 981 unsigned long long newcsum; 982 int size = 256 + le32_to_cpu(sb->max_dev)*2; 983 __le32 *isuper = (__le32*)sb; 984 int i; 985 986 disk_csum = sb->sb_csum; 987 sb->sb_csum = 0; 988 newcsum = 0; 989 for (i=0; size>=4; size -= 4 ) 990 newcsum += le32_to_cpu(*isuper++); 991 992 if (size == 2) 993 newcsum += le16_to_cpu(*(__le16*) isuper); 994 995 csum = (newcsum & 0xffffffff) + (newcsum >> 32); 996 sb->sb_csum = disk_csum; 997 return cpu_to_le32(csum); 998} 999 1000static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) 1001{ 1002 struct mdp_superblock_1 *sb; 1003 int ret; 1004 sector_t sb_offset; 1005 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 1006 int bmask; 1007 1008 /* 1009 * Calculate the position of the superblock. 1010 * It is always aligned to a 4K boundary and 1011 * depeding on minor_version, it can be: 1012 * 0: At least 8K, but less than 12K, from end of device 1013 * 1: At start of device 1014 * 2: 4K from start of device. 1015 */ 1016 switch(minor_version) { 1017 case 0: 1018 sb_offset = rdev->bdev->bd_inode->i_size >> 9; 1019 sb_offset -= 8*2; 1020 sb_offset &= ~(sector_t)(4*2-1); 1021 /* convert from sectors to K */ 1022 sb_offset /= 2; 1023 break; 1024 case 1: 1025 sb_offset = 0; 1026 break; 1027 case 2: 1028 sb_offset = 4; 1029 break; 1030 default: 1031 return -EINVAL; 1032 } 1033 rdev->sb_offset = sb_offset; 1034 1035 /* superblock is rarely larger than 1K, but it can be larger, 1036 * and it is safe to read 4k, so we do that 1037 */ 1038 ret = read_disk_sb(rdev, 4096); 1039 if (ret) return ret; 1040 1041 1042 sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 1043 1044 if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || 1045 sb->major_version != cpu_to_le32(1) || 1046 le32_to_cpu(sb->max_dev) > (4096-256)/2 || 1047 le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) || 1048 (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0) 1049 return -EINVAL; 1050 1051 if (calc_sb_1_csum(sb) != sb->sb_csum) { 1052 printk("md: invalid superblock checksum on %s\n", 1053 bdevname(rdev->bdev,b)); 1054 return -EINVAL; 1055 } 1056 if (le64_to_cpu(sb->data_size) < 10) { 1057 printk("md: data_size too small on %s\n", 1058 bdevname(rdev->bdev,b)); 1059 return -EINVAL; 1060 } 1061 rdev->preferred_minor = 0xffff; 1062 rdev->data_offset = le64_to_cpu(sb->data_offset); 1063 atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read)); 1064 1065 rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; 1066 bmask = queue_hardsect_size(rdev->bdev->bd_disk->queue)-1; 1067 if (rdev->sb_size & bmask) 1068 rdev-> sb_size = (rdev->sb_size | bmask)+1; 1069 1070 if (sb->level == cpu_to_le32(LEVEL_MULTIPATH)) 1071 rdev->desc_nr = -1; 1072 else 1073 rdev->desc_nr = le32_to_cpu(sb->dev_number); 1074 1075 if (refdev == 0) 1076 ret = 1; 1077 else { 1078 __u64 ev1, ev2; 1079 struct mdp_superblock_1 *refsb = 1080 (struct mdp_superblock_1*)page_address(refdev->sb_page); 1081 1082 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 || 1083 sb->level != refsb->level || 1084 sb->layout != refsb->layout || 1085 sb->chunksize != refsb->chunksize) { 1086 printk(KERN_WARNING "md: %s has strangely different" 1087 " superblock to %s\n", 1088 bdevname(rdev->bdev,b), 1089 
bdevname(refdev->bdev,b2)); 1090 return -EINVAL; 1091 } 1092 ev1 = le64_to_cpu(sb->events); 1093 ev2 = le64_to_cpu(refsb->events); 1094 1095 if (ev1 > ev2) 1096 ret = 1; 1097 else 1098 ret = 0; 1099 } 1100 if (minor_version) 1101 rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2; 1102 else 1103 rdev->size = rdev->sb_offset; 1104 if (rdev->size < le64_to_cpu(sb->data_size)/2) 1105 return -EINVAL; 1106 rdev->size = le64_to_cpu(sb->data_size)/2; 1107 if (le32_to_cpu(sb->chunksize)) 1108 rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1); 1109 1110 if (le64_to_cpu(sb->size) > rdev->size*2) 1111 return -EINVAL; 1112 return ret; 1113} 1114 1115static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) 1116{ 1117 struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 1118 __u64 ev1 = le64_to_cpu(sb->events); 1119 1120 rdev->raid_disk = -1; 1121 rdev->flags = 0; 1122 if (mddev->raid_disks == 0) { 1123 mddev->major_version = 1; 1124 mddev->patch_version = 0; 1125 mddev->persistent = 1; 1126 mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9; 1127 mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1); 1128 mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1); 1129 mddev->level = le32_to_cpu(sb->level); 1130 mddev->clevel[0] = 0; 1131 mddev->layout = le32_to_cpu(sb->layout); 1132 mddev->raid_disks = le32_to_cpu(sb->raid_disks); 1133 mddev->size = le64_to_cpu(sb->size)/2; 1134 mddev->events = ev1; 1135 mddev->bitmap_offset = 0; 1136 mddev->default_bitmap_offset = 1024 >> 9; 1137 1138 mddev->recovery_cp = le64_to_cpu(sb->resync_offset); 1139 memcpy(mddev->uuid, sb->set_uuid, 16); 1140 1141 mddev->max_disks = (4096-256)/2; 1142 1143 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && 1144 mddev->bitmap_file == NULL ) { 1145 if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6 1146 && mddev->level != 10) { 1147 printk(KERN_WARNING "md: bitmaps not supported for this level.\n"); 1148 return -EINVAL; 1149 } 1150 mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset); 1151 } 1152 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { 1153 mddev->reshape_position = le64_to_cpu(sb->reshape_position); 1154 mddev->delta_disks = le32_to_cpu(sb->delta_disks); 1155 mddev->new_level = le32_to_cpu(sb->new_level); 1156 mddev->new_layout = le32_to_cpu(sb->new_layout); 1157 mddev->new_chunk = le32_to_cpu(sb->new_chunk)<<9; 1158 } else { 1159 mddev->reshape_position = MaxSector; 1160 mddev->delta_disks = 0; 1161 mddev->new_level = mddev->level; 1162 mddev->new_layout = mddev->layout; 1163 mddev->new_chunk = mddev->chunk_size; 1164 } 1165 1166 } else if (mddev->pers == NULL) { 1167 /* Insist of good event counter while assembling */ 1168 ++ev1; 1169 if (ev1 < mddev->events) 1170 return -EINVAL; 1171 } else if (mddev->bitmap) { 1172 /* If adding to array with a bitmap, then we can accept an 1173 * older device, but not too old. 
1174 */ 1175 if (ev1 < mddev->bitmap->events_cleared) 1176 return 0; 1177 } else { 1178 if (ev1 < mddev->events) 1179 /* just a hot-add of a new device, leave raid_disk at -1 */ 1180 return 0; 1181 } 1182 if (mddev->level != LEVEL_MULTIPATH) { 1183 int role; 1184 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); 1185 switch(role) { 1186 case 0xffff: /* spare */ 1187 break; 1188 case 0xfffe: /* faulty */ 1189 set_bit(Faulty, &rdev->flags); 1190 break; 1191 default: 1192 if ((le32_to_cpu(sb->feature_map) & 1193 MD_FEATURE_RECOVERY_OFFSET)) 1194 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); 1195 else 1196 set_bit(In_sync, &rdev->flags); 1197 rdev->raid_disk = role; 1198 break; 1199 } 1200 if (sb->devflags & WriteMostly1) 1201 set_bit(WriteMostly, &rdev->flags); 1202 } else /* MULTIPATH are always insync */ 1203 set_bit(In_sync, &rdev->flags); 1204 1205 return 0; 1206} 1207 1208static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) 1209{ 1210 struct mdp_superblock_1 *sb; 1211 struct list_head *tmp; 1212 mdk_rdev_t *rdev2; 1213 int max_dev, i; 1214 /* make rdev->sb match mddev and rdev data. */ 1215 1216 sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 1217 1218 sb->feature_map = 0; 1219 sb->pad0 = 0; 1220 sb->recovery_offset = cpu_to_le64(0); 1221 memset(sb->pad1, 0, sizeof(sb->pad1)); 1222 memset(sb->pad2, 0, sizeof(sb->pad2)); 1223 memset(sb->pad3, 0, sizeof(sb->pad3)); 1224 1225 sb->utime = cpu_to_le64((__u64)mddev->utime); 1226 sb->events = cpu_to_le64(mddev->events); 1227 if (mddev->in_sync) 1228 sb->resync_offset = cpu_to_le64(mddev->recovery_cp); 1229 else 1230 sb->resync_offset = cpu_to_le64(0); 1231 1232 sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors)); 1233 1234 sb->raid_disks = cpu_to_le32(mddev->raid_disks); 1235 sb->size = cpu_to_le64(mddev->size<<1); 1236 1237 if (mddev->bitmap && mddev->bitmap_file == NULL) { 1238 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset); 1239 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); 1240 } 1241 1242 if (rdev->raid_disk >= 0 && 1243 !test_bit(In_sync, &rdev->flags) && 1244 rdev->recovery_offset > 0) { 1245 sb->feature_map |= cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); 1246 sb->recovery_offset = cpu_to_le64(rdev->recovery_offset); 1247 } 1248 1249 if (mddev->reshape_position != MaxSector) { 1250 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); 1251 sb->reshape_position = cpu_to_le64(mddev->reshape_position); 1252 sb->new_layout = cpu_to_le32(mddev->new_layout); 1253 sb->delta_disks = cpu_to_le32(mddev->delta_disks); 1254 sb->new_level = cpu_to_le32(mddev->new_level); 1255 sb->new_chunk = cpu_to_le32(mddev->new_chunk>>9); 1256 } 1257 1258 max_dev = 0; 1259 ITERATE_RDEV(mddev,rdev2,tmp) 1260 if (rdev2->desc_nr+1 > max_dev) 1261 max_dev = rdev2->desc_nr+1; 1262 1263 sb->max_dev = cpu_to_le32(max_dev); 1264 for (i=0; i<max_dev;i++) 1265 sb->dev_roles[i] = cpu_to_le16(0xfffe); 1266 1267 ITERATE_RDEV(mddev,rdev2,tmp) { 1268 i = rdev2->desc_nr; 1269 if (test_bit(Faulty, &rdev2->flags)) 1270 sb->dev_roles[i] = cpu_to_le16(0xfffe); 1271 else if (test_bit(In_sync, &rdev2->flags)) 1272 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 1273 else if (rdev2->raid_disk >= 0 && rdev2->recovery_offset > 0) 1274 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 1275 else 1276 sb->dev_roles[i] = cpu_to_le16(0xffff); 1277 } 1278 1279 sb->sb_csum = calc_sb_1_csum(sb); 1280} 1281 1282 1283static struct super_type super_types[] = { 1284 [0] = { 1285 .name = "0.90.0", 1286 .owner = 
THIS_MODULE, 1287 .load_super = super_90_load, 1288 .validate_super = super_90_validate, 1289 .sync_super = super_90_sync, 1290 }, 1291 [1] = { 1292 .name = "md-1", 1293 .owner = THIS_MODULE, 1294 .load_super = super_1_load, 1295 .validate_super = super_1_validate, 1296 .sync_super = super_1_sync, 1297 }, 1298}; 1299 1300static mdk_rdev_t * match_dev_unit(mddev_t *mddev, mdk_rdev_t *dev) 1301{ 1302 struct list_head *tmp; 1303 mdk_rdev_t *rdev; 1304 1305 ITERATE_RDEV(mddev,rdev,tmp) 1306 if (rdev->bdev->bd_contains == dev->bdev->bd_contains) 1307 return rdev; 1308 1309 return NULL; 1310} 1311 1312static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) 1313{ 1314 struct list_head *tmp; 1315 mdk_rdev_t *rdev; 1316 1317 ITERATE_RDEV(mddev1,rdev,tmp) 1318 if (match_dev_unit(mddev2, rdev)) 1319 return 1; 1320 1321 return 0; 1322} 1323 1324static LIST_HEAD(pending_raid_disks); 1325 1326static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) 1327{ 1328 mdk_rdev_t *same_pdev; 1329 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 1330 struct kobject *ko; 1331 char *s; 1332 1333 if (rdev->mddev) { 1334 MD_BUG(); 1335 return -EINVAL; 1336 } 1337 /* make sure rdev->size exceeds mddev->size */ 1338 if (rdev->size && (mddev->size == 0 || rdev->size < mddev->size)) { 1339 if (mddev->pers) 1340 /* Cannot change size, so fail */ 1341 return -ENOSPC; 1342 else 1343 mddev->size = rdev->size; 1344 } 1345 same_pdev = match_dev_unit(mddev, rdev); 1346 if (same_pdev) 1347 printk(KERN_WARNING 1348 "%s: WARNING: %s appears to be on the same physical" 1349 " disk as %s. True\n protection against single-disk" 1350 " failure might be compromised.\n", 1351 mdname(mddev), bdevname(rdev->bdev,b), 1352 bdevname(same_pdev->bdev,b2)); 1353 1354 /* Verify rdev->desc_nr is unique. 1355 * If it is -1, assign a free number, else 1356 * check number is not in use 1357 */ 1358 if (rdev->desc_nr < 0) { 1359 int choice = 0; 1360 if (mddev->pers) choice = mddev->raid_disks; 1361 while (find_rdev_nr(mddev, choice)) 1362 choice++; 1363 rdev->desc_nr = choice; 1364 } else { 1365 if (find_rdev_nr(mddev, rdev->desc_nr)) 1366 return -EBUSY; 1367 } 1368 bdevname(rdev->bdev,b); 1369 if (kobject_set_name(&rdev->kobj, "dev-%s", b) < 0) 1370 return -ENOMEM; 1371 while ( (s=strchr(rdev->kobj.k_name, '/')) != NULL) 1372 *s = '!'; 1373 1374 list_add(&rdev->same_set, &mddev->disks); 1375 rdev->mddev = mddev; 1376 printk(KERN_INFO "md: bind<%s>\n", b); 1377 1378 rdev->kobj.parent = &mddev->kobj; 1379 kobject_add(&rdev->kobj); 1380 1381 if (rdev->bdev->bd_part) 1382 ko = &rdev->bdev->bd_part->kobj; 1383 else 1384 ko = &rdev->bdev->bd_disk->kobj; 1385 sysfs_create_link(&rdev->kobj, ko, "block"); 1386 bd_claim_by_disk(rdev->bdev, rdev, mddev->gendisk); 1387 return 0; 1388} 1389 1390static void unbind_rdev_from_array(mdk_rdev_t * rdev) 1391{ 1392 char b[BDEVNAME_SIZE]; 1393 if (!rdev->mddev) { 1394 MD_BUG(); 1395 return; 1396 } 1397 bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk); 1398 list_del_init(&rdev->same_set); 1399 printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b)); 1400 rdev->mddev = NULL; 1401 sysfs_remove_link(&rdev->kobj, "block"); 1402 kobject_del(&rdev->kobj); 1403} 1404 1405/* 1406 * prevent the device from being mounted, repartitioned or 1407 * otherwise reused by a RAID array (or any other kernel 1408 * subsystem), by bd_claiming the device. 
1409 */ 1410static int lock_rdev(mdk_rdev_t *rdev, dev_t dev) 1411{ 1412 int err = 0; 1413 struct block_device *bdev; 1414 char b[BDEVNAME_SIZE]; 1415 1416 bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); 1417 if (IS_ERR(bdev)) { 1418 printk(KERN_ERR "md: could not open %s.\n", 1419 __bdevname(dev, b)); 1420 return PTR_ERR(bdev); 1421 } 1422 err = bd_claim(bdev, rdev); 1423 if (err) { 1424 printk(KERN_ERR "md: could not bd_claim %s.\n", 1425 bdevname(bdev, b)); 1426 blkdev_put(bdev); 1427 return err; 1428 } 1429 rdev->bdev = bdev; 1430 return err; 1431} 1432 1433static void unlock_rdev(mdk_rdev_t *rdev) 1434{ 1435 struct block_device *bdev = rdev->bdev; 1436 rdev->bdev = NULL; 1437 if (!bdev) 1438 MD_BUG(); 1439 bd_release(bdev); 1440 blkdev_put(bdev); 1441} 1442 1443void md_autodetect_dev(dev_t dev); 1444 1445static void export_rdev(mdk_rdev_t * rdev) 1446{ 1447 char b[BDEVNAME_SIZE]; 1448 printk(KERN_INFO "md: export_rdev(%s)\n", 1449 bdevname(rdev->bdev,b)); 1450 if (rdev->mddev) 1451 MD_BUG(); 1452 free_disk_sb(rdev); 1453 list_del_init(&rdev->same_set); 1454#ifndef MODULE 1455 md_autodetect_dev(rdev->bdev->bd_dev); 1456#endif 1457 unlock_rdev(rdev); 1458 kobject_put(&rdev->kobj); 1459} 1460 1461static void kick_rdev_from_array(mdk_rdev_t * rdev) 1462{ 1463 unbind_rdev_from_array(rdev); 1464 export_rdev(rdev); 1465} 1466 1467static void export_array(mddev_t *mddev) 1468{ 1469 struct list_head *tmp; 1470 mdk_rdev_t *rdev; 1471 1472 ITERATE_RDEV(mddev,rdev,tmp) { 1473 if (!rdev->mddev) { 1474 MD_BUG(); 1475 continue; 1476 } 1477 kick_rdev_from_array(rdev); 1478 } 1479 if (!list_empty(&mddev->disks)) 1480 MD_BUG(); 1481 mddev->raid_disks = 0; 1482 mddev->major_version = 0; 1483} 1484 1485static void print_desc(mdp_disk_t *desc) 1486{ 1487 printk(" DISK<N:%d,(%d,%d),R:%d,S:%d>\n", desc->number, 1488 desc->major,desc->minor,desc->raid_disk,desc->state); 1489} 1490 1491static void print_sb(mdp_super_t *sb) 1492{ 1493 int i; 1494 1495 printk(KERN_INFO 1496 "md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n", 1497 sb->major_version, sb->minor_version, sb->patch_version, 1498 sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3, 1499 sb->ctime); 1500 printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", 1501 sb->level, sb->size, sb->nr_disks, sb->raid_disks, 1502 sb->md_minor, sb->layout, sb->chunk_size); 1503 printk(KERN_INFO "md: UT:%08x ST:%d AD:%d WD:%d" 1504 " FD:%d SD:%d CSUM:%08x E:%08lx\n", 1505 sb->utime, sb->state, sb->active_disks, sb->working_disks, 1506 sb->failed_disks, sb->spare_disks, 1507 sb->sb_csum, (unsigned long)sb->events_lo); 1508 1509 printk(KERN_INFO); 1510 for (i = 0; i < MD_SB_DISKS; i++) { 1511 mdp_disk_t *desc; 1512 1513 desc = sb->disks + i; 1514 if (desc->number || desc->major || desc->minor || 1515 desc->raid_disk || (desc->state && (desc->state != 4))) { 1516 printk(" D %2d: ", i); 1517 print_desc(desc); 1518 } 1519 } 1520 printk(KERN_INFO "md: THIS: "); 1521 print_desc(&sb->this_disk); 1522 1523} 1524 1525static void print_rdev(mdk_rdev_t *rdev) 1526{ 1527 char b[BDEVNAME_SIZE]; 1528 printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%u\n", 1529 bdevname(rdev->bdev,b), (unsigned long long)rdev->size, 1530 test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags), 1531 rdev->desc_nr); 1532 if (rdev->sb_loaded) { 1533 printk(KERN_INFO "md: rdev superblock:\n"); 1534 print_sb((mdp_super_t*)page_address(rdev->sb_page)); 1535 } else 1536 printk(KERN_INFO "md: no rdev superblock!\n"); 1537} 1538 1539static void 
md_print_devices(void) 1540{ 1541 struct list_head *tmp, *tmp2; 1542 mdk_rdev_t *rdev; 1543 mddev_t *mddev; 1544 char b[BDEVNAME_SIZE]; 1545 1546 printk("\n"); 1547 printk("md: **********************************\n"); 1548 printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n"); 1549 printk("md: **********************************\n"); 1550 ITERATE_MDDEV(mddev,tmp) { 1551 1552 if (mddev->bitmap) 1553 bitmap_print_sb(mddev->bitmap); 1554 else 1555 printk("%s: ", mdname(mddev)); 1556 ITERATE_RDEV(mddev,rdev,tmp2) 1557 printk("<%s>", bdevname(rdev->bdev,b)); 1558 printk("\n"); 1559 1560 ITERATE_RDEV(mddev,rdev,tmp2) 1561 print_rdev(rdev); 1562 } 1563 printk("md: **********************************\n"); 1564 printk("\n"); 1565} 1566 1567 1568static void sync_sbs(mddev_t * mddev, int nospares) 1569{ 1570 /* Update each superblock (in-memory image), but 1571 * if we are allowed to, skip spares which already 1572 * have the right event counter, or have one earlier 1573 * (which would mean they aren't being marked as dirty 1574 * with the rest of the array) 1575 */ 1576 mdk_rdev_t *rdev; 1577 struct list_head *tmp; 1578 1579 ITERATE_RDEV(mddev,rdev,tmp) { 1580 if (rdev->sb_events == mddev->events || 1581 (nospares && 1582 rdev->raid_disk < 0 && 1583 (rdev->sb_events&1)==0 && 1584 rdev->sb_events+1 == mddev->events)) { 1585 /* Don't update this superblock */ 1586 rdev->sb_loaded = 2; 1587 } else { 1588 super_types[mddev->major_version]. 1589 sync_super(mddev, rdev); 1590 rdev->sb_loaded = 1; 1591 } 1592 } 1593} 1594 1595static void md_update_sb(mddev_t * mddev, int force_change) 1596{ 1597 int err; 1598 struct list_head *tmp; 1599 mdk_rdev_t *rdev; 1600 int sync_req; 1601 int nospares = 0; 1602 1603repeat: 1604 spin_lock_irq(&mddev->write_lock); 1605 1606 set_bit(MD_CHANGE_PENDING, &mddev->flags); 1607 if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags)) 1608 force_change = 1; 1609 if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags)) 1610 /* just a clean<-> dirty transition, possibly leave spares alone, 1611 * though if events isn't the right even/odd, we will have to do 1612 * spares after all 1613 */ 1614 nospares = 1; 1615 if (force_change) 1616 nospares = 0; 1617 if (mddev->degraded) 1618 /* If the array is degraded, then skipping spares is both 1619 * dangerous and fairly pointless. 1620 * Dangerous because a device that was removed from the array 1621 * might have a event_count that still looks up-to-date, 1622 * so it can be re-added without a resync. 1623 * Pointless because if there are any spares to skip, 1624 * then a recovery will happen and soon that array won't 1625 * be degraded any more and the spare can go back to sleep then. 1626 */ 1627 nospares = 0; 1628 1629 sync_req = mddev->in_sync; 1630 mddev->utime = get_seconds(); 1631 1632 /* If this is just a dirty<->clean transition, and the array is clean 1633 * and 'events' is odd, we can roll back to the previous clean state */ 1634 if (nospares 1635 && (mddev->in_sync && mddev->recovery_cp == MaxSector) 1636 && (mddev->events & 1)) 1637 mddev->events--; 1638 else { 1639 /* otherwise we have to go forward and ... */ 1640 mddev->events ++; 1641 if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */ 1642 /* .. 
if the array isn't clean, insist on an odd 'events' */ 1643 if ((mddev->events&1)==0) { 1644 mddev->events++; 1645 nospares = 0; 1646 } 1647 } else { 1648 /* otherwise insist on an even 'events' (for clean states) */ 1649 if ((mddev->events&1)) { 1650 mddev->events++; 1651 nospares = 0; 1652 } 1653 } 1654 } 1655 1656 if (!mddev->events) { 1657 /* 1658 * oops, this 64-bit counter should never wrap. 1659 * Either we are in around ~1 trillion A.C., assuming 1660 * 1 reboot per second, or we have a bug: 1661 */ 1662 MD_BUG(); 1663 mddev->events --; 1664 } 1665 sync_sbs(mddev, nospares); 1666 1667 /* 1668 * do not write anything to disk if using 1669 * nonpersistent superblocks 1670 */ 1671 if (!mddev->persistent) { 1672 clear_bit(MD_CHANGE_PENDING, &mddev->flags); 1673 spin_unlock_irq(&mddev->write_lock); 1674 wake_up(&mddev->sb_wait); 1675 return; 1676 } 1677 spin_unlock_irq(&mddev->write_lock); 1678 1679 dprintk(KERN_INFO 1680 "md: updating %s RAID superblock on device (in sync %d)\n", 1681 mdname(mddev),mddev->in_sync); 1682 1683 err = bitmap_update_sb(mddev->bitmap); 1684 ITERATE_RDEV(mddev,rdev,tmp) { 1685 char b[BDEVNAME_SIZE]; 1686 dprintk(KERN_INFO "md: "); 1687 if (rdev->sb_loaded != 1) 1688 continue; /* no noise on spare devices */ 1689 if (test_bit(Faulty, &rdev->flags)) 1690 dprintk("(skipping faulty "); 1691 1692 dprintk("%s ", bdevname(rdev->bdev,b)); 1693 if (!test_bit(Faulty, &rdev->flags)) { 1694 md_super_write(mddev,rdev, 1695 rdev->sb_offset<<1, rdev->sb_size, 1696 rdev->sb_page); 1697 dprintk(KERN_INFO "(write) %s's sb offset: %llu\n", 1698 bdevname(rdev->bdev,b), 1699 (unsigned long long)rdev->sb_offset); 1700 rdev->sb_events = mddev->events; 1701 1702 } else 1703 dprintk(")\n"); 1704 if (mddev->level == LEVEL_MULTIPATH) 1705 /* only need to write one superblock... */ 1706 break; 1707 } 1708 md_super_wait(mddev); 1709 /* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */ 1710 1711 spin_lock_irq(&mddev->write_lock); 1712 if (mddev->in_sync != sync_req || 1713 test_bit(MD_CHANGE_DEVS, &mddev->flags)) { 1714 /* have to write it out again */ 1715 spin_unlock_irq(&mddev->write_lock); 1716 goto repeat; 1717 } 1718 clear_bit(MD_CHANGE_PENDING, &mddev->flags); 1719 spin_unlock_irq(&mddev->write_lock); 1720 wake_up(&mddev->sb_wait); 1721 1722} 1723 1724/* words written to sysfs files may, or my not, be \n terminated. 1725 * We want to accept with case. For this we use cmd_match. 1726 */ 1727static int cmd_match(const char *cmd, const char *str) 1728{ 1729 /* See if cmd, written into a sysfs file, matches 1730 * str. 
They must either be the same, or cmd can 1731 * have a trailing newline 1732 */ 1733 while (*cmd && *str && *cmd == *str) { 1734 cmd++; 1735 str++; 1736 } 1737 if (*cmd == '\n') 1738 cmd++; 1739 if (*str || *cmd) 1740 return 0; 1741 return 1; 1742} 1743 1744struct rdev_sysfs_entry { 1745 struct attribute attr; 1746 ssize_t (*show)(mdk_rdev_t *, char *); 1747 ssize_t (*store)(mdk_rdev_t *, const char *, size_t); 1748}; 1749 1750static ssize_t 1751state_show(mdk_rdev_t *rdev, char *page) 1752{ 1753 char *sep = ""; 1754 int len=0; 1755 1756 if (test_bit(Faulty, &rdev->flags)) { 1757 len+= sprintf(page+len, "%sfaulty",sep); 1758 sep = ","; 1759 } 1760 if (test_bit(In_sync, &rdev->flags)) { 1761 len += sprintf(page+len, "%sin_sync",sep); 1762 sep = ","; 1763 } 1764 if (test_bit(WriteMostly, &rdev->flags)) { 1765 len += sprintf(page+len, "%swrite_mostly",sep); 1766 sep = ","; 1767 } 1768 if (!test_bit(Faulty, &rdev->flags) && 1769 !test_bit(In_sync, &rdev->flags)) { 1770 len += sprintf(page+len, "%sspare", sep); 1771 sep = ","; 1772 } 1773 return len+sprintf(page+len, "\n"); 1774} 1775 1776static ssize_t 1777state_store(mdk_rdev_t *rdev, const char *buf, size_t len) 1778{ 1779 /* can write 1780 * faulty - simulates and error 1781 * remove - disconnects the device 1782 * writemostly - sets write_mostly 1783 * -writemostly - clears write_mostly 1784 */ 1785 int err = -EINVAL; 1786 if (cmd_match(buf, "faulty") && rdev->mddev->pers) { 1787 md_error(rdev->mddev, rdev); 1788 err = 0; 1789 } else if (cmd_match(buf, "remove")) { 1790 if (rdev->raid_disk >= 0) 1791 err = -EBUSY; 1792 else { 1793 mddev_t *mddev = rdev->mddev; 1794 kick_rdev_from_array(rdev); 1795 if (mddev->pers) 1796 md_update_sb(mddev, 1); 1797 md_new_event(mddev); 1798 err = 0; 1799 } 1800 } else if (cmd_match(buf, "writemostly")) { 1801 set_bit(WriteMostly, &rdev->flags); 1802 err = 0; 1803 } else if (cmd_match(buf, "-writemostly")) { 1804 clear_bit(WriteMostly, &rdev->flags); 1805 err = 0; 1806 } 1807 return err ? 
err : len; 1808} 1809static struct rdev_sysfs_entry rdev_state = 1810__ATTR(state, S_IRUGO|S_IWUSR, state_show, state_store); 1811 1812static ssize_t 1813super_show(mdk_rdev_t *rdev, char *page) 1814{ 1815 if (rdev->sb_loaded && rdev->sb_size) { 1816 memcpy(page, page_address(rdev->sb_page), rdev->sb_size); 1817 return rdev->sb_size; 1818 } else 1819 return 0; 1820} 1821static struct rdev_sysfs_entry rdev_super = __ATTR_RO(super); 1822 1823static ssize_t 1824errors_show(mdk_rdev_t *rdev, char *page) 1825{ 1826 return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors)); 1827} 1828 1829static ssize_t 1830errors_store(mdk_rdev_t *rdev, const char *buf, size_t len) 1831{ 1832 char *e; 1833 unsigned long n = simple_strtoul(buf, &e, 10); 1834 if (*buf && (*e == 0 || *e == '\n')) { 1835 atomic_set(&rdev->corrected_errors, n); 1836 return len; 1837 } 1838 return -EINVAL; 1839} 1840static struct rdev_sysfs_entry rdev_errors = 1841__ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store); 1842 1843static ssize_t 1844slot_show(mdk_rdev_t *rdev, char *page) 1845{ 1846 if (rdev->raid_disk < 0) 1847 return sprintf(page, "none\n"); 1848 else 1849 return sprintf(page, "%d\n", rdev->raid_disk); 1850} 1851 1852static ssize_t 1853slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) 1854{ 1855 char *e; 1856 int slot = simple_strtoul(buf, &e, 10); 1857 if (strncmp(buf, "none", 4)==0) 1858 slot = -1; 1859 else if (e==buf || (*e && *e!= '\n')) 1860 return -EINVAL; 1861 if (rdev->mddev->pers) 1862 /* Cannot set slot in active array (yet) */ 1863 return -EBUSY; 1864 if (slot >= rdev->mddev->raid_disks) 1865 return -ENOSPC; 1866 rdev->raid_disk = slot; 1867 /* assume it is working */ 1868 rdev->flags = 0; 1869 set_bit(In_sync, &rdev->flags); 1870 return len; 1871} 1872 1873 1874static struct rdev_sysfs_entry rdev_slot = 1875__ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store); 1876 1877static ssize_t 1878offset_show(mdk_rdev_t *rdev, char *page) 1879{ 1880 return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset); 1881} 1882 1883static ssize_t 1884offset_store(mdk_rdev_t *rdev, const char *buf, size_t len) 1885{ 1886 char *e; 1887 unsigned long long offset = simple_strtoull(buf, &e, 10); 1888 if (e==buf || (*e && *e != '\n')) 1889 return -EINVAL; 1890 if (rdev->mddev->pers) 1891 return -EBUSY; 1892 rdev->data_offset = offset; 1893 return len; 1894} 1895 1896static struct rdev_sysfs_entry rdev_offset = 1897__ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store); 1898 1899static ssize_t 1900rdev_size_show(mdk_rdev_t *rdev, char *page) 1901{ 1902 return sprintf(page, "%llu\n", (unsigned long long)rdev->size); 1903} 1904 1905static ssize_t 1906rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) 1907{ 1908 char *e; 1909 unsigned long long size = simple_strtoull(buf, &e, 10); 1910 if (e==buf || (*e && *e != '\n')) 1911 return -EINVAL; 1912 if (rdev->mddev->pers) 1913 return -EBUSY; 1914 rdev->size = size; 1915 if (size < rdev->mddev->size || rdev->mddev->size == 0) 1916 rdev->mddev->size = size; 1917 return len; 1918} 1919 1920static struct rdev_sysfs_entry rdev_size = 1921__ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store); 1922 1923static struct attribute *rdev_default_attrs[] = { 1924 &rdev_state.attr, 1925 &rdev_super.attr, 1926 &rdev_errors.attr, 1927 &rdev_slot.attr, 1928 &rdev_offset.attr, 1929 &rdev_size.attr, 1930 NULL, 1931}; 1932static ssize_t 1933rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 1934{ 1935 struct rdev_sysfs_entry *entry 
= container_of(attr, struct rdev_sysfs_entry, attr); 1936 mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj); 1937 1938 if (!entry->show) 1939 return -EIO; 1940 return entry->show(rdev, page); 1941} 1942 1943static ssize_t 1944rdev_attr_store(struct kobject *kobj, struct attribute *attr, 1945 const char *page, size_t length) 1946{ 1947 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 1948 mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj); 1949 1950 if (!entry->store) 1951 return -EIO; 1952 if (!capable(CAP_SYS_ADMIN)) 1953 return -EACCES; 1954 return entry->store(rdev, page, length); 1955} 1956 1957static void rdev_free(struct kobject *ko) 1958{ 1959 mdk_rdev_t *rdev = container_of(ko, mdk_rdev_t, kobj); 1960 kfree(rdev); 1961} 1962static struct sysfs_ops rdev_sysfs_ops = { 1963 .show = rdev_attr_show, 1964 .store = rdev_attr_store, 1965}; 1966static struct kobj_type rdev_ktype = { 1967 .release = rdev_free, 1968 .sysfs_ops = &rdev_sysfs_ops, 1969 .default_attrs = rdev_default_attrs, 1970}; 1971 1972/* 1973 * Import a device. If 'super_format' >= 0, then sanity check the superblock 1974 * 1975 * mark the device faulty if: 1976 * 1977 * - the device is nonexistent (zero size) 1978 * - the device has no valid superblock 1979 * 1980 * a faulty rdev _never_ has rdev->sb set. 1981 */ 1982static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor) 1983{ 1984 char b[BDEVNAME_SIZE]; 1985 int err; 1986 mdk_rdev_t *rdev; 1987 sector_t size; 1988 1989 rdev = kzalloc(sizeof(*rdev), GFP_KERNEL); 1990 if (!rdev) { 1991 printk(KERN_ERR "md: could not alloc mem for new device!\n"); 1992 return ERR_PTR(-ENOMEM); 1993 } 1994 1995 if ((err = alloc_disk_sb(rdev))) 1996 goto abort_free; 1997 1998 err = lock_rdev(rdev, newdev); 1999 if (err) 2000 goto abort_free; 2001 2002 rdev->kobj.parent = NULL; 2003 rdev->kobj.ktype = &rdev_ktype; 2004 kobject_init(&rdev->kobj); 2005 2006 rdev->desc_nr = -1; 2007 rdev->saved_raid_disk = -1; 2008 rdev->raid_disk = -1; 2009 rdev->flags = 0; 2010 rdev->data_offset = 0; 2011 rdev->sb_events = 0; 2012 atomic_set(&rdev->nr_pending, 0); 2013 atomic_set(&rdev->read_errors, 0); 2014 atomic_set(&rdev->corrected_errors, 0); 2015 2016 size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 2017 if (!size) { 2018 printk(KERN_WARNING 2019 "md: %s has zero or unknown size, marking faulty!\n", 2020 bdevname(rdev->bdev,b)); 2021 err = -EINVAL; 2022 goto abort_free; 2023 } 2024 2025 if (super_format >= 0) { 2026 err = super_types[super_format]. 2027 load_super(rdev, NULL, super_minor); 2028 if (err == -EINVAL) { 2029 printk(KERN_WARNING 2030 "md: %s has invalid sb, not importing!\n", 2031 bdevname(rdev->bdev,b)); 2032 goto abort_free; 2033 } 2034 if (err < 0) { 2035 printk(KERN_WARNING 2036 "md: could not read %s's sb, not importing!\n", 2037 bdevname(rdev->bdev,b)); 2038 goto abort_free; 2039 } 2040 } 2041 INIT_LIST_HEAD(&rdev->same_set); 2042 2043 return rdev; 2044 2045abort_free: 2046 if (rdev->sb_page) { 2047 if (rdev->bdev) 2048 unlock_rdev(rdev); 2049 free_disk_sb(rdev); 2050 } 2051 kfree(rdev); 2052 return ERR_PTR(err); 2053} 2054 2055/* 2056 * Check a full RAID array for plausibility 2057 */ 2058 2059 2060static void analyze_sbs(mddev_t * mddev) 2061{ 2062 int i; 2063 struct list_head *tmp; 2064 mdk_rdev_t *rdev, *freshest; 2065 char b[BDEVNAME_SIZE]; 2066 2067 freshest = NULL; 2068 ITERATE_RDEV(mddev,rdev,tmp) 2069 switch (super_types[mddev->major_version]. 
2070 load_super(rdev, freshest, mddev->minor_version)) { 2071 case 1: 2072 freshest = rdev; 2073 break; 2074 case 0: 2075 break; 2076 default: 2077 printk( KERN_ERR \ 2078 "md: fatal superblock inconsistency in %s" 2079 " -- removing from array\n", 2080 bdevname(rdev->bdev,b)); 2081 kick_rdev_from_array(rdev); 2082 } 2083 2084 2085 super_types[mddev->major_version]. 2086 validate_super(mddev, freshest); 2087 2088 i = 0; 2089 ITERATE_RDEV(mddev,rdev,tmp) { 2090 if (rdev != freshest) 2091 if (super_types[mddev->major_version]. 2092 validate_super(mddev, rdev)) { 2093 printk(KERN_WARNING "md: kicking non-fresh %s" 2094 " from array!\n", 2095 bdevname(rdev->bdev,b)); 2096 kick_rdev_from_array(rdev); 2097 continue; 2098 } 2099 if (mddev->level == LEVEL_MULTIPATH) { 2100 rdev->desc_nr = i++; 2101 rdev->raid_disk = rdev->desc_nr; 2102 set_bit(In_sync, &rdev->flags); 2103 } 2104 } 2105 2106 2107 2108 if (mddev->recovery_cp != MaxSector && 2109 mddev->level >= 1) 2110 printk(KERN_ERR "md: %s: raid array is not clean" 2111 " -- starting background reconstruction\n", 2112 mdname(mddev)); 2113 2114} 2115 2116static ssize_t 2117safe_delay_show(mddev_t *mddev, char *page) 2118{ 2119 int msec = (mddev->safemode_delay*1000)/HZ; 2120 return sprintf(page, "%d.%03d\n", msec/1000, msec%1000); 2121} 2122static ssize_t 2123safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len) 2124{ 2125 int scale=1; 2126 int dot=0; 2127 int i; 2128 unsigned long msec; 2129 char buf[30]; 2130 char *e; 2131 /* remove a period, and count digits after it */ 2132 if (len >= sizeof(buf)) 2133 return -EINVAL; 2134 strlcpy(buf, cbuf, len); 2135 buf[len] = 0; 2136 for (i=0; i<len; i++) { 2137 if (dot) { 2138 if (isdigit(buf[i])) { 2139 buf[i-1] = buf[i]; 2140 scale *= 10; 2141 } 2142 buf[i] = 0; 2143 } else if (buf[i] == '.') { 2144 dot=1; 2145 buf[i] = 0; 2146 } 2147 } 2148 msec = simple_strtoul(buf, &e, 10); 2149 if (e == buf || (*e && *e != '\n')) 2150 return -EINVAL; 2151 msec = (msec * 1000) / scale; 2152 if (msec == 0) 2153 mddev->safemode_delay = 0; 2154 else { 2155 mddev->safemode_delay = (msec*HZ)/1000; 2156 if (mddev->safemode_delay == 0) 2157 mddev->safemode_delay = 1; 2158 } 2159 return len; 2160} 2161static struct md_sysfs_entry md_safe_delay = 2162__ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store); 2163 2164static ssize_t 2165level_show(mddev_t *mddev, char *page) 2166{ 2167 struct mdk_personality *p = mddev->pers; 2168 if (p) 2169 return sprintf(page, "%s\n", p->name); 2170 else if (mddev->clevel[0]) 2171 return sprintf(page, "%s\n", mddev->clevel); 2172 else if (mddev->level != LEVEL_NONE) 2173 return sprintf(page, "%d\n", mddev->level); 2174 else 2175 return 0; 2176} 2177 2178static ssize_t 2179level_store(mddev_t *mddev, const char *buf, size_t len) 2180{ 2181 int rv = len; 2182 if (mddev->pers) 2183 return -EBUSY; 2184 if (len == 0) 2185 return 0; 2186 if (len >= sizeof(mddev->clevel)) 2187 return -ENOSPC; 2188 strncpy(mddev->clevel, buf, len); 2189 if (mddev->clevel[len-1] == '\n') 2190 len--; 2191 mddev->clevel[len] = 0; 2192 mddev->level = LEVEL_NONE; 2193 return rv; 2194} 2195 2196static struct md_sysfs_entry md_level = 2197__ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store); 2198 2199 2200static ssize_t 2201layout_show(mddev_t *mddev, char *page) 2202{ 2203 /* just a number, not meaningful for all levels */ 2204 return sprintf(page, "%d\n", mddev->layout); 2205} 2206 2207static ssize_t 2208layout_store(mddev_t *mddev, const char *buf, size_t len) 2209{ 2210 char *e; 2211 
unsigned long n = simple_strtoul(buf, &e, 10); 2212 if (mddev->pers) 2213 return -EBUSY; 2214 2215 if (!*buf || (*e && *e != '\n')) 2216 return -EINVAL; 2217 2218 mddev->layout = n; 2219 return len; 2220} 2221static struct md_sysfs_entry md_layout = 2222__ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store); 2223 2224 2225static ssize_t 2226raid_disks_show(mddev_t *mddev, char *page) 2227{ 2228 if (mddev->raid_disks == 0) 2229 return 0; 2230 return sprintf(page, "%d\n", mddev->raid_disks); 2231} 2232 2233static int update_raid_disks(mddev_t *mddev, int raid_disks); 2234 2235static ssize_t 2236raid_disks_store(mddev_t *mddev, const char *buf, size_t len) 2237{ 2238 char *e; 2239 int rv = 0; 2240 unsigned long n = simple_strtoul(buf, &e, 10); 2241 2242 if (!*buf || (*e && *e != '\n')) 2243 return -EINVAL; 2244 2245 if (mddev->pers) 2246 rv = update_raid_disks(mddev, n); 2247 else 2248 mddev->raid_disks = n; 2249 return rv ? rv : len; 2250} 2251static struct md_sysfs_entry md_raid_disks = 2252__ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store); 2253 2254static ssize_t 2255chunk_size_show(mddev_t *mddev, char *page) 2256{ 2257 return sprintf(page, "%d\n", mddev->chunk_size); 2258} 2259 2260static ssize_t 2261chunk_size_store(mddev_t *mddev, const char *buf, size_t len) 2262{ 2263 /* can only set chunk_size if array is not yet active */ 2264 char *e; 2265 unsigned long n = simple_strtoul(buf, &e, 10); 2266 2267 if (mddev->pers) 2268 return -EBUSY; 2269 if (!*buf || (*e && *e != '\n')) 2270 return -EINVAL; 2271 2272 mddev->chunk_size = n; 2273 return len; 2274} 2275static struct md_sysfs_entry md_chunk_size = 2276__ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store); 2277 2278static ssize_t 2279resync_start_show(mddev_t *mddev, char *page) 2280{ 2281 return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp); 2282} 2283 2284static ssize_t 2285resync_start_store(mddev_t *mddev, const char *buf, size_t len) 2286{ 2287 /* can only set chunk_size if array is not yet active */ 2288 char *e; 2289 unsigned long long n = simple_strtoull(buf, &e, 10); 2290 2291 if (mddev->pers) 2292 return -EBUSY; 2293 if (!*buf || (*e && *e != '\n')) 2294 return -EINVAL; 2295 2296 mddev->recovery_cp = n; 2297 return len; 2298} 2299static struct md_sysfs_entry md_resync_start = 2300__ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store); 2301 2302/* 2303 * The array state can be: 2304 * 2305 * clear 2306 * No devices, no size, no level 2307 * Equivalent to STOP_ARRAY ioctl 2308 * inactive 2309 * May have some settings, but array is not active 2310 * all IO results in error 2311 * When written, doesn't tear down array, but just stops it 2312 * suspended (not supported yet) 2313 * All IO requests will block. The array can be reconfigured. 2314 * Writing this, if accepted, will block until array is quiessent 2315 * readonly 2316 * no resync can happen. no superblocks get written. 2317 * write requests fail 2318 * read-auto 2319 * like readonly, but behaves like 'clean' on a write request. 2320 * 2321 * clean - no pending writes, but otherwise active. 2322 * When written to inactive array, starts without resync 2323 * If a write request arrives then 2324 * if metadata is known, mark 'dirty' and switch to 'active'. 2325 * if not known, block and switch to write-pending 2326 * If written to an active array that has pending writes, then fails. 2327 * active 2328 * fully active: IO and resync can be happening. 
2329 * When written to inactive array, starts with resync 2330 * 2331 * write-pending 2332 * clean, but writes are blocked waiting for 'active' to be written. 2333 * 2334 * active-idle 2335 * like active, but no writes have been seen for a while (100msec). 2336 * 2337 */ 2338enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active, 2339 write_pending, active_idle, bad_word}; 2340static char *array_states[] = { 2341 "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active", 2342 "write-pending", "active-idle", NULL }; 2343 2344static int match_word(const char *word, char **list) 2345{ 2346 int n; 2347 for (n=0; list[n]; n++) 2348 if (cmd_match(word, list[n])) 2349 break; 2350 return n; 2351} 2352 2353static ssize_t 2354array_state_show(mddev_t *mddev, char *page) 2355{ 2356 enum array_state st = inactive; 2357 2358 if (mddev->pers) 2359 switch(mddev->ro) { 2360 case 1: 2361 st = readonly; 2362 break; 2363 case 2: 2364 st = read_auto; 2365 break; 2366 case 0: 2367 if (mddev->in_sync) 2368 st = clean; 2369 else if (mddev->safemode) 2370 st = active_idle; 2371 else 2372 st = active; 2373 } 2374 else { 2375 if (list_empty(&mddev->disks) && 2376 mddev->raid_disks == 0 && 2377 mddev->size == 0) 2378 st = clear; 2379 else 2380 st = inactive; 2381 } 2382 return sprintf(page, "%s\n", array_states[st]); 2383} 2384 2385static int do_md_stop(mddev_t * mddev, int ro); 2386static int do_md_run(mddev_t * mddev); 2387static int restart_array(mddev_t *mddev); 2388 2389static ssize_t 2390array_state_store(mddev_t *mddev, const char *buf, size_t len) 2391{ 2392 int err = -EINVAL; 2393 enum array_state st = match_word(buf, array_states); 2394 switch(st) { 2395 case bad_word: 2396 break; 2397 case clear: 2398 /* stopping an active array */ 2399 if (mddev->pers) { 2400 if (atomic_read(&mddev->active) > 1) 2401 return -EBUSY; 2402 err = do_md_stop(mddev, 0); 2403 } 2404 break; 2405 case inactive: 2406 /* stopping an active array */ 2407 if (mddev->pers) { 2408 if (atomic_read(&mddev->active) > 1) 2409 return -EBUSY; 2410 err = do_md_stop(mddev, 2); 2411 } 2412 break; 2413 case suspended: 2414 break; /* not supported yet */ 2415 case readonly: 2416 if (mddev->pers) 2417 err = do_md_stop(mddev, 1); 2418 else { 2419 mddev->ro = 1; 2420 err = do_md_run(mddev); 2421 } 2422 break; 2423 case read_auto: 2424 /* stopping an active array */ 2425 if (mddev->pers) { 2426 err = do_md_stop(mddev, 1); 2427 if (err == 0) 2428 mddev->ro = 2; /* FIXME mark devices writable */ 2429 } else { 2430 mddev->ro = 2; 2431 err = do_md_run(mddev); 2432 } 2433 break; 2434 case clean: 2435 if (mddev->pers) { 2436 restart_array(mddev); 2437 spin_lock_irq(&mddev->write_lock); 2438 if (atomic_read(&mddev->writes_pending) == 0) { 2439 mddev->in_sync = 1; 2440 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 2441 } 2442 spin_unlock_irq(&mddev->write_lock); 2443 } else { 2444 mddev->ro = 0; 2445 mddev->recovery_cp = MaxSector; 2446 err = do_md_run(mddev); 2447 } 2448 break; 2449 case active: 2450 if (mddev->pers) { 2451 restart_array(mddev); 2452 clear_bit(MD_CHANGE_CLEAN, &mddev->flags); 2453 wake_up(&mddev->sb_wait); 2454 err = 0; 2455 } else { 2456 mddev->ro = 0; 2457 err = do_md_run(mddev); 2458 } 2459 break; 2460 case write_pending: 2461 case active_idle: 2462 /* these cannot be set */ 2463 break; 2464 } 2465 if (err) 2466 return err; 2467 else 2468 return len; 2469} 2470static struct md_sysfs_entry md_array_state = 2471__ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); 2472 
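/*
 * Illustrative sketch, not part of the driver: the array_state attribute
 * defined above is driven from user space through sysfs.  The mddev kobject
 * is named "md" and parented to the array's gendisk, so for an array named
 * md0 the file typically appears as /sys/block/md0/md/array_state.  A
 * management tool could request the 'clean' state roughly like this
 * (includes and error handling trimmed; the path is an assumption for the
 * example):
 *
 *	int fd = open("/sys/block/md0/md/array_state", O_WRONLY);
 *	if (fd >= 0) {
 *		if (write(fd, "clean", 5) < 0)
 *			perror("array_state");
 *		close(fd);
 *	}
 *
 * The write fails if the requested transition is not allowed for the
 * array's current state; reading the same file returns one of the strings
 * in array_states[].
 */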
2473static ssize_t 2474null_show(mddev_t *mddev, char *page) 2475{ 2476 return -EINVAL; 2477} 2478 2479static ssize_t 2480new_dev_store(mddev_t *mddev, const char *buf, size_t len) 2481{ 2482 /* buf must be %d:%d\n? giving major and minor numbers */ 2483 /* The new device is added to the array. 2484 * If the array has a persistent superblock, we read the 2485 * superblock to initialise info and check validity. 2486 * Otherwise, only checking done is that in bind_rdev_to_array, 2487 * which mainly checks size. 2488 */ 2489 char *e; 2490 int major = simple_strtoul(buf, &e, 10); 2491 int minor; 2492 dev_t dev; 2493 mdk_rdev_t *rdev; 2494 int err; 2495 2496 if (!*buf || *e != ':' || !e[1] || e[1] == '\n') 2497 return -EINVAL; 2498 minor = simple_strtoul(e+1, &e, 10); 2499 if (*e && *e != '\n') 2500 return -EINVAL; 2501 dev = MKDEV(major, minor); 2502 if (major != MAJOR(dev) || 2503 minor != MINOR(dev)) 2504 return -EOVERFLOW; 2505 2506 2507 if (mddev->persistent) { 2508 rdev = md_import_device(dev, mddev->major_version, 2509 mddev->minor_version); 2510 if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) { 2511 mdk_rdev_t *rdev0 = list_entry(mddev->disks.next, 2512 mdk_rdev_t, same_set); 2513 err = super_types[mddev->major_version] 2514 .load_super(rdev, rdev0, mddev->minor_version); 2515 if (err < 0) 2516 goto out; 2517 } 2518 } else 2519 rdev = md_import_device(dev, -1, -1); 2520 2521 if (IS_ERR(rdev)) 2522 return PTR_ERR(rdev); 2523 err = bind_rdev_to_array(rdev, mddev); 2524 out: 2525 if (err) 2526 export_rdev(rdev); 2527 return err ? err : len; 2528} 2529 2530static struct md_sysfs_entry md_new_device = 2531__ATTR(new_dev, S_IWUSR, null_show, new_dev_store); 2532 2533static ssize_t 2534bitmap_store(mddev_t *mddev, const char *buf, size_t len) 2535{ 2536 char *end; 2537 unsigned long chunk, end_chunk; 2538 2539 if (!mddev->bitmap) 2540 goto out; 2541 /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */ 2542 while (*buf) { 2543 chunk = end_chunk = simple_strtoul(buf, &end, 0); 2544 if (buf == end) break; 2545 if (*end == '-') { /* range */ 2546 buf = end + 1; 2547 end_chunk = simple_strtoul(buf, &end, 0); 2548 if (buf == end) break; 2549 } 2550 if (*end && !isspace(*end)) break; 2551 bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk); 2552 buf = end; 2553 while (isspace(*buf)) buf++; 2554 } 2555 bitmap_unplug(mddev->bitmap); /* flush the bits to disk */ 2556out: 2557 return len; 2558} 2559 2560static struct md_sysfs_entry md_bitmap = 2561__ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store); 2562 2563static ssize_t 2564size_show(mddev_t *mddev, char *page) 2565{ 2566 return sprintf(page, "%llu\n", (unsigned long long)mddev->size); 2567} 2568 2569static int update_size(mddev_t *mddev, unsigned long size); 2570 2571static ssize_t 2572size_store(mddev_t *mddev, const char *buf, size_t len) 2573{ 2574 /* If array is inactive, we can reduce the component size, but 2575 * not increase it (except from 0). 2576 * If array is active, we can try an on-line resize 2577 */ 2578 char *e; 2579 int err = 0; 2580 unsigned long long size = simple_strtoull(buf, &e, 10); 2581 if (!*buf || *buf == '\n' || 2582 (*e && *e != '\n')) 2583 return -EINVAL; 2584 2585 if (mddev->pers) { 2586 err = update_size(mddev, size); 2587 md_update_sb(mddev, 1); 2588 } else { 2589 if (mddev->size == 0 || 2590 mddev->size > size) 2591 mddev->size = size; 2592 else 2593 err = -ENOSPC; 2594 } 2595 return err ? 
err : len; 2596} 2597 2598static struct md_sysfs_entry md_size = 2599__ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store); 2600 2601 2602/* Metdata version. 2603 * This is either 'none' for arrays with externally managed metadata, 2604 * or N.M for internally known formats 2605 */ 2606static ssize_t 2607metadata_show(mddev_t *mddev, char *page) 2608{ 2609 if (mddev->persistent) 2610 return sprintf(page, "%d.%d\n", 2611 mddev->major_version, mddev->minor_version); 2612 else 2613 return sprintf(page, "none\n"); 2614} 2615 2616static ssize_t 2617metadata_store(mddev_t *mddev, const char *buf, size_t len) 2618{ 2619 int major, minor; 2620 char *e; 2621 if (!list_empty(&mddev->disks)) 2622 return -EBUSY; 2623 2624 if (cmd_match(buf, "none")) { 2625 mddev->persistent = 0; 2626 mddev->major_version = 0; 2627 mddev->minor_version = 90; 2628 return len; 2629 } 2630 major = simple_strtoul(buf, &e, 10); 2631 if (e==buf || *e != '.') 2632 return -EINVAL; 2633 buf = e+1; 2634 minor = simple_strtoul(buf, &e, 10); 2635 if (e==buf || (*e && *e != '\n') ) 2636 return -EINVAL; 2637 if (major >= sizeof(super_types)/sizeof(super_types[0]) || 2638 super_types[major].name == NULL) 2639 return -ENOENT; 2640 mddev->major_version = major; 2641 mddev->minor_version = minor; 2642 mddev->persistent = 1; 2643 return len; 2644} 2645 2646static struct md_sysfs_entry md_metadata = 2647__ATTR(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store); 2648 2649static ssize_t 2650action_show(mddev_t *mddev, char *page) 2651{ 2652 char *type = "idle"; 2653 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 2654 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) { 2655 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 2656 type = "reshape"; 2657 else if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 2658 if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 2659 type = "resync"; 2660 else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) 2661 type = "check"; 2662 else 2663 type = "repair"; 2664 } else 2665 type = "recover"; 2666 } 2667 return sprintf(page, "%s\n", type); 2668} 2669 2670static ssize_t 2671action_store(mddev_t *mddev, const char *page, size_t len) 2672{ 2673 if (!mddev->pers || !mddev->pers->sync_request) 2674 return -EINVAL; 2675 2676 if (cmd_match(page, "idle")) { 2677 if (mddev->sync_thread) { 2678 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 2679 md_unregister_thread(mddev->sync_thread); 2680 mddev->sync_thread = NULL; 2681 mddev->recovery = 0; 2682 } 2683 } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 2684 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) 2685 return -EBUSY; 2686 else if (cmd_match(page, "resync") || cmd_match(page, "recover")) 2687 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2688 else if (cmd_match(page, "reshape")) { 2689 int err; 2690 if (mddev->pers->start_reshape == NULL) 2691 return -EINVAL; 2692 err = mddev->pers->start_reshape(mddev); 2693 if (err) 2694 return err; 2695 } else { 2696 if (cmd_match(page, "check")) 2697 set_bit(MD_RECOVERY_CHECK, &mddev->recovery); 2698 else if (!cmd_match(page, "repair")) 2699 return -EINVAL; 2700 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 2701 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 2702 } 2703 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2704 md_wakeup_thread(mddev->thread); 2705 return len; 2706} 2707 2708static ssize_t 2709mismatch_cnt_show(mddev_t *mddev, char *page) 2710{ 2711 return sprintf(page, "%llu\n", 2712 (unsigned long long) mddev->resync_mismatches); 2713} 2714 2715static struct 
md_sysfs_entry md_scan_mode = 2716__ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); 2717 2718 2719static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt); 2720 2721static ssize_t 2722sync_min_show(mddev_t *mddev, char *page) 2723{ 2724 return sprintf(page, "%d (%s)\n", speed_min(mddev), 2725 mddev->sync_speed_min ? "local": "system"); 2726} 2727 2728static ssize_t 2729sync_min_store(mddev_t *mddev, const char *buf, size_t len) 2730{ 2731 int min; 2732 char *e; 2733 if (strncmp(buf, "system", 6)==0) { 2734 mddev->sync_speed_min = 0; 2735 return len; 2736 } 2737 min = simple_strtoul(buf, &e, 10); 2738 if (buf == e || (*e && *e != '\n') || min <= 0) 2739 return -EINVAL; 2740 mddev->sync_speed_min = min; 2741 return len; 2742} 2743 2744static struct md_sysfs_entry md_sync_min = 2745__ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store); 2746 2747static ssize_t 2748sync_max_show(mddev_t *mddev, char *page) 2749{ 2750 return sprintf(page, "%d (%s)\n", speed_max(mddev), 2751 mddev->sync_speed_max ? "local": "system"); 2752} 2753 2754static ssize_t 2755sync_max_store(mddev_t *mddev, const char *buf, size_t len) 2756{ 2757 int max; 2758 char *e; 2759 if (strncmp(buf, "system", 6)==0) { 2760 mddev->sync_speed_max = 0; 2761 return len; 2762 } 2763 max = simple_strtoul(buf, &e, 10); 2764 if (buf == e || (*e && *e != '\n') || max <= 0) 2765 return -EINVAL; 2766 mddev->sync_speed_max = max; 2767 return len; 2768} 2769 2770static struct md_sysfs_entry md_sync_max = 2771__ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store); 2772 2773 2774static ssize_t 2775sync_speed_show(mddev_t *mddev, char *page) 2776{ 2777 unsigned long resync, dt, db; 2778 resync = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active)); 2779 dt = ((jiffies - mddev->resync_mark) / HZ); 2780 if (!dt) dt++; 2781 db = resync - (mddev->resync_mark_cnt); 2782 return sprintf(page, "%ld\n", db/dt/2); /* K/sec */ 2783} 2784 2785static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed); 2786 2787static ssize_t 2788sync_completed_show(mddev_t *mddev, char *page) 2789{ 2790 unsigned long max_blocks, resync; 2791 2792 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 2793 max_blocks = mddev->resync_max_sectors; 2794 else 2795 max_blocks = mddev->size << 1; 2796 2797 resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active)); 2798 return sprintf(page, "%lu / %lu\n", resync, max_blocks); 2799} 2800 2801static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed); 2802 2803static ssize_t 2804suspend_lo_show(mddev_t *mddev, char *page) 2805{ 2806 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo); 2807} 2808 2809static ssize_t 2810suspend_lo_store(mddev_t *mddev, const char *buf, size_t len) 2811{ 2812 char *e; 2813 unsigned long long new = simple_strtoull(buf, &e, 10); 2814 2815 if (mddev->pers->quiesce == NULL) 2816 return -EINVAL; 2817 if (buf == e || (*e && *e != '\n')) 2818 return -EINVAL; 2819 if (new >= mddev->suspend_hi || 2820 (new > mddev->suspend_lo && new < mddev->suspend_hi)) { 2821 mddev->suspend_lo = new; 2822 mddev->pers->quiesce(mddev, 2); 2823 return len; 2824 } else 2825 return -EINVAL; 2826} 2827static struct md_sysfs_entry md_suspend_lo = 2828__ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); 2829 2830 2831static ssize_t 2832suspend_hi_show(mddev_t *mddev, char *page) 2833{ 2834 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi); 2835} 2836 2837static ssize_t 
2838suspend_hi_store(mddev_t *mddev, const char *buf, size_t len) 2839{ 2840 char *e; 2841 unsigned long long new = simple_strtoull(buf, &e, 10); 2842 2843 if (mddev->pers->quiesce == NULL) 2844 return -EINVAL; 2845 if (buf == e || (*e && *e != '\n')) 2846 return -EINVAL; 2847 if ((new <= mddev->suspend_lo && mddev->suspend_lo >= mddev->suspend_hi) || 2848 (new > mddev->suspend_lo && new > mddev->suspend_hi)) { 2849 mddev->suspend_hi = new; 2850 mddev->pers->quiesce(mddev, 1); 2851 mddev->pers->quiesce(mddev, 0); 2852 return len; 2853 } else 2854 return -EINVAL; 2855} 2856static struct md_sysfs_entry md_suspend_hi = 2857__ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); 2858 2859 2860static struct attribute *md_default_attrs[] = { 2861 &md_level.attr, 2862 &md_layout.attr, 2863 &md_raid_disks.attr, 2864 &md_chunk_size.attr, 2865 &md_size.attr, 2866 &md_resync_start.attr, 2867 &md_metadata.attr, 2868 &md_new_device.attr, 2869 &md_safe_delay.attr, 2870 &md_array_state.attr, 2871 NULL, 2872}; 2873 2874static struct attribute *md_redundancy_attrs[] = { 2875 &md_scan_mode.attr, 2876 &md_mismatches.attr, 2877 &md_sync_min.attr, 2878 &md_sync_max.attr, 2879 &md_sync_speed.attr, 2880 &md_sync_completed.attr, 2881 &md_suspend_lo.attr, 2882 &md_suspend_hi.attr, 2883 &md_bitmap.attr, 2884 NULL, 2885}; 2886static struct attribute_group md_redundancy_group = { 2887 .name = NULL, 2888 .attrs = md_redundancy_attrs, 2889}; 2890 2891 2892static ssize_t 2893md_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 2894{ 2895 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 2896 mddev_t *mddev = container_of(kobj, struct mddev_s, kobj); 2897 ssize_t rv; 2898 2899 if (!entry->show) 2900 return -EIO; 2901 rv = mddev_lock(mddev); 2902 if (!rv) { 2903 rv = entry->show(mddev, page); 2904 mddev_unlock(mddev); 2905 } 2906 return rv; 2907} 2908 2909static ssize_t 2910md_attr_store(struct kobject *kobj, struct attribute *attr, 2911 const char *page, size_t length) 2912{ 2913 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 2914 mddev_t *mddev = container_of(kobj, struct mddev_s, kobj); 2915 ssize_t rv; 2916 2917 if (!entry->store) 2918 return -EIO; 2919 if (!capable(CAP_SYS_ADMIN)) 2920 return -EACCES; 2921 rv = mddev_lock(mddev); 2922 if (!rv) { 2923 rv = entry->store(mddev, page, length); 2924 mddev_unlock(mddev); 2925 } 2926 return rv; 2927} 2928 2929static void md_free(struct kobject *ko) 2930{ 2931 mddev_t *mddev = container_of(ko, mddev_t, kobj); 2932 kfree(mddev); 2933} 2934 2935static struct sysfs_ops md_sysfs_ops = { 2936 .show = md_attr_show, 2937 .store = md_attr_store, 2938}; 2939static struct kobj_type md_ktype = { 2940 .release = md_free, 2941 .sysfs_ops = &md_sysfs_ops, 2942 .default_attrs = md_default_attrs, 2943}; 2944 2945int mdp_major = 0; 2946 2947static struct kobject *md_probe(dev_t dev, int *part, void *data) 2948{ 2949 static DEFINE_MUTEX(disks_mutex); 2950 mddev_t *mddev = mddev_find(dev); 2951 struct gendisk *disk; 2952 int partitioned = (MAJOR(dev) != MD_MAJOR); 2953 int shift = partitioned ? 
MdpMinorShift : 0; 2954 int unit = MINOR(dev) >> shift; 2955 2956 if (!mddev) 2957 return NULL; 2958 2959 mutex_lock(&disks_mutex); 2960 if (mddev->gendisk) { 2961 mutex_unlock(&disks_mutex); 2962 mddev_put(mddev); 2963 return NULL; 2964 } 2965 disk = alloc_disk(1 << shift); 2966 if (!disk) { 2967 mutex_unlock(&disks_mutex); 2968 mddev_put(mddev); 2969 return NULL; 2970 } 2971 disk->major = MAJOR(dev); 2972 disk->first_minor = unit << shift; 2973 if (partitioned) 2974 sprintf(disk->disk_name, "md_d%d", unit); 2975 else 2976 sprintf(disk->disk_name, "md%d", unit); 2977 disk->fops = &md_fops; 2978 disk->private_data = mddev; 2979 disk->queue = mddev->queue; 2980 add_disk(disk); 2981 mddev->gendisk = disk; 2982 mutex_unlock(&disks_mutex); 2983 mddev->kobj.parent = &disk->kobj; 2984 mddev->kobj.k_name = NULL; 2985 snprintf(mddev->kobj.name, KOBJ_NAME_LEN, "%s", "md"); 2986 mddev->kobj.ktype = &md_ktype; 2987 kobject_register(&mddev->kobj); 2988 return NULL; 2989} 2990 2991static void md_safemode_timeout(unsigned long data) 2992{ 2993 mddev_t *mddev = (mddev_t *) data; 2994 2995 mddev->safemode = 1; 2996 md_wakeup_thread(mddev->thread); 2997} 2998 2999static int start_dirty_degraded; 3000 3001static int do_md_run(mddev_t * mddev) 3002{ 3003 int err; 3004 int chunk_size; 3005 struct list_head *tmp; 3006 mdk_rdev_t *rdev; 3007 struct gendisk *disk; 3008 struct mdk_personality *pers; 3009 char b[BDEVNAME_SIZE]; 3010 3011 if (list_empty(&mddev->disks)) 3012 /* cannot run an array with no devices.. */ 3013 return -EINVAL; 3014 3015 if (mddev->pers) 3016 return -EBUSY; 3017 3018 /* 3019 * Analyze all RAID superblock(s) 3020 */ 3021 if (!mddev->raid_disks) 3022 analyze_sbs(mddev); 3023 3024 chunk_size = mddev->chunk_size; 3025 3026 if (chunk_size) { 3027 if (chunk_size > MAX_CHUNK_SIZE) { 3028 printk(KERN_ERR "too big chunk_size: %d > %d\n", 3029 chunk_size, MAX_CHUNK_SIZE); 3030 return -EINVAL; 3031 } 3032 /* 3033 * chunk-size has to be a power of 2 and multiples of PAGE_SIZE 3034 */ 3035 if ( (1 << ffz(~chunk_size)) != chunk_size) { 3036 printk(KERN_ERR "chunk_size of %d not valid\n", chunk_size); 3037 return -EINVAL; 3038 } 3039 if (chunk_size < PAGE_SIZE) { 3040 printk(KERN_ERR "too small chunk_size: %d < %ld\n", 3041 chunk_size, PAGE_SIZE); 3042 return -EINVAL; 3043 } 3044 3045 /* devices must have minimum size of one chunk */ 3046 ITERATE_RDEV(mddev,rdev,tmp) { 3047 if (test_bit(Faulty, &rdev->flags)) 3048 continue; 3049 if (rdev->size < chunk_size / 1024) { 3050 printk(KERN_WARNING 3051 "md: Dev %s smaller than chunk_size:" 3052 " %lluk < %dk\n", 3053 bdevname(rdev->bdev,b), 3054 (unsigned long long)rdev->size, 3055 chunk_size / 1024); 3056 return -EINVAL; 3057 } 3058 } 3059 } 3060 3061#ifdef CONFIG_KMOD 3062 if (mddev->level != LEVEL_NONE) 3063 request_module("md-level-%d", mddev->level); 3064 else if (mddev->clevel[0]) 3065 request_module("md-%s", mddev->clevel); 3066#endif 3067 3068 /* 3069 * Drop all container device buffers, from now on 3070 * the only valid external interface is through the md 3071 * device. 
3072 * Also find largest hardsector size 3073 */ 3074 ITERATE_RDEV(mddev,rdev,tmp) { 3075 if (test_bit(Faulty, &rdev->flags)) 3076 continue; 3077 sync_blockdev(rdev->bdev); 3078 invalidate_bdev(rdev->bdev, 0); 3079 } 3080 3081 md_probe(mddev->unit, NULL, NULL); 3082 disk = mddev->gendisk; 3083 if (!disk) 3084 return -ENOMEM; 3085 3086 spin_lock(&pers_lock); 3087 pers = find_pers(mddev->level, mddev->clevel); 3088 if (!pers || !try_module_get(pers->owner)) { 3089 spin_unlock(&pers_lock); 3090 if (mddev->level != LEVEL_NONE) 3091 printk(KERN_WARNING "md: personality for level %d is not loaded!\n", 3092 mddev->level); 3093 else 3094 printk(KERN_WARNING "md: personality for level %s is not loaded!\n", 3095 mddev->clevel); 3096 return -EINVAL; 3097 } 3098 mddev->pers = pers; 3099 spin_unlock(&pers_lock); 3100 mddev->level = pers->level; 3101 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); 3102 3103 if (mddev->reshape_position != MaxSector && 3104 pers->start_reshape == NULL) { 3105 /* This personality cannot handle reshaping... */ 3106 mddev->pers = NULL; 3107 module_put(pers->owner); 3108 return -EINVAL; 3109 } 3110 3111 mddev->recovery = 0; 3112 mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */ 3113 mddev->barriers_work = 1; 3114 mddev->ok_start_degraded = start_dirty_degraded; 3115 3116 if (start_readonly) 3117 mddev->ro = 2; /* read-only, but switch on first write */ 3118 3119 err = mddev->pers->run(mddev); 3120 if (!err && mddev->pers->sync_request) { 3121 err = bitmap_create(mddev); 3122 if (err) { 3123 printk(KERN_ERR "%s: failed to create bitmap (%d)\n", 3124 mdname(mddev), err); 3125 mddev->pers->stop(mddev); 3126 } 3127 } 3128 if (err) { 3129 printk(KERN_ERR "md: pers->run() failed ...\n"); 3130 module_put(mddev->pers->owner); 3131 mddev->pers = NULL; 3132 bitmap_destroy(mddev); 3133 return err; 3134 } 3135 if (mddev->pers->sync_request) 3136 sysfs_create_group(&mddev->kobj, &md_redundancy_group); 3137 else if (mddev->ro == 2) /* auto-readonly not meaningful */ 3138 mddev->ro = 0; 3139 3140 atomic_set(&mddev->writes_pending,0); 3141 mddev->safemode = 0; 3142 mddev->safemode_timer.function = md_safemode_timeout; 3143 mddev->safemode_timer.data = (unsigned long) mddev; 3144 mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ 3145 mddev->in_sync = 1; 3146 3147 ITERATE_RDEV(mddev,rdev,tmp) 3148 if (rdev->raid_disk >= 0) { 3149 char nm[20]; 3150 sprintf(nm, "rd%d", rdev->raid_disk); 3151 sysfs_create_link(&mddev->kobj, &rdev->kobj, nm); 3152 } 3153 3154 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3155 3156 if (mddev->flags) 3157 md_update_sb(mddev, 0); 3158 3159 set_capacity(disk, mddev->array_size<<1); 3160 3161 /* If we call blk_queue_make_request here, it will 3162 * re-initialise max_sectors etc which may have been 3163 * refined inside -> run. So just set the bits we need to set. 3164 * Most initialisation happended when we called 3165 * blk_queue_make_request(..., md_fail_request) 3166 * earlier. 3167 */ 3168 mddev->queue->queuedata = mddev; 3169 mddev->queue->make_request_fn = mddev->pers->make_request; 3170 3171 /* If there is a partially-recovered drive we need to 3172 * start recovery here. 
If we leave it to md_check_recovery, 3173 * it will remove the drives and not do the right thing 3174 */ 3175 if (mddev->degraded && !mddev->sync_thread) { 3176 struct list_head *rtmp; 3177 int spares = 0; 3178 ITERATE_RDEV(mddev,rdev,rtmp) 3179 if (rdev->raid_disk >= 0 && 3180 !test_bit(In_sync, &rdev->flags) && 3181 !test_bit(Faulty, &rdev->flags)) 3182 /* complete an interrupted recovery */ 3183 spares++; 3184 if (spares && mddev->pers->sync_request) { 3185 mddev->recovery = 0; 3186 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 3187 mddev->sync_thread = md_register_thread(md_do_sync, 3188 mddev, 3189 "%s_resync"); 3190 if (!mddev->sync_thread) { 3191 printk(KERN_ERR "%s: could not start resync" 3192 " thread...\n", 3193 mdname(mddev)); 3194 /* leave the spares where they are, it shouldn't hurt */ 3195 mddev->recovery = 0; 3196 } 3197 } 3198 } 3199 md_wakeup_thread(mddev->thread); 3200 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ 3201 3202 mddev->changed = 1; 3203 md_new_event(mddev); 3204 kobject_uevent(&mddev->gendisk->kobj, KOBJ_CHANGE); 3205 return 0; 3206} 3207 3208static int restart_array(mddev_t *mddev) 3209{ 3210 struct gendisk *disk = mddev->gendisk; 3211 int err; 3212 3213 /* 3214 * Complain if it has no devices 3215 */ 3216 err = -ENXIO; 3217 if (list_empty(&mddev->disks)) 3218 goto out; 3219 3220 if (mddev->pers) { 3221 err = -EBUSY; 3222 if (!mddev->ro) 3223 goto out; 3224 3225 mddev->safemode = 0; 3226 mddev->ro = 0; 3227 set_disk_ro(disk, 0); 3228 3229 printk(KERN_INFO "md: %s switched to read-write mode.\n", 3230 mdname(mddev)); 3231 /* 3232 * Kick recovery or resync if necessary 3233 */ 3234 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3235 md_wakeup_thread(mddev->thread); 3236 md_wakeup_thread(mddev->sync_thread); 3237 err = 0; 3238 } else 3239 err = -EINVAL; 3240 3241out: 3242 return err; 3243} 3244 3245/* similar to deny_write_access, but accounts for our holding a reference 3246 * to the file ourselves */ 3247static int deny_bitmap_write_access(struct file * file) 3248{ 3249 struct inode *inode = file->f_mapping->host; 3250 3251 spin_lock(&inode->i_lock); 3252 if (atomic_read(&inode->i_writecount) > 1) { 3253 spin_unlock(&inode->i_lock); 3254 return -ETXTBSY; 3255 } 3256 atomic_set(&inode->i_writecount, -1); 3257 spin_unlock(&inode->i_lock); 3258 3259 return 0; 3260} 3261 3262static void restore_bitmap_write_access(struct file *file) 3263{ 3264 struct inode *inode = file->f_mapping->host; 3265 3266 spin_lock(&inode->i_lock); 3267 atomic_set(&inode->i_writecount, 1); 3268 spin_unlock(&inode->i_lock); 3269} 3270 3271/* mode: 3272 * 0 - completely stop and dis-assemble array 3273 * 1 - switch to readonly 3274 * 2 - stop but do not disassemble array 3275 */ 3276static int do_md_stop(mddev_t * mddev, int mode) 3277{ 3278 int err = 0; 3279 struct gendisk *disk = mddev->gendisk; 3280 3281 if (mddev->pers) { 3282 if (atomic_read(&mddev->active)>2) { 3283 printk("md: %s still in use.\n",mdname(mddev)); 3284 return -EBUSY; 3285 } 3286 3287 if (mddev->sync_thread) { 3288 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 3289 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 3290 md_unregister_thread(mddev->sync_thread); 3291 mddev->sync_thread = NULL; 3292 } 3293 3294 del_timer_sync(&mddev->safemode_timer); 3295 3296 invalidate_partition(disk, 0); 3297 3298 switch(mode) { 3299 case 1: /* readonly */ 3300 err = -ENXIO; 3301 if (mddev->ro==1) 3302 goto out; 3303 mddev->ro = 1; 3304 break; 3305 case 0: /* disassemble */ 3306 case 2: /* stop */ 3307 
bitmap_flush(mddev); 3308 md_super_wait(mddev); 3309 if (mddev->ro) 3310 set_disk_ro(disk, 0); 3311 blk_queue_make_request(mddev->queue, md_fail_request); 3312 mddev->pers->stop(mddev); 3313 if (mddev->pers->sync_request) 3314 sysfs_remove_group(&mddev->kobj, &md_redundancy_group); 3315 3316 module_put(mddev->pers->owner); 3317 mddev->pers = NULL; 3318 3319 set_capacity(disk, 0); 3320 mddev->changed = 1; 3321 3322 if (mddev->ro) 3323 mddev->ro = 0; 3324 } 3325 if (!mddev->in_sync || mddev->flags) { 3326 /* mark array as shutdown cleanly */ 3327 mddev->in_sync = 1; 3328 md_update_sb(mddev, 1); 3329 } 3330 if (mode == 1) 3331 set_disk_ro(disk, 1); 3332 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 3333 } 3334 3335 /* 3336 * Free resources if final stop 3337 */ 3338 if (mode == 0) { 3339 mdk_rdev_t *rdev; 3340 struct list_head *tmp; 3341 3342 printk(KERN_INFO "md: %s stopped.\n", mdname(mddev)); 3343 3344 bitmap_destroy(mddev); 3345 if (mddev->bitmap_file) { 3346 restore_bitmap_write_access(mddev->bitmap_file); 3347 fput(mddev->bitmap_file); 3348 mddev->bitmap_file = NULL; 3349 } 3350 mddev->bitmap_offset = 0; 3351 3352 ITERATE_RDEV(mddev,rdev,tmp) 3353 if (rdev->raid_disk >= 0) { 3354 char nm[20]; 3355 sprintf(nm, "rd%d", rdev->raid_disk); 3356 sysfs_remove_link(&mddev->kobj, nm); 3357 } 3358 3359 export_array(mddev); 3360 3361 mddev->array_size = 0; 3362 mddev->size = 0; 3363 mddev->raid_disks = 0; 3364 mddev->recovery_cp = 0; 3365 3366 } else if (mddev->pers) 3367 printk(KERN_INFO "md: %s switched to read-only mode.\n", 3368 mdname(mddev)); 3369 err = 0; 3370 md_new_event(mddev); 3371out: 3372 return err; 3373} 3374 3375#ifndef MODULE 3376static void autorun_array(mddev_t *mddev) 3377{ 3378 mdk_rdev_t *rdev; 3379 struct list_head *tmp; 3380 int err; 3381 3382 if (list_empty(&mddev->disks)) 3383 return; 3384 3385 printk(KERN_INFO "md: running: "); 3386 3387 ITERATE_RDEV(mddev,rdev,tmp) { 3388 char b[BDEVNAME_SIZE]; 3389 printk("<%s>", bdevname(rdev->bdev,b)); 3390 } 3391 printk("\n"); 3392 3393 err = do_md_run (mddev); 3394 if (err) { 3395 printk(KERN_WARNING "md: do_md_run() returned %d\n", err); 3396 do_md_stop (mddev, 0); 3397 } 3398} 3399 3400/* 3401 * lets try to run arrays based on all disks that have arrived 3402 * until now. (those are in pending_raid_disks) 3403 * 3404 * the method: pick the first pending disk, collect all disks with 3405 * the same UUID, remove all from the pending list and put them into 3406 * the 'same_array' list. Then order this list based on superblock 3407 * update time (freshest comes first), kick out 'old' disks and 3408 * compare superblocks. If everything's fine then run it. 
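 * Disks that fail to bind to the new array, and any candidates left over
 * after a failure, are simply exported again rather than kept on a list.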
3409 * 3410 * If "unit" is allocated, then bump its reference count 3411 */ 3412static void autorun_devices(int part) 3413{ 3414 struct list_head *tmp; 3415 mdk_rdev_t *rdev0, *rdev; 3416 mddev_t *mddev; 3417 char b[BDEVNAME_SIZE]; 3418 3419 printk(KERN_INFO "md: autorun ...\n"); 3420 while (!list_empty(&pending_raid_disks)) { 3421 int unit; 3422 dev_t dev; 3423 LIST_HEAD(candidates); 3424 rdev0 = list_entry(pending_raid_disks.next, 3425 mdk_rdev_t, same_set); 3426 3427 printk(KERN_INFO "md: considering %s ...\n", 3428 bdevname(rdev0->bdev,b)); 3429 INIT_LIST_HEAD(&candidates); 3430 ITERATE_RDEV_PENDING(rdev,tmp) 3431 if (super_90_load(rdev, rdev0, 0) >= 0) { 3432 printk(KERN_INFO "md: adding %s ...\n", 3433 bdevname(rdev->bdev,b)); 3434 list_move(&rdev->same_set, &candidates); 3435 } 3436 /* 3437 * now we have a set of devices, with all of them having 3438 * mostly sane superblocks. It's time to allocate the 3439 * mddev. 3440 */ 3441 if (part) { 3442 dev = MKDEV(mdp_major, 3443 rdev0->preferred_minor << MdpMinorShift); 3444 unit = MINOR(dev) >> MdpMinorShift; 3445 } else { 3446 dev = MKDEV(MD_MAJOR, rdev0->preferred_minor); 3447 unit = MINOR(dev); 3448 } 3449 if (rdev0->preferred_minor != unit) { 3450 printk(KERN_INFO "md: unit number in %s is bad: %d\n", 3451 bdevname(rdev0->bdev, b), rdev0->preferred_minor); 3452 break; 3453 } 3454 3455 md_probe(dev, NULL, NULL); 3456 mddev = mddev_find(dev); 3457 if (!mddev) { 3458 printk(KERN_ERR 3459 "md: cannot allocate memory for md drive.\n"); 3460 break; 3461 } 3462 if (mddev_lock(mddev)) 3463 printk(KERN_WARNING "md: %s locked, cannot run\n", 3464 mdname(mddev)); 3465 else if (mddev->raid_disks || mddev->major_version 3466 || !list_empty(&mddev->disks)) { 3467 printk(KERN_WARNING 3468 "md: %s already running, cannot run %s\n", 3469 mdname(mddev), bdevname(rdev0->bdev,b)); 3470 mddev_unlock(mddev); 3471 } else { 3472 printk(KERN_INFO "md: created %s\n", mdname(mddev)); 3473 ITERATE_RDEV_GENERIC(candidates,rdev,tmp) { 3474 list_del_init(&rdev->same_set); 3475 if (bind_rdev_to_array(rdev, mddev)) 3476 export_rdev(rdev); 3477 } 3478 autorun_array(mddev); 3479 mddev_unlock(mddev); 3480 } 3481 /* on success, candidates will be empty, on error 3482 * it won't... 3483 */ 3484 ITERATE_RDEV_GENERIC(candidates,rdev,tmp) 3485 export_rdev(rdev); 3486 mddev_put(mddev); 3487 } 3488 printk(KERN_INFO "md: ... 
autorun DONE.\n"); 3489} 3490#endif /* !MODULE */ 3491 3492static int get_version(void __user * arg) 3493{ 3494 mdu_version_t ver; 3495 3496 ver.major = MD_MAJOR_VERSION; 3497 ver.minor = MD_MINOR_VERSION; 3498 ver.patchlevel = MD_PATCHLEVEL_VERSION; 3499 3500 if (copy_to_user(arg, &ver, sizeof(ver))) 3501 return -EFAULT; 3502 3503 return 0; 3504} 3505 3506static int get_array_info(mddev_t * mddev, void __user * arg) 3507{ 3508 mdu_array_info_t info; 3509 int nr,working,active,failed,spare; 3510 mdk_rdev_t *rdev; 3511 struct list_head *tmp; 3512 3513 nr=working=active=failed=spare=0; 3514 ITERATE_RDEV(mddev,rdev,tmp) { 3515 nr++; 3516 if (test_bit(Faulty, &rdev->flags)) 3517 failed++; 3518 else { 3519 working++; 3520 if (test_bit(In_sync, &rdev->flags)) 3521 active++; 3522 else 3523 spare++; 3524 } 3525 } 3526 3527 info.major_version = mddev->major_version; 3528 info.minor_version = mddev->minor_version; 3529 info.patch_version = MD_PATCHLEVEL_VERSION; 3530 info.ctime = mddev->ctime; 3531 info.level = mddev->level; 3532 info.size = mddev->size; 3533 if (info.size != mddev->size) /* overflow */ 3534 info.size = -1; 3535 info.nr_disks = nr; 3536 info.raid_disks = mddev->raid_disks; 3537 info.md_minor = mddev->md_minor; 3538 info.not_persistent= !mddev->persistent; 3539 3540 info.utime = mddev->utime; 3541 info.state = 0; 3542 if (mddev->in_sync) 3543 info.state = (1<<MD_SB_CLEAN); 3544 if (mddev->bitmap && mddev->bitmap_offset) 3545 info.state = (1<<MD_SB_BITMAP_PRESENT); 3546 info.active_disks = active; 3547 info.working_disks = working; 3548 info.failed_disks = failed; 3549 info.spare_disks = spare; 3550 3551 info.layout = mddev->layout; 3552 info.chunk_size = mddev->chunk_size; 3553 3554 if (copy_to_user(arg, &info, sizeof(info))) 3555 return -EFAULT; 3556 3557 return 0; 3558} 3559 3560static int get_bitmap_file(mddev_t * mddev, void __user * arg) 3561{ 3562 mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ 3563 char *ptr, *buf = NULL; 3564 int err = -ENOMEM; 3565 3566 file = kmalloc(sizeof(*file), GFP_KERNEL); 3567 if (!file) 3568 goto out; 3569 3570 /* bitmap disabled, zero the first byte and copy out */ 3571 if (!mddev->bitmap || !mddev->bitmap->file) { 3572 file->pathname[0] = '\0'; 3573 goto copy_out; 3574 } 3575 3576 buf = kmalloc(sizeof(file->pathname), GFP_KERNEL); 3577 if (!buf) 3578 goto out; 3579 3580 ptr = file_path(mddev->bitmap->file, buf, sizeof(file->pathname)); 3581 if (!ptr) 3582 goto out; 3583 3584 strcpy(file->pathname, ptr); 3585 3586copy_out: 3587 err = 0; 3588 if (copy_to_user(arg, file, sizeof(*file))) 3589 err = -EFAULT; 3590out: 3591 kfree(buf); 3592 kfree(file); 3593 return err; 3594} 3595 3596static int get_disk_info(mddev_t * mddev, void __user * arg) 3597{ 3598 mdu_disk_info_t info; 3599 unsigned int nr; 3600 mdk_rdev_t *rdev; 3601 3602 if (copy_from_user(&info, arg, sizeof(info))) 3603 return -EFAULT; 3604 3605 nr = info.number; 3606 3607 rdev = find_rdev_nr(mddev, nr); 3608 if (rdev) { 3609 info.major = MAJOR(rdev->bdev->bd_dev); 3610 info.minor = MINOR(rdev->bdev->bd_dev); 3611 info.raid_disk = rdev->raid_disk; 3612 info.state = 0; 3613 if (test_bit(Faulty, &rdev->flags)) 3614 info.state |= (1<<MD_DISK_FAULTY); 3615 else if (test_bit(In_sync, &rdev->flags)) { 3616 info.state |= (1<<MD_DISK_ACTIVE); 3617 info.state |= (1<<MD_DISK_SYNC); 3618 } 3619 if (test_bit(WriteMostly, &rdev->flags)) 3620 info.state |= (1<<MD_DISK_WRITEMOSTLY); 3621 } else { 3622 info.major = info.minor = 0; 3623 info.raid_disk = -1; 3624 info.state = 
(1<<MD_DISK_REMOVED); 3625 } 3626 3627 if (copy_to_user(arg, &info, sizeof(info))) 3628 return -EFAULT; 3629 3630 return 0; 3631} 3632 3633static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) 3634{ 3635 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 3636 mdk_rdev_t *rdev; 3637 dev_t dev = MKDEV(info->major,info->minor); 3638 3639 if (info->major != MAJOR(dev) || info->minor != MINOR(dev)) 3640 return -EOVERFLOW; 3641 3642 if (!mddev->raid_disks) { 3643 int err; 3644 /* expecting a device which has a superblock */ 3645 rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); 3646 if (IS_ERR(rdev)) { 3647 printk(KERN_WARNING 3648 "md: md_import_device returned %ld\n", 3649 PTR_ERR(rdev)); 3650 return PTR_ERR(rdev); 3651 } 3652 if (!list_empty(&mddev->disks)) { 3653 mdk_rdev_t *rdev0 = list_entry(mddev->disks.next, 3654 mdk_rdev_t, same_set); 3655 int err = super_types[mddev->major_version] 3656 .load_super(rdev, rdev0, mddev->minor_version); 3657 if (err < 0) { 3658 printk(KERN_WARNING 3659 "md: %s has different UUID to %s\n", 3660 bdevname(rdev->bdev,b), 3661 bdevname(rdev0->bdev,b2)); 3662 export_rdev(rdev); 3663 return -EINVAL; 3664 } 3665 } 3666 err = bind_rdev_to_array(rdev, mddev); 3667 if (err) 3668 export_rdev(rdev); 3669 return err; 3670 } 3671 3672 /* 3673 * add_new_disk can be used once the array is assembled 3674 * to add "hot spares". They must already have a superblock 3675 * written 3676 */ 3677 if (mddev->pers) { 3678 int err; 3679 if (!mddev->pers->hot_add_disk) { 3680 printk(KERN_WARNING 3681 "%s: personality does not support diskops!\n", 3682 mdname(mddev)); 3683 return -EINVAL; 3684 } 3685 if (mddev->persistent) 3686 rdev = md_import_device(dev, mddev->major_version, 3687 mddev->minor_version); 3688 else 3689 rdev = md_import_device(dev, -1, -1); 3690 if (IS_ERR(rdev)) { 3691 printk(KERN_WARNING 3692 "md: md_import_device returned %ld\n", 3693 PTR_ERR(rdev)); 3694 return PTR_ERR(rdev); 3695 } 3696 /* set save_raid_disk if appropriate */ 3697 if (!mddev->persistent) { 3698 if (info->state & (1<<MD_DISK_SYNC) && 3699 info->raid_disk < mddev->raid_disks) 3700 rdev->raid_disk = info->raid_disk; 3701 else 3702 rdev->raid_disk = -1; 3703 } else 3704 super_types[mddev->major_version]. 3705 validate_super(mddev, rdev); 3706 rdev->saved_raid_disk = rdev->raid_disk; 3707 3708 clear_bit(In_sync, &rdev->flags); /* just to be sure */ 3709 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 3710 set_bit(WriteMostly, &rdev->flags); 3711 3712 rdev->raid_disk = -1; 3713 err = bind_rdev_to_array(rdev, mddev); 3714 if (!err && !mddev->pers->hot_remove_disk) { 3715 /* If there is hot_add_disk but no hot_remove_disk 3716 * then added disks for geometry changes, 3717 * and should be added immediately. 3718 */ 3719 super_types[mddev->major_version]. 
3720 validate_super(mddev, rdev); 3721 err = mddev->pers->hot_add_disk(mddev, rdev); 3722 if (err) 3723 unbind_rdev_from_array(rdev); 3724 } 3725 if (err) 3726 export_rdev(rdev); 3727 3728 md_update_sb(mddev, 1); 3729 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3730 md_wakeup_thread(mddev->thread); 3731 return err; 3732 } 3733 3734 /* otherwise, add_new_disk is only allowed 3735 * for major_version==0 superblocks 3736 */ 3737 if (mddev->major_version != 0) { 3738 printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n", 3739 mdname(mddev)); 3740 return -EINVAL; 3741 } 3742 3743 if (!(info->state & (1<<MD_DISK_FAULTY))) { 3744 int err; 3745 rdev = md_import_device (dev, -1, 0); 3746 if (IS_ERR(rdev)) { 3747 printk(KERN_WARNING 3748 "md: error, md_import_device() returned %ld\n", 3749 PTR_ERR(rdev)); 3750 return PTR_ERR(rdev); 3751 } 3752 rdev->desc_nr = info->number; 3753 if (info->raid_disk < mddev->raid_disks) 3754 rdev->raid_disk = info->raid_disk; 3755 else 3756 rdev->raid_disk = -1; 3757 3758 rdev->flags = 0; 3759 3760 if (rdev->raid_disk < mddev->raid_disks) 3761 if (info->state & (1<<MD_DISK_SYNC)) 3762 set_bit(In_sync, &rdev->flags); 3763 3764 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 3765 set_bit(WriteMostly, &rdev->flags); 3766 3767 if (!mddev->persistent) { 3768 printk(KERN_INFO "md: nonpersistent superblock ...\n"); 3769 rdev->sb_offset = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 3770 } else 3771 rdev->sb_offset = calc_dev_sboffset(rdev->bdev); 3772 rdev->size = calc_dev_size(rdev, mddev->chunk_size); 3773 3774 err = bind_rdev_to_array(rdev, mddev); 3775 if (err) { 3776 export_rdev(rdev); 3777 return err; 3778 } 3779 } 3780 3781 return 0; 3782} 3783 3784static int hot_remove_disk(mddev_t * mddev, dev_t dev) 3785{ 3786 char b[BDEVNAME_SIZE]; 3787 mdk_rdev_t *rdev; 3788 3789 if (!mddev->pers) 3790 return -ENODEV; 3791 3792 rdev = find_rdev(mddev, dev); 3793 if (!rdev) 3794 return -ENXIO; 3795 3796 if (rdev->raid_disk >= 0) 3797 goto busy; 3798 3799 kick_rdev_from_array(rdev); 3800 md_update_sb(mddev, 1); 3801 md_new_event(mddev); 3802 3803 return 0; 3804busy: 3805 printk(KERN_WARNING "md: cannot remove active disk %s from %s ... 
\n", 3806 bdevname(rdev->bdev,b), mdname(mddev)); 3807 return -EBUSY; 3808} 3809 3810static int hot_add_disk(mddev_t * mddev, dev_t dev) 3811{ 3812 char b[BDEVNAME_SIZE]; 3813 int err; 3814 unsigned int size; 3815 mdk_rdev_t *rdev; 3816 3817 if (!mddev->pers) 3818 return -ENODEV; 3819 3820 if (mddev->major_version != 0) { 3821 printk(KERN_WARNING "%s: HOT_ADD may only be used with" 3822 " version-0 superblocks.\n", 3823 mdname(mddev)); 3824 return -EINVAL; 3825 } 3826 if (!mddev->pers->hot_add_disk) { 3827 printk(KERN_WARNING 3828 "%s: personality does not support diskops!\n", 3829 mdname(mddev)); 3830 return -EINVAL; 3831 } 3832 3833 rdev = md_import_device (dev, -1, 0); 3834 if (IS_ERR(rdev)) { 3835 printk(KERN_WARNING 3836 "md: error, md_import_device() returned %ld\n", 3837 PTR_ERR(rdev)); 3838 return -EINVAL; 3839 } 3840 3841 if (mddev->persistent) 3842 rdev->sb_offset = calc_dev_sboffset(rdev->bdev); 3843 else 3844 rdev->sb_offset = 3845 rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 3846 3847 size = calc_dev_size(rdev, mddev->chunk_size); 3848 rdev->size = size; 3849 3850 if (test_bit(Faulty, &rdev->flags)) { 3851 printk(KERN_WARNING 3852 "md: can not hot-add faulty %s disk to %s!\n", 3853 bdevname(rdev->bdev,b), mdname(mddev)); 3854 err = -EINVAL; 3855 goto abort_export; 3856 } 3857 clear_bit(In_sync, &rdev->flags); 3858 rdev->desc_nr = -1; 3859 rdev->saved_raid_disk = -1; 3860 err = bind_rdev_to_array(rdev, mddev); 3861 if (err) 3862 goto abort_export; 3863 3864 /* 3865 * The rest should better be atomic, we can have disk failures 3866 * noticed in interrupt contexts ... 3867 */ 3868 3869 if (rdev->desc_nr == mddev->max_disks) { 3870 printk(KERN_WARNING "%s: can not hot-add to full array!\n", 3871 mdname(mddev)); 3872 err = -EBUSY; 3873 goto abort_unbind_export; 3874 } 3875 3876 rdev->raid_disk = -1; 3877 3878 md_update_sb(mddev, 1); 3879 3880 /* 3881 * Kick recovery, maybe this spare has to be added to the 3882 * array immediately. 3883 */ 3884 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3885 md_wakeup_thread(mddev->thread); 3886 md_new_event(mddev); 3887 return 0; 3888 3889abort_unbind_export: 3890 unbind_rdev_from_array(rdev); 3891 3892abort_export: 3893 export_rdev(rdev); 3894 return err; 3895} 3896 3897static int set_bitmap_file(mddev_t *mddev, int fd) 3898{ 3899 int err; 3900 3901 if (mddev->pers) { 3902 if (!mddev->pers->quiesce) 3903 return -EBUSY; 3904 if (mddev->recovery || mddev->sync_thread) 3905 return -EBUSY; 3906 /* we should be able to change the bitmap.. 
*/ 3907 } 3908 3909 3910 if (fd >= 0) { 3911 if (mddev->bitmap) 3912 return -EEXIST; /* cannot add when bitmap is present */ 3913 mddev->bitmap_file = fget(fd); 3914 3915 if (mddev->bitmap_file == NULL) { 3916 printk(KERN_ERR "%s: error: failed to get bitmap file\n", 3917 mdname(mddev)); 3918 return -EBADF; 3919 } 3920 3921 err = deny_bitmap_write_access(mddev->bitmap_file); 3922 if (err) { 3923 printk(KERN_ERR "%s: error: bitmap file is already in use\n", 3924 mdname(mddev)); 3925 fput(mddev->bitmap_file); 3926 mddev->bitmap_file = NULL; 3927 return err; 3928 } 3929 mddev->bitmap_offset = 0; /* file overrides offset */ 3930 } else if (mddev->bitmap == NULL) 3931 return -ENOENT; /* cannot remove what isn't there */ 3932 err = 0; 3933 if (mddev->pers) { 3934 mddev->pers->quiesce(mddev, 1); 3935 if (fd >= 0) 3936 err = bitmap_create(mddev); 3937 if (fd < 0 || err) { 3938 bitmap_destroy(mddev); 3939 fd = -1; /* make sure to put the file */ 3940 } 3941 mddev->pers->quiesce(mddev, 0); 3942 } 3943 if (fd < 0) { 3944 if (mddev->bitmap_file) { 3945 restore_bitmap_write_access(mddev->bitmap_file); 3946 fput(mddev->bitmap_file); 3947 } 3948 mddev->bitmap_file = NULL; 3949 } 3950 3951 return err; 3952} 3953 3954/* 3955 * set_array_info is used two different ways 3956 * The original usage is when creating a new array. 3957 * In this usage, raid_disks is > 0 and it together with 3958 * level, size, not_persistent,layout,chunksize determine the 3959 * shape of the array. 3960 * This will always create an array with a type-0.90.0 superblock. 3961 * The newer usage is when assembling an array. 3962 * In this case raid_disks will be 0, and the major_version field is 3963 * use to determine which style super-blocks are to be found on the devices. 3964 * The minor and patch _version numbers are also kept incase the 3965 * super_block handler wishes to interpret them. 3966 */ 3967static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) 3968{ 3969 3970 if (info->raid_disks == 0) { 3971 /* just setting version number for superblock loading */ 3972 if (info->major_version < 0 || 3973 info->major_version >= sizeof(super_types)/sizeof(super_types[0]) || 3974 super_types[info->major_version].name == NULL) { 3975 /* maybe try to auto-load a module? */ 3976 printk(KERN_INFO 3977 "md: superblock version %d not known\n", 3978 info->major_version); 3979 return -EINVAL; 3980 } 3981 mddev->major_version = info->major_version; 3982 mddev->minor_version = info->minor_version; 3983 mddev->patch_version = info->patch_version; 3984 mddev->persistent = !info->not_persistent; 3985 return 0; 3986 } 3987 mddev->major_version = MD_MAJOR_VERSION; 3988 mddev->minor_version = MD_MINOR_VERSION; 3989 mddev->patch_version = MD_PATCHLEVEL_VERSION; 3990 mddev->ctime = get_seconds(); 3991 3992 mddev->level = info->level; 3993 mddev->clevel[0] = 0; 3994 mddev->size = info->size; 3995 mddev->raid_disks = info->raid_disks; 3996 /* don't set md_minor, it is determined by which /dev/md* was 3997 * openned 3998 */ 3999 if (info->state & (1<<MD_SB_CLEAN)) 4000 mddev->recovery_cp = MaxSector; 4001 else 4002 mddev->recovery_cp = 0; 4003 mddev->persistent = ! 
info->not_persistent; 4004 4005 mddev->layout = info->layout; 4006 mddev->chunk_size = info->chunk_size; 4007 4008 mddev->max_disks = MD_SB_DISKS; 4009 4010 mddev->flags = 0; 4011 set_bit(MD_CHANGE_DEVS, &mddev->flags); 4012 4013 mddev->default_bitmap_offset = MD_SB_BYTES >> 9; 4014 mddev->bitmap_offset = 0; 4015 4016 mddev->reshape_position = MaxSector; 4017 4018 /* 4019 * Generate a 128 bit UUID 4020 */ 4021 get_random_bytes(mddev->uuid, 16); 4022 4023 mddev->new_level = mddev->level; 4024 mddev->new_chunk = mddev->chunk_size; 4025 mddev->new_layout = mddev->layout; 4026 mddev->delta_disks = 0; 4027 4028 return 0; 4029} 4030 4031static int update_size(mddev_t *mddev, unsigned long size) 4032{ 4033 mdk_rdev_t * rdev; 4034 int rv; 4035 struct list_head *tmp; 4036 int fit = (size == 0); 4037 4038 if (mddev->pers->resize == NULL) 4039 return -EINVAL; 4040 /* The "size" is the amount of each device that is used. 4041 * This can only make sense for arrays with redundancy. 4042 * linear and raid0 always use whatever space is available 4043 * We can only consider changing the size if no resync 4044 * or reconstruction is happening, and if the new size 4045 * is acceptable. It must fit before the sb_offset or, 4046 * if that is <data_offset, it must fit before the 4047 * size of each device. 4048 * If size is zero, we find the largest size that fits. 4049 */ 4050 if (mddev->sync_thread) 4051 return -EBUSY; 4052 ITERATE_RDEV(mddev,rdev,tmp) { 4053 sector_t avail; 4054 avail = rdev->size * 2; 4055 4056 if (fit && (size == 0 || size > avail/2)) 4057 size = avail/2; 4058 if (avail < ((sector_t)size << 1)) 4059 return -ENOSPC; 4060 } 4061 rv = mddev->pers->resize(mddev, (sector_t)size *2); 4062 if (!rv) { 4063 struct block_device *bdev; 4064 4065 bdev = bdget_disk(mddev->gendisk, 0); 4066 if (bdev) { 4067 mutex_lock(&bdev->bd_inode->i_mutex); 4068 i_size_write(bdev->bd_inode, (loff_t)mddev->array_size << 10); 4069 mutex_unlock(&bdev->bd_inode->i_mutex); 4070 bdput(bdev); 4071 } 4072 } 4073 return rv; 4074} 4075 4076static int update_raid_disks(mddev_t *mddev, int raid_disks) 4077{ 4078 int rv; 4079 /* change the number of raid disks */ 4080 if (mddev->pers->check_reshape == NULL) 4081 return -EINVAL; 4082 if (raid_disks <= 0 || 4083 raid_disks >= mddev->max_disks) 4084 return -EINVAL; 4085 if (mddev->sync_thread || mddev->reshape_position != MaxSector) 4086 return -EBUSY; 4087 mddev->delta_disks = raid_disks - mddev->raid_disks; 4088 4089 rv = mddev->pers->check_reshape(mddev); 4090 return rv; 4091} 4092 4093 4094/* 4095 * update_array_info is used to change the configuration of an 4096 * on-line array. 4097 * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size 4098 * fields in the info are checked against the array. 4099 * Any differences that cannot be handled will cause an error. 4100 * Normally, only one change can be managed at a time. 
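 * For example, a single call that asks to change both 'layout' and
 * 'raid_disks' is rejected with -EINVAL by the change-count test below,
 * while a call that differs in only one of those fields is passed on to
 * the appropriate helper.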
4101 */ 4102static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) 4103{ 4104 int rv = 0; 4105 int cnt = 0; 4106 int state = 0; 4107 4108 /* calculate expected state,ignoring low bits */ 4109 if (mddev->bitmap && mddev->bitmap_offset) 4110 state |= (1 << MD_SB_BITMAP_PRESENT); 4111 4112 if (mddev->major_version != info->major_version || 4113 mddev->minor_version != info->minor_version || 4114/* mddev->patch_version != info->patch_version || */ 4115 mddev->ctime != info->ctime || 4116 mddev->level != info->level || 4117/* mddev->layout != info->layout || */ 4118 !mddev->persistent != info->not_persistent|| 4119 mddev->chunk_size != info->chunk_size || 4120 /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */ 4121 ((state^info->state) & 0xfffffe00) 4122 ) 4123 return -EINVAL; 4124 /* Check there is only one change */ 4125 if (info->size >= 0 && mddev->size != info->size) cnt++; 4126 if (mddev->raid_disks != info->raid_disks) cnt++; 4127 if (mddev->layout != info->layout) cnt++; 4128 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) cnt++; 4129 if (cnt == 0) return 0; 4130 if (cnt > 1) return -EINVAL; 4131 4132 if (mddev->layout != info->layout) { 4133 /* Change layout 4134 * we don't need to do anything at the md level, the 4135 * personality will take care of it all. 4136 */ 4137 if (mddev->pers->reconfig == NULL) 4138 return -EINVAL; 4139 else 4140 return mddev->pers->reconfig(mddev, info->layout, -1); 4141 } 4142 if (info->size >= 0 && mddev->size != info->size) 4143 rv = update_size(mddev, info->size); 4144 4145 if (mddev->raid_disks != info->raid_disks) 4146 rv = update_raid_disks(mddev, info->raid_disks); 4147 4148 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) { 4149 if (mddev->pers->quiesce == NULL) 4150 return -EINVAL; 4151 if (mddev->recovery || mddev->sync_thread) 4152 return -EBUSY; 4153 if (info->state & (1<<MD_SB_BITMAP_PRESENT)) { 4154 /* add the bitmap */ 4155 if (mddev->bitmap) 4156 return -EEXIST; 4157 if (mddev->default_bitmap_offset == 0) 4158 return -EINVAL; 4159 mddev->bitmap_offset = mddev->default_bitmap_offset; 4160 mddev->pers->quiesce(mddev, 1); 4161 rv = bitmap_create(mddev); 4162 if (rv) 4163 bitmap_destroy(mddev); 4164 mddev->pers->quiesce(mddev, 0); 4165 } else { 4166 /* remove the bitmap */ 4167 if (!mddev->bitmap) 4168 return -ENOENT; 4169 if (mddev->bitmap->file) 4170 return -EINVAL; 4171 mddev->pers->quiesce(mddev, 1); 4172 bitmap_destroy(mddev); 4173 mddev->pers->quiesce(mddev, 0); 4174 mddev->bitmap_offset = 0; 4175 } 4176 } 4177 md_update_sb(mddev, 1); 4178 return rv; 4179} 4180 4181static int set_disk_faulty(mddev_t *mddev, dev_t dev) 4182{ 4183 mdk_rdev_t *rdev; 4184 4185 if (mddev->pers == NULL) 4186 return -ENODEV; 4187 4188 rdev = find_rdev(mddev, dev); 4189 if (!rdev) 4190 return -ENODEV; 4191 4192 md_error(mddev, rdev); 4193 return 0; 4194} 4195 4196static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) 4197{ 4198 mddev_t *mddev = bdev->bd_disk->private_data; 4199 4200 geo->heads = 2; 4201 geo->sectors = 4; 4202 geo->cylinders = get_capacity(mddev->gendisk) / 8; 4203 return 0; 4204} 4205 4206static int md_ioctl(struct inode *inode, struct file *file, 4207 unsigned int cmd, unsigned long arg) 4208{ 4209 int err = 0; 4210 void __user *argp = (void __user *)arg; 4211 mddev_t *mddev = NULL; 4212 4213 if (!capable(CAP_SYS_ADMIN)) 4214 return -EACCES; 4215 4216 /* 4217 * Commands dealing with the RAID driver but not any 4218 * particular array: 4219 */ 4220 switch (cmd) 4221 { 4222 case 
RAID_VERSION: 4223 err = get_version(argp); 4224 goto done; 4225 4226 case PRINT_RAID_DEBUG: 4227 err = 0; 4228 md_print_devices(); 4229 goto done; 4230 4231#ifndef MODULE 4232 case RAID_AUTORUN: 4233 err = 0; 4234 autostart_arrays(arg); 4235 goto done; 4236#endif 4237 default:; 4238 } 4239 4240 /* 4241 * Commands creating/starting a new array: 4242 */ 4243 4244 mddev = inode->i_bdev->bd_disk->private_data; 4245 4246 if (!mddev) { 4247 BUG(); 4248 goto abort; 4249 } 4250 4251 err = mddev_lock(mddev); 4252 if (err) { 4253 printk(KERN_INFO 4254 "md: ioctl lock interrupted, reason %d, cmd %d\n", 4255 err, cmd); 4256 goto abort; 4257 } 4258 4259 switch (cmd) 4260 { 4261 case SET_ARRAY_INFO: 4262 { 4263 mdu_array_info_t info; 4264 if (!arg) 4265 memset(&info, 0, sizeof(info)); 4266 else if (copy_from_user(&info, argp, sizeof(info))) { 4267 err = -EFAULT; 4268 goto abort_unlock; 4269 } 4270 if (mddev->pers) { 4271 err = update_array_info(mddev, &info); 4272 if (err) { 4273 printk(KERN_WARNING "md: couldn't update" 4274 " array info. %d\n", err); 4275 goto abort_unlock; 4276 } 4277 goto done_unlock; 4278 } 4279 if (!list_empty(&mddev->disks)) { 4280 printk(KERN_WARNING 4281 "md: array %s already has disks!\n", 4282 mdname(mddev)); 4283 err = -EBUSY; 4284 goto abort_unlock; 4285 } 4286 if (mddev->raid_disks) { 4287 printk(KERN_WARNING 4288 "md: array %s already initialised!\n", 4289 mdname(mddev)); 4290 err = -EBUSY; 4291 goto abort_unlock; 4292 } 4293 err = set_array_info(mddev, &info); 4294 if (err) { 4295 printk(KERN_WARNING "md: couldn't set" 4296 " array info. %d\n", err); 4297 goto abort_unlock; 4298 } 4299 } 4300 goto done_unlock; 4301 4302 default:; 4303 } 4304 4305 /* 4306 * Commands querying/configuring an existing array: 4307 */ 4308 /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY, 4309 * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */ 4310 if (!mddev->raid_disks && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY 4311 && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE 4312 && cmd != GET_BITMAP_FILE) { 4313 err = -ENODEV; 4314 goto abort_unlock; 4315 } 4316 4317 /* 4318 * Commands even a read-only array can execute: 4319 */ 4320 switch (cmd) 4321 { 4322 case GET_ARRAY_INFO: 4323 err = get_array_info(mddev, argp); 4324 goto done_unlock; 4325 4326 case GET_BITMAP_FILE: 4327 err = get_bitmap_file(mddev, argp); 4328 goto done_unlock; 4329 4330 case GET_DISK_INFO: 4331 err = get_disk_info(mddev, argp); 4332 goto done_unlock; 4333 4334 case RESTART_ARRAY_RW: 4335 err = restart_array(mddev); 4336 goto done_unlock; 4337 4338 case STOP_ARRAY: 4339 err = do_md_stop (mddev, 0); 4340 goto done_unlock; 4341 4342 case STOP_ARRAY_RO: 4343 err = do_md_stop (mddev, 1); 4344 goto done_unlock; 4345 4346 /* 4347 * We have a problem here : there is no easy way to give a CHS 4348 * virtual geometry. We currently pretend that we have a 2 heads 4349 * 4 sectors (with a BIG number of cylinders...). This drives 4350 * dosfs just mad... ;-) 4351 */ 4352 } 4353 4354 /* 4355 * The remaining ioctls are changing the state of the 4356 * superblock, so we do not allow them on read-only arrays. 4357 * However non-MD ioctls (e.g. get-size) will still come through 4358 * here and hit the 'default' below, so only disallow 4359 * 'md' ioctls, and switch to rw mode if started auto-readonly. 
4360 */ 4361 if (_IOC_TYPE(cmd) == MD_MAJOR && 4362 mddev->ro && mddev->pers) { 4363 if (mddev->ro == 2) { 4364 mddev->ro = 0; 4365 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4366 md_wakeup_thread(mddev->thread); 4367 4368 } else { 4369 err = -EROFS; 4370 goto abort_unlock; 4371 } 4372 } 4373 4374 switch (cmd) 4375 { 4376 case ADD_NEW_DISK: 4377 { 4378 mdu_disk_info_t info; 4379 if (copy_from_user(&info, argp, sizeof(info))) 4380 err = -EFAULT; 4381 else 4382 err = add_new_disk(mddev, &info); 4383 goto done_unlock; 4384 } 4385 4386 case HOT_REMOVE_DISK: 4387 err = hot_remove_disk(mddev, new_decode_dev(arg)); 4388 goto done_unlock; 4389 4390 case HOT_ADD_DISK: 4391 err = hot_add_disk(mddev, new_decode_dev(arg)); 4392 goto done_unlock; 4393 4394 case SET_DISK_FAULTY: 4395 err = set_disk_faulty(mddev, new_decode_dev(arg)); 4396 goto done_unlock; 4397 4398 case RUN_ARRAY: 4399 err = do_md_run (mddev); 4400 goto done_unlock; 4401 4402 case SET_BITMAP_FILE: 4403 err = set_bitmap_file(mddev, (int)arg); 4404 goto done_unlock; 4405 4406 default: 4407 err = -EINVAL; 4408 goto abort_unlock; 4409 } 4410 4411done_unlock: 4412abort_unlock: 4413 mddev_unlock(mddev); 4414 4415 return err; 4416done: 4417 if (err) 4418 MD_BUG(); 4419abort: 4420 return err; 4421} 4422 4423static int md_open(struct inode *inode, struct file *file) 4424{ 4425 /* 4426 * Succeed if we can lock the mddev, which confirms that 4427 * it isn't being stopped right now. 4428 */ 4429 mddev_t *mddev = inode->i_bdev->bd_disk->private_data; 4430 int err; 4431 4432 if ((err = mutex_lock_interruptible_nested(&mddev->reconfig_mutex, 1))) 4433 goto out; 4434 4435 err = 0; 4436 mddev_get(mddev); 4437 mddev_unlock(mddev); 4438 4439 check_disk_change(inode->i_bdev); 4440 out: 4441 return err; 4442} 4443 4444static int md_release(struct inode *inode, struct file * file) 4445{ 4446 mddev_t *mddev = inode->i_bdev->bd_disk->private_data; 4447 4448 BUG_ON(!mddev); 4449 mddev_put(mddev); 4450 4451 return 0; 4452} 4453 4454static int md_media_changed(struct gendisk *disk) 4455{ 4456 mddev_t *mddev = disk->private_data; 4457 4458 return mddev->changed; 4459} 4460 4461static int md_revalidate(struct gendisk *disk) 4462{ 4463 mddev_t *mddev = disk->private_data; 4464 4465 mddev->changed = 0; 4466 return 0; 4467} 4468static struct block_device_operations md_fops = 4469{ 4470 .owner = THIS_MODULE, 4471 .open = md_open, 4472 .release = md_release, 4473 .ioctl = md_ioctl, 4474 .getgeo = md_getgeo, 4475 .media_changed = md_media_changed, 4476 .revalidate_disk= md_revalidate, 4477}; 4478 4479static int md_thread(void * arg) 4480{ 4481 mdk_thread_t *thread = arg; 4482 4483 /* 4484 * md_thread is a 'system-thread', it's priority should be very 4485 * high. We avoid resource deadlocks individually in each 4486 * raid personality. (RAID5 does preallocation) We also use RR and 4487 * the very same RT priority as kswapd, thus we will never get 4488 * into a priority inversion deadlock. 4489 * 4490 * we definitely have to have equal or higher priority than 4491 * bdflush, otherwise bdflush will deadlock if there are too 4492 * many dirty RAID5 blocks. 4493 */ 4494 4495 current->flags |= PF_NOFREEZE; 4496 allow_signal(SIGKILL); 4497 while (!kthread_should_stop()) { 4498 4499 /* We need to wait INTERRUPTIBLE so that 4500 * we don't add to the load-average. 
4501 * That means we need to be sure no signals are 4502 * pending 4503 */ 4504 if (signal_pending(current)) 4505 flush_signals(current); 4506 4507 wait_event_interruptible_timeout 4508 (thread->wqueue, 4509 test_bit(THREAD_WAKEUP, &thread->flags) 4510 || kthread_should_stop(), 4511 thread->timeout); 4512 4513 clear_bit(THREAD_WAKEUP, &thread->flags); 4514 4515 thread->run(thread->mddev); 4516 } 4517 4518 return 0; 4519} 4520 4521void md_wakeup_thread(mdk_thread_t *thread) 4522{ 4523 if (thread) { 4524 dprintk("md: waking up MD thread %s.\n", thread->tsk->comm); 4525 set_bit(THREAD_WAKEUP, &thread->flags); 4526 wake_up(&thread->wqueue); 4527 } 4528} 4529 4530mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, 4531 const char *name) 4532{ 4533 mdk_thread_t *thread; 4534 4535 thread = kzalloc(sizeof(mdk_thread_t), GFP_KERNEL); 4536 if (!thread) 4537 return NULL; 4538 4539 init_waitqueue_head(&thread->wqueue); 4540 4541 thread->run = run; 4542 thread->mddev = mddev; 4543 thread->timeout = MAX_SCHEDULE_TIMEOUT; 4544 thread->tsk = kthread_run(md_thread, thread, name, mdname(thread->mddev)); 4545 if (IS_ERR(thread->tsk)) { 4546 kfree(thread); 4547 return NULL; 4548 } 4549 return thread; 4550} 4551 4552void md_unregister_thread(mdk_thread_t *thread) 4553{ 4554 dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid); 4555 4556 kthread_stop(thread->tsk); 4557 kfree(thread); 4558} 4559 4560void md_error(mddev_t *mddev, mdk_rdev_t *rdev) 4561{ 4562 if (!mddev) { 4563 MD_BUG(); 4564 return; 4565 } 4566 4567 if (!rdev || test_bit(Faulty, &rdev->flags)) 4568 return; 4569/* 4570 dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n", 4571 mdname(mddev), 4572 MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev), 4573 __builtin_return_address(0),__builtin_return_address(1), 4574 __builtin_return_address(2),__builtin_return_address(3)); 4575*/ 4576 if (!mddev->pers) 4577 return; 4578 if (!mddev->pers->error_handler) 4579 return; 4580 mddev->pers->error_handler(mddev,rdev); 4581 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 4582 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4583 md_wakeup_thread(mddev->thread); 4584 md_new_event_inintr(mddev); 4585} 4586 4587/* seq_file implementation /proc/mdstat */ 4588 4589static void status_unused(struct seq_file *seq) 4590{ 4591 int i = 0; 4592 mdk_rdev_t *rdev; 4593 struct list_head *tmp; 4594 4595 seq_printf(seq, "unused devices: "); 4596 4597 ITERATE_RDEV_PENDING(rdev,tmp) { 4598 char b[BDEVNAME_SIZE]; 4599 i++; 4600 seq_printf(seq, "%s ", 4601 bdevname(rdev->bdev,b)); 4602 } 4603 if (!i) 4604 seq_printf(seq, "<none>"); 4605 4606 seq_printf(seq, "\n"); 4607} 4608 4609 4610static void status_resync(struct seq_file *seq, mddev_t * mddev) 4611{ 4612 sector_t max_blocks, resync, res; 4613 unsigned long dt, db, rt; 4614 int scale; 4615 unsigned int per_milli; 4616 4617 resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2; 4618 4619 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 4620 max_blocks = mddev->resync_max_sectors >> 1; 4621 else 4622 max_blocks = mddev->size; 4623 4624 /* 4625 * Should not happen. 4626 */ 4627 if (!max_blocks) { 4628 MD_BUG(); 4629 return; 4630 } 4631 /* Pick 'scale' such that (resync>>scale)*1000 will fit 4632 * in a sector_t, and (max_blocks>>scale) will fit in a 4633 * u32, as those are the requirements for sector_div. 
4634 * Thus 'scale' must be at least 10 4635 */ 4636 scale = 10; 4637 if (sizeof(sector_t) > sizeof(unsigned long)) { 4638 while ( max_blocks/2 > (1ULL<<(scale+32))) 4639 scale++; 4640 } 4641 res = (resync>>scale)*1000; 4642 sector_div(res, (u32)((max_blocks>>scale)+1)); 4643 4644 per_milli = res; 4645 { 4646 int i, x = per_milli/50, y = 20-x; 4647 seq_printf(seq, "["); 4648 for (i = 0; i < x; i++) 4649 seq_printf(seq, "="); 4650 seq_printf(seq, ">"); 4651 for (i = 0; i < y; i++) 4652 seq_printf(seq, "."); 4653 seq_printf(seq, "] "); 4654 } 4655 seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)", 4656 (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)? 4657 "reshape" : 4658 (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)? 4659 "check" : 4660 (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? 4661 "resync" : "recovery"))), 4662 per_milli/10, per_milli % 10, 4663 (unsigned long long) resync, 4664 (unsigned long long) max_blocks); 4665 4666 /* 4667 * We do not want to overflow, so the order of operands and 4668 * the * 100 / 100 trick are important. We do a +1 to be 4669 * safe against division by zero. We only estimate anyway. 4670 * 4671 * dt: time from mark until now 4672 * db: blocks written from mark until now 4673 * rt: remaining time 4674 */ 4675 dt = ((jiffies - mddev->resync_mark) / HZ); 4676 if (!dt) dt++; 4677 db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active)) 4678 - mddev->resync_mark_cnt; 4679 rt = (dt * ((unsigned long)(max_blocks-resync) / (db/2/100+1)))/100; 4680 4681 seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6); 4682 4683 seq_printf(seq, " speed=%ldK/sec", db/2/dt); 4684} 4685 4686static void *md_seq_start(struct seq_file *seq, loff_t *pos) 4687{ 4688 struct list_head *tmp; 4689 loff_t l = *pos; 4690 mddev_t *mddev; 4691 4692 if (l >= 0x10000) 4693 return NULL; 4694 if (!l--) 4695 /* header */ 4696 return (void*)1; 4697 4698 spin_lock(&all_mddevs_lock); 4699 list_for_each(tmp,&all_mddevs) 4700 if (!l--) { 4701 mddev = list_entry(tmp, mddev_t, all_mddevs); 4702 mddev_get(mddev); 4703 spin_unlock(&all_mddevs_lock); 4704 return mddev; 4705 } 4706 spin_unlock(&all_mddevs_lock); 4707 if (!l--) 4708 return (void*)2;/* tail */ 4709 return NULL; 4710} 4711 4712static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) 4713{ 4714 struct list_head *tmp; 4715 mddev_t *next_mddev, *mddev = v; 4716 4717 ++*pos; 4718 if (v == (void*)2) 4719 return NULL; 4720 4721 spin_lock(&all_mddevs_lock); 4722 if (v == (void*)1) 4723 tmp = all_mddevs.next; 4724 else 4725 tmp = mddev->all_mddevs.next; 4726 if (tmp != &all_mddevs) 4727 next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs)); 4728 else { 4729 next_mddev = (void*)2; 4730 *pos = 0x10000; 4731 } 4732 spin_unlock(&all_mddevs_lock); 4733 4734 if (v != (void*)1) 4735 mddev_put(mddev); 4736 return next_mddev; 4737 4738} 4739 4740static void md_seq_stop(struct seq_file *seq, void *v) 4741{ 4742 mddev_t *mddev = v; 4743 4744 if (mddev && v != (void*)1 && v != (void*)2) 4745 mddev_put(mddev); 4746} 4747 4748struct mdstat_info { 4749 int event; 4750}; 4751 4752static int md_seq_show(struct seq_file *seq, void *v) 4753{ 4754 mddev_t *mddev = v; 4755 sector_t size; 4756 struct list_head *tmp2; 4757 mdk_rdev_t *rdev; 4758 struct mdstat_info *mi = seq->private; 4759 struct bitmap *bitmap; 4760 4761 if (v == (void*)1) { 4762 struct mdk_personality *pers; 4763 seq_printf(seq, "Personalities : "); 4764 spin_lock(&pers_lock); 4765 list_for_each_entry(pers, &pers_list, list) 4766 seq_printf(seq, "[%s] ", pers->name); 4767 4768 
spin_unlock(&pers_lock); 4769 seq_printf(seq, "\n"); 4770 mi->event = atomic_read(&md_event_count); 4771 return 0; 4772 } 4773 if (v == (void*)2) { 4774 status_unused(seq); 4775 return 0; 4776 } 4777 4778 if (mddev_lock(mddev) < 0) 4779 return -EINTR; 4780 4781 if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { 4782 seq_printf(seq, "%s : %sactive", mdname(mddev), 4783 mddev->pers ? "" : "in"); 4784 if (mddev->pers) { 4785 if (mddev->ro==1) 4786 seq_printf(seq, " (read-only)"); 4787 if (mddev->ro==2) 4788 seq_printf(seq, "(auto-read-only)"); 4789 seq_printf(seq, " %s", mddev->pers->name); 4790 } 4791 4792 size = 0; 4793 ITERATE_RDEV(mddev,rdev,tmp2) { 4794 char b[BDEVNAME_SIZE]; 4795 seq_printf(seq, " %s[%d]", 4796 bdevname(rdev->bdev,b), rdev->desc_nr); 4797 if (test_bit(WriteMostly, &rdev->flags)) 4798 seq_printf(seq, "(W)"); 4799 if (test_bit(Faulty, &rdev->flags)) { 4800 seq_printf(seq, "(F)"); 4801 continue; 4802 } else if (rdev->raid_disk < 0) 4803 seq_printf(seq, "(S)"); /* spare */ 4804 size += rdev->size; 4805 } 4806 4807 if (!list_empty(&mddev->disks)) { 4808 if (mddev->pers) 4809 seq_printf(seq, "\n %llu blocks", 4810 (unsigned long long)mddev->array_size); 4811 else 4812 seq_printf(seq, "\n %llu blocks", 4813 (unsigned long long)size); 4814 } 4815 if (mddev->persistent) { 4816 if (mddev->major_version != 0 || 4817 mddev->minor_version != 90) { 4818 seq_printf(seq," super %d.%d", 4819 mddev->major_version, 4820 mddev->minor_version); 4821 } 4822 } else 4823 seq_printf(seq, " super non-persistent"); 4824 4825 if (mddev->pers) { 4826 mddev->pers->status (seq, mddev); 4827 seq_printf(seq, "\n "); 4828 if (mddev->pers->sync_request) { 4829 if (mddev->curr_resync > 2) { 4830 status_resync (seq, mddev); 4831 seq_printf(seq, "\n "); 4832 } else if (mddev->curr_resync == 1 || mddev->curr_resync == 2) 4833 seq_printf(seq, "\tresync=DELAYED\n "); 4834 else if (mddev->recovery_cp < MaxSector) 4835 seq_printf(seq, "\tresync=PENDING\n "); 4836 } 4837 } else 4838 seq_printf(seq, "\n "); 4839 4840 if ((bitmap = mddev->bitmap)) { 4841 unsigned long chunk_kb; 4842 unsigned long flags; 4843 spin_lock_irqsave(&bitmap->lock, flags); 4844 chunk_kb = bitmap->chunksize >> 10; 4845 seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], " 4846 "%lu%s chunk", 4847 bitmap->pages - bitmap->missing_pages, 4848 bitmap->pages, 4849 (bitmap->pages - bitmap->missing_pages) 4850 << (PAGE_SHIFT - 10), 4851 chunk_kb ? chunk_kb : bitmap->chunksize, 4852 chunk_kb ? 
"KB" : "B"); 4853 if (bitmap->file) { 4854 seq_printf(seq, ", file: "); 4855 seq_path(seq, bitmap->file->f_path.mnt, 4856 bitmap->file->f_path.dentry," \t\n"); 4857 } 4858 4859 seq_printf(seq, "\n"); 4860 spin_unlock_irqrestore(&bitmap->lock, flags); 4861 } 4862 4863 seq_printf(seq, "\n"); 4864 } 4865 mddev_unlock(mddev); 4866 4867 return 0; 4868} 4869 4870static struct seq_operations md_seq_ops = { 4871 .start = md_seq_start, 4872 .next = md_seq_next, 4873 .stop = md_seq_stop, 4874 .show = md_seq_show, 4875}; 4876 4877static int md_seq_open(struct inode *inode, struct file *file) 4878{ 4879 int error; 4880 struct mdstat_info *mi = kmalloc(sizeof(*mi), GFP_KERNEL); 4881 if (mi == NULL) 4882 return -ENOMEM; 4883 4884 error = seq_open(file, &md_seq_ops); 4885 if (error) 4886 kfree(mi); 4887 else { 4888 struct seq_file *p = file->private_data; 4889 p->private = mi; 4890 mi->event = atomic_read(&md_event_count); 4891 } 4892 return error; 4893} 4894 4895static int md_seq_release(struct inode *inode, struct file *file) 4896{ 4897 struct seq_file *m = file->private_data; 4898 struct mdstat_info *mi = m->private; 4899 m->private = NULL; 4900 kfree(mi); 4901 return seq_release(inode, file); 4902} 4903 4904static unsigned int mdstat_poll(struct file *filp, poll_table *wait) 4905{ 4906 struct seq_file *m = filp->private_data; 4907 struct mdstat_info *mi = m->private; 4908 int mask; 4909 4910 poll_wait(filp, &md_event_waiters, wait); 4911 4912 /* always allow read */ 4913 mask = POLLIN | POLLRDNORM; 4914 4915 if (mi->event != atomic_read(&md_event_count)) 4916 mask |= POLLERR | POLLPRI; 4917 return mask; 4918} 4919 4920static struct file_operations md_seq_fops = { 4921 .owner = THIS_MODULE, 4922 .open = md_seq_open, 4923 .read = seq_read, 4924 .llseek = seq_lseek, 4925 .release = md_seq_release, 4926 .poll = mdstat_poll, 4927}; 4928 4929int register_md_personality(struct mdk_personality *p) 4930{ 4931 spin_lock(&pers_lock); 4932 list_add_tail(&p->list, &pers_list); 4933 printk(KERN_INFO "md: %s personality registered for level %d\n", p->name, p->level); 4934 spin_unlock(&pers_lock); 4935 return 0; 4936} 4937 4938int unregister_md_personality(struct mdk_personality *p) 4939{ 4940 printk(KERN_INFO "md: %s personality unregistered\n", p->name); 4941 spin_lock(&pers_lock); 4942 list_del_init(&p->list); 4943 spin_unlock(&pers_lock); 4944 return 0; 4945} 4946 4947static int is_mddev_idle(mddev_t *mddev) 4948{ 4949 mdk_rdev_t * rdev; 4950 struct list_head *tmp; 4951 int idle; 4952 unsigned long curr_events; 4953 4954 idle = 1; 4955 ITERATE_RDEV(mddev,rdev,tmp) { 4956 struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; 4957 curr_events = disk_stat_read(disk, sectors[0]) + 4958 disk_stat_read(disk, sectors[1]) - 4959 atomic_read(&disk->sync_io); 4960 /* The difference between curr_events and last_events 4961 * will be affected by any new non-sync IO (making 4962 * curr_events bigger) and any difference in the amount of 4963 * in-flight syncio (making current_events bigger or smaller) 4964 * The amount in-flight is currently limited to 4965 * 32*64K in raid1/10 and 256*PAGE_SIZE in raid5/6 4966 * which is at most 4096 sectors. 4967 * These numbers are fairly fragile and should be made 4968 * more robust, probably by enforcing the 4969 * 'window size' that md_do_sync sort-of uses. 4970 * 4971 * Note: the following is an unsigned comparison. 
4972 */ 4973 if ((curr_events - rdev->last_events + 4096) > 8192) { 4974 rdev->last_events = curr_events; 4975 idle = 0; 4976 } 4977 } 4978 return idle; 4979} 4980 4981void md_done_sync(mddev_t *mddev, int blocks, int ok) 4982{ 4983 /* another "blocks" (512byte) blocks have been synced */ 4984 atomic_sub(blocks, &mddev->recovery_active); 4985 wake_up(&mddev->recovery_wait); 4986 if (!ok) { 4987 set_bit(MD_RECOVERY_ERR, &mddev->recovery); 4988 md_wakeup_thread(mddev->thread); 4989 // stop recovery, signal do_sync .... 4990 } 4991} 4992 4993 4994/* md_write_start(mddev, bi) 4995 * If we need to update some array metadata (e.g. 'active' flag 4996 * in superblock) before writing, schedule a superblock update 4997 * and wait for it to complete. 4998 */ 4999void md_write_start(mddev_t *mddev, struct bio *bi) 5000{ 5001 if (bio_data_dir(bi) != WRITE) 5002 return; 5003 5004 BUG_ON(mddev->ro == 1); 5005 if (mddev->ro == 2) { 5006 /* need to switch to read/write */ 5007 mddev->ro = 0; 5008 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5009 md_wakeup_thread(mddev->thread); 5010 } 5011 atomic_inc(&mddev->writes_pending); 5012 if (mddev->in_sync) { 5013 spin_lock_irq(&mddev->write_lock); 5014 if (mddev->in_sync) { 5015 mddev->in_sync = 0; 5016 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 5017 md_wakeup_thread(mddev->thread); 5018 } 5019 spin_unlock_irq(&mddev->write_lock); 5020 } 5021 wait_event(mddev->sb_wait, mddev->flags==0); 5022} 5023 5024void md_write_end(mddev_t *mddev) 5025{ 5026 if (atomic_dec_and_test(&mddev->writes_pending)) { 5027 if (mddev->safemode == 2) 5028 md_wakeup_thread(mddev->thread); 5029 else if (mddev->safemode_delay) 5030 mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay); 5031 } 5032} 5033 5034static DECLARE_WAIT_QUEUE_HEAD(resync_wait); 5035 5036#define SYNC_MARKS 10 5037#define SYNC_MARK_STEP (3*HZ) 5038void md_do_sync(mddev_t *mddev) 5039{ 5040 mddev_t *mddev2; 5041 unsigned int currspeed = 0, 5042 window; 5043 sector_t max_sectors,j, io_sectors; 5044 unsigned long mark[SYNC_MARKS]; 5045 sector_t mark_cnt[SYNC_MARKS]; 5046 int last_mark,m; 5047 struct list_head *tmp; 5048 sector_t last_check; 5049 int skipped = 0; 5050 struct list_head *rtmp; 5051 mdk_rdev_t *rdev; 5052 char *desc; 5053 5054 /* just incase thread restarts... */ 5055 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) 5056 return; 5057 if (mddev->ro) /* never try to sync a read-only array */ 5058 return; 5059 5060 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 5061 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) 5062 desc = "data-check"; 5063 else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 5064 desc = "requested-resync"; 5065 else 5066 desc = "resync"; 5067 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 5068 desc = "reshape"; 5069 else 5070 desc = "recovery"; 5071 5072 /* we overload curr_resync somewhat here. 5073 * 0 == not engaged in resync at all 5074 * 2 == checking that there is no conflict with another sync 5075 * 1 == like 2, but have yielded to allow conflicting resync to 5076 * commense 5077 * other == active in resync - this many blocks 5078 * 5079 * Before starting a resync we must have set curr_resync to 5080 * 2, and then checked that every "conflicting" array has curr_resync 5081 * less than ours. When we find one that is the same or higher 5082 * we wait on resync_wait. To avoid deadlock, we reduce curr_resync 5083 * to 1 if we choose to yield (based arbitrarily on address of mddev structure). 
5084 * This will mean we have to start checking from the beginning again. 5085 * 5086 */ 5087 5088 do { 5089 mddev->curr_resync = 2; 5090 5091 try_again: 5092 if (kthread_should_stop()) { 5093 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 5094 goto skip; 5095 } 5096 ITERATE_MDDEV(mddev2,tmp) { 5097 if (mddev2 == mddev) 5098 continue; 5099 if (mddev2->curr_resync && 5100 match_mddev_units(mddev,mddev2)) { 5101 DEFINE_WAIT(wq); 5102 if (mddev < mddev2 && mddev->curr_resync == 2) { 5103 /* arbitrarily yield */ 5104 mddev->curr_resync = 1; 5105 wake_up(&resync_wait); 5106 } 5107 if (mddev > mddev2 && mddev->curr_resync == 1) 5108 /* no need to wait here, we can wait the next 5109 * time 'round when curr_resync == 2 5110 */ 5111 continue; 5112 prepare_to_wait(&resync_wait, &wq, TASK_UNINTERRUPTIBLE); 5113 if (!kthread_should_stop() && 5114 mddev2->curr_resync >= mddev->curr_resync) { 5115 printk(KERN_INFO "md: delaying %s of %s" 5116 " until %s has finished (they" 5117 " share one or more physical units)\n", 5118 desc, mdname(mddev), mdname(mddev2)); 5119 mddev_put(mddev2); 5120 schedule(); 5121 finish_wait(&resync_wait, &wq); 5122 goto try_again; 5123 } 5124 finish_wait(&resync_wait, &wq); 5125 } 5126 } 5127 } while (mddev->curr_resync < 2); 5128 5129 j = 0; 5130 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 5131 /* resync follows the size requested by the personality, 5132 * which defaults to physical size, but can be virtual size 5133 */ 5134 max_sectors = mddev->resync_max_sectors; 5135 mddev->resync_mismatches = 0; 5136 /* we don't use the checkpoint if there's a bitmap */ 5137 if (!mddev->bitmap && 5138 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 5139 j = mddev->recovery_cp; 5140 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 5141 max_sectors = mddev->size << 1; 5142 else { 5143 /* recovery follows the physical size of devices */ 5144 max_sectors = mddev->size << 1; 5145 j = MaxSector; 5146 ITERATE_RDEV(mddev,rdev,rtmp) 5147 if (rdev->raid_disk >= 0 && 5148 !test_bit(Faulty, &rdev->flags) && 5149 !test_bit(In_sync, &rdev->flags) && 5150 rdev->recovery_offset < j) 5151 j = rdev->recovery_offset; 5152 } 5153 5154 printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev)); 5155 printk(KERN_INFO "md: minimum _guaranteed_ speed:" 5156 " %d KB/sec/disk.\n", speed_min(mddev)); 5157 printk(KERN_INFO "md: using maximum available idle IO bandwidth " 5158 "(but not more than %d KB/sec) for %s.\n", 5159 speed_max(mddev), desc); 5160 5161 is_mddev_idle(mddev); /* this also initializes IO event counters */ 5162 5163 io_sectors = 0; 5164 for (m = 0; m < SYNC_MARKS; m++) { 5165 mark[m] = jiffies; 5166 mark_cnt[m] = io_sectors; 5167 } 5168 last_mark = 0; 5169 mddev->resync_mark = mark[last_mark]; 5170 mddev->resync_mark_cnt = mark_cnt[last_mark]; 5171 5172 /* 5173 * Tune reconstruction: 5174 */ 5175 window = 32*(PAGE_SIZE/512); 5176 printk(KERN_INFO "md: using %dk window, over a total of %llu blocks.\n", 5177 window/2,(unsigned long long) max_sectors/2); 5178 5179 atomic_set(&mddev->recovery_active, 0); 5180 init_waitqueue_head(&mddev->recovery_wait); 5181 last_check = 0; 5182 5183 if (j>2) { 5184 printk(KERN_INFO 5185 "md: resuming %s of %s from checkpoint.\n", 5186 desc, mdname(mddev)); 5187 mddev->curr_resync = j; 5188 } 5189 5190 while (j < max_sectors) { 5191 sector_t sectors; 5192 5193 skipped = 0; 5194 sectors = mddev->pers->sync_request(mddev, j, &skipped, 5195 currspeed < speed_min(mddev)); 5196 if (sectors == 0) { 5197 set_bit(MD_RECOVERY_ERR, &mddev->recovery); 
5198 goto out; 5199 } 5200 5201 if (!skipped) { /* actual IO requested */ 5202 io_sectors += sectors; 5203 atomic_add(sectors, &mddev->recovery_active); 5204 } 5205 5206 j += sectors; 5207 if (j>1) mddev->curr_resync = j; 5208 mddev->curr_mark_cnt = io_sectors; 5209 if (last_check == 0) 5210 /* this is the earliers that rebuilt will be 5211 * visible in /proc/mdstat 5212 */ 5213 md_new_event(mddev); 5214 5215 if (last_check + window > io_sectors || j == max_sectors) 5216 continue; 5217 5218 last_check = io_sectors; 5219 5220 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery) || 5221 test_bit(MD_RECOVERY_ERR, &mddev->recovery)) 5222 break; 5223 5224 repeat: 5225 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) { 5226 /* step marks */ 5227 int next = (last_mark+1) % SYNC_MARKS; 5228 5229 mddev->resync_mark = mark[next]; 5230 mddev->resync_mark_cnt = mark_cnt[next]; 5231 mark[next] = jiffies; 5232 mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active); 5233 last_mark = next; 5234 } 5235 5236 5237 if (kthread_should_stop()) { 5238 /* 5239 * got a signal, exit. 5240 */ 5241 printk(KERN_INFO 5242 "md: md_do_sync() got signal ... exiting\n"); 5243 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 5244 goto out; 5245 } 5246 5247 /* 5248 * this loop exits only if either when we are slower than 5249 * the 'hard' speed limit, or the system was IO-idle for 5250 * a jiffy. 5251 * the system might be non-idle CPU-wise, but we only care 5252 * about not overloading the IO subsystem. (things like an 5253 * e2fsck being done on the RAID array should execute fast) 5254 */ 5255 mddev->queue->unplug_fn(mddev->queue); 5256 cond_resched(); 5257 5258 currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2 5259 /((jiffies-mddev->resync_mark)/HZ +1) +1; 5260 5261 if (currspeed > speed_min(mddev)) { 5262 if ((currspeed > speed_max(mddev)) || 5263 !is_mddev_idle(mddev)) { 5264 msleep(500); 5265 goto repeat; 5266 } 5267 } 5268 } 5269 printk(KERN_INFO "md: %s: %s done.\n",mdname(mddev), desc); 5270 /* 5271 * this also signals 'finished resyncing' to md_stop 5272 */ 5273 out: 5274 mddev->queue->unplug_fn(mddev->queue); 5275 5276 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); 5277 5278 /* tell personality that we are finished */ 5279 mddev->pers->sync_request(mddev, max_sectors, &skipped, 1); 5280 5281 if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) && 5282 !test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && 5283 mddev->curr_resync > 2) { 5284 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 5285 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 5286 if (mddev->curr_resync >= mddev->recovery_cp) { 5287 printk(KERN_INFO 5288 "md: checkpointing %s of %s.\n", 5289 desc, mdname(mddev)); 5290 mddev->recovery_cp = mddev->curr_resync; 5291 } 5292 } else 5293 mddev->recovery_cp = MaxSector; 5294 } else { 5295 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 5296 mddev->curr_resync = MaxSector; 5297 ITERATE_RDEV(mddev,rdev,rtmp) 5298 if (rdev->raid_disk >= 0 && 5299 !test_bit(Faulty, &rdev->flags) && 5300 !test_bit(In_sync, &rdev->flags) && 5301 rdev->recovery_offset < mddev->curr_resync) 5302 rdev->recovery_offset = mddev->curr_resync; 5303 } 5304 } 5305 set_bit(MD_CHANGE_DEVS, &mddev->flags); 5306 5307 skip: 5308 mddev->curr_resync = 0; 5309 wake_up(&resync_wait); 5310 set_bit(MD_RECOVERY_DONE, &mddev->recovery); 5311 md_wakeup_thread(mddev->thread); 5312} 5313EXPORT_SYMBOL_GPL(md_do_sync); 5314 5315 5316/* 5317 * This routine is regularly called by all 
per-raid-array threads to 5318 * deal with generic issues like resync and super-block update. 5319 * Raid personalities that don't have a thread (linear/raid0) do not 5320 * need this as they never do any recovery or update the superblock. 5321 * 5322 * It does not do any resync itself, but rather "forks" off other threads 5323 * to do that as needed. 5324 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in 5325 * "->recovery" and create a thread at ->sync_thread. 5326 * When the thread finishes it sets MD_RECOVERY_DONE (and might set MD_RECOVERY_ERR) 5327 * and wakes up this thread which will reap the thread and finish up. 5328 * This thread also removes any faulty devices (with nr_pending == 0). 5329 * 5330 * The overall approach is: 5331 * 1/ If the superblock needs updating, update it. 5332 * 2/ If a recovery thread is running, don't do anything else. 5333 * 3/ If recovery has finished, clean up, possibly marking spares active. 5334 * 4/ If there are any faulty devices, remove them. 5335 * 5/ If array is degraded, try to add spare devices. 5336 * 6/ If array has spares or is not in-sync, start a resync thread. 5337 */ 5338void md_check_recovery(mddev_t *mddev) 5339{ 5340 mdk_rdev_t *rdev; 5341 struct list_head *rtmp; 5342 5343 5344 if (mddev->bitmap) 5345 bitmap_daemon_work(mddev->bitmap); 5346 5347 if (mddev->ro) 5348 return; 5349 5350 if (signal_pending(current)) { 5351 if (mddev->pers->sync_request) { 5352 printk(KERN_INFO "md: %s in immediate safe mode\n", 5353 mdname(mddev)); 5354 mddev->safemode = 2; 5355 } 5356 flush_signals(current); 5357 } 5358 5359 if ( ! ( 5360 mddev->flags || 5361 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || 5362 test_bit(MD_RECOVERY_DONE, &mddev->recovery) || 5363 (mddev->safemode == 1) || 5364 (mddev->safemode == 2 && ! 
atomic_read(&mddev->writes_pending) 5365 && !mddev->in_sync && mddev->recovery_cp == MaxSector) 5366 )) 5367 return; 5368 5369 if (mddev_trylock(mddev)) { 5370 int spares =0; 5371 5372 spin_lock_irq(&mddev->write_lock); 5373 if (mddev->safemode && !atomic_read(&mddev->writes_pending) && 5374 !mddev->in_sync && mddev->recovery_cp == MaxSector) { 5375 mddev->in_sync = 1; 5376 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 5377 } 5378 if (mddev->safemode == 1) 5379 mddev->safemode = 0; 5380 spin_unlock_irq(&mddev->write_lock); 5381 5382 if (mddev->flags) 5383 md_update_sb(mddev, 0); 5384 5385 5386 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && 5387 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { 5388 /* resync/recovery still happening */ 5389 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5390 goto unlock; 5391 } 5392 if (mddev->sync_thread) { 5393 /* resync has finished, collect result */ 5394 md_unregister_thread(mddev->sync_thread); 5395 mddev->sync_thread = NULL; 5396 if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) && 5397 !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 5398 /* success...*/ 5399 /* activate any spares */ 5400 mddev->pers->spare_active(mddev); 5401 } 5402 md_update_sb(mddev, 1); 5403 5404 /* if array is no-longer degraded, then any saved_raid_disk 5405 * information must be scrapped 5406 */ 5407 if (!mddev->degraded) 5408 ITERATE_RDEV(mddev,rdev,rtmp) 5409 rdev->saved_raid_disk = -1; 5410 5411 mddev->recovery = 0; 5412 /* flag recovery needed just to double check */ 5413 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5414 md_new_event(mddev); 5415 goto unlock; 5416 } 5417 /* Clear some bits that don't mean anything, but 5418 * might be left set 5419 */ 5420 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5421 clear_bit(MD_RECOVERY_ERR, &mddev->recovery); 5422 clear_bit(MD_RECOVERY_INTR, &mddev->recovery); 5423 clear_bit(MD_RECOVERY_DONE, &mddev->recovery); 5424 5425 if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) 5426 goto unlock; 5427 /* no recovery is running. 5428 * remove any failed drives, then 5429 * add spares if possible. 5430 * Spare are also removed and re-added, to allow 5431 * the personality to fail the re-add. 5432 */ 5433 ITERATE_RDEV(mddev,rdev,rtmp) 5434 if (rdev->raid_disk >= 0 && 5435 (test_bit(Faulty, &rdev->flags) || ! test_bit(In_sync, &rdev->flags)) && 5436 atomic_read(&rdev->nr_pending)==0) { 5437 if (mddev->pers->hot_remove_disk(mddev, rdev->raid_disk)==0) { 5438 char nm[20]; 5439 sprintf(nm,"rd%d", rdev->raid_disk); 5440 sysfs_remove_link(&mddev->kobj, nm); 5441 rdev->raid_disk = -1; 5442 } 5443 } 5444 5445 if (mddev->degraded) { 5446 ITERATE_RDEV(mddev,rdev,rtmp) 5447 if (rdev->raid_disk < 0 5448 && !test_bit(Faulty, &rdev->flags)) { 5449 rdev->recovery_offset = 0; 5450 if (mddev->pers->hot_add_disk(mddev,rdev)) { 5451 char nm[20]; 5452 sprintf(nm, "rd%d", rdev->raid_disk); 5453 sysfs_create_link(&mddev->kobj, &rdev->kobj, nm); 5454 spares++; 5455 md_new_event(mddev); 5456 } else 5457 break; 5458 } 5459 } 5460 5461 if (spares) { 5462 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 5463 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 5464 } else if (mddev->recovery_cp < MaxSector) { 5465 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 5466 } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 5467 /* nothing to be done ... */ 5468 goto unlock; 5469 5470 if (mddev->pers->sync_request) { 5471 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 5472 if (spares && mddev->bitmap && ! 
mddev->bitmap->file) { 5473 /* We are adding a device or devices to an array 5474 * which has the bitmap stored on all devices. 5475 * So make sure all bitmap pages get written 5476 */ 5477 bitmap_write_all(mddev->bitmap); 5478 } 5479 mddev->sync_thread = md_register_thread(md_do_sync, 5480 mddev, 5481 "%s_resync"); 5482 if (!mddev->sync_thread) { 5483 printk(KERN_ERR "%s: could not start resync" 5484 " thread...\n", 5485 mdname(mddev)); 5486 /* leave the spares where they are, it shouldn't hurt */ 5487 mddev->recovery = 0; 5488 } else 5489 md_wakeup_thread(mddev->sync_thread); 5490 md_new_event(mddev); 5491 } 5492 unlock: 5493 mddev_unlock(mddev); 5494 } 5495} 5496 5497static int md_notify_reboot(struct notifier_block *this, 5498 unsigned long code, void *x) 5499{ 5500 struct list_head *tmp; 5501 mddev_t *mddev; 5502 5503 if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) { 5504 5505 printk(KERN_INFO "md: stopping all md devices.\n"); 5506 5507 ITERATE_MDDEV(mddev,tmp) 5508 if (mddev_trylock(mddev)) { 5509 do_md_stop (mddev, 1); 5510 mddev_unlock(mddev); 5511 } 5512 /* 5513 * certain more exotic SCSI devices are known to be 5514 * volatile wrt too early system reboots. While the 5515 * right place to handle this issue is the given 5516 * driver, we do want to have a safe RAID driver ... 5517 */ 5518 mdelay(1000*1); 5519 } 5520 return NOTIFY_DONE; 5521} 5522 5523static struct notifier_block md_notifier = { 5524 .notifier_call = md_notify_reboot, 5525 .next = NULL, 5526 .priority = INT_MAX, /* before any real devices */ 5527}; 5528 5529static void md_geninit(void) 5530{ 5531 struct proc_dir_entry *p; 5532 5533 dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); 5534 5535 p = create_proc_entry("mdstat", S_IRUGO, NULL); 5536 if (p) 5537 p->proc_fops = &md_seq_fops; 5538} 5539 5540static int __init md_init(void) 5541{ 5542 if (register_blkdev(MAJOR_NR, "md")) 5543 return -1; 5544 if ((mdp_major=register_blkdev(0, "mdp"))<=0) { 5545 unregister_blkdev(MAJOR_NR, "md"); 5546 return -1; 5547 } 5548 blk_register_region(MKDEV(MAJOR_NR, 0), 1UL<<MINORBITS, THIS_MODULE, 5549 md_probe, NULL, NULL); 5550 blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE, 5551 md_probe, NULL, NULL); 5552 5553 register_reboot_notifier(&md_notifier); 5554 raid_table_header = register_sysctl_table(raid_root_table, 1); 5555 5556 md_geninit(); 5557 return (0); 5558} 5559 5560 5561#ifndef MODULE 5562 5563/* 5564 * Searches all registered partitions for autorun RAID arrays 5565 * at boot time. 
5566 */ 5567static dev_t detected_devices[128]; 5568static int dev_cnt; 5569 5570void md_autodetect_dev(dev_t dev) 5571{ 5572 if (dev_cnt >= 0 && dev_cnt < 127) 5573 detected_devices[dev_cnt++] = dev; 5574} 5575 5576 5577static void autostart_arrays(int part) 5578{ 5579 mdk_rdev_t *rdev; 5580 int i; 5581 5582 printk(KERN_INFO "md: Autodetecting RAID arrays.\n"); 5583 5584 for (i = 0; i < dev_cnt; i++) { 5585 dev_t dev = detected_devices[i]; 5586 5587 rdev = md_import_device(dev,0, 0); 5588 if (IS_ERR(rdev)) 5589 continue; 5590 5591 if (test_bit(Faulty, &rdev->flags)) { 5592 MD_BUG(); 5593 continue; 5594 } 5595 list_add(&rdev->same_set, &pending_raid_disks); 5596 } 5597 dev_cnt = 0; 5598 5599 autorun_devices(part); 5600} 5601 5602#endif /* !MODULE */ 5603 5604static __exit void md_exit(void) 5605{ 5606 mddev_t *mddev; 5607 struct list_head *tmp; 5608 5609 blk_unregister_region(MKDEV(MAJOR_NR,0), 1U << MINORBITS); 5610 blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS); 5611 5612 unregister_blkdev(MAJOR_NR,"md"); 5613 unregister_blkdev(mdp_major, "mdp"); 5614 unregister_reboot_notifier(&md_notifier); 5615 unregister_sysctl_table(raid_table_header); 5616 remove_proc_entry("mdstat", NULL); 5617 ITERATE_MDDEV(mddev,tmp) { 5618 struct gendisk *disk = mddev->gendisk; 5619 if (!disk) 5620 continue; 5621 export_array(mddev); 5622 del_gendisk(disk); 5623 put_disk(disk); 5624 mddev->gendisk = NULL; 5625 mddev_put(mddev); 5626 } 5627} 5628 5629module_init(md_init) 5630module_exit(md_exit) 5631 5632static int get_ro(char *buffer, struct kernel_param *kp) 5633{ 5634 return sprintf(buffer, "%d", start_readonly); 5635} 5636static int set_ro(const char *val, struct kernel_param *kp) 5637{ 5638 char *e; 5639 int num = simple_strtoul(val, &e, 10); 5640 if (*val && (*e == '\0' || *e == '\n')) { 5641 start_readonly = num; 5642 return 0; 5643 } 5644 return -EINVAL; 5645} 5646 5647module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR); 5648module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR); 5649 5650 5651EXPORT_SYMBOL(register_md_personality); 5652EXPORT_SYMBOL(unregister_md_personality); 5653EXPORT_SYMBOL(md_error); 5654EXPORT_SYMBOL(md_done_sync); 5655EXPORT_SYMBOL(md_write_start); 5656EXPORT_SYMBOL(md_write_end); 5657EXPORT_SYMBOL(md_register_thread); 5658EXPORT_SYMBOL(md_unregister_thread); 5659EXPORT_SYMBOL(md_wakeup_thread); 5660EXPORT_SYMBOL(md_check_recovery); 5661MODULE_LICENSE("GPL"); 5662MODULE_ALIAS("md"); 5663MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);
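
/*
 * Editor's illustration, not part of the driver: GET_ARRAY_INFO and
 * GET_DISK_INFO sit in the "commands even a read-only array can execute"
 * group of md_ioctl() above.  A minimal userspace caller is sketched below;
 * the device path /dev/md0 and the userspace header locations are
 * assumptions, so adjust them for the target system.
 */
#if 0	/* standalone userspace sketch -- build separately, not driver code */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/types.h>
#include <linux/raid/md_u.h>	/* mdu_array_info_t, GET_ARRAY_INFO, ... */

int main(int argc, char **argv)
{
	const char *dev = argc > 1 ? argv[1] : "/dev/md0";
	mdu_array_info_t array;
	mdu_disk_info_t disk;
	int fd = open(dev, O_RDONLY);

	if (fd < 0) {
		perror(dev);
		return 1;
	}
	if (ioctl(fd, GET_ARRAY_INFO, &array) < 0) {
		perror("GET_ARRAY_INFO");
		return 1;
	}
	printf("level %d, raid_disks %d, chunk %d bytes\n",
	       array.level, array.raid_disks, array.chunk_size);

	/* GET_DISK_INFO: the caller fills .number, the driver fills the rest */
	disk.number = 0;
	if (ioctl(fd, GET_DISK_INFO, &disk) == 0)
		printf("slot 0: dev %d:%d, state 0x%x\n",
		       disk.major, disk.minor, disk.state);
	close(fd);
	return 0;
}
#endif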
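
/*
 * Editor's illustration, not part of the driver: update_array_info() only
 * accepts one change per SET_ARRAY_INFO call, and the bitmap bit is one of
 * the few things it will change on a live array.  The hedged sketch below
 * fetches the current array info, flips only MD_SB_BITMAP_PRESENT and hands
 * the structure straight back; it can only succeed when no resync is running
 * and the superblock layout reserves room for an internal bitmap
 * (default_bitmap_offset != 0).  Function and device names are invented.
 */
#if 0	/* standalone userspace sketch -- build separately, not driver code */
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/types.h>
#include <linux/raid/md_u.h>	/* GET_ARRAY_INFO, SET_ARRAY_INFO */
#include <linux/raid/md_p.h>	/* MD_SB_BITMAP_PRESENT */

static int add_internal_bitmap(const char *dev)
{
	mdu_array_info_t info;
	int fd = open(dev, O_RDWR);

	if (fd < 0)
		return -1;
	if (ioctl(fd, GET_ARRAY_INFO, &info) < 0)
		goto fail;
	/* change exactly one thing: any second difference (size, layout,
	 * raid_disks, ...) makes update_array_info() return -EINVAL */
	info.state |= (1 << MD_SB_BITMAP_PRESENT);
	if (ioctl(fd, SET_ARRAY_INFO, &info) < 0)
		goto fail;
	close(fd);
	return 0;
fail:
	close(fd);
	return -1;
}
#endif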
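
/*
 * Editor's illustration, not part of the driver: md_getgeo() above invents a
 * 2-head, 4-sector geometry, so cylinders is simply capacity/8.  On kernels
 * of this vintage the block layer is expected to route HDIO_GETGEO to the
 * ->getgeo method, which the hedged sketch below relies on; note that
 * struct hd_geometry only carries 16 bits of cylinders, so the "BIG number
 * of cylinders" is truncated for large arrays.
 */
#if 0	/* standalone userspace sketch -- build separately, not driver code */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/hdreg.h>	/* struct hd_geometry, HDIO_GETGEO */

int main(void)
{
	struct hd_geometry geo;
	int fd = open("/dev/md0", O_RDONLY);

	if (fd < 0 || ioctl(fd, HDIO_GETGEO, &geo) < 0) {
		perror("HDIO_GETGEO");
		return 1;
	}
	printf("heads=%u sectors=%u cylinders=%u start=%lu\n",
	       geo.heads, geo.sectors, geo.cylinders, geo.start);
	close(fd);
	return 0;
}
#endif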
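
/*
 * Editor's illustration, not part of the driver: a sketch of how a
 * hypothetical personality would use the thread helpers above.  The worker
 * registered with md_register_thread() is simply re-run every time someone
 * calls md_wakeup_thread(), or after thread->timeout expires.  All demo_*
 * names below are invented for the example.
 */
#if 0	/* illustrative sketch only */
static void demo_daemon(mddev_t *mddev)
{
	/* THREAD_WAKEUP has already been cleared by md_thread();
	 * drain whatever work was queued for this array here. */
}

static int demo_run(mddev_t *mddev)
{
	/* "%s_demo" becomes e.g. "md0_demo", as with "%s_resync" above */
	mddev->thread = md_register_thread(demo_daemon, mddev, "%s_demo");
	if (!mddev->thread)
		return -ENOMEM;
	return 0;
}

static void demo_queue_work(mddev_t *mddev)
{
	/* ...put some deferred work on a list, then poke the worker: */
	md_wakeup_thread(mddev->thread);
}

static int demo_stop(mddev_t *mddev)
{
	md_unregister_thread(mddev->thread);
	mddev->thread = NULL;
	return 0;
}
#endif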
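
/*
 * Editor's illustration, not part of the driver: the percentage and ETA
 * arithmetic from status_resync() above, redone as a standalone program so
 * the scale/per_milli/finish numbers can be checked by hand.  The sample
 * figures (a ~1TB member, 30% done, ~50MB/s since the last mark) are
 * invented for the example.
 */
#if 0	/* standalone userspace sketch -- build separately, not driver code */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t max_blocks = 1000000000ULL;	/* ~1TB expressed in 1K blocks */
	uint64_t resync = max_blocks * 3 / 10;	/* 30% resynced */
	unsigned long dt = 3;			/* seconds since the last mark */
	unsigned long db = 2 * 3 * 51200;	/* sectors since the mark (~50MB/s) */
	int scale = 10;
	unsigned int per_milli;
	unsigned long rt;

	/* shift both operands down until the division is safe:
	 * (resync>>scale)*1000 must fit a sector_t, and max_blocks>>scale
	 * must fit a u32, as sector_div() requires */
	while (max_blocks / 2 > (1ULL << (scale + 32)))
		scale++;
	per_milli = (unsigned int)(((resync >> scale) * 1000) /
				   ((max_blocks >> scale) + 1));

	/* remaining time in seconds, with the same overflow-shy ordering */
	rt = (dt * (unsigned long)((max_blocks - resync) / (db / 2 / 100 + 1))) / 100;

	printf("%3u.%u%% (%llu/%llu) finish=%lu.%lumin speed=%luK/sec\n",
	       per_milli / 10, per_milli % 10,
	       (unsigned long long)resync, (unsigned long long)max_blocks,
	       rt / 60, (rt % 60) / 6, db / 2 / dt);
	return 0;
}
#endif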
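
/*
 * Editor's illustration, not part of the driver: mdstat_poll() above is what
 * lets monitoring tools sleep until something interesting happens instead of
 * re-reading /proc/mdstat in a loop.  A minimal watcher is sketched below:
 * read the file once to latch the current event count, then poll() for
 * POLLPRI and re-read from the start each time it fires.
 */
#if 0	/* standalone userspace sketch -- build separately, not driver code */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <poll.h>

int main(void)
{
	char buf[4096];
	struct pollfd pfd;
	ssize_t n;

	pfd.fd = open("/proc/mdstat", O_RDONLY);
	if (pfd.fd < 0) {
		perror("/proc/mdstat");
		return 1;
	}
	pfd.events = POLLPRI;	/* mdstat_poll() adds POLLERR|POLLPRI on new events */

	for (;;) {
		/* (re)reading from offset 0 also latches the current event count */
		lseek(pfd.fd, 0, SEEK_SET);
		while ((n = read(pfd.fd, buf, sizeof(buf))) > 0)
			write(STDOUT_FILENO, buf, n);

		/* block until an array is started/stopped, a disk fails, ... */
		if (poll(&pfd, 1, -1) < 0)
			break;
	}
	close(pfd.fd);
	return 0;
}
#endif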
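
/*
 * Editor's illustration, not part of the driver: the unsigned comparison in
 * is_mddev_idle() above implements a +/-4096 sector window around the last
 * snapshot.  The hedged sketch below replays it with made-up counter values;
 * member_busy() is an invented name, not a driver function.
 */
#if 0	/* standalone userspace sketch -- build separately, not driver code */
#include <stdio.h>

static int member_busy(unsigned long curr_events, unsigned long *last_events)
{
	/* busy only if non-resync traffic moved more than ~4096 sectors past
	 * the snapshot; because the compare is unsigned, falling up to 4096
	 * sectors *behind* (in-flight resync accounting) does not trip it */
	if ((curr_events - *last_events + 4096) > 8192) {
		*last_events = curr_events;
		return 1;
	}
	return 0;
}

int main(void)
{
	unsigned long last = 100000;

	printf("%d\n", member_busy(100000 + 100,  &last));	/* 0: tiny drift */
	printf("%d\n", member_busy(100000 - 2000, &last));	/* 0: behind, within window */
	printf("%d\n", member_busy(100000 + 9000, &last));	/* 1: real foreground IO */
	return 0;
}
#endif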