// SPDX-License-Identifier: GPL-2.0-or-later
/*
   md.c : Multiple Devices driver for Linux
     Copyright (C) 1998, 1999, 2000 Ingo Molnar

     completely rewritten, based on the MD driver code from Marc Zyngier

   Changes:

   - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
   - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
   - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
   - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
   - kmod support by: Cyrus Durgin
   - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
   - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>

   - lots of fixes and improvements to the RAID1/RAID5 and generic
     RAID code (such as request based resynchronization):

     Neil Brown <neilb@cse.unsw.edu.au>.

   - persistent bitmap code
     Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.


   Errors, Warnings, etc.
   Please use:
     pr_crit() for error conditions that risk data loss
     pr_err() for error conditions that are unexpected, like an IO error
         or internal inconsistency
     pr_warn() for error conditions that could have been predicted, like
         adding a device to an array when it has incompatible metadata
     pr_info() for interesting, very rare events, like an array starting
         or stopping, or resync starting or stopping
     pr_debug() for everything else.

*/

#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/kthread.h>
#include <linux/blkdev.h>
#include <linux/blk-integrity.h>
#include <linux/badblocks.h>
#include <linux/sysctl.h>
#include <linux/seq_file.h>
#include <linux/fs.h>
#include <linux/poll.h>
#include <linux/ctype.h>
#include <linux/string.h>
#include <linux/hdreg.h>
#include <linux/proc_fs.h>
#include <linux/random.h>
#include <linux/major.h>
#include <linux/module.h>
#include <linux/reboot.h>
#include <linux/file.h>
#include <linux/compat.h>
#include <linux/delay.h>
#include <linux/raid/md_p.h>
#include <linux/raid/md_u.h>
#include <linux/raid/detect.h>
#include <linux/slab.h>
#include <linux/percpu-refcount.h>
#include <linux/part_stat.h>

#include "md.h"
#include "md-bitmap.h"
#include "md-cluster.h"

static const char *action_name[NR_SYNC_ACTIONS] = {
	[ACTION_RESYNC]		= "resync",
	[ACTION_RECOVER]	= "recover",
	[ACTION_CHECK]		= "check",
	[ACTION_REPAIR]		= "repair",
	[ACTION_RESHAPE]	= "reshape",
	[ACTION_FROZEN]		= "frozen",
	[ACTION_IDLE]		= "idle",
};

static DEFINE_XARRAY(md_submodule);

static const struct kobj_type md_ktype;

static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
static struct workqueue_struct *md_wq;

/*
 * This workqueue is used for sync_work to register new sync_thread, and for
 * del_work to remove rdev, and for event_work that is only set by dm-raid.
 *
 * Note that sync_work will grab reconfig_mutex, hence never flush this
 * workqueue with reconfig_mutex grabbed.
 */
static struct workqueue_struct *md_misc_wq;

static int remove_and_add_spares(struct mddev *mddev,
				 struct md_rdev *this);
static void mddev_detach(struct mddev *mddev);
static void export_rdev(struct md_rdev *rdev, struct mddev *mddev);
static void md_wakeup_thread_directly(struct md_thread __rcu **thread);

/*
 * Default number of read corrections we'll attempt on an rdev
 * before ejecting it from the array. We divide the read error
 * count by 2 for every hour elapsed between read errors.
 */
#define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20
/* Default safemode delay: 200 msec */
#define DEFAULT_SAFEMODE_DELAY ((200 * HZ)/1000 +1)
/*
 * Current RAID-1,4,5,6,10 parallel reconstruction 'guaranteed speed limit'
 * is sysctl_speed_limit_min, 1000 KB/sec by default, so the extra system load
 * does not show up that much. Increase it if you want to have more guaranteed
 * speed. Note that the RAID driver will use the maximum bandwidth
 * sysctl_speed_limit_max, 200 MB/sec by default, if the IO subsystem is idle.
 *
 * Background sync IO speed control:
 *
 * - below speed min:
 *   no limit;
 * - above speed min and below speed max:
 *   a) if mddev is idle, then no limit;
 *   b) if mddev is busy handling normal IO, then limit inflight sync IO
 *      to sync_io_depth;
 * - above speed max:
 *   sync IO can't be issued;
 *
 * The following configurations can be changed via /proc/sys/dev/raid/ for the
 * whole system, or via /sys/block/mdX/md/ for one array.
 */
static int sysctl_speed_limit_min = 1000;
static int sysctl_speed_limit_max = 200000;
static int sysctl_sync_io_depth = 32;

static int speed_min(struct mddev *mddev)
{
	return mddev->sync_speed_min ?
		mddev->sync_speed_min : sysctl_speed_limit_min;
}

static int speed_max(struct mddev *mddev)
{
	return mddev->sync_speed_max ?
		mddev->sync_speed_max : sysctl_speed_limit_max;
}

static int sync_io_depth(struct mddev *mddev)
{
	return mddev->sync_io_depth ?
		mddev->sync_io_depth : sysctl_sync_io_depth;
}

static void rdev_uninit_serial(struct md_rdev *rdev)
{
	if (!test_and_clear_bit(CollisionCheck, &rdev->flags))
		return;

	kvfree(rdev->serial);
	rdev->serial = NULL;
}

static void rdevs_uninit_serial(struct mddev *mddev)
{
	struct md_rdev *rdev;

	rdev_for_each(rdev, mddev)
		rdev_uninit_serial(rdev);
}

static int rdev_init_serial(struct md_rdev *rdev)
{
	/* serial_nums equals BARRIER_BUCKETS_NR */
	int i, serial_nums = 1 << ((PAGE_SHIFT - ilog2(sizeof(atomic_t))));
	struct serial_in_rdev *serial = NULL;

	if (test_bit(CollisionCheck, &rdev->flags))
		return 0;

	serial = kvmalloc(sizeof(struct serial_in_rdev) * serial_nums,
			  GFP_KERNEL);
	if (!serial)
		return -ENOMEM;

	for (i = 0; i < serial_nums; i++) {
		struct serial_in_rdev *serial_tmp = &serial[i];

		spin_lock_init(&serial_tmp->serial_lock);
		serial_tmp->serial_rb = RB_ROOT_CACHED;
		init_waitqueue_head(&serial_tmp->serial_io_wait);
	}

	rdev->serial = serial;
	set_bit(CollisionCheck, &rdev->flags);

	return 0;
}

static int rdevs_init_serial(struct mddev *mddev)
{
	struct md_rdev *rdev;
	int ret = 0;

	rdev_for_each(rdev, mddev) {
		ret = rdev_init_serial(rdev);
		if (ret)
			break;
	}

	/* Free all resources if the pool does not exist */
	if (ret && !mddev->serial_info_pool)
		rdevs_uninit_serial(mddev);

	return ret;
}

/*
 * rdev needs to enable serialization if it meets these conditions:
 * 1. it is a multi-queue device flagged with writemostly.
 * 2. the write-behind mode is enabled.
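 *
 * These map directly onto the checks in rdev_need_serial() below:
 * bitmap_info.max_write_behind must be non-zero, the request queue must
 * report more than one hardware queue, and the rdev must have the
 * WriteMostly flag set.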
222 */ 223static int rdev_need_serial(struct md_rdev *rdev) 224{ 225 return (rdev && rdev->mddev->bitmap_info.max_write_behind > 0 && 226 rdev->bdev->bd_disk->queue->nr_hw_queues != 1 && 227 test_bit(WriteMostly, &rdev->flags)); 228} 229 230/* 231 * Init resource for rdev(s), then create serial_info_pool if: 232 * 1. rdev is the first device which return true from rdev_enable_serial. 233 * 2. rdev is NULL, means we want to enable serialization for all rdevs. 234 */ 235void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev) 236{ 237 int ret = 0; 238 239 if (rdev && !rdev_need_serial(rdev) && 240 !test_bit(CollisionCheck, &rdev->flags)) 241 return; 242 243 if (!rdev) 244 ret = rdevs_init_serial(mddev); 245 else 246 ret = rdev_init_serial(rdev); 247 if (ret) 248 return; 249 250 if (mddev->serial_info_pool == NULL) { 251 /* 252 * already in memalloc noio context by 253 * mddev_suspend() 254 */ 255 mddev->serial_info_pool = 256 mempool_create_kmalloc_pool(NR_SERIAL_INFOS, 257 sizeof(struct serial_info)); 258 if (!mddev->serial_info_pool) { 259 rdevs_uninit_serial(mddev); 260 pr_err("can't alloc memory pool for serialization\n"); 261 } 262 } 263} 264 265/* 266 * Free resource from rdev(s), and destroy serial_info_pool under conditions: 267 * 1. rdev is the last device flaged with CollisionCheck. 268 * 2. when bitmap is destroyed while policy is not enabled. 269 * 3. for disable policy, the pool is destroyed only when no rdev needs it. 270 */ 271void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev) 272{ 273 if (rdev && !test_bit(CollisionCheck, &rdev->flags)) 274 return; 275 276 if (mddev->serial_info_pool) { 277 struct md_rdev *temp; 278 int num = 0; /* used to track if other rdevs need the pool */ 279 280 rdev_for_each(temp, mddev) { 281 if (!rdev) { 282 if (!mddev->serialize_policy || 283 !rdev_need_serial(temp)) 284 rdev_uninit_serial(temp); 285 else 286 num++; 287 } else if (temp != rdev && 288 test_bit(CollisionCheck, &temp->flags)) 289 num++; 290 } 291 292 if (rdev) 293 rdev_uninit_serial(rdev); 294 295 if (num) 296 pr_info("The mempool could be used by other devices\n"); 297 else { 298 mempool_destroy(mddev->serial_info_pool); 299 mddev->serial_info_pool = NULL; 300 } 301 } 302} 303 304static struct ctl_table_header *raid_table_header; 305 306static const struct ctl_table raid_table[] = { 307 { 308 .procname = "speed_limit_min", 309 .data = &sysctl_speed_limit_min, 310 .maxlen = sizeof(int), 311 .mode = 0644, 312 .proc_handler = proc_dointvec, 313 }, 314 { 315 .procname = "speed_limit_max", 316 .data = &sysctl_speed_limit_max, 317 .maxlen = sizeof(int), 318 .mode = 0644, 319 .proc_handler = proc_dointvec, 320 }, 321 { 322 .procname = "sync_io_depth", 323 .data = &sysctl_sync_io_depth, 324 .maxlen = sizeof(int), 325 .mode = 0644, 326 .proc_handler = proc_dointvec, 327 }, 328}; 329 330static int start_readonly; 331 332/* 333 * The original mechanism for creating an md device is to create 334 * a device node in /dev and to open it. This causes races with device-close. 335 * The preferred method is to write to the "new_array" module parameter. 336 * This can avoid races. 337 * Setting create_on_open to false disables the original mechanism 338 * so all the races disappear. 
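 *
 * For example (sysfs module-parameter interface; the array name here is
 * only illustrative):
 *
 *   echo md_test > /sys/module/md_mod/parameters/new_array
 *
 * creates the array node up front, so there is no window in which a
 * concurrent close of a freshly-opened /dev node can tear the device down.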
339 */ 340static bool create_on_open = true; 341static bool legacy_async_del_gendisk = true; 342static bool check_new_feature = true; 343 344/* 345 * We have a system wide 'event count' that is incremented 346 * on any 'interesting' event, and readers of /proc/mdstat 347 * can use 'poll' or 'select' to find out when the event 348 * count increases. 349 * 350 * Events are: 351 * start array, stop array, error, add device, remove device, 352 * start build, activate spare 353 */ 354static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters); 355static atomic_t md_event_count; 356void md_new_event(void) 357{ 358 atomic_inc(&md_event_count); 359 wake_up(&md_event_waiters); 360} 361EXPORT_SYMBOL_GPL(md_new_event); 362 363/* 364 * Enables to iterate over all existing md arrays 365 * all_mddevs_lock protects this list. 366 */ 367static LIST_HEAD(all_mddevs); 368static DEFINE_SPINLOCK(all_mddevs_lock); 369 370static bool is_md_suspended(struct mddev *mddev) 371{ 372 return percpu_ref_is_dying(&mddev->active_io); 373} 374/* Rather than calling directly into the personality make_request function, 375 * IO requests come here first so that we can check if the device is 376 * being suspended pending a reconfiguration. 377 * We hold a refcount over the call to ->make_request. By the time that 378 * call has finished, the bio has been linked into some internal structure 379 * and so is visible to ->quiesce(), so we don't need the refcount any more. 380 */ 381static bool is_suspended(struct mddev *mddev, struct bio *bio) 382{ 383 if (is_md_suspended(mddev)) 384 return true; 385 if (bio_data_dir(bio) != WRITE) 386 return false; 387 if (READ_ONCE(mddev->suspend_lo) >= READ_ONCE(mddev->suspend_hi)) 388 return false; 389 if (bio->bi_iter.bi_sector >= READ_ONCE(mddev->suspend_hi)) 390 return false; 391 if (bio_end_sector(bio) < READ_ONCE(mddev->suspend_lo)) 392 return false; 393 return true; 394} 395 396bool md_handle_request(struct mddev *mddev, struct bio *bio) 397{ 398check_suspended: 399 if (is_suspended(mddev, bio)) { 400 DEFINE_WAIT(__wait); 401 /* Bail out if REQ_NOWAIT is set for the bio */ 402 if (bio->bi_opf & REQ_NOWAIT) { 403 bio_wouldblock_error(bio); 404 return true; 405 } 406 for (;;) { 407 prepare_to_wait(&mddev->sb_wait, &__wait, 408 TASK_UNINTERRUPTIBLE); 409 if (!is_suspended(mddev, bio)) 410 break; 411 schedule(); 412 } 413 finish_wait(&mddev->sb_wait, &__wait); 414 } 415 if (!percpu_ref_tryget_live(&mddev->active_io)) 416 goto check_suspended; 417 418 if (!mddev->pers->make_request(mddev, bio)) { 419 percpu_ref_put(&mddev->active_io); 420 if (!mddev->gendisk && mddev->pers->prepare_suspend) 421 return false; 422 goto check_suspended; 423 } 424 425 percpu_ref_put(&mddev->active_io); 426 return true; 427} 428EXPORT_SYMBOL(md_handle_request); 429 430static void md_submit_bio(struct bio *bio) 431{ 432 const int rw = bio_data_dir(bio); 433 struct mddev *mddev = bio->bi_bdev->bd_disk->private_data; 434 435 if (mddev == NULL || mddev->pers == NULL) { 436 bio_io_error(bio); 437 return; 438 } 439 440 if (unlikely(test_bit(MD_BROKEN, &mddev->flags)) && (rw == WRITE)) { 441 bio_io_error(bio); 442 return; 443 } 444 445 bio = bio_split_to_limits(bio); 446 if (!bio) 447 return; 448 449 if (mddev->ro == MD_RDONLY && unlikely(rw == WRITE)) { 450 if (bio_sectors(bio) != 0) 451 bio->bi_status = BLK_STS_IOERR; 452 bio_endio(bio); 453 return; 454 } 455 456 /* bio could be mergeable after passing to underlayer */ 457 bio->bi_opf &= ~REQ_NOMERGE; 458 459 md_handle_request(mddev, bio); 460} 461 462/* 463 * Make sure no new 
requests are submitted to the device, and any requests that 464 * have been submitted are completely handled. 465 */ 466int mddev_suspend(struct mddev *mddev, bool interruptible) 467{ 468 int err = 0; 469 470 /* 471 * hold reconfig_mutex to wait for normal io will deadlock, because 472 * other context can't update super_block, and normal io can rely on 473 * updating super_block. 474 */ 475 lockdep_assert_not_held(&mddev->reconfig_mutex); 476 477 if (interruptible) 478 err = mutex_lock_interruptible(&mddev->suspend_mutex); 479 else 480 mutex_lock(&mddev->suspend_mutex); 481 if (err) 482 return err; 483 484 if (mddev->suspended) { 485 WRITE_ONCE(mddev->suspended, mddev->suspended + 1); 486 mutex_unlock(&mddev->suspend_mutex); 487 return 0; 488 } 489 490 percpu_ref_kill(&mddev->active_io); 491 if (interruptible) 492 err = wait_event_interruptible(mddev->sb_wait, 493 percpu_ref_is_zero(&mddev->active_io)); 494 else 495 wait_event(mddev->sb_wait, 496 percpu_ref_is_zero(&mddev->active_io)); 497 if (err) { 498 percpu_ref_resurrect(&mddev->active_io); 499 mutex_unlock(&mddev->suspend_mutex); 500 return err; 501 } 502 503 /* 504 * For raid456, io might be waiting for reshape to make progress, 505 * allow new reshape to start while waiting for io to be done to 506 * prevent deadlock. 507 */ 508 WRITE_ONCE(mddev->suspended, mddev->suspended + 1); 509 510 /* restrict memory reclaim I/O during raid array is suspend */ 511 mddev->noio_flag = memalloc_noio_save(); 512 513 mutex_unlock(&mddev->suspend_mutex); 514 return 0; 515} 516EXPORT_SYMBOL_GPL(mddev_suspend); 517 518static void __mddev_resume(struct mddev *mddev, bool recovery_needed) 519{ 520 lockdep_assert_not_held(&mddev->reconfig_mutex); 521 522 mutex_lock(&mddev->suspend_mutex); 523 WRITE_ONCE(mddev->suspended, mddev->suspended - 1); 524 if (mddev->suspended) { 525 mutex_unlock(&mddev->suspend_mutex); 526 return; 527 } 528 529 /* entred the memalloc scope from mddev_suspend() */ 530 memalloc_noio_restore(mddev->noio_flag); 531 532 percpu_ref_resurrect(&mddev->active_io); 533 wake_up(&mddev->sb_wait); 534 535 if (recovery_needed) 536 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 537 md_wakeup_thread(mddev->thread); 538 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ 539 540 mutex_unlock(&mddev->suspend_mutex); 541} 542 543void mddev_resume(struct mddev *mddev) 544{ 545 return __mddev_resume(mddev, true); 546} 547EXPORT_SYMBOL_GPL(mddev_resume); 548 549/* sync bdev before setting device to readonly or stopping raid*/ 550static int mddev_set_closing_and_sync_blockdev(struct mddev *mddev, int opener_num) 551{ 552 mutex_lock(&mddev->open_mutex); 553 if (mddev->pers && atomic_read(&mddev->openers) > opener_num) { 554 mutex_unlock(&mddev->open_mutex); 555 return -EBUSY; 556 } 557 if (test_and_set_bit(MD_CLOSING, &mddev->flags)) { 558 mutex_unlock(&mddev->open_mutex); 559 return -EBUSY; 560 } 561 mutex_unlock(&mddev->open_mutex); 562 563 sync_blockdev(mddev->gendisk->part0); 564 return 0; 565} 566 567/* 568 * The only difference from bio_chain_endio() is that the current 569 * bi_status of bio does not affect the bi_status of parent. 570 */ 571static void md_end_flush(struct bio *bio) 572{ 573 struct bio *parent = bio->bi_private; 574 575 /* 576 * If any flush io error before the power failure, 577 * disk data may be lost. 
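	 * The failure is therefore only logged; unlike bio_chain_endio(),
	 * the child's bi_status is deliberately not copied into the parent
	 * bio (see the comment above md_end_flush()).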
578 */ 579 if (bio->bi_status) 580 pr_err("md: %pg flush io error %d\n", bio->bi_bdev, 581 blk_status_to_errno(bio->bi_status)); 582 583 bio_put(bio); 584 bio_endio(parent); 585} 586 587bool md_flush_request(struct mddev *mddev, struct bio *bio) 588{ 589 struct md_rdev *rdev; 590 struct bio *new; 591 592 /* 593 * md_flush_reqeust() should be called under md_handle_request() and 594 * 'active_io' is already grabbed. Hence it's safe to get rdev directly 595 * without rcu protection. 596 */ 597 WARN_ON(percpu_ref_is_zero(&mddev->active_io)); 598 599 rdev_for_each(rdev, mddev) { 600 if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags)) 601 continue; 602 603 new = bio_alloc_bioset(rdev->bdev, 0, 604 REQ_OP_WRITE | REQ_PREFLUSH, GFP_NOIO, 605 &mddev->bio_set); 606 new->bi_private = bio; 607 new->bi_end_io = md_end_flush; 608 bio_inc_remaining(bio); 609 submit_bio(new); 610 } 611 612 if (bio_sectors(bio) == 0) { 613 bio_endio(bio); 614 return true; 615 } 616 617 bio->bi_opf &= ~REQ_PREFLUSH; 618 return false; 619} 620EXPORT_SYMBOL(md_flush_request); 621 622static inline struct mddev *mddev_get(struct mddev *mddev) 623{ 624 lockdep_assert_held(&all_mddevs_lock); 625 626 if (test_bit(MD_DELETED, &mddev->flags)) 627 return NULL; 628 atomic_inc(&mddev->active); 629 return mddev; 630} 631 632static void mddev_delayed_delete(struct work_struct *ws); 633 634static void __mddev_put(struct mddev *mddev) 635{ 636 if (mddev->raid_disks || !list_empty(&mddev->disks) || 637 mddev->ctime || mddev->hold_active) 638 return; 639 640 /* 641 * If array is freed by stopping array, MD_DELETED is set by 642 * do_md_stop(), MD_DELETED is still set here in case mddev is freed 643 * directly by closing a mddev that is created by create_on_open. 644 */ 645 set_bit(MD_DELETED, &mddev->flags); 646 /* 647 * Call queue_work inside the spinlock so that flush_workqueue() after 648 * mddev_find will succeed in waiting for the work to be done. 
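	 * (all_mddevs_lock is held by the caller at this point: both
	 * mddev_put() and mddev_put_locked() invoke __mddev_put() under that
	 * lock, so the del_work is queued before the lock is released.)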
649 */ 650 queue_work(md_misc_wq, &mddev->del_work); 651} 652 653static void mddev_put_locked(struct mddev *mddev) 654{ 655 if (atomic_dec_and_test(&mddev->active)) 656 __mddev_put(mddev); 657} 658 659void mddev_put(struct mddev *mddev) 660{ 661 if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) 662 return; 663 664 __mddev_put(mddev); 665 spin_unlock(&all_mddevs_lock); 666} 667 668static void md_safemode_timeout(struct timer_list *t); 669static void md_start_sync(struct work_struct *ws); 670 671static void active_io_release(struct percpu_ref *ref) 672{ 673 struct mddev *mddev = container_of(ref, struct mddev, active_io); 674 675 wake_up(&mddev->sb_wait); 676} 677 678static void no_op(struct percpu_ref *r) {} 679 680static bool mddev_set_bitmap_ops(struct mddev *mddev) 681{ 682 struct bitmap_operations *old = mddev->bitmap_ops; 683 struct md_submodule_head *head; 684 685 if (mddev->bitmap_id == ID_BITMAP_NONE || 686 (old && old->head.id == mddev->bitmap_id)) 687 return true; 688 689 xa_lock(&md_submodule); 690 head = xa_load(&md_submodule, mddev->bitmap_id); 691 692 if (!head) { 693 pr_warn("md: can't find bitmap id %d\n", mddev->bitmap_id); 694 goto err; 695 } 696 697 if (head->type != MD_BITMAP) { 698 pr_warn("md: invalid bitmap id %d\n", mddev->bitmap_id); 699 goto err; 700 } 701 702 mddev->bitmap_ops = (void *)head; 703 xa_unlock(&md_submodule); 704 705 if (!mddev_is_dm(mddev) && mddev->bitmap_ops->group) { 706 if (sysfs_create_group(&mddev->kobj, mddev->bitmap_ops->group)) 707 pr_warn("md: cannot register extra bitmap attributes for %s\n", 708 mdname(mddev)); 709 else 710 /* 711 * Inform user with KOBJ_CHANGE about new bitmap 712 * attributes. 713 */ 714 kobject_uevent(&mddev->kobj, KOBJ_CHANGE); 715 } 716 return true; 717 718err: 719 xa_unlock(&md_submodule); 720 return false; 721} 722 723static void mddev_clear_bitmap_ops(struct mddev *mddev) 724{ 725 if (!mddev_is_dm(mddev) && mddev->bitmap_ops && 726 mddev->bitmap_ops->group) 727 sysfs_remove_group(&mddev->kobj, mddev->bitmap_ops->group); 728 729 mddev->bitmap_ops = NULL; 730} 731 732int mddev_init(struct mddev *mddev) 733{ 734 int err = 0; 735 736 if (!IS_ENABLED(CONFIG_MD_BITMAP)) 737 mddev->bitmap_id = ID_BITMAP_NONE; 738 else 739 mddev->bitmap_id = ID_BITMAP; 740 741 if (percpu_ref_init(&mddev->active_io, active_io_release, 742 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) 743 return -ENOMEM; 744 745 if (percpu_ref_init(&mddev->writes_pending, no_op, 746 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) { 747 err = -ENOMEM; 748 goto exit_acitve_io; 749 } 750 751 err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); 752 if (err) 753 goto exit_writes_pending; 754 755 err = bioset_init(&mddev->sync_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); 756 if (err) 757 goto exit_bio_set; 758 759 err = bioset_init(&mddev->io_clone_set, BIO_POOL_SIZE, 760 offsetof(struct md_io_clone, bio_clone), 0); 761 if (err) 762 goto exit_sync_set; 763 764 /* We want to start with the refcount at zero */ 765 percpu_ref_put(&mddev->writes_pending); 766 767 mutex_init(&mddev->open_mutex); 768 mutex_init(&mddev->reconfig_mutex); 769 mutex_init(&mddev->suspend_mutex); 770 mutex_init(&mddev->bitmap_info.mutex); 771 INIT_LIST_HEAD(&mddev->disks); 772 INIT_LIST_HEAD(&mddev->all_mddevs); 773 INIT_LIST_HEAD(&mddev->deleting); 774 timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0); 775 atomic_set(&mddev->active, 1); 776 atomic_set(&mddev->openers, 0); 777 atomic_set(&mddev->sync_seq, 0); 778 spin_lock_init(&mddev->lock); 779 
init_waitqueue_head(&mddev->sb_wait); 780 init_waitqueue_head(&mddev->recovery_wait); 781 mddev->reshape_position = MaxSector; 782 mddev->reshape_backwards = 0; 783 mddev->last_sync_action = ACTION_IDLE; 784 mddev->resync_min = 0; 785 mddev->resync_max = MaxSector; 786 mddev->level = LEVEL_NONE; 787 788 INIT_WORK(&mddev->sync_work, md_start_sync); 789 INIT_WORK(&mddev->del_work, mddev_delayed_delete); 790 791 return 0; 792 793exit_sync_set: 794 bioset_exit(&mddev->sync_set); 795exit_bio_set: 796 bioset_exit(&mddev->bio_set); 797exit_writes_pending: 798 percpu_ref_exit(&mddev->writes_pending); 799exit_acitve_io: 800 percpu_ref_exit(&mddev->active_io); 801 return err; 802} 803EXPORT_SYMBOL_GPL(mddev_init); 804 805void mddev_destroy(struct mddev *mddev) 806{ 807 bioset_exit(&mddev->bio_set); 808 bioset_exit(&mddev->sync_set); 809 bioset_exit(&mddev->io_clone_set); 810 percpu_ref_exit(&mddev->active_io); 811 percpu_ref_exit(&mddev->writes_pending); 812} 813EXPORT_SYMBOL_GPL(mddev_destroy); 814 815static struct mddev *mddev_find_locked(dev_t unit) 816{ 817 struct mddev *mddev; 818 819 list_for_each_entry(mddev, &all_mddevs, all_mddevs) 820 if (mddev->unit == unit) 821 return mddev; 822 823 return NULL; 824} 825 826/* find an unused unit number */ 827static dev_t mddev_alloc_unit(void) 828{ 829 static int next_minor = 512; 830 int start = next_minor; 831 bool is_free = 0; 832 dev_t dev = 0; 833 834 while (!is_free) { 835 dev = MKDEV(MD_MAJOR, next_minor); 836 next_minor++; 837 if (next_minor > MINORMASK) 838 next_minor = 0; 839 if (next_minor == start) 840 return 0; /* Oh dear, all in use. */ 841 is_free = !mddev_find_locked(dev); 842 } 843 844 return dev; 845} 846 847static struct mddev *mddev_alloc(dev_t unit) 848{ 849 struct mddev *new; 850 int error; 851 852 if (unit && MAJOR(unit) != MD_MAJOR) 853 unit &= ~((1 << MdpMinorShift) - 1); 854 855 new = kzalloc(sizeof(*new), GFP_KERNEL); 856 if (!new) 857 return ERR_PTR(-ENOMEM); 858 859 error = mddev_init(new); 860 if (error) 861 goto out_free_new; 862 863 spin_lock(&all_mddevs_lock); 864 if (unit) { 865 error = -EEXIST; 866 if (mddev_find_locked(unit)) 867 goto out_destroy_new; 868 new->unit = unit; 869 if (MAJOR(unit) == MD_MAJOR) 870 new->md_minor = MINOR(unit); 871 else 872 new->md_minor = MINOR(unit) >> MdpMinorShift; 873 new->hold_active = UNTIL_IOCTL; 874 } else { 875 error = -ENODEV; 876 new->unit = mddev_alloc_unit(); 877 if (!new->unit) 878 goto out_destroy_new; 879 new->md_minor = MINOR(new->unit); 880 new->hold_active = UNTIL_STOP; 881 } 882 883 list_add(&new->all_mddevs, &all_mddevs); 884 spin_unlock(&all_mddevs_lock); 885 return new; 886 887out_destroy_new: 888 spin_unlock(&all_mddevs_lock); 889 mddev_destroy(new); 890out_free_new: 891 kfree(new); 892 return ERR_PTR(error); 893} 894 895static void mddev_free(struct mddev *mddev) 896{ 897 spin_lock(&all_mddevs_lock); 898 list_del(&mddev->all_mddevs); 899 spin_unlock(&all_mddevs_lock); 900 901 mddev_destroy(mddev); 902 kfree(mddev); 903} 904 905static const struct attribute_group md_redundancy_group; 906 907void mddev_unlock(struct mddev *mddev) 908{ 909 struct md_rdev *rdev; 910 struct md_rdev *tmp; 911 LIST_HEAD(delete); 912 913 if (!list_empty(&mddev->deleting)) 914 list_splice_init(&mddev->deleting, &delete); 915 916 if (mddev->to_remove) { 917 /* These cannot be removed under reconfig_mutex as 918 * an access to the files will try to take reconfig_mutex 919 * while holding the file unremovable, which leads to 920 * a deadlock. 
921 * So hold set sysfs_active while the remove in happeing, 922 * and anything else which might set ->to_remove or my 923 * otherwise change the sysfs namespace will fail with 924 * -EBUSY if sysfs_active is still set. 925 * We set sysfs_active under reconfig_mutex and elsewhere 926 * test it under the same mutex to ensure its correct value 927 * is seen. 928 */ 929 const struct attribute_group *to_remove = mddev->to_remove; 930 mddev->to_remove = NULL; 931 mddev->sysfs_active = 1; 932 mutex_unlock(&mddev->reconfig_mutex); 933 934 if (mddev->kobj.sd) { 935 if (to_remove != &md_redundancy_group) 936 sysfs_remove_group(&mddev->kobj, to_remove); 937 if (mddev->pers == NULL || 938 mddev->pers->sync_request == NULL) { 939 sysfs_remove_group(&mddev->kobj, &md_redundancy_group); 940 if (mddev->sysfs_action) 941 sysfs_put(mddev->sysfs_action); 942 if (mddev->sysfs_completed) 943 sysfs_put(mddev->sysfs_completed); 944 if (mddev->sysfs_degraded) 945 sysfs_put(mddev->sysfs_degraded); 946 mddev->sysfs_action = NULL; 947 mddev->sysfs_completed = NULL; 948 mddev->sysfs_degraded = NULL; 949 } 950 } 951 mddev->sysfs_active = 0; 952 } else 953 mutex_unlock(&mddev->reconfig_mutex); 954 955 md_wakeup_thread(mddev->thread); 956 wake_up(&mddev->sb_wait); 957 958 list_for_each_entry_safe(rdev, tmp, &delete, same_set) { 959 list_del_init(&rdev->same_set); 960 kobject_del(&rdev->kobj); 961 export_rdev(rdev, mddev); 962 } 963 964 if (!legacy_async_del_gendisk) { 965 /* 966 * Call del_gendisk after release reconfig_mutex to avoid 967 * deadlock (e.g. call del_gendisk under the lock and an 968 * access to sysfs files waits the lock) 969 * And MD_DELETED is only used for md raid which is set in 970 * do_md_stop. dm raid only uses md_stop to stop. So dm raid 971 * doesn't need to check MD_DELETED when getting reconfig lock 972 */ 973 if (test_bit(MD_DELETED, &mddev->flags) && 974 !test_and_set_bit(MD_DO_DELETE, &mddev->flags)) { 975 kobject_del(&mddev->kobj); 976 del_gendisk(mddev->gendisk); 977 } 978 } 979} 980EXPORT_SYMBOL_GPL(mddev_unlock); 981 982struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr) 983{ 984 struct md_rdev *rdev; 985 986 rdev_for_each_rcu(rdev, mddev) 987 if (rdev->desc_nr == nr) 988 return rdev; 989 990 return NULL; 991} 992EXPORT_SYMBOL_GPL(md_find_rdev_nr_rcu); 993 994static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev) 995{ 996 struct md_rdev *rdev; 997 998 rdev_for_each(rdev, mddev) 999 if (rdev->bdev->bd_dev == dev) 1000 return rdev; 1001 1002 return NULL; 1003} 1004 1005struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev) 1006{ 1007 struct md_rdev *rdev; 1008 1009 rdev_for_each_rcu(rdev, mddev) 1010 if (rdev->bdev->bd_dev == dev) 1011 return rdev; 1012 1013 return NULL; 1014} 1015EXPORT_SYMBOL_GPL(md_find_rdev_rcu); 1016 1017static struct md_personality *get_pers(int level, char *clevel) 1018{ 1019 struct md_personality *ret = NULL; 1020 struct md_submodule_head *head; 1021 unsigned long i; 1022 1023 xa_lock(&md_submodule); 1024 xa_for_each(&md_submodule, i, head) { 1025 if (head->type != MD_PERSONALITY) 1026 continue; 1027 if ((level != LEVEL_NONE && head->id == level) || 1028 !strcmp(head->name, clevel)) { 1029 if (try_module_get(head->owner)) 1030 ret = (void *)head; 1031 break; 1032 } 1033 } 1034 xa_unlock(&md_submodule); 1035 1036 if (!ret) { 1037 if (level != LEVEL_NONE) 1038 pr_warn("md: personality for level %d is not loaded!\n", 1039 level); 1040 else 1041 pr_warn("md: personality for level %s is not loaded!\n", 1042 clevel); 1043 } 1044 1045 
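	/*
	 * Hypothetical usage sketch (not an actual caller in this file): a
	 * non-NULL return carries a module reference that must be balanced
	 * with put_pers() once the personality is no longer needed, e.g.:
	 *
	 *	struct md_personality *pers = get_pers(LEVEL_NONE, "raid1");
	 *
	 *	if (pers) {
	 *		// ... bind pers to an mddev ...
	 *		put_pers(pers);
	 *	}
	 */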
return ret; 1046} 1047 1048static void put_pers(struct md_personality *pers) 1049{ 1050 module_put(pers->head.owner); 1051} 1052 1053/* return the offset of the super block in 512byte sectors */ 1054static inline sector_t calc_dev_sboffset(struct md_rdev *rdev) 1055{ 1056 return MD_NEW_SIZE_SECTORS(bdev_nr_sectors(rdev->bdev)); 1057} 1058 1059static int alloc_disk_sb(struct md_rdev *rdev) 1060{ 1061 rdev->sb_page = alloc_page(GFP_KERNEL); 1062 if (!rdev->sb_page) 1063 return -ENOMEM; 1064 return 0; 1065} 1066 1067void md_rdev_clear(struct md_rdev *rdev) 1068{ 1069 if (rdev->sb_page) { 1070 put_page(rdev->sb_page); 1071 rdev->sb_loaded = 0; 1072 rdev->sb_page = NULL; 1073 rdev->sb_start = 0; 1074 rdev->sectors = 0; 1075 } 1076 if (rdev->bb_page) { 1077 put_page(rdev->bb_page); 1078 rdev->bb_page = NULL; 1079 } 1080 badblocks_exit(&rdev->badblocks); 1081} 1082EXPORT_SYMBOL_GPL(md_rdev_clear); 1083 1084static void super_written(struct bio *bio) 1085{ 1086 struct md_rdev *rdev = bio->bi_private; 1087 struct mddev *mddev = rdev->mddev; 1088 1089 if (bio->bi_status) { 1090 pr_err("md: %s gets error=%d\n", __func__, 1091 blk_status_to_errno(bio->bi_status)); 1092 md_error(mddev, rdev); 1093 if (!test_bit(Faulty, &rdev->flags) 1094 && (bio->bi_opf & MD_FAILFAST)) { 1095 set_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags); 1096 set_bit(LastDev, &rdev->flags); 1097 } 1098 } else 1099 clear_bit(LastDev, &rdev->flags); 1100 1101 bio_put(bio); 1102 1103 rdev_dec_pending(rdev, mddev); 1104 1105 if (atomic_dec_and_test(&mddev->pending_writes)) 1106 wake_up(&mddev->sb_wait); 1107} 1108 1109/** 1110 * md_write_metadata - write metadata to underlying disk, including 1111 * array superblock, badblocks, bitmap superblock and bitmap bits. 1112 * @mddev: the array to write 1113 * @rdev: the underlying disk to write 1114 * @sector: the offset to @rdev 1115 * @size: the length of the metadata 1116 * @page: the metadata 1117 * @offset: the offset to @page 1118 * 1119 * Write @size bytes of @page start from @offset, to @sector of @rdev, Increment 1120 * mddev->pending_writes before returning, and decrement it on completion, 1121 * waking up sb_wait. Caller must call md_super_wait() after issuing io to all 1122 * rdev. If an error occurred, md_error() will be called, and the @rdev will be 1123 * kicked out from @mddev. 1124 */ 1125void md_write_metadata(struct mddev *mddev, struct md_rdev *rdev, 1126 sector_t sector, int size, struct page *page, 1127 unsigned int offset) 1128{ 1129 struct bio *bio; 1130 1131 if (!page) 1132 return; 1133 1134 if (test_bit(Faulty, &rdev->flags)) 1135 return; 1136 1137 bio = bio_alloc_bioset(rdev->meta_bdev ? 
rdev->meta_bdev : rdev->bdev, 1138 1, 1139 REQ_OP_WRITE | REQ_SYNC | REQ_IDLE | REQ_META 1140 | REQ_PREFLUSH | REQ_FUA, 1141 GFP_NOIO, &mddev->sync_set); 1142 1143 atomic_inc(&rdev->nr_pending); 1144 1145 bio->bi_iter.bi_sector = sector; 1146 __bio_add_page(bio, page, size, offset); 1147 bio->bi_private = rdev; 1148 bio->bi_end_io = super_written; 1149 1150 if (test_bit(MD_FAILFAST_SUPPORTED, &mddev->flags) && 1151 test_bit(FailFast, &rdev->flags) && 1152 !test_bit(LastDev, &rdev->flags)) 1153 bio->bi_opf |= MD_FAILFAST; 1154 1155 atomic_inc(&mddev->pending_writes); 1156 submit_bio(bio); 1157} 1158 1159int md_super_wait(struct mddev *mddev) 1160{ 1161 /* wait for all superblock writes that were scheduled to complete */ 1162 wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0); 1163 if (test_and_clear_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags)) 1164 return -EAGAIN; 1165 return 0; 1166} 1167 1168int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, 1169 struct page *page, blk_opf_t opf, bool metadata_op) 1170{ 1171 struct bio bio; 1172 struct bio_vec bvec; 1173 1174 if (metadata_op && rdev->meta_bdev) 1175 bio_init(&bio, rdev->meta_bdev, &bvec, 1, opf); 1176 else 1177 bio_init(&bio, rdev->bdev, &bvec, 1, opf); 1178 1179 if (metadata_op) 1180 bio.bi_iter.bi_sector = sector + rdev->sb_start; 1181 else if (rdev->mddev->reshape_position != MaxSector && 1182 (rdev->mddev->reshape_backwards == 1183 (sector >= rdev->mddev->reshape_position))) 1184 bio.bi_iter.bi_sector = sector + rdev->new_data_offset; 1185 else 1186 bio.bi_iter.bi_sector = sector + rdev->data_offset; 1187 __bio_add_page(&bio, page, size, 0); 1188 1189 submit_bio_wait(&bio); 1190 1191 return !bio.bi_status; 1192} 1193EXPORT_SYMBOL_GPL(sync_page_io); 1194 1195static int read_disk_sb(struct md_rdev *rdev, int size) 1196{ 1197 if (rdev->sb_loaded) 1198 return 0; 1199 1200 if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, true)) 1201 goto fail; 1202 rdev->sb_loaded = 1; 1203 return 0; 1204 1205fail: 1206 pr_err("md: disabled device %pg, could not read superblock.\n", 1207 rdev->bdev); 1208 return -EINVAL; 1209} 1210 1211static int md_uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2) 1212{ 1213 return sb1->set_uuid0 == sb2->set_uuid0 && 1214 sb1->set_uuid1 == sb2->set_uuid1 && 1215 sb1->set_uuid2 == sb2->set_uuid2 && 1216 sb1->set_uuid3 == sb2->set_uuid3; 1217} 1218 1219static int md_sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) 1220{ 1221 int ret; 1222 mdp_super_t *tmp1, *tmp2; 1223 1224 tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL); 1225 tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL); 1226 1227 if (!tmp1 || !tmp2) { 1228 ret = 0; 1229 goto abort; 1230 } 1231 1232 *tmp1 = *sb1; 1233 *tmp2 = *sb2; 1234 1235 /* 1236 * nr_disks is not constant 1237 */ 1238 tmp1->nr_disks = 0; 1239 tmp2->nr_disks = 0; 1240 1241 ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0); 1242abort: 1243 kfree(tmp1); 1244 kfree(tmp2); 1245 return ret; 1246} 1247 1248static u32 md_csum_fold(u32 csum) 1249{ 1250 csum = (csum & 0xffff) + (csum >> 16); 1251 return (csum & 0xffff) + (csum >> 16); 1252} 1253 1254static unsigned int calc_sb_csum(mdp_super_t *sb) 1255{ 1256 u64 newcsum = 0; 1257 u32 *sb32 = (u32*)sb; 1258 int i; 1259 unsigned int disk_csum, csum; 1260 1261 disk_csum = sb->sb_csum; 1262 sb->sb_csum = 0; 1263 1264 for (i = 0; i < MD_SB_BYTES/4 ; i++) 1265 newcsum += sb32[i]; 1266 csum = (newcsum & 0xffffffff) + (newcsum>>32); 1267 1268#ifdef CONFIG_ALPHA 1269 /* This used to use csum_partial, which was wrong for several 
1270 * reasons including that different results are returned on 1271 * different architectures. It isn't critical that we get exactly 1272 * the same return value as before (we always csum_fold before 1273 * testing, and that removes any differences). However as we 1274 * know that csum_partial always returned a 16bit value on 1275 * alphas, do a fold to maximise conformity to previous behaviour. 1276 */ 1277 sb->sb_csum = md_csum_fold(disk_csum); 1278#else 1279 sb->sb_csum = disk_csum; 1280#endif 1281 return csum; 1282} 1283 1284/* 1285 * Handle superblock details. 1286 * We want to be able to handle multiple superblock formats 1287 * so we have a common interface to them all, and an array of 1288 * different handlers. 1289 * We rely on user-space to write the initial superblock, and support 1290 * reading and updating of superblocks. 1291 * Interface methods are: 1292 * int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version) 1293 * loads and validates a superblock on dev. 1294 * if refdev != NULL, compare superblocks on both devices 1295 * Return: 1296 * 0 - dev has a superblock that is compatible with refdev 1297 * 1 - dev has a superblock that is compatible and newer than refdev 1298 * so dev should be used as the refdev in future 1299 * -EINVAL superblock incompatible or invalid 1300 * -othererror e.g. -EIO 1301 * 1302 * int validate_super(struct mddev *mddev, struct md_rdev *dev) 1303 * Verify that dev is acceptable into mddev. 1304 * The first time, mddev->raid_disks will be 0, and data from 1305 * dev should be merged in. Subsequent calls check that dev 1306 * is new enough. Return 0 or -EINVAL 1307 * 1308 * void sync_super(struct mddev *mddev, struct md_rdev *dev) 1309 * Update the superblock for rdev with data in mddev 1310 * This does not write to disc. 1311 * 1312 */ 1313 1314struct super_type { 1315 char *name; 1316 struct module *owner; 1317 int (*load_super)(struct md_rdev *rdev, 1318 struct md_rdev *refdev, 1319 int minor_version); 1320 int (*validate_super)(struct mddev *mddev, 1321 struct md_rdev *freshest, 1322 struct md_rdev *rdev); 1323 void (*sync_super)(struct mddev *mddev, 1324 struct md_rdev *rdev); 1325 unsigned long long (*rdev_size_change)(struct md_rdev *rdev, 1326 sector_t num_sectors); 1327 int (*allow_new_offset)(struct md_rdev *rdev, 1328 unsigned long long new_offset); 1329}; 1330 1331/* 1332 * Check that the given mddev has no bitmap. 1333 * 1334 * This function is called from the run method of all personalities that do not 1335 * support bitmaps. It prints an error message and returns non-zero if mddev 1336 * has a bitmap. Otherwise, it returns 0. 1337 * 1338 */ 1339int md_check_no_bitmap(struct mddev *mddev) 1340{ 1341 if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset) 1342 return 0; 1343 pr_warn("%s: bitmaps are not supported for %s\n", 1344 mdname(mddev), mddev->pers->head.name); 1345 return 1; 1346} 1347EXPORT_SYMBOL(md_check_no_bitmap); 1348 1349/* 1350 * load_super for 0.90.0 1351 */ 1352static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version) 1353{ 1354 mdp_super_t *sb; 1355 int ret; 1356 bool spare_disk = true; 1357 1358 /* 1359 * Calculate the position of the superblock (512byte sectors), 1360 * it's at the end of the disk. 1361 * 1362 * It also happens to be a multiple of 4Kb. 
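	 *
	 * Concretely, calc_dev_sboffset() rounds the device size down with
	 * MD_NEW_SIZE_SECTORS() and steps back one reserved chunk (64K in
	 * the historical 0.90 layout as defined in md_p.h, not shown here),
	 * which is why the resulting offset is also 4K-aligned.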
1363 */ 1364 rdev->sb_start = calc_dev_sboffset(rdev); 1365 1366 ret = read_disk_sb(rdev, MD_SB_BYTES); 1367 if (ret) 1368 return ret; 1369 1370 ret = -EINVAL; 1371 1372 sb = page_address(rdev->sb_page); 1373 1374 if (sb->md_magic != MD_SB_MAGIC) { 1375 pr_warn("md: invalid raid superblock magic on %pg\n", 1376 rdev->bdev); 1377 goto abort; 1378 } 1379 1380 if (sb->major_version != 0 || 1381 sb->minor_version < 90 || 1382 sb->minor_version > 91) { 1383 pr_warn("Bad version number %d.%d on %pg\n", 1384 sb->major_version, sb->minor_version, rdev->bdev); 1385 goto abort; 1386 } 1387 1388 if (sb->raid_disks <= 0) 1389 goto abort; 1390 1391 if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) { 1392 pr_warn("md: invalid superblock checksum on %pg\n", rdev->bdev); 1393 goto abort; 1394 } 1395 1396 rdev->preferred_minor = sb->md_minor; 1397 rdev->data_offset = 0; 1398 rdev->new_data_offset = 0; 1399 rdev->sb_size = MD_SB_BYTES; 1400 rdev->badblocks.shift = -1; 1401 1402 rdev->desc_nr = sb->this_disk.number; 1403 1404 /* not spare disk */ 1405 if (rdev->desc_nr >= 0 && rdev->desc_nr < MD_SB_DISKS && 1406 sb->disks[rdev->desc_nr].state & ((1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE))) 1407 spare_disk = false; 1408 1409 if (!refdev) { 1410 if (!spare_disk) 1411 ret = 1; 1412 else 1413 ret = 0; 1414 } else { 1415 __u64 ev1, ev2; 1416 mdp_super_t *refsb = page_address(refdev->sb_page); 1417 if (!md_uuid_equal(refsb, sb)) { 1418 pr_warn("md: %pg has different UUID to %pg\n", 1419 rdev->bdev, refdev->bdev); 1420 goto abort; 1421 } 1422 if (!md_sb_equal(refsb, sb)) { 1423 pr_warn("md: %pg has same UUID but different superblock to %pg\n", 1424 rdev->bdev, refdev->bdev); 1425 goto abort; 1426 } 1427 ev1 = md_event(sb); 1428 ev2 = md_event(refsb); 1429 1430 if (!spare_disk && ev1 > ev2) 1431 ret = 1; 1432 else 1433 ret = 0; 1434 } 1435 rdev->sectors = rdev->sb_start; 1436 /* Limit to 4TB as metadata cannot record more than that. 1437 * (not needed for Linear and RAID0 as metadata doesn't 1438 * record this size) 1439 */ 1440 if ((u64)rdev->sectors >= (2ULL << 32) && sb->level >= 1) 1441 rdev->sectors = (sector_t)(2ULL << 32) - 2; 1442 1443 if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1) 1444 /* "this cannot possibly happen" ... 
*/ 1445 ret = -EINVAL; 1446 1447 abort: 1448 return ret; 1449} 1450 1451static u64 md_bitmap_events_cleared(struct mddev *mddev) 1452{ 1453 struct md_bitmap_stats stats; 1454 int err; 1455 1456 if (!md_bitmap_enabled(mddev, false)) 1457 return 0; 1458 1459 err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); 1460 if (err) 1461 return 0; 1462 1463 return stats.events_cleared; 1464} 1465 1466/* 1467 * validate_super for 0.90.0 1468 * note: we are not using "freshest" for 0.9 superblock 1469 */ 1470static int super_90_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev) 1471{ 1472 mdp_disk_t *desc; 1473 mdp_super_t *sb = page_address(rdev->sb_page); 1474 __u64 ev1 = md_event(sb); 1475 1476 rdev->raid_disk = -1; 1477 clear_bit(Faulty, &rdev->flags); 1478 clear_bit(In_sync, &rdev->flags); 1479 clear_bit(Bitmap_sync, &rdev->flags); 1480 clear_bit(WriteMostly, &rdev->flags); 1481 1482 if (mddev->raid_disks == 0) { 1483 mddev->major_version = 0; 1484 mddev->minor_version = sb->minor_version; 1485 mddev->patch_version = sb->patch_version; 1486 mddev->external = 0; 1487 mddev->chunk_sectors = sb->chunk_size >> 9; 1488 mddev->ctime = sb->ctime; 1489 mddev->utime = sb->utime; 1490 mddev->level = sb->level; 1491 mddev->clevel[0] = 0; 1492 mddev->layout = sb->layout; 1493 mddev->raid_disks = sb->raid_disks; 1494 mddev->dev_sectors = ((sector_t)sb->size) * 2; 1495 mddev->events = ev1; 1496 mddev->bitmap_info.offset = 0; 1497 mddev->bitmap_info.space = 0; 1498 /* bitmap can use 60 K after the 4K superblocks */ 1499 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; 1500 mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9); 1501 mddev->reshape_backwards = 0; 1502 1503 if (mddev->minor_version >= 91) { 1504 mddev->reshape_position = sb->reshape_position; 1505 mddev->delta_disks = sb->delta_disks; 1506 mddev->new_level = sb->new_level; 1507 mddev->new_layout = sb->new_layout; 1508 mddev->new_chunk_sectors = sb->new_chunk >> 9; 1509 if (mddev->delta_disks < 0) 1510 mddev->reshape_backwards = 1; 1511 } else { 1512 mddev->reshape_position = MaxSector; 1513 mddev->delta_disks = 0; 1514 mddev->new_level = mddev->level; 1515 mddev->new_layout = mddev->layout; 1516 mddev->new_chunk_sectors = mddev->chunk_sectors; 1517 } 1518 if (mddev->level == 0) 1519 mddev->layout = -1; 1520 1521 if (sb->state & (1<<MD_SB_CLEAN)) 1522 mddev->resync_offset = MaxSector; 1523 else { 1524 if (sb->events_hi == sb->cp_events_hi && 1525 sb->events_lo == sb->cp_events_lo) { 1526 mddev->resync_offset = sb->recovery_cp; 1527 } else 1528 mddev->resync_offset = 0; 1529 } 1530 1531 memcpy(mddev->uuid+0, &sb->set_uuid0, 4); 1532 memcpy(mddev->uuid+4, &sb->set_uuid1, 4); 1533 memcpy(mddev->uuid+8, &sb->set_uuid2, 4); 1534 memcpy(mddev->uuid+12,&sb->set_uuid3, 4); 1535 1536 mddev->max_disks = MD_SB_DISKS; 1537 1538 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && 1539 mddev->bitmap_info.file == NULL) { 1540 mddev->bitmap_info.offset = 1541 mddev->bitmap_info.default_offset; 1542 mddev->bitmap_info.space = 1543 mddev->bitmap_info.default_space; 1544 } 1545 1546 } else if (mddev->pers == NULL) { 1547 /* Insist on good event counter while assembling, except 1548 * for spares (which don't need an event count) */ 1549 ++ev1; 1550 if (sb->disks[rdev->desc_nr].state & ( 1551 (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE))) 1552 if (ev1 < mddev->events) 1553 return -EINVAL; 1554 } else if (mddev->bitmap) { 1555 /* if adding to array with a bitmap, then we can accept an 1556 * older device ... but not too old. 
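		 * "Too old" means an event count below the bitmap's
		 * events_cleared value: such a device cannot be caught up by
		 * a bitmap-based resync, so it is accepted but left with
		 * raid_disk == -1 (plain hot-add) instead of being marked
		 * Bitmap_sync.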
1557 */ 1558 if (ev1 < md_bitmap_events_cleared(mddev)) 1559 return 0; 1560 if (ev1 < mddev->events) 1561 set_bit(Bitmap_sync, &rdev->flags); 1562 } else { 1563 if (ev1 < mddev->events) 1564 /* just a hot-add of a new device, leave raid_disk at -1 */ 1565 return 0; 1566 } 1567 1568 desc = sb->disks + rdev->desc_nr; 1569 1570 if (desc->state & (1<<MD_DISK_FAULTY)) 1571 set_bit(Faulty, &rdev->flags); 1572 else if (desc->state & (1<<MD_DISK_SYNC)) { 1573 set_bit(In_sync, &rdev->flags); 1574 rdev->raid_disk = desc->raid_disk; 1575 rdev->saved_raid_disk = desc->raid_disk; 1576 } else if (desc->state & (1<<MD_DISK_ACTIVE)) { 1577 /* active but not in sync implies recovery up to 1578 * reshape position. We don't know exactly where 1579 * that is, so set to zero for now 1580 */ 1581 if (mddev->minor_version >= 91) { 1582 rdev->recovery_offset = 0; 1583 rdev->raid_disk = desc->raid_disk; 1584 } 1585 } 1586 if (desc->state & (1<<MD_DISK_WRITEMOSTLY)) 1587 set_bit(WriteMostly, &rdev->flags); 1588 if (desc->state & (1<<MD_DISK_FAILFAST)) 1589 set_bit(FailFast, &rdev->flags); 1590 return 0; 1591} 1592 1593/* 1594 * sync_super for 0.90.0 1595 */ 1596static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev) 1597{ 1598 mdp_super_t *sb; 1599 struct md_rdev *rdev2; 1600 int next_spare = mddev->raid_disks; 1601 1602 /* make rdev->sb match mddev data.. 1603 * 1604 * 1/ zero out disks 1605 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare); 1606 * 3/ any empty disks < next_spare become removed 1607 * 1608 * disks[0] gets initialised to REMOVED because 1609 * we cannot be sure from other fields if it has 1610 * been initialised or not. 1611 */ 1612 int i; 1613 int active=0, working=0,failed=0,spare=0,nr_disks=0; 1614 1615 rdev->sb_size = MD_SB_BYTES; 1616 1617 sb = page_address(rdev->sb_page); 1618 1619 memset(sb, 0, sizeof(*sb)); 1620 1621 sb->md_magic = MD_SB_MAGIC; 1622 sb->major_version = mddev->major_version; 1623 sb->patch_version = mddev->patch_version; 1624 sb->gvalid_words = 0; /* ignored */ 1625 memcpy(&sb->set_uuid0, mddev->uuid+0, 4); 1626 memcpy(&sb->set_uuid1, mddev->uuid+4, 4); 1627 memcpy(&sb->set_uuid2, mddev->uuid+8, 4); 1628 memcpy(&sb->set_uuid3, mddev->uuid+12,4); 1629 1630 sb->ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX); 1631 sb->level = mddev->level; 1632 sb->size = mddev->dev_sectors / 2; 1633 sb->raid_disks = mddev->raid_disks; 1634 sb->md_minor = mddev->md_minor; 1635 sb->not_persistent = 0; 1636 sb->utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX); 1637 sb->state = 0; 1638 sb->events_hi = (mddev->events>>32); 1639 sb->events_lo = (u32)mddev->events; 1640 1641 if (mddev->reshape_position == MaxSector) 1642 sb->minor_version = 90; 1643 else { 1644 sb->minor_version = 91; 1645 sb->reshape_position = mddev->reshape_position; 1646 sb->new_level = mddev->new_level; 1647 sb->delta_disks = mddev->delta_disks; 1648 sb->new_layout = mddev->new_layout; 1649 sb->new_chunk = mddev->new_chunk_sectors << 9; 1650 } 1651 mddev->minor_version = sb->minor_version; 1652 if (mddev->in_sync) 1653 { 1654 sb->recovery_cp = mddev->resync_offset; 1655 sb->cp_events_hi = (mddev->events>>32); 1656 sb->cp_events_lo = (u32)mddev->events; 1657 if (mddev->resync_offset == MaxSector) 1658 sb->state = (1<< MD_SB_CLEAN); 1659 } else 1660 sb->recovery_cp = 0; 1661 1662 sb->layout = mddev->layout; 1663 sb->chunk_size = mddev->chunk_sectors << 9; 1664 1665 if (mddev->bitmap && mddev->bitmap_info.file == NULL) 1666 sb->state |= (1<<MD_SB_BITMAP_PRESENT); 1667 1668 
sb->disks[0].state = (1<<MD_DISK_REMOVED); 1669 rdev_for_each(rdev2, mddev) { 1670 mdp_disk_t *d; 1671 int desc_nr; 1672 int is_active = test_bit(In_sync, &rdev2->flags); 1673 1674 if (rdev2->raid_disk >= 0 && 1675 sb->minor_version >= 91) 1676 /* we have nowhere to store the recovery_offset, 1677 * but if it is not below the reshape_position, 1678 * we can piggy-back on that. 1679 */ 1680 is_active = 1; 1681 if (rdev2->raid_disk < 0 || 1682 test_bit(Faulty, &rdev2->flags)) 1683 is_active = 0; 1684 if (is_active) 1685 desc_nr = rdev2->raid_disk; 1686 else 1687 desc_nr = next_spare++; 1688 rdev2->desc_nr = desc_nr; 1689 d = &sb->disks[rdev2->desc_nr]; 1690 nr_disks++; 1691 d->number = rdev2->desc_nr; 1692 d->major = MAJOR(rdev2->bdev->bd_dev); 1693 d->minor = MINOR(rdev2->bdev->bd_dev); 1694 if (is_active) 1695 d->raid_disk = rdev2->raid_disk; 1696 else 1697 d->raid_disk = rdev2->desc_nr; /* compatibility */ 1698 if (test_bit(Faulty, &rdev2->flags)) 1699 d->state = (1<<MD_DISK_FAULTY); 1700 else if (is_active) { 1701 d->state = (1<<MD_DISK_ACTIVE); 1702 if (test_bit(In_sync, &rdev2->flags)) 1703 d->state |= (1<<MD_DISK_SYNC); 1704 active++; 1705 working++; 1706 } else { 1707 d->state = 0; 1708 spare++; 1709 working++; 1710 } 1711 if (test_bit(WriteMostly, &rdev2->flags)) 1712 d->state |= (1<<MD_DISK_WRITEMOSTLY); 1713 if (test_bit(FailFast, &rdev2->flags)) 1714 d->state |= (1<<MD_DISK_FAILFAST); 1715 } 1716 /* now set the "removed" and "faulty" bits on any missing devices */ 1717 for (i=0 ; i < mddev->raid_disks ; i++) { 1718 mdp_disk_t *d = &sb->disks[i]; 1719 if (d->state == 0 && d->number == 0) { 1720 d->number = i; 1721 d->raid_disk = i; 1722 d->state = (1<<MD_DISK_REMOVED); 1723 d->state |= (1<<MD_DISK_FAULTY); 1724 failed++; 1725 } 1726 } 1727 sb->nr_disks = nr_disks; 1728 sb->active_disks = active; 1729 sb->working_disks = working; 1730 sb->failed_disks = failed; 1731 sb->spare_disks = spare; 1732 1733 sb->this_disk = sb->disks[rdev->desc_nr]; 1734 sb->sb_csum = calc_sb_csum(sb); 1735} 1736 1737/* 1738 * rdev_size_change for 0.90.0 1739 */ 1740static unsigned long long 1741super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) 1742{ 1743 if (num_sectors && num_sectors < rdev->mddev->dev_sectors) 1744 return 0; /* component must fit device */ 1745 if (rdev->mddev->bitmap_info.offset) 1746 return 0; /* can't move bitmap */ 1747 rdev->sb_start = calc_dev_sboffset(rdev); 1748 if (!num_sectors || num_sectors > rdev->sb_start) 1749 num_sectors = rdev->sb_start; 1750 /* Limit to 4TB as metadata cannot record more than that. 1751 * 4TB == 2^32 KB, or 2*2^32 sectors. 
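	 * Worked out: (2ULL << 32) == 2^33 512-byte sectors == 2^32 KB == 4TB,
	 * and the 0.90 superblock stores the size as a 32-bit count of 1K
	 * blocks, so the "- 2" below picks the largest value (one 1K block
	 * less) that still fits in that field.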
1752 */ 1753 if ((u64)num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1) 1754 num_sectors = (sector_t)(2ULL << 32) - 2; 1755 do { 1756 md_write_metadata(rdev->mddev, rdev, rdev->sb_start, 1757 rdev->sb_size, rdev->sb_page, 0); 1758 } while (md_super_wait(rdev->mddev) < 0); 1759 return num_sectors; 1760} 1761 1762static int 1763super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset) 1764{ 1765 /* non-zero offset changes not possible with v0.90 */ 1766 return new_offset == 0; 1767} 1768 1769/* 1770 * version 1 superblock 1771 */ 1772 1773static __le32 calc_sb_1_csum(struct mdp_superblock_1 *sb) 1774{ 1775 __le32 disk_csum; 1776 u32 csum; 1777 unsigned long long newcsum; 1778 int size = 256 + le32_to_cpu(sb->max_dev)*2; 1779 __le32 *isuper = (__le32*)sb; 1780 1781 disk_csum = sb->sb_csum; 1782 sb->sb_csum = 0; 1783 newcsum = 0; 1784 for (; size >= 4; size -= 4) 1785 newcsum += le32_to_cpu(*isuper++); 1786 1787 if (size == 2) 1788 newcsum += le16_to_cpu(*(__le16*) isuper); 1789 1790 csum = (newcsum & 0xffffffff) + (newcsum >> 32); 1791 sb->sb_csum = disk_csum; 1792 return cpu_to_le32(csum); 1793} 1794 1795static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version) 1796{ 1797 struct mdp_superblock_1 *sb; 1798 int ret; 1799 sector_t sb_start; 1800 sector_t sectors; 1801 int bmask; 1802 bool spare_disk = true; 1803 1804 /* 1805 * Calculate the position of the superblock in 512byte sectors. 1806 * It is always aligned to a 4K boundary and 1807 * depeding on minor_version, it can be: 1808 * 0: At least 8K, but less than 12K, from end of device 1809 * 1: At start of device 1810 * 2: 4K from start of device. 1811 */ 1812 switch(minor_version) { 1813 case 0: 1814 sb_start = bdev_nr_sectors(rdev->bdev) - 8 * 2; 1815 sb_start &= ~(sector_t)(4*2-1); 1816 break; 1817 case 1: 1818 sb_start = 0; 1819 break; 1820 case 2: 1821 sb_start = 8; 1822 break; 1823 default: 1824 return -EINVAL; 1825 } 1826 rdev->sb_start = sb_start; 1827 1828 /* superblock is rarely larger than 1K, but it can be larger, 1829 * and it is safe to read 4k, so we do that 1830 */ 1831 ret = read_disk_sb(rdev, 4096); 1832 if (ret) return ret; 1833 1834 sb = page_address(rdev->sb_page); 1835 1836 if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || 1837 sb->major_version != cpu_to_le32(1) || 1838 le32_to_cpu(sb->max_dev) > (4096-256)/2 || 1839 le64_to_cpu(sb->super_offset) != rdev->sb_start || 1840 (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0) 1841 return -EINVAL; 1842 1843 if (calc_sb_1_csum(sb) != sb->sb_csum) { 1844 pr_warn("md: invalid superblock checksum on %pg\n", 1845 rdev->bdev); 1846 return -EINVAL; 1847 } 1848 if (le64_to_cpu(sb->data_size) < 10) { 1849 pr_warn("md: data_size too small on %pg\n", 1850 rdev->bdev); 1851 return -EINVAL; 1852 } 1853 if (sb->pad0 || 1854 sb->pad3[0] || 1855 memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1]))) { 1856 pr_warn("Some padding is non-zero on %pg, might be a new feature\n", 1857 rdev->bdev); 1858 if (check_new_feature) 1859 return -EINVAL; 1860 pr_warn("check_new_feature is disabled, data corruption possible\n"); 1861 } 1862 1863 rdev->preferred_minor = 0xffff; 1864 rdev->data_offset = le64_to_cpu(sb->data_offset); 1865 rdev->new_data_offset = rdev->data_offset; 1866 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) && 1867 (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET)) 1868 rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset); 1869 atomic_set(&rdev->corrected_errors, 
le32_to_cpu(sb->cnt_corrected_read)); 1870 1871 rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; 1872 bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; 1873 if (rdev->sb_size & bmask) 1874 rdev->sb_size = (rdev->sb_size | bmask) + 1; 1875 1876 if (minor_version 1877 && rdev->data_offset < sb_start + (rdev->sb_size/512)) 1878 return -EINVAL; 1879 if (minor_version 1880 && rdev->new_data_offset < sb_start + (rdev->sb_size/512)) 1881 return -EINVAL; 1882 1883 rdev->desc_nr = le32_to_cpu(sb->dev_number); 1884 1885 if (!rdev->bb_page) { 1886 rdev->bb_page = alloc_page(GFP_KERNEL); 1887 if (!rdev->bb_page) 1888 return -ENOMEM; 1889 } 1890 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) && 1891 rdev->badblocks.count == 0) { 1892 /* need to load the bad block list. 1893 * Currently we limit it to one page. 1894 */ 1895 s32 offset; 1896 sector_t bb_sector; 1897 __le64 *bbp; 1898 int i; 1899 int sectors = le16_to_cpu(sb->bblog_size); 1900 if (sectors > (PAGE_SIZE / 512)) 1901 return -EINVAL; 1902 offset = le32_to_cpu(sb->bblog_offset); 1903 if (offset == 0) 1904 return -EINVAL; 1905 bb_sector = (long long)offset; 1906 if (!sync_page_io(rdev, bb_sector, sectors << 9, 1907 rdev->bb_page, REQ_OP_READ, true)) 1908 return -EIO; 1909 bbp = (__le64 *)page_address(rdev->bb_page); 1910 rdev->badblocks.shift = sb->bblog_shift; 1911 for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) { 1912 u64 bb = le64_to_cpu(*bbp); 1913 int count = bb & (0x3ff); 1914 u64 sector = bb >> 10; 1915 sector <<= sb->bblog_shift; 1916 count <<= sb->bblog_shift; 1917 if (bb + 1 == 0) 1918 break; 1919 if (!badblocks_set(&rdev->badblocks, sector, count, 1)) 1920 return -EINVAL; 1921 } 1922 } else if (sb->bblog_offset != 0) 1923 rdev->badblocks.shift = 0; 1924 1925 if ((le32_to_cpu(sb->feature_map) & 1926 (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS))) { 1927 rdev->ppl.offset = (__s16)le16_to_cpu(sb->ppl.offset); 1928 rdev->ppl.size = le16_to_cpu(sb->ppl.size); 1929 rdev->ppl.sector = rdev->sb_start + rdev->ppl.offset; 1930 } 1931 1932 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT) && 1933 sb->level != 0) 1934 return -EINVAL; 1935 1936 /* not spare disk */ 1937 if (rdev->desc_nr >= 0 && rdev->desc_nr < le32_to_cpu(sb->max_dev) && 1938 (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX || 1939 le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL)) 1940 spare_disk = false; 1941 1942 if (!refdev) { 1943 if (!spare_disk) 1944 ret = 1; 1945 else 1946 ret = 0; 1947 } else { 1948 __u64 ev1, ev2; 1949 struct mdp_superblock_1 *refsb = page_address(refdev->sb_page); 1950 1951 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 || 1952 sb->level != refsb->level || 1953 sb->layout != refsb->layout || 1954 sb->chunksize != refsb->chunksize) { 1955 pr_warn("md: %pg has strangely different superblock to %pg\n", 1956 rdev->bdev, 1957 refdev->bdev); 1958 return -EINVAL; 1959 } 1960 ev1 = le64_to_cpu(sb->events); 1961 ev2 = le64_to_cpu(refsb->events); 1962 1963 if (!spare_disk && ev1 > ev2) 1964 ret = 1; 1965 else 1966 ret = 0; 1967 } 1968 if (minor_version) 1969 sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset; 1970 else 1971 sectors = rdev->sb_start; 1972 if (sectors < le64_to_cpu(sb->data_size)) 1973 return -EINVAL; 1974 rdev->sectors = le64_to_cpu(sb->data_size); 1975 return ret; 1976} 1977 1978static int super_1_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev) 1979{ 1980 struct mdp_superblock_1 *sb = page_address(rdev->sb_page); 1981 __u64 ev1 = 
le64_to_cpu(sb->events); 1982 int role; 1983 1984 rdev->raid_disk = -1; 1985 clear_bit(Faulty, &rdev->flags); 1986 clear_bit(In_sync, &rdev->flags); 1987 clear_bit(Bitmap_sync, &rdev->flags); 1988 clear_bit(WriteMostly, &rdev->flags); 1989 1990 if (mddev->raid_disks == 0) { 1991 mddev->major_version = 1; 1992 mddev->patch_version = 0; 1993 mddev->external = 0; 1994 mddev->chunk_sectors = le32_to_cpu(sb->chunksize); 1995 mddev->ctime = le64_to_cpu(sb->ctime); 1996 mddev->utime = le64_to_cpu(sb->utime); 1997 mddev->level = le32_to_cpu(sb->level); 1998 mddev->clevel[0] = 0; 1999 mddev->layout = le32_to_cpu(sb->layout); 2000 mddev->raid_disks = le32_to_cpu(sb->raid_disks); 2001 mddev->dev_sectors = le64_to_cpu(sb->size); 2002 mddev->events = ev1; 2003 mddev->bitmap_info.offset = 0; 2004 mddev->bitmap_info.space = 0; 2005 /* Default location for bitmap is 1K after superblock 2006 * using 3K - total of 4K 2007 */ 2008 mddev->bitmap_info.default_offset = 1024 >> 9; 2009 mddev->bitmap_info.default_space = (4096-1024) >> 9; 2010 mddev->reshape_backwards = 0; 2011 2012 mddev->resync_offset = le64_to_cpu(sb->resync_offset); 2013 memcpy(mddev->uuid, sb->set_uuid, 16); 2014 2015 mddev->max_disks = (4096-256)/2; 2016 2017 if (!mddev->logical_block_size) 2018 mddev->logical_block_size = le32_to_cpu(sb->logical_block_size); 2019 2020 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && 2021 mddev->bitmap_info.file == NULL) { 2022 mddev->bitmap_info.offset = 2023 (__s32)le32_to_cpu(sb->bitmap_offset); 2024 /* Metadata doesn't record how much space is available. 2025 * For 1.0, we assume we can use up to the superblock 2026 * if before, else to 4K beyond superblock. 2027 * For others, assume no change is possible. 2028 */ 2029 if (mddev->minor_version > 0) 2030 mddev->bitmap_info.space = 0; 2031 else if (mddev->bitmap_info.offset > 0) 2032 mddev->bitmap_info.space = 2033 8 - mddev->bitmap_info.offset; 2034 else 2035 mddev->bitmap_info.space = 2036 -mddev->bitmap_info.offset; 2037 } 2038 2039 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { 2040 mddev->reshape_position = le64_to_cpu(sb->reshape_position); 2041 mddev->delta_disks = le32_to_cpu(sb->delta_disks); 2042 mddev->new_level = le32_to_cpu(sb->new_level); 2043 mddev->new_layout = le32_to_cpu(sb->new_layout); 2044 mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk); 2045 if (mddev->delta_disks < 0 || 2046 (mddev->delta_disks == 0 && 2047 (le32_to_cpu(sb->feature_map) 2048 & MD_FEATURE_RESHAPE_BACKWARDS))) 2049 mddev->reshape_backwards = 1; 2050 } else { 2051 mddev->reshape_position = MaxSector; 2052 mddev->delta_disks = 0; 2053 mddev->new_level = mddev->level; 2054 mddev->new_layout = mddev->layout; 2055 mddev->new_chunk_sectors = mddev->chunk_sectors; 2056 } 2057 2058 if (mddev->level == 0 && 2059 !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT)) 2060 mddev->layout = -1; 2061 2062 if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL) 2063 set_bit(MD_HAS_JOURNAL, &mddev->flags); 2064 2065 if (le32_to_cpu(sb->feature_map) & 2066 (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS)) { 2067 if (le32_to_cpu(sb->feature_map) & 2068 (MD_FEATURE_BITMAP_OFFSET | MD_FEATURE_JOURNAL)) 2069 return -EINVAL; 2070 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) && 2071 (le32_to_cpu(sb->feature_map) & 2072 MD_FEATURE_MULTIPLE_PPLS)) 2073 return -EINVAL; 2074 set_bit(MD_HAS_PPL, &mddev->flags); 2075 } 2076 } else if (mddev->pers == NULL) { 2077 /* Insist of good event counter while assembling, except for 2078 * spares (which don't 
need an event count). 2079 * Similar to mdadm, we allow event counter difference of 1 2080 * from the freshest device. 2081 */ 2082 if (rdev->desc_nr >= 0 && 2083 rdev->desc_nr < le32_to_cpu(sb->max_dev) && 2084 (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX || 2085 le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL)) 2086 if (ev1 + 1 < mddev->events) 2087 return -EINVAL; 2088 } else if (mddev->bitmap) { 2089 /* If adding to array with a bitmap, then we can accept an 2090 * older device, but not too old. 2091 */ 2092 if (ev1 < md_bitmap_events_cleared(mddev)) 2093 return 0; 2094 if (ev1 < mddev->events) 2095 set_bit(Bitmap_sync, &rdev->flags); 2096 } else { 2097 if (ev1 < mddev->events) 2098 /* just a hot-add of a new device, leave raid_disk at -1 */ 2099 return 0; 2100 } 2101 2102 if (rdev->desc_nr < 0 || 2103 rdev->desc_nr >= le32_to_cpu(sb->max_dev)) { 2104 role = MD_DISK_ROLE_SPARE; 2105 rdev->desc_nr = -1; 2106 } else if (mddev->pers == NULL && freshest && ev1 < mddev->events) { 2107 /* 2108 * If we are assembling, and our event counter is smaller than the 2109 * highest event counter, we cannot trust our superblock about the role. 2110 * It could happen that our rdev was marked as Faulty, and all other 2111 * superblocks were updated with +1 event counter. 2112 * Then, before the next superblock update, which typically happens when 2113 * remove_and_add_spares() removes the device from the array, there was 2114 * a crash or reboot. 2115 * If we allow current rdev without consulting the freshest superblock, 2116 * we could cause data corruption. 2117 * Note that in this case our event counter is smaller by 1 than the 2118 * highest, otherwise, this rdev would not be allowed into array; 2119 * both kernel and mdadm allow event counter difference of 1. 2120 */ 2121 struct mdp_superblock_1 *freshest_sb = page_address(freshest->sb_page); 2122 u32 freshest_max_dev = le32_to_cpu(freshest_sb->max_dev); 2123 2124 if (rdev->desc_nr >= freshest_max_dev) { 2125 /* this is unexpected, better not proceed */ 2126 pr_warn("md: %s: rdev[%pg]: desc_nr(%d) >= freshest(%pg)->sb->max_dev(%u)\n", 2127 mdname(mddev), rdev->bdev, rdev->desc_nr, 2128 freshest->bdev, freshest_max_dev); 2129 return -EUCLEAN; 2130 } 2131 2132 role = le16_to_cpu(freshest_sb->dev_roles[rdev->desc_nr]); 2133 pr_debug("md: %s: rdev[%pg]: role=%d(0x%x) according to freshest %pg\n", 2134 mdname(mddev), rdev->bdev, role, role, freshest->bdev); 2135 } else { 2136 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); 2137 } 2138 switch (role) { 2139 case MD_DISK_ROLE_SPARE: /* spare */ 2140 break; 2141 case MD_DISK_ROLE_FAULTY: /* faulty */ 2142 set_bit(Faulty, &rdev->flags); 2143 break; 2144 case MD_DISK_ROLE_JOURNAL: /* journal device */ 2145 if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) { 2146 /* journal device without journal feature */ 2147 pr_warn("md: journal device provided without journal feature, ignoring the device\n"); 2148 return -EINVAL; 2149 } 2150 set_bit(Journal, &rdev->flags); 2151 rdev->journal_tail = le64_to_cpu(sb->journal_tail); 2152 rdev->raid_disk = 0; 2153 break; 2154 default: 2155 rdev->saved_raid_disk = role; 2156 if ((le32_to_cpu(sb->feature_map) & 2157 MD_FEATURE_RECOVERY_OFFSET)) { 2158 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); 2159 if (!(le32_to_cpu(sb->feature_map) & 2160 MD_FEATURE_RECOVERY_BITMAP)) 2161 rdev->saved_raid_disk = -1; 2162 } else { 2163 /* 2164 * If the array is FROZEN, then the device can't 2165 * be in_sync with rest of array. 
2166 */ 2167 if (!test_bit(MD_RECOVERY_FROZEN, 2168 &mddev->recovery)) 2169 set_bit(In_sync, &rdev->flags); 2170 } 2171 rdev->raid_disk = role; 2172 break; 2173 } 2174 if (sb->devflags & WriteMostly1) 2175 set_bit(WriteMostly, &rdev->flags); 2176 if (sb->devflags & FailFast1) 2177 set_bit(FailFast, &rdev->flags); 2178 if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT) 2179 set_bit(Replacement, &rdev->flags); 2180 2181 return 0; 2182} 2183 2184static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) 2185{ 2186 struct mdp_superblock_1 *sb; 2187 struct md_rdev *rdev2; 2188 int max_dev, i; 2189 /* make rdev->sb match mddev and rdev data. */ 2190 2191 sb = page_address(rdev->sb_page); 2192 2193 sb->feature_map = 0; 2194 sb->pad0 = 0; 2195 sb->recovery_offset = cpu_to_le64(0); 2196 memset(sb->pad3, 0, sizeof(sb->pad3)); 2197 2198 sb->utime = cpu_to_le64((__u64)mddev->utime); 2199 sb->events = cpu_to_le64(mddev->events); 2200 if (mddev->in_sync) 2201 sb->resync_offset = cpu_to_le64(mddev->resync_offset); 2202 else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags)) 2203 sb->resync_offset = cpu_to_le64(MaxSector); 2204 else 2205 sb->resync_offset = cpu_to_le64(0); 2206 2207 sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors)); 2208 2209 sb->raid_disks = cpu_to_le32(mddev->raid_disks); 2210 sb->size = cpu_to_le64(mddev->dev_sectors); 2211 sb->chunksize = cpu_to_le32(mddev->chunk_sectors); 2212 sb->level = cpu_to_le32(mddev->level); 2213 sb->layout = cpu_to_le32(mddev->layout); 2214 sb->logical_block_size = cpu_to_le32(mddev->logical_block_size); 2215 if (test_bit(FailFast, &rdev->flags)) 2216 sb->devflags |= FailFast1; 2217 else 2218 sb->devflags &= ~FailFast1; 2219 2220 if (test_bit(WriteMostly, &rdev->flags)) 2221 sb->devflags |= WriteMostly1; 2222 else 2223 sb->devflags &= ~WriteMostly1; 2224 sb->data_offset = cpu_to_le64(rdev->data_offset); 2225 sb->data_size = cpu_to_le64(rdev->sectors); 2226 2227 if (mddev->bitmap && mddev->bitmap_info.file == NULL) { 2228 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset); 2229 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); 2230 } 2231 2232 if (rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) && 2233 !test_bit(In_sync, &rdev->flags)) { 2234 sb->feature_map |= 2235 cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); 2236 sb->recovery_offset = 2237 cpu_to_le64(rdev->recovery_offset); 2238 if (rdev->saved_raid_disk >= 0 && mddev->bitmap) 2239 sb->feature_map |= 2240 cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP); 2241 } 2242 /* Note: recovery_offset and journal_tail share space */ 2243 if (test_bit(Journal, &rdev->flags)) 2244 sb->journal_tail = cpu_to_le64(rdev->journal_tail); 2245 if (test_bit(Replacement, &rdev->flags)) 2246 sb->feature_map |= 2247 cpu_to_le32(MD_FEATURE_REPLACEMENT); 2248 2249 if (mddev->reshape_position != MaxSector) { 2250 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); 2251 sb->reshape_position = cpu_to_le64(mddev->reshape_position); 2252 sb->new_layout = cpu_to_le32(mddev->new_layout); 2253 sb->delta_disks = cpu_to_le32(mddev->delta_disks); 2254 sb->new_level = cpu_to_le32(mddev->new_level); 2255 sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors); 2256 if (mddev->delta_disks == 0 && 2257 mddev->reshape_backwards) 2258 sb->feature_map 2259 |= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS); 2260 if (rdev->new_data_offset != rdev->data_offset) { 2261 sb->feature_map 2262 |= cpu_to_le32(MD_FEATURE_NEW_OFFSET); 2263 sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset 2264 - 
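	/*
	 * For reference, the dev_roles[] values tested in super_1_validate()
	 * above come from md_p.h: entries at or above MD_DISK_ROLE_MAX
	 * (0xff00) are markers rather than slot numbers -- MD_DISK_ROLE_SPARE
	 * is 0xffff, MD_DISK_ROLE_FAULTY is 0xfffe and MD_DISK_ROLE_JOURNAL
	 * is 0xfffd; anything below MD_DISK_ROLE_MAX is the device's raid
	 * slot.
	 */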
rdev->data_offset)); 2265 } 2266 } 2267 2268 if (mddev_is_clustered(mddev)) 2269 sb->feature_map |= cpu_to_le32(MD_FEATURE_CLUSTERED); 2270 2271 if (rdev->badblocks.count == 0) 2272 /* Nothing to do for bad blocks*/ ; 2273 else if (sb->bblog_offset == 0) 2274 /* Cannot record bad blocks on this device */ 2275 md_error(mddev, rdev); 2276 else { 2277 struct badblocks *bb = &rdev->badblocks; 2278 __le64 *bbp = (__le64 *)page_address(rdev->bb_page); 2279 u64 *p = bb->page; 2280 sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS); 2281 if (bb->changed) { 2282 unsigned seq; 2283 2284retry: 2285 seq = read_seqbegin(&bb->lock); 2286 2287 memset(bbp, 0xff, PAGE_SIZE); 2288 2289 for (i = 0 ; i < bb->count ; i++) { 2290 u64 internal_bb = p[i]; 2291 u64 store_bb = ((BB_OFFSET(internal_bb) << 10) 2292 | BB_LEN(internal_bb)); 2293 bbp[i] = cpu_to_le64(store_bb); 2294 } 2295 bb->changed = 0; 2296 if (read_seqretry(&bb->lock, seq)) 2297 goto retry; 2298 2299 bb->sector = (rdev->sb_start + 2300 (int)le32_to_cpu(sb->bblog_offset)); 2301 bb->size = le16_to_cpu(sb->bblog_size); 2302 } 2303 } 2304 2305 max_dev = 0; 2306 rdev_for_each(rdev2, mddev) 2307 if (rdev2->desc_nr+1 > max_dev) 2308 max_dev = rdev2->desc_nr+1; 2309 2310 if (max_dev > le32_to_cpu(sb->max_dev)) { 2311 int bmask; 2312 sb->max_dev = cpu_to_le32(max_dev); 2313 rdev->sb_size = max_dev * 2 + 256; 2314 bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; 2315 if (rdev->sb_size & bmask) 2316 rdev->sb_size = (rdev->sb_size | bmask) + 1; 2317 } else 2318 max_dev = le32_to_cpu(sb->max_dev); 2319 2320 for (i=0; i<max_dev;i++) 2321 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE); 2322 2323 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) 2324 sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL); 2325 2326 if (test_bit(MD_HAS_PPL, &mddev->flags)) { 2327 if (test_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags)) 2328 sb->feature_map |= 2329 cpu_to_le32(MD_FEATURE_MULTIPLE_PPLS); 2330 else 2331 sb->feature_map |= cpu_to_le32(MD_FEATURE_PPL); 2332 sb->ppl.offset = cpu_to_le16(rdev->ppl.offset); 2333 sb->ppl.size = cpu_to_le16(rdev->ppl.size); 2334 } 2335 2336 rdev_for_each(rdev2, mddev) { 2337 i = rdev2->desc_nr; 2338 if (test_bit(Faulty, &rdev2->flags)) 2339 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY); 2340 else if (test_bit(In_sync, &rdev2->flags)) 2341 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 2342 else if (test_bit(Journal, &rdev2->flags)) 2343 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_JOURNAL); 2344 else if (rdev2->raid_disk >= 0) 2345 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 2346 else 2347 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE); 2348 } 2349 2350 sb->sb_csum = calc_sb_1_csum(sb); 2351} 2352 2353static sector_t super_1_choose_bm_space(sector_t dev_size) 2354{ 2355 sector_t bm_space; 2356 2357 /* if the device is bigger than 8Gig, save 64k for bitmap 2358 * usage, if bigger than 200Gig, save 128k 2359 */ 2360 if (dev_size < 64*2) 2361 bm_space = 0; 2362 else if (dev_size - 64*2 >= 200*1024*1024*2) 2363 bm_space = 128*2; 2364 else if (dev_size - 4*2 > 8*1024*1024*2) 2365 bm_space = 64*2; 2366 else 2367 bm_space = 4*2; 2368 return bm_space; 2369} 2370 2371static unsigned long long 2372super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) 2373{ 2374 struct mdp_superblock_1 *sb; 2375 sector_t max_sectors; 2376 if (num_sectors && num_sectors < rdev->mddev->dev_sectors) 2377 return 0; /* component must fit device */ 2378 if (rdev->data_offset != rdev->new_data_offset) 2379 return 0; /* too confusing */ 
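	/*
	 * Rough layout sketch for the superblock-at-end case handled below,
	 * derived from super_1_choose_bm_space() above (sizes approximate):
	 *
	 *   device size        bitmap reservation
	 *   < 64 KiB           none
	 *   up to ~8 GiB       4 KiB   (8 sectors)
	 *   up to ~200 GiB     64 KiB  (128 sectors)
	 *   >= ~200 GiB        128 KiB (256 sectors)
	 *
	 * The superblock occupies the last 8 KiB rounded down to a 4 KiB
	 * boundary, and a further 4 KiB is kept for the bad-block log, so
	 * the usable data size works out to sb_start - bm_space - 8 sectors.
	 */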
2380 if (rdev->sb_start < rdev->data_offset) { 2381 /* minor versions 1 and 2; superblock before data */ 2382 max_sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset; 2383 if (!num_sectors || num_sectors > max_sectors) 2384 num_sectors = max_sectors; 2385 } else if (rdev->mddev->bitmap_info.offset) { 2386 /* minor version 0 with bitmap we can't move */ 2387 return 0; 2388 } else { 2389 /* minor version 0; superblock after data */ 2390 sector_t sb_start, bm_space; 2391 sector_t dev_size = bdev_nr_sectors(rdev->bdev); 2392 2393 /* 8K is for superblock */ 2394 sb_start = dev_size - 8*2; 2395 sb_start &= ~(sector_t)(4*2 - 1); 2396 2397 bm_space = super_1_choose_bm_space(dev_size); 2398 2399 /* Space that can be used to store date needs to decrease 2400 * superblock bitmap space and bad block space(4K) 2401 */ 2402 max_sectors = sb_start - bm_space - 4*2; 2403 2404 if (!num_sectors || num_sectors > max_sectors) 2405 num_sectors = max_sectors; 2406 rdev->sb_start = sb_start; 2407 } 2408 sb = page_address(rdev->sb_page); 2409 sb->data_size = cpu_to_le64(num_sectors); 2410 sb->super_offset = cpu_to_le64(rdev->sb_start); 2411 sb->sb_csum = calc_sb_1_csum(sb); 2412 do { 2413 md_write_metadata(rdev->mddev, rdev, rdev->sb_start, 2414 rdev->sb_size, rdev->sb_page, 0); 2415 } while (md_super_wait(rdev->mddev) < 0); 2416 return num_sectors; 2417 2418} 2419 2420static int 2421super_1_allow_new_offset(struct md_rdev *rdev, 2422 unsigned long long new_offset) 2423{ 2424 struct mddev *mddev = rdev->mddev; 2425 2426 /* All necessary checks on new >= old have been done */ 2427 if (new_offset >= rdev->data_offset) 2428 return 1; 2429 2430 /* with 1.0 metadata, there is no metadata to tread on 2431 * so we can always move back */ 2432 if (mddev->minor_version == 0) 2433 return 1; 2434 2435 /* otherwise we must be sure not to step on 2436 * any metadata, so stay: 2437 * 36K beyond start of superblock 2438 * beyond end of badblocks 2439 * beyond write-intent bitmap 2440 */ 2441 if (rdev->sb_start + (32+4)*2 > new_offset) 2442 return 0; 2443 2444 if (md_bitmap_registered(mddev) && !mddev->bitmap_info.file) { 2445 struct md_bitmap_stats stats; 2446 int err; 2447 2448 err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); 2449 if (!err && rdev->sb_start + mddev->bitmap_info.offset + 2450 stats.file_pages * (PAGE_SIZE >> 9) > new_offset) 2451 return 0; 2452 } 2453 2454 if (rdev->badblocks.sector + rdev->badblocks.size > new_offset) 2455 return 0; 2456 2457 return 1; 2458} 2459 2460static struct super_type super_types[] = { 2461 [0] = { 2462 .name = "0.90.0", 2463 .owner = THIS_MODULE, 2464 .load_super = super_90_load, 2465 .validate_super = super_90_validate, 2466 .sync_super = super_90_sync, 2467 .rdev_size_change = super_90_rdev_size_change, 2468 .allow_new_offset = super_90_allow_new_offset, 2469 }, 2470 [1] = { 2471 .name = "md-1", 2472 .owner = THIS_MODULE, 2473 .load_super = super_1_load, 2474 .validate_super = super_1_validate, 2475 .sync_super = super_1_sync, 2476 .rdev_size_change = super_1_rdev_size_change, 2477 .allow_new_offset = super_1_allow_new_offset, 2478 }, 2479}; 2480 2481static void sync_super(struct mddev *mddev, struct md_rdev *rdev) 2482{ 2483 if (mddev->sync_super) { 2484 mddev->sync_super(mddev, rdev); 2485 return; 2486 } 2487 2488 BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types)); 2489 2490 super_types[mddev->major_version].sync_super(mddev, rdev); 2491} 2492 2493static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2) 2494{ 2495 struct md_rdev *rdev, *rdev2; 
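	/*
	 * (Worked example for super_1_allow_new_offset() above, assuming 1.2
	 * metadata where sb_start == 8: data_offset may only shrink down to
	 * sb_start + 72 sectors, i.e. 36 KiB past the superblock, and the
	 * new offset must also clear the write-intent bitmap pages and the
	 * recorded bad-block log, which is why the bitmap statistics and
	 * rdev->badblocks.{sector,size} are consulted.)
	 */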
2496 2497 rcu_read_lock(); 2498 rdev_for_each_rcu(rdev, mddev1) { 2499 if (test_bit(Faulty, &rdev->flags) || 2500 test_bit(Journal, &rdev->flags) || 2501 rdev->raid_disk == -1) 2502 continue; 2503 rdev_for_each_rcu(rdev2, mddev2) { 2504 if (test_bit(Faulty, &rdev2->flags) || 2505 test_bit(Journal, &rdev2->flags) || 2506 rdev2->raid_disk == -1) 2507 continue; 2508 if (rdev->bdev->bd_disk == rdev2->bdev->bd_disk) { 2509 rcu_read_unlock(); 2510 return 1; 2511 } 2512 } 2513 } 2514 rcu_read_unlock(); 2515 return 0; 2516} 2517 2518static LIST_HEAD(pending_raid_disks); 2519 2520/* 2521 * Try to register data integrity profile for an mddev 2522 * 2523 * This is called when an array is started and after a disk has been kicked 2524 * from the array. It only succeeds if all working and active component devices 2525 * are integrity capable with matching profiles. 2526 */ 2527int md_integrity_register(struct mddev *mddev) 2528{ 2529 if (list_empty(&mddev->disks)) 2530 return 0; /* nothing to do */ 2531 if (mddev_is_dm(mddev) || !blk_get_integrity(mddev->gendisk)) 2532 return 0; /* shouldn't register */ 2533 2534 pr_debug("md: data integrity enabled on %s\n", mdname(mddev)); 2535 return 0; 2536} 2537EXPORT_SYMBOL(md_integrity_register); 2538 2539static bool rdev_read_only(struct md_rdev *rdev) 2540{ 2541 return bdev_read_only(rdev->bdev) || 2542 (rdev->meta_bdev && bdev_read_only(rdev->meta_bdev)); 2543} 2544 2545static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) 2546{ 2547 char b[BDEVNAME_SIZE]; 2548 int err; 2549 2550 /* prevent duplicates */ 2551 if (find_rdev(mddev, rdev->bdev->bd_dev)) 2552 return -EEXIST; 2553 2554 if (rdev_read_only(rdev) && mddev->pers) 2555 return -EROFS; 2556 2557 /* make sure rdev->sectors exceeds mddev->dev_sectors */ 2558 if (!test_bit(Journal, &rdev->flags) && 2559 rdev->sectors && 2560 (mddev->dev_sectors == 0 || rdev->sectors < mddev->dev_sectors)) { 2561 if (mddev->pers) { 2562 /* Cannot change size, so fail 2563 * If mddev->level <= 0, then we don't care 2564 * about aligning sizes (e.g. linear) 2565 */ 2566 if (mddev->level > 0) 2567 return -ENOSPC; 2568 } else 2569 mddev->dev_sectors = rdev->sectors; 2570 } 2571 2572 /* Verify rdev->desc_nr is unique. 
2573 * If it is -1, assign a free number, else 2574 * check number is not in use 2575 */ 2576 rcu_read_lock(); 2577 if (rdev->desc_nr < 0) { 2578 int choice = 0; 2579 if (mddev->pers) 2580 choice = mddev->raid_disks; 2581 while (md_find_rdev_nr_rcu(mddev, choice)) 2582 choice++; 2583 rdev->desc_nr = choice; 2584 } else { 2585 if (md_find_rdev_nr_rcu(mddev, rdev->desc_nr)) { 2586 rcu_read_unlock(); 2587 return -EBUSY; 2588 } 2589 } 2590 rcu_read_unlock(); 2591 if (!test_bit(Journal, &rdev->flags) && 2592 mddev->max_disks && rdev->desc_nr >= mddev->max_disks) { 2593 pr_warn("md: %s: array is limited to %d devices\n", 2594 mdname(mddev), mddev->max_disks); 2595 return -EBUSY; 2596 } 2597 snprintf(b, sizeof(b), "%pg", rdev->bdev); 2598 strreplace(b, '/', '!'); 2599 2600 rdev->mddev = mddev; 2601 pr_debug("md: bind<%s>\n", b); 2602 2603 if (mddev->raid_disks) 2604 mddev_create_serial_pool(mddev, rdev); 2605 2606 if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b))) 2607 goto fail; 2608 2609 /* failure here is OK */ 2610 err = sysfs_create_link(&rdev->kobj, bdev_kobj(rdev->bdev), "block"); 2611 rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state"); 2612 rdev->sysfs_unack_badblocks = 2613 sysfs_get_dirent_safe(rdev->kobj.sd, "unacknowledged_bad_blocks"); 2614 rdev->sysfs_badblocks = 2615 sysfs_get_dirent_safe(rdev->kobj.sd, "bad_blocks"); 2616 2617 list_add_rcu(&rdev->same_set, &mddev->disks); 2618 bd_link_disk_holder(rdev->bdev, mddev->gendisk); 2619 2620 /* May as well allow recovery to be retried once */ 2621 mddev->recovery_disabled++; 2622 2623 return 0; 2624 2625 fail: 2626 pr_warn("md: failed to register dev-%s for %s\n", 2627 b, mdname(mddev)); 2628 mddev_destroy_serial_pool(mddev, rdev); 2629 return err; 2630} 2631 2632void md_autodetect_dev(dev_t dev); 2633 2634/* just for claiming the bdev */ 2635static struct md_rdev claim_rdev; 2636 2637static void export_rdev(struct md_rdev *rdev, struct mddev *mddev) 2638{ 2639 pr_debug("md: export_rdev(%pg)\n", rdev->bdev); 2640 md_rdev_clear(rdev); 2641#ifndef MODULE 2642 if (test_bit(AutoDetected, &rdev->flags)) 2643 md_autodetect_dev(rdev->bdev->bd_dev); 2644#endif 2645 fput(rdev->bdev_file); 2646 rdev->bdev = NULL; 2647 kobject_put(&rdev->kobj); 2648} 2649 2650static void md_kick_rdev_from_array(struct md_rdev *rdev) 2651{ 2652 struct mddev *mddev = rdev->mddev; 2653 2654 bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); 2655 list_del_rcu(&rdev->same_set); 2656 pr_debug("md: unbind<%pg>\n", rdev->bdev); 2657 mddev_destroy_serial_pool(rdev->mddev, rdev); 2658 WRITE_ONCE(rdev->mddev, NULL); 2659 sysfs_remove_link(&rdev->kobj, "block"); 2660 sysfs_put(rdev->sysfs_state); 2661 sysfs_put(rdev->sysfs_unack_badblocks); 2662 sysfs_put(rdev->sysfs_badblocks); 2663 rdev->sysfs_state = NULL; 2664 rdev->sysfs_unack_badblocks = NULL; 2665 rdev->sysfs_badblocks = NULL; 2666 rdev->badblocks.count = 0; 2667 2668 synchronize_rcu(); 2669 2670 /* 2671 * kobject_del() will wait for all in progress writers to be done, where 2672 * reconfig_mutex is held, hence it can't be called under 2673 * reconfig_mutex and it's delayed to mddev_unlock(). 
2674 */ 2675 list_add(&rdev->same_set, &mddev->deleting); 2676} 2677 2678static void export_array(struct mddev *mddev) 2679{ 2680 struct md_rdev *rdev; 2681 2682 while (!list_empty(&mddev->disks)) { 2683 rdev = list_first_entry(&mddev->disks, struct md_rdev, 2684 same_set); 2685 md_kick_rdev_from_array(rdev); 2686 } 2687 mddev->raid_disks = 0; 2688 mddev->major_version = 0; 2689} 2690 2691static bool set_in_sync(struct mddev *mddev) 2692{ 2693 lockdep_assert_held(&mddev->lock); 2694 if (!mddev->in_sync) { 2695 mddev->sync_checkers++; 2696 spin_unlock(&mddev->lock); 2697 percpu_ref_switch_to_atomic_sync(&mddev->writes_pending); 2698 spin_lock(&mddev->lock); 2699 if (!mddev->in_sync && 2700 percpu_ref_is_zero(&mddev->writes_pending)) { 2701 mddev->in_sync = 1; 2702 /* 2703 * Ensure ->in_sync is visible before we clear 2704 * ->sync_checkers. 2705 */ 2706 smp_mb(); 2707 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 2708 sysfs_notify_dirent_safe(mddev->sysfs_state); 2709 } 2710 if (--mddev->sync_checkers == 0) 2711 percpu_ref_switch_to_percpu(&mddev->writes_pending); 2712 } 2713 if (mddev->safemode == 1) 2714 mddev->safemode = 0; 2715 return mddev->in_sync; 2716} 2717 2718static void sync_sbs(struct mddev *mddev, int nospares) 2719{ 2720 /* Update each superblock (in-memory image), but 2721 * if we are allowed to, skip spares which already 2722 * have the right event counter, or have one earlier 2723 * (which would mean they aren't being marked as dirty 2724 * with the rest of the array) 2725 */ 2726 struct md_rdev *rdev; 2727 rdev_for_each(rdev, mddev) { 2728 if (rdev->sb_events == mddev->events || 2729 (nospares && 2730 rdev->raid_disk < 0 && 2731 rdev->sb_events+1 == mddev->events)) { 2732 /* Don't update this superblock */ 2733 rdev->sb_loaded = 2; 2734 } else { 2735 sync_super(mddev, rdev); 2736 rdev->sb_loaded = 1; 2737 } 2738 } 2739} 2740 2741static bool does_sb_need_changing(struct mddev *mddev) 2742{ 2743 struct md_rdev *rdev = NULL, *iter; 2744 struct mdp_superblock_1 *sb; 2745 int role; 2746 2747 /* Find a good rdev */ 2748 rdev_for_each(iter, mddev) 2749 if ((iter->raid_disk >= 0) && !test_bit(Faulty, &iter->flags)) { 2750 rdev = iter; 2751 break; 2752 } 2753 2754 /* No good device found. */ 2755 if (!rdev) 2756 return false; 2757 2758 sb = page_address(rdev->sb_page); 2759 /* Check if a device has become faulty or a spare become active */ 2760 rdev_for_each(rdev, mddev) { 2761 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); 2762 /* Device activated? */ 2763 if (role == MD_DISK_ROLE_SPARE && rdev->raid_disk >= 0 && 2764 !test_bit(Faulty, &rdev->flags)) 2765 return true; 2766 /* Device turned faulty? 
*/ 2767 if (test_bit(Faulty, &rdev->flags) && (role < MD_DISK_ROLE_MAX)) 2768 return true; 2769 } 2770 2771 /* Check if any mddev parameters have changed */ 2772 if ((mddev->dev_sectors != le64_to_cpu(sb->size)) || 2773 (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) || 2774 (mddev->layout != le32_to_cpu(sb->layout)) || 2775 (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) || 2776 (mddev->chunk_sectors != le32_to_cpu(sb->chunksize))) 2777 return true; 2778 2779 return false; 2780} 2781 2782void md_update_sb(struct mddev *mddev, int force_change) 2783{ 2784 struct md_rdev *rdev; 2785 int sync_req; 2786 int nospares = 0; 2787 int any_badblocks_changed = 0; 2788 int ret = -1; 2789 2790 if (!md_is_rdwr(mddev)) { 2791 if (force_change) 2792 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 2793 pr_err("%s: can't update sb for read-only array %s\n", __func__, mdname(mddev)); 2794 return; 2795 } 2796 2797repeat: 2798 if (mddev_is_clustered(mddev)) { 2799 if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) 2800 force_change = 1; 2801 if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags)) 2802 nospares = 1; 2803 ret = mddev->cluster_ops->metadata_update_start(mddev); 2804 /* Has someone else has updated the sb */ 2805 if (!does_sb_need_changing(mddev)) { 2806 if (ret == 0) 2807 mddev->cluster_ops->metadata_update_cancel(mddev); 2808 bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING), 2809 BIT(MD_SB_CHANGE_DEVS) | 2810 BIT(MD_SB_CHANGE_CLEAN)); 2811 return; 2812 } 2813 } 2814 2815 /* 2816 * First make sure individual recovery_offsets are correct 2817 * curr_resync_completed can only be used during recovery. 2818 * During reshape/resync it might use array-addresses rather 2819 * that device addresses. 2820 */ 2821 rdev_for_each(rdev, mddev) { 2822 if (rdev->raid_disk >= 0 && 2823 mddev->delta_disks >= 0 && 2824 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && 2825 test_bit(MD_RECOVERY_RECOVER, &mddev->recovery) && 2826 !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 2827 !test_bit(Journal, &rdev->flags) && 2828 !test_bit(In_sync, &rdev->flags) && 2829 mddev->curr_resync_completed > rdev->recovery_offset) 2830 rdev->recovery_offset = mddev->curr_resync_completed; 2831 2832 } 2833 if (!mddev->persistent) { 2834 clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 2835 clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 2836 if (!mddev->external) { 2837 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 2838 rdev_for_each(rdev, mddev) { 2839 if (rdev->badblocks.changed) { 2840 rdev->badblocks.changed = 0; 2841 ack_all_badblocks(&rdev->badblocks); 2842 md_error(mddev, rdev); 2843 } 2844 clear_bit(Blocked, &rdev->flags); 2845 clear_bit(BlockedBadBlocks, &rdev->flags); 2846 wake_up(&rdev->blocked_wait); 2847 } 2848 } 2849 wake_up(&mddev->sb_wait); 2850 return; 2851 } 2852 2853 spin_lock(&mddev->lock); 2854 2855 mddev->utime = ktime_get_real_seconds(); 2856 2857 if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) 2858 force_change = 1; 2859 if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags)) 2860 /* just a clean<-> dirty transition, possibly leave spares alone, 2861 * though if events isn't the right even/odd, we will have to do 2862 * spares after all 2863 */ 2864 nospares = 1; 2865 if (force_change) 2866 nospares = 0; 2867 if (mddev->degraded) 2868 /* If the array is degraded, then skipping spares is both 2869 * dangerous and fairly pointless. 
2870 * Dangerous because a device that was removed from the array 2871 * might have a event_count that still looks up-to-date, 2872 * so it can be re-added without a resync. 2873 * Pointless because if there are any spares to skip, 2874 * then a recovery will happen and soon that array won't 2875 * be degraded any more and the spare can go back to sleep then. 2876 */ 2877 nospares = 0; 2878 2879 sync_req = mddev->in_sync; 2880 2881 /* If this is just a dirty<->clean transition, and the array is clean 2882 * and 'events' is odd, we can roll back to the previous clean state */ 2883 if (nospares 2884 && (mddev->in_sync && mddev->resync_offset == MaxSector) 2885 && mddev->can_decrease_events 2886 && mddev->events != 1) { 2887 mddev->events--; 2888 mddev->can_decrease_events = 0; 2889 } else { 2890 /* otherwise we have to go forward and ... */ 2891 mddev->events ++; 2892 mddev->can_decrease_events = nospares; 2893 } 2894 2895 /* 2896 * This 64-bit counter should never wrap. 2897 * Either we are in around ~1 trillion A.C., assuming 2898 * 1 reboot per second, or we have a bug... 2899 */ 2900 WARN_ON(mddev->events == 0); 2901 2902 rdev_for_each(rdev, mddev) { 2903 if (rdev->badblocks.changed) 2904 any_badblocks_changed++; 2905 if (test_bit(Faulty, &rdev->flags)) 2906 set_bit(FaultRecorded, &rdev->flags); 2907 } 2908 2909 sync_sbs(mddev, nospares); 2910 spin_unlock(&mddev->lock); 2911 2912 pr_debug("md: updating %s RAID superblock on device (in sync %d)\n", 2913 mdname(mddev), mddev->in_sync); 2914 2915 mddev_add_trace_msg(mddev, "md md_update_sb"); 2916rewrite: 2917 if (md_bitmap_enabled(mddev, false)) 2918 mddev->bitmap_ops->update_sb(mddev->bitmap); 2919 rdev_for_each(rdev, mddev) { 2920 if (rdev->sb_loaded != 1) 2921 continue; /* no noise on spare devices */ 2922 2923 if (!test_bit(Faulty, &rdev->flags)) { 2924 md_write_metadata(mddev, rdev, rdev->sb_start, 2925 rdev->sb_size, rdev->sb_page, 0); 2926 pr_debug("md: (write) %pg's sb offset: %llu\n", 2927 rdev->bdev, 2928 (unsigned long long)rdev->sb_start); 2929 rdev->sb_events = mddev->events; 2930 if (rdev->badblocks.size) { 2931 md_write_metadata(mddev, rdev, 2932 rdev->badblocks.sector, 2933 rdev->badblocks.size << 9, 2934 rdev->bb_page, 0); 2935 rdev->badblocks.size = 0; 2936 } 2937 2938 } else 2939 pr_debug("md: %pg (skipping faulty)\n", 2940 rdev->bdev); 2941 } 2942 if (md_super_wait(mddev) < 0) 2943 goto rewrite; 2944 /* if there was a failure, MD_SB_CHANGE_DEVS was set, and we re-write super */ 2945 2946 if (mddev_is_clustered(mddev) && ret == 0) 2947 mddev->cluster_ops->metadata_update_finish(mddev); 2948 2949 if (mddev->in_sync != sync_req || 2950 !bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING), 2951 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN))) 2952 /* have to write it out again */ 2953 goto repeat; 2954 wake_up(&mddev->sb_wait); 2955 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 2956 sysfs_notify_dirent_safe(mddev->sysfs_completed); 2957 2958 rdev_for_each(rdev, mddev) { 2959 if (test_and_clear_bit(FaultRecorded, &rdev->flags)) 2960 clear_bit(Blocked, &rdev->flags); 2961 2962 if (any_badblocks_changed) 2963 ack_all_badblocks(&rdev->badblocks); 2964 clear_bit(BlockedBadBlocks, &rdev->flags); 2965 wake_up(&rdev->blocked_wait); 2966 } 2967} 2968EXPORT_SYMBOL(md_update_sb); 2969 2970static int add_bound_rdev(struct md_rdev *rdev) 2971{ 2972 struct mddev *mddev = rdev->mddev; 2973 int err = 0; 2974 bool add_journal = test_bit(Journal, &rdev->flags); 2975 2976 if (!mddev->pers->hot_remove_disk || add_journal) { 
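	/* Illustrative walk-through of the event-count handling in
	 * md_update_sb() above: on a clean->dirty transition with nospares
	 * set, events moves e.g. 100 -> 101 and spares whose sb_events is
	 * still 100 are skipped by sync_sbs(); if the array then goes back
	 * to clean while fully in sync, the counter is rolled back
	 * 101 -> 100 rather than advancing to 102, so those spare
	 * superblocks stay current without being rewritten.
	 */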
2977 /* If there is hot_add_disk but no hot_remove_disk 2978 * then added disks for geometry changes, 2979 * and should be added immediately. 2980 */ 2981 super_types[mddev->major_version]. 2982 validate_super(mddev, NULL/*freshest*/, rdev); 2983 err = mddev->pers->hot_add_disk(mddev, rdev); 2984 if (err) { 2985 md_kick_rdev_from_array(rdev); 2986 return err; 2987 } 2988 } 2989 sysfs_notify_dirent_safe(rdev->sysfs_state); 2990 2991 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 2992 if (mddev->degraded) 2993 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 2994 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2995 md_new_event(); 2996 return 0; 2997} 2998 2999/* words written to sysfs files may, or may not, be \n terminated. 3000 * We want to accept with case. For this we use cmd_match. 3001 */ 3002static int cmd_match(const char *cmd, const char *str) 3003{ 3004 /* See if cmd, written into a sysfs file, matches 3005 * str. They must either be the same, or cmd can 3006 * have a trailing newline 3007 */ 3008 while (*cmd && *str && *cmd == *str) { 3009 cmd++; 3010 str++; 3011 } 3012 if (*cmd == '\n') 3013 cmd++; 3014 if (*str || *cmd) 3015 return 0; 3016 return 1; 3017} 3018 3019struct rdev_sysfs_entry { 3020 struct attribute attr; 3021 ssize_t (*show)(struct md_rdev *, char *); 3022 ssize_t (*store)(struct md_rdev *, const char *, size_t); 3023}; 3024 3025static ssize_t 3026state_show(struct md_rdev *rdev, char *page) 3027{ 3028 char *sep = ","; 3029 size_t len = 0; 3030 unsigned long flags = READ_ONCE(rdev->flags); 3031 3032 if (test_bit(Faulty, &flags) || 3033 (!test_bit(ExternalBbl, &flags) && 3034 rdev->badblocks.unacked_exist)) 3035 len += sprintf(page+len, "faulty%s", sep); 3036 if (test_bit(In_sync, &flags)) 3037 len += sprintf(page+len, "in_sync%s", sep); 3038 if (test_bit(Journal, &flags)) 3039 len += sprintf(page+len, "journal%s", sep); 3040 if (test_bit(WriteMostly, &flags)) 3041 len += sprintf(page+len, "write_mostly%s", sep); 3042 if (test_bit(Blocked, &flags) || 3043 (rdev->badblocks.unacked_exist 3044 && !test_bit(Faulty, &flags))) 3045 len += sprintf(page+len, "blocked%s", sep); 3046 if (!test_bit(Faulty, &flags) && 3047 !test_bit(Journal, &flags) && 3048 !test_bit(In_sync, &flags)) 3049 len += sprintf(page+len, "spare%s", sep); 3050 if (test_bit(WriteErrorSeen, &flags)) 3051 len += sprintf(page+len, "write_error%s", sep); 3052 if (test_bit(WantReplacement, &flags)) 3053 len += sprintf(page+len, "want_replacement%s", sep); 3054 if (test_bit(Replacement, &flags)) 3055 len += sprintf(page+len, "replacement%s", sep); 3056 if (test_bit(ExternalBbl, &flags)) 3057 len += sprintf(page+len, "external_bbl%s", sep); 3058 if (test_bit(FailFast, &flags)) 3059 len += sprintf(page+len, "failfast%s", sep); 3060 3061 if (len) 3062 len -= strlen(sep); 3063 3064 return len+sprintf(page+len, "\n"); 3065} 3066 3067static ssize_t 3068state_store(struct md_rdev *rdev, const char *buf, size_t len) 3069{ 3070 /* can write 3071 * faulty - simulates an error 3072 * remove - disconnects the device 3073 * writemostly - sets write_mostly 3074 * -writemostly - clears write_mostly 3075 * blocked - sets the Blocked flags 3076 * -blocked - clears the Blocked and possibly simulates an error 3077 * insync - sets Insync providing device isn't active 3078 * -insync - clear Insync for a device with a slot assigned, 3079 * so that it gets rebuilt based on bitmap 3080 * write_error - sets WriteErrorSeen 3081 * -write_error - clears WriteErrorSeen 3082 * {,-}failfast - set/clear FailFast 3083 */ 3084 3085 struct mddev 
*mddev = rdev->mddev; 3086 int err = -EINVAL; 3087 bool need_update_sb = false; 3088 3089 if (cmd_match(buf, "faulty") && rdev->mddev->pers) { 3090 md_error(rdev->mddev, rdev); 3091 3092 if (test_bit(MD_BROKEN, &rdev->mddev->flags)) 3093 err = -EBUSY; 3094 else 3095 err = 0; 3096 } else if (cmd_match(buf, "remove")) { 3097 if (rdev->mddev->pers) { 3098 clear_bit(Blocked, &rdev->flags); 3099 remove_and_add_spares(rdev->mddev, rdev); 3100 } 3101 if (rdev->raid_disk >= 0) 3102 err = -EBUSY; 3103 else { 3104 err = 0; 3105 if (mddev_is_clustered(mddev)) 3106 err = mddev->cluster_ops->remove_disk(mddev, rdev); 3107 3108 if (err == 0) { 3109 md_kick_rdev_from_array(rdev); 3110 if (mddev->pers) 3111 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 3112 md_new_event(); 3113 } 3114 } 3115 } else if (cmd_match(buf, "writemostly")) { 3116 set_bit(WriteMostly, &rdev->flags); 3117 mddev_create_serial_pool(rdev->mddev, rdev); 3118 need_update_sb = true; 3119 err = 0; 3120 } else if (cmd_match(buf, "-writemostly")) { 3121 mddev_destroy_serial_pool(rdev->mddev, rdev); 3122 clear_bit(WriteMostly, &rdev->flags); 3123 need_update_sb = true; 3124 err = 0; 3125 } else if (cmd_match(buf, "blocked")) { 3126 set_bit(Blocked, &rdev->flags); 3127 err = 0; 3128 } else if (cmd_match(buf, "-blocked")) { 3129 if (!test_bit(Faulty, &rdev->flags) && 3130 !test_bit(ExternalBbl, &rdev->flags) && 3131 rdev->badblocks.unacked_exist) { 3132 /* metadata handler doesn't understand badblocks, 3133 * so we need to fail the device 3134 */ 3135 md_error(rdev->mddev, rdev); 3136 } 3137 clear_bit(Blocked, &rdev->flags); 3138 clear_bit(BlockedBadBlocks, &rdev->flags); 3139 wake_up(&rdev->blocked_wait); 3140 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 3141 3142 err = 0; 3143 } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { 3144 set_bit(In_sync, &rdev->flags); 3145 err = 0; 3146 } else if (cmd_match(buf, "failfast")) { 3147 set_bit(FailFast, &rdev->flags); 3148 need_update_sb = true; 3149 err = 0; 3150 } else if (cmd_match(buf, "-failfast")) { 3151 clear_bit(FailFast, &rdev->flags); 3152 need_update_sb = true; 3153 err = 0; 3154 } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 && 3155 !test_bit(Journal, &rdev->flags)) { 3156 if (rdev->mddev->pers == NULL) { 3157 clear_bit(In_sync, &rdev->flags); 3158 rdev->saved_raid_disk = rdev->raid_disk; 3159 rdev->raid_disk = -1; 3160 err = 0; 3161 } 3162 } else if (cmd_match(buf, "write_error")) { 3163 set_bit(WriteErrorSeen, &rdev->flags); 3164 err = 0; 3165 } else if (cmd_match(buf, "-write_error")) { 3166 clear_bit(WriteErrorSeen, &rdev->flags); 3167 err = 0; 3168 } else if (cmd_match(buf, "want_replacement")) { 3169 /* Any non-spare device that is not a replacement can 3170 * become want_replacement at any time, but we then need to 3171 * check if recovery is needed. 3172 */ 3173 if (rdev->raid_disk >= 0 && 3174 !test_bit(Journal, &rdev->flags) && 3175 !test_bit(Replacement, &rdev->flags)) 3176 set_bit(WantReplacement, &rdev->flags); 3177 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 3178 err = 0; 3179 } else if (cmd_match(buf, "-want_replacement")) { 3180 /* Clearing 'want_replacement' is always allowed. 3181 * Once replacements starts it is too late though. 3182 */ 3183 err = 0; 3184 clear_bit(WantReplacement, &rdev->flags); 3185 } else if (cmd_match(buf, "replacement")) { 3186 /* Can only set a device as a replacement when array has not 3187 * yet been started. Once running, replacement is automatic 3188 * from spares, or by assigning 'slot'. 
3189 */ 3190 if (rdev->mddev->pers) 3191 err = -EBUSY; 3192 else { 3193 set_bit(Replacement, &rdev->flags); 3194 err = 0; 3195 } 3196 } else if (cmd_match(buf, "-replacement")) { 3197 /* Similarly, can only clear Replacement before start */ 3198 if (rdev->mddev->pers) 3199 err = -EBUSY; 3200 else { 3201 clear_bit(Replacement, &rdev->flags); 3202 err = 0; 3203 } 3204 } else if (cmd_match(buf, "re-add")) { 3205 if (!rdev->mddev->pers) 3206 err = -EINVAL; 3207 else if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1) && 3208 rdev->saved_raid_disk >= 0) { 3209 /* clear_bit is performed _after_ all the devices 3210 * have their local Faulty bit cleared. If any writes 3211 * happen in the meantime in the local node, they 3212 * will land in the local bitmap, which will be synced 3213 * by this node eventually 3214 */ 3215 if (!mddev_is_clustered(rdev->mddev) || 3216 (err = mddev->cluster_ops->gather_bitmaps(rdev)) == 0) { 3217 clear_bit(Faulty, &rdev->flags); 3218 err = add_bound_rdev(rdev); 3219 } 3220 } else 3221 err = -EBUSY; 3222 } else if (cmd_match(buf, "external_bbl") && (rdev->mddev->external)) { 3223 set_bit(ExternalBbl, &rdev->flags); 3224 rdev->badblocks.shift = 0; 3225 err = 0; 3226 } else if (cmd_match(buf, "-external_bbl") && (rdev->mddev->external)) { 3227 clear_bit(ExternalBbl, &rdev->flags); 3228 err = 0; 3229 } 3230 if (need_update_sb) 3231 md_update_sb(mddev, 1); 3232 if (!err) 3233 sysfs_notify_dirent_safe(rdev->sysfs_state); 3234 return err ? err : len; 3235} 3236static struct rdev_sysfs_entry rdev_state = 3237__ATTR_PREALLOC(state, S_IRUGO|S_IWUSR, state_show, state_store); 3238 3239static ssize_t 3240errors_show(struct md_rdev *rdev, char *page) 3241{ 3242 return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors)); 3243} 3244 3245static ssize_t 3246errors_store(struct md_rdev *rdev, const char *buf, size_t len) 3247{ 3248 unsigned int n; 3249 int rv; 3250 3251 rv = kstrtouint(buf, 10, &n); 3252 if (rv < 0) 3253 return rv; 3254 atomic_set(&rdev->corrected_errors, n); 3255 return len; 3256} 3257static struct rdev_sysfs_entry rdev_errors = 3258__ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store); 3259 3260static ssize_t 3261slot_show(struct md_rdev *rdev, char *page) 3262{ 3263 if (test_bit(Journal, &rdev->flags)) 3264 return sprintf(page, "journal\n"); 3265 else if (rdev->raid_disk < 0) 3266 return sprintf(page, "none\n"); 3267 else 3268 return sprintf(page, "%d\n", rdev->raid_disk); 3269} 3270 3271static ssize_t 3272slot_store(struct md_rdev *rdev, const char *buf, size_t len) 3273{ 3274 int slot; 3275 int err; 3276 3277 if (test_bit(Journal, &rdev->flags)) 3278 return -EBUSY; 3279 if (strncmp(buf, "none", 4)==0) 3280 slot = -1; 3281 else { 3282 err = kstrtouint(buf, 10, (unsigned int *)&slot); 3283 if (err < 0) 3284 return err; 3285 if (slot < 0) 3286 /* overflow */ 3287 return -ENOSPC; 3288 } 3289 if (rdev->mddev->pers && slot == -1) { 3290 /* Setting 'slot' on an active array requires also 3291 * updating the 'rd%d' link, and communicating 3292 * with the personality with ->hot_*_disk. 3293 * For now we only support removing 3294 * failed/spare devices. This normally happens automatically, 3295 * but not when the metadata is externally managed. 
3296 */ 3297 if (rdev->raid_disk == -1) 3298 return -EEXIST; 3299 /* personality does all needed checks */ 3300 if (rdev->mddev->pers->hot_remove_disk == NULL) 3301 return -EINVAL; 3302 clear_bit(Blocked, &rdev->flags); 3303 remove_and_add_spares(rdev->mddev, rdev); 3304 if (rdev->raid_disk >= 0) 3305 return -EBUSY; 3306 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 3307 } else if (rdev->mddev->pers) { 3308 /* Activating a spare .. or possibly reactivating 3309 * if we ever get bitmaps working here. 3310 */ 3311 int err; 3312 3313 if (rdev->raid_disk != -1) 3314 return -EBUSY; 3315 3316 if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery)) 3317 return -EBUSY; 3318 3319 if (rdev->mddev->pers->hot_add_disk == NULL) 3320 return -EINVAL; 3321 3322 if (slot >= rdev->mddev->raid_disks && 3323 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) 3324 return -ENOSPC; 3325 3326 rdev->raid_disk = slot; 3327 if (test_bit(In_sync, &rdev->flags)) 3328 rdev->saved_raid_disk = slot; 3329 else 3330 rdev->saved_raid_disk = -1; 3331 clear_bit(In_sync, &rdev->flags); 3332 clear_bit(Bitmap_sync, &rdev->flags); 3333 err = rdev->mddev->pers->hot_add_disk(rdev->mddev, rdev); 3334 if (err) { 3335 rdev->raid_disk = -1; 3336 return err; 3337 } else 3338 sysfs_notify_dirent_safe(rdev->sysfs_state); 3339 /* failure here is OK */; 3340 sysfs_link_rdev(rdev->mddev, rdev); 3341 /* don't wakeup anyone, leave that to userspace. */ 3342 } else { 3343 if (slot >= rdev->mddev->raid_disks && 3344 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) 3345 return -ENOSPC; 3346 rdev->raid_disk = slot; 3347 /* assume it is working */ 3348 clear_bit(Faulty, &rdev->flags); 3349 clear_bit(WriteMostly, &rdev->flags); 3350 set_bit(In_sync, &rdev->flags); 3351 sysfs_notify_dirent_safe(rdev->sysfs_state); 3352 } 3353 return len; 3354} 3355 3356static struct rdev_sysfs_entry rdev_slot = 3357__ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store); 3358 3359static ssize_t 3360offset_show(struct md_rdev *rdev, char *page) 3361{ 3362 return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset); 3363} 3364 3365static ssize_t 3366offset_store(struct md_rdev *rdev, const char *buf, size_t len) 3367{ 3368 unsigned long long offset; 3369 if (kstrtoull(buf, 10, &offset) < 0) 3370 return -EINVAL; 3371 if (rdev->mddev->pers && rdev->raid_disk >= 0) 3372 return -EBUSY; 3373 if (rdev->sectors && rdev->mddev->external) 3374 /* Must set offset before size, so overlap checks 3375 * can be sane */ 3376 return -EBUSY; 3377 rdev->data_offset = offset; 3378 rdev->new_data_offset = offset; 3379 return len; 3380} 3381 3382static struct rdev_sysfs_entry rdev_offset = 3383__ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store); 3384 3385static ssize_t new_offset_show(struct md_rdev *rdev, char *page) 3386{ 3387 return sprintf(page, "%llu\n", 3388 (unsigned long long)rdev->new_data_offset); 3389} 3390 3391static ssize_t new_offset_store(struct md_rdev *rdev, 3392 const char *buf, size_t len) 3393{ 3394 unsigned long long new_offset; 3395 struct mddev *mddev = rdev->mddev; 3396 3397 if (kstrtoull(buf, 10, &new_offset) < 0) 3398 return -EINVAL; 3399 3400 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 3401 return -EBUSY; 3402 if (new_offset == rdev->data_offset) 3403 /* reset is always permitted */ 3404 ; 3405 else if (new_offset > rdev->data_offset) { 3406 /* must not push array size beyond rdev_sectors */ 3407 if (new_offset - rdev->data_offset 3408 + mddev->dev_sectors > rdev->sectors) 3409 return -E2BIG; 3410 } 3411 /* 
Metadata worries about other space details. */ 3412 3413 /* decreasing the offset is inconsistent with a backwards 3414 * reshape. 3415 */ 3416 if (new_offset < rdev->data_offset && 3417 mddev->reshape_backwards) 3418 return -EINVAL; 3419 /* Increasing offset is inconsistent with forwards 3420 * reshape. reshape_direction should be set to 3421 * 'backwards' first. 3422 */ 3423 if (new_offset > rdev->data_offset && 3424 !mddev->reshape_backwards) 3425 return -EINVAL; 3426 3427 if (mddev->pers && mddev->persistent && 3428 !super_types[mddev->major_version] 3429 .allow_new_offset(rdev, new_offset)) 3430 return -E2BIG; 3431 rdev->new_data_offset = new_offset; 3432 if (new_offset > rdev->data_offset) 3433 mddev->reshape_backwards = 1; 3434 else if (new_offset < rdev->data_offset) 3435 mddev->reshape_backwards = 0; 3436 3437 return len; 3438} 3439static struct rdev_sysfs_entry rdev_new_offset = 3440__ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store); 3441 3442static ssize_t 3443rdev_size_show(struct md_rdev *rdev, char *page) 3444{ 3445 return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2); 3446} 3447 3448static int md_rdevs_overlap(struct md_rdev *a, struct md_rdev *b) 3449{ 3450 /* check if two start/length pairs overlap */ 3451 if (a->data_offset + a->sectors <= b->data_offset) 3452 return false; 3453 if (b->data_offset + b->sectors <= a->data_offset) 3454 return false; 3455 return true; 3456} 3457 3458static bool md_rdev_overlaps(struct md_rdev *rdev) 3459{ 3460 struct mddev *mddev; 3461 struct md_rdev *rdev2; 3462 3463 spin_lock(&all_mddevs_lock); 3464 list_for_each_entry(mddev, &all_mddevs, all_mddevs) { 3465 if (test_bit(MD_DELETED, &mddev->flags)) 3466 continue; 3467 rdev_for_each(rdev2, mddev) { 3468 if (rdev != rdev2 && rdev->bdev == rdev2->bdev && 3469 md_rdevs_overlap(rdev, rdev2)) { 3470 spin_unlock(&all_mddevs_lock); 3471 return true; 3472 } 3473 } 3474 } 3475 spin_unlock(&all_mddevs_lock); 3476 return false; 3477} 3478 3479static int strict_blocks_to_sectors(const char *buf, sector_t *sectors) 3480{ 3481 unsigned long long blocks; 3482 sector_t new; 3483 3484 if (kstrtoull(buf, 10, &blocks) < 0) 3485 return -EINVAL; 3486 3487 if (blocks & 1ULL << (8 * sizeof(blocks) - 1)) 3488 return -EINVAL; /* sector conversion overflow */ 3489 3490 new = blocks * 2; 3491 if (new != blocks * 2) 3492 return -EINVAL; /* unsigned long long to sector_t overflow */ 3493 3494 *sectors = new; 3495 return 0; 3496} 3497 3498static ssize_t 3499rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) 3500{ 3501 struct mddev *my_mddev = rdev->mddev; 3502 sector_t oldsectors = rdev->sectors; 3503 sector_t sectors; 3504 3505 if (test_bit(Journal, &rdev->flags)) 3506 return -EBUSY; 3507 if (strict_blocks_to_sectors(buf, &sectors) < 0) 3508 return -EINVAL; 3509 if (rdev->data_offset != rdev->new_data_offset) 3510 return -EINVAL; /* too confusing */ 3511 if (my_mddev->pers && rdev->raid_disk >= 0) { 3512 if (my_mddev->persistent) { 3513 sectors = super_types[my_mddev->major_version]. 
3514 rdev_size_change(rdev, sectors); 3515 if (!sectors) 3516 return -EBUSY; 3517 } else if (!sectors) 3518 sectors = bdev_nr_sectors(rdev->bdev) - 3519 rdev->data_offset; 3520 if (!my_mddev->pers->resize) 3521 /* Cannot change size for RAID0 or Linear etc */ 3522 return -EINVAL; 3523 } 3524 if (sectors < my_mddev->dev_sectors) 3525 return -EINVAL; /* component must fit device */ 3526 3527 rdev->sectors = sectors; 3528 3529 /* 3530 * Check that all other rdevs with the same bdev do not overlap. This 3531 * check does not provide a hard guarantee, it just helps avoid 3532 * dangerous mistakes. 3533 */ 3534 if (sectors > oldsectors && my_mddev->external && 3535 md_rdev_overlaps(rdev)) { 3536 /* 3537 * Someone else could have slipped in a size change here, but 3538 * doing so is just silly. We put oldsectors back because we 3539 * know it is safe, and trust userspace not to race with itself. 3540 */ 3541 rdev->sectors = oldsectors; 3542 return -EBUSY; 3543 } 3544 return len; 3545} 3546 3547static struct rdev_sysfs_entry rdev_size = 3548__ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store); 3549 3550static ssize_t recovery_start_show(struct md_rdev *rdev, char *page) 3551{ 3552 unsigned long long recovery_start = rdev->recovery_offset; 3553 3554 if (test_bit(In_sync, &rdev->flags) || 3555 recovery_start == MaxSector) 3556 return sprintf(page, "none\n"); 3557 3558 return sprintf(page, "%llu\n", recovery_start); 3559} 3560 3561static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_t len) 3562{ 3563 unsigned long long recovery_start; 3564 3565 if (cmd_match(buf, "none")) 3566 recovery_start = MaxSector; 3567 else if (kstrtoull(buf, 10, &recovery_start)) 3568 return -EINVAL; 3569 3570 if (rdev->mddev->pers && 3571 rdev->raid_disk >= 0) 3572 return -EBUSY; 3573 3574 rdev->recovery_offset = recovery_start; 3575 if (recovery_start == MaxSector) 3576 set_bit(In_sync, &rdev->flags); 3577 else 3578 clear_bit(In_sync, &rdev->flags); 3579 return len; 3580} 3581 3582static struct rdev_sysfs_entry rdev_recovery_start = 3583__ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store); 3584 3585/* sysfs access to bad-blocks list. 3586 * We present two files. 3587 * 'bad-blocks' lists sector numbers and lengths of ranges that 3588 * are recorded as bad. The list is truncated to fit within 3589 * the one-page limit of sysfs. 3590 * Writing "sector length" to this file adds an acknowledged 3591 * bad block list. 3592 * 'unacknowledged-bad-blocks' lists bad blocks that have not yet 3593 * been acknowledged. Writing to this file adds bad blocks 3594 * without acknowledging them. This is largely for testing. 
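 * For example, assuming the usual badblocks sysfs syntax of
 * "sector length", writing "1000 8" to bad_blocks records sectors
 * 1000-1007 as an acknowledged bad range, while the same write to
 * unacknowledged_bad_blocks records it without the acknowledgement.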
3595 */ 3596static ssize_t bb_show(struct md_rdev *rdev, char *page) 3597{ 3598 return badblocks_show(&rdev->badblocks, page, 0); 3599} 3600static ssize_t bb_store(struct md_rdev *rdev, const char *page, size_t len) 3601{ 3602 int rv = badblocks_store(&rdev->badblocks, page, len, 0); 3603 /* Maybe that ack was all we needed */ 3604 if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags)) 3605 wake_up(&rdev->blocked_wait); 3606 return rv; 3607} 3608static struct rdev_sysfs_entry rdev_bad_blocks = 3609__ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store); 3610 3611static ssize_t ubb_show(struct md_rdev *rdev, char *page) 3612{ 3613 return badblocks_show(&rdev->badblocks, page, 1); 3614} 3615static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len) 3616{ 3617 return badblocks_store(&rdev->badblocks, page, len, 1); 3618} 3619static struct rdev_sysfs_entry rdev_unack_bad_blocks = 3620__ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store); 3621 3622static ssize_t 3623ppl_sector_show(struct md_rdev *rdev, char *page) 3624{ 3625 return sprintf(page, "%llu\n", (unsigned long long)rdev->ppl.sector); 3626} 3627 3628static ssize_t 3629ppl_sector_store(struct md_rdev *rdev, const char *buf, size_t len) 3630{ 3631 unsigned long long sector; 3632 3633 if (kstrtoull(buf, 10, &sector) < 0) 3634 return -EINVAL; 3635 if (sector != (sector_t)sector) 3636 return -EINVAL; 3637 3638 if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) && 3639 rdev->raid_disk >= 0) 3640 return -EBUSY; 3641 3642 if (rdev->mddev->persistent) { 3643 if (rdev->mddev->major_version == 0) 3644 return -EINVAL; 3645 if ((sector > rdev->sb_start && 3646 sector - rdev->sb_start > S16_MAX) || 3647 (sector < rdev->sb_start && 3648 rdev->sb_start - sector > -S16_MIN)) 3649 return -EINVAL; 3650 rdev->ppl.offset = sector - rdev->sb_start; 3651 } else if (!rdev->mddev->external) { 3652 return -EBUSY; 3653 } 3654 rdev->ppl.sector = sector; 3655 return len; 3656} 3657 3658static struct rdev_sysfs_entry rdev_ppl_sector = 3659__ATTR(ppl_sector, S_IRUGO|S_IWUSR, ppl_sector_show, ppl_sector_store); 3660 3661static ssize_t 3662ppl_size_show(struct md_rdev *rdev, char *page) 3663{ 3664 return sprintf(page, "%u\n", rdev->ppl.size); 3665} 3666 3667static ssize_t 3668ppl_size_store(struct md_rdev *rdev, const char *buf, size_t len) 3669{ 3670 unsigned int size; 3671 3672 if (kstrtouint(buf, 10, &size) < 0) 3673 return -EINVAL; 3674 3675 if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) && 3676 rdev->raid_disk >= 0) 3677 return -EBUSY; 3678 3679 if (rdev->mddev->persistent) { 3680 if (rdev->mddev->major_version == 0) 3681 return -EINVAL; 3682 if (size > U16_MAX) 3683 return -EINVAL; 3684 } else if (!rdev->mddev->external) { 3685 return -EBUSY; 3686 } 3687 rdev->ppl.size = size; 3688 return len; 3689} 3690 3691static struct rdev_sysfs_entry rdev_ppl_size = 3692__ATTR(ppl_size, S_IRUGO|S_IWUSR, ppl_size_show, ppl_size_store); 3693 3694static struct attribute *rdev_default_attrs[] = { 3695 &rdev_state.attr, 3696 &rdev_errors.attr, 3697 &rdev_slot.attr, 3698 &rdev_offset.attr, 3699 &rdev_new_offset.attr, 3700 &rdev_size.attr, 3701 &rdev_recovery_start.attr, 3702 &rdev_bad_blocks.attr, 3703 &rdev_unack_bad_blocks.attr, 3704 &rdev_ppl_sector.attr, 3705 &rdev_ppl_size.attr, 3706 NULL, 3707}; 3708ATTRIBUTE_GROUPS(rdev_default); 3709static ssize_t 3710rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 3711{ 3712 struct rdev_sysfs_entry *entry = container_of(attr, struct 
rdev_sysfs_entry, attr); 3713 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); 3714 3715 if (!entry->show) 3716 return -EIO; 3717 if (!rdev->mddev) 3718 return -ENODEV; 3719 return entry->show(rdev, page); 3720} 3721 3722static ssize_t 3723rdev_attr_store(struct kobject *kobj, struct attribute *attr, 3724 const char *page, size_t length) 3725{ 3726 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 3727 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); 3728 struct kernfs_node *kn = NULL; 3729 bool suspend = false; 3730 ssize_t rv; 3731 struct mddev *mddev = READ_ONCE(rdev->mddev); 3732 3733 if (!entry->store) 3734 return -EIO; 3735 if (!capable(CAP_SYS_ADMIN)) 3736 return -EACCES; 3737 if (!mddev) 3738 return -ENODEV; 3739 3740 if (entry->store == state_store) { 3741 if (cmd_match(page, "remove")) 3742 kn = sysfs_break_active_protection(kobj, attr); 3743 if (cmd_match(page, "remove") || cmd_match(page, "re-add") || 3744 cmd_match(page, "writemostly") || 3745 cmd_match(page, "-writemostly")) 3746 suspend = true; 3747 } 3748 3749 rv = suspend ? mddev_suspend_and_lock(mddev) : mddev_lock(mddev); 3750 if (!rv) { 3751 if (rdev->mddev == NULL) 3752 rv = -ENODEV; 3753 else 3754 rv = entry->store(rdev, page, length); 3755 suspend ? mddev_unlock_and_resume(mddev) : mddev_unlock(mddev); 3756 } 3757 3758 if (kn) 3759 sysfs_unbreak_active_protection(kn); 3760 3761 return rv; 3762} 3763 3764static void rdev_free(struct kobject *ko) 3765{ 3766 struct md_rdev *rdev = container_of(ko, struct md_rdev, kobj); 3767 kfree(rdev); 3768} 3769static const struct sysfs_ops rdev_sysfs_ops = { 3770 .show = rdev_attr_show, 3771 .store = rdev_attr_store, 3772}; 3773static const struct kobj_type rdev_ktype = { 3774 .release = rdev_free, 3775 .sysfs_ops = &rdev_sysfs_ops, 3776 .default_groups = rdev_default_groups, 3777}; 3778 3779int md_rdev_init(struct md_rdev *rdev) 3780{ 3781 rdev->desc_nr = -1; 3782 rdev->saved_raid_disk = -1; 3783 rdev->raid_disk = -1; 3784 rdev->flags = 0; 3785 rdev->data_offset = 0; 3786 rdev->new_data_offset = 0; 3787 rdev->sb_events = 0; 3788 rdev->last_read_error = 0; 3789 rdev->sb_loaded = 0; 3790 rdev->bb_page = NULL; 3791 atomic_set(&rdev->nr_pending, 0); 3792 atomic_set(&rdev->read_errors, 0); 3793 atomic_set(&rdev->corrected_errors, 0); 3794 3795 INIT_LIST_HEAD(&rdev->same_set); 3796 init_waitqueue_head(&rdev->blocked_wait); 3797 3798 /* Add space to store bad block list. 3799 * This reserves the space even on arrays where it cannot 3800 * be used - I wonder if that matters 3801 */ 3802 return badblocks_init(&rdev->badblocks, 0); 3803} 3804EXPORT_SYMBOL_GPL(md_rdev_init); 3805 3806/* 3807 * Import a device. If 'super_format' >= 0, then sanity check the superblock 3808 * 3809 * mark the device faulty if: 3810 * 3811 * - the device is nonexistent (zero size) 3812 * - the device has no valid superblock 3813 * 3814 * a faulty rdev _never_ has rdev->sb set. 3815 */ 3816static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor) 3817{ 3818 struct md_rdev *rdev; 3819 sector_t size; 3820 int err; 3821 3822 rdev = kzalloc(sizeof(*rdev), GFP_KERNEL); 3823 if (!rdev) 3824 return ERR_PTR(-ENOMEM); 3825 3826 err = md_rdev_init(rdev); 3827 if (err) 3828 goto out_free_rdev; 3829 err = alloc_disk_sb(rdev); 3830 if (err) 3831 goto out_clear_rdev; 3832 3833 rdev->bdev_file = bdev_file_open_by_dev(newdev, 3834 BLK_OPEN_READ | BLK_OPEN_WRITE, 3835 super_format == -2 ? 
&claim_rdev : rdev, NULL); 3836 if (IS_ERR(rdev->bdev_file)) { 3837 pr_warn("md: could not open device unknown-block(%u,%u).\n", 3838 MAJOR(newdev), MINOR(newdev)); 3839 err = PTR_ERR(rdev->bdev_file); 3840 goto out_clear_rdev; 3841 } 3842 rdev->bdev = file_bdev(rdev->bdev_file); 3843 3844 kobject_init(&rdev->kobj, &rdev_ktype); 3845 3846 size = bdev_nr_bytes(rdev->bdev) >> BLOCK_SIZE_BITS; 3847 if (!size) { 3848 pr_warn("md: %pg has zero or unknown size, marking faulty!\n", 3849 rdev->bdev); 3850 err = -EINVAL; 3851 goto out_blkdev_put; 3852 } 3853 3854 if (super_format >= 0) { 3855 err = super_types[super_format]. 3856 load_super(rdev, NULL, super_minor); 3857 if (err == -EINVAL) { 3858 pr_warn("md: %pg does not have a valid v%d.%d superblock, not importing!\n", 3859 rdev->bdev, 3860 super_format, super_minor); 3861 goto out_blkdev_put; 3862 } 3863 if (err < 0) { 3864 pr_warn("md: could not read %pg's sb, not importing!\n", 3865 rdev->bdev); 3866 goto out_blkdev_put; 3867 } 3868 } 3869 3870 return rdev; 3871 3872out_blkdev_put: 3873 fput(rdev->bdev_file); 3874out_clear_rdev: 3875 md_rdev_clear(rdev); 3876out_free_rdev: 3877 kfree(rdev); 3878 return ERR_PTR(err); 3879} 3880 3881/* 3882 * Check a full RAID array for plausibility 3883 */ 3884 3885static int analyze_sbs(struct mddev *mddev) 3886{ 3887 struct md_rdev *rdev, *freshest, *tmp; 3888 3889 freshest = NULL; 3890 rdev_for_each_safe(rdev, tmp, mddev) 3891 switch (super_types[mddev->major_version]. 3892 load_super(rdev, freshest, mddev->minor_version)) { 3893 case 1: 3894 freshest = rdev; 3895 break; 3896 case 0: 3897 break; 3898 default: 3899 pr_warn("md: fatal superblock inconsistency in %pg -- removing from array\n", 3900 rdev->bdev); 3901 md_kick_rdev_from_array(rdev); 3902 } 3903 3904 /* Cannot find a valid fresh disk */ 3905 if (!freshest) { 3906 pr_warn("md: cannot find a valid disk\n"); 3907 return -EINVAL; 3908 } 3909 3910 super_types[mddev->major_version]. 3911 validate_super(mddev, NULL/*freshest*/, freshest); 3912 3913 rdev_for_each_safe(rdev, tmp, mddev) { 3914 if (mddev->max_disks && 3915 rdev->desc_nr >= mddev->max_disks) { 3916 pr_warn("md: %s: %pg: only %d devices permitted\n", 3917 mdname(mddev), rdev->bdev, 3918 mddev->max_disks); 3919 md_kick_rdev_from_array(rdev); 3920 continue; 3921 } 3922 if (rdev != freshest) { 3923 if (super_types[mddev->major_version]. 3924 validate_super(mddev, freshest, rdev)) { 3925 pr_warn("md: kicking non-fresh %pg from array!\n", 3926 rdev->bdev); 3927 md_kick_rdev_from_array(rdev); 3928 continue; 3929 } 3930 } 3931 if (rdev->raid_disk >= (mddev->raid_disks - min(0, mddev->delta_disks)) && 3932 !test_bit(Journal, &rdev->flags)) { 3933 rdev->raid_disk = -1; 3934 clear_bit(In_sync, &rdev->flags); 3935 } 3936 } 3937 3938 return 0; 3939} 3940 3941/* Read a fixed-point number. 3942 * Numbers in sysfs attributes should be in "standard" units where 3943 * possible, so time should be in seconds. 3944 * However we internally use a a much smaller unit such as 3945 * milliseconds or jiffies. 3946 * This function takes a decimal number with a possible fractional 3947 * component, and produces an integer which is the result of 3948 * multiplying that number by 10^'scale'. 3949 * all without any floating-point arithmetic. 3950 */ 3951int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale) 3952{ 3953 unsigned long result = 0; 3954 long decimals = -1; 3955 while (isdigit(*cp) || (*cp == '.' 
&& decimals < 0)) { 3956 if (*cp == '.') 3957 decimals = 0; 3958 else if (decimals < scale) { 3959 unsigned int value; 3960 value = *cp - '0'; 3961 result = result * 10 + value; 3962 if (decimals >= 0) 3963 decimals++; 3964 } 3965 cp++; 3966 } 3967 if (*cp == '\n') 3968 cp++; 3969 if (*cp) 3970 return -EINVAL; 3971 if (decimals < 0) 3972 decimals = 0; 3973 *res = result * int_pow(10, scale - decimals); 3974 return 0; 3975} 3976 3977static ssize_t 3978safe_delay_show(struct mddev *mddev, char *page) 3979{ 3980 unsigned int msec = ((unsigned long)mddev->safemode_delay*1000)/HZ; 3981 3982 return sprintf(page, "%u.%03u\n", msec/1000, msec%1000); 3983} 3984static ssize_t 3985safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len) 3986{ 3987 unsigned long msec; 3988 3989 if (mddev_is_clustered(mddev)) { 3990 pr_warn("md: Safemode is disabled for clustered mode\n"); 3991 return -EINVAL; 3992 } 3993 3994 if (strict_strtoul_scaled(cbuf, &msec, 3) < 0 || msec > UINT_MAX / HZ) 3995 return -EINVAL; 3996 if (msec == 0) 3997 mddev->safemode_delay = 0; 3998 else { 3999 unsigned long old_delay = mddev->safemode_delay; 4000 unsigned long new_delay = (msec*HZ)/1000; 4001 4002 if (new_delay == 0) 4003 new_delay = 1; 4004 mddev->safemode_delay = new_delay; 4005 if (new_delay < old_delay || old_delay == 0) 4006 mod_timer(&mddev->safemode_timer, jiffies+1); 4007 } 4008 return len; 4009} 4010static struct md_sysfs_entry md_safe_delay = 4011__ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store); 4012 4013static ssize_t 4014level_show(struct mddev *mddev, char *page) 4015{ 4016 struct md_personality *p; 4017 int ret; 4018 spin_lock(&mddev->lock); 4019 p = mddev->pers; 4020 if (p) 4021 ret = sprintf(page, "%s\n", p->head.name); 4022 else if (mddev->clevel[0]) 4023 ret = sprintf(page, "%s\n", mddev->clevel); 4024 else if (mddev->level != LEVEL_NONE) 4025 ret = sprintf(page, "%d\n", mddev->level); 4026 else 4027 ret = 0; 4028 spin_unlock(&mddev->lock); 4029 return ret; 4030} 4031 4032static ssize_t 4033level_store(struct mddev *mddev, const char *buf, size_t len) 4034{ 4035 char clevel[16]; 4036 ssize_t rv; 4037 size_t slen = len; 4038 struct md_personality *pers, *oldpers; 4039 long level; 4040 void *priv, *oldpriv; 4041 struct md_rdev *rdev; 4042 4043 if (slen == 0 || slen >= sizeof(clevel)) 4044 return -EINVAL; 4045 4046 rv = mddev_suspend_and_lock(mddev); 4047 if (rv) 4048 return rv; 4049 4050 if (mddev->pers == NULL) { 4051 memcpy(mddev->clevel, buf, slen); 4052 if (mddev->clevel[slen-1] == '\n') 4053 slen--; 4054 mddev->clevel[slen] = 0; 4055 mddev->level = LEVEL_NONE; 4056 rv = len; 4057 goto out_unlock; 4058 } 4059 rv = -EROFS; 4060 if (!md_is_rdwr(mddev)) 4061 goto out_unlock; 4062 4063 /* request to change the personality. Need to ensure: 4064 * - array is not engaged in resync/recovery/reshape 4065 * - old personality can be suspended 4066 * - new personality will access other array. 
4067 */ 4068 4069 rv = -EBUSY; 4070 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 4071 mddev->reshape_position != MaxSector || 4072 mddev->sysfs_active) 4073 goto out_unlock; 4074 4075 rv = -EINVAL; 4076 if (!mddev->pers->quiesce) { 4077 pr_warn("md: %s: %s does not support online personality change\n", 4078 mdname(mddev), mddev->pers->head.name); 4079 goto out_unlock; 4080 } 4081 4082 /* Now find the new personality */ 4083 memcpy(clevel, buf, slen); 4084 if (clevel[slen-1] == '\n') 4085 slen--; 4086 clevel[slen] = 0; 4087 if (kstrtol(clevel, 10, &level)) 4088 level = LEVEL_NONE; 4089 4090 if (request_module("md-%s", clevel) != 0) 4091 request_module("md-level-%s", clevel); 4092 pers = get_pers(level, clevel); 4093 if (!pers) { 4094 rv = -EINVAL; 4095 goto out_unlock; 4096 } 4097 4098 if (pers == mddev->pers) { 4099 /* Nothing to do! */ 4100 put_pers(pers); 4101 rv = len; 4102 goto out_unlock; 4103 } 4104 if (!pers->takeover) { 4105 put_pers(pers); 4106 pr_warn("md: %s: %s does not support personality takeover\n", 4107 mdname(mddev), clevel); 4108 rv = -EINVAL; 4109 goto out_unlock; 4110 } 4111 4112 rdev_for_each(rdev, mddev) 4113 rdev->new_raid_disk = rdev->raid_disk; 4114 4115 /* ->takeover must set new_* and/or delta_disks 4116 * if it succeeds, and may set them when it fails. 4117 */ 4118 priv = pers->takeover(mddev); 4119 if (IS_ERR(priv)) { 4120 mddev->new_level = mddev->level; 4121 mddev->new_layout = mddev->layout; 4122 mddev->new_chunk_sectors = mddev->chunk_sectors; 4123 mddev->raid_disks -= mddev->delta_disks; 4124 mddev->delta_disks = 0; 4125 mddev->reshape_backwards = 0; 4126 put_pers(pers); 4127 pr_warn("md: %s: %s would not accept array\n", 4128 mdname(mddev), clevel); 4129 rv = PTR_ERR(priv); 4130 goto out_unlock; 4131 } 4132 4133 /* Looks like we have a winner */ 4134 mddev_detach(mddev); 4135 4136 spin_lock(&mddev->lock); 4137 oldpers = mddev->pers; 4138 oldpriv = mddev->private; 4139 mddev->pers = pers; 4140 mddev->private = priv; 4141 strscpy(mddev->clevel, pers->head.name, sizeof(mddev->clevel)); 4142 mddev->level = mddev->new_level; 4143 mddev->layout = mddev->new_layout; 4144 mddev->chunk_sectors = mddev->new_chunk_sectors; 4145 mddev->delta_disks = 0; 4146 mddev->reshape_backwards = 0; 4147 mddev->degraded = 0; 4148 spin_unlock(&mddev->lock); 4149 4150 if (oldpers->sync_request == NULL && 4151 mddev->external) { 4152 /* We are converting from a no-redundancy array 4153 * to a redundancy array and metadata is managed 4154 * externally so we need to be sure that writes 4155 * won't block due to a need to transition 4156 * clean->dirty 4157 * until external management is started. 
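 * (The assignments below clear in_sync and disable safemode, so no
 * clean->dirty transition is ever required before user writes proceed.)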
4158 */ 4159 mddev->in_sync = 0; 4160 mddev->safemode_delay = 0; 4161 mddev->safemode = 0; 4162 } 4163 4164 oldpers->free(mddev, oldpriv); 4165 4166 if (oldpers->sync_request == NULL && 4167 pers->sync_request != NULL) { 4168 /* need to add the md_redundancy_group */ 4169 if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) 4170 pr_warn("md: cannot register extra attributes for %s\n", 4171 mdname(mddev)); 4172 mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action"); 4173 mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed"); 4174 mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded"); 4175 } 4176 if (oldpers->sync_request != NULL && 4177 pers->sync_request == NULL) { 4178 /* need to remove the md_redundancy_group */ 4179 if (mddev->to_remove == NULL) 4180 mddev->to_remove = &md_redundancy_group; 4181 } 4182 4183 put_pers(oldpers); 4184 4185 rdev_for_each(rdev, mddev) { 4186 if (rdev->raid_disk < 0) 4187 continue; 4188 if (rdev->new_raid_disk >= mddev->raid_disks) 4189 rdev->new_raid_disk = -1; 4190 if (rdev->new_raid_disk == rdev->raid_disk) 4191 continue; 4192 sysfs_unlink_rdev(mddev, rdev); 4193 } 4194 rdev_for_each(rdev, mddev) { 4195 if (rdev->raid_disk < 0) 4196 continue; 4197 if (rdev->new_raid_disk == rdev->raid_disk) 4198 continue; 4199 rdev->raid_disk = rdev->new_raid_disk; 4200 if (rdev->raid_disk < 0) 4201 clear_bit(In_sync, &rdev->flags); 4202 else { 4203 if (sysfs_link_rdev(mddev, rdev)) 4204 pr_warn("md: cannot register rd%d for %s after level change\n", 4205 rdev->raid_disk, mdname(mddev)); 4206 } 4207 } 4208 4209 if (pers->sync_request == NULL) { 4210 /* this is now an array without redundancy, so 4211 * it must always be in_sync 4212 */ 4213 mddev->in_sync = 1; 4214 timer_delete_sync(&mddev->safemode_timer); 4215 } 4216 pers->run(mddev); 4217 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 4218 if (!mddev->thread) 4219 md_update_sb(mddev, 1); 4220 sysfs_notify_dirent_safe(mddev->sysfs_level); 4221 md_new_event(); 4222 rv = len; 4223out_unlock: 4224 mddev_unlock_and_resume(mddev); 4225 return rv; 4226} 4227 4228static struct md_sysfs_entry md_level = 4229__ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store); 4230 4231static ssize_t 4232new_level_show(struct mddev *mddev, char *page) 4233{ 4234 return sprintf(page, "%d\n", mddev->new_level); 4235} 4236 4237static ssize_t 4238new_level_store(struct mddev *mddev, const char *buf, size_t len) 4239{ 4240 unsigned int n; 4241 int err; 4242 4243 err = kstrtouint(buf, 10, &n); 4244 if (err < 0) 4245 return err; 4246 err = mddev_lock(mddev); 4247 if (err) 4248 return err; 4249 4250 mddev->new_level = n; 4251 md_update_sb(mddev, 1); 4252 4253 mddev_unlock(mddev); 4254 return len; 4255} 4256static struct md_sysfs_entry md_new_level = 4257__ATTR(new_level, 0664, new_level_show, new_level_store); 4258 4259static ssize_t 4260bitmap_type_show(struct mddev *mddev, char *page) 4261{ 4262 struct md_submodule_head *head; 4263 unsigned long i; 4264 ssize_t len = 0; 4265 4266 if (mddev->bitmap_id == ID_BITMAP_NONE) 4267 len += sprintf(page + len, "[none] "); 4268 else 4269 len += sprintf(page + len, "none "); 4270 4271 xa_lock(&md_submodule); 4272 xa_for_each(&md_submodule, i, head) { 4273 if (head->type != MD_BITMAP) 4274 continue; 4275 4276 if (mddev->bitmap_id == head->id) 4277 len += sprintf(page + len, "[%s] ", head->name); 4278 else 4279 len += sprintf(page + len, "%s ", head->name); 4280 } 4281 xa_unlock(&md_submodule); 4282 4283 len += sprintf(page + len, "\n"); 4284 
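	/*
	 * At this point 'page' lists every registered bitmap type with the
	 * selected one in brackets, e.g. "[none] bitmap \n" assuming only
	 * the classic bitmap submodule is registered and none is selected.
	 */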
return len; 4285} 4286 4287static ssize_t 4288bitmap_type_store(struct mddev *mddev, const char *buf, size_t len) 4289{ 4290 struct md_submodule_head *head; 4291 enum md_submodule_id id; 4292 unsigned long i; 4293 int err = 0; 4294 4295 xa_lock(&md_submodule); 4296 4297 if (mddev->bitmap_ops) { 4298 err = -EBUSY; 4299 goto out; 4300 } 4301 4302 if (cmd_match(buf, "none")) { 4303 mddev->bitmap_id = ID_BITMAP_NONE; 4304 goto out; 4305 } 4306 4307 xa_for_each(&md_submodule, i, head) { 4308 if (head->type == MD_BITMAP && cmd_match(buf, head->name)) { 4309 mddev->bitmap_id = head->id; 4310 goto out; 4311 } 4312 } 4313 4314 err = kstrtoint(buf, 10, &id); 4315 if (err) 4316 goto out; 4317 4318 if (id == ID_BITMAP_NONE) { 4319 mddev->bitmap_id = id; 4320 goto out; 4321 } 4322 4323 head = xa_load(&md_submodule, id); 4324 if (head && head->type == MD_BITMAP) { 4325 mddev->bitmap_id = id; 4326 goto out; 4327 } 4328 4329 err = -ENOENT; 4330 4331out: 4332 xa_unlock(&md_submodule); 4333 return err ? err : len; 4334} 4335 4336static struct md_sysfs_entry md_bitmap_type = 4337__ATTR(bitmap_type, 0664, bitmap_type_show, bitmap_type_store); 4338 4339static ssize_t 4340layout_show(struct mddev *mddev, char *page) 4341{ 4342 /* just a number, not meaningful for all levels */ 4343 if (mddev->reshape_position != MaxSector && 4344 mddev->layout != mddev->new_layout) 4345 return sprintf(page, "%d (%d)\n", 4346 mddev->new_layout, mddev->layout); 4347 return sprintf(page, "%d\n", mddev->layout); 4348} 4349 4350static ssize_t 4351layout_store(struct mddev *mddev, const char *buf, size_t len) 4352{ 4353 unsigned int n; 4354 int err; 4355 4356 err = kstrtouint(buf, 10, &n); 4357 if (err < 0) 4358 return err; 4359 err = mddev_lock(mddev); 4360 if (err) 4361 return err; 4362 4363 if (mddev->pers) { 4364 if (mddev->pers->check_reshape == NULL) 4365 err = -EBUSY; 4366 else if (!md_is_rdwr(mddev)) 4367 err = -EROFS; 4368 else { 4369 mddev->new_layout = n; 4370 err = mddev->pers->check_reshape(mddev); 4371 if (err) 4372 mddev->new_layout = mddev->layout; 4373 } 4374 } else { 4375 mddev->new_layout = n; 4376 if (mddev->reshape_position == MaxSector) 4377 mddev->layout = n; 4378 } 4379 mddev_unlock(mddev); 4380 return err ?: len; 4381} 4382static struct md_sysfs_entry md_layout = 4383__ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store); 4384 4385static ssize_t 4386raid_disks_show(struct mddev *mddev, char *page) 4387{ 4388 if (mddev->raid_disks == 0) 4389 return 0; 4390 if (mddev->reshape_position != MaxSector && 4391 mddev->delta_disks != 0) 4392 return sprintf(page, "%d (%d)\n", mddev->raid_disks, 4393 mddev->raid_disks - mddev->delta_disks); 4394 return sprintf(page, "%d\n", mddev->raid_disks); 4395} 4396 4397static int update_raid_disks(struct mddev *mddev, int raid_disks); 4398 4399static ssize_t 4400raid_disks_store(struct mddev *mddev, const char *buf, size_t len) 4401{ 4402 unsigned int n; 4403 int err; 4404 4405 err = kstrtouint(buf, 10, &n); 4406 if (err < 0) 4407 return err; 4408 4409 err = mddev_suspend_and_lock(mddev); 4410 if (err) 4411 return err; 4412 if (mddev->pers) 4413 err = update_raid_disks(mddev, n); 4414 else if (mddev->reshape_position != MaxSector) { 4415 struct md_rdev *rdev; 4416 int olddisks = mddev->raid_disks - mddev->delta_disks; 4417 4418 err = -EINVAL; 4419 rdev_for_each(rdev, mddev) { 4420 if (olddisks < n && 4421 rdev->data_offset < rdev->new_data_offset) 4422 goto out_unlock; 4423 if (olddisks > n && 4424 rdev->data_offset > rdev->new_data_offset) 4425 goto out_unlock; 4426 } 4427 
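		/*
		 * The loop above only permits changing raid_disks while a
		 * reshape is pending when the new count is consistent with
		 * the recorded data offsets: growing requires
		 * new_data_offset <= data_offset on every rdev, shrinking
		 * requires the reverse.
		 */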
err = 0; 4428 mddev->delta_disks = n - olddisks; 4429 mddev->raid_disks = n; 4430 mddev->reshape_backwards = (mddev->delta_disks < 0); 4431 } else 4432 mddev->raid_disks = n; 4433out_unlock: 4434 mddev_unlock_and_resume(mddev); 4435 return err ? err : len; 4436} 4437static struct md_sysfs_entry md_raid_disks = 4438__ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store); 4439 4440static ssize_t 4441uuid_show(struct mddev *mddev, char *page) 4442{ 4443 return sprintf(page, "%pU\n", mddev->uuid); 4444} 4445static struct md_sysfs_entry md_uuid = 4446__ATTR(uuid, S_IRUGO, uuid_show, NULL); 4447 4448static ssize_t 4449chunk_size_show(struct mddev *mddev, char *page) 4450{ 4451 if (mddev->reshape_position != MaxSector && 4452 mddev->chunk_sectors != mddev->new_chunk_sectors) 4453 return sprintf(page, "%d (%d)\n", 4454 mddev->new_chunk_sectors << 9, 4455 mddev->chunk_sectors << 9); 4456 return sprintf(page, "%d\n", mddev->chunk_sectors << 9); 4457} 4458 4459static ssize_t 4460chunk_size_store(struct mddev *mddev, const char *buf, size_t len) 4461{ 4462 unsigned long n; 4463 int err; 4464 4465 err = kstrtoul(buf, 10, &n); 4466 if (err < 0) 4467 return err; 4468 4469 err = mddev_lock(mddev); 4470 if (err) 4471 return err; 4472 if (mddev->pers) { 4473 if (mddev->pers->check_reshape == NULL) 4474 err = -EBUSY; 4475 else if (!md_is_rdwr(mddev)) 4476 err = -EROFS; 4477 else { 4478 mddev->new_chunk_sectors = n >> 9; 4479 err = mddev->pers->check_reshape(mddev); 4480 if (err) 4481 mddev->new_chunk_sectors = mddev->chunk_sectors; 4482 } 4483 } else { 4484 mddev->new_chunk_sectors = n >> 9; 4485 if (mddev->reshape_position == MaxSector) 4486 mddev->chunk_sectors = n >> 9; 4487 } 4488 mddev_unlock(mddev); 4489 return err ?: len; 4490} 4491static struct md_sysfs_entry md_chunk_size = 4492__ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store); 4493 4494static ssize_t 4495resync_start_show(struct mddev *mddev, char *page) 4496{ 4497 if (mddev->resync_offset == MaxSector) 4498 return sprintf(page, "none\n"); 4499 return sprintf(page, "%llu\n", (unsigned long long)mddev->resync_offset); 4500} 4501 4502static ssize_t 4503resync_start_store(struct mddev *mddev, const char *buf, size_t len) 4504{ 4505 unsigned long long n; 4506 int err; 4507 4508 if (cmd_match(buf, "none")) 4509 n = MaxSector; 4510 else { 4511 err = kstrtoull(buf, 10, &n); 4512 if (err < 0) 4513 return err; 4514 if (n != (sector_t)n) 4515 return -EINVAL; 4516 } 4517 4518 err = mddev_lock(mddev); 4519 if (err) 4520 return err; 4521 if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) 4522 err = -EBUSY; 4523 4524 if (!err) { 4525 mddev->resync_offset = n; 4526 if (mddev->pers) 4527 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 4528 } 4529 mddev_unlock(mddev); 4530 return err ?: len; 4531} 4532static struct md_sysfs_entry md_resync_start = 4533__ATTR_PREALLOC(resync_start, S_IRUGO|S_IWUSR, 4534 resync_start_show, resync_start_store); 4535 4536/* 4537 * The array state can be: 4538 * 4539 * clear 4540 * No devices, no size, no level 4541 * Equivalent to STOP_ARRAY ioctl 4542 * inactive 4543 * May have some settings, but array is not active 4544 * all IO results in error 4545 * When written, doesn't tear down array, but just stops it 4546 * suspended (not supported yet) 4547 * All IO requests will block. The array can be reconfigured. 4548 * Writing this, if accepted, will block until array is quiescent 4549 * readonly 4550 * no resync can happen. no superblocks get written. 
4551 * write requests fail 4552 * read-auto 4553 * like readonly, but behaves like 'clean' on a write request. 4554 * 4555 * clean - no pending writes, but otherwise active. 4556 * When written to inactive array, starts without resync 4557 * If a write request arrives then 4558 * if metadata is known, mark 'dirty' and switch to 'active'. 4559 * if not known, block and switch to write-pending 4560 * If written to an active array that has pending writes, then fails. 4561 * active 4562 * fully active: IO and resync can be happening. 4563 * When written to inactive array, starts with resync 4564 * 4565 * write-pending 4566 * clean, but writes are blocked waiting for 'active' to be written. 4567 * 4568 * active-idle 4569 * like active, but no writes have been seen for a while (100msec). 4570 * 4571 * broken 4572* Array is failed. It's useful because mounted-arrays aren't stopped 4573* when array is failed, so this state will at least alert the user that 4574* something is wrong. 4575 */ 4576enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active, 4577 write_pending, active_idle, broken, bad_word}; 4578static char *array_states[] = { 4579 "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active", 4580 "write-pending", "active-idle", "broken", NULL }; 4581 4582static int match_word(const char *word, char **list) 4583{ 4584 int n; 4585 for (n=0; list[n]; n++) 4586 if (cmd_match(word, list[n])) 4587 break; 4588 return n; 4589} 4590 4591static ssize_t 4592array_state_show(struct mddev *mddev, char *page) 4593{ 4594 enum array_state st = inactive; 4595 4596 if (mddev->pers && !test_bit(MD_NOT_READY, &mddev->flags)) { 4597 switch(mddev->ro) { 4598 case MD_RDONLY: 4599 st = readonly; 4600 break; 4601 case MD_AUTO_READ: 4602 st = read_auto; 4603 break; 4604 case MD_RDWR: 4605 spin_lock(&mddev->lock); 4606 if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) 4607 st = write_pending; 4608 else if (mddev->in_sync) 4609 st = clean; 4610 else if (mddev->safemode) 4611 st = active_idle; 4612 else 4613 st = active; 4614 spin_unlock(&mddev->lock); 4615 } 4616 4617 if (test_bit(MD_BROKEN, &mddev->flags) && st == clean) 4618 st = broken; 4619 } else { 4620 if (list_empty(&mddev->disks) && 4621 mddev->raid_disks == 0 && 4622 mddev->dev_sectors == 0) 4623 st = clear; 4624 else 4625 st = inactive; 4626 } 4627 return sprintf(page, "%s\n", array_states[st]); 4628} 4629 4630static int do_md_stop(struct mddev *mddev, int ro); 4631static int md_set_readonly(struct mddev *mddev); 4632static int restart_array(struct mddev *mddev); 4633 4634static ssize_t 4635array_state_store(struct mddev *mddev, const char *buf, size_t len) 4636{ 4637 int err = 0; 4638 enum array_state st = match_word(buf, array_states); 4639 4640 /* No lock dependent actions */ 4641 switch (st) { 4642 case suspended: /* not supported yet */ 4643 case write_pending: /* cannot be set */ 4644 case active_idle: /* cannot be set */ 4645 case broken: /* cannot be set */ 4646 case bad_word: 4647 return -EINVAL; 4648 case clear: 4649 case readonly: 4650 case inactive: 4651 case read_auto: 4652 if (!mddev->pers || !md_is_rdwr(mddev)) 4653 break; 4654 /* write sysfs will not open mddev and opener should be 0 */ 4655 err = mddev_set_closing_and_sync_blockdev(mddev, 0); 4656 if (err) 4657 return err; 4658 break; 4659 default: 4660 break; 4661 } 4662 4663 if (mddev->pers && (st == active || st == clean) && 4664 mddev->ro != MD_RDONLY) { 4665 /* don't take reconfig_mutex when toggling between 4666 * clean and active 4667 
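 * -- spin_lock(&mddev->lock) below is sufficient for that transition.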
*/ 4668 spin_lock(&mddev->lock); 4669 if (st == active) { 4670 restart_array(mddev); 4671 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 4672 md_wakeup_thread(mddev->thread); 4673 wake_up(&mddev->sb_wait); 4674 } else /* st == clean */ { 4675 restart_array(mddev); 4676 if (!set_in_sync(mddev)) 4677 err = -EBUSY; 4678 } 4679 if (!err) 4680 sysfs_notify_dirent_safe(mddev->sysfs_state); 4681 spin_unlock(&mddev->lock); 4682 return err ?: len; 4683 } 4684 err = mddev_lock(mddev); 4685 if (err) 4686 return err; 4687 4688 switch (st) { 4689 case inactive: 4690 /* stop an active array, return 0 otherwise */ 4691 if (mddev->pers) 4692 err = do_md_stop(mddev, 2); 4693 break; 4694 case clear: 4695 err = do_md_stop(mddev, 0); 4696 break; 4697 case readonly: 4698 if (mddev->pers) 4699 err = md_set_readonly(mddev); 4700 else { 4701 mddev->ro = MD_RDONLY; 4702 set_disk_ro(mddev->gendisk, 1); 4703 err = do_md_run(mddev); 4704 } 4705 break; 4706 case read_auto: 4707 if (mddev->pers) { 4708 if (md_is_rdwr(mddev)) 4709 err = md_set_readonly(mddev); 4710 else if (mddev->ro == MD_RDONLY) 4711 err = restart_array(mddev); 4712 if (err == 0) { 4713 mddev->ro = MD_AUTO_READ; 4714 set_disk_ro(mddev->gendisk, 0); 4715 } 4716 } else { 4717 mddev->ro = MD_AUTO_READ; 4718 err = do_md_run(mddev); 4719 } 4720 break; 4721 case clean: 4722 if (mddev->pers) { 4723 err = restart_array(mddev); 4724 if (err) 4725 break; 4726 spin_lock(&mddev->lock); 4727 if (!set_in_sync(mddev)) 4728 err = -EBUSY; 4729 spin_unlock(&mddev->lock); 4730 } else 4731 err = -EINVAL; 4732 break; 4733 case active: 4734 if (mddev->pers) { 4735 err = restart_array(mddev); 4736 if (err) 4737 break; 4738 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 4739 wake_up(&mddev->sb_wait); 4740 err = 0; 4741 } else { 4742 mddev->ro = MD_RDWR; 4743 set_disk_ro(mddev->gendisk, 0); 4744 err = do_md_run(mddev); 4745 } 4746 break; 4747 default: 4748 err = -EINVAL; 4749 break; 4750 } 4751 4752 if (!err) { 4753 if (mddev->hold_active == UNTIL_IOCTL) 4754 mddev->hold_active = 0; 4755 sysfs_notify_dirent_safe(mddev->sysfs_state); 4756 } 4757 mddev_unlock(mddev); 4758 4759 if (st == readonly || st == read_auto || st == inactive || 4760 (err && st == clear)) 4761 clear_bit(MD_CLOSING, &mddev->flags); 4762 4763 return err ?: len; 4764} 4765static struct md_sysfs_entry md_array_state = 4766__ATTR_PREALLOC(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); 4767 4768static ssize_t 4769max_corrected_read_errors_show(struct mddev *mddev, char *page) { 4770 return sprintf(page, "%d\n", 4771 atomic_read(&mddev->max_corr_read_errors)); 4772} 4773 4774static ssize_t 4775max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len) 4776{ 4777 unsigned int n; 4778 int rv; 4779 4780 rv = kstrtouint(buf, 10, &n); 4781 if (rv < 0) 4782 return rv; 4783 if (n > INT_MAX) 4784 return -EINVAL; 4785 atomic_set(&mddev->max_corr_read_errors, n); 4786 return len; 4787} 4788 4789static struct md_sysfs_entry max_corr_read_errors = 4790__ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show, 4791 max_corrected_read_errors_store); 4792 4793static ssize_t 4794null_show(struct mddev *mddev, char *page) 4795{ 4796 return -EINVAL; 4797} 4798 4799static ssize_t 4800new_dev_store(struct mddev *mddev, const char *buf, size_t len) 4801{ 4802 /* buf must be %d:%d\n? giving major and minor numbers */ 4803 /* The new device is added to the array. 
4804 * If the array has a persistent superblock, we read the 4805 * superblock to initialise info and check validity. 4806 * Otherwise, only checking done is that in bind_rdev_to_array, 4807 * which mainly checks size. 4808 */ 4809 char *e; 4810 int major = simple_strtoul(buf, &e, 10); 4811 int minor; 4812 dev_t dev; 4813 struct md_rdev *rdev; 4814 int err; 4815 4816 if (!*buf || *e != ':' || !e[1] || e[1] == '\n') 4817 return -EINVAL; 4818 minor = simple_strtoul(e+1, &e, 10); 4819 if (*e && *e != '\n') 4820 return -EINVAL; 4821 dev = MKDEV(major, minor); 4822 if (major != MAJOR(dev) || 4823 minor != MINOR(dev)) 4824 return -EOVERFLOW; 4825 4826 err = mddev_suspend_and_lock(mddev); 4827 if (err) 4828 return err; 4829 if (mddev->persistent) { 4830 rdev = md_import_device(dev, mddev->major_version, 4831 mddev->minor_version); 4832 if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) { 4833 struct md_rdev *rdev0 4834 = list_entry(mddev->disks.next, 4835 struct md_rdev, same_set); 4836 err = super_types[mddev->major_version] 4837 .load_super(rdev, rdev0, mddev->minor_version); 4838 if (err < 0) 4839 goto out; 4840 } 4841 } else if (mddev->external) 4842 rdev = md_import_device(dev, -2, -1); 4843 else 4844 rdev = md_import_device(dev, -1, -1); 4845 4846 if (IS_ERR(rdev)) { 4847 mddev_unlock_and_resume(mddev); 4848 return PTR_ERR(rdev); 4849 } 4850 err = bind_rdev_to_array(rdev, mddev); 4851 out: 4852 if (err) 4853 export_rdev(rdev, mddev); 4854 mddev_unlock_and_resume(mddev); 4855 if (!err) 4856 md_new_event(); 4857 return err ? err : len; 4858} 4859 4860static struct md_sysfs_entry md_new_device = 4861__ATTR(new_dev, S_IWUSR, null_show, new_dev_store); 4862 4863static ssize_t 4864bitmap_store(struct mddev *mddev, const char *buf, size_t len) 4865{ 4866 char *end; 4867 unsigned long chunk, end_chunk; 4868 int err; 4869 4870 if (!md_bitmap_enabled(mddev, false)) 4871 return len; 4872 4873 err = mddev_lock(mddev); 4874 if (err) 4875 return err; 4876 if (!mddev->bitmap) 4877 goto out; 4878 /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */ 4879 while (*buf) { 4880 chunk = end_chunk = simple_strtoul(buf, &end, 0); 4881 if (buf == end) 4882 break; 4883 4884 if (*end == '-') { /* range */ 4885 buf = end + 1; 4886 end_chunk = simple_strtoul(buf, &end, 0); 4887 if (buf == end) 4888 break; 4889 } 4890 4891 if (*end && !isspace(*end)) 4892 break; 4893 4894 mddev->bitmap_ops->dirty_bits(mddev, chunk, end_chunk); 4895 buf = skip_spaces(end); 4896 } 4897 mddev->bitmap_ops->unplug(mddev, true); /* flush the bits to disk */ 4898out: 4899 mddev_unlock(mddev); 4900 return len; 4901} 4902 4903static struct md_sysfs_entry md_bitmap = 4904__ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store); 4905 4906static ssize_t 4907size_show(struct mddev *mddev, char *page) 4908{ 4909 return sprintf(page, "%llu\n", 4910 (unsigned long long)mddev->dev_sectors / 2); 4911} 4912 4913static int update_size(struct mddev *mddev, sector_t num_sectors); 4914 4915static ssize_t 4916size_store(struct mddev *mddev, const char *buf, size_t len) 4917{ 4918 /* If array is inactive, we can reduce the component size, but 4919 * not increase it (except from 0). 
4920 * If array is active, we can try an on-line resize 4921 */ 4922 sector_t sectors; 4923 int err = strict_blocks_to_sectors(buf, &sectors); 4924 4925 if (err < 0) 4926 return err; 4927 err = mddev_lock(mddev); 4928 if (err) 4929 return err; 4930 if (mddev->pers) { 4931 err = update_size(mddev, sectors); 4932 if (err == 0) 4933 md_update_sb(mddev, 1); 4934 } else { 4935 if (mddev->dev_sectors == 0 || 4936 mddev->dev_sectors > sectors) 4937 mddev->dev_sectors = sectors; 4938 else 4939 err = -ENOSPC; 4940 } 4941 mddev_unlock(mddev); 4942 return err ? err : len; 4943} 4944 4945static struct md_sysfs_entry md_size = 4946__ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store); 4947 4948/* Metadata version. 4949 * This is one of 4950 * 'none' for arrays with no metadata (good luck...) 4951 * 'external' for arrays with externally managed metadata, 4952 * or N.M for internally known formats 4953 */ 4954static ssize_t 4955metadata_show(struct mddev *mddev, char *page) 4956{ 4957 if (mddev->persistent) 4958 return sprintf(page, "%d.%d\n", 4959 mddev->major_version, mddev->minor_version); 4960 else if (mddev->external) 4961 return sprintf(page, "external:%s\n", mddev->metadata_type); 4962 else 4963 return sprintf(page, "none\n"); 4964} 4965 4966static ssize_t 4967metadata_store(struct mddev *mddev, const char *buf, size_t len) 4968{ 4969 int major, minor; 4970 char *e; 4971 int err; 4972 /* Changing the details of 'external' metadata is 4973 * always permitted. Otherwise there must be 4974 * no devices attached to the array. 4975 */ 4976 4977 err = mddev_lock(mddev); 4978 if (err) 4979 return err; 4980 err = -EBUSY; 4981 if (mddev->external && strncmp(buf, "external:", 9) == 0) 4982 ; 4983 else if (!list_empty(&mddev->disks)) 4984 goto out_unlock; 4985 4986 err = 0; 4987 if (cmd_match(buf, "none")) { 4988 mddev->persistent = 0; 4989 mddev->external = 0; 4990 mddev->major_version = 0; 4991 mddev->minor_version = 90; 4992 goto out_unlock; 4993 } 4994 if (strncmp(buf, "external:", 9) == 0) { 4995 size_t namelen = len-9; 4996 if (namelen >= sizeof(mddev->metadata_type)) 4997 namelen = sizeof(mddev->metadata_type)-1; 4998 memcpy(mddev->metadata_type, buf+9, namelen); 4999 mddev->metadata_type[namelen] = 0; 5000 if (namelen && mddev->metadata_type[namelen-1] == '\n') 5001 mddev->metadata_type[--namelen] = 0; 5002 mddev->persistent = 0; 5003 mddev->external = 1; 5004 mddev->major_version = 0; 5005 mddev->minor_version = 90; 5006 goto out_unlock; 5007 } 5008 major = simple_strtoul(buf, &e, 10); 5009 err = -EINVAL; 5010 if (e==buf || *e != '.') 5011 goto out_unlock; 5012 buf = e+1; 5013 minor = simple_strtoul(buf, &e, 10); 5014 if (e==buf || (*e && *e != '\n') ) 5015 goto out_unlock; 5016 err = -ENOENT; 5017 if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL) 5018 goto out_unlock; 5019 mddev->major_version = major; 5020 mddev->minor_version = minor; 5021 mddev->persistent = 1; 5022 mddev->external = 0; 5023 err = 0; 5024out_unlock: 5025 mddev_unlock(mddev); 5026 return err ?: len; 5027} 5028 5029static struct md_sysfs_entry md_metadata = 5030__ATTR_PREALLOC(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store); 5031 5032static bool rdev_needs_recovery(struct md_rdev *rdev, sector_t sectors) 5033{ 5034 return rdev->raid_disk >= 0 && 5035 !test_bit(Journal, &rdev->flags) && 5036 !test_bit(Faulty, &rdev->flags) && 5037 !test_bit(In_sync, &rdev->flags) && 5038 rdev->recovery_offset < sectors; 5039} 5040 5041static enum sync_action md_get_active_sync_action(struct mddev 
*mddev) 5042{ 5043 struct md_rdev *rdev; 5044 bool is_recover = false; 5045 5046 if (mddev->resync_offset < MaxSector) 5047 return ACTION_RESYNC; 5048 5049 if (mddev->reshape_position != MaxSector) 5050 return ACTION_RESHAPE; 5051 5052 rcu_read_lock(); 5053 rdev_for_each_rcu(rdev, mddev) { 5054 if (rdev_needs_recovery(rdev, MaxSector)) { 5055 is_recover = true; 5056 break; 5057 } 5058 } 5059 rcu_read_unlock(); 5060 5061 return is_recover ? ACTION_RECOVER : ACTION_IDLE; 5062} 5063 5064enum sync_action md_sync_action(struct mddev *mddev) 5065{ 5066 unsigned long recovery = mddev->recovery; 5067 enum sync_action active_action; 5068 5069 /* 5070 * frozen has the highest priority, means running sync_thread will be 5071 * stopped immediately, and no new sync_thread can start. 5072 */ 5073 if (test_bit(MD_RECOVERY_FROZEN, &recovery)) 5074 return ACTION_FROZEN; 5075 5076 /* 5077 * read-only array can't register sync_thread, and it can only 5078 * add/remove spares. 5079 */ 5080 if (!md_is_rdwr(mddev)) 5081 return ACTION_IDLE; 5082 5083 /* 5084 * idle means no sync_thread is running, and no new sync_thread is 5085 * requested. 5086 */ 5087 if (!test_bit(MD_RECOVERY_RUNNING, &recovery) && 5088 !test_bit(MD_RECOVERY_NEEDED, &recovery)) 5089 return ACTION_IDLE; 5090 5091 /* 5092 * Check if any sync operation (resync/recover/reshape) is 5093 * currently active. This ensures that only one sync operation 5094 * can run at a time. Returns the type of active operation, or 5095 * ACTION_IDLE if none are active. 5096 */ 5097 active_action = md_get_active_sync_action(mddev); 5098 if (active_action != ACTION_IDLE) 5099 return active_action; 5100 5101 if (test_bit(MD_RECOVERY_RESHAPE, &recovery)) 5102 return ACTION_RESHAPE; 5103 5104 if (test_bit(MD_RECOVERY_RECOVER, &recovery)) 5105 return ACTION_RECOVER; 5106 5107 if (test_bit(MD_RECOVERY_SYNC, &recovery)) { 5108 /* 5109 * MD_RECOVERY_CHECK must be paired with 5110 * MD_RECOVERY_REQUESTED. 5111 */ 5112 if (test_bit(MD_RECOVERY_CHECK, &recovery)) 5113 return ACTION_CHECK; 5114 if (test_bit(MD_RECOVERY_REQUESTED, &recovery)) 5115 return ACTION_REPAIR; 5116 return ACTION_RESYNC; 5117 } 5118 5119 /* 5120 * MD_RECOVERY_NEEDED or MD_RECOVERY_RUNNING is set, however, no 5121 * sync_action is specified. 5122 */ 5123 return ACTION_IDLE; 5124} 5125 5126enum sync_action md_sync_action_by_name(const char *page) 5127{ 5128 enum sync_action action; 5129 5130 for (action = 0; action < NR_SYNC_ACTIONS; ++action) { 5131 if (cmd_match(page, action_name[action])) 5132 return action; 5133 } 5134 5135 return NR_SYNC_ACTIONS; 5136} 5137 5138const char *md_sync_action_name(enum sync_action action) 5139{ 5140 return action_name[action]; 5141} 5142 5143static ssize_t 5144action_show(struct mddev *mddev, char *page) 5145{ 5146 enum sync_action action = md_sync_action(mddev); 5147 5148 return sprintf(page, "%s\n", md_sync_action_name(action)); 5149} 5150 5151/** 5152 * stop_sync_thread() - wait for sync_thread to stop if it's running. 5153 * @mddev: the array. 5154 * @locked: if set, reconfig_mutex will still be held after this function 5155 * return; if not set, reconfig_mutex will be released after this 5156 * function return. 
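 * Context: the caller must hold reconfig_mutex on entry; it is dropped
 * internally while waiting and re-acquired before return only if @locked
 * is set.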
5157 */ 5158static void stop_sync_thread(struct mddev *mddev, bool locked) 5159{ 5160 int sync_seq = atomic_read(&mddev->sync_seq); 5161 5162 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 5163 if (!locked) 5164 mddev_unlock(mddev); 5165 return; 5166 } 5167 5168 mddev_unlock(mddev); 5169 5170 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 5171 /* 5172 * Thread might be blocked waiting for metadata update which will now 5173 * never happen 5174 */ 5175 md_wakeup_thread_directly(&mddev->sync_thread); 5176 if (work_pending(&mddev->sync_work)) 5177 flush_work(&mddev->sync_work); 5178 5179 wait_event(resync_wait, 5180 !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 5181 (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery) && 5182 sync_seq != atomic_read(&mddev->sync_seq))); 5183 5184 if (locked) 5185 mddev_lock_nointr(mddev); 5186} 5187 5188void md_idle_sync_thread(struct mddev *mddev) 5189{ 5190 lockdep_assert_held(&mddev->reconfig_mutex); 5191 5192 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5193 stop_sync_thread(mddev, true); 5194} 5195EXPORT_SYMBOL_GPL(md_idle_sync_thread); 5196 5197void md_frozen_sync_thread(struct mddev *mddev) 5198{ 5199 lockdep_assert_held(&mddev->reconfig_mutex); 5200 5201 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5202 stop_sync_thread(mddev, true); 5203} 5204EXPORT_SYMBOL_GPL(md_frozen_sync_thread); 5205 5206void md_unfrozen_sync_thread(struct mddev *mddev) 5207{ 5208 lockdep_assert_held(&mddev->reconfig_mutex); 5209 5210 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5211 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5212 md_wakeup_thread(mddev->thread); 5213 sysfs_notify_dirent_safe(mddev->sysfs_action); 5214} 5215EXPORT_SYMBOL_GPL(md_unfrozen_sync_thread); 5216 5217static int mddev_start_reshape(struct mddev *mddev) 5218{ 5219 int ret; 5220 5221 if (mddev->pers->start_reshape == NULL) 5222 return -EINVAL; 5223 5224 if (mddev->reshape_position == MaxSector || 5225 mddev->pers->check_reshape == NULL || 5226 mddev->pers->check_reshape(mddev)) { 5227 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5228 ret = mddev->pers->start_reshape(mddev); 5229 if (ret) 5230 return ret; 5231 } else { 5232 /* 5233 * If reshape is still in progress, and md_check_recovery() can 5234 * continue to reshape, don't restart reshape because data can 5235 * be corrupted for raid456. 5236 */ 5237 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5238 } 5239 5240 sysfs_notify_dirent_safe(mddev->sysfs_degraded); 5241 return 0; 5242} 5243 5244static ssize_t 5245action_store(struct mddev *mddev, const char *page, size_t len) 5246{ 5247 int ret; 5248 enum sync_action action; 5249 5250 if (!mddev->pers || !mddev->pers->sync_request) 5251 return -EINVAL; 5252 5253retry: 5254 if (work_busy(&mddev->sync_work)) 5255 flush_work(&mddev->sync_work); 5256 5257 ret = mddev_lock(mddev); 5258 if (ret) 5259 return ret; 5260 5261 if (work_busy(&mddev->sync_work)) { 5262 mddev_unlock(mddev); 5263 goto retry; 5264 } 5265 5266 action = md_sync_action_by_name(page); 5267 5268 /* TODO: mdadm rely on "idle" to start sync_thread. 
*/ 5269 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 5270 switch (action) { 5271 case ACTION_FROZEN: 5272 md_frozen_sync_thread(mddev); 5273 ret = len; 5274 goto out; 5275 case ACTION_IDLE: 5276 md_idle_sync_thread(mddev); 5277 break; 5278 case ACTION_RESHAPE: 5279 case ACTION_RECOVER: 5280 case ACTION_CHECK: 5281 case ACTION_REPAIR: 5282 case ACTION_RESYNC: 5283 ret = -EBUSY; 5284 goto out; 5285 default: 5286 ret = -EINVAL; 5287 goto out; 5288 } 5289 } else { 5290 switch (action) { 5291 case ACTION_FROZEN: 5292 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5293 ret = len; 5294 goto out; 5295 case ACTION_RESHAPE: 5296 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5297 ret = mddev_start_reshape(mddev); 5298 if (ret) 5299 goto out; 5300 break; 5301 case ACTION_RECOVER: 5302 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5303 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 5304 break; 5305 case ACTION_CHECK: 5306 set_bit(MD_RECOVERY_CHECK, &mddev->recovery); 5307 fallthrough; 5308 case ACTION_REPAIR: 5309 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 5310 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 5311 fallthrough; 5312 case ACTION_RESYNC: 5313 case ACTION_IDLE: 5314 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5315 break; 5316 default: 5317 ret = -EINVAL; 5318 goto out; 5319 } 5320 } 5321 5322 if (mddev->ro == MD_AUTO_READ) { 5323 /* A write to sync_action is enough to justify 5324 * canceling read-auto mode 5325 */ 5326 mddev->ro = MD_RDWR; 5327 md_wakeup_thread(mddev->sync_thread); 5328 } 5329 5330 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5331 md_wakeup_thread(mddev->thread); 5332 sysfs_notify_dirent_safe(mddev->sysfs_action); 5333 ret = len; 5334 5335out: 5336 mddev_unlock(mddev); 5337 return ret; 5338} 5339 5340static struct md_sysfs_entry md_scan_mode = 5341__ATTR_PREALLOC(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); 5342 5343static ssize_t 5344last_sync_action_show(struct mddev *mddev, char *page) 5345{ 5346 return sprintf(page, "%s\n", 5347 md_sync_action_name(mddev->last_sync_action)); 5348} 5349 5350static struct md_sysfs_entry md_last_scan_mode = __ATTR_RO(last_sync_action); 5351 5352static ssize_t 5353mismatch_cnt_show(struct mddev *mddev, char *page) 5354{ 5355 return sprintf(page, "%llu\n", 5356 (unsigned long long) 5357 atomic64_read(&mddev->resync_mismatches)); 5358} 5359 5360static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt); 5361 5362static ssize_t 5363sync_min_show(struct mddev *mddev, char *page) 5364{ 5365 return sprintf(page, "%d (%s)\n", speed_min(mddev), 5366 mddev->sync_speed_min ? "local" : "system"); 5367} 5368 5369static ssize_t 5370sync_min_store(struct mddev *mddev, const char *buf, size_t len) 5371{ 5372 unsigned int min; 5373 int rv; 5374 5375 if (strncmp(buf, "system", 6) == 0) { 5376 min = 0; 5377 } else { 5378 rv = kstrtouint(buf, 10, &min); 5379 if (rv < 0) 5380 return rv; 5381 if (min == 0) 5382 return -EINVAL; 5383 } 5384 mddev->sync_speed_min = min; 5385 return len; 5386} 5387 5388static struct md_sysfs_entry md_sync_min = 5389__ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store); 5390 5391static ssize_t 5392sync_max_show(struct mddev *mddev, char *page) 5393{ 5394 return sprintf(page, "%d (%s)\n", speed_max(mddev), 5395 mddev->sync_speed_max ? 
"local" : "system"); 5396} 5397 5398static ssize_t 5399sync_max_store(struct mddev *mddev, const char *buf, size_t len) 5400{ 5401 unsigned int max; 5402 int rv; 5403 5404 if (strncmp(buf, "system", 6) == 0) { 5405 max = 0; 5406 } else { 5407 rv = kstrtouint(buf, 10, &max); 5408 if (rv < 0) 5409 return rv; 5410 if (max == 0) 5411 return -EINVAL; 5412 } 5413 mddev->sync_speed_max = max; 5414 return len; 5415} 5416 5417static struct md_sysfs_entry md_sync_max = 5418__ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store); 5419 5420static ssize_t 5421sync_io_depth_show(struct mddev *mddev, char *page) 5422{ 5423 return sprintf(page, "%d (%s)\n", sync_io_depth(mddev), 5424 mddev->sync_io_depth ? "local" : "system"); 5425} 5426 5427static ssize_t 5428sync_io_depth_store(struct mddev *mddev, const char *buf, size_t len) 5429{ 5430 unsigned int max; 5431 int rv; 5432 5433 if (strncmp(buf, "system", 6) == 0) { 5434 max = 0; 5435 } else { 5436 rv = kstrtouint(buf, 10, &max); 5437 if (rv < 0) 5438 return rv; 5439 if (max == 0) 5440 return -EINVAL; 5441 } 5442 mddev->sync_io_depth = max; 5443 return len; 5444} 5445 5446static struct md_sysfs_entry md_sync_io_depth = 5447__ATTR_RW(sync_io_depth); 5448 5449static ssize_t 5450degraded_show(struct mddev *mddev, char *page) 5451{ 5452 return sprintf(page, "%d\n", mddev->degraded); 5453} 5454static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded); 5455 5456static ssize_t 5457sync_force_parallel_show(struct mddev *mddev, char *page) 5458{ 5459 return sprintf(page, "%d\n", mddev->parallel_resync); 5460} 5461 5462static ssize_t 5463sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len) 5464{ 5465 long n; 5466 5467 if (kstrtol(buf, 10, &n)) 5468 return -EINVAL; 5469 5470 if (n != 0 && n != 1) 5471 return -EINVAL; 5472 5473 mddev->parallel_resync = n; 5474 5475 if (mddev->sync_thread) 5476 wake_up(&resync_wait); 5477 5478 return len; 5479} 5480 5481/* force parallel resync, even with shared block devices */ 5482static struct md_sysfs_entry md_sync_force_parallel = 5483__ATTR(sync_force_parallel, S_IRUGO|S_IWUSR, 5484 sync_force_parallel_show, sync_force_parallel_store); 5485 5486static ssize_t 5487sync_speed_show(struct mddev *mddev, char *page) 5488{ 5489 unsigned long resync, dt, db; 5490 if (mddev->curr_resync == MD_RESYNC_NONE) 5491 return sprintf(page, "none\n"); 5492 resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active); 5493 dt = (jiffies - mddev->resync_mark) / HZ; 5494 if (!dt) dt++; 5495 db = resync - mddev->resync_mark_cnt; 5496 return sprintf(page, "%lu\n", db/dt/2); /* K/sec */ 5497} 5498 5499static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed); 5500 5501static ssize_t 5502sync_completed_show(struct mddev *mddev, char *page) 5503{ 5504 unsigned long long max_sectors, resync; 5505 5506 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5507 return sprintf(page, "none\n"); 5508 5509 if (mddev->curr_resync == MD_RESYNC_YIELDED || 5510 mddev->curr_resync == MD_RESYNC_DELAYED) 5511 return sprintf(page, "delayed\n"); 5512 5513 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 5514 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 5515 max_sectors = mddev->resync_max_sectors; 5516 else 5517 max_sectors = mddev->dev_sectors; 5518 5519 resync = mddev->curr_resync_completed; 5520 return sprintf(page, "%llu / %llu\n", resync, max_sectors); 5521} 5522 5523static struct md_sysfs_entry md_sync_completed = 5524 __ATTR_PREALLOC(sync_completed, S_IRUGO, sync_completed_show, NULL); 5525 
5526static ssize_t 5527min_sync_show(struct mddev *mddev, char *page) 5528{ 5529 return sprintf(page, "%llu\n", 5530 (unsigned long long)mddev->resync_min); 5531} 5532static ssize_t 5533min_sync_store(struct mddev *mddev, const char *buf, size_t len) 5534{ 5535 unsigned long long min; 5536 int err; 5537 5538 if (kstrtoull(buf, 10, &min)) 5539 return -EINVAL; 5540 5541 spin_lock(&mddev->lock); 5542 err = -EINVAL; 5543 if (min > mddev->resync_max) 5544 goto out_unlock; 5545 5546 err = -EBUSY; 5547 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5548 goto out_unlock; 5549 5550 /* Round down to multiple of 4K for safety */ 5551 mddev->resync_min = round_down(min, 8); 5552 err = 0; 5553 5554out_unlock: 5555 spin_unlock(&mddev->lock); 5556 return err ?: len; 5557} 5558 5559static struct md_sysfs_entry md_min_sync = 5560__ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store); 5561 5562static ssize_t 5563max_sync_show(struct mddev *mddev, char *page) 5564{ 5565 if (mddev->resync_max == MaxSector) 5566 return sprintf(page, "max\n"); 5567 else 5568 return sprintf(page, "%llu\n", 5569 (unsigned long long)mddev->resync_max); 5570} 5571static ssize_t 5572max_sync_store(struct mddev *mddev, const char *buf, size_t len) 5573{ 5574 int err; 5575 spin_lock(&mddev->lock); 5576 if (strncmp(buf, "max", 3) == 0) 5577 mddev->resync_max = MaxSector; 5578 else { 5579 unsigned long long max; 5580 int chunk; 5581 5582 err = -EINVAL; 5583 if (kstrtoull(buf, 10, &max)) 5584 goto out_unlock; 5585 if (max < mddev->resync_min) 5586 goto out_unlock; 5587 5588 err = -EBUSY; 5589 if (max < mddev->resync_max && md_is_rdwr(mddev) && 5590 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5591 goto out_unlock; 5592 5593 /* Must be a multiple of chunk_size */ 5594 chunk = mddev->chunk_sectors; 5595 if (chunk) { 5596 sector_t temp = max; 5597 5598 err = -EINVAL; 5599 if (sector_div(temp, chunk)) 5600 goto out_unlock; 5601 } 5602 mddev->resync_max = max; 5603 } 5604 wake_up(&mddev->recovery_wait); 5605 err = 0; 5606out_unlock: 5607 spin_unlock(&mddev->lock); 5608 return err ?: len; 5609} 5610 5611static struct md_sysfs_entry md_max_sync = 5612__ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store); 5613 5614static ssize_t 5615suspend_lo_show(struct mddev *mddev, char *page) 5616{ 5617 return sprintf(page, "%llu\n", 5618 (unsigned long long)READ_ONCE(mddev->suspend_lo)); 5619} 5620 5621static ssize_t 5622suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) 5623{ 5624 unsigned long long new; 5625 int err; 5626 5627 err = kstrtoull(buf, 10, &new); 5628 if (err < 0) 5629 return err; 5630 if (new != (sector_t)new) 5631 return -EINVAL; 5632 5633 err = mddev_suspend(mddev, true); 5634 if (err) 5635 return err; 5636 5637 WRITE_ONCE(mddev->suspend_lo, new); 5638 mddev_resume(mddev); 5639 5640 return len; 5641} 5642static struct md_sysfs_entry md_suspend_lo = 5643__ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); 5644 5645static ssize_t 5646suspend_hi_show(struct mddev *mddev, char *page) 5647{ 5648 return sprintf(page, "%llu\n", 5649 (unsigned long long)READ_ONCE(mddev->suspend_hi)); 5650} 5651 5652static ssize_t 5653suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) 5654{ 5655 unsigned long long new; 5656 int err; 5657 5658 err = kstrtoull(buf, 10, &new); 5659 if (err < 0) 5660 return err; 5661 if (new != (sector_t)new) 5662 return -EINVAL; 5663 5664 err = mddev_suspend(mddev, true); 5665 if (err) 5666 return err; 5667 5668 WRITE_ONCE(mddev->suspend_hi, new); 5669 
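	/*
	 * The array is suspended here, so no request is in flight while the
	 * new value is published; readers pair with the READ_ONCE() in
	 * suspend_hi_show().
	 */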
mddev_resume(mddev); 5670 5671 return len; 5672} 5673static struct md_sysfs_entry md_suspend_hi = 5674__ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); 5675 5676static ssize_t 5677reshape_position_show(struct mddev *mddev, char *page) 5678{ 5679 if (mddev->reshape_position != MaxSector) 5680 return sprintf(page, "%llu\n", 5681 (unsigned long long)mddev->reshape_position); 5682 strcpy(page, "none\n"); 5683 return 5; 5684} 5685 5686static ssize_t 5687reshape_position_store(struct mddev *mddev, const char *buf, size_t len) 5688{ 5689 struct md_rdev *rdev; 5690 unsigned long long new; 5691 int err; 5692 5693 err = kstrtoull(buf, 10, &new); 5694 if (err < 0) 5695 return err; 5696 if (new != (sector_t)new) 5697 return -EINVAL; 5698 err = mddev_lock(mddev); 5699 if (err) 5700 return err; 5701 err = -EBUSY; 5702 if (mddev->pers) 5703 goto unlock; 5704 mddev->reshape_position = new; 5705 mddev->delta_disks = 0; 5706 mddev->reshape_backwards = 0; 5707 mddev->new_level = mddev->level; 5708 mddev->new_layout = mddev->layout; 5709 mddev->new_chunk_sectors = mddev->chunk_sectors; 5710 rdev_for_each(rdev, mddev) 5711 rdev->new_data_offset = rdev->data_offset; 5712 err = 0; 5713unlock: 5714 mddev_unlock(mddev); 5715 return err ?: len; 5716} 5717 5718static struct md_sysfs_entry md_reshape_position = 5719__ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show, 5720 reshape_position_store); 5721 5722static ssize_t 5723reshape_direction_show(struct mddev *mddev, char *page) 5724{ 5725 return sprintf(page, "%s\n", 5726 mddev->reshape_backwards ? "backwards" : "forwards"); 5727} 5728 5729static ssize_t 5730reshape_direction_store(struct mddev *mddev, const char *buf, size_t len) 5731{ 5732 int backwards = 0; 5733 int err; 5734 5735 if (cmd_match(buf, "forwards")) 5736 backwards = 0; 5737 else if (cmd_match(buf, "backwards")) 5738 backwards = 1; 5739 else 5740 return -EINVAL; 5741 if (mddev->reshape_backwards == backwards) 5742 return len; 5743 5744 err = mddev_lock(mddev); 5745 if (err) 5746 return err; 5747 /* check if we are allowed to change */ 5748 if (mddev->delta_disks) 5749 err = -EBUSY; 5750 else if (mddev->persistent && 5751 mddev->major_version == 0) 5752 err = -EINVAL; 5753 else 5754 mddev->reshape_backwards = backwards; 5755 mddev_unlock(mddev); 5756 return err ?: len; 5757} 5758 5759static struct md_sysfs_entry md_reshape_direction = 5760__ATTR(reshape_direction, S_IRUGO|S_IWUSR, reshape_direction_show, 5761 reshape_direction_store); 5762 5763static ssize_t 5764array_size_show(struct mddev *mddev, char *page) 5765{ 5766 if (mddev->external_size) 5767 return sprintf(page, "%llu\n", 5768 (unsigned long long)mddev->array_sectors/2); 5769 else 5770 return sprintf(page, "default\n"); 5771} 5772 5773static ssize_t 5774array_size_store(struct mddev *mddev, const char *buf, size_t len) 5775{ 5776 sector_t sectors; 5777 int err; 5778 5779 err = mddev_lock(mddev); 5780 if (err) 5781 return err; 5782 5783 /* cluster raid doesn't support change array_sectors */ 5784 if (mddev_is_clustered(mddev)) { 5785 mddev_unlock(mddev); 5786 return -EINVAL; 5787 } 5788 5789 if (strncmp(buf, "default", 7) == 0) { 5790 if (mddev->pers) 5791 sectors = mddev->pers->size(mddev, 0, 0); 5792 else 5793 sectors = mddev->array_sectors; 5794 5795 mddev->external_size = 0; 5796 } else { 5797 if (strict_blocks_to_sectors(buf, &sectors) < 0) 5798 err = -EINVAL; 5799 else if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors) 5800 err = -E2BIG; 5801 else 5802 mddev->external_size = 1; 5803 } 5804 5805 
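	/*
	 * 'sectors' now holds the requested size in 512-byte sectors
	 * ("default" restores the personality's own calculation); err is set
	 * if the value could not be parsed or exceeds what the personality
	 * can provide.
	 */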
if (!err) { 5806 mddev->array_sectors = sectors; 5807 if (mddev->pers) 5808 set_capacity_and_notify(mddev->gendisk, 5809 mddev->array_sectors); 5810 } 5811 mddev_unlock(mddev); 5812 return err ?: len; 5813} 5814 5815static struct md_sysfs_entry md_array_size = 5816__ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show, 5817 array_size_store); 5818 5819static ssize_t 5820consistency_policy_show(struct mddev *mddev, char *page) 5821{ 5822 int ret; 5823 5824 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { 5825 ret = sprintf(page, "journal\n"); 5826 } else if (test_bit(MD_HAS_PPL, &mddev->flags)) { 5827 ret = sprintf(page, "ppl\n"); 5828 } else if (mddev->bitmap) { 5829 ret = sprintf(page, "bitmap\n"); 5830 } else if (mddev->pers) { 5831 if (mddev->pers->sync_request) 5832 ret = sprintf(page, "resync\n"); 5833 else 5834 ret = sprintf(page, "none\n"); 5835 } else { 5836 ret = sprintf(page, "unknown\n"); 5837 } 5838 5839 return ret; 5840} 5841 5842static ssize_t 5843consistency_policy_store(struct mddev *mddev, const char *buf, size_t len) 5844{ 5845 int err = 0; 5846 5847 if (mddev->pers) { 5848 if (mddev->pers->change_consistency_policy) 5849 err = mddev->pers->change_consistency_policy(mddev, buf); 5850 else 5851 err = -EBUSY; 5852 } else if (mddev->external && strncmp(buf, "ppl", 3) == 0) { 5853 set_bit(MD_HAS_PPL, &mddev->flags); 5854 } else { 5855 err = -EINVAL; 5856 } 5857 5858 return err ? err : len; 5859} 5860 5861static struct md_sysfs_entry md_consistency_policy = 5862__ATTR(consistency_policy, S_IRUGO | S_IWUSR, consistency_policy_show, 5863 consistency_policy_store); 5864 5865static ssize_t fail_last_dev_show(struct mddev *mddev, char *page) 5866{ 5867 return sprintf(page, "%d\n", mddev->fail_last_dev); 5868} 5869 5870/* 5871 * Setting fail_last_dev to true to allow last device to be forcibly removed 5872 * from RAID1/RAID10. 5873 */ 5874static ssize_t 5875fail_last_dev_store(struct mddev *mddev, const char *buf, size_t len) 5876{ 5877 int ret; 5878 bool value; 5879 5880 ret = kstrtobool(buf, &value); 5881 if (ret) 5882 return ret; 5883 5884 if (value != mddev->fail_last_dev) 5885 mddev->fail_last_dev = value; 5886 5887 return len; 5888} 5889static struct md_sysfs_entry md_fail_last_dev = 5890__ATTR(fail_last_dev, S_IRUGO | S_IWUSR, fail_last_dev_show, 5891 fail_last_dev_store); 5892 5893static ssize_t serialize_policy_show(struct mddev *mddev, char *page) 5894{ 5895 if (mddev->pers == NULL || (mddev->pers->head.id != ID_RAID1)) 5896 return sprintf(page, "n/a\n"); 5897 else 5898 return sprintf(page, "%d\n", mddev->serialize_policy); 5899} 5900 5901/* 5902 * Setting serialize_policy to true to enforce write IO is not reordered 5903 * for raid1. 
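 * Enabling it sets up the per-rdev serial pool via mddev_create_serial_pool()
 * below; disabling tears the pool down again.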
5904 */ 5905static ssize_t 5906serialize_policy_store(struct mddev *mddev, const char *buf, size_t len) 5907{ 5908 int err; 5909 bool value; 5910 5911 err = kstrtobool(buf, &value); 5912 if (err) 5913 return err; 5914 5915 if (value == mddev->serialize_policy) 5916 return len; 5917 5918 err = mddev_suspend_and_lock(mddev); 5919 if (err) 5920 return err; 5921 if (mddev->pers == NULL || (mddev->pers->head.id != ID_RAID1)) { 5922 pr_err("md: serialize_policy is only effective for raid1\n"); 5923 err = -EINVAL; 5924 goto unlock; 5925 } 5926 5927 if (value) 5928 mddev_create_serial_pool(mddev, NULL); 5929 else 5930 mddev_destroy_serial_pool(mddev, NULL); 5931 mddev->serialize_policy = value; 5932unlock: 5933 mddev_unlock_and_resume(mddev); 5934 return err ?: len; 5935} 5936 5937static struct md_sysfs_entry md_serialize_policy = 5938__ATTR(serialize_policy, S_IRUGO | S_IWUSR, serialize_policy_show, 5939 serialize_policy_store); 5940 5941static int mddev_set_logical_block_size(struct mddev *mddev, 5942 unsigned int lbs) 5943{ 5944 int err = 0; 5945 struct queue_limits lim; 5946 5947 if (queue_logical_block_size(mddev->gendisk->queue) >= lbs) { 5948 pr_err("%s: Cannot set LBS smaller than mddev LBS %u\n", 5949 mdname(mddev), lbs); 5950 return -EINVAL; 5951 } 5952 5953 lim = queue_limits_start_update(mddev->gendisk->queue); 5954 lim.logical_block_size = lbs; 5955 pr_info("%s: logical_block_size is changed, data may be lost\n", 5956 mdname(mddev)); 5957 err = queue_limits_commit_update(mddev->gendisk->queue, &lim); 5958 if (err) 5959 return err; 5960 5961 mddev->logical_block_size = lbs; 5962 /* New lbs will be written to superblock after array is running */ 5963 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 5964 return 0; 5965} 5966 5967static ssize_t 5968lbs_show(struct mddev *mddev, char *page) 5969{ 5970 return sprintf(page, "%u\n", mddev->logical_block_size); 5971} 5972 5973static ssize_t 5974lbs_store(struct mddev *mddev, const char *buf, size_t len) 5975{ 5976 unsigned int lbs; 5977 int err = -EBUSY; 5978 5979 /* Only 1.x meta supports configurable LBS */ 5980 if (mddev->major_version == 0) 5981 return -EINVAL; 5982 5983 err = kstrtouint(buf, 10, &lbs); 5984 if (err < 0) 5985 return -EINVAL; 5986 5987 if (mddev->pers) { 5988 unsigned int curr_lbs; 5989 5990 if (mddev->logical_block_size) 5991 return -EBUSY; 5992 /* 5993 * To fix forward compatibility issues, LBS is not 5994 * configured for arrays from old kernels (<=6.18) by default. 5995 * If the user confirms no rollback to old kernels, 5996 * enable LBS by writing current LBS — to prevent data 5997 * loss from LBS changes. 
5998 */ 5999 curr_lbs = queue_logical_block_size(mddev->gendisk->queue); 6000 if (lbs != curr_lbs) 6001 return -EINVAL; 6002 6003 mddev->logical_block_size = curr_lbs; 6004 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 6005 pr_info("%s: logical block size configured successfully, array will not be assembled in old kernels (<= 6.18)\n", 6006 mdname(mddev)); 6007 return len; 6008 } 6009 6010 err = mddev_lock(mddev); 6011 if (err) 6012 goto unlock; 6013 6014 err = mddev_set_logical_block_size(mddev, lbs); 6015 6016unlock: 6017 mddev_unlock(mddev); 6018 return err ?: len; 6019} 6020 6021static struct md_sysfs_entry md_logical_block_size = 6022__ATTR(logical_block_size, 0644, lbs_show, lbs_store); 6023 6024static struct attribute *md_default_attrs[] = { 6025 &md_level.attr, 6026 &md_new_level.attr, 6027 &md_bitmap_type.attr, 6028 &md_layout.attr, 6029 &md_raid_disks.attr, 6030 &md_uuid.attr, 6031 &md_chunk_size.attr, 6032 &md_size.attr, 6033 &md_resync_start.attr, 6034 &md_metadata.attr, 6035 &md_new_device.attr, 6036 &md_safe_delay.attr, 6037 &md_array_state.attr, 6038 &md_reshape_position.attr, 6039 &md_reshape_direction.attr, 6040 &md_array_size.attr, 6041 &max_corr_read_errors.attr, 6042 &md_consistency_policy.attr, 6043 &md_fail_last_dev.attr, 6044 &md_serialize_policy.attr, 6045 &md_logical_block_size.attr, 6046 NULL, 6047}; 6048 6049static const struct attribute_group md_default_group = { 6050 .attrs = md_default_attrs, 6051}; 6052 6053static struct attribute *md_redundancy_attrs[] = { 6054 &md_scan_mode.attr, 6055 &md_last_scan_mode.attr, 6056 &md_mismatches.attr, 6057 &md_sync_min.attr, 6058 &md_sync_max.attr, 6059 &md_sync_io_depth.attr, 6060 &md_sync_speed.attr, 6061 &md_sync_force_parallel.attr, 6062 &md_sync_completed.attr, 6063 &md_min_sync.attr, 6064 &md_max_sync.attr, 6065 &md_suspend_lo.attr, 6066 &md_suspend_hi.attr, 6067 &md_bitmap.attr, 6068 &md_degraded.attr, 6069 NULL, 6070}; 6071static const struct attribute_group md_redundancy_group = { 6072 .name = NULL, 6073 .attrs = md_redundancy_attrs, 6074}; 6075 6076static const struct attribute_group *md_attr_groups[] = { 6077 &md_default_group, 6078 NULL, 6079}; 6080 6081static ssize_t 6082md_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 6083{ 6084 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 6085 struct mddev *mddev = container_of(kobj, struct mddev, kobj); 6086 ssize_t rv; 6087 6088 if (!entry->show) 6089 return -EIO; 6090 spin_lock(&all_mddevs_lock); 6091 if (!mddev_get(mddev)) { 6092 spin_unlock(&all_mddevs_lock); 6093 return -EBUSY; 6094 } 6095 spin_unlock(&all_mddevs_lock); 6096 6097 rv = entry->show(mddev, page); 6098 mddev_put(mddev); 6099 return rv; 6100} 6101 6102static ssize_t 6103md_attr_store(struct kobject *kobj, struct attribute *attr, 6104 const char *page, size_t length) 6105{ 6106 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 6107 struct mddev *mddev = container_of(kobj, struct mddev, kobj); 6108 ssize_t rv; 6109 struct kernfs_node *kn = NULL; 6110 6111 if (!entry->store) 6112 return -EIO; 6113 if (!capable(CAP_SYS_ADMIN)) 6114 return -EACCES; 6115 6116 if (entry->store == array_state_store && cmd_match(page, "clear")) 6117 kn = sysfs_break_active_protection(kobj, attr); 6118 6119 spin_lock(&all_mddevs_lock); 6120 if (!mddev_get(mddev)) { 6121 spin_unlock(&all_mddevs_lock); 6122 if (kn) 6123 sysfs_unbreak_active_protection(kn); 6124 return -EBUSY; 6125 } 6126 spin_unlock(&all_mddevs_lock); 6127 rv = entry->store(mddev, 
page, length); 6128 mddev_put(mddev); 6129 6130 if (kn) 6131 sysfs_unbreak_active_protection(kn); 6132 6133 return rv; 6134} 6135 6136static void md_kobj_release(struct kobject *ko) 6137{ 6138 struct mddev *mddev = container_of(ko, struct mddev, kobj); 6139 6140 if (legacy_async_del_gendisk) { 6141 if (mddev->sysfs_state) 6142 sysfs_put(mddev->sysfs_state); 6143 if (mddev->sysfs_level) 6144 sysfs_put(mddev->sysfs_level); 6145 del_gendisk(mddev->gendisk); 6146 } 6147 put_disk(mddev->gendisk); 6148} 6149 6150static const struct sysfs_ops md_sysfs_ops = { 6151 .show = md_attr_show, 6152 .store = md_attr_store, 6153}; 6154static const struct kobj_type md_ktype = { 6155 .release = md_kobj_release, 6156 .sysfs_ops = &md_sysfs_ops, 6157 .default_groups = md_attr_groups, 6158}; 6159 6160int mdp_major = 0; 6161 6162/* stack the limit for all rdevs into lim */ 6163int mddev_stack_rdev_limits(struct mddev *mddev, struct queue_limits *lim, 6164 unsigned int flags) 6165{ 6166 struct md_rdev *rdev; 6167 6168 rdev_for_each(rdev, mddev) { 6169 queue_limits_stack_bdev(lim, rdev->bdev, rdev->data_offset, 6170 mddev->gendisk->disk_name); 6171 if ((flags & MDDEV_STACK_INTEGRITY) && 6172 !queue_limits_stack_integrity_bdev(lim, rdev->bdev)) 6173 return -EINVAL; 6174 } 6175 6176 /* 6177 * Before RAID adding folio support, the logical_block_size 6178 * should be smaller than the page size. 6179 */ 6180 if (lim->logical_block_size > PAGE_SIZE) { 6181 pr_err("%s: logical_block_size must not larger than PAGE_SIZE\n", 6182 mdname(mddev)); 6183 return -EINVAL; 6184 } 6185 6186 /* Only 1.x meta needs to set logical block size */ 6187 if (mddev->major_version == 0) 6188 return 0; 6189 6190 /* 6191 * Fix forward compatibility issue. Only set LBS by default for 6192 * new arrays, mddev->events == 0 indicates the array was just 6193 * created. When assembling an array, read LBS from the superblock 6194 * instead — LBS is 0 in superblocks created by old kernels. 
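 * An assembled array whose superblock still carries LBS == 0 therefore
 * keeps mddev->logical_block_size at 0 and only triggers the pr_warn()
 * below; the admin can opt in later through md/logical_block_size
 * (see lbs_store()).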
6195 */ 6196 if (!mddev->events) { 6197 pr_info("%s: array will not be assembled in old kernels that lack configurable LBS support (<= 6.18)\n", 6198 mdname(mddev)); 6199 mddev->logical_block_size = lim->logical_block_size; 6200 } 6201 6202 if (!mddev->logical_block_size) 6203 pr_warn("%s: echo current LBS to md/logical_block_size to prevent data loss issues from LBS changes.\n" 6204 "\tNote: After setting, array will not be assembled in old kernels (<= 6.18)\n", 6205 mdname(mddev)); 6206 6207 return 0; 6208} 6209EXPORT_SYMBOL_GPL(mddev_stack_rdev_limits); 6210 6211/* apply the extra stacking limits from a new rdev into mddev */ 6212int mddev_stack_new_rdev(struct mddev *mddev, struct md_rdev *rdev) 6213{ 6214 struct queue_limits lim; 6215 6216 if (mddev_is_dm(mddev)) 6217 return 0; 6218 6219 if (queue_logical_block_size(rdev->bdev->bd_disk->queue) > 6220 queue_logical_block_size(mddev->gendisk->queue)) { 6221 pr_err("%s: incompatible logical_block_size, can not add\n", 6222 mdname(mddev)); 6223 return -EINVAL; 6224 } 6225 6226 lim = queue_limits_start_update(mddev->gendisk->queue); 6227 queue_limits_stack_bdev(&lim, rdev->bdev, rdev->data_offset, 6228 mddev->gendisk->disk_name); 6229 6230 if (!queue_limits_stack_integrity_bdev(&lim, rdev->bdev)) { 6231 pr_err("%s: incompatible integrity profile for %pg\n", 6232 mdname(mddev), rdev->bdev); 6233 queue_limits_cancel_update(mddev->gendisk->queue); 6234 return -ENXIO; 6235 } 6236 6237 return queue_limits_commit_update(mddev->gendisk->queue, &lim); 6238} 6239EXPORT_SYMBOL_GPL(mddev_stack_new_rdev); 6240 6241/* update the optimal I/O size after a reshape */ 6242void mddev_update_io_opt(struct mddev *mddev, unsigned int nr_stripes) 6243{ 6244 struct queue_limits lim; 6245 6246 if (mddev_is_dm(mddev)) 6247 return; 6248 6249 /* don't bother updating io_opt if we can't suspend the array */ 6250 if (mddev_suspend(mddev, false) < 0) 6251 return; 6252 lim = queue_limits_start_update(mddev->gendisk->queue); 6253 lim.io_opt = lim.io_min * nr_stripes; 6254 queue_limits_commit_update(mddev->gendisk->queue, &lim); 6255 mddev_resume(mddev); 6256} 6257EXPORT_SYMBOL_GPL(mddev_update_io_opt); 6258 6259static void mddev_delayed_delete(struct work_struct *ws) 6260{ 6261 struct mddev *mddev = container_of(ws, struct mddev, del_work); 6262 6263 kobject_put(&mddev->kobj); 6264} 6265 6266void md_init_stacking_limits(struct queue_limits *lim) 6267{ 6268 blk_set_stacking_limits(lim); 6269 lim->features = BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | 6270 BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT; 6271} 6272EXPORT_SYMBOL_GPL(md_init_stacking_limits); 6273 6274struct mddev *md_alloc(dev_t dev, char *name) 6275{ 6276 /* 6277 * If dev is zero, name is the name of a device to allocate with 6278 * an arbitrary minor number. It will be "md_???" 6279 * If dev is non-zero it must be a device number with a MAJOR of 6280 * MD_MAJOR or mdp_major. In this case, if "name" is NULL, then 6281 * the device is being created by opening a node in /dev. 6282 * If "name" is not NULL, the device is being created by 6283 * writing to /sys/module/md_mod/parameters/new_array. 6284 */ 6285 static DEFINE_MUTEX(disks_mutex); 6286 struct mddev *mddev; 6287 struct gendisk *disk; 6288 int partitioned; 6289 int shift; 6290 int unit; 6291 int error; 6292 6293 /* 6294 * Wait for any previous instance of this device to be completely 6295 * removed (mddev_delayed_delete). 
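 * The flush_workqueue(md_misc_wq) call below provides that guarantee:
 * the delayed-delete work runs from md_misc_wq, so flushing it ensures
 * the previous gendisk/kobject teardown has finished before a new
 * instance with the same unit is set up.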
6296 */ 6297 flush_workqueue(md_misc_wq); 6298 6299 mutex_lock(&disks_mutex); 6300 mddev = mddev_alloc(dev); 6301 if (IS_ERR(mddev)) { 6302 error = PTR_ERR(mddev); 6303 goto out_unlock; 6304 } 6305 6306 partitioned = (MAJOR(mddev->unit) != MD_MAJOR); 6307 shift = partitioned ? MdpMinorShift : 0; 6308 unit = MINOR(mddev->unit) >> shift; 6309 6310 if (name && !dev) { 6311 /* Need to ensure that 'name' is not a duplicate. 6312 */ 6313 struct mddev *mddev2; 6314 spin_lock(&all_mddevs_lock); 6315 6316 list_for_each_entry(mddev2, &all_mddevs, all_mddevs) 6317 if (mddev2->gendisk && 6318 strcmp(mddev2->gendisk->disk_name, name) == 0) { 6319 spin_unlock(&all_mddevs_lock); 6320 error = -EEXIST; 6321 goto out_free_mddev; 6322 } 6323 spin_unlock(&all_mddevs_lock); 6324 } 6325 if (name && dev) 6326 /* 6327 * Creating /dev/mdNNN via "newarray", so adjust hold_active. 6328 */ 6329 mddev->hold_active = UNTIL_STOP; 6330 6331 disk = blk_alloc_disk(NULL, NUMA_NO_NODE); 6332 if (IS_ERR(disk)) { 6333 error = PTR_ERR(disk); 6334 goto out_free_mddev; 6335 } 6336 6337 disk->major = MAJOR(mddev->unit); 6338 disk->first_minor = unit << shift; 6339 disk->minors = 1 << shift; 6340 if (name) 6341 strcpy(disk->disk_name, name); 6342 else if (partitioned) 6343 sprintf(disk->disk_name, "md_d%d", unit); 6344 else 6345 sprintf(disk->disk_name, "md%d", unit); 6346 disk->fops = &md_fops; 6347 disk->private_data = mddev; 6348 6349 disk->events |= DISK_EVENT_MEDIA_CHANGE; 6350 mddev->gendisk = disk; 6351 error = add_disk(disk); 6352 if (error) 6353 goto out_put_disk; 6354 6355 kobject_init(&mddev->kobj, &md_ktype); 6356 error = kobject_add(&mddev->kobj, &disk_to_dev(disk)->kobj, "%s", "md"); 6357 if (error) { 6358 /* 6359 * The disk is already live at this point. Clear the hold flag 6360 * and let mddev_put take care of the deletion, as it isn't any 6361 * different from a normal close on last release now. 6362 */ 6363 mddev->hold_active = 0; 6364 mutex_unlock(&disks_mutex); 6365 mddev_put(mddev); 6366 return ERR_PTR(error); 6367 } 6368 6369 kobject_uevent(&mddev->kobj, KOBJ_ADD); 6370 mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state"); 6371 mddev->sysfs_level = sysfs_get_dirent_safe(mddev->kobj.sd, "level"); 6372 mutex_unlock(&disks_mutex); 6373 return mddev; 6374 6375out_put_disk: 6376 put_disk(disk); 6377out_free_mddev: 6378 mddev_free(mddev); 6379out_unlock: 6380 mutex_unlock(&disks_mutex); 6381 return ERR_PTR(error); 6382} 6383 6384static int md_alloc_and_put(dev_t dev, char *name) 6385{ 6386 struct mddev *mddev = md_alloc(dev, name); 6387 6388 if (legacy_async_del_gendisk) 6389 pr_warn("md: async del_gendisk mode will be removed in future, please upgrade to mdadm-4.5+\n"); 6390 6391 if (IS_ERR(mddev)) 6392 return PTR_ERR(mddev); 6393 mddev_put(mddev); 6394 return 0; 6395} 6396 6397static void md_probe(dev_t dev) 6398{ 6399 if (MAJOR(dev) == MD_MAJOR && MINOR(dev) >= 512) 6400 return; 6401 if (create_on_open) 6402 md_alloc_and_put(dev, NULL); 6403} 6404 6405static int add_named_array(const char *val, const struct kernel_param *kp) 6406{ 6407 /* 6408 * val must be "md_*" or "mdNNN". 6409 * For "md_*" we allocate an array with a large free minor number, and 6410 * set the name to val. val must not already be an active name. 6411 * For "mdNNN" we allocate an array with the minor number NNN 6412 * which must not already be in use. 
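 *
 * e.g. (hypothetical names):
 *     echo md_home > /sys/module/md_mod/parameters/new_array
 *     echo md127 > /sys/module/md_mod/parameters/new_array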
6413 */ 6414 int len = strlen(val); 6415 char buf[DISK_NAME_LEN]; 6416 unsigned long devnum; 6417 6418 while (len && val[len-1] == '\n') 6419 len--; 6420 if (len >= DISK_NAME_LEN) 6421 return -E2BIG; 6422 strscpy(buf, val, len+1); 6423 if (strncmp(buf, "md_", 3) == 0) 6424 return md_alloc_and_put(0, buf); 6425 if (strncmp(buf, "md", 2) == 0 && 6426 isdigit(buf[2]) && 6427 kstrtoul(buf+2, 10, &devnum) == 0 && 6428 devnum <= MINORMASK) 6429 return md_alloc_and_put(MKDEV(MD_MAJOR, devnum), NULL); 6430 6431 return -EINVAL; 6432} 6433 6434static void md_safemode_timeout(struct timer_list *t) 6435{ 6436 struct mddev *mddev = timer_container_of(mddev, t, safemode_timer); 6437 6438 mddev->safemode = 1; 6439 if (mddev->external) 6440 sysfs_notify_dirent_safe(mddev->sysfs_state); 6441 6442 md_wakeup_thread(mddev->thread); 6443} 6444 6445static int start_dirty_degraded; 6446 6447static int md_bitmap_create(struct mddev *mddev) 6448{ 6449 if (mddev->bitmap_id == ID_BITMAP_NONE) 6450 return -EINVAL; 6451 6452 if (!mddev_set_bitmap_ops(mddev)) 6453 return -ENOENT; 6454 6455 return mddev->bitmap_ops->create(mddev); 6456} 6457 6458static void md_bitmap_destroy(struct mddev *mddev) 6459{ 6460 if (!md_bitmap_registered(mddev)) 6461 return; 6462 6463 mddev->bitmap_ops->destroy(mddev); 6464 mddev_clear_bitmap_ops(mddev); 6465} 6466 6467int md_run(struct mddev *mddev) 6468{ 6469 int err; 6470 struct md_rdev *rdev; 6471 struct md_personality *pers; 6472 bool nowait = true; 6473 6474 if (list_empty(&mddev->disks)) 6475 /* cannot run an array with no devices.. */ 6476 return -EINVAL; 6477 6478 if (mddev->pers) 6479 return -EBUSY; 6480 /* Cannot run until previous stop completes properly */ 6481 if (mddev->sysfs_active) 6482 return -EBUSY; 6483 6484 /* 6485 * Analyze all RAID superblock(s) 6486 */ 6487 if (!mddev->raid_disks) { 6488 if (!mddev->persistent) 6489 return -EINVAL; 6490 err = analyze_sbs(mddev); 6491 if (err) 6492 return -EINVAL; 6493 } 6494 6495 if (mddev->level != LEVEL_NONE) 6496 request_module("md-level-%d", mddev->level); 6497 else if (mddev->clevel[0]) 6498 request_module("md-%s", mddev->clevel); 6499 6500 /* 6501 * Drop all container device buffers, from now on 6502 * the only valid external interface is through the md 6503 * device. 6504 */ 6505 mddev->has_superblocks = false; 6506 rdev_for_each(rdev, mddev) { 6507 if (test_bit(Faulty, &rdev->flags)) 6508 continue; 6509 sync_blockdev(rdev->bdev); 6510 invalidate_bdev(rdev->bdev); 6511 if (mddev->ro != MD_RDONLY && rdev_read_only(rdev)) { 6512 mddev->ro = MD_RDONLY; 6513 if (!mddev_is_dm(mddev)) 6514 set_disk_ro(mddev->gendisk, 1); 6515 } 6516 6517 if (rdev->sb_page) 6518 mddev->has_superblocks = true; 6519 6520 /* perform some consistency tests on the device. 6521 * We don't want the data to overlap the metadata, 6522 * Internal Bitmap issues have been handled elsewhere. 
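 * Two on-disk layouts are checked below: if the superblock sits after
 * the data (data_offset < sb_start), the data area must end before
 * sb_start; otherwise the superblock sits in front of the data and must
 * end before data_offset. Devices with a separate meta_bdev need no
 * check.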
6523 */ 6524 if (rdev->meta_bdev) { 6525 /* Nothing to check */; 6526 } else if (rdev->data_offset < rdev->sb_start) { 6527 if (mddev->dev_sectors && 6528 rdev->data_offset + mddev->dev_sectors 6529 > rdev->sb_start) { 6530 pr_warn("md: %s: data overlaps metadata\n", 6531 mdname(mddev)); 6532 return -EINVAL; 6533 } 6534 } else { 6535 if (rdev->sb_start + rdev->sb_size/512 6536 > rdev->data_offset) { 6537 pr_warn("md: %s: metadata overlaps data\n", 6538 mdname(mddev)); 6539 return -EINVAL; 6540 } 6541 } 6542 sysfs_notify_dirent_safe(rdev->sysfs_state); 6543 nowait = nowait && bdev_nowait(rdev->bdev); 6544 } 6545 6546 pers = get_pers(mddev->level, mddev->clevel); 6547 if (!pers) 6548 return -EINVAL; 6549 if (mddev->level != pers->head.id) { 6550 mddev->level = pers->head.id; 6551 mddev->new_level = pers->head.id; 6552 } 6553 strscpy(mddev->clevel, pers->head.name, sizeof(mddev->clevel)); 6554 6555 if (mddev->reshape_position != MaxSector && 6556 pers->start_reshape == NULL) { 6557 /* This personality cannot handle reshaping... */ 6558 put_pers(pers); 6559 return -EINVAL; 6560 } 6561 6562 if (pers->sync_request) { 6563 /* Warn if this is a potentially silly 6564 * configuration. 6565 */ 6566 struct md_rdev *rdev2; 6567 int warned = 0; 6568 6569 rdev_for_each(rdev, mddev) 6570 rdev_for_each(rdev2, mddev) { 6571 if (rdev < rdev2 && 6572 rdev->bdev->bd_disk == 6573 rdev2->bdev->bd_disk) { 6574 pr_warn("%s: WARNING: %pg appears to be on the same physical disk as %pg.\n", 6575 mdname(mddev), 6576 rdev->bdev, 6577 rdev2->bdev); 6578 warned = 1; 6579 } 6580 } 6581 6582 if (warned) 6583 pr_warn("True protection against single-disk failure might be compromised.\n"); 6584 } 6585 6586 /* dm-raid expect sync_thread to be frozen until resume */ 6587 if (mddev->gendisk) 6588 mddev->recovery = 0; 6589 6590 /* may be over-ridden by personality */ 6591 mddev->resync_max_sectors = mddev->dev_sectors; 6592 6593 mddev->ok_start_degraded = start_dirty_degraded; 6594 6595 if (start_readonly && md_is_rdwr(mddev)) 6596 mddev->ro = MD_AUTO_READ; /* read-only, but switch on first write */ 6597 6598 err = pers->run(mddev); 6599 if (err) 6600 pr_warn("md: pers->run() failed ...\n"); 6601 else if (pers->size(mddev, 0, 0) < mddev->array_sectors) { 6602 WARN_ONCE(!mddev->external_size, 6603 "%s: default size too small, but 'external_size' not in effect?\n", 6604 __func__); 6605 pr_warn("md: invalid array_size %llu > default size %llu\n", 6606 (unsigned long long)mddev->array_sectors / 2, 6607 (unsigned long long)pers->size(mddev, 0, 0) / 2); 6608 err = -EINVAL; 6609 } 6610 if (err == 0 && pers->sync_request && 6611 (mddev->bitmap_info.file || mddev->bitmap_info.offset)) { 6612 err = md_bitmap_create(mddev); 6613 if (err) 6614 pr_warn("%s: failed to create bitmap (%d)\n", 6615 mdname(mddev), err); 6616 } 6617 if (err) 6618 goto bitmap_abort; 6619 6620 if (mddev->bitmap_info.max_write_behind > 0) { 6621 bool create_pool = false; 6622 6623 rdev_for_each(rdev, mddev) { 6624 if (test_bit(WriteMostly, &rdev->flags) && 6625 rdev_init_serial(rdev)) 6626 create_pool = true; 6627 } 6628 if (create_pool && mddev->serial_info_pool == NULL) { 6629 mddev->serial_info_pool = 6630 mempool_create_kmalloc_pool(NR_SERIAL_INFOS, 6631 sizeof(struct serial_info)); 6632 if (!mddev->serial_info_pool) { 6633 err = -ENOMEM; 6634 goto bitmap_abort; 6635 } 6636 } 6637 } 6638 6639 if (pers->sync_request) { 6640 if (mddev->kobj.sd && 6641 sysfs_create_group(&mddev->kobj, &md_redundancy_group)) 6642 pr_warn("md: cannot register extra attributes for 
%s\n", 6643 mdname(mddev)); 6644 mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action"); 6645 mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed"); 6646 mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded"); 6647 } else if (mddev->ro == MD_AUTO_READ) 6648 mddev->ro = MD_RDWR; 6649 6650 atomic_set(&mddev->max_corr_read_errors, 6651 MD_DEFAULT_MAX_CORRECTED_READ_ERRORS); 6652 mddev->safemode = 0; 6653 if (mddev_is_clustered(mddev)) 6654 mddev->safemode_delay = 0; 6655 else 6656 mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY; 6657 mddev->in_sync = 1; 6658 smp_wmb(); 6659 spin_lock(&mddev->lock); 6660 mddev->pers = pers; 6661 spin_unlock(&mddev->lock); 6662 rdev_for_each(rdev, mddev) 6663 if (rdev->raid_disk >= 0) 6664 sysfs_link_rdev(mddev, rdev); /* failure here is OK */ 6665 6666 if (mddev->degraded && md_is_rdwr(mddev)) 6667 /* This ensures that recovering status is reported immediately 6668 * via sysfs - until a lack of spares is confirmed. 6669 */ 6670 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 6671 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6672 6673 if (mddev->sb_flags) 6674 md_update_sb(mddev, 0); 6675 6676 md_new_event(); 6677 return 0; 6678 6679bitmap_abort: 6680 mddev_detach(mddev); 6681 if (mddev->private) 6682 pers->free(mddev, mddev->private); 6683 mddev->private = NULL; 6684 put_pers(pers); 6685 md_bitmap_destroy(mddev); 6686 return err; 6687} 6688EXPORT_SYMBOL_GPL(md_run); 6689 6690int do_md_run(struct mddev *mddev) 6691{ 6692 int err; 6693 6694 set_bit(MD_NOT_READY, &mddev->flags); 6695 err = md_run(mddev); 6696 if (err) 6697 goto out; 6698 6699 if (md_bitmap_registered(mddev)) { 6700 err = mddev->bitmap_ops->load(mddev); 6701 if (err) { 6702 md_bitmap_destroy(mddev); 6703 goto out; 6704 } 6705 } 6706 6707 if (mddev_is_clustered(mddev)) 6708 md_allow_write(mddev); 6709 6710 /* run start up tasks that require md_thread */ 6711 md_start(mddev); 6712 6713 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ 6714 6715 set_capacity_and_notify(mddev->gendisk, mddev->array_sectors); 6716 clear_bit(MD_NOT_READY, &mddev->flags); 6717 mddev->changed = 1; 6718 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); 6719 sysfs_notify_dirent_safe(mddev->sysfs_state); 6720 sysfs_notify_dirent_safe(mddev->sysfs_action); 6721 sysfs_notify_dirent_safe(mddev->sysfs_degraded); 6722out: 6723 clear_bit(MD_NOT_READY, &mddev->flags); 6724 return err; 6725} 6726 6727int md_start(struct mddev *mddev) 6728{ 6729 int ret = 0; 6730 6731 if (mddev->pers->start) { 6732 set_bit(MD_RECOVERY_WAIT, &mddev->recovery); 6733 ret = mddev->pers->start(mddev); 6734 clear_bit(MD_RECOVERY_WAIT, &mddev->recovery); 6735 md_wakeup_thread(mddev->sync_thread); 6736 } 6737 return ret; 6738} 6739EXPORT_SYMBOL_GPL(md_start); 6740 6741static int restart_array(struct mddev *mddev) 6742{ 6743 struct gendisk *disk = mddev->gendisk; 6744 struct md_rdev *rdev; 6745 bool has_journal = false; 6746 bool has_readonly = false; 6747 6748 /* Complain if it has no devices */ 6749 if (list_empty(&mddev->disks)) 6750 return -ENXIO; 6751 if (!mddev->pers) 6752 return -EINVAL; 6753 if (md_is_rdwr(mddev)) 6754 return -EBUSY; 6755 6756 rcu_read_lock(); 6757 rdev_for_each_rcu(rdev, mddev) { 6758 if (test_bit(Journal, &rdev->flags) && 6759 !test_bit(Faulty, &rdev->flags)) 6760 has_journal = true; 6761 if (rdev_read_only(rdev)) 6762 has_readonly = true; 6763 } 6764 rcu_read_unlock(); 6765 if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && 
!has_journal) 6766 /* Don't restart rw with journal missing/faulty */ 6767 return -EINVAL; 6768 if (has_readonly) 6769 return -EROFS; 6770 6771 mddev->safemode = 0; 6772 mddev->ro = MD_RDWR; 6773 set_disk_ro(disk, 0); 6774 pr_debug("md: %s switched to read-write mode.\n", mdname(mddev)); 6775 /* Kick recovery or resync if necessary */ 6776 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6777 md_wakeup_thread(mddev->sync_thread); 6778 sysfs_notify_dirent_safe(mddev->sysfs_state); 6779 return 0; 6780} 6781 6782static void md_clean(struct mddev *mddev) 6783{ 6784 mddev->array_sectors = 0; 6785 mddev->external_size = 0; 6786 mddev->dev_sectors = 0; 6787 mddev->raid_disks = 0; 6788 mddev->resync_offset = 0; 6789 mddev->resync_min = 0; 6790 mddev->resync_max = MaxSector; 6791 mddev->reshape_position = MaxSector; 6792 /* we still need mddev->external in export_rdev, do not clear it yet */ 6793 mddev->persistent = 0; 6794 mddev->level = LEVEL_NONE; 6795 mddev->clevel[0] = 0; 6796 6797 /* 6798 * For legacy_async_del_gendisk mode, it can stop the array in the 6799 * middle of assembling it, then it still can access the array. So 6800 * it needs to clear MD_CLOSING. If not legacy_async_del_gendisk, 6801 * it can't open the array again after stopping it. So it doesn't 6802 * clear MD_CLOSING. 6803 */ 6804 if (legacy_async_del_gendisk && mddev->hold_active) { 6805 clear_bit(MD_CLOSING, &mddev->flags); 6806 } else { 6807 /* if UNTIL_STOP is set, it's cleared here */ 6808 mddev->hold_active = 0; 6809 /* Don't clear MD_CLOSING, or mddev can be opened again. */ 6810 mddev->flags &= BIT_ULL_MASK(MD_CLOSING); 6811 } 6812 mddev->sb_flags = 0; 6813 mddev->ro = MD_RDWR; 6814 mddev->metadata_type[0] = 0; 6815 mddev->chunk_sectors = 0; 6816 mddev->ctime = mddev->utime = 0; 6817 mddev->layout = 0; 6818 mddev->logical_block_size = 0; 6819 mddev->max_disks = 0; 6820 mddev->events = 0; 6821 mddev->can_decrease_events = 0; 6822 mddev->delta_disks = 0; 6823 mddev->reshape_backwards = 0; 6824 mddev->new_level = LEVEL_NONE; 6825 mddev->new_layout = 0; 6826 mddev->new_chunk_sectors = 0; 6827 mddev->curr_resync = MD_RESYNC_NONE; 6828 atomic64_set(&mddev->resync_mismatches, 0); 6829 mddev->suspend_lo = mddev->suspend_hi = 0; 6830 mddev->sync_speed_min = mddev->sync_speed_max = 0; 6831 mddev->recovery = 0; 6832 mddev->in_sync = 0; 6833 mddev->changed = 0; 6834 mddev->degraded = 0; 6835 mddev->safemode = 0; 6836 mddev->private = NULL; 6837 mddev->cluster_info = NULL; 6838 mddev->bitmap_info.offset = 0; 6839 mddev->bitmap_info.default_offset = 0; 6840 mddev->bitmap_info.default_space = 0; 6841 mddev->bitmap_info.chunksize = 0; 6842 mddev->bitmap_info.daemon_sleep = 0; 6843 mddev->bitmap_info.max_write_behind = 0; 6844 mddev->bitmap_info.nodes = 0; 6845} 6846 6847static void __md_stop_writes(struct mddev *mddev) 6848{ 6849 timer_delete_sync(&mddev->safemode_timer); 6850 6851 if (mddev->pers && mddev->pers->quiesce) { 6852 mddev->pers->quiesce(mddev, 1); 6853 mddev->pers->quiesce(mddev, 0); 6854 } 6855 6856 if (md_bitmap_enabled(mddev, true)) 6857 mddev->bitmap_ops->flush(mddev); 6858 6859 if (md_is_rdwr(mddev) && 6860 ((!mddev->in_sync && !mddev_is_clustered(mddev)) || 6861 mddev->sb_flags)) { 6862 /* mark array as shutdown cleanly */ 6863 if (!mddev_is_clustered(mddev)) 6864 mddev->in_sync = 1; 6865 md_update_sb(mddev, 1); 6866 } 6867 /* disable policy to guarantee rdevs free resources for serialization */ 6868 mddev->serialize_policy = 0; 6869 mddev_destroy_serial_pool(mddev, NULL); 6870} 6871 6872void md_stop_writes(struct 
mddev *mddev) 6873{ 6874 mddev_lock_nointr(mddev); 6875 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6876 stop_sync_thread(mddev, true); 6877 __md_stop_writes(mddev); 6878 mddev_unlock(mddev); 6879} 6880EXPORT_SYMBOL_GPL(md_stop_writes); 6881 6882static void mddev_detach(struct mddev *mddev) 6883{ 6884 if (md_bitmap_enabled(mddev, false)) 6885 mddev->bitmap_ops->wait_behind_writes(mddev); 6886 if (mddev->pers && mddev->pers->quiesce && !is_md_suspended(mddev)) { 6887 mddev->pers->quiesce(mddev, 1); 6888 mddev->pers->quiesce(mddev, 0); 6889 } 6890 md_unregister_thread(mddev, &mddev->thread); 6891 6892 /* the unplug fn references 'conf' */ 6893 if (!mddev_is_dm(mddev)) 6894 blk_sync_queue(mddev->gendisk->queue); 6895} 6896 6897static void __md_stop(struct mddev *mddev) 6898{ 6899 struct md_personality *pers = mddev->pers; 6900 6901 md_bitmap_destroy(mddev); 6902 mddev_detach(mddev); 6903 spin_lock(&mddev->lock); 6904 mddev->pers = NULL; 6905 spin_unlock(&mddev->lock); 6906 if (mddev->private) 6907 pers->free(mddev, mddev->private); 6908 mddev->private = NULL; 6909 put_pers(pers); 6910 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6911} 6912 6913void md_stop(struct mddev *mddev) 6914{ 6915 lockdep_assert_held(&mddev->reconfig_mutex); 6916 6917 /* stop the array and free an attached data structures. 6918 * This is called from dm-raid 6919 */ 6920 __md_stop_writes(mddev); 6921 __md_stop(mddev); 6922} 6923 6924EXPORT_SYMBOL_GPL(md_stop); 6925 6926/* ensure 'mddev->pers' exist before calling md_set_readonly() */ 6927static int md_set_readonly(struct mddev *mddev) 6928{ 6929 int err = 0; 6930 int did_freeze = 0; 6931 6932 if (mddev->external && test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) 6933 return -EBUSY; 6934 6935 if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { 6936 did_freeze = 1; 6937 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6938 } 6939 6940 stop_sync_thread(mddev, false); 6941 wait_event(mddev->sb_wait, 6942 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); 6943 mddev_lock_nointr(mddev); 6944 6945 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 6946 pr_warn("md: %s still in use.\n",mdname(mddev)); 6947 err = -EBUSY; 6948 goto out; 6949 } 6950 6951 __md_stop_writes(mddev); 6952 6953 if (mddev->ro == MD_RDONLY) { 6954 err = -ENXIO; 6955 goto out; 6956 } 6957 6958 mddev->ro = MD_RDONLY; 6959 set_disk_ro(mddev->gendisk, 1); 6960 6961out: 6962 if (!err || did_freeze) { 6963 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6964 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6965 sysfs_notify_dirent_safe(mddev->sysfs_state); 6966 } 6967 6968 return err; 6969} 6970 6971/* mode: 6972 * 0 - completely stop and dis-assemble array 6973 * 2 - stop but do not disassemble array 6974 */ 6975static int do_md_stop(struct mddev *mddev, int mode) 6976{ 6977 struct gendisk *disk = mddev->gendisk; 6978 struct md_rdev *rdev; 6979 int did_freeze = 0; 6980 6981 if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { 6982 did_freeze = 1; 6983 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6984 } 6985 6986 stop_sync_thread(mddev, true); 6987 6988 if (mddev->sysfs_active || 6989 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 6990 pr_warn("md: %s still in use.\n",mdname(mddev)); 6991 if (did_freeze) { 6992 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6993 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6994 } 6995 return -EBUSY; 6996 } 6997 if (mddev->pers) { 6998 if (!md_is_rdwr(mddev)) 6999 set_disk_ro(disk, 0); 7000 7001 if (mode == 2 && mddev->pers->sync_request && 7002 
mddev->to_remove == NULL) 7003 mddev->to_remove = &md_redundancy_group; 7004 7005 __md_stop_writes(mddev); 7006 __md_stop(mddev); 7007 7008 /* tell userspace to handle 'inactive' */ 7009 sysfs_notify_dirent_safe(mddev->sysfs_state); 7010 7011 rdev_for_each(rdev, mddev) 7012 if (rdev->raid_disk >= 0) 7013 sysfs_unlink_rdev(mddev, rdev); 7014 7015 set_capacity_and_notify(disk, 0); 7016 mddev->changed = 1; 7017 7018 if (!md_is_rdwr(mddev)) 7019 mddev->ro = MD_RDWR; 7020 } 7021 /* 7022 * Free resources if final stop 7023 */ 7024 if (mode == 0) { 7025 pr_info("md: %s stopped.\n", mdname(mddev)); 7026 7027 if (mddev->bitmap_info.file) { 7028 struct file *f = mddev->bitmap_info.file; 7029 spin_lock(&mddev->lock); 7030 mddev->bitmap_info.file = NULL; 7031 spin_unlock(&mddev->lock); 7032 fput(f); 7033 } 7034 mddev->bitmap_info.offset = 0; 7035 7036 export_array(mddev); 7037 md_clean(mddev); 7038 if (!legacy_async_del_gendisk) 7039 set_bit(MD_DELETED, &mddev->flags); 7040 } 7041 md_new_event(); 7042 sysfs_notify_dirent_safe(mddev->sysfs_state); 7043 return 0; 7044} 7045 7046#ifndef MODULE 7047static void autorun_array(struct mddev *mddev) 7048{ 7049 struct md_rdev *rdev; 7050 int err; 7051 7052 if (list_empty(&mddev->disks)) 7053 return; 7054 7055 pr_info("md: running: "); 7056 7057 rdev_for_each(rdev, mddev) { 7058 pr_cont("<%pg>", rdev->bdev); 7059 } 7060 pr_cont("\n"); 7061 7062 err = do_md_run(mddev); 7063 if (err) { 7064 pr_warn("md: do_md_run() returned %d\n", err); 7065 do_md_stop(mddev, 0); 7066 } 7067} 7068 7069/* 7070 * lets try to run arrays based on all disks that have arrived 7071 * until now. (those are in pending_raid_disks) 7072 * 7073 * the method: pick the first pending disk, collect all disks with 7074 * the same UUID, remove all from the pending list and put them into 7075 * the 'same_array' list. Then order this list based on superblock 7076 * update time (freshest comes first), kick out 'old' disks and 7077 * compare superblocks. If everything's fine then run it. 7078 * 7079 * If "unit" is allocated, then bump its reference count 7080 */ 7081static void autorun_devices(int part) 7082{ 7083 struct md_rdev *rdev0, *rdev, *tmp; 7084 struct mddev *mddev; 7085 7086 pr_info("md: autorun ...\n"); 7087 while (!list_empty(&pending_raid_disks)) { 7088 int unit; 7089 dev_t dev; 7090 LIST_HEAD(candidates); 7091 rdev0 = list_entry(pending_raid_disks.next, 7092 struct md_rdev, same_set); 7093 7094 pr_debug("md: considering %pg ...\n", rdev0->bdev); 7095 INIT_LIST_HEAD(&candidates); 7096 rdev_for_each_list(rdev, tmp, &pending_raid_disks) 7097 if (super_90_load(rdev, rdev0, 0) >= 0) { 7098 pr_debug("md: adding %pg ...\n", 7099 rdev->bdev); 7100 list_move(&rdev->same_set, &candidates); 7101 } 7102 /* 7103 * now we have a set of devices, with all of them having 7104 * mostly sane superblocks. It's time to allocate the 7105 * mddev. 
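 * The unit number is taken from rdev0's superblock: partitionable
 * arrays map preferred_minor through mdp_major/MdpMinorShift, plain
 * arrays use MD_MAJOR directly. If the derived unit does not match
 * preferred_minor the autorun loop is aborted.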
7106 */ 7107 if (part) { 7108 dev = MKDEV(mdp_major, 7109 rdev0->preferred_minor << MdpMinorShift); 7110 unit = MINOR(dev) >> MdpMinorShift; 7111 } else { 7112 dev = MKDEV(MD_MAJOR, rdev0->preferred_minor); 7113 unit = MINOR(dev); 7114 } 7115 if (rdev0->preferred_minor != unit) { 7116 pr_warn("md: unit number in %pg is bad: %d\n", 7117 rdev0->bdev, rdev0->preferred_minor); 7118 break; 7119 } 7120 7121 mddev = md_alloc(dev, NULL); 7122 if (IS_ERR(mddev)) 7123 break; 7124 7125 if (mddev_suspend_and_lock(mddev)) 7126 pr_warn("md: %s locked, cannot run\n", mdname(mddev)); 7127 else if (mddev->raid_disks || mddev->major_version 7128 || !list_empty(&mddev->disks)) { 7129 pr_warn("md: %s already running, cannot run %pg\n", 7130 mdname(mddev), rdev0->bdev); 7131 mddev_unlock_and_resume(mddev); 7132 } else { 7133 pr_debug("md: created %s\n", mdname(mddev)); 7134 mddev->persistent = 1; 7135 rdev_for_each_list(rdev, tmp, &candidates) { 7136 list_del_init(&rdev->same_set); 7137 if (bind_rdev_to_array(rdev, mddev)) 7138 export_rdev(rdev, mddev); 7139 } 7140 autorun_array(mddev); 7141 mddev_unlock_and_resume(mddev); 7142 } 7143 /* on success, candidates will be empty, on error 7144 * it won't... 7145 */ 7146 rdev_for_each_list(rdev, tmp, &candidates) { 7147 list_del_init(&rdev->same_set); 7148 export_rdev(rdev, mddev); 7149 } 7150 mddev_put(mddev); 7151 } 7152 pr_info("md: ... autorun DONE.\n"); 7153} 7154#endif /* !MODULE */ 7155 7156static int get_version(void __user *arg) 7157{ 7158 mdu_version_t ver; 7159 7160 ver.major = MD_MAJOR_VERSION; 7161 ver.minor = MD_MINOR_VERSION; 7162 ver.patchlevel = MD_PATCHLEVEL_VERSION; 7163 7164 if (copy_to_user(arg, &ver, sizeof(ver))) 7165 return -EFAULT; 7166 7167 return 0; 7168} 7169 7170static int get_array_info(struct mddev *mddev, void __user *arg) 7171{ 7172 mdu_array_info_t info; 7173 int nr,working,insync,failed,spare; 7174 struct md_rdev *rdev; 7175 7176 nr = working = insync = failed = spare = 0; 7177 rcu_read_lock(); 7178 rdev_for_each_rcu(rdev, mddev) { 7179 nr++; 7180 if (test_bit(Faulty, &rdev->flags)) 7181 failed++; 7182 else { 7183 working++; 7184 if (test_bit(In_sync, &rdev->flags)) 7185 insync++; 7186 else if (test_bit(Journal, &rdev->flags)) 7187 /* TODO: add journal count to md_u.h */ 7188 ; 7189 else 7190 spare++; 7191 } 7192 } 7193 rcu_read_unlock(); 7194 7195 info.major_version = mddev->major_version; 7196 info.minor_version = mddev->minor_version; 7197 info.patch_version = MD_PATCHLEVEL_VERSION; 7198 info.ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX); 7199 info.level = mddev->level; 7200 info.size = mddev->dev_sectors / 2; 7201 if (info.size != mddev->dev_sectors / 2) /* overflow */ 7202 info.size = -1; 7203 info.nr_disks = nr; 7204 info.raid_disks = mddev->raid_disks; 7205 info.md_minor = mddev->md_minor; 7206 info.not_persistent= !mddev->persistent; 7207 7208 info.utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX); 7209 info.state = 0; 7210 if (mddev->in_sync) 7211 info.state = (1<<MD_SB_CLEAN); 7212 if (mddev->bitmap && mddev->bitmap_info.offset) 7213 info.state |= (1<<MD_SB_BITMAP_PRESENT); 7214 if (mddev_is_clustered(mddev)) 7215 info.state |= (1<<MD_SB_CLUSTERED); 7216 info.active_disks = insync; 7217 info.working_disks = working; 7218 info.failed_disks = failed; 7219 info.spare_disks = spare; 7220 7221 info.layout = mddev->layout; 7222 info.chunk_size = mddev->chunk_sectors << 9; 7223 7224 if (copy_to_user(arg, &info, sizeof(info))) 7225 return -EFAULT; 7226 7227 return 0; 7228} 7229 7230static int get_bitmap_file(struct mddev 
*mddev, void __user * arg) 7231{ 7232 mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ 7233 char *ptr; 7234 int err; 7235 7236 file = kzalloc(sizeof(*file), GFP_NOIO); 7237 if (!file) 7238 return -ENOMEM; 7239 7240 err = 0; 7241 spin_lock(&mddev->lock); 7242 /* bitmap enabled */ 7243 if (mddev->bitmap_info.file) { 7244 ptr = file_path(mddev->bitmap_info.file, file->pathname, 7245 sizeof(file->pathname)); 7246 if (IS_ERR(ptr)) 7247 err = PTR_ERR(ptr); 7248 else 7249 memmove(file->pathname, ptr, 7250 sizeof(file->pathname)-(ptr-file->pathname)); 7251 } 7252 spin_unlock(&mddev->lock); 7253 7254 if (err == 0 && 7255 copy_to_user(arg, file, sizeof(*file))) 7256 err = -EFAULT; 7257 7258 kfree(file); 7259 return err; 7260} 7261 7262static int get_disk_info(struct mddev *mddev, void __user * arg) 7263{ 7264 mdu_disk_info_t info; 7265 struct md_rdev *rdev; 7266 7267 if (copy_from_user(&info, arg, sizeof(info))) 7268 return -EFAULT; 7269 7270 rcu_read_lock(); 7271 rdev = md_find_rdev_nr_rcu(mddev, info.number); 7272 if (rdev) { 7273 info.major = MAJOR(rdev->bdev->bd_dev); 7274 info.minor = MINOR(rdev->bdev->bd_dev); 7275 info.raid_disk = rdev->raid_disk; 7276 info.state = 0; 7277 if (test_bit(Faulty, &rdev->flags)) 7278 info.state |= (1<<MD_DISK_FAULTY); 7279 else if (test_bit(In_sync, &rdev->flags)) { 7280 info.state |= (1<<MD_DISK_ACTIVE); 7281 info.state |= (1<<MD_DISK_SYNC); 7282 } 7283 if (test_bit(Journal, &rdev->flags)) 7284 info.state |= (1<<MD_DISK_JOURNAL); 7285 if (test_bit(WriteMostly, &rdev->flags)) 7286 info.state |= (1<<MD_DISK_WRITEMOSTLY); 7287 if (test_bit(FailFast, &rdev->flags)) 7288 info.state |= (1<<MD_DISK_FAILFAST); 7289 } else { 7290 info.major = info.minor = 0; 7291 info.raid_disk = -1; 7292 info.state = (1<<MD_DISK_REMOVED); 7293 } 7294 rcu_read_unlock(); 7295 7296 if (copy_to_user(arg, &info, sizeof(info))) 7297 return -EFAULT; 7298 7299 return 0; 7300} 7301 7302int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) 7303{ 7304 struct md_rdev *rdev; 7305 dev_t dev = MKDEV(info->major,info->minor); 7306 7307 if (mddev_is_clustered(mddev) && 7308 !(info->state & ((1 << MD_DISK_CLUSTER_ADD) | (1 << MD_DISK_CANDIDATE)))) { 7309 pr_warn("%s: Cannot add to clustered mddev.\n", 7310 mdname(mddev)); 7311 return -EINVAL; 7312 } 7313 7314 if (info->major != MAJOR(dev) || info->minor != MINOR(dev)) 7315 return -EOVERFLOW; 7316 7317 if (!mddev->raid_disks) { 7318 int err; 7319 /* expecting a device which has a superblock */ 7320 rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); 7321 if (IS_ERR(rdev)) { 7322 pr_warn("md: md_import_device returned %ld\n", 7323 PTR_ERR(rdev)); 7324 return PTR_ERR(rdev); 7325 } 7326 if (!list_empty(&mddev->disks)) { 7327 struct md_rdev *rdev0 7328 = list_entry(mddev->disks.next, 7329 struct md_rdev, same_set); 7330 err = super_types[mddev->major_version] 7331 .load_super(rdev, rdev0, mddev->minor_version); 7332 if (err < 0) { 7333 pr_warn("md: %pg has different UUID to %pg\n", 7334 rdev->bdev, 7335 rdev0->bdev); 7336 export_rdev(rdev, mddev); 7337 return -EINVAL; 7338 } 7339 } 7340 err = bind_rdev_to_array(rdev, mddev); 7341 if (err) 7342 export_rdev(rdev, mddev); 7343 return err; 7344 } 7345 7346 /* 7347 * md_add_new_disk can be used once the array is assembled 7348 * to add "hot spares". 
They must already have a superblock 7349 * written 7350 */ 7351 if (mddev->pers) { 7352 int err; 7353 if (!mddev->pers->hot_add_disk) { 7354 pr_warn("%s: personality does not support diskops!\n", 7355 mdname(mddev)); 7356 return -EINVAL; 7357 } 7358 if (mddev->persistent) 7359 rdev = md_import_device(dev, mddev->major_version, 7360 mddev->minor_version); 7361 else 7362 rdev = md_import_device(dev, -1, -1); 7363 if (IS_ERR(rdev)) { 7364 pr_warn("md: md_import_device returned %ld\n", 7365 PTR_ERR(rdev)); 7366 return PTR_ERR(rdev); 7367 } 7368 /* set saved_raid_disk if appropriate */ 7369 if (!mddev->persistent) { 7370 if (info->state & (1<<MD_DISK_SYNC) && 7371 info->raid_disk < mddev->raid_disks) { 7372 rdev->raid_disk = info->raid_disk; 7373 clear_bit(Bitmap_sync, &rdev->flags); 7374 } else 7375 rdev->raid_disk = -1; 7376 rdev->saved_raid_disk = rdev->raid_disk; 7377 } else 7378 super_types[mddev->major_version]. 7379 validate_super(mddev, NULL/*freshest*/, rdev); 7380 if ((info->state & (1<<MD_DISK_SYNC)) && 7381 rdev->raid_disk != info->raid_disk) { 7382 /* This was a hot-add request, but events doesn't 7383 * match, so reject it. 7384 */ 7385 export_rdev(rdev, mddev); 7386 return -EINVAL; 7387 } 7388 7389 clear_bit(In_sync, &rdev->flags); /* just to be sure */ 7390 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 7391 set_bit(WriteMostly, &rdev->flags); 7392 else 7393 clear_bit(WriteMostly, &rdev->flags); 7394 if (info->state & (1<<MD_DISK_FAILFAST)) 7395 set_bit(FailFast, &rdev->flags); 7396 else 7397 clear_bit(FailFast, &rdev->flags); 7398 7399 if (info->state & (1<<MD_DISK_JOURNAL)) { 7400 struct md_rdev *rdev2; 7401 bool has_journal = false; 7402 7403 /* make sure no existing journal disk */ 7404 rdev_for_each(rdev2, mddev) { 7405 if (test_bit(Journal, &rdev2->flags)) { 7406 has_journal = true; 7407 break; 7408 } 7409 } 7410 if (has_journal || mddev->bitmap) { 7411 export_rdev(rdev, mddev); 7412 return -EBUSY; 7413 } 7414 set_bit(Journal, &rdev->flags); 7415 } 7416 /* 7417 * check whether the device shows up in other nodes 7418 */ 7419 if (mddev_is_clustered(mddev)) { 7420 if (info->state & (1 << MD_DISK_CANDIDATE)) 7421 set_bit(Candidate, &rdev->flags); 7422 else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) { 7423 /* --add initiated by this node */ 7424 err = mddev->cluster_ops->add_new_disk(mddev, rdev); 7425 if (err) { 7426 export_rdev(rdev, mddev); 7427 return err; 7428 } 7429 } 7430 } 7431 7432 rdev->raid_disk = -1; 7433 err = bind_rdev_to_array(rdev, mddev); 7434 7435 if (err) 7436 export_rdev(rdev, mddev); 7437 7438 if (mddev_is_clustered(mddev)) { 7439 if (info->state & (1 << MD_DISK_CANDIDATE)) { 7440 if (!err) { 7441 err = mddev->cluster_ops->new_disk_ack( 7442 mddev, err == 0); 7443 if (err) 7444 md_kick_rdev_from_array(rdev); 7445 } 7446 } else { 7447 if (err) 7448 mddev->cluster_ops->add_new_disk_cancel(mddev); 7449 else 7450 err = add_bound_rdev(rdev); 7451 } 7452 7453 } else if (!err) 7454 err = add_bound_rdev(rdev); 7455 7456 return err; 7457 } 7458 7459 /* otherwise, md_add_new_disk is only allowed 7460 * for major_version==0 superblocks 7461 */ 7462 if (mddev->major_version != 0) { 7463 pr_warn("%s: ADD_NEW_DISK not supported\n", mdname(mddev)); 7464 return -EINVAL; 7465 } 7466 7467 if (!(info->state & (1<<MD_DISK_FAULTY))) { 7468 int err; 7469 rdev = md_import_device(dev, -1, 0); 7470 if (IS_ERR(rdev)) { 7471 pr_warn("md: error, md_import_device() returned %ld\n", 7472 PTR_ERR(rdev)); 7473 return PTR_ERR(rdev); 7474 } 7475 rdev->desc_nr = info->number; 7476 if 
(info->raid_disk < mddev->raid_disks) 7477 rdev->raid_disk = info->raid_disk; 7478 else 7479 rdev->raid_disk = -1; 7480 7481 if (rdev->raid_disk < mddev->raid_disks) 7482 if (info->state & (1<<MD_DISK_SYNC)) 7483 set_bit(In_sync, &rdev->flags); 7484 7485 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 7486 set_bit(WriteMostly, &rdev->flags); 7487 if (info->state & (1<<MD_DISK_FAILFAST)) 7488 set_bit(FailFast, &rdev->flags); 7489 7490 if (!mddev->persistent) { 7491 pr_debug("md: nonpersistent superblock ...\n"); 7492 rdev->sb_start = bdev_nr_sectors(rdev->bdev); 7493 } else 7494 rdev->sb_start = calc_dev_sboffset(rdev); 7495 rdev->sectors = rdev->sb_start; 7496 7497 err = bind_rdev_to_array(rdev, mddev); 7498 if (err) { 7499 export_rdev(rdev, mddev); 7500 return err; 7501 } 7502 } 7503 7504 return 0; 7505} 7506 7507static int hot_remove_disk(struct mddev *mddev, dev_t dev) 7508{ 7509 struct md_rdev *rdev; 7510 7511 if (!mddev->pers) 7512 return -ENODEV; 7513 7514 rdev = find_rdev(mddev, dev); 7515 if (!rdev) 7516 return -ENXIO; 7517 7518 if (rdev->raid_disk < 0) 7519 goto kick_rdev; 7520 7521 clear_bit(Blocked, &rdev->flags); 7522 remove_and_add_spares(mddev, rdev); 7523 7524 if (rdev->raid_disk >= 0) 7525 goto busy; 7526 7527kick_rdev: 7528 if (mddev_is_clustered(mddev) && 7529 mddev->cluster_ops->remove_disk(mddev, rdev)) 7530 goto busy; 7531 7532 md_kick_rdev_from_array(rdev); 7533 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 7534 if (!mddev->thread) 7535 md_update_sb(mddev, 1); 7536 md_new_event(); 7537 7538 return 0; 7539busy: 7540 pr_debug("md: cannot remove active disk %pg from %s ...\n", 7541 rdev->bdev, mdname(mddev)); 7542 return -EBUSY; 7543} 7544 7545static int hot_add_disk(struct mddev *mddev, dev_t dev) 7546{ 7547 int err; 7548 struct md_rdev *rdev; 7549 7550 if (!mddev->pers) 7551 return -ENODEV; 7552 7553 if (mddev->major_version != 0) { 7554 pr_warn("%s: HOT_ADD may only be used with version-0 superblocks.\n", 7555 mdname(mddev)); 7556 return -EINVAL; 7557 } 7558 if (!mddev->pers->hot_add_disk) { 7559 pr_warn("%s: personality does not support diskops!\n", 7560 mdname(mddev)); 7561 return -EINVAL; 7562 } 7563 7564 rdev = md_import_device(dev, -1, 0); 7565 if (IS_ERR(rdev)) { 7566 pr_warn("md: error, md_import_device() returned %ld\n", 7567 PTR_ERR(rdev)); 7568 return -EINVAL; 7569 } 7570 7571 if (mddev->persistent) 7572 rdev->sb_start = calc_dev_sboffset(rdev); 7573 else 7574 rdev->sb_start = bdev_nr_sectors(rdev->bdev); 7575 7576 rdev->sectors = rdev->sb_start; 7577 7578 if (test_bit(Faulty, &rdev->flags)) { 7579 pr_warn("md: can not hot-add faulty %pg disk to %s!\n", 7580 rdev->bdev, mdname(mddev)); 7581 err = -EINVAL; 7582 goto abort_export; 7583 } 7584 7585 clear_bit(In_sync, &rdev->flags); 7586 rdev->desc_nr = -1; 7587 rdev->saved_raid_disk = -1; 7588 err = bind_rdev_to_array(rdev, mddev); 7589 if (err) 7590 goto abort_export; 7591 7592 /* 7593 * The rest should better be atomic, we can have disk failures 7594 * noticed in interrupt contexts ... 7595 */ 7596 7597 rdev->raid_disk = -1; 7598 7599 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 7600 if (!mddev->thread) 7601 md_update_sb(mddev, 1); 7602 /* 7603 * Kick recovery, maybe this spare has to be added to the 7604 * array immediately. 
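 * Setting MD_RECOVERY_NEEDED below lets the recovery path
 * (md_check_recovery() -> remove_and_add_spares()) slot the new device
 * in as a spare the next time the array's thread runs.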
7605 */ 7606 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 7607 md_new_event(); 7608 return 0; 7609 7610abort_export: 7611 export_rdev(rdev, mddev); 7612 return err; 7613} 7614 7615static int set_bitmap_file(struct mddev *mddev, int fd) 7616{ 7617 int err = 0; 7618 7619 if (!md_bitmap_registered(mddev)) 7620 return -EINVAL; 7621 7622 if (mddev->pers) { 7623 if (!mddev->pers->quiesce || !mddev->thread) 7624 return -EBUSY; 7625 if (mddev->recovery || mddev->sync_thread) 7626 return -EBUSY; 7627 /* we should be able to change the bitmap.. */ 7628 } 7629 7630 if (fd >= 0) { 7631 struct inode *inode; 7632 struct file *f; 7633 7634 if (mddev->bitmap || mddev->bitmap_info.file) 7635 return -EEXIST; /* cannot add when bitmap is present */ 7636 7637 if (!IS_ENABLED(CONFIG_MD_BITMAP_FILE)) { 7638 pr_warn("%s: bitmap files not supported by this kernel\n", 7639 mdname(mddev)); 7640 return -EINVAL; 7641 } 7642 pr_warn("%s: using deprecated bitmap file support\n", 7643 mdname(mddev)); 7644 7645 f = fget(fd); 7646 7647 if (f == NULL) { 7648 pr_warn("%s: error: failed to get bitmap file\n", 7649 mdname(mddev)); 7650 return -EBADF; 7651 } 7652 7653 inode = f->f_mapping->host; 7654 if (!S_ISREG(inode->i_mode)) { 7655 pr_warn("%s: error: bitmap file must be a regular file\n", 7656 mdname(mddev)); 7657 err = -EBADF; 7658 } else if (!(f->f_mode & FMODE_WRITE)) { 7659 pr_warn("%s: error: bitmap file must open for write\n", 7660 mdname(mddev)); 7661 err = -EBADF; 7662 } else if (atomic_read(&inode->i_writecount) != 1) { 7663 pr_warn("%s: error: bitmap file is already in use\n", 7664 mdname(mddev)); 7665 err = -EBUSY; 7666 } 7667 if (err) { 7668 fput(f); 7669 return err; 7670 } 7671 mddev->bitmap_info.file = f; 7672 mddev->bitmap_info.offset = 0; /* file overrides offset */ 7673 } else if (mddev->bitmap == NULL) 7674 return -ENOENT; /* cannot remove what isn't there */ 7675 err = 0; 7676 if (mddev->pers) { 7677 if (fd >= 0) { 7678 err = md_bitmap_create(mddev); 7679 if (!err) 7680 err = mddev->bitmap_ops->load(mddev); 7681 7682 if (err) { 7683 md_bitmap_destroy(mddev); 7684 fd = -1; 7685 } 7686 } else if (fd < 0) { 7687 md_bitmap_destroy(mddev); 7688 } 7689 } 7690 7691 if (fd < 0) { 7692 struct file *f = mddev->bitmap_info.file; 7693 if (f) { 7694 spin_lock(&mddev->lock); 7695 mddev->bitmap_info.file = NULL; 7696 spin_unlock(&mddev->lock); 7697 fput(f); 7698 } 7699 } 7700 7701 return err; 7702} 7703 7704/* 7705 * md_set_array_info is used two different ways 7706 * The original usage is when creating a new array. 7707 * In this usage, raid_disks is > 0 and it together with 7708 * level, size, not_persistent,layout,chunksize determine the 7709 * shape of the array. 7710 * This will always create an array with a type-0.90.0 superblock. 7711 * The newer usage is when assembling an array. 7712 * In this case raid_disks will be 0, and the major_version field is 7713 * use to determine which style super-blocks are to be found on the devices. 7714 * The minor and patch _version numbers are also kept incase the 7715 * super_block handler wishes to interpret them. 7716 */ 7717int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info) 7718{ 7719 if (info->raid_disks == 0) { 7720 /* just setting version number for superblock loading */ 7721 if (info->major_version < 0 || 7722 info->major_version >= ARRAY_SIZE(super_types) || 7723 super_types[info->major_version].name == NULL) { 7724 /* maybe try to auto-load a module? 
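 * (nothing is auto-loaded here today; the check simply warns and
 * fails the ioctl with -EINVAL)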
*/ 7725 pr_warn("md: superblock version %d not known\n", 7726 info->major_version); 7727 return -EINVAL; 7728 } 7729 mddev->major_version = info->major_version; 7730 mddev->minor_version = info->minor_version; 7731 mddev->patch_version = info->patch_version; 7732 mddev->persistent = !info->not_persistent; 7733 /* ensure mddev_put doesn't delete this now that there 7734 * is some minimal configuration. 7735 */ 7736 mddev->ctime = ktime_get_real_seconds(); 7737 return 0; 7738 } 7739 mddev->major_version = MD_MAJOR_VERSION; 7740 mddev->minor_version = MD_MINOR_VERSION; 7741 mddev->patch_version = MD_PATCHLEVEL_VERSION; 7742 mddev->ctime = ktime_get_real_seconds(); 7743 7744 mddev->level = info->level; 7745 mddev->clevel[0] = 0; 7746 mddev->dev_sectors = 2 * (sector_t)info->size; 7747 mddev->raid_disks = info->raid_disks; 7748 /* don't set md_minor, it is determined by which /dev/md* was 7749 * openned 7750 */ 7751 if (info->state & (1<<MD_SB_CLEAN)) 7752 mddev->resync_offset = MaxSector; 7753 else 7754 mddev->resync_offset = 0; 7755 mddev->persistent = ! info->not_persistent; 7756 mddev->external = 0; 7757 7758 mddev->layout = info->layout; 7759 if (mddev->level == 0) 7760 /* Cannot trust RAID0 layout info here */ 7761 mddev->layout = -1; 7762 mddev->chunk_sectors = info->chunk_size >> 9; 7763 7764 if (mddev->persistent) { 7765 mddev->max_disks = MD_SB_DISKS; 7766 mddev->flags = 0; 7767 mddev->sb_flags = 0; 7768 } 7769 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 7770 7771 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; 7772 mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9); 7773 mddev->bitmap_info.offset = 0; 7774 7775 mddev->reshape_position = MaxSector; 7776 7777 /* 7778 * Generate a 128 bit UUID 7779 */ 7780 get_random_bytes(mddev->uuid, 16); 7781 7782 mddev->new_level = mddev->level; 7783 mddev->new_chunk_sectors = mddev->chunk_sectors; 7784 mddev->new_layout = mddev->layout; 7785 mddev->delta_disks = 0; 7786 mddev->reshape_backwards = 0; 7787 7788 return 0; 7789} 7790 7791void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors) 7792{ 7793 lockdep_assert_held(&mddev->reconfig_mutex); 7794 7795 if (mddev->external_size) 7796 return; 7797 7798 mddev->array_sectors = array_sectors; 7799} 7800EXPORT_SYMBOL(md_set_array_sectors); 7801 7802static int update_size(struct mddev *mddev, sector_t num_sectors) 7803{ 7804 struct md_rdev *rdev; 7805 int rv; 7806 int fit = (num_sectors == 0); 7807 sector_t old_dev_sectors = mddev->dev_sectors; 7808 7809 if (mddev->pers->resize == NULL) 7810 return -EINVAL; 7811 /* The "num_sectors" is the number of sectors of each device that 7812 * is used. This can only make sense for arrays with redundancy. 7813 * linear and raid0 always use whatever space is available. We can only 7814 * consider changing this number if no resync or reconstruction is 7815 * happening, and if the new size is acceptable. It must fit before the 7816 * sb_start or, if that is <data_offset, it must fit before the size 7817 * of each device. If num_sectors is zero, we find the largest size 7818 * that fits. 
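 * The rdev loop below implements that: with num_sectors == 0 the value
 * converges on the smallest rdev->sectors in the array, while an
 * explicit request larger than any member fails with -ENOSPC.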
7819 */ 7820 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 7821 return -EBUSY; 7822 if (!md_is_rdwr(mddev)) 7823 return -EROFS; 7824 7825 rdev_for_each(rdev, mddev) { 7826 sector_t avail = rdev->sectors; 7827 7828 if (fit && (num_sectors == 0 || num_sectors > avail)) 7829 num_sectors = avail; 7830 if (avail < num_sectors) 7831 return -ENOSPC; 7832 } 7833 rv = mddev->pers->resize(mddev, num_sectors); 7834 if (!rv) { 7835 if (mddev_is_clustered(mddev)) 7836 mddev->cluster_ops->update_size(mddev, old_dev_sectors); 7837 else if (!mddev_is_dm(mddev)) 7838 set_capacity_and_notify(mddev->gendisk, 7839 mddev->array_sectors); 7840 } 7841 return rv; 7842} 7843 7844static int update_raid_disks(struct mddev *mddev, int raid_disks) 7845{ 7846 int rv; 7847 struct md_rdev *rdev; 7848 /* change the number of raid disks */ 7849 if (mddev->pers->check_reshape == NULL) 7850 return -EINVAL; 7851 if (!md_is_rdwr(mddev)) 7852 return -EROFS; 7853 if (raid_disks <= 0 || 7854 (mddev->max_disks && raid_disks >= mddev->max_disks)) 7855 return -EINVAL; 7856 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 7857 test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) || 7858 mddev->reshape_position != MaxSector) 7859 return -EBUSY; 7860 7861 rdev_for_each(rdev, mddev) { 7862 if (mddev->raid_disks < raid_disks && 7863 rdev->data_offset < rdev->new_data_offset) 7864 return -EINVAL; 7865 if (mddev->raid_disks > raid_disks && 7866 rdev->data_offset > rdev->new_data_offset) 7867 return -EINVAL; 7868 } 7869 7870 mddev->delta_disks = raid_disks - mddev->raid_disks; 7871 if (mddev->delta_disks < 0) 7872 mddev->reshape_backwards = 1; 7873 else if (mddev->delta_disks > 0) 7874 mddev->reshape_backwards = 0; 7875 7876 rv = mddev->pers->check_reshape(mddev); 7877 if (rv < 0) { 7878 mddev->delta_disks = 0; 7879 mddev->reshape_backwards = 0; 7880 } 7881 return rv; 7882} 7883 7884static int get_cluster_ops(struct mddev *mddev) 7885{ 7886 xa_lock(&md_submodule); 7887 mddev->cluster_ops = xa_load(&md_submodule, ID_CLUSTER); 7888 if (mddev->cluster_ops && 7889 !try_module_get(mddev->cluster_ops->head.owner)) 7890 mddev->cluster_ops = NULL; 7891 xa_unlock(&md_submodule); 7892 7893 return mddev->cluster_ops == NULL ? -ENOENT : 0; 7894} 7895 7896static void put_cluster_ops(struct mddev *mddev) 7897{ 7898 if (!mddev->cluster_ops) 7899 return; 7900 7901 mddev->cluster_ops->leave(mddev); 7902 module_put(mddev->cluster_ops->head.owner); 7903 mddev->cluster_ops = NULL; 7904} 7905 7906/* 7907 * update_array_info is used to change the configuration of an 7908 * on-line array. 7909 * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size 7910 * fields in the info are checked against the array. 7911 * Any differences that cannot be handled will cause an error. 7912 * Normally, only one change can be managed at a time. 
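 * "One change" is counted below over four candidates: device size,
 * raid_disks, layout and bitmap presence. cnt == 0 is a no-op and
 * cnt > 1 is rejected with -EINVAL.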
7913 */ 7914static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) 7915{ 7916 int rv = 0; 7917 int cnt = 0; 7918 int state = 0; 7919 7920 /* calculate expected state,ignoring low bits */ 7921 if (mddev->bitmap && mddev->bitmap_info.offset) 7922 state |= (1 << MD_SB_BITMAP_PRESENT); 7923 7924 if (mddev->major_version != info->major_version || 7925 mddev->minor_version != info->minor_version || 7926/* mddev->patch_version != info->patch_version || */ 7927 mddev->ctime != info->ctime || 7928 mddev->level != info->level || 7929/* mddev->layout != info->layout || */ 7930 mddev->persistent != !info->not_persistent || 7931 mddev->chunk_sectors != info->chunk_size >> 9 || 7932 /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */ 7933 ((state^info->state) & 0xfffffe00) 7934 ) 7935 return -EINVAL; 7936 /* Check there is only one change */ 7937 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) 7938 cnt++; 7939 if (mddev->raid_disks != info->raid_disks) 7940 cnt++; 7941 if (mddev->layout != info->layout) 7942 cnt++; 7943 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) 7944 cnt++; 7945 if (cnt == 0) 7946 return 0; 7947 if (cnt > 1) 7948 return -EINVAL; 7949 7950 if (mddev->layout != info->layout) { 7951 /* Change layout 7952 * we don't need to do anything at the md level, the 7953 * personality will take care of it all. 7954 */ 7955 if (mddev->pers->check_reshape == NULL) 7956 return -EINVAL; 7957 else { 7958 mddev->new_layout = info->layout; 7959 rv = mddev->pers->check_reshape(mddev); 7960 if (rv) 7961 mddev->new_layout = mddev->layout; 7962 return rv; 7963 } 7964 } 7965 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) 7966 rv = update_size(mddev, (sector_t)info->size * 2); 7967 7968 if (mddev->raid_disks != info->raid_disks) 7969 rv = update_raid_disks(mddev, info->raid_disks); 7970 7971 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) { 7972 if (mddev->pers->quiesce == NULL || mddev->thread == NULL) { 7973 rv = -EINVAL; 7974 goto err; 7975 } 7976 if (mddev->recovery || mddev->sync_thread) { 7977 rv = -EBUSY; 7978 goto err; 7979 } 7980 if (info->state & (1<<MD_SB_BITMAP_PRESENT)) { 7981 /* add the bitmap */ 7982 if (mddev->bitmap) { 7983 rv = -EEXIST; 7984 goto err; 7985 } 7986 if (mddev->bitmap_info.default_offset == 0) { 7987 rv = -EINVAL; 7988 goto err; 7989 } 7990 mddev->bitmap_info.offset = 7991 mddev->bitmap_info.default_offset; 7992 mddev->bitmap_info.space = 7993 mddev->bitmap_info.default_space; 7994 rv = md_bitmap_create(mddev); 7995 if (!rv) 7996 rv = mddev->bitmap_ops->load(mddev); 7997 7998 if (rv) 7999 md_bitmap_destroy(mddev); 8000 } else { 8001 struct md_bitmap_stats stats; 8002 8003 rv = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); 8004 if (rv) 8005 goto err; 8006 8007 if (stats.file) { 8008 rv = -EINVAL; 8009 goto err; 8010 } 8011 8012 if (mddev->bitmap_info.nodes) { 8013 /* hold PW on all the bitmap lock */ 8014 if (mddev->cluster_ops->lock_all_bitmaps(mddev) <= 0) { 8015 pr_warn("md: can't change bitmap to none since the array is in use by more than one node\n"); 8016 rv = -EPERM; 8017 mddev->cluster_ops->unlock_all_bitmaps(mddev); 8018 goto err; 8019 } 8020 8021 mddev->bitmap_info.nodes = 0; 8022 put_cluster_ops(mddev); 8023 mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY; 8024 } 8025 md_bitmap_destroy(mddev); 8026 mddev->bitmap_info.offset = 0; 8027 } 8028 } 8029 md_update_sb(mddev, 1); 8030 return rv; 8031err: 8032 return rv; 8033} 8034 8035static int set_disk_faulty(struct mddev *mddev, dev_t 
dev) 8036{ 8037 struct md_rdev *rdev; 8038 int err = 0; 8039 8040 if (mddev->pers == NULL) 8041 return -ENODEV; 8042 8043 rcu_read_lock(); 8044 rdev = md_find_rdev_rcu(mddev, dev); 8045 if (!rdev) 8046 err = -ENODEV; 8047 else { 8048 md_error(mddev, rdev); 8049 if (test_bit(MD_BROKEN, &mddev->flags)) 8050 err = -EBUSY; 8051 } 8052 rcu_read_unlock(); 8053 return err; 8054} 8055 8056/* 8057 * We have a problem here : there is no easy way to give a CHS 8058 * virtual geometry. We currently pretend that we have a 2 heads 8059 * 4 sectors (with a BIG number of cylinders...). This drives 8060 * dosfs just mad... ;-) 8061 */ 8062static int md_getgeo(struct gendisk *disk, struct hd_geometry *geo) 8063{ 8064 struct mddev *mddev = disk->private_data; 8065 8066 geo->heads = 2; 8067 geo->sectors = 4; 8068 geo->cylinders = mddev->array_sectors / 8; 8069 return 0; 8070} 8071 8072static inline int md_ioctl_valid(unsigned int cmd) 8073{ 8074 switch (cmd) { 8075 case GET_ARRAY_INFO: 8076 case GET_DISK_INFO: 8077 case RAID_VERSION: 8078 return 0; 8079 case ADD_NEW_DISK: 8080 case GET_BITMAP_FILE: 8081 case HOT_ADD_DISK: 8082 case HOT_REMOVE_DISK: 8083 case RESTART_ARRAY_RW: 8084 case RUN_ARRAY: 8085 case SET_ARRAY_INFO: 8086 case SET_BITMAP_FILE: 8087 case SET_DISK_FAULTY: 8088 case STOP_ARRAY: 8089 case STOP_ARRAY_RO: 8090 case CLUSTERED_DISK_NACK: 8091 if (!capable(CAP_SYS_ADMIN)) 8092 return -EACCES; 8093 return 0; 8094 default: 8095 return -ENOTTY; 8096 } 8097} 8098 8099static bool md_ioctl_need_suspend(unsigned int cmd) 8100{ 8101 switch (cmd) { 8102 case ADD_NEW_DISK: 8103 case HOT_ADD_DISK: 8104 case HOT_REMOVE_DISK: 8105 case SET_BITMAP_FILE: 8106 case SET_ARRAY_INFO: 8107 return true; 8108 default: 8109 return false; 8110 } 8111} 8112 8113static int __md_set_array_info(struct mddev *mddev, void __user *argp) 8114{ 8115 mdu_array_info_t info; 8116 int err; 8117 8118 if (!argp) 8119 memset(&info, 0, sizeof(info)); 8120 else if (copy_from_user(&info, argp, sizeof(info))) 8121 return -EFAULT; 8122 8123 if (mddev->pers) { 8124 err = update_array_info(mddev, &info); 8125 if (err) 8126 pr_warn("md: couldn't update array info. %d\n", err); 8127 return err; 8128 } 8129 8130 if (!list_empty(&mddev->disks)) { 8131 pr_warn("md: array %s already has disks!\n", mdname(mddev)); 8132 return -EBUSY; 8133 } 8134 8135 if (mddev->raid_disks) { 8136 pr_warn("md: array %s already initialised!\n", mdname(mddev)); 8137 return -EBUSY; 8138 } 8139 8140 err = md_set_array_info(mddev, &info); 8141 if (err) 8142 pr_warn("md: couldn't set array info. 
%d\n", err); 8143 8144 return err; 8145} 8146 8147static int md_ioctl(struct block_device *bdev, blk_mode_t mode, 8148 unsigned int cmd, unsigned long arg) 8149{ 8150 int err = 0; 8151 void __user *argp = (void __user *)arg; 8152 struct mddev *mddev = NULL; 8153 8154 err = md_ioctl_valid(cmd); 8155 if (err) 8156 return err; 8157 8158 /* 8159 * Commands dealing with the RAID driver but not any 8160 * particular array: 8161 */ 8162 if (cmd == RAID_VERSION) 8163 return get_version(argp); 8164 8165 /* 8166 * Commands creating/starting a new array: 8167 */ 8168 8169 mddev = bdev->bd_disk->private_data; 8170 8171 /* Some actions do not requires the mutex */ 8172 switch (cmd) { 8173 case GET_ARRAY_INFO: 8174 if (!mddev->raid_disks && !mddev->external) 8175 return -ENODEV; 8176 return get_array_info(mddev, argp); 8177 8178 case GET_DISK_INFO: 8179 if (!mddev->raid_disks && !mddev->external) 8180 return -ENODEV; 8181 return get_disk_info(mddev, argp); 8182 8183 case SET_DISK_FAULTY: 8184 return set_disk_faulty(mddev, new_decode_dev(arg)); 8185 8186 case GET_BITMAP_FILE: 8187 return get_bitmap_file(mddev, argp); 8188 } 8189 8190 if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) { 8191 /* Need to flush page cache, and ensure no-one else opens 8192 * and writes 8193 */ 8194 err = mddev_set_closing_and_sync_blockdev(mddev, 1); 8195 if (err) 8196 return err; 8197 } 8198 8199 if (!md_is_rdwr(mddev)) 8200 flush_work(&mddev->sync_work); 8201 8202 err = md_ioctl_need_suspend(cmd) ? mddev_suspend_and_lock(mddev) : 8203 mddev_lock(mddev); 8204 if (err) { 8205 pr_debug("md: ioctl lock interrupted, reason %d, cmd %d\n", 8206 err, cmd); 8207 goto out; 8208 } 8209 8210 if (cmd == SET_ARRAY_INFO) { 8211 err = __md_set_array_info(mddev, argp); 8212 goto unlock; 8213 } 8214 8215 /* 8216 * Commands querying/configuring an existing array: 8217 */ 8218 /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY, 8219 * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */ 8220 if ((!mddev->raid_disks && !mddev->external) 8221 && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY 8222 && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE 8223 && cmd != GET_BITMAP_FILE) { 8224 err = -ENODEV; 8225 goto unlock; 8226 } 8227 8228 /* 8229 * Commands even a read-only array can execute: 8230 */ 8231 switch (cmd) { 8232 case RESTART_ARRAY_RW: 8233 err = restart_array(mddev); 8234 goto unlock; 8235 8236 case STOP_ARRAY: 8237 err = do_md_stop(mddev, 0); 8238 goto unlock; 8239 8240 case STOP_ARRAY_RO: 8241 if (mddev->pers) 8242 err = md_set_readonly(mddev); 8243 goto unlock; 8244 8245 case HOT_REMOVE_DISK: 8246 err = hot_remove_disk(mddev, new_decode_dev(arg)); 8247 goto unlock; 8248 8249 case ADD_NEW_DISK: 8250 /* We can support ADD_NEW_DISK on read-only arrays 8251 * only if we are re-adding a preexisting device. 8252 * So require mddev->pers and MD_DISK_SYNC. 8253 */ 8254 if (mddev->pers) { 8255 mdu_disk_info_t info; 8256 if (copy_from_user(&info, argp, sizeof(info))) 8257 err = -EFAULT; 8258 else if (!(info.state & (1<<MD_DISK_SYNC))) 8259 /* Need to clear read-only for this */ 8260 break; 8261 else 8262 err = md_add_new_disk(mddev, &info); 8263 goto unlock; 8264 } 8265 break; 8266 } 8267 8268 /* 8269 * The remaining ioctls are changing the state of the 8270 * superblock, so we do not allow them on read-only arrays. 
8271 */ 8272 if (!md_is_rdwr(mddev) && mddev->pers) { 8273 if (mddev->ro != MD_AUTO_READ) { 8274 err = -EROFS; 8275 goto unlock; 8276 } 8277 mddev->ro = MD_RDWR; 8278 sysfs_notify_dirent_safe(mddev->sysfs_state); 8279 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 8280 /* mddev_unlock will wake thread */ 8281 /* If a device failed while we were read-only, we 8282 * need to make sure the metadata is updated now. 8283 */ 8284 if (test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) { 8285 mddev_unlock(mddev); 8286 wait_event(mddev->sb_wait, 8287 !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) && 8288 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); 8289 mddev_lock_nointr(mddev); 8290 } 8291 } 8292 8293 switch (cmd) { 8294 case ADD_NEW_DISK: 8295 { 8296 mdu_disk_info_t info; 8297 if (copy_from_user(&info, argp, sizeof(info))) 8298 err = -EFAULT; 8299 else 8300 err = md_add_new_disk(mddev, &info); 8301 goto unlock; 8302 } 8303 8304 case CLUSTERED_DISK_NACK: 8305 if (mddev_is_clustered(mddev)) 8306 mddev->cluster_ops->new_disk_ack(mddev, false); 8307 else 8308 err = -EINVAL; 8309 goto unlock; 8310 8311 case HOT_ADD_DISK: 8312 err = hot_add_disk(mddev, new_decode_dev(arg)); 8313 goto unlock; 8314 8315 case RUN_ARRAY: 8316 err = do_md_run(mddev); 8317 goto unlock; 8318 8319 case SET_BITMAP_FILE: 8320 err = set_bitmap_file(mddev, (int)arg); 8321 goto unlock; 8322 8323 default: 8324 err = -EINVAL; 8325 goto unlock; 8326 } 8327 8328unlock: 8329 if (mddev->hold_active == UNTIL_IOCTL && 8330 err != -EINVAL) 8331 mddev->hold_active = 0; 8332 8333 md_ioctl_need_suspend(cmd) ? mddev_unlock_and_resume(mddev) : 8334 mddev_unlock(mddev); 8335 8336out: 8337 if (cmd == STOP_ARRAY_RO || (err && cmd == STOP_ARRAY)) 8338 clear_bit(MD_CLOSING, &mddev->flags); 8339 return err; 8340} 8341#ifdef CONFIG_COMPAT 8342static int md_compat_ioctl(struct block_device *bdev, blk_mode_t mode, 8343 unsigned int cmd, unsigned long arg) 8344{ 8345 switch (cmd) { 8346 case HOT_REMOVE_DISK: 8347 case HOT_ADD_DISK: 8348 case SET_DISK_FAULTY: 8349 case SET_BITMAP_FILE: 8350 /* These take in integer arg, do not convert */ 8351 break; 8352 default: 8353 arg = (unsigned long)compat_ptr(arg); 8354 break; 8355 } 8356 8357 return md_ioctl(bdev, mode, cmd, arg); 8358} 8359#endif /* CONFIG_COMPAT */ 8360 8361static int md_set_read_only(struct block_device *bdev, bool ro) 8362{ 8363 struct mddev *mddev = bdev->bd_disk->private_data; 8364 int err; 8365 8366 err = mddev_lock(mddev); 8367 if (err) 8368 return err; 8369 8370 if (!mddev->raid_disks && !mddev->external) { 8371 err = -ENODEV; 8372 goto out_unlock; 8373 } 8374 8375 /* 8376 * Transitioning to read-auto need only happen for arrays that call 8377 * md_write_start and which are not ready for writes yet. 
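 *
 * This is the block layer's ->set_read_only() hook (reached via the
 * BLKROSET ioctl, e.g. "blockdev --setrw /dev/mdX"); the special handling
 * below is only needed when read-only is being cleared.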
8378 */ 8379 if (!ro && mddev->ro == MD_RDONLY && mddev->pers) { 8380 err = restart_array(mddev); 8381 if (err) 8382 goto out_unlock; 8383 mddev->ro = MD_AUTO_READ; 8384 } 8385 8386out_unlock: 8387 mddev_unlock(mddev); 8388 return err; 8389} 8390 8391static int md_open(struct gendisk *disk, blk_mode_t mode) 8392{ 8393 struct mddev *mddev; 8394 int err; 8395 8396 spin_lock(&all_mddevs_lock); 8397 mddev = mddev_get(disk->private_data); 8398 spin_unlock(&all_mddevs_lock); 8399 if (!mddev) 8400 return -ENODEV; 8401 8402 err = mutex_lock_interruptible(&mddev->open_mutex); 8403 if (err) 8404 goto out; 8405 8406 err = -ENODEV; 8407 if (test_bit(MD_CLOSING, &mddev->flags)) 8408 goto out_unlock; 8409 8410 atomic_inc(&mddev->openers); 8411 mutex_unlock(&mddev->open_mutex); 8412 8413 disk_check_media_change(disk); 8414 return 0; 8415 8416out_unlock: 8417 mutex_unlock(&mddev->open_mutex); 8418out: 8419 mddev_put(mddev); 8420 return err; 8421} 8422 8423static void md_release(struct gendisk *disk) 8424{ 8425 struct mddev *mddev = disk->private_data; 8426 8427 BUG_ON(!mddev); 8428 atomic_dec(&mddev->openers); 8429 mddev_put(mddev); 8430} 8431 8432static unsigned int md_check_events(struct gendisk *disk, unsigned int clearing) 8433{ 8434 struct mddev *mddev = disk->private_data; 8435 unsigned int ret = 0; 8436 8437 if (mddev->changed) 8438 ret = DISK_EVENT_MEDIA_CHANGE; 8439 mddev->changed = 0; 8440 return ret; 8441} 8442 8443static void md_free_disk(struct gendisk *disk) 8444{ 8445 struct mddev *mddev = disk->private_data; 8446 8447 mddev_free(mddev); 8448} 8449 8450const struct block_device_operations md_fops = 8451{ 8452 .owner = THIS_MODULE, 8453 .submit_bio = md_submit_bio, 8454 .open = md_open, 8455 .release = md_release, 8456 .ioctl = md_ioctl, 8457#ifdef CONFIG_COMPAT 8458 .compat_ioctl = md_compat_ioctl, 8459#endif 8460 .getgeo = md_getgeo, 8461 .check_events = md_check_events, 8462 .set_read_only = md_set_read_only, 8463 .free_disk = md_free_disk, 8464}; 8465 8466static int md_thread(void *arg) 8467{ 8468 struct md_thread *thread = arg; 8469 8470 /* 8471 * md_thread is a 'system-thread', it's priority should be very 8472 * high. We avoid resource deadlocks individually in each 8473 * raid personality. (RAID5 does preallocation) We also use RR and 8474 * the very same RT priority as kswapd, thus we will never get 8475 * into a priority inversion deadlock. 8476 * 8477 * we definitely have to have equal or higher priority than 8478 * bdflush, otherwise bdflush will deadlock if there are too 8479 * many dirty RAID5 blocks. 8480 */ 8481 8482 allow_signal(SIGKILL); 8483 while (!kthread_should_stop()) { 8484 8485 /* We need to wait INTERRUPTIBLE so that 8486 * we don't add to the load-average. 
8487 * That means we need to be sure no signals are 8488 * pending 8489 */ 8490 if (signal_pending(current)) 8491 flush_signals(current); 8492 8493 wait_event_interruptible_timeout 8494 (thread->wqueue, 8495 test_bit(THREAD_WAKEUP, &thread->flags) 8496 || kthread_should_stop() || kthread_should_park(), 8497 thread->timeout); 8498 8499 clear_bit(THREAD_WAKEUP, &thread->flags); 8500 if (kthread_should_park()) 8501 kthread_parkme(); 8502 if (!kthread_should_stop()) 8503 thread->run(thread); 8504 } 8505 8506 return 0; 8507} 8508 8509static void md_wakeup_thread_directly(struct md_thread __rcu **thread) 8510{ 8511 struct md_thread *t; 8512 8513 rcu_read_lock(); 8514 t = rcu_dereference(*thread); 8515 if (t) 8516 wake_up_process(t->tsk); 8517 rcu_read_unlock(); 8518} 8519 8520void __md_wakeup_thread(struct md_thread __rcu *thread) 8521{ 8522 struct md_thread *t; 8523 8524 t = rcu_dereference(thread); 8525 if (t) { 8526 pr_debug("md: waking up MD thread %s.\n", t->tsk->comm); 8527 set_bit(THREAD_WAKEUP, &t->flags); 8528 if (wq_has_sleeper(&t->wqueue)) 8529 wake_up(&t->wqueue); 8530 } 8531} 8532EXPORT_SYMBOL(__md_wakeup_thread); 8533 8534struct md_thread *md_register_thread(void (*run) (struct md_thread *), 8535 struct mddev *mddev, const char *name) 8536{ 8537 struct md_thread *thread; 8538 8539 thread = kzalloc(sizeof(struct md_thread), GFP_KERNEL); 8540 if (!thread) 8541 return NULL; 8542 8543 init_waitqueue_head(&thread->wqueue); 8544 8545 thread->run = run; 8546 thread->mddev = mddev; 8547 thread->timeout = MAX_SCHEDULE_TIMEOUT; 8548 thread->tsk = kthread_run(md_thread, thread, 8549 "%s_%s", 8550 mdname(thread->mddev), 8551 name); 8552 if (IS_ERR(thread->tsk)) { 8553 kfree(thread); 8554 return NULL; 8555 } 8556 return thread; 8557} 8558EXPORT_SYMBOL(md_register_thread); 8559 8560void md_unregister_thread(struct mddev *mddev, struct md_thread __rcu **threadp) 8561{ 8562 struct md_thread *thread = rcu_dereference_protected(*threadp, 8563 lockdep_is_held(&mddev->reconfig_mutex)); 8564 8565 if (!thread) 8566 return; 8567 8568 rcu_assign_pointer(*threadp, NULL); 8569 synchronize_rcu(); 8570 8571 pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk)); 8572 kthread_stop(thread->tsk); 8573 kfree(thread); 8574} 8575EXPORT_SYMBOL(md_unregister_thread); 8576 8577void md_error(struct mddev *mddev, struct md_rdev *rdev) 8578{ 8579 if (!rdev || test_bit(Faulty, &rdev->flags)) 8580 return; 8581 8582 if (!mddev->pers || !mddev->pers->error_handler) 8583 return; 8584 mddev->pers->error_handler(mddev, rdev); 8585 8586 if (mddev->pers->head.id == ID_RAID0 || 8587 mddev->pers->head.id == ID_LINEAR) 8588 return; 8589 8590 if (mddev->degraded && !test_bit(MD_BROKEN, &mddev->flags)) 8591 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 8592 sysfs_notify_dirent_safe(rdev->sysfs_state); 8593 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 8594 if (!test_bit(MD_BROKEN, &mddev->flags)) { 8595 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 8596 md_wakeup_thread(mddev->thread); 8597 } 8598 if (mddev->event_work.func) 8599 queue_work(md_misc_wq, &mddev->event_work); 8600 md_new_event(); 8601} 8602EXPORT_SYMBOL(md_error); 8603 8604/* seq_file implementation /proc/mdstat */ 8605 8606static void status_unused(struct seq_file *seq) 8607{ 8608 int i = 0; 8609 struct md_rdev *rdev; 8610 8611 seq_printf(seq, "unused devices: "); 8612 8613 list_for_each_entry(rdev, &pending_raid_disks, same_set) { 8614 i++; 8615 seq_printf(seq, "%pg ", rdev->bdev); 8616 } 8617 if (!i) 8618 seq_printf(seq, "<none>"); 8619 8620 
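	/*
	 * The resulting /proc/mdstat tail looks like (device names are
	 * illustrative):
	 *   unused devices: sdc1 sdd1
	 * or, when the list is empty:
	 *   unused devices: <none>
	 */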
seq_printf(seq, "\n"); 8621} 8622 8623static void status_personalities(struct seq_file *seq) 8624{ 8625 struct md_submodule_head *head; 8626 unsigned long i; 8627 8628 seq_puts(seq, "Personalities : "); 8629 8630 xa_lock(&md_submodule); 8631 xa_for_each(&md_submodule, i, head) 8632 if (head->type == MD_PERSONALITY) 8633 seq_printf(seq, "[%s] ", head->name); 8634 xa_unlock(&md_submodule); 8635 8636 seq_puts(seq, "\n"); 8637} 8638 8639static int status_resync(struct seq_file *seq, struct mddev *mddev) 8640{ 8641 sector_t max_sectors, resync, res; 8642 unsigned long dt, db = 0; 8643 sector_t rt, curr_mark_cnt, resync_mark_cnt; 8644 int scale, recovery_active; 8645 unsigned int per_milli; 8646 8647 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 8648 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 8649 max_sectors = mddev->resync_max_sectors; 8650 else 8651 max_sectors = mddev->dev_sectors; 8652 8653 resync = mddev->curr_resync; 8654 if (resync < MD_RESYNC_ACTIVE) { 8655 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) 8656 /* Still cleaning up */ 8657 resync = max_sectors; 8658 } else if (resync > max_sectors) { 8659 resync = max_sectors; 8660 } else { 8661 res = atomic_read(&mddev->recovery_active); 8662 /* 8663 * Resync has started, but the subtraction has overflowed or 8664 * yielded one of the special values. Force it to active to 8665 * ensure the status reports an active resync. 8666 */ 8667 if (resync < res || resync - res < MD_RESYNC_ACTIVE) 8668 resync = MD_RESYNC_ACTIVE; 8669 else 8670 resync -= res; 8671 } 8672 8673 if (resync == MD_RESYNC_NONE) { 8674 if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery)) { 8675 struct md_rdev *rdev; 8676 8677 rdev_for_each(rdev, mddev) 8678 if (rdev->raid_disk >= 0 && 8679 !test_bit(Faulty, &rdev->flags) && 8680 rdev->recovery_offset != MaxSector && 8681 rdev->recovery_offset) { 8682 seq_printf(seq, "\trecover=REMOTE"); 8683 return 1; 8684 } 8685 if (mddev->reshape_position != MaxSector) 8686 seq_printf(seq, "\treshape=REMOTE"); 8687 else 8688 seq_printf(seq, "\tresync=REMOTE"); 8689 return 1; 8690 } 8691 if (mddev->resync_offset < MaxSector) { 8692 seq_printf(seq, "\tresync=PENDING"); 8693 return 1; 8694 } 8695 return 0; 8696 } 8697 if (resync < MD_RESYNC_ACTIVE) { 8698 seq_printf(seq, "\tresync=DELAYED"); 8699 return 1; 8700 } 8701 8702 WARN_ON(max_sectors == 0); 8703 /* Pick 'scale' such that (resync>>scale)*1000 will fit 8704 * in a sector_t, and (max_sectors>>scale) will fit in a 8705 * u32, as those are the requirements for sector_div. 8706 * Thus 'scale' must be at least 10 8707 */ 8708 scale = 10; 8709 if (sizeof(sector_t) > sizeof(unsigned long)) { 8710 while ( max_sectors/2 > (1ULL<<(scale+32))) 8711 scale++; 8712 } 8713 res = (resync>>scale)*1000; 8714 sector_div(res, (u32)((max_sectors>>scale)+1)); 8715 8716 per_milli = res; 8717 { 8718 int i, x = per_milli/50, y = 20-x; 8719 seq_printf(seq, "["); 8720 for (i = 0; i < x; i++) 8721 seq_printf(seq, "="); 8722 seq_printf(seq, ">"); 8723 for (i = 0; i < y; i++) 8724 seq_printf(seq, "."); 8725 seq_printf(seq, "] "); 8726 } 8727 seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)", 8728 (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)? 8729 "reshape" : 8730 (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)? 8731 "check" : 8732 (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? 
8733 "resync" : "recovery"))), 8734 per_milli/10, per_milli % 10, 8735 (unsigned long long) resync/2, 8736 (unsigned long long) max_sectors/2); 8737 8738 /* 8739 * dt: time from mark until now 8740 * db: blocks written from mark until now 8741 * rt: remaining time 8742 * 8743 * rt is a sector_t, which is always 64bit now. We are keeping 8744 * the original algorithm, but it is not really necessary. 8745 * 8746 * Original algorithm: 8747 * So we divide before multiply in case it is 32bit and close 8748 * to the limit. 8749 * We scale the divisor (db) by 32 to avoid losing precision 8750 * near the end of resync when the number of remaining sectors 8751 * is close to 'db'. 8752 * We then divide rt by 32 after multiplying by db to compensate. 8753 * The '+1' avoids division by zero if db is very small. 8754 */ 8755 dt = ((jiffies - mddev->resync_mark) / HZ); 8756 if (!dt) dt++; 8757 8758 curr_mark_cnt = mddev->curr_mark_cnt; 8759 recovery_active = atomic_read(&mddev->recovery_active); 8760 resync_mark_cnt = mddev->resync_mark_cnt; 8761 8762 if (curr_mark_cnt >= (recovery_active + resync_mark_cnt)) 8763 db = curr_mark_cnt - (recovery_active + resync_mark_cnt); 8764 8765 rt = max_sectors - resync; /* number of remaining sectors */ 8766 rt = div64_u64(rt, db/32+1); 8767 rt *= dt; 8768 rt >>= 5; 8769 8770 seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60, 8771 ((unsigned long)rt % 60)/6); 8772 8773 seq_printf(seq, " speed=%ldK/sec", db/2/dt); 8774 return 1; 8775} 8776 8777static void *md_seq_start(struct seq_file *seq, loff_t *pos) 8778 __acquires(&all_mddevs_lock) 8779{ 8780 seq->poll_event = atomic_read(&md_event_count); 8781 spin_lock(&all_mddevs_lock); 8782 8783 return seq_list_start_head(&all_mddevs, *pos); 8784} 8785 8786static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) 8787{ 8788 return seq_list_next(v, &all_mddevs, pos); 8789} 8790 8791static void md_seq_stop(struct seq_file *seq, void *v) 8792 __releases(&all_mddevs_lock) 8793{ 8794 spin_unlock(&all_mddevs_lock); 8795} 8796 8797static void md_bitmap_status(struct seq_file *seq, struct mddev *mddev) 8798{ 8799 struct md_bitmap_stats stats; 8800 unsigned long used_pages; 8801 unsigned long chunk_kb; 8802 int err; 8803 8804 if (!md_bitmap_enabled(mddev, false)) 8805 return; 8806 8807 err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); 8808 if (err) 8809 return; 8810 8811 chunk_kb = mddev->bitmap_info.chunksize >> 10; 8812 used_pages = stats.pages - stats.missing_pages; 8813 8814 seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], %lu%s chunk", 8815 used_pages, stats.pages, used_pages << (PAGE_SHIFT - 10), 8816 chunk_kb ? chunk_kb : mddev->bitmap_info.chunksize, 8817 chunk_kb ? 
"KB" : "B"); 8818 8819 if (stats.file) { 8820 seq_puts(seq, ", file: "); 8821 seq_file_path(seq, stats.file, " \t\n"); 8822 } 8823 8824 seq_putc(seq, '\n'); 8825} 8826 8827static int md_seq_show(struct seq_file *seq, void *v) 8828{ 8829 struct mddev *mddev; 8830 sector_t sectors; 8831 struct md_rdev *rdev; 8832 8833 if (v == &all_mddevs) { 8834 status_personalities(seq); 8835 if (list_empty(&all_mddevs)) 8836 status_unused(seq); 8837 return 0; 8838 } 8839 8840 mddev = list_entry(v, struct mddev, all_mddevs); 8841 if (!mddev_get(mddev)) 8842 return 0; 8843 8844 spin_unlock(&all_mddevs_lock); 8845 8846 /* prevent bitmap to be freed after checking */ 8847 mutex_lock(&mddev->bitmap_info.mutex); 8848 8849 spin_lock(&mddev->lock); 8850 if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { 8851 seq_printf(seq, "%s : ", mdname(mddev)); 8852 if (mddev->pers) { 8853 if (test_bit(MD_BROKEN, &mddev->flags)) 8854 seq_printf(seq, "broken"); 8855 else 8856 seq_printf(seq, "active"); 8857 if (mddev->ro == MD_RDONLY) 8858 seq_printf(seq, " (read-only)"); 8859 if (mddev->ro == MD_AUTO_READ) 8860 seq_printf(seq, " (auto-read-only)"); 8861 seq_printf(seq, " %s", mddev->pers->head.name); 8862 } else { 8863 seq_printf(seq, "inactive"); 8864 } 8865 8866 sectors = 0; 8867 rcu_read_lock(); 8868 rdev_for_each_rcu(rdev, mddev) { 8869 seq_printf(seq, " %pg[%d]", rdev->bdev, rdev->desc_nr); 8870 8871 if (test_bit(WriteMostly, &rdev->flags)) 8872 seq_printf(seq, "(W)"); 8873 if (test_bit(Journal, &rdev->flags)) 8874 seq_printf(seq, "(J)"); 8875 if (test_bit(Faulty, &rdev->flags)) { 8876 seq_printf(seq, "(F)"); 8877 continue; 8878 } 8879 if (rdev->raid_disk < 0) 8880 seq_printf(seq, "(S)"); /* spare */ 8881 if (test_bit(Replacement, &rdev->flags)) 8882 seq_printf(seq, "(R)"); 8883 sectors += rdev->sectors; 8884 } 8885 rcu_read_unlock(); 8886 8887 if (!list_empty(&mddev->disks)) { 8888 if (mddev->pers) 8889 seq_printf(seq, "\n %llu blocks", 8890 (unsigned long long) 8891 mddev->array_sectors / 2); 8892 else 8893 seq_printf(seq, "\n %llu blocks", 8894 (unsigned long long)sectors / 2); 8895 } 8896 if (mddev->persistent) { 8897 if (mddev->major_version != 0 || 8898 mddev->minor_version != 90) { 8899 seq_printf(seq," super %d.%d", 8900 mddev->major_version, 8901 mddev->minor_version); 8902 } 8903 } else if (mddev->external) 8904 seq_printf(seq, " super external:%s", 8905 mddev->metadata_type); 8906 else 8907 seq_printf(seq, " super non-persistent"); 8908 8909 if (mddev->pers) { 8910 mddev->pers->status(seq, mddev); 8911 seq_printf(seq, "\n "); 8912 if (mddev->pers->sync_request) { 8913 if (status_resync(seq, mddev)) 8914 seq_printf(seq, "\n "); 8915 } 8916 } else 8917 seq_printf(seq, "\n "); 8918 8919 md_bitmap_status(seq, mddev); 8920 8921 seq_printf(seq, "\n"); 8922 } 8923 spin_unlock(&mddev->lock); 8924 mutex_unlock(&mddev->bitmap_info.mutex); 8925 spin_lock(&all_mddevs_lock); 8926 8927 if (mddev == list_last_entry(&all_mddevs, struct mddev, all_mddevs)) 8928 status_unused(seq); 8929 8930 mddev_put_locked(mddev); 8931 return 0; 8932} 8933 8934static const struct seq_operations md_seq_ops = { 8935 .start = md_seq_start, 8936 .next = md_seq_next, 8937 .stop = md_seq_stop, 8938 .show = md_seq_show, 8939}; 8940 8941static int md_seq_open(struct inode *inode, struct file *file) 8942{ 8943 struct seq_file *seq; 8944 int error; 8945 8946 error = seq_open(file, &md_seq_ops); 8947 if (error) 8948 return error; 8949 8950 seq = file->private_data; 8951 seq->poll_event = atomic_read(&md_event_count); 8952 return 
error; 8953} 8954 8955static int md_unloading; 8956static __poll_t mdstat_poll(struct file *filp, poll_table *wait) 8957{ 8958 struct seq_file *seq = filp->private_data; 8959 __poll_t mask; 8960 8961 if (md_unloading) 8962 return EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI; 8963 poll_wait(filp, &md_event_waiters, wait); 8964 8965 /* always allow read */ 8966 mask = EPOLLIN | EPOLLRDNORM; 8967 8968 if (seq->poll_event != atomic_read(&md_event_count)) 8969 mask |= EPOLLERR | EPOLLPRI; 8970 return mask; 8971} 8972 8973static const struct proc_ops mdstat_proc_ops = { 8974 .proc_open = md_seq_open, 8975 .proc_read = seq_read, 8976 .proc_lseek = seq_lseek, 8977 .proc_release = seq_release, 8978 .proc_poll = mdstat_poll, 8979}; 8980 8981int register_md_submodule(struct md_submodule_head *msh) 8982{ 8983 return xa_insert(&md_submodule, msh->id, msh, GFP_KERNEL); 8984} 8985EXPORT_SYMBOL_GPL(register_md_submodule); 8986 8987void unregister_md_submodule(struct md_submodule_head *msh) 8988{ 8989 xa_erase(&md_submodule, msh->id); 8990} 8991EXPORT_SYMBOL_GPL(unregister_md_submodule); 8992 8993int md_setup_cluster(struct mddev *mddev, int nodes) 8994{ 8995 int ret = get_cluster_ops(mddev); 8996 8997 if (ret) { 8998 request_module("md-cluster"); 8999 ret = get_cluster_ops(mddev); 9000 } 9001 9002 /* ensure module won't be unloaded */ 9003 if (ret) { 9004 pr_warn("can't find md-cluster module or get its reference.\n"); 9005 return ret; 9006 } 9007 9008 ret = mddev->cluster_ops->join(mddev, nodes); 9009 if (!ret) 9010 mddev->safemode_delay = 0; 9011 return ret; 9012} 9013 9014void md_cluster_stop(struct mddev *mddev) 9015{ 9016 put_cluster_ops(mddev); 9017} 9018 9019static bool is_rdev_holder_idle(struct md_rdev *rdev, bool init) 9020{ 9021 unsigned long last_events = rdev->last_events; 9022 9023 if (!bdev_is_partition(rdev->bdev)) 9024 return true; 9025 9026 /* 9027 * If rdev is partition, and user doesn't issue IO to the array, the 9028 * array is still not idle if user issues IO to other partitions. 9029 */ 9030 rdev->last_events = part_stat_read_accum(rdev->bdev->bd_disk->part0, 9031 sectors) - 9032 part_stat_read_accum(rdev->bdev, sectors); 9033 9034 return init || rdev->last_events <= last_events; 9035} 9036 9037/* 9038 * mddev is idle if following conditions are matched since last check: 9039 * 1) mddev doesn't have normal IO completed; 9040 * 2) mddev doesn't have inflight normal IO; 9041 * 3) if any member disk is partition, and other partitions don't have IO 9042 * completed; 9043 * 9044 * Noted this checking rely on IO accounting is enabled. 9045 */ 9046static bool is_mddev_idle(struct mddev *mddev, int init) 9047{ 9048 unsigned long last_events = mddev->normal_io_events; 9049 struct gendisk *disk; 9050 struct md_rdev *rdev; 9051 bool idle = true; 9052 9053 disk = mddev_is_dm(mddev) ? 
mddev->dm_gendisk : mddev->gendisk; 9054 if (!disk) 9055 return true; 9056 9057 mddev->normal_io_events = part_stat_read_accum(disk->part0, sectors); 9058 if (!init && (mddev->normal_io_events > last_events || 9059 bdev_count_inflight(disk->part0))) 9060 idle = false; 9061 9062 rcu_read_lock(); 9063 rdev_for_each_rcu(rdev, mddev) 9064 if (!is_rdev_holder_idle(rdev, init)) 9065 idle = false; 9066 rcu_read_unlock(); 9067 9068 return idle; 9069} 9070 9071void md_done_sync(struct mddev *mddev, int blocks, int ok) 9072{ 9073 /* another "blocks" (512byte) blocks have been synced */ 9074 atomic_sub(blocks, &mddev->recovery_active); 9075 wake_up(&mddev->recovery_wait); 9076 if (!ok) { 9077 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 9078 set_bit(MD_RECOVERY_ERROR, &mddev->recovery); 9079 md_wakeup_thread(mddev->thread); 9080 // stop recovery, signal do_sync .... 9081 } 9082} 9083EXPORT_SYMBOL(md_done_sync); 9084 9085/* md_write_start(mddev, bi) 9086 * If we need to update some array metadata (e.g. 'active' flag 9087 * in superblock) before writing, schedule a superblock update 9088 * and wait for it to complete. 9089 * A return value of 'false' means that the write wasn't recorded 9090 * and cannot proceed as the array is being suspend. 9091 */ 9092void md_write_start(struct mddev *mddev, struct bio *bi) 9093{ 9094 int did_change = 0; 9095 9096 if (bio_data_dir(bi) != WRITE) 9097 return; 9098 9099 BUG_ON(mddev->ro == MD_RDONLY); 9100 if (mddev->ro == MD_AUTO_READ) { 9101 /* need to switch to read/write */ 9102 mddev->ro = MD_RDWR; 9103 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 9104 md_wakeup_thread(mddev->thread); 9105 md_wakeup_thread(mddev->sync_thread); 9106 did_change = 1; 9107 } 9108 rcu_read_lock(); 9109 percpu_ref_get(&mddev->writes_pending); 9110 smp_mb(); /* Match smp_mb in set_in_sync() */ 9111 if (mddev->safemode == 1) 9112 mddev->safemode = 0; 9113 /* sync_checkers is always 0 when writes_pending is in per-cpu mode */ 9114 if (mddev->in_sync || mddev->sync_checkers) { 9115 spin_lock(&mddev->lock); 9116 if (mddev->in_sync) { 9117 mddev->in_sync = 0; 9118 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 9119 set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 9120 md_wakeup_thread(mddev->thread); 9121 did_change = 1; 9122 } 9123 spin_unlock(&mddev->lock); 9124 } 9125 rcu_read_unlock(); 9126 if (did_change) 9127 sysfs_notify_dirent_safe(mddev->sysfs_state); 9128 if (!mddev->has_superblocks) 9129 return; 9130 wait_event(mddev->sb_wait, 9131 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); 9132} 9133EXPORT_SYMBOL(md_write_start); 9134 9135/* md_write_inc can only be called when md_write_start() has 9136 * already been called at least once of the current request. 9137 * It increments the counter and is useful when a single request 9138 * is split into several parts. Each part causes an increment and 9139 * so needs a matching md_write_end(). 9140 * Unlike md_write_start(), it is safe to call md_write_inc() inside 9141 * a spinlocked region. 
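 *
 * Illustrative pairing for a request split in two (a sketch, not a caller
 * taken from this file):
 *
 *	md_write_start(mddev, bio);	// first part of the request
 *	md_write_inc(mddev, bio);	// additional split-off part
 *	...
 *	md_write_end(mddev);		// one md_write_end() per part,
 *	md_write_end(mddev);		// as each part completes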
9142 */ 9143void md_write_inc(struct mddev *mddev, struct bio *bi) 9144{ 9145 if (bio_data_dir(bi) != WRITE) 9146 return; 9147 WARN_ON_ONCE(mddev->in_sync || !md_is_rdwr(mddev)); 9148 percpu_ref_get(&mddev->writes_pending); 9149} 9150EXPORT_SYMBOL(md_write_inc); 9151 9152void md_write_end(struct mddev *mddev) 9153{ 9154 percpu_ref_put(&mddev->writes_pending); 9155 9156 if (mddev->safemode == 2) 9157 md_wakeup_thread(mddev->thread); 9158 else if (mddev->safemode_delay) 9159 /* The roundup() ensures this only performs locking once 9160 * every ->safemode_delay jiffies 9161 */ 9162 mod_timer(&mddev->safemode_timer, 9163 roundup(jiffies, mddev->safemode_delay) + 9164 mddev->safemode_delay); 9165} 9166 9167EXPORT_SYMBOL(md_write_end); 9168 9169/* This is used by raid0 and raid10 */ 9170void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, 9171 struct bio *bio, sector_t start, sector_t size) 9172{ 9173 struct bio *discard_bio = NULL; 9174 9175 if (__blkdev_issue_discard(rdev->bdev, start, size, GFP_NOIO, 9176 &discard_bio) || !discard_bio) 9177 return; 9178 9179 bio_chain(discard_bio, bio); 9180 bio_clone_blkg_association(discard_bio, bio); 9181 mddev_trace_remap(mddev, discard_bio, bio->bi_iter.bi_sector); 9182 submit_bio_noacct(discard_bio); 9183} 9184EXPORT_SYMBOL_GPL(md_submit_discard_bio); 9185 9186static void md_bitmap_start(struct mddev *mddev, 9187 struct md_io_clone *md_io_clone) 9188{ 9189 md_bitmap_fn *fn = unlikely(md_io_clone->rw == STAT_DISCARD) ? 9190 mddev->bitmap_ops->start_discard : 9191 mddev->bitmap_ops->start_write; 9192 9193 if (mddev->pers->bitmap_sector) 9194 mddev->pers->bitmap_sector(mddev, &md_io_clone->offset, 9195 &md_io_clone->sectors); 9196 9197 fn(mddev, md_io_clone->offset, md_io_clone->sectors); 9198} 9199 9200static void md_bitmap_end(struct mddev *mddev, struct md_io_clone *md_io_clone) 9201{ 9202 md_bitmap_fn *fn = unlikely(md_io_clone->rw == STAT_DISCARD) ? 
9203 mddev->bitmap_ops->end_discard : 9204 mddev->bitmap_ops->end_write; 9205 9206 fn(mddev, md_io_clone->offset, md_io_clone->sectors); 9207} 9208 9209static void md_end_clone_io(struct bio *bio) 9210{ 9211 struct md_io_clone *md_io_clone = bio->bi_private; 9212 struct bio *orig_bio = md_io_clone->orig_bio; 9213 struct mddev *mddev = md_io_clone->mddev; 9214 9215 if (bio_data_dir(orig_bio) == WRITE && md_bitmap_enabled(mddev, false)) 9216 md_bitmap_end(mddev, md_io_clone); 9217 9218 if (bio->bi_status && !orig_bio->bi_status) 9219 orig_bio->bi_status = bio->bi_status; 9220 9221 if (md_io_clone->start_time) 9222 bio_end_io_acct(orig_bio, md_io_clone->start_time); 9223 9224 bio_put(bio); 9225 bio_endio(orig_bio); 9226 percpu_ref_put(&mddev->active_io); 9227} 9228 9229static void md_clone_bio(struct mddev *mddev, struct bio **bio) 9230{ 9231 struct block_device *bdev = (*bio)->bi_bdev; 9232 struct md_io_clone *md_io_clone; 9233 struct bio *clone = 9234 bio_alloc_clone(bdev, *bio, GFP_NOIO, &mddev->io_clone_set); 9235 9236 md_io_clone = container_of(clone, struct md_io_clone, bio_clone); 9237 md_io_clone->orig_bio = *bio; 9238 md_io_clone->mddev = mddev; 9239 if (blk_queue_io_stat(bdev->bd_disk->queue)) 9240 md_io_clone->start_time = bio_start_io_acct(*bio); 9241 9242 if (bio_data_dir(*bio) == WRITE && md_bitmap_enabled(mddev, false)) { 9243 md_io_clone->offset = (*bio)->bi_iter.bi_sector; 9244 md_io_clone->sectors = bio_sectors(*bio); 9245 md_io_clone->rw = op_stat_group(bio_op(*bio)); 9246 md_bitmap_start(mddev, md_io_clone); 9247 } 9248 9249 clone->bi_end_io = md_end_clone_io; 9250 clone->bi_private = md_io_clone; 9251 *bio = clone; 9252} 9253 9254void md_account_bio(struct mddev *mddev, struct bio **bio) 9255{ 9256 percpu_ref_get(&mddev->active_io); 9257 md_clone_bio(mddev, bio); 9258} 9259EXPORT_SYMBOL_GPL(md_account_bio); 9260 9261void md_free_cloned_bio(struct bio *bio) 9262{ 9263 struct md_io_clone *md_io_clone = bio->bi_private; 9264 struct bio *orig_bio = md_io_clone->orig_bio; 9265 struct mddev *mddev = md_io_clone->mddev; 9266 9267 if (bio_data_dir(orig_bio) == WRITE && md_bitmap_enabled(mddev, false)) 9268 md_bitmap_end(mddev, md_io_clone); 9269 9270 if (bio->bi_status && !orig_bio->bi_status) 9271 orig_bio->bi_status = bio->bi_status; 9272 9273 if (md_io_clone->start_time) 9274 bio_end_io_acct(orig_bio, md_io_clone->start_time); 9275 9276 bio_put(bio); 9277 percpu_ref_put(&mddev->active_io); 9278} 9279EXPORT_SYMBOL_GPL(md_free_cloned_bio); 9280 9281/* md_allow_write(mddev) 9282 * Calling this ensures that the array is marked 'active' so that writes 9283 * may proceed without blocking. It is important to call this before 9284 * attempting a GFP_KERNEL allocation while holding the mddev lock. 9285 * Must be called with mddev_lock held. 
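 *
 * A typical call pattern (sketch only; 'buf' and 'len' are illustrative):
 *
 *	err = mddev_lock(mddev);
 *	if (!err) {
 *		md_allow_write(mddev);
 *		buf = kmalloc(len, GFP_KERNEL);
 *		...
 *		mddev_unlock(mddev);
 *	}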
9286 */ 9287void md_allow_write(struct mddev *mddev) 9288{ 9289 if (!mddev->pers) 9290 return; 9291 if (!md_is_rdwr(mddev)) 9292 return; 9293 if (!mddev->pers->sync_request) 9294 return; 9295 9296 spin_lock(&mddev->lock); 9297 if (mddev->in_sync) { 9298 mddev->in_sync = 0; 9299 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 9300 set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 9301 if (mddev->safemode_delay && 9302 mddev->safemode == 0) 9303 mddev->safemode = 1; 9304 spin_unlock(&mddev->lock); 9305 md_update_sb(mddev, 0); 9306 sysfs_notify_dirent_safe(mddev->sysfs_state); 9307 /* wait for the dirty state to be recorded in the metadata */ 9308 wait_event(mddev->sb_wait, 9309 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); 9310 } else 9311 spin_unlock(&mddev->lock); 9312} 9313EXPORT_SYMBOL_GPL(md_allow_write); 9314 9315static sector_t md_sync_max_sectors(struct mddev *mddev, 9316 enum sync_action action) 9317{ 9318 switch (action) { 9319 case ACTION_RESYNC: 9320 case ACTION_CHECK: 9321 case ACTION_REPAIR: 9322 atomic64_set(&mddev->resync_mismatches, 0); 9323 fallthrough; 9324 case ACTION_RESHAPE: 9325 return mddev->resync_max_sectors; 9326 case ACTION_RECOVER: 9327 return mddev->dev_sectors; 9328 default: 9329 return 0; 9330 } 9331} 9332 9333/* 9334 * If lazy recovery is requested and all rdevs are in sync, select the rdev with 9335 * the highest index to perform recovery on, to build the initial xor data; this is the 9336 * same behaviour as with the old bitmap. 9337 */ 9338static bool mddev_select_lazy_recover_rdev(struct mddev *mddev) 9339{ 9340 struct md_rdev *recover_rdev = NULL; 9341 struct md_rdev *rdev; 9342 bool ret = false; 9343 9344 rcu_read_lock(); 9345 rdev_for_each_rcu(rdev, mddev) { 9346 if (rdev->raid_disk < 0) 9347 continue; 9348 9349 if (test_bit(Faulty, &rdev->flags) || 9350 !test_bit(In_sync, &rdev->flags)) 9351 break; 9352 9353 if (!recover_rdev || recover_rdev->raid_disk < rdev->raid_disk) 9354 recover_rdev = rdev; 9355 } 9356 9357 if (recover_rdev) { 9358 clear_bit(In_sync, &recover_rdev->flags); 9359 ret = true; 9360 } 9361 9362 rcu_read_unlock(); 9363 return ret; 9364} 9365 9366static sector_t md_sync_position(struct mddev *mddev, enum sync_action action) 9367{ 9368 sector_t start = 0; 9369 struct md_rdev *rdev; 9370 9371 switch (action) { 9372 case ACTION_CHECK: 9373 case ACTION_REPAIR: 9374 return mddev->resync_min; 9375 case ACTION_RESYNC: 9376 if (!mddev->bitmap) 9377 return mddev->resync_offset; 9378 return 0; 9379 case ACTION_RESHAPE: 9380 /* 9381 * If the original node aborts reshaping then we continue the 9382 * reshaping, so set the position again to avoid restarting the reshape from the 9383 * very beginning 9384 */ 9385 if (mddev_is_clustered(mddev) && 9386 mddev->reshape_position != MaxSector) 9387 return mddev->reshape_position; 9388 return 0; 9389 case ACTION_RECOVER: 9390 start = MaxSector; 9391 rcu_read_lock(); 9392 rdev_for_each_rcu(rdev, mddev) 9393 if (rdev_needs_recovery(rdev, start)) 9394 start = rdev->recovery_offset; 9395 rcu_read_unlock(); 9396 9397 /* 9398 * If there are no spares and raid456 lazy initial recovery is 9399 * requested, recover from the beginning. 9400 */ 9401 if (test_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery) && 9402 start == MaxSector && mddev_select_lazy_recover_rdev(mddev)) 9403 start = 0; 9404 9405 /* If there is a bitmap, we need to make sure all 9406 * writes that started before we added a spare 9407 * complete before we start doing a recovery. 
9408 * Otherwise the write might complete and (via 9409 * bitmap_endwrite) set a bit in the bitmap after the 9410 * recovery has checked that bit and skipped that 9411 * region. 9412 */ 9413 if (mddev->bitmap) { 9414 mddev->pers->quiesce(mddev, 1); 9415 mddev->pers->quiesce(mddev, 0); 9416 } 9417 return start; 9418 default: 9419 return MaxSector; 9420 } 9421} 9422 9423static bool sync_io_within_limit(struct mddev *mddev) 9424{ 9425 /* 9426 * For raid456, sync IO is stripe(4k) per IO, for other levels, it's 9427 * RESYNC_PAGES(64k) per IO. 9428 */ 9429 return atomic_read(&mddev->recovery_active) < 9430 (raid_is_456(mddev) ? 8 : 128) * sync_io_depth(mddev); 9431} 9432 9433#define SYNC_MARKS 10 9434#define SYNC_MARK_STEP (3*HZ) 9435#define UPDATE_FREQUENCY (5*60*HZ) 9436void md_do_sync(struct md_thread *thread) 9437{ 9438 struct mddev *mddev = thread->mddev; 9439 struct mddev *mddev2; 9440 unsigned int currspeed = 0, window; 9441 sector_t max_sectors,j, io_sectors, recovery_done; 9442 unsigned long mark[SYNC_MARKS]; 9443 unsigned long update_time; 9444 sector_t mark_cnt[SYNC_MARKS]; 9445 int last_mark,m; 9446 sector_t last_check; 9447 int skipped = 0; 9448 struct md_rdev *rdev; 9449 enum sync_action action; 9450 const char *desc; 9451 struct blk_plug plug; 9452 int ret; 9453 9454 /* just incase thread restarts... */ 9455 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) 9456 return; 9457 9458 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 9459 goto skip; 9460 9461 if (test_bit(MD_RECOVERY_WAIT, &mddev->recovery) || 9462 !md_is_rdwr(mddev)) {/* never try to sync a read-only array */ 9463 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 9464 goto skip; 9465 } 9466 9467 if (mddev_is_clustered(mddev)) { 9468 ret = mddev->cluster_ops->resync_start(mddev); 9469 if (ret) 9470 goto skip; 9471 9472 set_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags); 9473 if (!(test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 9474 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) || 9475 test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) 9476 && ((unsigned long long)mddev->curr_resync_completed 9477 < (unsigned long long)mddev->resync_max_sectors)) 9478 goto skip; 9479 } 9480 9481 action = md_sync_action(mddev); 9482 if (action == ACTION_FROZEN || action == ACTION_IDLE) { 9483 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 9484 goto skip; 9485 } 9486 9487 desc = md_sync_action_name(action); 9488 mddev->last_sync_action = action; 9489 9490 /* 9491 * Before starting a resync we must have set curr_resync to 9492 * 2, and then checked that every "conflicting" array has curr_resync 9493 * less than ours. When we find one that is the same or higher 9494 * we wait on resync_wait. To avoid deadlock, we reduce curr_resync 9495 * to 1 if we choose to yield (based arbitrarily on address of mddev structure). 9496 * This will mean we have to start checking from the beginning again. 
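 *
 * (In the code below the '2' mentioned above is MD_RESYNC_DELAYED and the
 * '1' used when yielding is MD_RESYNC_YIELDED.)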
9497 * 9498 */ 9499 if (mddev_is_clustered(mddev)) 9500 mddev->cluster_ops->resync_start_notify(mddev); 9501 do { 9502 int mddev2_minor = -1; 9503 mddev->curr_resync = MD_RESYNC_DELAYED; 9504 9505 try_again: 9506 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 9507 goto skip; 9508 spin_lock(&all_mddevs_lock); 9509 list_for_each_entry(mddev2, &all_mddevs, all_mddevs) { 9510 if (test_bit(MD_DELETED, &mddev2->flags)) 9511 continue; 9512 if (mddev2 == mddev) 9513 continue; 9514 if (!mddev->parallel_resync 9515 && mddev2->curr_resync 9516 && match_mddev_units(mddev, mddev2)) { 9517 DEFINE_WAIT(wq); 9518 if (mddev < mddev2 && 9519 mddev->curr_resync == MD_RESYNC_DELAYED) { 9520 /* arbitrarily yield */ 9521 mddev->curr_resync = MD_RESYNC_YIELDED; 9522 wake_up(&resync_wait); 9523 } 9524 if (mddev > mddev2 && 9525 mddev->curr_resync == MD_RESYNC_YIELDED) 9526 /* no need to wait here, we can wait the next 9527 * time 'round when curr_resync == 2 9528 */ 9529 continue; 9530 /* We need to wait 'interruptible' so as not to 9531 * contribute to the load average, and not to 9532 * be caught by 'softlockup' 9533 */ 9534 prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE); 9535 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 9536 mddev2->curr_resync >= mddev->curr_resync) { 9537 if (mddev2_minor != mddev2->md_minor) { 9538 mddev2_minor = mddev2->md_minor; 9539 pr_info("md: delaying %s of %s until %s has finished (they share one or more physical units)\n", 9540 desc, mdname(mddev), 9541 mdname(mddev2)); 9542 } 9543 spin_unlock(&all_mddevs_lock); 9544 9545 if (signal_pending(current)) 9546 flush_signals(current); 9547 schedule(); 9548 finish_wait(&resync_wait, &wq); 9549 goto try_again; 9550 } 9551 finish_wait(&resync_wait, &wq); 9552 } 9553 } 9554 spin_unlock(&all_mddevs_lock); 9555 } while (mddev->curr_resync < MD_RESYNC_DELAYED); 9556 9557 max_sectors = md_sync_max_sectors(mddev, action); 9558 j = md_sync_position(mddev, action); 9559 9560 pr_info("md: %s of RAID array %s\n", desc, mdname(mddev)); 9561 pr_debug("md: minimum _guaranteed_ speed: %d KB/sec/disk.\n", speed_min(mddev)); 9562 pr_debug("md: using maximum available idle IO bandwidth (but not more than %d KB/sec) for %s.\n", 9563 speed_max(mddev), desc); 9564 9565 is_mddev_idle(mddev, 1); /* this initializes IO event counters */ 9566 9567 io_sectors = 0; 9568 for (m = 0; m < SYNC_MARKS; m++) { 9569 mark[m] = jiffies; 9570 mark_cnt[m] = io_sectors; 9571 } 9572 last_mark = 0; 9573 mddev->resync_mark = mark[last_mark]; 9574 mddev->resync_mark_cnt = mark_cnt[last_mark]; 9575 9576 /* 9577 * Tune reconstruction: 9578 */ 9579 window = 32 * (PAGE_SIZE / 512); 9580 pr_debug("md: using %dk window, over a total of %lluk.\n", 9581 window/2, (unsigned long long)max_sectors/2); 9582 9583 atomic_set(&mddev->recovery_active, 0); 9584 last_check = 0; 9585 9586 if (j >= MD_RESYNC_ACTIVE) { 9587 pr_debug("md: resuming %s of %s from checkpoint.\n", 9588 desc, mdname(mddev)); 9589 mddev->curr_resync = j; 9590 } else 9591 mddev->curr_resync = MD_RESYNC_ACTIVE; /* no longer delayed */ 9592 mddev->curr_resync_completed = j; 9593 sysfs_notify_dirent_safe(mddev->sysfs_completed); 9594 md_new_event(); 9595 update_time = jiffies; 9596 9597 blk_start_plug(&plug); 9598 while (j < max_sectors) { 9599 sector_t sectors; 9600 9601 skipped = 0; 9602 9603 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 9604 ((mddev->curr_resync > mddev->curr_resync_completed && 9605 (mddev->curr_resync - mddev->curr_resync_completed) 9606 > (max_sectors >> 4)) || 9607 
time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) || 9608 (j - mddev->curr_resync_completed)*2 9609 >= mddev->resync_max - mddev->curr_resync_completed || 9610 mddev->curr_resync_completed > mddev->resync_max 9611 )) { 9612 /* time to update curr_resync_completed */ 9613 wait_event(mddev->recovery_wait, 9614 atomic_read(&mddev->recovery_active) == 0); 9615 mddev->curr_resync_completed = j; 9616 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && 9617 j > mddev->resync_offset) 9618 mddev->resync_offset = j; 9619 update_time = jiffies; 9620 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 9621 sysfs_notify_dirent_safe(mddev->sysfs_completed); 9622 } 9623 9624 while (j >= mddev->resync_max && 9625 !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 9626 /* As this condition is controlled by user-space, 9627 * we can block indefinitely, so use '_interruptible' 9628 * to avoid triggering warnings. 9629 */ 9630 flush_signals(current); /* just in case */ 9631 wait_event_interruptible(mddev->recovery_wait, 9632 mddev->resync_max > j 9633 || test_bit(MD_RECOVERY_INTR, 9634 &mddev->recovery)); 9635 } 9636 9637 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 9638 break; 9639 9640 if (mddev->bitmap_ops && mddev->bitmap_ops->skip_sync_blocks) { 9641 sectors = mddev->bitmap_ops->skip_sync_blocks(mddev, j); 9642 if (sectors) 9643 goto update; 9644 } 9645 9646 sectors = mddev->pers->sync_request(mddev, j, max_sectors, 9647 &skipped); 9648 if (sectors == 0) { 9649 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 9650 break; 9651 } 9652 9653 if (!skipped) { /* actual IO requested */ 9654 io_sectors += sectors; 9655 atomic_add(sectors, &mddev->recovery_active); 9656 } 9657 9658 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 9659 break; 9660 9661update: 9662 j += sectors; 9663 if (j > max_sectors) 9664 /* when skipping, extra large numbers can be returned. */ 9665 j = max_sectors; 9666 if (j >= MD_RESYNC_ACTIVE) 9667 mddev->curr_resync = j; 9668 mddev->curr_mark_cnt = io_sectors; 9669 if (last_check == 0) 9670 /* this is the earliest that rebuild will be 9671 * visible in /proc/mdstat 9672 */ 9673 md_new_event(); 9674 9675 if (last_check + window > io_sectors || j == max_sectors) 9676 continue; 9677 9678 last_check = io_sectors; 9679 repeat: 9680 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) { 9681 /* step marks */ 9682 int next = (last_mark+1) % SYNC_MARKS; 9683 9684 mddev->resync_mark = mark[next]; 9685 mddev->resync_mark_cnt = mark_cnt[next]; 9686 mark[next] = jiffies; 9687 mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active); 9688 last_mark = next; 9689 } 9690 9691 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 9692 break; 9693 9694 /* 9695 * this loop exits only if either when we are slower than 9696 * the 'hard' speed limit, or the system was IO-idle for 9697 * a jiffy. 9698 * the system might be non-idle CPU-wise, but we only care 9699 * about not overloading the IO subsystem. (things like an 9700 * e2fsck being done on the RAID array should execute fast) 9701 */ 9702 cond_resched(); 9703 9704 recovery_done = io_sectors - atomic_read(&mddev->recovery_active); 9705 currspeed = ((unsigned long)(recovery_done - mddev->resync_mark_cnt))/2 9706 /((jiffies-mddev->resync_mark)/HZ +1) +1; 9707 9708 if (currspeed > speed_min(mddev)) { 9709 if (currspeed > speed_max(mddev)) { 9710 msleep(500); 9711 goto repeat; 9712 } 9713 if (!sync_io_within_limit(mddev) && 9714 !is_mddev_idle(mddev, 0)) { 9715 /* 9716 * Give other IO more of a chance. 
9717 * The faster the devices, the less we wait. 9718 */ 9719 wait_event(mddev->recovery_wait, 9720 !atomic_read(&mddev->recovery_active)); 9721 } 9722 } 9723 } 9724 pr_info("md: %s: %s %s.\n",mdname(mddev), desc, 9725 test_bit(MD_RECOVERY_INTR, &mddev->recovery) 9726 ? "interrupted" : "done"); 9727 /* 9728 * this also signals 'finished resyncing' to md_stop 9729 */ 9730 blk_finish_plug(&plug); 9731 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); 9732 9733 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 9734 !test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 9735 mddev->curr_resync >= MD_RESYNC_ACTIVE) { 9736 mddev->curr_resync_completed = mddev->curr_resync; 9737 sysfs_notify_dirent_safe(mddev->sysfs_completed); 9738 } 9739 mddev->pers->sync_request(mddev, max_sectors, max_sectors, &skipped); 9740 9741 if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && 9742 mddev->curr_resync > MD_RESYNC_ACTIVE) { 9743 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 9744 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 9745 if (mddev->curr_resync >= mddev->resync_offset) { 9746 pr_debug("md: checkpointing %s of %s.\n", 9747 desc, mdname(mddev)); 9748 if (test_bit(MD_RECOVERY_ERROR, 9749 &mddev->recovery)) 9750 mddev->resync_offset = 9751 mddev->curr_resync_completed; 9752 else 9753 mddev->resync_offset = 9754 mddev->curr_resync; 9755 } 9756 } else 9757 mddev->resync_offset = MaxSector; 9758 } else { 9759 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 9760 mddev->curr_resync = MaxSector; 9761 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 9762 test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) { 9763 rcu_read_lock(); 9764 rdev_for_each_rcu(rdev, mddev) 9765 if (mddev->delta_disks >= 0 && 9766 rdev_needs_recovery(rdev, mddev->curr_resync)) 9767 rdev->recovery_offset = mddev->curr_resync; 9768 rcu_read_unlock(); 9769 } 9770 } 9771 } 9772 skip: 9773 /* set CHANGE_PENDING here since maybe another update is needed, 9774 * so other nodes are informed. It should be harmless for normal 9775 * raid */ 9776 set_mask_bits(&mddev->sb_flags, 0, 9777 BIT(MD_SB_CHANGE_PENDING) | BIT(MD_SB_CHANGE_DEVS)); 9778 9779 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 9780 !test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 9781 mddev->delta_disks > 0 && 9782 mddev->pers->finish_reshape && 9783 mddev->pers->size && 9784 !mddev_is_dm(mddev)) { 9785 mddev_lock_nointr(mddev); 9786 md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0)); 9787 mddev_unlock(mddev); 9788 if (!mddev_is_clustered(mddev)) 9789 set_capacity_and_notify(mddev->gendisk, 9790 mddev->array_sectors); 9791 } 9792 9793 spin_lock(&mddev->lock); 9794 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 9795 /* We completed so min/max setting can be forgotten if used. */ 9796 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 9797 mddev->resync_min = 0; 9798 mddev->resync_max = MaxSector; 9799 } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 9800 mddev->resync_min = mddev->curr_resync_completed; 9801 set_bit(MD_RECOVERY_DONE, &mddev->recovery); 9802 mddev->curr_resync = MD_RESYNC_NONE; 9803 spin_unlock(&mddev->lock); 9804 9805 wake_up(&resync_wait); 9806 md_wakeup_thread(mddev->thread); 9807 return; 9808} 9809EXPORT_SYMBOL_GPL(md_do_sync); 9810 9811static bool rdev_removeable(struct md_rdev *rdev) 9812{ 9813 /* rdev is not used. */ 9814 if (rdev->raid_disk < 0) 9815 return false; 9816 9817 /* There are still inflight io, don't remove this rdev. 
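 * (->nr_pending counts requests still outstanding against this rdev.)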
*/ 9818 if (atomic_read(&rdev->nr_pending)) 9819 return false; 9820 9821 /* 9822 * An error occurred but has not yet been acknowledged by the metadata 9823 * handler, don't remove this rdev. 9824 */ 9825 if (test_bit(Blocked, &rdev->flags)) 9826 return false; 9827 9828 /* Faulty rdev is not used, it's safe to remove it. */ 9829 if (test_bit(Faulty, &rdev->flags)) 9830 return true; 9831 9832 /* Journal disk can only be removed if it's faulty. */ 9833 if (test_bit(Journal, &rdev->flags)) 9834 return false; 9835 9836 /* 9837 * 'In_sync' is cleared while 'raid_disk' is valid, which means 9838 * replacement has just become active from pers->spare_active(), and 9839 * then pers->hot_remove_disk() will replace this rdev with replacement. 9840 */ 9841 if (!test_bit(In_sync, &rdev->flags)) 9842 return true; 9843 9844 return false; 9845} 9846 9847static bool rdev_is_spare(struct md_rdev *rdev) 9848{ 9849 return !test_bit(Candidate, &rdev->flags) && rdev->raid_disk >= 0 && 9850 !test_bit(In_sync, &rdev->flags) && 9851 !test_bit(Journal, &rdev->flags) && 9852 !test_bit(Faulty, &rdev->flags); 9853} 9854 9855static bool rdev_addable(struct md_rdev *rdev) 9856{ 9857 struct mddev *mddev; 9858 9859 mddev = READ_ONCE(rdev->mddev); 9860 if (!mddev) 9861 return false; 9862 9863 /* rdev is already used, don't add it again. */ 9864 if (test_bit(Candidate, &rdev->flags) || rdev->raid_disk >= 0 || 9865 test_bit(Faulty, &rdev->flags)) 9866 return false; 9867 9868 /* Allow adding a journal disk. */ 9869 if (test_bit(Journal, &rdev->flags)) 9870 return true; 9871 9872 /* Allow adding if the array is read-write. */ 9873 if (md_is_rdwr(mddev)) 9874 return true; 9875 9876 /* 9877 * For a read-only array, only allow re-adding an rdev. And if a bitmap is 9878 * used, don't allow re-adding an rdev that is too old. 
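 * ("Too old" corresponds to the Bitmap_sync flag checked below, i.e. the
 * write-intent bitmap is not expected to bring the device back up to date
 * on its own.)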
9879 */ 9880 if (rdev->saved_raid_disk >= 0 && !test_bit(Bitmap_sync, &rdev->flags)) 9881 return true; 9882 9883 return false; 9884} 9885 9886static bool md_spares_need_change(struct mddev *mddev) 9887{ 9888 struct md_rdev *rdev; 9889 9890 rcu_read_lock(); 9891 rdev_for_each_rcu(rdev, mddev) { 9892 if (rdev_removeable(rdev) || rdev_addable(rdev)) { 9893 rcu_read_unlock(); 9894 return true; 9895 } 9896 } 9897 rcu_read_unlock(); 9898 return false; 9899} 9900 9901static int remove_spares(struct mddev *mddev, struct md_rdev *this) 9902{ 9903 struct md_rdev *rdev; 9904 int removed = 0; 9905 9906 rdev_for_each(rdev, mddev) { 9907 if ((this == NULL || rdev == this) && rdev_removeable(rdev) && 9908 !mddev->pers->hot_remove_disk(mddev, rdev)) { 9909 sysfs_unlink_rdev(mddev, rdev); 9910 rdev->saved_raid_disk = rdev->raid_disk; 9911 rdev->raid_disk = -1; 9912 removed++; 9913 } 9914 } 9915 9916 if (removed && mddev->kobj.sd) 9917 sysfs_notify_dirent_safe(mddev->sysfs_degraded); 9918 9919 return removed; 9920} 9921 9922static int remove_and_add_spares(struct mddev *mddev, 9923 struct md_rdev *this) 9924{ 9925 struct md_rdev *rdev; 9926 int spares = 0; 9927 int removed = 0; 9928 9929 if (this && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 9930 /* Mustn't remove devices when resync thread is running */ 9931 return 0; 9932 9933 removed = remove_spares(mddev, this); 9934 if (this && removed) 9935 goto no_add; 9936 9937 rdev_for_each(rdev, mddev) { 9938 if (this && this != rdev) 9939 continue; 9940 if (rdev_is_spare(rdev)) 9941 spares++; 9942 if (!rdev_addable(rdev)) 9943 continue; 9944 if (!test_bit(Journal, &rdev->flags)) 9945 rdev->recovery_offset = 0; 9946 if (mddev->pers->hot_add_disk(mddev, rdev) == 0) { 9947 /* failure here is OK */ 9948 sysfs_link_rdev(mddev, rdev); 9949 if (!test_bit(Journal, &rdev->flags)) 9950 spares++; 9951 md_new_event(); 9952 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 9953 } 9954 } 9955no_add: 9956 if (removed) 9957 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 9958 return spares; 9959} 9960 9961static bool md_choose_sync_action(struct mddev *mddev, int *spares) 9962{ 9963 /* Check if reshape is in progress first. */ 9964 if (mddev->reshape_position != MaxSector) { 9965 if (mddev->pers->check_reshape == NULL || 9966 mddev->pers->check_reshape(mddev) != 0) 9967 return false; 9968 9969 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 9970 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 9971 clear_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery); 9972 return true; 9973 } 9974 9975 /* Check if resync is in progress. */ 9976 if (mddev->resync_offset < MaxSector) { 9977 remove_spares(mddev, NULL); 9978 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 9979 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 9980 clear_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery); 9981 return true; 9982 } 9983 9984 /* 9985 * Remove any failed drives, then add spares if possible. Spares are 9986 * also removed and re-added, to allow the personality to fail the 9987 * re-add. 9988 */ 9989 *spares = remove_and_add_spares(mddev, NULL); 9990 if (*spares || test_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery)) { 9991 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 9992 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 9993 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 9994 9995 /* Start new recovery. */ 9996 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 9997 return true; 9998 } 9999 10000 /* Delay to choose resync/check/repair in md_do_sync(). 
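 * (Reaching here with MD_RECOVERY_SYNC set means it was requested
 * explicitly, typically via the sysfs 'sync_action' attribute.)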
static void md_start_sync(struct work_struct *ws)
{
	struct mddev *mddev = container_of(ws, struct mddev, sync_work);
	int spares = 0;
	bool suspend = false;
	char *name;

	/*
	 * If reshape is still in progress, spares won't be added or removed
	 * from conf until reshape is done.
	 */
	if (mddev->reshape_position == MaxSector &&
	    md_spares_need_change(mddev)) {
		suspend = true;
		mddev_suspend(mddev, false);
	}

	mddev_lock_nointr(mddev);
	if (!md_is_rdwr(mddev)) {
		/*
		 * On a read-only array we can:
		 * - remove failed devices
		 * - add already-in_sync devices if the array itself is in-sync.
		 * As we only add devices that are already in-sync, we can
		 * activate the spares immediately.
		 */
		remove_and_add_spares(mddev, NULL);
		goto not_running;
	}

	if (!md_choose_sync_action(mddev, &spares))
		goto not_running;

	if (!mddev->pers->sync_request)
		goto not_running;

	/*
	 * We are adding a device or devices to an array which has the bitmap
	 * stored on all devices. So make sure all bitmap pages get written.
	 */
	if (spares && md_bitmap_enabled(mddev, true))
		mddev->bitmap_ops->write_all(mddev);

	name = test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ?
		"reshape" : "resync";
	rcu_assign_pointer(mddev->sync_thread,
			   md_register_thread(md_do_sync, mddev, name));
	if (!mddev->sync_thread) {
		pr_warn("%s: could not start resync thread...\n",
			mdname(mddev));
		/* leave the spares where they are, it shouldn't hurt */
		goto not_running;
	}

	mddev_unlock(mddev);
	/*
	 * md_start_sync was triggered by MD_RECOVERY_NEEDED, so we should
	 * not set it again. Otherwise, we may cause an issue like this one:
	 * https://bugzilla.kernel.org/show_bug.cgi?id=218200
	 * Therefore, use __mddev_resume(mddev, false).
	 */
	if (suspend)
		__mddev_resume(mddev, false);
	md_wakeup_thread(mddev->sync_thread);
	sysfs_notify_dirent_safe(mddev->sysfs_action);
	md_new_event();
	return;

not_running:
	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
	clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
	clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
	clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
	mddev_unlock(mddev);
	/*
	 * md_start_sync was triggered by MD_RECOVERY_NEEDED, so we should
	 * not set it again. Otherwise, we may cause an issue like this one:
	 * https://bugzilla.kernel.org/show_bug.cgi?id=218200
	 * Therefore, use __mddev_resume(mddev, false).
	 */
	if (suspend)
		__mddev_resume(mddev, false);

	wake_up(&resync_wait);
	if (test_and_clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery) &&
	    mddev->sysfs_action)
		sysfs_notify_dirent_safe(mddev->sysfs_action);
}
10088 */ 10089 if (suspend) 10090 __mddev_resume(mddev, false); 10091 10092 wake_up(&resync_wait); 10093 if (test_and_clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery) && 10094 mddev->sysfs_action) 10095 sysfs_notify_dirent_safe(mddev->sysfs_action); 10096} 10097 10098static void unregister_sync_thread(struct mddev *mddev) 10099{ 10100 if (!test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { 10101 /* resync/recovery still happening */ 10102 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 10103 return; 10104 } 10105 10106 if (WARN_ON_ONCE(!mddev->sync_thread)) 10107 return; 10108 10109 md_reap_sync_thread(mddev); 10110} 10111 10112static bool md_should_do_recovery(struct mddev *mddev) 10113{ 10114 /* 10115 * As long as one of the following flags is set, 10116 * recovery needs to do or cleanup. 10117 */ 10118 if (test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || 10119 test_bit(MD_RECOVERY_DONE, &mddev->recovery)) 10120 return true; 10121 10122 /* 10123 * If no flags are set and it is in read-only status, 10124 * there is nothing to do. 10125 */ 10126 if (!md_is_rdwr(mddev)) 10127 return false; 10128 10129 /* 10130 * MD_SB_CHANGE_PENDING indicates that the array is switching from clean to 10131 * active, and no action is needed for now. 10132 * All other MD_SB_* flags require to update the superblock. 10133 */ 10134 if (mddev->sb_flags & ~ (1<<MD_SB_CHANGE_PENDING)) 10135 return true; 10136 10137 /* 10138 * If the array is not using external metadata and there has been no data 10139 * written for some time, then the array's status needs to be set to 10140 * in_sync. 10141 */ 10142 if (mddev->external == 0 && mddev->safemode == 1) 10143 return true; 10144 10145 /* 10146 * When the system is about to restart or the process receives an signal, 10147 * the array needs to be synchronized as soon as possible. 10148 * Once the data synchronization is completed, need to change the array 10149 * status to in_sync. 10150 */ 10151 if (mddev->safemode == 2 && !mddev->in_sync && 10152 mddev->resync_offset == MaxSector) 10153 return true; 10154 10155 return false; 10156} 10157 10158/* 10159 * This routine is regularly called by all per-raid-array threads to 10160 * deal with generic issues like resync and super-block update. 10161 * Raid personalities that don't have a thread (linear/raid0) do not 10162 * need this as they never do any recovery or update the superblock. 10163 * 10164 * It does not do any resync itself, but rather "forks" off other threads 10165 * to do that as needed. 10166 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in 10167 * "->recovery" and create a thread at ->sync_thread. 10168 * When the thread finishes it sets MD_RECOVERY_DONE 10169 * and wakeups up this thread which will reap the thread and finish up. 10170 * This thread also removes any faulty devices (with nr_pending == 0). 10171 * 10172 * The overall approach is: 10173 * 1/ if the superblock needs updating, update it. 10174 * 2/ If a recovery thread is running, don't do anything else. 10175 * 3/ If recovery has finished, clean up, possibly marking spares active. 10176 * 4/ If there are any faulty devices, remove them. 10177 * 5/ If array is degraded, try to add spares devices 10178 * 6/ If array has spares or is not in-sync, start a resync thread. 
10179 */ 10180void md_check_recovery(struct mddev *mddev) 10181{ 10182 if (md_bitmap_enabled(mddev, false) && mddev->bitmap_ops->daemon_work) 10183 mddev->bitmap_ops->daemon_work(mddev); 10184 10185 if (signal_pending(current)) { 10186 if (mddev->pers->sync_request && !mddev->external) { 10187 pr_debug("md: %s in immediate safe mode\n", 10188 mdname(mddev)); 10189 mddev->safemode = 2; 10190 } 10191 flush_signals(current); 10192 } 10193 10194 if (!md_should_do_recovery(mddev)) 10195 return; 10196 10197 if (mddev_trylock(mddev)) { 10198 bool try_set_sync = mddev->safemode != 0; 10199 10200 if (!mddev->external && mddev->safemode == 1) 10201 mddev->safemode = 0; 10202 10203 if (!md_is_rdwr(mddev)) { 10204 struct md_rdev *rdev; 10205 10206 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 10207 unregister_sync_thread(mddev); 10208 goto unlock; 10209 } 10210 10211 if (!mddev->external && mddev->in_sync) 10212 /* 10213 * 'Blocked' flag not needed as failed devices 10214 * will be recorded if array switched to read/write. 10215 * Leaving it set will prevent the device 10216 * from being removed. 10217 */ 10218 rdev_for_each(rdev, mddev) 10219 clear_bit(Blocked, &rdev->flags); 10220 10221 /* 10222 * There is no thread, but we need to call 10223 * ->spare_active and clear saved_raid_disk 10224 */ 10225 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 10226 md_reap_sync_thread(mddev); 10227 10228 /* 10229 * Let md_start_sync() to remove and add rdevs to the 10230 * array. 10231 */ 10232 if (md_spares_need_change(mddev)) { 10233 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 10234 queue_work(md_misc_wq, &mddev->sync_work); 10235 } 10236 10237 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 10238 clear_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery); 10239 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 10240 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 10241 10242 goto unlock; 10243 } 10244 10245 if (mddev_is_clustered(mddev)) { 10246 struct md_rdev *rdev, *tmp; 10247 /* kick the device if another node issued a 10248 * remove disk. 10249 */ 10250 rdev_for_each_safe(rdev, tmp, mddev) { 10251 if (rdev->raid_disk < 0 && 10252 test_and_clear_bit(ClusterRemove, &rdev->flags)) 10253 md_kick_rdev_from_array(rdev); 10254 } 10255 } 10256 10257 if (try_set_sync && !mddev->external && !mddev->in_sync) { 10258 spin_lock(&mddev->lock); 10259 set_in_sync(mddev); 10260 spin_unlock(&mddev->lock); 10261 } 10262 10263 if (mddev->sb_flags) 10264 md_update_sb(mddev, 0); 10265 10266 /* 10267 * Never start a new sync thread if MD_RECOVERY_RUNNING is 10268 * still set. 10269 */ 10270 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 10271 unregister_sync_thread(mddev); 10272 goto unlock; 10273 } 10274 10275 /* Set RUNNING before clearing NEEDED to avoid 10276 * any transients in the value of "sync_action". 
10277 */ 10278 mddev->curr_resync_completed = 0; 10279 spin_lock(&mddev->lock); 10280 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 10281 spin_unlock(&mddev->lock); 10282 /* Clear some bits that don't mean anything, but 10283 * might be left set 10284 */ 10285 clear_bit(MD_RECOVERY_INTR, &mddev->recovery); 10286 clear_bit(MD_RECOVERY_DONE, &mddev->recovery); 10287 10288 if (test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) && 10289 !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { 10290 queue_work(md_misc_wq, &mddev->sync_work); 10291 } else { 10292 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 10293 wake_up(&resync_wait); 10294 } 10295 10296 unlock: 10297 wake_up(&mddev->sb_wait); 10298 mddev_unlock(mddev); 10299 } 10300} 10301EXPORT_SYMBOL(md_check_recovery); 10302 10303void md_reap_sync_thread(struct mddev *mddev) 10304{ 10305 struct md_rdev *rdev; 10306 sector_t old_dev_sectors = mddev->dev_sectors; 10307 bool is_reshaped = false; 10308 10309 /* resync has finished, collect result */ 10310 md_unregister_thread(mddev, &mddev->sync_thread); 10311 atomic_inc(&mddev->sync_seq); 10312 10313 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 10314 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && 10315 mddev->degraded != mddev->raid_disks) { 10316 /* success...*/ 10317 /* activate any spares */ 10318 if (mddev->pers->spare_active(mddev)) { 10319 sysfs_notify_dirent_safe(mddev->sysfs_degraded); 10320 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 10321 } 10322 } 10323 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 10324 mddev->pers->finish_reshape) { 10325 mddev->pers->finish_reshape(mddev); 10326 if (mddev_is_clustered(mddev)) 10327 is_reshaped = true; 10328 } 10329 10330 /* If array is no-longer degraded, then any saved_raid_disk 10331 * information must be scrapped. 10332 */ 10333 if (!mddev->degraded) 10334 rdev_for_each(rdev, mddev) 10335 rdev->saved_raid_disk = -1; 10336 10337 md_update_sb(mddev, 1); 10338 /* MD_SB_CHANGE_PENDING should be cleared by md_update_sb, so we can 10339 * call resync_finish here if MD_CLUSTER_RESYNC_LOCKED is set by 10340 * clustered raid */ 10341 if (test_and_clear_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags)) 10342 mddev->cluster_ops->resync_finish(mddev); 10343 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 10344 clear_bit(MD_RECOVERY_DONE, &mddev->recovery); 10345 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 10346 clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 10347 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 10348 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 10349 clear_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery); 10350 /* 10351 * We call mddev->cluster_ops->update_size here because sync_size could 10352 * be changed by md_update_sb, and MD_RECOVERY_RESHAPE is cleared, 10353 * so it is time to update size across cluster. 
10354 */ 10355 if (mddev_is_clustered(mddev) && is_reshaped 10356 && !test_bit(MD_CLOSING, &mddev->flags)) 10357 mddev->cluster_ops->update_size(mddev, old_dev_sectors); 10358 /* flag recovery needed just to double check */ 10359 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 10360 sysfs_notify_dirent_safe(mddev->sysfs_completed); 10361 sysfs_notify_dirent_safe(mddev->sysfs_action); 10362 md_new_event(); 10363 if (mddev->event_work.func) 10364 queue_work(md_misc_wq, &mddev->event_work); 10365 wake_up(&resync_wait); 10366} 10367EXPORT_SYMBOL(md_reap_sync_thread); 10368 10369void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev) 10370{ 10371 sysfs_notify_dirent_safe(rdev->sysfs_state); 10372 wait_event_timeout(rdev->blocked_wait, !rdev_blocked(rdev), 10373 msecs_to_jiffies(5000)); 10374 rdev_dec_pending(rdev, mddev); 10375} 10376EXPORT_SYMBOL(md_wait_for_blocked_rdev); 10377 10378void md_finish_reshape(struct mddev *mddev) 10379{ 10380 /* called be personality module when reshape completes. */ 10381 struct md_rdev *rdev; 10382 10383 rdev_for_each(rdev, mddev) { 10384 if (rdev->data_offset > rdev->new_data_offset) 10385 rdev->sectors += rdev->data_offset - rdev->new_data_offset; 10386 else 10387 rdev->sectors -= rdev->new_data_offset - rdev->data_offset; 10388 rdev->data_offset = rdev->new_data_offset; 10389 } 10390} 10391EXPORT_SYMBOL(md_finish_reshape); 10392 10393/* Bad block management */ 10394 10395/* Returns true on success, false on failure */ 10396bool rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, 10397 int is_new) 10398{ 10399 struct mddev *mddev = rdev->mddev; 10400 10401 /* 10402 * Recording new badblocks for faulty rdev will force unnecessary 10403 * super block updating. This is fragile for external management because 10404 * userspace daemon may trying to remove this device and deadlock may 10405 * occur. This will be probably solved in the mdadm, but it is safer to 10406 * avoid it. 
10407 */ 10408 if (test_bit(Faulty, &rdev->flags)) 10409 return true; 10410 10411 if (is_new) 10412 s += rdev->new_data_offset; 10413 else 10414 s += rdev->data_offset; 10415 10416 if (!badblocks_set(&rdev->badblocks, s, sectors, 0)) 10417 return false; 10418 10419 /* Make sure they get written out promptly */ 10420 if (test_bit(ExternalBbl, &rdev->flags)) 10421 sysfs_notify_dirent_safe(rdev->sysfs_unack_badblocks); 10422 sysfs_notify_dirent_safe(rdev->sysfs_state); 10423 set_mask_bits(&mddev->sb_flags, 0, 10424 BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING)); 10425 md_wakeup_thread(rdev->mddev->thread); 10426 return true; 10427} 10428EXPORT_SYMBOL_GPL(rdev_set_badblocks); 10429 10430void rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, 10431 int is_new) 10432{ 10433 if (is_new) 10434 s += rdev->new_data_offset; 10435 else 10436 s += rdev->data_offset; 10437 10438 if (!badblocks_clear(&rdev->badblocks, s, sectors)) 10439 return; 10440 10441 if (test_bit(ExternalBbl, &rdev->flags)) 10442 sysfs_notify_dirent_safe(rdev->sysfs_badblocks); 10443} 10444EXPORT_SYMBOL_GPL(rdev_clear_badblocks); 10445 10446static int md_notify_reboot(struct notifier_block *this, 10447 unsigned long code, void *x) 10448{ 10449 struct mddev *mddev; 10450 10451 spin_lock(&all_mddevs_lock); 10452 list_for_each_entry(mddev, &all_mddevs, all_mddevs) { 10453 if (!mddev_get(mddev)) 10454 continue; 10455 spin_unlock(&all_mddevs_lock); 10456 if (mddev_trylock(mddev)) { 10457 if (mddev->pers) 10458 __md_stop_writes(mddev); 10459 if (mddev->persistent) 10460 mddev->safemode = 2; 10461 mddev_unlock(mddev); 10462 } 10463 spin_lock(&all_mddevs_lock); 10464 mddev_put_locked(mddev); 10465 } 10466 spin_unlock(&all_mddevs_lock); 10467 10468 return NOTIFY_DONE; 10469} 10470 10471static struct notifier_block md_notifier = { 10472 .notifier_call = md_notify_reboot, 10473 .next = NULL, 10474 .priority = INT_MAX, /* before any real devices */ 10475}; 10476 10477static void md_geninit(void) 10478{ 10479 pr_debug("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); 10480 10481 proc_create("mdstat", S_IRUGO, NULL, &mdstat_proc_ops); 10482} 10483 10484static int __init md_init(void) 10485{ 10486 int ret = md_bitmap_init(); 10487 10488 if (ret) 10489 return ret; 10490 10491 ret = md_llbitmap_init(); 10492 if (ret) 10493 goto err_bitmap; 10494 10495 ret = -ENOMEM; 10496 md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0); 10497 if (!md_wq) 10498 goto err_wq; 10499 10500 md_misc_wq = alloc_workqueue("md_misc", 0, 0); 10501 if (!md_misc_wq) 10502 goto err_misc_wq; 10503 10504 ret = __register_blkdev(MD_MAJOR, "md", md_probe); 10505 if (ret < 0) 10506 goto err_md; 10507 10508 ret = __register_blkdev(0, "mdp", md_probe); 10509 if (ret < 0) 10510 goto err_mdp; 10511 mdp_major = ret; 10512 10513 register_reboot_notifier(&md_notifier); 10514 raid_table_header = register_sysctl("dev/raid", raid_table); 10515 10516 md_geninit(); 10517 return 0; 10518 10519err_mdp: 10520 unregister_blkdev(MD_MAJOR, "md"); 10521err_md: 10522 destroy_workqueue(md_misc_wq); 10523err_misc_wq: 10524 destroy_workqueue(md_wq); 10525err_wq: 10526 md_llbitmap_exit(); 10527err_bitmap: 10528 md_bitmap_exit(); 10529 return ret; 10530} 10531 10532static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) 10533{ 10534 struct mdp_superblock_1 *sb = page_address(rdev->sb_page); 10535 struct md_rdev *rdev2, *tmp; 10536 int role, ret; 10537 10538 /* 10539 * If size is changed in another node then we need to 10540 * do resize as well. 
10541 */ 10542 if (mddev->dev_sectors != le64_to_cpu(sb->size)) { 10543 ret = mddev->pers->resize(mddev, le64_to_cpu(sb->size)); 10544 if (ret) 10545 pr_info("md-cluster: resize failed\n"); 10546 else if (md_bitmap_enabled(mddev, false)) 10547 mddev->bitmap_ops->update_sb(mddev->bitmap); 10548 } 10549 10550 /* Check for change of roles in the active devices */ 10551 rdev_for_each_safe(rdev2, tmp, mddev) { 10552 if (test_bit(Faulty, &rdev2->flags)) { 10553 if (test_bit(ClusterRemove, &rdev2->flags)) 10554 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 10555 continue; 10556 } 10557 10558 /* Check if the roles changed */ 10559 role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]); 10560 10561 if (test_bit(Candidate, &rdev2->flags)) { 10562 if (role == MD_DISK_ROLE_FAULTY) { 10563 pr_info("md: Removing Candidate device %pg because add failed\n", 10564 rdev2->bdev); 10565 md_kick_rdev_from_array(rdev2); 10566 continue; 10567 } 10568 else 10569 clear_bit(Candidate, &rdev2->flags); 10570 } 10571 10572 if (role != rdev2->raid_disk) { 10573 /* 10574 * got activated except reshape is happening. 10575 */ 10576 if (rdev2->raid_disk == -1 && role != MD_DISK_ROLE_SPARE && 10577 !(le32_to_cpu(sb->feature_map) & 10578 MD_FEATURE_RESHAPE_ACTIVE) && 10579 !mddev->cluster_ops->resync_status_get(mddev)) { 10580 /* 10581 * -1 to make raid1_add_disk() set conf->fullsync 10582 * to 1. This could avoid skipping sync when the 10583 * remote node is down during resyncing. 10584 */ 10585 if ((le32_to_cpu(sb->feature_map) 10586 & MD_FEATURE_RECOVERY_OFFSET)) 10587 rdev2->saved_raid_disk = -1; 10588 else 10589 rdev2->saved_raid_disk = role; 10590 ret = remove_and_add_spares(mddev, rdev2); 10591 pr_info("Activated spare: %pg\n", 10592 rdev2->bdev); 10593 /* wakeup mddev->thread here, so array could 10594 * perform resync with the new activated disk */ 10595 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 10596 md_wakeup_thread(mddev->thread); 10597 } 10598 /* device faulty 10599 * We just want to do the minimum to mark the disk 10600 * as faulty. The recovery is performed by the 10601 * one who initiated the error. 10602 */ 10603 if (role == MD_DISK_ROLE_FAULTY || 10604 role == MD_DISK_ROLE_JOURNAL) { 10605 md_error(mddev, rdev2); 10606 clear_bit(Blocked, &rdev2->flags); 10607 } 10608 } 10609 } 10610 10611 if (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) { 10612 ret = update_raid_disks(mddev, le32_to_cpu(sb->raid_disks)); 10613 if (ret) 10614 pr_warn("md: updating array disks failed. %d\n", ret); 10615 } 10616 10617 /* 10618 * Since mddev->delta_disks has already updated in update_raid_disks, 10619 * so it is time to check reshape. 10620 */ 10621 if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) && 10622 (le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { 10623 /* 10624 * reshape is happening in the remote node, we need to 10625 * update reshape_position and call start_reshape. 10626 */ 10627 mddev->reshape_position = le64_to_cpu(sb->reshape_position); 10628 if (mddev->pers->update_reshape_pos) 10629 mddev->pers->update_reshape_pos(mddev); 10630 if (mddev->pers->start_reshape) 10631 mddev->pers->start_reshape(mddev); 10632 } else if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) && 10633 mddev->reshape_position != MaxSector && 10634 !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { 10635 /* reshape is just done in another node. 
		mddev->reshape_position = MaxSector;
		if (mddev->pers->update_reshape_pos)
			mddev->pers->update_reshape_pos(mddev);
	}

	/* Finally set the event to be up to date */
	mddev->events = le64_to_cpu(sb->events);
}

static int read_rdev(struct mddev *mddev, struct md_rdev *rdev)
{
	int err;
	struct page *swapout = rdev->sb_page;
	struct mdp_superblock_1 *sb;

	/*
	 * Store the sb page of the rdev in the swapout temporary
	 * variable in case we err in the future.
	 */
	rdev->sb_page = NULL;
	err = alloc_disk_sb(rdev);
	if (err == 0) {
		ClearPageUptodate(rdev->sb_page);
		rdev->sb_loaded = 0;
		err = super_types[mddev->major_version].
			load_super(rdev, NULL, mddev->minor_version);
	}
	if (err < 0) {
		pr_warn("%s: %d Could not reload rdev(%d) err: %d. Restoring old values\n",
			__func__, __LINE__, rdev->desc_nr, err);
		if (rdev->sb_page)
			put_page(rdev->sb_page);
		rdev->sb_page = swapout;
		rdev->sb_loaded = 1;
		return err;
	}

	sb = page_address(rdev->sb_page);
	/*
	 * Read the offset unconditionally, even if MD_FEATURE_RECOVERY_OFFSET
	 * is not set.
	 */
	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET))
		rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);

	/*
	 * The other node finished recovery, call spare_active to set
	 * device In_sync and mddev->degraded.
	 */
	if (rdev->recovery_offset == MaxSector &&
	    !test_bit(In_sync, &rdev->flags) &&
	    mddev->pers->spare_active(mddev))
		sysfs_notify_dirent_safe(mddev->sysfs_degraded);

	put_page(swapout);
	return 0;
}

void md_reload_sb(struct mddev *mddev, int nr)
{
	struct md_rdev *rdev = NULL, *iter;
	int err;

	/* Find the rdev */
	rdev_for_each_rcu(iter, mddev) {
		if (iter->desc_nr == nr) {
			rdev = iter;
			break;
		}
	}

	if (!rdev) {
		pr_warn("%s: %d Could not find rdev with nr %d\n", __func__, __LINE__, nr);
		return;
	}

	err = read_rdev(mddev, rdev);
	if (err < 0)
		return;

	check_sb_changes(mddev, rdev);

	/* Read all rdevs to update recovery_offset */
	rdev_for_each_rcu(rdev, mddev) {
		if (!test_bit(Faulty, &rdev->flags))
			read_rdev(mddev, rdev);
	}
}
EXPORT_SYMBOL(md_reload_sb);

#ifndef MODULE

/*
 * Searches all registered partitions for autorun RAID arrays
 * at boot time.
 */
10729 */ 10730 10731static DEFINE_MUTEX(detected_devices_mutex); 10732static LIST_HEAD(all_detected_devices); 10733struct detected_devices_node { 10734 struct list_head list; 10735 dev_t dev; 10736}; 10737 10738void md_autodetect_dev(dev_t dev) 10739{ 10740 struct detected_devices_node *node_detected_dev; 10741 10742 node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL); 10743 if (node_detected_dev) { 10744 node_detected_dev->dev = dev; 10745 mutex_lock(&detected_devices_mutex); 10746 list_add_tail(&node_detected_dev->list, &all_detected_devices); 10747 mutex_unlock(&detected_devices_mutex); 10748 } 10749} 10750 10751void md_autostart_arrays(int part) 10752{ 10753 struct md_rdev *rdev; 10754 struct detected_devices_node *node_detected_dev; 10755 dev_t dev; 10756 int i_scanned, i_passed; 10757 10758 i_scanned = 0; 10759 i_passed = 0; 10760 10761 pr_info("md: Autodetecting RAID arrays.\n"); 10762 10763 mutex_lock(&detected_devices_mutex); 10764 while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) { 10765 i_scanned++; 10766 node_detected_dev = list_entry(all_detected_devices.next, 10767 struct detected_devices_node, list); 10768 list_del(&node_detected_dev->list); 10769 dev = node_detected_dev->dev; 10770 kfree(node_detected_dev); 10771 mutex_unlock(&detected_devices_mutex); 10772 rdev = md_import_device(dev,0, 90); 10773 mutex_lock(&detected_devices_mutex); 10774 if (IS_ERR(rdev)) 10775 continue; 10776 10777 if (test_bit(Faulty, &rdev->flags)) 10778 continue; 10779 10780 set_bit(AutoDetected, &rdev->flags); 10781 list_add(&rdev->same_set, &pending_raid_disks); 10782 i_passed++; 10783 } 10784 mutex_unlock(&detected_devices_mutex); 10785 10786 pr_debug("md: Scanned %d and added %d devices.\n", i_scanned, i_passed); 10787 10788 autorun_devices(part); 10789} 10790 10791#endif /* !MODULE */ 10792 10793static __exit void md_exit(void) 10794{ 10795 struct mddev *mddev; 10796 int delay = 1; 10797 10798 unregister_blkdev(MD_MAJOR,"md"); 10799 unregister_blkdev(mdp_major, "mdp"); 10800 unregister_reboot_notifier(&md_notifier); 10801 unregister_sysctl_table(raid_table_header); 10802 10803 /* We cannot unload the modules while some process is 10804 * waiting for us in select() or poll() - wake them up 10805 */ 10806 md_unloading = 1; 10807 while (waitqueue_active(&md_event_waiters)) { 10808 /* not safe to leave yet */ 10809 wake_up(&md_event_waiters); 10810 msleep(delay); 10811 delay += delay; 10812 } 10813 remove_proc_entry("mdstat", NULL); 10814 10815 spin_lock(&all_mddevs_lock); 10816 list_for_each_entry(mddev, &all_mddevs, all_mddevs) { 10817 if (!mddev_get(mddev)) 10818 continue; 10819 spin_unlock(&all_mddevs_lock); 10820 export_array(mddev); 10821 mddev->ctime = 0; 10822 mddev->hold_active = 0; 10823 /* 10824 * As the mddev is now fully clear, mddev_put will schedule 10825 * the mddev for destruction by a workqueue, and the 10826 * destroy_workqueue() below will wait for that to complete. 
10827 */ 10828 spin_lock(&all_mddevs_lock); 10829 mddev_put_locked(mddev); 10830 } 10831 spin_unlock(&all_mddevs_lock); 10832 10833 destroy_workqueue(md_misc_wq); 10834 destroy_workqueue(md_wq); 10835 md_bitmap_exit(); 10836} 10837 10838subsys_initcall(md_init); 10839module_exit(md_exit) 10840 10841static int get_ro(char *buffer, const struct kernel_param *kp) 10842{ 10843 return sprintf(buffer, "%d\n", start_readonly); 10844} 10845static int set_ro(const char *val, const struct kernel_param *kp) 10846{ 10847 return kstrtouint(val, 10, (unsigned int *)&start_readonly); 10848} 10849 10850module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR); 10851module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR); 10852module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR); 10853module_param(create_on_open, bool, S_IRUSR|S_IWUSR); 10854module_param(legacy_async_del_gendisk, bool, 0600); 10855module_param(check_new_feature, bool, 0600); 10856 10857MODULE_LICENSE("GPL"); 10858MODULE_DESCRIPTION("MD RAID framework"); 10859MODULE_ALIAS("md"); 10860MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);