Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'md/4.7-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md

Pull MD updates from Shaohua Li:
"Several patches from Guoqing fixing md-cluster bugs and several
patches from Heinz fixing dm-raid bugs"

* tag 'md/4.7-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md:
md-cluster: check the return value of process_recvd_msg
md-cluster: gather resync infos and enable recv_thread after bitmap is ready
md: set MD_CHANGE_PENDING in an atomic region
md: raid5: add prerequisite to run underneath dm-raid
md: raid10: add prerequisite to run underneath dm-raid
md: md.c: fix oops in mddev_suspend for raid0
md-cluster: fix ifnullfree.cocci warnings
md-cluster/bitmap: unplug bitmap to sync dirty pages to disk
md-cluster/bitmap: fix wrong page num in bitmap_file_clear_bit and bitmap_file_set_bit
md-cluster/bitmap: fix wrong calculation of offset
md-cluster: sync bitmap when node received RESYNCING msg
md-cluster: always setup in-memory bitmap
md-cluster: wakeup thread if activated a spare disk
md-cluster: change array_sectors and update size are not supported
md-cluster: fix locking when node joins cluster during message broadcast
md-cluster: unregister thread if err happened
md-cluster: wake up thread to continue recovery
md-cluster: make resync_finish only called after pers->sync_request
md-cluster: change resync lock from asynchronous to synchronous

+257 -77
+6
Documentation/md-cluster.txt
··· 316 316 nodes are using the raid which is achieved by lock all bitmap 317 317 locks within the cluster, and also those locks are unlocked 318 318 accordingly. 319 + 320 + 7. Unsupported features 321 + 322 + There are some things which are not supported by cluster MD yet. 323 + 324 + - update size and change array_sectors.
+77 -11
drivers/md/bitmap.c
··· 46 46 * allocated while we're using it 47 47 */ 48 48 static int bitmap_checkpage(struct bitmap_counts *bitmap, 49 - unsigned long page, int create) 49 + unsigned long page, int create, int no_hijack) 50 50 __releases(bitmap->lock) 51 51 __acquires(bitmap->lock) 52 52 { ··· 90 90 91 91 if (mappage == NULL) { 92 92 pr_debug("md/bitmap: map page allocation failed, hijacking\n"); 93 + /* We don't support hijack for cluster raid */ 94 + if (no_hijack) 95 + return -ENOMEM; 93 96 /* failed - set the hijacked flag so that we can use the 94 97 * pointer as a counter */ 95 98 if (!bitmap->bp[page].map) ··· 759 756 bytes += sizeof(bitmap_super_t); 760 757 761 758 num_pages = DIV_ROUND_UP(bytes, PAGE_SIZE); 762 - offset = slot_number * (num_pages - 1); 759 + offset = slot_number * num_pages; 763 760 764 761 store->filemap = kmalloc(sizeof(struct page *) 765 762 * num_pages, GFP_KERNEL); ··· 903 900 struct page *page; 904 901 void *kaddr; 905 902 unsigned long chunk = block >> bitmap->counts.chunkshift; 903 + struct bitmap_storage *store = &bitmap->storage; 904 + unsigned long node_offset = 0; 905 + 906 + if (mddev_is_clustered(bitmap->mddev)) 907 + node_offset = bitmap->cluster_slot * store->file_pages; 906 908 907 909 page = filemap_get_page(&bitmap->storage, chunk); 908 910 if (!page) ··· 923 915 kunmap_atomic(kaddr); 924 916 pr_debug("set file bit %lu page %lu\n", bit, page->index); 925 917 /* record page number so it gets flushed to disk when unplug occurs */ 926 - set_page_attr(bitmap, page->index, BITMAP_PAGE_DIRTY); 918 + set_page_attr(bitmap, page->index - node_offset, BITMAP_PAGE_DIRTY); 927 919 } 928 920 929 921 static void bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block) ··· 932 924 struct page *page; 933 925 void *paddr; 934 926 unsigned long chunk = block >> bitmap->counts.chunkshift; 927 + struct bitmap_storage *store = &bitmap->storage; 928 + unsigned long node_offset = 0; 929 + 930 + if (mddev_is_clustered(bitmap->mddev)) 931 + node_offset = 
bitmap->cluster_slot * store->file_pages; 935 932 936 933 page = filemap_get_page(&bitmap->storage, chunk); 937 934 if (!page) ··· 948 935 else 949 936 clear_bit_le(bit, paddr); 950 937 kunmap_atomic(paddr); 951 - if (!test_page_attr(bitmap, page->index, BITMAP_PAGE_NEEDWRITE)) { 952 - set_page_attr(bitmap, page->index, BITMAP_PAGE_PENDING); 938 + if (!test_page_attr(bitmap, page->index - node_offset, BITMAP_PAGE_NEEDWRITE)) { 939 + set_page_attr(bitmap, page->index - node_offset, BITMAP_PAGE_PENDING); 953 940 bitmap->allclean = 0; 954 941 } 955 942 } ··· 1334 1321 sector_t csize; 1335 1322 int err; 1336 1323 1337 - err = bitmap_checkpage(bitmap, page, create); 1324 + err = bitmap_checkpage(bitmap, page, create, 0); 1338 1325 1339 1326 if (bitmap->bp[page].hijacked || 1340 1327 bitmap->bp[page].map == NULL) ··· 1607 1594 } 1608 1595 EXPORT_SYMBOL(bitmap_cond_end_sync); 1609 1596 1597 + void bitmap_sync_with_cluster(struct mddev *mddev, 1598 + sector_t old_lo, sector_t old_hi, 1599 + sector_t new_lo, sector_t new_hi) 1600 + { 1601 + struct bitmap *bitmap = mddev->bitmap; 1602 + sector_t sector, blocks = 0; 1603 + 1604 + for (sector = old_lo; sector < new_lo; ) { 1605 + bitmap_end_sync(bitmap, sector, &blocks, 0); 1606 + sector += blocks; 1607 + } 1608 + WARN((blocks > new_lo) && old_lo, "alignment is not correct for lo\n"); 1609 + 1610 + for (sector = old_hi; sector < new_hi; ) { 1611 + bitmap_start_sync(bitmap, sector, &blocks, 0); 1612 + sector += blocks; 1613 + } 1614 + WARN((blocks > new_hi) && old_hi, "alignment is not correct for hi\n"); 1615 + } 1616 + EXPORT_SYMBOL(bitmap_sync_with_cluster); 1617 + 1610 1618 static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed) 1611 1619 { 1612 1620 /* For each chunk covered by any of these sectors, set the ··· 1848 1814 if (!bitmap) 1849 1815 goto out; 1850 1816 1817 + if (mddev_is_clustered(mddev)) 1818 + md_cluster_ops->load_bitmaps(mddev, mddev->bitmap_info.nodes); 1819 + 1851 1820 /* 
Clear out old bitmap info first: Either there is none, or we 1852 1821 * are resuming after someone else has possibly changed things, 1853 1822 * so we should forget old cached info. ··· 1927 1890 1928 1891 if (clear_bits) { 1929 1892 bitmap_update_sb(bitmap); 1930 - /* Setting this for the ev_page should be enough. 1931 - * And we do not require both write_all and PAGE_DIRT either 1932 - */ 1893 + /* BITMAP_PAGE_PENDING is set, but bitmap_unplug needs 1894 + * BITMAP_PAGE_DIRTY or _NEEDWRITE to write ... */ 1933 1895 for (i = 0; i < bitmap->storage.file_pages; i++) 1934 - set_page_attr(bitmap, i, BITMAP_PAGE_DIRTY); 1935 - bitmap_write_all(bitmap); 1896 + if (test_page_attr(bitmap, i, BITMAP_PAGE_PENDING)) 1897 + set_page_attr(bitmap, i, BITMAP_PAGE_NEEDWRITE); 1936 1898 bitmap_unplug(bitmap); 1937 1899 } 1900 + bitmap_unplug(mddev->bitmap); 1938 1901 *low = lo; 1939 1902 *high = hi; 1940 1903 err: ··· 2069 2032 chunks << chunkshift); 2070 2033 2071 2034 spin_lock_irq(&bitmap->counts.lock); 2035 + /* For cluster raid, need to pre-allocate bitmap */ 2036 + if (mddev_is_clustered(bitmap->mddev)) { 2037 + unsigned long page; 2038 + for (page = 0; page < pages; page++) { 2039 + ret = bitmap_checkpage(&bitmap->counts, page, 1, 1); 2040 + if (ret) { 2041 + unsigned long k; 2042 + 2043 + /* deallocate the page memory */ 2044 + for (k = 0; k < page; k++) { 2045 + kfree(new_bp[k].map); 2046 + } 2047 + 2048 + /* restore some fields from old_counts */ 2049 + bitmap->counts.bp = old_counts.bp; 2050 + bitmap->counts.pages = old_counts.pages; 2051 + bitmap->counts.missing_pages = old_counts.pages; 2052 + bitmap->counts.chunkshift = old_counts.chunkshift; 2053 + bitmap->counts.chunks = old_counts.chunks; 2054 + bitmap->mddev->bitmap_info.chunksize = 1 << (old_counts.chunkshift + 2055 + BITMAP_BLOCK_SHIFT); 2056 + blocks = old_counts.chunks << old_counts.chunkshift; 2057 + pr_err("Could not pre-allocate in-memory bitmap for cluster raid\n"); 2058 + break; 2059 + } else 2060 + 
bitmap->counts.bp[page].count += 1; 2061 + } 2062 + } 2063 + 2072 2064 for (block = 0; block < blocks; ) { 2073 2065 bitmap_counter_t *bmc_old, *bmc_new; 2074 2066 int set;
+3
drivers/md/bitmap.h
··· 258 258 void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int aborted); 259 259 void bitmap_close_sync(struct bitmap *bitmap); 260 260 void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force); 261 + void bitmap_sync_with_cluster(struct mddev *mddev, 262 + sector_t old_lo, sector_t old_hi, 263 + sector_t new_lo, sector_t new_hi); 261 264 262 265 void bitmap_unplug(struct bitmap *bitmap); 263 266 void bitmap_daemon_work(struct mddev *mddev);
+79 -17
drivers/md/md-cluster.c
··· 61 61 * the lock. 62 62 */ 63 63 #define MD_CLUSTER_SEND_LOCKED_ALREADY 5 64 + /* We should receive message after node joined cluster and 65 + * set up all the related infos such as bitmap and personality */ 66 + #define MD_CLUSTER_ALREADY_IN_CLUSTER 6 67 + #define MD_CLUSTER_PENDING_RECV_EVENT 7 64 68 65 69 66 70 struct md_cluster_info { ··· 89 85 struct completion newdisk_completion; 90 86 wait_queue_head_t wait; 91 87 unsigned long state; 88 + /* record the region in RESYNCING message */ 89 + sector_t sync_low; 90 + sector_t sync_hi; 92 91 }; 93 92 94 93 enum msg_type { ··· 291 284 goto dlm_unlock; 292 285 } 293 286 if (hi > 0) { 294 - /* TODO:Wait for current resync to get over */ 295 - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 296 287 if (lo < mddev->recovery_cp) 297 288 mddev->recovery_cp = lo; 298 - md_check_recovery(mddev); 289 + /* wake up thread to continue resync in case resync 290 + * is not finished */ 291 + if (mddev->recovery_cp != MaxSector) { 292 + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 293 + md_wakeup_thread(mddev->thread); 294 + } 299 295 } 300 296 dlm_unlock: 301 297 dlm_unlock_sync(bm_lockres); ··· 380 370 struct dlm_lock_resource *res = arg; 381 371 struct md_cluster_info *cinfo = res->mddev->cluster_info; 382 372 383 - if (mode == DLM_LOCK_EX) 384 - md_wakeup_thread(cinfo->recv_thread); 373 + if (mode == DLM_LOCK_EX) { 374 + if (test_bit(MD_CLUSTER_ALREADY_IN_CLUSTER, &cinfo->state)) 375 + md_wakeup_thread(cinfo->recv_thread); 376 + else 377 + set_bit(MD_CLUSTER_PENDING_RECV_EVENT, &cinfo->state); 378 + } 385 379 } 386 380 387 381 static void __remove_suspend_info(struct md_cluster_info *cinfo, int slot) ··· 422 408 md_wakeup_thread(mddev->thread); 423 409 return; 424 410 } 411 + 412 + /* 413 + * The bitmaps are not same for different nodes 414 + * if RESYNCING is happening in one node, then 415 + * the node which received the RESYNCING message 416 + * probably will perform resync with the region 417 + * [lo, hi] again, so we 
could reduce resync time 418 + * a lot if we can ensure that the bitmaps among 419 + * different nodes are match up well. 420 + * 421 + * sync_low/hi is used to record the region which 422 + * arrived in the previous RESYNCING message, 423 + * 424 + * Call bitmap_sync_with_cluster to clear 425 + * NEEDED_MASK and set RESYNC_MASK since 426 + * resync thread is running in another node, 427 + * so we don't need to do the resync again 428 + * with the same section */ 429 + bitmap_sync_with_cluster(mddev, cinfo->sync_low, 430 + cinfo->sync_hi, 431 + lo, hi); 432 + cinfo->sync_low = lo; 433 + cinfo->sync_hi = hi; 434 + 425 435 s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL); 426 436 if (!s) 427 437 return; ··· 520 482 __func__, __LINE__, le32_to_cpu(msg->raid_slot)); 521 483 } 522 484 523 - static void process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg) 485 + static int process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg) 524 486 { 487 + int ret = 0; 488 + 525 489 if (WARN(mddev->cluster_info->slot_number - 1 == le32_to_cpu(msg->slot), 526 490 "node %d received it's own msg\n", le32_to_cpu(msg->slot))) 527 - return; 491 + return -1; 528 492 switch (le32_to_cpu(msg->type)) { 529 493 case METADATA_UPDATED: 530 494 process_metadata_update(mddev, msg); ··· 549 509 __recover_slot(mddev, le32_to_cpu(msg->slot)); 550 510 break; 551 511 default: 512 + ret = -1; 552 513 pr_warn("%s:%d Received unknown message from %d\n", 553 514 __func__, __LINE__, msg->slot); 554 515 } 516 + return ret; 555 517 } 556 518 557 519 /* ··· 577 535 578 536 /* read lvb and wake up thread to process this message_lockres */ 579 537 memcpy(&msg, message_lockres->lksb.sb_lvbptr, sizeof(struct cluster_msg)); 580 - process_recvd_msg(thread->mddev, &msg); 538 + ret = process_recvd_msg(thread->mddev, &msg); 539 + if (ret) 540 + goto out; 581 541 582 542 /*release CR on ack_lockres*/ 583 543 ret = dlm_unlock_sync(ack_lockres); ··· 593 549 ret = dlm_lock_sync(ack_lockres, 
DLM_LOCK_CR); 594 550 if (unlikely(ret != 0)) 595 551 pr_info("lock CR on ack failed return %d\n", ret); 552 + out: 596 553 /*release CR on message_lockres*/ 597 554 ret = dlm_unlock_sync(message_lockres); 598 555 if (unlikely(ret != 0)) ··· 823 778 cinfo->token_lockres = lockres_init(mddev, "token", NULL, 0); 824 779 if (!cinfo->token_lockres) 825 780 goto err; 826 - cinfo->ack_lockres = lockres_init(mddev, "ack", ack_bast, 0); 827 - if (!cinfo->ack_lockres) 828 - goto err; 829 781 cinfo->no_new_dev_lockres = lockres_init(mddev, "no-new-dev", NULL, 0); 830 782 if (!cinfo->no_new_dev_lockres) 831 783 goto err; 832 784 785 + ret = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX); 786 + if (ret) { 787 + ret = -EAGAIN; 788 + pr_err("md-cluster: can't join cluster to avoid lock issue\n"); 789 + goto err; 790 + } 791 + cinfo->ack_lockres = lockres_init(mddev, "ack", ack_bast, 0); 792 + if (!cinfo->ack_lockres) 793 + goto err; 833 794 /* get sync CR lock on ACK. */ 834 795 if (dlm_lock_sync(cinfo->ack_lockres, DLM_LOCK_CR)) 835 796 pr_err("md-cluster: failed to get a sync CR lock on ACK!(%d)\n", 836 797 ret); 798 + dlm_unlock_sync(cinfo->token_lockres); 837 799 /* get sync CR lock on no-new-dev. 
*/ 838 800 if (dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR)) 839 801 pr_err("md-cluster: failed to get a sync CR lock on no-new-dev!(%d)\n", ret); ··· 861 809 if (!cinfo->resync_lockres) 862 810 goto err; 863 811 864 - ret = gather_all_resync_info(mddev, nodes); 865 - if (ret) 866 - goto err; 867 - 868 812 return 0; 869 813 err: 814 + md_unregister_thread(&cinfo->recovery_thread); 815 + md_unregister_thread(&cinfo->recv_thread); 870 816 lockres_free(cinfo->message_lockres); 871 817 lockres_free(cinfo->token_lockres); 872 818 lockres_free(cinfo->ack_lockres); ··· 876 826 mddev->cluster_info = NULL; 877 827 kfree(cinfo); 878 828 return ret; 829 + } 830 + 831 + static void load_bitmaps(struct mddev *mddev, int total_slots) 832 + { 833 + struct md_cluster_info *cinfo = mddev->cluster_info; 834 + 835 + /* load all the node's bitmap info for resync */ 836 + if (gather_all_resync_info(mddev, total_slots)) 837 + pr_err("md-cluster: failed to gather all resyn infos\n"); 838 + set_bit(MD_CLUSTER_ALREADY_IN_CLUSTER, &cinfo->state); 839 + /* wake up recv thread in case something need to be handled */ 840 + if (test_and_clear_bit(MD_CLUSTER_PENDING_RECV_EVENT, &cinfo->state)) 841 + md_wakeup_thread(cinfo->recv_thread); 879 842 } 880 843 881 844 static void resync_bitmap(struct mddev *mddev) ··· 1000 937 static int resync_start(struct mddev *mddev) 1001 938 { 1002 939 struct md_cluster_info *cinfo = mddev->cluster_info; 1003 - cinfo->resync_lockres->flags |= DLM_LKF_NOQUEUE; 1004 940 return dlm_lock_sync(cinfo->resync_lockres, DLM_LOCK_EX); 1005 941 } 1006 942 ··· 1029 967 static int resync_finish(struct mddev *mddev) 1030 968 { 1031 969 struct md_cluster_info *cinfo = mddev->cluster_info; 1032 - cinfo->resync_lockres->flags &= ~DLM_LKF_NOQUEUE; 1033 970 dlm_unlock_sync(cinfo->resync_lockres); 1034 971 return resync_info_update(mddev, 0, 0); 1035 972 } ··· 1232 1171 .add_new_disk_cancel = add_new_disk_cancel, 1233 1172 .new_disk_ack = new_disk_ack, 1234 1173 
.remove_disk = remove_disk, 1174 + .load_bitmaps = load_bitmaps, 1235 1175 .gather_bitmaps = gather_bitmaps, 1236 1176 .lock_all_bitmaps = lock_all_bitmaps, 1237 1177 .unlock_all_bitmaps = unlock_all_bitmaps,
+1
drivers/md/md-cluster.h
··· 23 23 void (*add_new_disk_cancel)(struct mddev *mddev); 24 24 int (*new_disk_ack)(struct mddev *mddev, bool ack); 25 25 int (*remove_disk)(struct mddev *mddev, struct md_rdev *rdev); 26 + void (*load_bitmaps)(struct mddev *mddev, int total_slots); 26 27 int (*gather_bitmaps)(struct md_rdev *rdev); 27 28 int (*lock_all_bitmaps)(struct mddev *mddev); 28 29 void (*unlock_all_bitmaps)(struct mddev *mddev);
+53 -33
drivers/md/md.c
··· 307 307 */ 308 308 void mddev_suspend(struct mddev *mddev) 309 309 { 310 - WARN_ON_ONCE(current == mddev->thread->tsk); 310 + WARN_ON_ONCE(mddev->thread && current == mddev->thread->tsk); 311 311 if (mddev->suspended++) 312 312 return; 313 313 synchronize_rcu(); ··· 2291 2291 return; 2292 2292 } 2293 2293 2294 + repeat: 2294 2295 if (mddev_is_clustered(mddev)) { 2295 2296 if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags)) 2296 2297 force_change = 1; 2298 + if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags)) 2299 + nospares = 1; 2297 2300 ret = md_cluster_ops->metadata_update_start(mddev); 2298 2301 /* Has someone else has updated the sb */ 2299 2302 if (!does_sb_need_changing(mddev)) { 2300 2303 if (ret == 0) 2301 2304 md_cluster_ops->metadata_update_cancel(mddev); 2302 - clear_bit(MD_CHANGE_PENDING, &mddev->flags); 2305 + bit_clear_unless(&mddev->flags, BIT(MD_CHANGE_PENDING), 2306 + BIT(MD_CHANGE_DEVS) | 2307 + BIT(MD_CHANGE_CLEAN)); 2303 2308 return; 2304 2309 } 2305 2310 } 2306 - repeat: 2311 + 2307 2312 /* First make sure individual recovery_offsets are correct */ 2308 2313 rdev_for_each(rdev, mddev) { 2309 2314 if (rdev->raid_disk >= 0 && ··· 2435 2430 md_super_wait(mddev); 2436 2431 /* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */ 2437 2432 2438 - spin_lock(&mddev->lock); 2433 + if (mddev_is_clustered(mddev) && ret == 0) 2434 + md_cluster_ops->metadata_update_finish(mddev); 2435 + 2439 2436 if (mddev->in_sync != sync_req || 2440 - test_bit(MD_CHANGE_DEVS, &mddev->flags)) { 2437 + !bit_clear_unless(&mddev->flags, BIT(MD_CHANGE_PENDING), 2438 + BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_CLEAN))) 2441 2439 /* have to write it out again */ 2442 - spin_unlock(&mddev->lock); 2443 2440 goto repeat; 2444 - } 2445 - clear_bit(MD_CHANGE_PENDING, &mddev->flags); 2446 - spin_unlock(&mddev->lock); 2447 2441 wake_up(&mddev->sb_wait); 2448 2442 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 2449 2443 sysfs_notify(&mddev->kobj, NULL, 
"sync_completed"); ··· 2456 2452 clear_bit(BlockedBadBlocks, &rdev->flags); 2457 2453 wake_up(&rdev->blocked_wait); 2458 2454 } 2459 - 2460 - if (mddev_is_clustered(mddev) && ret == 0) 2461 - md_cluster_ops->metadata_update_finish(mddev); 2462 2455 } 2463 2456 EXPORT_SYMBOL(md_update_sb); 2464 2457 ··· 4817 4816 if (err) 4818 4817 return err; 4819 4818 4819 + /* cluster raid doesn't support change array_sectors */ 4820 + if (mddev_is_clustered(mddev)) 4821 + return -EINVAL; 4822 + 4820 4823 if (strncmp(buf, "default", 7) == 0) { 4821 4824 if (mddev->pers) 4822 4825 sectors = mddev->pers->size(mddev, 0, 0); ··· 6442 6437 int rv; 6443 6438 int fit = (num_sectors == 0); 6444 6439 6440 + /* cluster raid doesn't support update size */ 6441 + if (mddev_is_clustered(mddev)) 6442 + return -EINVAL; 6443 + 6445 6444 if (mddev->pers->resize == NULL) 6446 6445 return -EINVAL; 6447 6446 /* The "num_sectors" is the number of sectors of each device that ··· 7794 7785 struct md_rdev *rdev; 7795 7786 char *desc, *action = NULL; 7796 7787 struct blk_plug plug; 7797 - bool cluster_resync_finished = false; 7788 + int ret; 7798 7789 7799 7790 /* just incase thread restarts... 
*/ 7800 7791 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) ··· 7802 7793 if (mddev->ro) {/* never try to sync a read-only array */ 7803 7794 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 7804 7795 return; 7796 + } 7797 + 7798 + if (mddev_is_clustered(mddev)) { 7799 + ret = md_cluster_ops->resync_start(mddev); 7800 + if (ret) 7801 + goto skip; 7802 + 7803 + if (!(test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 7804 + test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) || 7805 + test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) 7806 + && ((unsigned long long)mddev->curr_resync_completed 7807 + < (unsigned long long)mddev->resync_max_sectors)) 7808 + goto skip; 7805 7809 } 7806 7810 7807 7811 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { ··· 8111 8089 mddev->curr_resync_completed = mddev->curr_resync; 8112 8090 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 8113 8091 } 8114 - /* tell personality and other nodes that we are finished */ 8115 - if (mddev_is_clustered(mddev)) { 8116 - md_cluster_ops->resync_finish(mddev); 8117 - cluster_resync_finished = true; 8118 - } 8119 8092 mddev->pers->sync_request(mddev, max_sectors, &skipped); 8120 8093 8121 8094 if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && ··· 8147 8130 } 8148 8131 } 8149 8132 skip: 8150 - set_bit(MD_CHANGE_DEVS, &mddev->flags); 8151 - 8152 8133 if (mddev_is_clustered(mddev) && 8153 - test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 8154 - !cluster_resync_finished) 8134 + ret == 0) { 8135 + /* set CHANGE_PENDING here since maybe another 8136 + * update is needed, so other nodes are informed */ 8137 + set_mask_bits(&mddev->flags, 0, 8138 + BIT(MD_CHANGE_PENDING) | BIT(MD_CHANGE_DEVS)); 8139 + md_wakeup_thread(mddev->thread); 8140 + wait_event(mddev->sb_wait, 8141 + !test_bit(MD_CHANGE_PENDING, &mddev->flags)); 8155 8142 md_cluster_ops->resync_finish(mddev); 8143 + } else 8144 + set_bit(MD_CHANGE_DEVS, &mddev->flags); 8156 8145 8157 8146 spin_lock(&mddev->lock); 8158 8147 if 
(!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { ··· 8249 8226 struct mddev *mddev = container_of(ws, struct mddev, del_work); 8250 8227 int ret = 0; 8251 8228 8252 - if (mddev_is_clustered(mddev)) { 8253 - ret = md_cluster_ops->resync_start(mddev); 8254 - if (ret) { 8255 - mddev->sync_thread = NULL; 8256 - goto out; 8257 - } 8258 - } 8259 - 8260 8229 mddev->sync_thread = md_register_thread(md_do_sync, 8261 8230 mddev, 8262 8231 "resync"); 8263 - out: 8264 8232 if (!mddev->sync_thread) { 8265 8233 if (!(mddev_is_clustered(mddev) && ret == -EAGAIN)) 8266 8234 printk(KERN_ERR "%s: could not start resync" ··· 8550 8536 int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, 8551 8537 int is_new) 8552 8538 { 8539 + struct mddev *mddev = rdev->mddev; 8553 8540 int rv; 8554 8541 if (is_new) 8555 8542 s += rdev->new_data_offset; ··· 8560 8545 if (rv == 0) { 8561 8546 /* Make sure they get written out promptly */ 8562 8547 sysfs_notify_dirent_safe(rdev->sysfs_state); 8563 - set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags); 8564 - set_bit(MD_CHANGE_PENDING, &rdev->mddev->flags); 8548 + set_mask_bits(&mddev->flags, 0, 8549 + BIT(MD_CHANGE_CLEAN) | BIT(MD_CHANGE_PENDING)); 8565 8550 md_wakeup_thread(rdev->mddev->thread); 8566 8551 return 1; 8567 8552 } else ··· 8695 8680 ret = remove_and_add_spares(mddev, rdev2); 8696 8681 pr_info("Activated spare: %s\n", 8697 8682 bdevname(rdev2->bdev,b)); 8683 + /* wakeup mddev->thread here, so array could 8684 + * perform resync with the new activated disk */ 8685 + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 8686 + md_wakeup_thread(mddev->thread); 8687 + 8698 8688 } 8699 8689 /* device faulty 8700 8690 * We just want to do the minimum to mark the disk
+2 -2
drivers/md/raid1.c
··· 1474 1474 * if recovery is running, make sure it aborts. 1475 1475 */ 1476 1476 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 1477 - set_bit(MD_CHANGE_DEVS, &mddev->flags); 1478 - set_bit(MD_CHANGE_PENDING, &mddev->flags); 1477 + set_mask_bits(&mddev->flags, 0, 1478 + BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING)); 1479 1479 printk(KERN_ALERT 1480 1480 "md/raid1:%s: Disk failure on %s, disabling device.\n" 1481 1481 "md/raid1:%s: Operation continuing on %d devices.\n",
+12 -8
drivers/md/raid10.c
··· 1102 1102 bio->bi_iter.bi_sector < conf->reshape_progress))) { 1103 1103 /* Need to update reshape_position in metadata */ 1104 1104 mddev->reshape_position = conf->reshape_progress; 1105 - set_bit(MD_CHANGE_DEVS, &mddev->flags); 1106 - set_bit(MD_CHANGE_PENDING, &mddev->flags); 1105 + set_mask_bits(&mddev->flags, 0, 1106 + BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING)); 1107 1107 md_wakeup_thread(mddev->thread); 1108 1108 wait_event(mddev->sb_wait, 1109 1109 !test_bit(MD_CHANGE_PENDING, &mddev->flags)); ··· 1591 1591 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 1592 1592 set_bit(Blocked, &rdev->flags); 1593 1593 set_bit(Faulty, &rdev->flags); 1594 - set_bit(MD_CHANGE_DEVS, &mddev->flags); 1595 - set_bit(MD_CHANGE_PENDING, &mddev->flags); 1594 + set_mask_bits(&mddev->flags, 0, 1595 + BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING)); 1596 1596 spin_unlock_irqrestore(&conf->device_lock, flags); 1597 1597 printk(KERN_ALERT 1598 1598 "md/raid10:%s: Disk failure on %s, disabling device.\n" ··· 3782 3782 return ret; 3783 3783 } 3784 3784 md_set_array_sectors(mddev, size); 3785 - set_capacity(mddev->gendisk, mddev->array_sectors); 3786 - revalidate_disk(mddev->gendisk); 3785 + if (mddev->queue) { 3786 + set_capacity(mddev->gendisk, mddev->array_sectors); 3787 + revalidate_disk(mddev->gendisk); 3788 + } 3787 3789 if (sectors > mddev->dev_sectors && 3788 3790 mddev->recovery_cp > oldsize) { 3789 3791 mddev->recovery_cp = oldsize; ··· 4595 4593 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4596 4594 } 4597 4595 mddev->resync_max_sectors = size; 4598 - set_capacity(mddev->gendisk, mddev->array_sectors); 4599 - revalidate_disk(mddev->gendisk); 4596 + if (mddev->queue) { 4597 + set_capacity(mddev->gendisk, mddev->array_sectors); 4598 + revalidate_disk(mddev->gendisk); 4599 + } 4600 4600 } else { 4601 4601 int d; 4602 4602 for (d = conf->geo.raid_disks ;
+2 -2
drivers/md/raid5-cache.c
··· 712 712 * in_teardown check workaround this issue. 713 713 */ 714 714 if (!log->in_teardown) { 715 - set_bit(MD_CHANGE_DEVS, &mddev->flags); 716 - set_bit(MD_CHANGE_PENDING, &mddev->flags); 715 + set_mask_bits(&mddev->flags, 0, 716 + BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING)); 717 717 md_wakeup_thread(mddev->thread); 718 718 wait_event(mddev->sb_wait, 719 719 !test_bit(MD_CHANGE_PENDING, &mddev->flags) ||
+6 -4
drivers/md/raid5.c
··· 2514 2514 2515 2515 set_bit(Blocked, &rdev->flags); 2516 2516 set_bit(Faulty, &rdev->flags); 2517 - set_bit(MD_CHANGE_DEVS, &mddev->flags); 2518 - set_bit(MD_CHANGE_PENDING, &mddev->flags); 2517 + set_mask_bits(&mddev->flags, 0, 2518 + BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING)); 2519 2519 printk(KERN_ALERT 2520 2520 "md/raid:%s: Disk failure on %s, disabling device.\n" 2521 2521 "md/raid:%s: Operation continuing on %d devices.\n", ··· 7572 7572 7573 7573 if (mddev->delta_disks > 0) { 7574 7574 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); 7575 - set_capacity(mddev->gendisk, mddev->array_sectors); 7576 - revalidate_disk(mddev->gendisk); 7575 + if (mddev->queue) { 7576 + set_capacity(mddev->gendisk, mddev->array_sectors); 7577 + revalidate_disk(mddev->gendisk); 7578 + } 7577 7579 } else { 7578 7580 int d; 7579 7581 spin_lock_irq(&conf->device_lock);
+16
include/linux/bitops.h
··· 227 227 }) 228 228 #endif 229 229 230 + #ifndef bit_clear_unless 231 + #define bit_clear_unless(ptr, _clear, _test) \ 232 + ({ \ 233 + const typeof(*ptr) clear = (_clear), test = (_test); \ 234 + typeof(*ptr) old, new; \ 235 + \ 236 + do { \ 237 + old = ACCESS_ONCE(*ptr); \ 238 + new = old & ~clear; \ 239 + } while (!(old & test) && \ 240 + cmpxchg(ptr, old, new) != old); \ 241 + \ 242 + !(old & test); \ 243 + }) 244 + #endif 245 + 230 246 #ifndef find_last_bit 231 247 /** 232 248 * find_last_bit - find the last set bit in a memory region