Merge tag 'md/4.7-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md

tjh.dev / kernel

fork atom

Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

kernel os linux

fork atom

Merge tag 'md/4.7-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md

Pull MD updates from Shaohua Li:
"Several patches from Guoqing fixing md-cluster bugs and several
patches from Heinz fixing dm-raid bugs"

* tag 'md/4.7-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md:
md-cluster: check the return value of process_recvd_msg
md-cluster: gather resync infos and enable recv_thread after bitmap is ready
md: set MD_CHANGE_PENDING in a atomic region
md: raid5: add prerequisite to run underneath dm-raid
md: raid10: add prerequisite to run underneath dm-raid
md: md.c: fix oops in mddev_suspend for raid0
md-cluster: fix ifnullfree.cocci warnings
md-cluster/bitmap: unplug bitmap to sync dirty pages to disk
md-cluster/bitmap: fix wrong page num in bitmap_file_clear_bit and bitmap_file_set_bit
md-cluster/bitmap: fix wrong calcuation of offset
md-cluster: sync bitmap when node received RESYNCING msg
md-cluster: always setup in-memory bitmap
md-cluster: wakeup thread if activated a spare disk
md-cluster: change array_sectors and update size are not supported
md-cluster: fix locking when node joins cluster during message broadcast
md-cluster: unregister thread if err happened
md-cluster: wake up thread to continue recovery
md-cluser: make resync_finish only called after pers->sync_request
md-cluster: change resync lock from asynchronous to synchronous

Linus Torvalds 10 years ago feaa7cb5 e0fb1b36

+257 -77

11 changed files

expand all collapse all

Documentation

md-cluster.txt

drivers

bitmap.c

bitmap.h

md-cluster.c

md-cluster.h

md.c

raid1.c

raid10.c

raid5-cache.c

raid5.c

include

linux

bitops.h

Documentation/md-cluster.txt

reviewed

··· 316 316 nodes are using the raid which is achieved by lock all bitmap 317 317 locks within the cluster, and also those locks are unlocked 318 318 accordingly. 319 319 + 320 320 + 7. Unsupported features 321 321 + 322 322 + There are some things which are not supported by cluster MD yet. 323 323 + 324 324 + - update size and change array_sectors.

+77 -11

drivers/md/bitmap.c

reviewed

··· 46 46 * allocated while we're using it 47 47 */ 48 48 static int bitmap_checkpage(struct bitmap_counts *bitmap, 49 49 - unsigned long page, int create) 49 49 + unsigned long page, int create, int no_hijack) 50 50 __releases(bitmap->lock) 51 51 __acquires(bitmap->lock) 52 52 { ··· 90 90 91 91 if (mappage == NULL) { 92 92 pr_debug("md/bitmap: map page allocation failed, hijacking\n"); 93 93 + /* We don't support hijack for cluster raid */ 94 94 + if (no_hijack) 95 95 + return -ENOMEM; 93 96 /* failed - set the hijacked flag so that we can use the 94 97 * pointer as a counter */ 95 98 if (!bitmap->bp[page].map) ··· 759 756 bytes += sizeof(bitmap_super_t); 760 757 761 758 num_pages = DIV_ROUND_UP(bytes, PAGE_SIZE); 762 762 - offset = slot_number * (num_pages - 1); 759 759 + offset = slot_number * num_pages; 763 760 764 761 store->filemap = kmalloc(sizeof(struct page *) 765 762 * num_pages, GFP_KERNEL); ··· 903 900 struct page *page; 904 901 void *kaddr; 905 902 unsigned long chunk = block >> bitmap->counts.chunkshift; 903 903 + struct bitmap_storage *store = &bitmap->storage; 904 904 + unsigned long node_offset = 0; 905 905 + 906 906 + if (mddev_is_clustered(bitmap->mddev)) 907 907 + node_offset = bitmap->cluster_slot * store->file_pages; 906 908 907 909 page = filemap_get_page(&bitmap->storage, chunk); 908 910 if (!page) ··· 923 915 kunmap_atomic(kaddr); 924 916 pr_debug("set file bit %lu page %lu\n", bit, page->index); 925 917 /* record page number so it gets flushed to disk when unplug occurs */ 926 926 - set_page_attr(bitmap, page->index, BITMAP_PAGE_DIRTY); 918 918 + set_page_attr(bitmap, page->index - node_offset, BITMAP_PAGE_DIRTY); 927 919 } 928 920 929 921 static void bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block) ··· 932 924 struct page *page; 933 925 void *paddr; 934 926 unsigned long chunk = block >> bitmap->counts.chunkshift; 927 927 + struct bitmap_storage *store = &bitmap->storage; 928 928 + unsigned long node_offset = 0; 929 929 + 930 
930 + if (mddev_is_clustered(bitmap->mddev)) 931 931 + node_offset = bitmap->cluster_slot * store->file_pages; 935 932 936 933 page = filemap_get_page(&bitmap->storage, chunk); 937 934 if (!page) ··· 948 935 else 949 936 clear_bit_le(bit, paddr); 950 937 kunmap_atomic(paddr); 951 951 - if (!test_page_attr(bitmap, page->index, BITMAP_PAGE_NEEDWRITE)) { 952 952 - set_page_attr(bitmap, page->index, BITMAP_PAGE_PENDING); 938 938 + if (!test_page_attr(bitmap, page->index - node_offset, BITMAP_PAGE_NEEDWRITE)) { 939 939 + set_page_attr(bitmap, page->index - node_offset, BITMAP_PAGE_PENDING); 953 940 bitmap->allclean = 0; 954 941 } 955 942 } ··· 1334 1321 sector_t csize; 1335 1322 int err; 1336 1323 1337 1337 - err = bitmap_checkpage(bitmap, page, create); 1324 1324 + err = bitmap_checkpage(bitmap, page, create, 0); 1338 1325 1339 1326 if (bitmap->bp[page].hijacked || 1340 1327 bitmap->bp[page].map == NULL) ··· 1607 1594 } 1608 1595 EXPORT_SYMBOL(bitmap_cond_end_sync); 1609 1596 1597 1597 + void bitmap_sync_with_cluster(struct mddev *mddev, 1598 1598 + sector_t old_lo, sector_t old_hi, 1599 1599 + sector_t new_lo, sector_t new_hi) 1600 1600 + { 1601 1601 + struct bitmap *bitmap = mddev->bitmap; 1602 1602 + sector_t sector, blocks = 0; 1603 1603 + 1604 1604 + for (sector = old_lo; sector < new_lo; ) { 1605 1605 + bitmap_end_sync(bitmap, sector, &blocks, 0); 1606 1606 + sector += blocks; 1607 1607 + } 1608 1608 + WARN((blocks > new_lo) && old_lo, "alignment is not correct for lo\n"); 1609 1609 + 1610 1610 + for (sector = old_hi; sector < new_hi; ) { 1611 1611 + bitmap_start_sync(bitmap, sector, &blocks, 0); 1612 1612 + sector += blocks; 1613 1613 + } 1614 1614 + WARN((blocks > new_hi) && old_hi, "alignment is not correct for hi\n"); 1615 1615 + } 1616 1616 + EXPORT_SYMBOL(bitmap_sync_with_cluster); 1617 1617 + 1610 1618 static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed) 1611 1619 { 1612 1620 /* For each chunk covered by any of these 
sectors, set the ··· 1848 1814 if (!bitmap) 1849 1815 goto out; 1850 1816 1817 1817 + if (mddev_is_clustered(mddev)) 1818 1818 + md_cluster_ops->load_bitmaps(mddev, mddev->bitmap_info.nodes); 1819 1819 + 1851 1820 /* Clear out old bitmap info first: Either there is none, or we 1852 1821 * are resuming after someone else has possibly changed things, 1853 1822 * so we should forget old cached info. ··· 1927 1890 1928 1891 if (clear_bits) { 1929 1892 bitmap_update_sb(bitmap); 1930 1930 - /* Setting this for the ev_page should be enough. 1931 1931 - * And we do not require both write_all and PAGE_DIRT either 1932 1932 - */ 1893 1893 + /* BITMAP_PAGE_PENDING is set, but bitmap_unplug needs 1894 1894 + * BITMAP_PAGE_DIRTY or _NEEDWRITE to write ... */ 1933 1895 for (i = 0; i < bitmap->storage.file_pages; i++) 1934 1934 - set_page_attr(bitmap, i, BITMAP_PAGE_DIRTY); 1935 1935 - bitmap_write_all(bitmap); 1896 1896 + if (test_page_attr(bitmap, i, BITMAP_PAGE_PENDING)) 1897 1897 + set_page_attr(bitmap, i, BITMAP_PAGE_NEEDWRITE); 1936 1898 bitmap_unplug(bitmap); 1937 1899 } 1900 1900 + bitmap_unplug(mddev->bitmap); 1938 1901 *low = lo; 1939 1902 *high = hi; 1940 1903 err: ··· 2069 2032 chunks << chunkshift); 2070 2033 2071 2034 spin_lock_irq(&bitmap->counts.lock); 2035 2035 + /* For cluster raid, need to pre-allocate bitmap */ 2036 2036 + if (mddev_is_clustered(bitmap->mddev)) { 2037 2037 + unsigned long page; 2038 2038 + for (page = 0; page < pages; page++) { 2039 2039 + ret = bitmap_checkpage(&bitmap->counts, page, 1, 1); 2040 2040 + if (ret) { 2041 2041 + unsigned long k; 2042 2042 + 2043 2043 + /* deallocate the page memory */ 2044 2044 + for (k = 0; k < page; k++) { 2045 2045 + kfree(new_bp[k].map); 2046 2046 + } 2047 2047 + 2048 2048 + /* restore some fields from old_counts */ 2049 2049 + bitmap->counts.bp = old_counts.bp; 2050 2050 + bitmap->counts.pages = old_counts.pages; 2051 2051 + bitmap->counts.missing_pages = old_counts.pages; 2052 2052 + 
bitmap->counts.chunkshift = old_counts.chunkshift; 2053 2053 + bitmap->counts.chunks = old_counts.chunks; 2054 2054 + bitmap->mddev->bitmap_info.chunksize = 1 << (old_counts.chunkshift + 2055 2055 + BITMAP_BLOCK_SHIFT); 2056 2056 + blocks = old_counts.chunks << old_counts.chunkshift; 2057 2057 + pr_err("Could not pre-allocate in-memory bitmap for cluster raid\n"); 2058 2058 + break; 2059 2059 + } else 2060 2060 + bitmap->counts.bp[page].count += 1; 2061 2061 + } 2062 2062 + } 2063 2063 + 2072 2064 for (block = 0; block < blocks; ) { 2073 2065 bitmap_counter_t *bmc_old, *bmc_new; 2074 2066 int set;

drivers/md/bitmap.h

reviewed

··· 258 258 void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int aborted); 259 259 void bitmap_close_sync(struct bitmap *bitmap); 260 260 void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force); 261 261 + void bitmap_sync_with_cluster(struct mddev *mddev, 262 262 + sector_t old_lo, sector_t old_hi, 263 263 + sector_t new_lo, sector_t new_hi); 261 264 262 265 void bitmap_unplug(struct bitmap *bitmap); 263 266 void bitmap_daemon_work(struct mddev *mddev);

+79 -17

drivers/md/md-cluster.c

reviewed

··· 61 61 * the lock. 62 62 */ 63 63 #define MD_CLUSTER_SEND_LOCKED_ALREADY 5 64 64 + /* We should receive message after node joined cluster and 65 65 + * set up all the related infos such as bitmap and personality */ 66 66 + #define MD_CLUSTER_ALREADY_IN_CLUSTER 6 67 67 + #define MD_CLUSTER_PENDING_RECV_EVENT 7 64 68 65 69 66 70 struct md_cluster_info { ··· 89 85 struct completion newdisk_completion; 90 86 wait_queue_head_t wait; 91 87 unsigned long state; 88 88 + /* record the region in RESYNCING message */ 89 89 + sector_t sync_low; 90 90 + sector_t sync_hi; 92 91 }; 93 92 94 93 enum msg_type { ··· 291 284 goto dlm_unlock; 292 285 } 293 286 if (hi > 0) { 294 294 - /* TODO:Wait for current resync to get over */ 295 295 - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 296 287 if (lo < mddev->recovery_cp) 297 288 mddev->recovery_cp = lo; 298 298 - md_check_recovery(mddev); 289 289 + /* wake up thread to continue resync in case resync 290 290 + * is not finished */ 291 291 + if (mddev->recovery_cp != MaxSector) { 292 292 + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 293 293 + md_wakeup_thread(mddev->thread); 294 294 + } 299 295 } 300 296 dlm_unlock: 301 297 dlm_unlock_sync(bm_lockres); ··· 380 370 struct dlm_lock_resource *res = arg; 381 371 struct md_cluster_info *cinfo = res->mddev->cluster_info; 382 372 383 383 - if (mode == DLM_LOCK_EX) 384 384 - md_wakeup_thread(cinfo->recv_thread); 373 373 + if (mode == DLM_LOCK_EX) { 374 374 + if (test_bit(MD_CLUSTER_ALREADY_IN_CLUSTER, &cinfo->state)) 375 375 + md_wakeup_thread(cinfo->recv_thread); 376 376 + else 377 377 + set_bit(MD_CLUSTER_PENDING_RECV_EVENT, &cinfo->state); 378 378 + } 385 379 } 386 380 387 381 static void __remove_suspend_info(struct md_cluster_info *cinfo, int slot) ··· 422 408 md_wakeup_thread(mddev->thread); 423 409 return; 424 410 } 411 411 + 412 412 + /* 413 413 + * The bitmaps are not same for different nodes 414 414 + * if RESYNCING is happening in one node, then 415 415 + * the node which 
received the RESYNCING message 416 416 + * probably will perform resync with the region 417 417 + * [lo, hi] again, so we could reduce resync time 418 418 + * a lot if we can ensure that the bitmaps among 419 419 + * different nodes are match up well. 420 420 + * 421 421 + * sync_low/hi is used to record the region which 422 422 + * arrived in the previous RESYNCING message, 423 423 + * 424 424 + * Call bitmap_sync_with_cluster to clear 425 425 + * NEEDED_MASK and set RESYNC_MASK since 426 426 + * resync thread is running in another node, 427 427 + * so we don't need to do the resync again 428 428 + * with the same section */ 429 429 + bitmap_sync_with_cluster(mddev, cinfo->sync_low, 430 430 + cinfo->sync_hi, 431 431 + lo, hi); 432 432 + cinfo->sync_low = lo; 433 433 + cinfo->sync_hi = hi; 434 434 + 425 435 s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL); 426 436 if (!s) 427 437 return; ··· 520 482 __func__, __LINE__, le32_to_cpu(msg->raid_slot)); 521 483 } 522 484 523 523 - static void process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg) 485 485 + static int process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg) 524 486 { 487 487 + int ret = 0; 488 488 + 525 489 if (WARN(mddev->cluster_info->slot_number - 1 == le32_to_cpu(msg->slot), 526 490 "node %d received it's own msg\n", le32_to_cpu(msg->slot))) 527 527 - return; 491 491 + return -1; 528 492 switch (le32_to_cpu(msg->type)) { 529 493 case METADATA_UPDATED: 530 494 process_metadata_update(mddev, msg); ··· 549 509 __recover_slot(mddev, le32_to_cpu(msg->slot)); 550 510 break; 551 511 default: 512 512 + ret = -1; 552 513 pr_warn("%s:%d Received unknown message from %d\n", 553 514 __func__, __LINE__, msg->slot); 554 515 } 516 516 + return ret; 555 517 } 556 518 557 519 /* ··· 577 535 578 536 /* read lvb and wake up thread to process this message_lockres */ 579 537 memcpy(&msg, message_lockres->lksb.sb_lvbptr, sizeof(struct cluster_msg)); 580 580 - process_recvd_msg(thread->mddev, &msg); 538 
538 + ret = process_recvd_msg(thread->mddev, &msg); 539 539 + if (ret) 540 540 + goto out; 581 541 582 542 /*release CR on ack_lockres*/ 583 543 ret = dlm_unlock_sync(ack_lockres); ··· 593 549 ret = dlm_lock_sync(ack_lockres, DLM_LOCK_CR); 594 550 if (unlikely(ret != 0)) 595 551 pr_info("lock CR on ack failed return %d\n", ret); 552 552 + out: 596 553 /*release CR on message_lockres*/ 597 554 ret = dlm_unlock_sync(message_lockres); 598 555 if (unlikely(ret != 0)) ··· 823 778 cinfo->token_lockres = lockres_init(mddev, "token", NULL, 0); 824 779 if (!cinfo->token_lockres) 825 780 goto err; 826 826 - cinfo->ack_lockres = lockres_init(mddev, "ack", ack_bast, 0); 827 827 - if (!cinfo->ack_lockres) 828 828 - goto err; 829 781 cinfo->no_new_dev_lockres = lockres_init(mddev, "no-new-dev", NULL, 0); 830 782 if (!cinfo->no_new_dev_lockres) 831 783 goto err; 832 784 785 785 + ret = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX); 786 786 + if (ret) { 787 787 + ret = -EAGAIN; 788 788 + pr_err("md-cluster: can't join cluster to avoid lock issue\n"); 789 789 + goto err; 790 790 + } 791 791 + cinfo->ack_lockres = lockres_init(mddev, "ack", ack_bast, 0); 792 792 + if (!cinfo->ack_lockres) 793 793 + goto err; 833 794 /* get sync CR lock on ACK. */ 834 795 if (dlm_lock_sync(cinfo->ack_lockres, DLM_LOCK_CR)) 835 796 pr_err("md-cluster: failed to get a sync CR lock on ACK!(%d)\n", 836 797 ret); 798 798 + dlm_unlock_sync(cinfo->token_lockres); 837 799 /* get sync CR lock on no-new-dev. 
*/ 838 800 if (dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR)) 839 801 pr_err("md-cluster: failed to get a sync CR lock on no-new-dev!(%d)\n", ret); ··· 861 809 if (!cinfo->resync_lockres) 862 810 goto err; 863 811 864 864 - ret = gather_all_resync_info(mddev, nodes); 865 865 - if (ret) 866 866 - goto err; 867 867 - 868 812 return 0; 869 813 err: 814 814 + md_unregister_thread(&cinfo->recovery_thread); 815 815 + md_unregister_thread(&cinfo->recv_thread); 870 816 lockres_free(cinfo->message_lockres); 871 817 lockres_free(cinfo->token_lockres); 872 818 lockres_free(cinfo->ack_lockres); ··· 876 826 mddev->cluster_info = NULL; 877 827 kfree(cinfo); 878 828 return ret; 829 829 + } 830 830 + 831 831 + static void load_bitmaps(struct mddev *mddev, int total_slots) 832 832 + { 833 833 + struct md_cluster_info *cinfo = mddev->cluster_info; 834 834 + 835 835 + /* load all the node's bitmap info for resync */ 836 836 + if (gather_all_resync_info(mddev, total_slots)) 837 837 + pr_err("md-cluster: failed to gather all resyn infos\n"); 838 838 + set_bit(MD_CLUSTER_ALREADY_IN_CLUSTER, &cinfo->state); 839 839 + /* wake up recv thread in case something need to be handled */ 840 840 + if (test_and_clear_bit(MD_CLUSTER_PENDING_RECV_EVENT, &cinfo->state)) 841 841 + md_wakeup_thread(cinfo->recv_thread); 879 842 } 880 843 881 844 static void resync_bitmap(struct mddev *mddev) ··· 1000 937 static int resync_start(struct mddev *mddev) 1001 938 { 1002 939 struct md_cluster_info *cinfo = mddev->cluster_info; 1003 1003 - cinfo->resync_lockres->flags |= DLM_LKF_NOQUEUE; 1004 940 return dlm_lock_sync(cinfo->resync_lockres, DLM_LOCK_EX); 1005 941 } 1006 942 ··· 1029 967 static int resync_finish(struct mddev *mddev) 1030 968 { 1031 969 struct md_cluster_info *cinfo = mddev->cluster_info; 1032 1032 - cinfo->resync_lockres->flags &= ~DLM_LKF_NOQUEUE; 1033 970 dlm_unlock_sync(cinfo->resync_lockres); 1034 971 return resync_info_update(mddev, 0, 0); 1035 972 } ··· 1232 1171 
.add_new_disk_cancel = add_new_disk_cancel, 1233 1172 .new_disk_ack = new_disk_ack, 1234 1173 .remove_disk = remove_disk, 1174 1174 + .load_bitmaps = load_bitmaps, 1235 1175 .gather_bitmaps = gather_bitmaps, 1236 1176 .lock_all_bitmaps = lock_all_bitmaps, 1237 1177 .unlock_all_bitmaps = unlock_all_bitmaps,

drivers/md/md-cluster.h

reviewed

··· 23 23 void (*add_new_disk_cancel)(struct mddev *mddev); 24 24 int (*new_disk_ack)(struct mddev *mddev, bool ack); 25 25 int (*remove_disk)(struct mddev *mddev, struct md_rdev *rdev); 26 26 + void (*load_bitmaps)(struct mddev *mddev, int total_slots); 26 27 int (*gather_bitmaps)(struct md_rdev *rdev); 27 28 int (*lock_all_bitmaps)(struct mddev *mddev); 28 29 void (*unlock_all_bitmaps)(struct mddev *mddev);

+53 -33

drivers/md/md.c

reviewed

··· 307 307 */ 308 308 void mddev_suspend(struct mddev *mddev) 309 309 { 310 310 - WARN_ON_ONCE(current == mddev->thread->tsk); 310 310 + WARN_ON_ONCE(mddev->thread && current == mddev->thread->tsk); 311 311 if (mddev->suspended++) 312 312 return; 313 313 synchronize_rcu(); ··· 2291 2291 return; 2292 2292 } 2293 2293 2294 2294 + repeat: 2294 2295 if (mddev_is_clustered(mddev)) { 2295 2296 if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags)) 2296 2297 force_change = 1; 2298 2298 + if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags)) 2299 2299 + nospares = 1; 2297 2300 ret = md_cluster_ops->metadata_update_start(mddev); 2298 2301 /* Has someone else has updated the sb */ 2299 2302 if (!does_sb_need_changing(mddev)) { 2300 2303 if (ret == 0) 2301 2304 md_cluster_ops->metadata_update_cancel(mddev); 2302 2302 - clear_bit(MD_CHANGE_PENDING, &mddev->flags); 2305 2305 + bit_clear_unless(&mddev->flags, BIT(MD_CHANGE_PENDING), 2306 2306 + BIT(MD_CHANGE_DEVS) | 2307 2307 + BIT(MD_CHANGE_CLEAN)); 2303 2308 return; 2304 2309 } 2305 2310 } 2306 2306 - repeat: 2311 2311 + 2307 2312 /* First make sure individual recovery_offsets are correct */ 2308 2313 rdev_for_each(rdev, mddev) { 2309 2314 if (rdev->raid_disk >= 0 && ··· 2435 2430 md_super_wait(mddev); 2436 2431 /* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */ 2437 2432 2438 2438 - spin_lock(&mddev->lock); 2433 2433 + if (mddev_is_clustered(mddev) && ret == 0) 2434 2434 + md_cluster_ops->metadata_update_finish(mddev); 2435 2435 + 2439 2436 if (mddev->in_sync != sync_req || 2440 2440 - test_bit(MD_CHANGE_DEVS, &mddev->flags)) { 2437 2437 + !bit_clear_unless(&mddev->flags, BIT(MD_CHANGE_PENDING), 2438 2438 + BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_CLEAN))) 2441 2439 /* have to write it out again */ 2442 2442 - spin_unlock(&mddev->lock); 2443 2440 goto repeat; 2444 2444 - } 2445 2445 - clear_bit(MD_CHANGE_PENDING, &mddev->flags); 2446 2446 - spin_unlock(&mddev->lock); 2447 2441 wake_up(&mddev->sb_wait); 
2448 2442 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 2449 2443 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); ··· 2456 2452 clear_bit(BlockedBadBlocks, &rdev->flags); 2457 2453 wake_up(&rdev->blocked_wait); 2458 2454 } 2459 2459 - 2460 2460 - if (mddev_is_clustered(mddev) && ret == 0) 2461 2461 - md_cluster_ops->metadata_update_finish(mddev); 2462 2455 } 2463 2456 EXPORT_SYMBOL(md_update_sb); 2464 2457 ··· 4817 4816 if (err) 4818 4817 return err; 4819 4818 4819 4819 + /* cluster raid doesn't support change array_sectors */ 4820 4820 + if (mddev_is_clustered(mddev)) 4821 4821 + return -EINVAL; 4822 4822 + 4820 4823 if (strncmp(buf, "default", 7) == 0) { 4821 4824 if (mddev->pers) 4822 4825 sectors = mddev->pers->size(mddev, 0, 0); ··· 6442 6437 int rv; 6443 6438 int fit = (num_sectors == 0); 6444 6439 6440 6440 + /* cluster raid doesn't support update size */ 6441 6441 + if (mddev_is_clustered(mddev)) 6442 6442 + return -EINVAL; 6443 6443 + 6445 6444 if (mddev->pers->resize == NULL) 6446 6445 return -EINVAL; 6447 6446 /* The "num_sectors" is the number of sectors of each device that ··· 7794 7785 struct md_rdev *rdev; 7795 7786 char *desc, *action = NULL; 7796 7787 struct blk_plug plug; 7797 7797 - bool cluster_resync_finished = false; 7788 7788 + int ret; 7798 7789 7799 7790 /* just incase thread restarts... 
*/ 7800 7791 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) ··· 7802 7793 if (mddev->ro) {/* never try to sync a read-only array */ 7803 7794 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 7804 7795 return; 7796 7796 + } 7797 7797 + 7798 7798 + if (mddev_is_clustered(mddev)) { 7799 7799 + ret = md_cluster_ops->resync_start(mddev); 7800 7800 + if (ret) 7801 7801 + goto skip; 7802 7802 + 7803 7803 + if (!(test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 7804 7804 + test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) || 7805 7805 + test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) 7806 7806 + && ((unsigned long long)mddev->curr_resync_completed 7807 7807 + < (unsigned long long)mddev->resync_max_sectors)) 7808 7808 + goto skip; 7805 7809 } 7806 7810 7807 7811 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { ··· 8111 8089 mddev->curr_resync_completed = mddev->curr_resync; 8112 8090 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 8113 8091 } 8114 8114 - /* tell personality and other nodes that we are finished */ 8115 8115 - if (mddev_is_clustered(mddev)) { 8116 8116 - md_cluster_ops->resync_finish(mddev); 8117 8117 - cluster_resync_finished = true; 8118 8118 - } 8119 8092 mddev->pers->sync_request(mddev, max_sectors, &skipped); 8120 8093 8121 8094 if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && ··· 8147 8130 } 8148 8131 } 8149 8132 skip: 8150 8150 - set_bit(MD_CHANGE_DEVS, &mddev->flags); 8151 8151 - 8152 8133 if (mddev_is_clustered(mddev) && 8153 8153 - test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 8154 8154 - !cluster_resync_finished) 8134 8134 + ret == 0) { 8135 8135 + /* set CHANGE_PENDING here since maybe another 8136 8136 + * update is needed, so other nodes are informed */ 8137 8137 + set_mask_bits(&mddev->flags, 0, 8138 8138 + BIT(MD_CHANGE_PENDING) | BIT(MD_CHANGE_DEVS)); 8139 8139 + md_wakeup_thread(mddev->thread); 8140 8140 + wait_event(mddev->sb_wait, 8141 8141 + !test_bit(MD_CHANGE_PENDING, &mddev->flags)); 8155 8142 
md_cluster_ops->resync_finish(mddev); 8143 8143 + } else 8144 8144 + set_bit(MD_CHANGE_DEVS, &mddev->flags); 8156 8145 8157 8146 spin_lock(&mddev->lock); 8158 8147 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { ··· 8249 8226 struct mddev *mddev = container_of(ws, struct mddev, del_work); 8250 8227 int ret = 0; 8251 8228 8252 8252 - if (mddev_is_clustered(mddev)) { 8253 8253 - ret = md_cluster_ops->resync_start(mddev); 8254 8254 - if (ret) { 8255 8255 - mddev->sync_thread = NULL; 8256 8256 - goto out; 8257 8257 - } 8258 8258 - } 8259 8259 - 8260 8229 mddev->sync_thread = md_register_thread(md_do_sync, 8261 8230 mddev, 8262 8231 "resync"); 8263 8263 - out: 8264 8232 if (!mddev->sync_thread) { 8265 8233 if (!(mddev_is_clustered(mddev) && ret == -EAGAIN)) 8266 8234 printk(KERN_ERR "%s: could not start resync" ··· 8550 8536 int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, 8551 8537 int is_new) 8552 8538 { 8539 8539 + struct mddev *mddev = rdev->mddev; 8553 8540 int rv; 8554 8541 if (is_new) 8555 8542 s += rdev->new_data_offset; ··· 8560 8545 if (rv == 0) { 8561 8546 /* Make sure they get written out promptly */ 8562 8547 sysfs_notify_dirent_safe(rdev->sysfs_state); 8563 8563 - set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags); 8564 8564 - set_bit(MD_CHANGE_PENDING, &rdev->mddev->flags); 8548 8548 + set_mask_bits(&mddev->flags, 0, 8549 8549 + BIT(MD_CHANGE_CLEAN) | BIT(MD_CHANGE_PENDING)); 8565 8550 md_wakeup_thread(rdev->mddev->thread); 8566 8551 return 1; 8567 8552 } else ··· 8695 8680 ret = remove_and_add_spares(mddev, rdev2); 8696 8681 pr_info("Activated spare: %s\n", 8697 8682 bdevname(rdev2->bdev,b)); 8683 8683 + /* wakeup mddev->thread here, so array could 8684 8684 + * perform resync with the new activated disk */ 8685 8685 + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 8686 8686 + md_wakeup_thread(mddev->thread); 8687 8687 + 8698 8688 } 8699 8689 /* device faulty 8700 8690 * We just want to do the minimum to mark the disk

+2 -2

drivers/md/raid1.c

reviewed

··· 1474 1474 * if recovery is running, make sure it aborts. 1475 1475 */ 1476 1476 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 1477 1477 - set_bit(MD_CHANGE_DEVS, &mddev->flags); 1478 1478 - set_bit(MD_CHANGE_PENDING, &mddev->flags); 1477 1477 + set_mask_bits(&mddev->flags, 0, 1478 1478 + BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING)); 1479 1479 printk(KERN_ALERT 1480 1480 "md/raid1:%s: Disk failure on %s, disabling device.\n" 1481 1481 "md/raid1:%s: Operation continuing on %d devices.\n",

+12 -8

drivers/md/raid10.c

reviewed

··· 1102 1102 bio->bi_iter.bi_sector < conf->reshape_progress))) { 1103 1103 /* Need to update reshape_position in metadata */ 1104 1104 mddev->reshape_position = conf->reshape_progress; 1105 1105 - set_bit(MD_CHANGE_DEVS, &mddev->flags); 1106 1106 - set_bit(MD_CHANGE_PENDING, &mddev->flags); 1105 1105 + set_mask_bits(&mddev->flags, 0, 1106 1106 + BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING)); 1107 1107 md_wakeup_thread(mddev->thread); 1108 1108 wait_event(mddev->sb_wait, 1109 1109 !test_bit(MD_CHANGE_PENDING, &mddev->flags)); ··· 1591 1591 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 1592 1592 set_bit(Blocked, &rdev->flags); 1593 1593 set_bit(Faulty, &rdev->flags); 1594 1594 - set_bit(MD_CHANGE_DEVS, &mddev->flags); 1595 1595 - set_bit(MD_CHANGE_PENDING, &mddev->flags); 1594 1594 + set_mask_bits(&mddev->flags, 0, 1595 1595 + BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING)); 1596 1596 spin_unlock_irqrestore(&conf->device_lock, flags); 1597 1597 printk(KERN_ALERT 1598 1598 "md/raid10:%s: Disk failure on %s, disabling device.\n" ··· 3782 3782 return ret; 3783 3783 } 3784 3784 md_set_array_sectors(mddev, size); 3785 3785 - set_capacity(mddev->gendisk, mddev->array_sectors); 3786 3786 - revalidate_disk(mddev->gendisk); 3785 3785 + if (mddev->queue) { 3786 3786 + set_capacity(mddev->gendisk, mddev->array_sectors); 3787 3787 + revalidate_disk(mddev->gendisk); 3788 3788 + } 3787 3789 if (sectors > mddev->dev_sectors && 3788 3790 mddev->recovery_cp > oldsize) { 3789 3791 mddev->recovery_cp = oldsize; ··· 4595 4593 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4596 4594 } 4597 4595 mddev->resync_max_sectors = size; 4598 4598 - set_capacity(mddev->gendisk, mddev->array_sectors); 4599 4599 - revalidate_disk(mddev->gendisk); 4596 4596 + if (mddev->queue) { 4597 4597 + set_capacity(mddev->gendisk, mddev->array_sectors); 4598 4598 + revalidate_disk(mddev->gendisk); 4599 4599 + } 4600 4600 } else { 4601 4601 int d; 4602 4602 for (d = conf->geo.raid_disks ;

+2 -2

drivers/md/raid5-cache.c

reviewed

··· 712 712 * in_teardown check workaround this issue. 713 713 */ 714 714 if (!log->in_teardown) { 715 715 - set_bit(MD_CHANGE_DEVS, &mddev->flags); 716 716 - set_bit(MD_CHANGE_PENDING, &mddev->flags); 715 715 + set_mask_bits(&mddev->flags, 0, 716 716 + BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING)); 717 717 md_wakeup_thread(mddev->thread); 718 718 wait_event(mddev->sb_wait, 719 719 !test_bit(MD_CHANGE_PENDING, &mddev->flags) ||

+6 -4

drivers/md/raid5.c

reviewed

··· 2514 2514 2515 2515 set_bit(Blocked, &rdev->flags); 2516 2516 set_bit(Faulty, &rdev->flags); 2517 2517 - set_bit(MD_CHANGE_DEVS, &mddev->flags); 2518 2518 - set_bit(MD_CHANGE_PENDING, &mddev->flags); 2517 2517 + set_mask_bits(&mddev->flags, 0, 2518 2518 + BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING)); 2519 2519 printk(KERN_ALERT 2520 2520 "md/raid:%s: Disk failure on %s, disabling device.\n" 2521 2521 "md/raid:%s: Operation continuing on %d devices.\n", ··· 7572 7572 7573 7573 if (mddev->delta_disks > 0) { 7574 7574 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); 7575 7575 - set_capacity(mddev->gendisk, mddev->array_sectors); 7576 7576 - revalidate_disk(mddev->gendisk); 7575 7575 + if (mddev->queue) { 7576 7576 + set_capacity(mddev->gendisk, mddev->array_sectors); 7577 7577 + revalidate_disk(mddev->gendisk); 7578 7578 + } 7577 7579 } else { 7578 7580 int d; 7579 7581 spin_lock_irq(&conf->device_lock);

+16

include/linux/bitops.h

reviewed

··· 227 227 }) 228 228 #endif 229 229 230 230 + #ifndef bit_clear_unless 231 231 + #define bit_clear_unless(ptr, _clear, _test) \ 232 232 + ({ \ 233 233 + const typeof(*ptr) clear = (_clear), test = (_test); \ 234 234 + typeof(*ptr) old, new; \ 235 235 + \ 236 236 + do { \ 237 237 + old = ACCESS_ONCE(*ptr); \ 238 238 + new = old & ~clear; \ 239 239 + } while (!(old & test) && \ 240 240 + cmpxchg(ptr, old, new) != old); \ 241 241 + \ 242 242 + !(old & test); \ 243 243 + }) 244 244 + #endif 245 245 + 230 246 #ifndef find_last_bit 231 247 /** 232 248 * find_last_bit - find the last set bit in a memory region