Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

md-cluster: Use a small window for resync

Suspending the entire device for resync could take too long. Instead,
resync in small chunks.

The cluster's resync window (32M) is maintained in r1conf as
cluster_sync_low and cluster_sync_high and processed in
raid1's sync_request(). If the current resync is outside the cluster
resync window:

1. Set the cluster_sync_low to curr_resync_completed.
2. Check if the sync will fit in the new window, if not issue a
wait_barrier() and set cluster_sync_low to sector_nr.
3. Set cluster_sync_high to cluster_sync_low + resync_window.
4. Send a message to all nodes so they may add it in their suspension
list.

bitmap_cond_end_sync() is modified to allow forcing a sync in order
to bring curr_resync_completed up to date with the sector passed in.

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Signed-off-by: NeilBrown <neilb@suse.de>

+43 -53
+2 -2
drivers/md/bitmap.c
··· 1570 1570 } 1571 1571 EXPORT_SYMBOL(bitmap_close_sync); 1572 1572 1573 - void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector) 1573 + void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force) 1574 1574 { 1575 1575 sector_t s = 0; 1576 1576 sector_t blocks; ··· 1581 1581 bitmap->last_end_sync = jiffies; 1582 1582 return; 1583 1583 } 1584 - if (time_before(jiffies, (bitmap->last_end_sync 1584 + if (!force && time_before(jiffies, (bitmap->last_end_sync 1585 1585 + bitmap->mddev->bitmap_info.daemon_sleep))) 1586 1586 return; 1587 1587 wait_event(bitmap->mddev->recovery_wait,
+1 -1
drivers/md/bitmap.h
··· 257 257 int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int degraded); 258 258 void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int aborted); 259 259 void bitmap_close_sync(struct bitmap *bitmap); 260 - void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector); 260 + void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force); 261 261 262 262 void bitmap_unplug(struct bitmap *bitmap); 263 263 void bitmap_daemon_work(struct mddev *mddev);
+5 -36
drivers/md/md-cluster.c
··· 802 802 return cinfo->slot_number - 1; 803 803 } 804 804 805 - static void resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi) 806 - { 807 - struct md_cluster_info *cinfo = mddev->cluster_info; 808 - 809 - add_resync_info(mddev, cinfo->bitmap_lockres, lo, hi); 810 - /* Re-acquire the lock to refresh LVB */ 811 - dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW); 812 - } 813 - 814 805 static int metadata_update_start(struct mddev *mddev) 815 806 { 816 807 return lock_comm(mddev->cluster_info); ··· 827 836 return dlm_unlock_sync(cinfo->token_lockres); 828 837 } 829 838 830 - static int resync_send(struct mddev *mddev, enum msg_type type, 831 - sector_t lo, sector_t hi) 839 + static int resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi) 832 840 { 833 841 struct md_cluster_info *cinfo = mddev->cluster_info; 834 842 struct cluster_msg cmsg; 835 843 int slot = cinfo->slot_number - 1; 836 844 845 + add_resync_info(mddev, cinfo->bitmap_lockres, lo, hi); 846 + /* Re-acquire the lock to refresh LVB */ 847 + dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW); 837 848 pr_info("%s:%d lo: %llu hi: %llu\n", __func__, __LINE__, 838 849 (unsigned long long)lo, 839 850 (unsigned long long)hi); 840 - resync_info_update(mddev, lo, hi); 841 - cmsg.type = cpu_to_le32(type); 851 + cmsg.type = cpu_to_le32(RESYNCING); 842 852 cmsg.slot = cpu_to_le32(slot); 843 853 cmsg.low = cpu_to_le64(lo); 844 854 cmsg.high = cpu_to_le64(hi); 845 855 return sendmsg(cinfo, &cmsg); 846 - } 847 - 848 - static int resync_start(struct mddev *mddev, sector_t lo, sector_t hi) 849 - { 850 - pr_info("%s:%d\n", __func__, __LINE__); 851 - return resync_send(mddev, RESYNCING, lo, hi); 852 - } 853 - 854 - static void resync_finish(struct mddev *mddev) 855 - { 856 - struct md_cluster_info *cinfo = mddev->cluster_info; 857 - struct cluster_msg cmsg; 858 - int slot = cinfo->slot_number - 1; 859 - 860 - pr_info("%s:%d\n", __func__, __LINE__); 861 - resync_send(mddev, RESYNCING, 0, 0); 862 
- if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 863 - cmsg.type = cpu_to_le32(BITMAP_NEEDS_SYNC); 864 - cmsg.slot = cpu_to_le32(slot); 865 - sendmsg(cinfo, &cmsg); 866 - } 867 856 } 868 857 869 858 static int area_resyncing(struct mddev *mddev, int direction, ··· 968 997 .leave = leave, 969 998 .slot_number = slot_number, 970 999 .resync_info_update = resync_info_update, 971 - .resync_start = resync_start, 972 - .resync_finish = resync_finish, 973 1000 .metadata_update_start = metadata_update_start, 974 1001 .metadata_update_finish = metadata_update_finish, 975 1002 .metadata_update_cancel = metadata_update_cancel,
+1 -3
drivers/md/md-cluster.h
··· 12 12 int (*join)(struct mddev *mddev, int nodes); 13 13 int (*leave)(struct mddev *mddev); 14 14 int (*slot_number)(struct mddev *mddev); 15 - void (*resync_info_update)(struct mddev *mddev, sector_t lo, sector_t hi); 16 - int (*resync_start)(struct mddev *mddev, sector_t lo, sector_t hi); 17 - void (*resync_finish)(struct mddev *mddev); 15 + int (*resync_info_update)(struct mddev *mddev, sector_t lo, sector_t hi); 18 16 int (*metadata_update_start)(struct mddev *mddev); 19 17 int (*metadata_update_finish)(struct mddev *mddev); 20 18 int (*metadata_update_cancel)(struct mddev *mddev);
-8
drivers/md/md.c
··· 7805 7805 md_new_event(mddev); 7806 7806 update_time = jiffies; 7807 7807 7808 - if (mddev_is_clustered(mddev)) 7809 - md_cluster_ops->resync_start(mddev, j, max_sectors); 7810 - 7811 7808 blk_start_plug(&plug); 7812 7809 while (j < max_sectors) { 7813 7810 sector_t sectors; ··· 7868 7871 j = max_sectors; 7869 7872 if (j > 2) 7870 7873 mddev->curr_resync = j; 7871 - if (mddev_is_clustered(mddev)) 7872 - md_cluster_ops->resync_info_update(mddev, j, max_sectors); 7873 7874 mddev->curr_mark_cnt = io_sectors; 7874 7875 if (last_check == 0) 7875 7876 /* this is the earliest that rebuild will be ··· 7974 7979 } 7975 7980 } 7976 7981 skip: 7977 - if (mddev_is_clustered(mddev)) 7978 - md_cluster_ops->resync_finish(mddev); 7979 - 7980 7982 set_bit(MD_CHANGE_DEVS, &mddev->flags); 7981 7983 7982 7984 spin_lock(&mddev->lock);
+25 -1
drivers/md/raid1.c
··· 90 90 #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE) 91 91 #define RESYNC_WINDOW (RESYNC_BLOCK_SIZE * RESYNC_DEPTH) 92 92 #define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9) 93 + #define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW) 94 + #define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9) 93 95 #define NEXT_NORMALIO_DISTANCE (3 * RESYNC_WINDOW_SECTORS) 94 96 95 97 static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) ··· 2490 2488 2491 2489 bitmap_close_sync(mddev->bitmap); 2492 2490 close_sync(conf); 2491 + 2492 + if (mddev_is_clustered(mddev)) { 2493 + conf->cluster_sync_low = 0; 2494 + conf->cluster_sync_high = 0; 2495 + /* Send zeros to mark end of resync */ 2496 + md_cluster_ops->resync_info_update(mddev, 0, 0); 2497 + } 2493 2498 return 0; 2494 2499 } 2495 2500 ··· 2517 2508 return sync_blocks; 2518 2509 } 2519 2510 2520 - bitmap_cond_end_sync(mddev->bitmap, sector_nr); 2511 + /* we are incrementing sector_nr below. To be safe, we check against 2512 + * sector_nr + two times RESYNC_SECTORS 2513 + */ 2514 + 2515 + bitmap_cond_end_sync(mddev->bitmap, sector_nr, 2516 + mddev_is_clustered(mddev) && (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high)); 2521 2517 r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO); 2522 2518 2523 2519 raise_barrier(conf, sector_nr); ··· 2712 2698 } while (r1_bio->bios[disk]->bi_vcnt < RESYNC_PAGES); 2713 2699 bio_full: 2714 2700 r1_bio->sectors = nr_sectors; 2701 + 2702 + if (mddev_is_clustered(mddev) && 2703 + conf->cluster_sync_high < sector_nr + nr_sectors) { 2704 + conf->cluster_sync_low = mddev->curr_resync_completed; 2705 + conf->cluster_sync_high = conf->cluster_sync_low + CLUSTER_RESYNC_WINDOW_SECTORS; 2706 + /* Send resync message */ 2707 + md_cluster_ops->resync_info_update(mddev, 2708 + conf->cluster_sync_low, 2709 + conf->cluster_sync_high); 2710 + } 2715 2711 2716 2712 /* For a user-requested sync, we read all readable devices and do a 2717 2713 * compare
+7
drivers/md/raid1.h
··· 111 111 * the new thread here until we fully activate the array. 112 112 */ 113 113 struct md_thread *thread; 114 + 115 + /* Keep track of cluster resync window to send to other 116 + * nodes. 117 + */ 118 + sector_t cluster_sync_low; 119 + sector_t cluster_sync_high; 120 + 114 121 }; 115 122 116 123 /*
+1 -1
drivers/md/raid10.c
··· 3137 3137 /* resync. Schedule a read for every block at this virt offset */ 3138 3138 int count = 0; 3139 3139 3140 - bitmap_cond_end_sync(mddev->bitmap, sector_nr); 3140 + bitmap_cond_end_sync(mddev->bitmap, sector_nr, 0); 3141 3141 3142 3142 if (!bitmap_start_sync(mddev->bitmap, sector_nr, 3143 3143 &sync_blocks, mddev->degraded) &&
+1 -1
drivers/md/raid5.c
··· 5613 5613 return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ 5614 5614 } 5615 5615 5616 - bitmap_cond_end_sync(mddev->bitmap, sector_nr); 5616 + bitmap_cond_end_sync(mddev->bitmap, sector_nr, false); 5617 5617 5618 5618 sh = get_active_stripe(conf, sector_nr, 0, 1, 0); 5619 5619 if (sh == NULL) {