Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drbd: Allow online change of al-stripes and al-stripe-size

Allow changing the AL layout with a resize operation. For that,
the resize command gets two new fields: al_stripes and al_stripe_size.

In order to make the operation crash safe:
1) Lock out all IO and MD-IO
2) Write the super block with MDF_PRIMARY_IND clear
3) Write the bitmap to the new location (all zeros, since
we allow this only while connected)
4) Initialize the new AL-area
5) Write the super block with the restored MDF_PRIMARY_IND.
6) Unfreeze all IO

Since the AL-layout has no influence on the protocol, this operation
needs to be performed on both sides of a resource (if intended).

Signed-off-by: Andreas Gruenbacher <agruen@linbit.com>
Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>

authored by

Philipp Reisner and committed by
Jens Axboe
d752b269 e96c9633

+188 -53
+21
drivers/block/drbd/drbd_actlog.c
··· 659 659 wake_up(&mdev->al_wait); 660 660 } 661 661 662 + int drbd_initialize_al(struct drbd_conf *mdev, void *buffer) 663 + { 664 + struct al_transaction_on_disk *al = buffer; 665 + struct drbd_md *md = &mdev->ldev->md; 666 + sector_t al_base = md->md_offset + md->al_offset; 667 + int al_size_4k = md->al_stripes * md->al_stripe_size_4k; 668 + int i; 669 + 670 + memset(al, 0, 4096); 671 + al->magic = cpu_to_be32(DRBD_AL_MAGIC); 672 + al->transaction_type = cpu_to_be16(AL_TR_INITIALIZED); 673 + al->crc32c = cpu_to_be32(crc32c(0, al, 4096)); 674 + 675 + for (i = 0; i < al_size_4k; i++) { 676 + int err = drbd_md_sync_page_io(mdev, mdev->ldev, al_base + i * 8, WRITE); 677 + if (err) 678 + return err; 679 + } 680 + return 0; 681 + } 682 + 662 683 static int w_update_odbm(struct drbd_work *w, int unused) 663 684 { 664 685 struct update_odbm_work *udw = container_of(w, struct update_odbm_work, w);
+6 -1
drivers/block/drbd/drbd_int.h
··· 1133 1133 void drbd_print_uuids(struct drbd_conf *mdev, const char *text); 1134 1134 1135 1135 extern void conn_md_sync(struct drbd_tconn *tconn); 1136 + extern void drbd_md_write(struct drbd_conf *mdev, void *buffer); 1136 1137 extern void drbd_md_sync(struct drbd_conf *mdev); 1137 1138 extern int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev); 1138 1139 extern void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local); ··· 1469 1468 extern char *ppsize(char *buf, unsigned long long size); 1470 1469 extern sector_t drbd_new_dev_size(struct drbd_conf *, struct drbd_backing_dev *, sector_t, int); 1471 1470 enum determine_dev_size { 1471 + DS_ERROR_SHRINK = -3, 1472 + DS_ERROR_SPACE_MD = -2, 1472 1473 DS_ERROR = -1, 1473 1474 DS_UNCHANGED = 0, 1474 1475 DS_SHRUNK = 1, 1475 1476 DS_GREW = 2 1476 1477 }; 1477 - extern enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *, enum dds_flags) __must_hold(local); 1478 + extern enum determine_dev_size 1479 + drbd_determine_dev_size(struct drbd_conf *, enum dds_flags, struct resize_parms *) __must_hold(local); 1478 1480 extern void resync_after_online_grow(struct drbd_conf *); 1479 1481 extern void drbd_reconsider_max_bio_size(struct drbd_conf *mdev); 1480 1482 extern enum drbd_state_rv drbd_set_role(struct drbd_conf *mdev, ··· 1643 1639 #define drbd_set_out_of_sync(mdev, sector, size) \ 1644 1640 __drbd_set_out_of_sync(mdev, sector, size, __FILE__, __LINE__) 1645 1641 extern void drbd_al_shrink(struct drbd_conf *mdev); 1642 + extern int drbd_initialize_al(struct drbd_conf *, void *); 1646 1643 1647 1644 /* drbd_nl.c */ 1648 1645 /* state info broadcast */
+33 -24
drivers/block/drbd/drbd_main.c
··· 2879 2879 u8 reserved_u8[4096 - (7*8 + 10*4)]; 2880 2880 } __packed; 2881 2881 2882 - /** 2883 - * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set 2884 - * @mdev: DRBD device. 2885 - */ 2886 - void drbd_md_sync(struct drbd_conf *mdev) 2882 + 2883 + 2884 + void drbd_md_write(struct drbd_conf *mdev, void *b) 2887 2885 { 2888 - struct meta_data_on_disk *buffer; 2886 + struct meta_data_on_disk *buffer = b; 2889 2887 sector_t sector; 2890 2888 int i; 2891 - 2892 - /* Don't accidentally change the DRBD meta data layout. */ 2893 - BUILD_BUG_ON(UI_SIZE != 4); 2894 - BUILD_BUG_ON(sizeof(struct meta_data_on_disk) != 4096); 2895 - 2896 - del_timer(&mdev->md_sync_timer); 2897 - /* timer may be rearmed by drbd_md_mark_dirty() now. */ 2898 - if (!test_and_clear_bit(MD_DIRTY, &mdev->flags)) 2899 - return; 2900 - 2901 - /* We use here D_FAILED and not D_ATTACHING because we try to write 2902 - * metadata even if we detach due to a disk failure! */ 2903 - if (!get_ldev_if_state(mdev, D_FAILED)) 2904 - return; 2905 - 2906 - buffer = drbd_md_get_buffer(mdev); 2907 - if (!buffer) 2908 - goto out; 2909 2889 2910 2890 memset(buffer, 0, sizeof(*buffer)); 2911 2891 ··· 2915 2935 dev_err(DEV, "meta data update failed!\n"); 2916 2936 drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR); 2917 2937 } 2938 + } 2939 + 2940 + /** 2941 + * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set 2942 + * @mdev: DRBD device. 2943 + */ 2944 + void drbd_md_sync(struct drbd_conf *mdev) 2945 + { 2946 + struct meta_data_on_disk *buffer; 2947 + 2948 + /* Don't accidentally change the DRBD meta data layout. */ 2949 + BUILD_BUG_ON(UI_SIZE != 4); 2950 + BUILD_BUG_ON(sizeof(struct meta_data_on_disk) != 4096); 2951 + 2952 + del_timer(&mdev->md_sync_timer); 2953 + /* timer may be rearmed by drbd_md_mark_dirty() now. 
*/ 2954 + if (!test_and_clear_bit(MD_DIRTY, &mdev->flags)) 2955 + return; 2956 + 2957 + /* We use here D_FAILED and not D_ATTACHING because we try to write 2958 + * metadata even if we detach due to a disk failure! */ 2959 + if (!get_ldev_if_state(mdev, D_FAILED)) 2960 + return; 2961 + 2962 + buffer = drbd_md_get_buffer(mdev); 2963 + if (!buffer) 2964 + goto out; 2965 + 2966 + drbd_md_write(mdev, buffer); 2918 2967 2919 2968 /* Update mdev->ldev->md.la_size_sect, 2920 2969 * since we updated it on metadata. */
+111 -26
drivers/block/drbd/drbd_nl.c
··· 827 827 * Returns 0 on success, negative return values indicate errors. 828 828 * You should call drbd_md_sync() after calling this function. 829 829 */ 830 - enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds_flags flags) __must_hold(local) 830 + enum determine_dev_size 831 + drbd_determine_dev_size(struct drbd_conf *mdev, enum dds_flags flags, struct resize_parms *rs) __must_hold(local) 831 832 { 832 833 sector_t prev_first_sect, prev_size; /* previous meta location */ 833 834 sector_t la_size_sect, u_size; 835 + struct drbd_md *md = &mdev->ldev->md; 836 + u32 prev_al_stripe_size_4k; 837 + u32 prev_al_stripes; 834 838 sector_t size; 835 839 char ppb[10]; 840 + void *buffer; 836 841 837 842 int md_moved, la_size_changed; 838 843 enum determine_dev_size rv = DS_UNCHANGED; ··· 852 847 * still lock the act_log to not trigger ASSERTs there. 853 848 */ 854 849 drbd_suspend_io(mdev); 850 + buffer = drbd_md_get_buffer(mdev); /* Lock meta-data IO */ 851 + if (!buffer) { 852 + drbd_resume_io(mdev); 853 + return DS_ERROR; 854 + } 855 855 856 856 /* no wait necessary anymore, actually we could assert that */ 857 857 wait_event(mdev->al_wait, lc_try_lock(mdev->act_log)); ··· 865 855 prev_size = mdev->ldev->md.md_size_sect; 866 856 la_size_sect = mdev->ldev->md.la_size_sect; 867 857 868 - /* TODO: should only be some assert here, not (re)init... 
*/ 858 + if (rs) { 859 + /* rs is non NULL if we should change the AL layout only */ 860 + 861 + prev_al_stripes = md->al_stripes; 862 + prev_al_stripe_size_4k = md->al_stripe_size_4k; 863 + 864 + md->al_stripes = rs->al_stripes; 865 + md->al_stripe_size_4k = rs->al_stripe_size / 4; 866 + md->al_size_4k = (u64)rs->al_stripes * rs->al_stripe_size / 4; 867 + } 868 + 869 869 drbd_md_set_sector_offsets(mdev, mdev->ldev); 870 870 871 871 rcu_read_lock(); 872 872 u_size = rcu_dereference(mdev->ldev->disk_conf)->disk_size; 873 873 rcu_read_unlock(); 874 874 size = drbd_new_dev_size(mdev, mdev->ldev, u_size, flags & DDSF_FORCED); 875 + 876 + if (size < la_size_sect) { 877 + if (rs && u_size == 0) { 878 + /* Remove "rs &&" later. This check should always be active, but 879 + right now the receiver expects the permissive behavior */ 880 + dev_warn(DEV, "Implicit shrink not allowed. " 881 + "Use --size=%llus for explicit shrink.\n", 882 + (unsigned long long)size); 883 + rv = DS_ERROR_SHRINK; 884 + } 885 + if (u_size > size) 886 + rv = DS_ERROR_SPACE_MD; 887 + if (rv != DS_UNCHANGED) 888 + goto err_out; 889 + } 875 890 876 891 if (drbd_get_capacity(mdev->this_bdev) != size || 877 892 drbd_bm_capacity(mdev) != size) { ··· 921 886 dev_info(DEV, "size = %s (%llu KB)\n", ppsize(ppb, size>>1), 922 887 (unsigned long long)size>>1); 923 888 } 924 - if (rv == DS_ERROR) 925 - goto out; 889 + if (rv <= DS_ERROR) 890 + goto err_out; 926 891 927 892 la_size_changed = (la_size_sect != mdev->ldev->md.la_size_sect); 928 893 929 894 md_moved = prev_first_sect != drbd_md_first_sector(mdev->ldev) 930 895 || prev_size != mdev->ldev->md.md_size_sect; 931 896 932 - if (la_size_changed || md_moved) { 933 - int err; 897 + if (la_size_changed || md_moved || rs) { 898 + u32 prev_flags; 934 899 935 900 drbd_al_shrink(mdev); /* All extents inactive. 
*/ 901 + 902 + prev_flags = md->flags; 903 + md->flags &= ~MDF_PRIMARY_IND; 904 + drbd_md_write(mdev, buffer); 905 + 936 906 dev_info(DEV, "Writing the whole bitmap, %s\n", 937 907 la_size_changed && md_moved ? "size changed and md moved" : 938 908 la_size_changed ? "size changed" : "md moved"); 939 909 /* next line implicitly does drbd_suspend_io()+drbd_resume_io() */ 940 - err = drbd_bitmap_io(mdev, md_moved ? &drbd_bm_write_all : &drbd_bm_write, 941 - "size changed", BM_LOCKED_MASK); 942 - if (err) { 943 - rv = DS_ERROR; 944 - goto out; 945 - } 946 - drbd_md_mark_dirty(mdev); 910 + drbd_bitmap_io(mdev, md_moved ? &drbd_bm_write_all : &drbd_bm_write, 911 + "size changed", BM_LOCKED_MASK); 912 + drbd_initialize_al(mdev, buffer); 913 + 914 + md->flags = prev_flags; 915 + drbd_md_write(mdev, buffer); 916 + 917 + if (rs) 918 + dev_info(DEV, "Changed AL layout to al-stripes = %d, al-stripe-size-kB = %d\n", 919 + md->al_stripes, md->al_stripe_size_4k * 4); 947 920 } 948 921 949 922 if (size > la_size_sect) 950 923 rv = DS_GREW; 951 924 if (size < la_size_sect) 952 925 rv = DS_SHRUNK; 953 - out: 926 + 927 + if (0) { 928 + err_out: 929 + if (rs) { 930 + md->al_stripes = prev_al_stripes; 931 + md->al_stripe_size_4k = prev_al_stripe_size_4k; 932 + md->al_size_4k = (u64)prev_al_stripes * prev_al_stripe_size_4k; 933 + 934 + drbd_md_set_sector_offsets(mdev, mdev->ldev); 935 + } 936 + } 954 937 lc_unlock(mdev->act_log); 955 938 wake_up(&mdev->al_wait); 939 + drbd_md_put_buffer(mdev); 956 940 drbd_resume_io(mdev); 957 941 958 942 return rv; ··· 1672 1618 !drbd_md_test_flag(mdev->ldev, MDF_CONNECTED_IND)) 1673 1619 set_bit(USE_DEGR_WFC_T, &mdev->flags); 1674 1620 1675 - dd = drbd_determine_dev_size(mdev, 0); 1676 - if (dd == DS_ERROR) { 1621 + dd = drbd_determine_dev_size(mdev, 0, NULL); 1622 + if (dd <= DS_ERROR) { 1677 1623 retcode = ERR_NOMEM_BITMAP; 1678 1624 goto force_diskless_dec; 1679 1625 } else if (dd == DS_GREW) ··· 2370 2316 struct drbd_conf *mdev; 2371 2317 enum 
drbd_ret_code retcode; 2372 2318 enum determine_dev_size dd; 2319 + bool change_al_layout = false; 2373 2320 enum dds_flags ddsf; 2374 2321 sector_t u_size; 2375 2322 int err; ··· 2381 2326 if (retcode != NO_ERROR) 2382 2327 goto fail; 2383 2328 2329 + mdev = adm_ctx.mdev; 2330 + if (!get_ldev(mdev)) { 2331 + retcode = ERR_NO_DISK; 2332 + goto fail; 2333 + } 2334 + 2384 2335 memset(&rs, 0, sizeof(struct resize_parms)); 2336 + rs.al_stripes = mdev->ldev->md.al_stripes; 2337 + rs.al_stripe_size = mdev->ldev->md.al_stripe_size_4k * 4; 2385 2338 if (info->attrs[DRBD_NLA_RESIZE_PARMS]) { 2386 2339 err = resize_parms_from_attrs(&rs, info); 2387 2340 if (err) { 2388 2341 retcode = ERR_MANDATORY_TAG; 2389 2342 drbd_msg_put_info(from_attrs_err_to_txt(err)); 2390 - goto fail; 2343 + goto fail_ldev; 2391 2344 } 2392 2345 } 2393 2346 2394 - mdev = adm_ctx.mdev; 2395 2347 if (mdev->state.conn > C_CONNECTED) { 2396 2348 retcode = ERR_RESIZE_RESYNC; 2397 - goto fail; 2349 + goto fail_ldev; 2398 2350 } 2399 2351 2400 2352 if (mdev->state.role == R_SECONDARY && 2401 2353 mdev->state.peer == R_SECONDARY) { 2402 2354 retcode = ERR_NO_PRIMARY; 2403 - goto fail; 2404 - } 2405 - 2406 - if (!get_ldev(mdev)) { 2407 - retcode = ERR_NO_DISK; 2408 - goto fail; 2355 + goto fail_ldev; 2409 2356 } 2410 2357 2411 2358 if (rs.no_resync && mdev->tconn->agreed_pro_version < 93) { ··· 2426 2369 } 2427 2370 } 2428 2371 2372 + if (mdev->ldev->md.al_stripes != rs.al_stripes || 2373 + mdev->ldev->md.al_stripe_size_4k != rs.al_stripe_size / 4) { 2374 + u32 al_size_k = rs.al_stripes * rs.al_stripe_size; 2375 + 2376 + if (al_size_k > (16 * 1024 * 1024)) { 2377 + retcode = ERR_MD_LAYOUT_TOO_BIG; 2378 + goto fail_ldev; 2379 + } 2380 + 2381 + if (al_size_k < MD_32kB_SECT/2) { 2382 + retcode = ERR_MD_LAYOUT_TOO_SMALL; 2383 + goto fail_ldev; 2384 + } 2385 + 2386 + if (mdev->state.conn != C_CONNECTED) { 2387 + retcode = ERR_MD_LAYOUT_CONNECTED; 2388 + goto fail_ldev; 2389 + } 2390 + 2391 + change_al_layout = 
true; 2392 + } 2393 + 2429 2394 if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) 2430 2395 mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev); 2431 2396 ··· 2463 2384 } 2464 2385 2465 2386 ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ? DDSF_NO_RESYNC : 0); 2466 - dd = drbd_determine_dev_size(mdev, ddsf); 2387 + dd = drbd_determine_dev_size(mdev, ddsf, change_al_layout ? &rs : NULL); 2467 2388 drbd_md_sync(mdev); 2468 2389 put_ldev(mdev); 2469 2390 if (dd == DS_ERROR) { 2470 2391 retcode = ERR_NOMEM_BITMAP; 2392 + goto fail; 2393 + } else if (dd == DS_ERROR_SPACE_MD) { 2394 + retcode = ERR_MD_LAYOUT_NO_FIT; 2395 + goto fail; 2396 + } else if (dd == DS_ERROR_SHRINK) { 2397 + retcode = ERR_IMPLICIT_SHRINK; 2471 2398 goto fail; 2472 2399 } 2473 2400
+1 -1
drivers/block/drbd/drbd_receiver.c
··· 3617 3617 3618 3618 ddsf = be16_to_cpu(p->dds_flags); 3619 3619 if (get_ldev(mdev)) { 3620 - dd = drbd_determine_dev_size(mdev, ddsf); 3620 + dd = drbd_determine_dev_size(mdev, ddsf, NULL); 3621 3621 put_ldev(mdev); 3622 3622 if (dd == DS_ERROR) 3623 3623 return -EIO;
+5 -1
include/linux/drbd.h
··· 177 177 ERR_NEED_APV_100 = 163, 178 178 ERR_NEED_ALLOW_TWO_PRI = 164, 179 179 ERR_MD_UNCLEAN = 165, 180 - 180 + ERR_MD_LAYOUT_CONNECTED = 166, 181 + ERR_MD_LAYOUT_TOO_BIG = 167, 182 + ERR_MD_LAYOUT_TOO_SMALL = 168, 183 + ERR_MD_LAYOUT_NO_FIT = 169, 184 + ERR_IMPLICIT_SHRINK = 170, 181 185 /* insert new ones above this line */ 182 186 AFTER_LAST_ERR_CODE 183 187 };
+2
include/linux/drbd_genl.h
··· 181 181 __u64_field(1, DRBD_GENLA_F_MANDATORY, resize_size) 182 182 __flg_field(2, DRBD_GENLA_F_MANDATORY, resize_force) 183 183 __flg_field(3, DRBD_GENLA_F_MANDATORY, no_resync) 184 + __u32_field_def(4, 0 /* OPTIONAL */, al_stripes, DRBD_AL_STRIPES_DEF) 185 + __u32_field_def(5, 0 /* OPTIONAL */, al_stripe_size, DRBD_AL_STRIPE_SIZE_DEF) 184 186 ) 185 187 186 188 GENL_struct(DRBD_NLA_STATE_INFO, 8, state_info,
+9
include/linux/drbd_limits.h
··· 215 215 #define DRBD_ALWAYS_ASBP_DEF 0 216 216 #define DRBD_USE_RLE_DEF 1 217 217 218 + #define DRBD_AL_STRIPES_MIN 1 219 + #define DRBD_AL_STRIPES_MAX 1024 220 + #define DRBD_AL_STRIPES_DEF 1 221 + #define DRBD_AL_STRIPES_SCALE '1' 222 + 223 + #define DRBD_AL_STRIPE_SIZE_MIN 4 224 + #define DRBD_AL_STRIPE_SIZE_MAX 16777216 225 + #define DRBD_AL_STRIPE_SIZE_DEF 32 226 + #define DRBD_AL_STRIPE_SIZE_SCALE 'k' /* kilobytes */ 218 227 #endif