Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drbd: cleanup ondisk meta data layout calculations and defines

Add a comment about our meta data layout variants,
and rename a few defines (e.g. MD_RESERVED_SECT -> MD_128MB_SECT)
to make it clear that they are shorthand for fixed constants,
and are not meant to be redefined arbitrarily.

Properly pad struct meta_data_on_disk to 4kB,
and in drbd_md_sync() initialize all of it to zero,
not only the first 512 bytes.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>

authored by

Lars Ellenberg and committed by
Jens Axboe
ae8bf312 9114d795

+123 -57
+22 -6
drivers/block/drbd/drbd_actlog.c
··· 209 209 current->comm, current->pid, __func__, 210 210 (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ"); 211 211 212 - err = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, MD_BLOCK_SIZE); 212 + /* we do all our meta data IO in aligned 4k blocks. */ 213 + err = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, 4096); 213 214 if (err) { 214 215 dev_err(DEV, "drbd_md_sync_page_io(,%llus,%s) failed with error %d\n", 215 216 (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ", err); ··· 351 350 (BM_EXT_SHIFT - BM_BLOCK_SHIFT)); 352 351 } 353 352 353 + static sector_t al_tr_number_to_on_disk_sector(struct drbd_conf *mdev) 354 + { 355 + const unsigned int stripes = 1; 356 + const unsigned int stripe_size_4kB = MD_32kB_SECT/MD_4kB_SECT; 357 + 358 + /* transaction number, modulo on-disk ring buffer wrap around */ 359 + unsigned int t = mdev->al_tr_number % (stripe_size_4kB * stripes); 360 + 361 + /* ... to aligned 4k on disk block */ 362 + t = ((t % stripes) * stripe_size_4kB) + t/stripes; 363 + 364 + /* ... to 512 byte sector in activity log */ 365 + t *= 8; 366 + 367 + /* ... 
plus offset to the on disk position */ 368 + return mdev->ldev->md.md_offset + mdev->ldev->md.al_offset + t; 369 + } 370 + 354 371 static int 355 372 _al_write_transaction(struct drbd_conf *mdev) 356 373 { ··· 451 432 if (mdev->al_tr_cycle >= mdev->act_log->nr_elements) 452 433 mdev->al_tr_cycle = 0; 453 434 454 - sector = mdev->ldev->md.md_offset 455 - + mdev->ldev->md.al_offset 456 - + mdev->al_tr_pos * (MD_BLOCK_SIZE>>9); 435 + sector = al_tr_number_to_on_disk_sector(mdev); 457 436 458 437 crc = crc32c(0, buffer, 4096); 459 438 buffer->crc32c = cpu_to_be32(crc); 460 439 440 + /* normal execution path goes through all three branches */ 461 441 if (drbd_bm_write_hinted(mdev)) 462 442 err = -EIO; 463 443 /* drbd_chk_io_error done already */ ··· 464 446 err = -EIO; 465 447 drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR); 466 448 } else { 467 - /* advance ringbuffer position and transaction counter */ 468 - mdev->al_tr_pos = (mdev->al_tr_pos + 1) % (MD_AL_SECTORS*512/MD_BLOCK_SIZE); 469 449 mdev->al_tr_number++; 470 450 } 471 451
+12 -1
drivers/block/drbd/drbd_bitmap.c
··· 612 612 } 613 613 } 614 614 615 + /* For the layout, see comment above drbd_md_set_sector_offsets(). */ 616 + static u64 drbd_md_on_disk_bits(struct drbd_backing_dev *ldev) 617 + { 618 + u64 bitmap_sectors; 619 + if (ldev->md.al_offset == 8) 620 + bitmap_sectors = ldev->md.md_size_sect - ldev->md.bm_offset; 621 + else 622 + bitmap_sectors = ldev->md.al_offset - ldev->md.bm_offset; 623 + return bitmap_sectors << (9 + 3); 624 + } 625 + 615 626 /* 616 627 * make sure the bitmap has enough room for the attached storage, 617 628 * if necessary, resize. ··· 679 668 words = ALIGN(bits, 64) >> LN2_BPL; 680 669 681 670 if (get_ldev(mdev)) { 682 - u64 bits_on_disk = ((u64)mdev->ldev->md.md_size_sect-MD_BM_OFFSET) << 12; 671 + u64 bits_on_disk = drbd_md_on_disk_bits(mdev->ldev); 683 672 put_ldev(mdev); 684 673 if (bits > bits_on_disk) { 685 674 dev_info(DEV, "bits = %lu\n", bits);
+50 -36
drivers/block/drbd/drbd_int.h
··· 753 753 u32 flags; 754 754 u32 md_size_sect; 755 755 756 - s32 al_offset; /* signed relative sector offset to al area */ 756 + s32 al_offset; /* signed relative sector offset to activity log */ 757 757 s32 bm_offset; /* signed relative sector offset to bitmap */ 758 - 759 - /* u32 al_nr_extents; important for restoring the AL 760 - * is stored into ldev->dc.al_extents, which in turn 761 - * gets applied to act_log->nr_elements 762 - */ 763 758 }; 764 759 765 760 struct drbd_backing_dev { ··· 1004 1009 struct lru_cache *act_log; /* activity log */ 1005 1010 unsigned int al_tr_number; 1006 1011 int al_tr_cycle; 1007 - int al_tr_pos; /* position of the next transaction in the journal */ 1008 1012 wait_queue_head_t seq_wait; 1009 1013 atomic_t packet_seq; 1010 1014 unsigned int peer_seq; ··· 1145 1151 extern void drbd_ldev_destroy(struct drbd_conf *mdev); 1146 1152 1147 1153 /* Meta data layout 1148 - We reserve a 128MB Block (4k aligned) 1149 - * either at the end of the backing device 1150 - * or on a separate meta data device. */ 1154 + * 1155 + * We currently have two possible layouts. 1156 + * Offsets in (512 byte) sectors. 1157 + * external: 1158 + * |----------- md_size_sect ------------------| 1159 + * [ 4k superblock ][ activity log ][ Bitmap ] 1160 + * | al_offset == 8 | 1161 + * | bm_offset = al_offset + X | 1162 + * ==> bitmap sectors = md_size_sect - bm_offset 1163 + * 1164 + * Variants: 1165 + * old, indexed fixed size meta data: 1166 + * 1167 + * internal: 1168 + * |----------- md_size_sect ------------------| 1169 + * [data.....][ Bitmap ][ activity log ][ 4k superblock ][padding*] 1170 + * | al_offset < 0 | 1171 + * | bm_offset = al_offset - Y | 1172 + * ==> bitmap sectors = Y = al_offset - bm_offset 1173 + * 1174 + * [padding*] are zero or up to 7 unused 512 Byte sectors to the 1175 + * end of the device, so that the [4k superblock] will be 4k aligned. 
1176 + * 1177 + * The activity log consists of 4k transaction blocks, 1178 + * which are written in a ring-buffer, or striped ring-buffer like fashion, 1179 + * which are writtensize used to be fixed 32kB, 1180 + * but is about to become configurable. 1181 + */ 1151 1182 1152 - /* The following numbers are sectors */ 1153 - /* Allows up to about 3.8TB, so if you want more, 1183 + /* Our old fixed size meta data layout 1184 + * allows up to about 3.8TB, so if you want more, 1154 1185 * you need to use the "flexible" meta data format. */ 1155 - #define MD_RESERVED_SECT (128LU << 11) /* 128 MB, unit sectors */ 1156 - #define MD_AL_OFFSET 8 /* 8 Sectors after start of meta area */ 1157 - #define MD_AL_SECTORS 64 /* = 32 kB on disk activity log ring buffer */ 1158 - #define MD_BM_OFFSET (MD_AL_OFFSET + MD_AL_SECTORS) 1159 - 1160 - /* we do all meta data IO in 4k blocks */ 1161 - #define MD_BLOCK_SHIFT 12 1162 - #define MD_BLOCK_SIZE (1<<MD_BLOCK_SHIFT) 1186 + #define MD_128MB_SECT (128LLU << 11) /* 128 MB, unit sectors */ 1187 + #define MD_4kB_SECT 8 1188 + #define MD_32kB_SECT 64 1163 1189 1164 1190 /* One activity log extent represents 4M of storage */ 1165 1191 #define AL_EXTENT_SHIFT 22 ··· 1269 1255 1270 1256 /* in one sector of the bitmap, we have this many activity_log extents. 
*/ 1271 1257 #define AL_EXT_PER_BM_SECT (1 << (BM_EXT_SHIFT - AL_EXTENT_SHIFT)) 1272 - #define BM_WORDS_PER_AL_EXT (1 << (AL_EXTENT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL)) 1273 1258 1274 1259 #define BM_BLOCKS_PER_BM_EXT_B (BM_EXT_SHIFT - BM_BLOCK_SHIFT) 1275 1260 #define BM_BLOCKS_PER_BM_EXT_MASK ((1<<BM_BLOCKS_PER_BM_EXT_B) - 1) ··· 1288 1275 */ 1289 1276 1290 1277 #define DRBD_MAX_SECTORS_32 (0xffffffffLU) 1291 - #define DRBD_MAX_SECTORS_BM \ 1292 - ((MD_RESERVED_SECT - MD_BM_OFFSET) * (1LL<<(BM_EXT_SHIFT-9))) 1293 - #if DRBD_MAX_SECTORS_BM < DRBD_MAX_SECTORS_32 1294 - #define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_BM 1295 - #define DRBD_MAX_SECTORS_FLEX DRBD_MAX_SECTORS_BM 1296 - #elif !defined(CONFIG_LBDAF) && BITS_PER_LONG == 32 1278 + /* we have a certain meta data variant that has a fixed on-disk size of 128 1279 + * MiB, of which 4k are our "superblock", and 32k are the fixed size activity 1280 + * log, leaving this many sectors for the bitmap. 1281 + */ 1282 + 1283 + #define DRBD_MAX_SECTORS_FIXED_BM \ 1284 + ((MD_128MB_SECT - MD_32kB_SECT - MD_4kB_SECT) * (1LL<<(BM_EXT_SHIFT-9))) 1285 + #if !defined(CONFIG_LBDAF) && BITS_PER_LONG == 32 1297 1286 #define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_32 1298 1287 #define DRBD_MAX_SECTORS_FLEX DRBD_MAX_SECTORS_32 1299 1288 #else 1300 - #define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_BM 1289 + #define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_FIXED_BM 1301 1290 /* 16 TB in units of sectors */ 1302 1291 #if BITS_PER_LONG == 32 1303 1292 /* adjust by one page worth of bitmap, ··· 1807 1792 switch (meta_dev_idx) { 1808 1793 case DRBD_MD_INDEX_INTERNAL: 1809 1794 case DRBD_MD_INDEX_FLEX_INT: 1810 - return bdev->md.md_offset + MD_AL_OFFSET - 1; 1795 + return bdev->md.md_offset + MD_4kB_SECT -1; 1811 1796 case DRBD_MD_INDEX_FLEX_EXT: 1812 1797 default: 1813 - return bdev->md.md_offset + bdev->md.md_size_sect; 1798 + return bdev->md.md_offset + bdev->md.md_size_sect -1; 1814 1799 } 1815 1800 } 1816 1801 ··· 1876 1861 rcu_read_unlock(); 1877 1862 1878 
1863 switch (meta_dev_idx) { 1879 - default: /* external, some index */ 1880 - return MD_RESERVED_SECT * meta_dev_idx; 1864 + default: /* external, some index; this is the old fixed size layout */ 1865 + return MD_128MB_SECT * meta_dev_idx; 1881 1866 case DRBD_MD_INDEX_INTERNAL: 1882 1867 /* with drbd08, internal meta data is always "flexible" */ 1883 1868 case DRBD_MD_INDEX_FLEX_INT: 1884 - /* sizeof(struct md_on_disk_07) == 4k 1885 - * position: last 4k aligned block of 4k size */ 1886 1869 if (!bdev->backing_bdev) { 1887 1870 if (__ratelimit(&drbd_ratelimit_state)) { 1888 1871 dev_err(DEV, "bdev->backing_bdev==NULL\n"); ··· 1888 1875 } 1889 1876 return 0; 1890 1877 } 1891 - return (drbd_get_capacity(bdev->backing_bdev) & ~7ULL) 1892 - - MD_AL_OFFSET; 1878 + /* sizeof(struct md_on_disk_07) == 4k 1879 + * position: last 4k aligned block of 4k size */ 1880 + return (drbd_get_capacity(bdev->backing_bdev) & ~7ULL) - 8; 1893 1881 case DRBD_MD_INDEX_FLEX_EXT: 1894 1882 return 0; 1895 1883 }
+8 -3
drivers/block/drbd/drbd_main.c
··· 2834 2834 rcu_read_unlock(); 2835 2835 } 2836 2836 2837 + /* aligned 4kByte */ 2837 2838 struct meta_data_on_disk { 2838 2839 u64 la_size; /* last agreed size. */ 2839 2840 u64 uuid[UI_SIZE]; /* UUIDs. */ ··· 2844 2843 u32 magic; 2845 2844 u32 md_size_sect; 2846 2845 u32 al_offset; /* offset to this block */ 2847 - u32 al_nr_extents; /* important for restoring the AL */ 2846 + u32 al_nr_extents; /* important for restoring the AL (userspace) */ 2848 2847 /* `-- act_log->nr_elements <-- ldev->dc.al_extents */ 2849 2848 u32 bm_offset; /* offset to the bitmap, from here */ 2850 2849 u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */ 2851 2850 u32 la_peer_max_bio_size; /* last peer max_bio_size */ 2852 - u32 reserved_u32[3]; 2853 2851 2852 + u8 reserved_u8[4096 - (7*8 + 8*4)]; 2854 2853 } __packed; 2855 2854 2856 2855 /** ··· 2862 2861 struct meta_data_on_disk *buffer; 2863 2862 sector_t sector; 2864 2863 int i; 2864 + 2865 + /* Don't accidentally change the DRBD meta data layout. */ 2866 + BUILD_BUG_ON(UI_SIZE != 4); 2867 + BUILD_BUG_ON(sizeof(struct meta_data_on_disk) != 4096); 2865 2868 2866 2869 del_timer(&mdev->md_sync_timer); 2867 2870 /* timer may be rearmed by drbd_md_mark_dirty() now. */ ··· 2881 2876 if (!buffer) 2882 2877 goto out; 2883 2878 2884 - memset(buffer, 0, 512); 2879 + memset(buffer, 0, sizeof(*buffer)); 2885 2880 2886 2881 buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev)); 2887 2882 for (i = UI_CURRENT; i < UI_SIZE; i++)
+31 -11
drivers/block/drbd/drbd_nl.c
··· 696 696 return 0; 697 697 } 698 698 699 - /* initializes the md.*_offset members, so we are able to find 700 - * the on disk meta data */ 699 + /* Initializes the md.*_offset members, so we are able to find 700 + * the on disk meta data. 701 + * 702 + * We currently have two possible layouts: 703 + * external: 704 + * |----------- md_size_sect ------------------| 705 + * [ 4k superblock ][ activity log ][ Bitmap ] 706 + * | al_offset == 8 | 707 + * | bm_offset = al_offset + X | 708 + * ==> bitmap sectors = md_size_sect - bm_offset 709 + * 710 + * internal: 711 + * |----------- md_size_sect ------------------| 712 + * [data.....][ Bitmap ][ activity log ][ 4k superblock ] 713 + * | al_offset < 0 | 714 + * | bm_offset = al_offset - Y | 715 + * ==> bitmap sectors = Y = al_offset - bm_offset 716 + * 717 + * Activity log size used to be fixed 32kB, 718 + * but is about to become configurable. 719 + */ 701 720 static void drbd_md_set_sector_offsets(struct drbd_conf *mdev, 702 721 struct drbd_backing_dev *bdev) 703 722 { 704 723 sector_t md_size_sect = 0; 724 + unsigned int al_size_sect = MD_32kB_SECT; 705 725 int meta_dev_idx; 706 726 707 727 rcu_read_lock(); ··· 730 710 switch (meta_dev_idx) { 731 711 default: 732 712 /* v07 style fixed size indexed meta data */ 733 - bdev->md.md_size_sect = MD_RESERVED_SECT; 713 + bdev->md.md_size_sect = MD_128MB_SECT; 734 714 bdev->md.md_offset = drbd_md_ss__(mdev, bdev); 735 - bdev->md.al_offset = MD_AL_OFFSET; 736 - bdev->md.bm_offset = MD_BM_OFFSET; 715 + bdev->md.al_offset = MD_4kB_SECT; 716 + bdev->md.bm_offset = MD_4kB_SECT + al_size_sect; 737 717 break; 738 718 case DRBD_MD_INDEX_FLEX_EXT: 739 719 /* just occupy the full device; unit: sectors */ 740 720 bdev->md.md_size_sect = drbd_get_capacity(bdev->md_bdev); 741 721 bdev->md.md_offset = 0; 742 - bdev->md.al_offset = MD_AL_OFFSET; 743 - bdev->md.bm_offset = MD_BM_OFFSET; 722 + bdev->md.al_offset = MD_4kB_SECT; 723 + bdev->md.bm_offset = MD_4kB_SECT + al_size_sect; 744 724 
break; 745 725 case DRBD_MD_INDEX_INTERNAL: 746 726 case DRBD_MD_INDEX_FLEX_INT: 747 727 bdev->md.md_offset = drbd_md_ss__(mdev, bdev); 748 728 /* al size is still fixed */ 749 - bdev->md.al_offset = -MD_AL_SECTORS; 729 + bdev->md.al_offset = -al_size_sect; 750 730 /* we need (slightly less than) ~ this much bitmap sectors: */ 751 731 md_size_sect = drbd_get_capacity(bdev->backing_bdev); 752 732 md_size_sect = ALIGN(md_size_sect, BM_SECT_PER_EXT); ··· 755 735 756 736 /* plus the "drbd meta data super block", 757 737 * and the activity log; */ 758 - md_size_sect += MD_BM_OFFSET; 738 + md_size_sect += MD_4kB_SECT + al_size_sect; 759 739 760 740 bdev->md.md_size_sect = md_size_sect; 761 741 /* bitmap offset is adjusted by 'super' block size */ 762 - bdev->md.bm_offset = -md_size_sect + MD_AL_OFFSET; 742 + bdev->md.bm_offset = -md_size_sect + MD_4kB_SECT; 763 743 break; 764 744 } 765 745 rcu_read_unlock(); ··· 1436 1416 min_md_device_sectors = (2<<10); 1437 1417 } else { 1438 1418 max_possible_sectors = DRBD_MAX_SECTORS; 1439 - min_md_device_sectors = MD_RESERVED_SECT * (new_disk_conf->meta_dev_idx + 1); 1419 + min_md_device_sectors = MD_128MB_SECT * (new_disk_conf->meta_dev_idx + 1); 1440 1420 } 1441 1421 1442 1422 if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) {