Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

dm raid: add support for the MD RAID0 personality

Add dm-raid access to the MD RAID0 personality to enable single zone
striping.

The following changes enable that access:
- add type definition to raid_types array
- make bitmap creation conditional in super_validate(), because
bitmaps are not allowed in raid0
- set rdev->sectors to the data image size in super_validate()
to allow the raid0 personality to calculate the MD array
size properly
- use mddev_(un)lock() functions instead of direct mutex_(un)lock()
(wrapped in here because it's a trivial change)
- enhance raid_status() to always report full sync for raid0
so that userspace checks for 100% sync will succeed and allow
for resize (and takeover/reshape once added in future patches)
- enhance raid_resume() to not load bitmap in case of raid0
- add merge function to avoid data corruption (seen with readahead)
that resulted from bio payloads that grew too large. This problem
did not occur with the other raid levels because it either did not
apply without striping (raid1) or was avoided via stripe caching.
- raise version to 1.7.0 because of the raid0 API change

Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
Reviewed-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>

authored by

Heinz Mauelshagen and committed by
Mike Snitzer
0cf45031 c76d53f4

+84 -47
+1
Documentation/device-mapper/dm-raid.txt
··· 225 225 1.5.1 Add ability to restore transiently failed devices on resume. 226 226 1.5.2 'mismatch_cnt' is zero unless [last_]sync_action is "check". 227 227 1.6.0 Add discard support (and devices_handle_discard_safely module param). 228 + 1.7.0 Add support for MD RAID0 mappings.
+83 -47
drivers/md/dm-raid.c
··· 1 1 /* 2 2 * Copyright (C) 2010-2011 Neil Brown 3 - * Copyright (C) 2010-2014 Red Hat, Inc. All rights reserved. 3 + * Copyright (C) 2010-2015 Red Hat, Inc. All rights reserved. 4 4 * 5 5 * This file is released under the GPL. 6 6 */ ··· 82 82 const unsigned level; /* RAID level. */ 83 83 const unsigned algorithm; /* RAID algorithm. */ 84 84 } raid_types[] = { 85 + {"raid0", "RAID0 (striping)", 0, 2, 0, 0 /* NONE */}, 85 86 {"raid1", "RAID1 (mirroring)", 0, 2, 1, 0 /* NONE */}, 86 87 {"raid10", "RAID10 (striped mirrors)", 0, 2, 10, UINT_MAX /* Varies */}, 87 88 {"raid4", "RAID4 (dedicated parity disk)", 1, 2, 5, ALGORITHM_PARITY_0}, ··· 720 719 rs->md.layout = raid10_format_to_md_layout(raid10_format, 721 720 raid10_copies); 722 721 rs->md.new_layout = rs->md.layout; 723 - } else if ((rs->raid_type->level > 1) && 722 + } else if ((!rs->raid_type->level || rs->raid_type->level > 1) && 724 723 sector_div(sectors_per_dev, 725 724 (rs->md.raid_disks - rs->raid_type->parity_devs))) { 726 725 rs->ti->error = "Target length not divisible by number of data devices"; ··· 1026 1025 return 0; 1027 1026 } 1028 1027 1029 - static int super_validate(struct mddev *mddev, struct md_rdev *rdev) 1028 + static int super_validate(struct raid_set *rs, struct md_rdev *rdev) 1030 1029 { 1030 + struct mddev *mddev = &rs->md; 1031 1031 struct dm_raid_superblock *sb = page_address(rdev->sb_page); 1032 1032 1033 1033 /* ··· 1038 1036 if (!mddev->events && super_init_validation(mddev, rdev)) 1039 1037 return -EINVAL; 1040 1038 1041 - mddev->bitmap_info.offset = 4096 >> 9; /* Enable bitmap creation */ 1042 - rdev->mddev->bitmap_info.default_offset = 4096 >> 9; 1039 + /* Enable bitmap creation for RAID levels != 0 */ 1040 + mddev->bitmap_info.offset = (rs->raid_type->level) ? 
to_sector(4096) : 0; 1041 + rdev->mddev->bitmap_info.default_offset = mddev->bitmap_info.offset; 1042 + 1043 1043 if (!test_bit(FirstUse, &rdev->flags)) { 1044 1044 rdev->recovery_offset = le64_to_cpu(sb->disk_recovery_offset); 1045 1045 if (rdev->recovery_offset != MaxSector) ··· 1085 1081 * that the "sync" directive is disallowed during the 1086 1082 * reshape. 1087 1083 */ 1084 + rdev->sectors = to_sector(i_size_read(rdev->bdev->bd_inode)); 1085 + 1088 1086 if (rs->ctr_flags & CTR_FLAG_SYNC) 1089 1087 continue; 1090 1088 ··· 1145 1139 * validation for the remaining devices. 1146 1140 */ 1147 1141 ti->error = "Unable to assemble array: Invalid superblocks"; 1148 - if (super_validate(mddev, freshest)) 1142 + if (super_validate(rs, freshest)) 1149 1143 return -EINVAL; 1150 1144 1151 1145 rdev_for_each(rdev, mddev) 1152 - if ((rdev != freshest) && super_validate(mddev, rdev)) 1146 + if ((rdev != freshest) && super_validate(rs, rdev)) 1153 1147 return -EINVAL; 1154 1148 1155 1149 return 0; ··· 1287 1281 */ 1288 1282 configure_discard_support(ti, rs); 1289 1283 1290 - mutex_lock(&rs->md.reconfig_mutex); 1284 + /* Has to be held on running the array */ 1285 + mddev_lock_nointr(&rs->md); 1291 1286 ret = md_run(&rs->md); 1292 1287 rs->md.in_sync = 0; /* Assume already marked dirty */ 1293 - mutex_unlock(&rs->md.reconfig_mutex); 1288 + mddev_unlock(&rs->md); 1294 1289 1295 1290 if (ret) { 1296 1291 ti->error = "Fail to run raid array"; ··· 1374 1367 case STATUSTYPE_INFO: 1375 1368 DMEMIT("%s %d ", rs->raid_type->name, rs->md.raid_disks); 1376 1369 1377 - if (test_bit(MD_RECOVERY_RUNNING, &rs->md.recovery)) 1378 - sync = rs->md.curr_resync_completed; 1379 - else 1380 - sync = rs->md.recovery_cp; 1370 + if (rs->raid_type->level) { 1371 + if (test_bit(MD_RECOVERY_RUNNING, &rs->md.recovery)) 1372 + sync = rs->md.curr_resync_completed; 1373 + else 1374 + sync = rs->md.recovery_cp; 1381 1375 1382 - if (sync >= rs->md.resync_max_sectors) { 1383 - /* 1384 - * Sync complete. 
1385 - */ 1376 + if (sync >= rs->md.resync_max_sectors) { 1377 + /* 1378 + * Sync complete. 1379 + */ 1380 + array_in_sync = 1; 1381 + sync = rs->md.resync_max_sectors; 1382 + } else if (test_bit(MD_RECOVERY_REQUESTED, &rs->md.recovery)) { 1383 + /* 1384 + * If "check" or "repair" is occurring, the array has 1385 + * undergone and initial sync and the health characters 1386 + * should not be 'a' anymore. 1387 + */ 1388 + array_in_sync = 1; 1389 + } else { 1390 + /* 1391 + * The array may be doing an initial sync, or it may 1392 + * be rebuilding individual components. If all the 1393 + * devices are In_sync, then it is the array that is 1394 + * being initialized. 1395 + */ 1396 + for (i = 0; i < rs->md.raid_disks; i++) 1397 + if (!test_bit(In_sync, &rs->dev[i].rdev.flags)) 1398 + array_in_sync = 1; 1399 + } 1400 + } else { 1401 + /* RAID0 */ 1386 1402 array_in_sync = 1; 1387 1403 sync = rs->md.resync_max_sectors; 1388 - } else if (test_bit(MD_RECOVERY_REQUESTED, &rs->md.recovery)) { 1389 - /* 1390 - * If "check" or "repair" is occurring, the array has 1391 - * undergone and initial sync and the health characters 1392 - * should not be 'a' anymore. 1393 - */ 1394 - array_in_sync = 1; 1395 - } else { 1396 - /* 1397 - * The array may be doing an initial sync, or it may 1398 - * be rebuilding individual components. If all the 1399 - * devices are In_sync, then it is the array that is 1400 - * being initialized. 1401 - */ 1402 - for (i = 0; i < rs->md.raid_disks; i++) 1403 - if (!test_bit(In_sync, &rs->dev[i].rdev.flags)) 1404 - array_in_sync = 1; 1405 1404 } 1406 1405 1407 1406 /* ··· 1696 1683 { 1697 1684 struct raid_set *rs = ti->private; 1698 1685 1699 - set_bit(MD_CHANGE_DEVS, &rs->md.flags); 1700 - if (!rs->bitmap_loaded) { 1701 - bitmap_load(&rs->md); 1702 - rs->bitmap_loaded = 1; 1703 - } else { 1704 - /* 1705 - * A secondary resume while the device is active. 1706 - * Take this opportunity to check whether any failed 1707 - * devices are reachable again. 
1708 - */ 1709 - attempt_restore_of_faulty_devices(rs); 1686 + if (rs->raid_type->level) { 1687 + set_bit(MD_CHANGE_DEVS, &rs->md.flags); 1688 + 1689 + if (!rs->bitmap_loaded) { 1690 + bitmap_load(&rs->md); 1691 + rs->bitmap_loaded = 1; 1692 + } else { 1693 + /* 1694 + * A secondary resume while the device is active. 1695 + * Take this opportunity to check whether any failed 1696 + * devices are reachable again. 1697 + */ 1698 + attempt_restore_of_faulty_devices(rs); 1699 + } 1700 + 1701 + clear_bit(MD_RECOVERY_FROZEN, &rs->md.recovery); 1710 1702 } 1711 1703 1712 - clear_bit(MD_RECOVERY_FROZEN, &rs->md.recovery); 1713 1704 mddev_resume(&rs->md); 1705 + } 1706 + 1707 + static int raid_merge(struct dm_target *ti, struct bvec_merge_data *bvm, 1708 + struct bio_vec *biovec, int max_size) 1709 + { 1710 + struct raid_set *rs = ti->private; 1711 + struct md_personality *pers = rs->md.pers; 1712 + 1713 + if (pers && pers->mergeable_bvec) 1714 + return min(max_size, pers->mergeable_bvec(&rs->md, bvm, biovec)); 1715 + 1716 + /* 1717 + * In case we can't request the personality because 1718 + * the raid set is not running yet 1719 + * 1720 + * -> return safe minimum 1721 + */ 1722 + return rs->md.chunk_sectors; 1714 1723 } 1715 1724 1716 1725 static struct target_type raid_target = { 1717 1726 .name = "raid", 1718 - .version = {1, 6, 0}, 1727 + .version = {1, 7, 0}, 1719 1728 .module = THIS_MODULE, 1720 1729 .ctr = raid_ctr, 1721 1730 .dtr = raid_dtr, ··· 1749 1714 .presuspend = raid_presuspend, 1750 1715 .postsuspend = raid_postsuspend, 1751 1716 .resume = raid_resume, 1717 + .merge = raid_merge, 1752 1718 }; 1753 1719 1754 1720 static int __init dm_raid_init(void)