Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge tag 'dm-4.8-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm

Pull device mapper updates from Mike Snitzer:

- initially based on Jens' 'for-4.8/core' (given all the flag churn)
and later merged with 'for-4.8/core' to pick up the QUEUE_FLAG_DAX
commits that DM depends on to provide its DAX support

- clean up the bio-based vs request-based DM core code by moving the
request-based DM core code out to dm-rq.[hc]

- reinstate bio-based support in the DM multipath target (done with the
idea that fast storage like NVMe over Fabrics could benefit) -- while
preserving support for request_fn and blk-mq request-based DM mpath

- SCSI and DM multipath persistent reservation fixes that were
coordinated with Martin Petersen.

- the DM raid target saw the most extensive change this cycle; it now
provides reshape and takeover support (by layering on top of the
corresponding MD capabilities)

- DAX support for DM core and the linear, stripe and error targets

- a DM thin-provisioning block discard vs allocation race fix that
addresses the potential for corruption

- a stable fix for DM verity-fec's block calculation during decode

- a few cleanups and fixes to DM core and various targets

* tag 'dm-4.8-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: (73 commits)
dm: allow bio-based table to be upgraded to bio-based with DAX support
dm snap: add fake origin_direct_access
dm stripe: add DAX support
dm error: add DAX support
dm linear: add DAX support
dm: add infrastructure for DAX support
dm thin: fix a race condition between discarding and provisioning a block
dm btree: fix a bug in dm_btree_find_next_single()
dm raid: fix random optimal_io_size for raid0
dm raid: address checkpatch.pl complaints
dm: call PR reserve/unreserve on each underlying device
sd: don't use the ALL_TG_PT bit for reservations
dm: fix second blk_delay_queue() parameter to be in msec units not jiffies
dm raid: change logical functions to actually return bool
dm raid: use rdev_for_each in status
dm raid: use rs->raid_disks to avoid memory leaks on free
dm raid: support delta_disks for raid1, fix table output
dm raid: enhance reshape check and factor out reshape setup
dm raid: allow resize during recovery
dm raid: fix rs_is_recovering() to allow for lvextend
...

+4527 -1864
+55 -3
Documentation/device-mapper/dm-raid.txt
··· 14 14 <#raid_devs> <metadata_dev0> <dev0> [.. <metadata_devN> <devN>] 15 15 16 16 <raid_type>: 17 + raid0 RAID0 striping (no resilience) 17 18 raid1 RAID1 mirroring 18 - raid4 RAID4 dedicated parity disk 19 + raid4 RAID4 with dedicated last parity disk 20 + raid5_n RAID5 with dedicated last parity disk supporting takeover 21 + Same as raid4 22 + - Transitory layout 19 23 raid5_la RAID5 left asymmetric 20 24 - rotating parity 0 with data continuation 21 25 raid5_ra RAID5 right asymmetric ··· 34 30 - rotating parity N (right-to-left) with data restart 35 31 raid6_nc RAID6 N continue 36 32 - rotating parity N (right-to-left) with data continuation 33 + raid6_n_6 RAID6 with dedicated parity disks 34 + - parity and Q-syndrome on the last 2 disks; 35 + layout for takeover from/to raid4/raid5_n 36 + raid6_la_6 Same as "raid5_la" plus dedicated last Q-syndrome disk 37 + - layout for takeover from raid5_la from/to raid6 38 + raid6_ra_6 Same as "raid5_ra" plus dedicated last Q-syndrome disk 39 + - layout for takeover from raid5_ra from/to raid6 40 + raid6_ls_6 Same as "raid5_ls" plus dedicated last Q-syndrome disk 41 + - layout for takeover from raid5_ls from/to raid6 42 + raid6_rs_6 Same as "raid5_rs" plus dedicated last Q-syndrome disk 43 + - layout for takeover from raid5_rs from/to raid6 37 44 raid10 Various RAID10 inspired algorithms chosen by additional params 45 + (see raid10_format and raid10_copies below) 38 46 - RAID10: Striped Mirrors (aka 'Striping on top of mirrors') 39 47 - RAID1E: Integrated Adjacent Stripe Mirroring 40 48 - RAID1E: Integrated Offset Stripe Mirroring ··· 132 116 Here we see layouts closely akin to 'RAID1E - Integrated 133 117 Offset Stripe Mirroring'. 134 118 119 + [delta_disks <N>] 120 + The delta_disks option value (-251 < N < +251) triggers 121 + device removal (negative value) or device addition (positive 122 + value) to any reshape-supporting raid level 4/5/6 and 10. 
123 + RAID levels 4/5/6 allow for addition of devices (metadata 124 + and data device tuples), raid10_near and raid10_offset only 125 + allow for device addition. raid10_far does not support any 126 + reshaping at all. 127 + A minimum number of devices has to be kept to enforce resilience, 128 + which is 3 devices for raid4/5 and 4 devices for raid6. 129 + 130 + [data_offset <sectors>] 131 + This option value defines the offset into each data device 132 + where the data starts. This is used to provide out-of-place 133 + reshaping space to avoid writing over data whilst 134 + changing the layout of stripes, hence an interruption/crash 135 + may happen at any time without the risk of losing data. 136 + E.g. when adding devices to an existing raid set during 137 + forward reshaping, the out-of-place space will be allocated 138 + at the beginning of each raid device. The kernel raid4/5/6/10 139 + MD personalities supporting such device addition will read the data from 140 + the existing first stripes (those with smaller number of stripes) 141 + starting at data_offset to fill up a new stripe with the larger 142 + number of stripes, calculate the redundancy blocks (CRC/Q-syndrome) 143 + and write that new stripe to offset 0. The same is applied to all 144 + N-1 other new stripes. This out-of-place scheme is used to change 145 + the RAID type (i.e. the allocation algorithm) as well, e.g. 146 + changing from raid5_ls to raid5_n. 147 + 135 148 <#raid_devs>: The number of devices composing the array. 136 149 Each device consists of two entries. The first is the device 137 150 containing the metadata (if any); the second is the one containing the 138 - data. 151 + data. A maximum of 64 metadata/data device entries is supported 152 + up to target version 1.8.0. 153 + 1.9.0 supports up to 253 which is enforced by the MD kernel runtime. 
139 154 140 155 If a drive has failed or is missing at creation time, a '-' can be 141 156 given for both the metadata and data drives for a given position. ··· 254 207 "recover"- Initiate/continue a recover process. 255 208 "check" - Initiate a check (i.e. a "scrub") of the array. 256 209 "repair" - Initiate a repair of the array. 257 - "reshape"- Currently unsupported (-EINVAL). 258 210 259 211 260 212 Discard Support ··· 303 257 1.5.2 'mismatch_cnt' is zero unless [last_]sync_action is "check". 304 258 1.6.0 Add discard support (and devices_handle_discard_safely module param). 305 259 1.7.0 Add support for MD RAID0 mappings. 260 + 1.8.0 Explicitly check for compatible flags in the superblock metadata 261 + and refuse to start the raid set if any are set by a newer 262 + target version, thus avoiding data corruption on a raid set 263 + with a reshape in progress. 264 + 1.9.0 Add support for RAID level takeover/reshape/region size 265 + and set size reduction.
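The data_offset paragraph above describes MD's out-of-place forward reshape. The following is a toy Python model of that scheme (an illustration only, not the MD implementation; the disk/row layout and helper names are invented). It shows why writing the new, wider stripes from offset 0 never clobbers data that has not been read yet: the write row always trails the read row by at least the data_offset gap.

```python
# Toy model of the out-of-place forward reshape enabled by data_offset:
# old stripes start at row `data_offset` on each data disk; the new,
# wider stripes are written from row 0, so every write lands on rows
# whose data has already been read, and a crash mid-reshape cannot
# lose data. Parity/Q-syndrome computation is omitted for brevity.

def stripe(blocks, ndisks, rows, offset):
    """Lay out `blocks` round-robin across ndisks starting at `offset`."""
    disks = [[None] * (offset + rows) for _ in range(ndisks)]
    for i, b in enumerate(blocks):
        disks[i % ndisks][offset + i // ndisks] = b
    return disks

def reshape(disks_old, data_offset, k_new):
    """Grow from len(disks_old) data disks to k_new, in place per-disk."""
    k_old = len(disks_old)
    rows_old = len(disks_old[0]) - data_offset
    # logical order of the data in the existing (narrower) stripes
    logical = [disks_old[i % k_old][data_offset + i // k_old]
               for i in range(rows_old * k_old)]
    rows_new = -(-len(logical) // k_new)  # ceil division
    disks_new = [[None] * rows_new for _ in range(k_new)]
    for i, b in enumerate(logical):
        # safety property: the row being written is strictly below the
        # row block i was read from, thanks to the data_offset gap
        assert i // k_new < data_offset + i // k_old
        disks_new[i % k_new][i // k_new] = b
    return disks_new

if __name__ == "__main__":
    data = list(range(12))            # 12 logical blocks
    old = stripe(data, 3, 4, 2)       # 3 data disks, data_offset = 2
    # adding one disk yields the same logical data striped over 4 disks
    assert reshape(old, 2, 4) == stripe(data, 4, 3, 0)
```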
+2 -1
drivers/md/Makefile
··· 3 3 # 4 4 5 5 dm-mod-y += dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \ 6 - dm-ioctl.o dm-io.o dm-kcopyd.o dm-sysfs.o dm-stats.o 6 + dm-ioctl.o dm-io.o dm-kcopyd.o dm-sysfs.o dm-stats.o \ 7 + dm-rq.o 7 8 dm-multipath-y += dm-path-selector.o dm-mpath.o 8 9 dm-snapshot-y += dm-snap.o dm-exception-store.o dm-snap-transient.o \ 9 10 dm-snap-persistent.o
+1 -1
drivers/md/dm-builtin.c
··· 1 - #include "dm.h" 1 + #include "dm-core.h" 2 2 3 3 /* 4 4 * The kobject release method must not be placed in the module itself,
+149
drivers/md/dm-core.h
··· 1 + /* 2 + * Internal header file _only_ for device mapper core 3 + * 4 + * Copyright (C) 2016 Red Hat, Inc. All rights reserved. 5 + * 6 + * This file is released under the LGPL. 7 + */ 8 + 9 + #ifndef DM_CORE_INTERNAL_H 10 + #define DM_CORE_INTERNAL_H 11 + 12 + #include <linux/kthread.h> 13 + #include <linux/ktime.h> 14 + #include <linux/blk-mq.h> 15 + 16 + #include <trace/events/block.h> 17 + 18 + #include "dm.h" 19 + 20 + #define DM_RESERVED_MAX_IOS 1024 21 + 22 + struct dm_kobject_holder { 23 + struct kobject kobj; 24 + struct completion completion; 25 + }; 26 + 27 + /* 28 + * DM core internal structure that is used directly by dm.c and dm-rq.c 29 + * DM targets must _not_ dereference a mapped_device to directly access its members! 30 + */ 31 + struct mapped_device { 32 + struct srcu_struct io_barrier; 33 + struct mutex suspend_lock; 34 + 35 + /* 36 + * The current mapping (struct dm_table *). 37 + * Use dm_get_live_table{_fast} or take suspend_lock for 38 + * dereference. 39 + */ 40 + void __rcu *map; 41 + 42 + struct list_head table_devices; 43 + struct mutex table_devices_lock; 44 + 45 + unsigned long flags; 46 + 47 + struct request_queue *queue; 48 + int numa_node_id; 49 + 50 + unsigned type; 51 + /* Protect queue and type against concurrent access. */ 52 + struct mutex type_lock; 53 + 54 + atomic_t holders; 55 + atomic_t open_count; 56 + 57 + struct dm_target *immutable_target; 58 + struct target_type *immutable_target_type; 59 + 60 + struct gendisk *disk; 61 + char name[16]; 62 + 63 + void *interface_ptr; 64 + 65 + /* 66 + * A list of ios that arrived while we were suspended. 67 + */ 68 + atomic_t pending[2]; 69 + wait_queue_head_t wait; 70 + struct work_struct work; 71 + spinlock_t deferred_lock; 72 + struct bio_list deferred; 73 + 74 + /* 75 + * Event handling. 
76 + */ 77 + wait_queue_head_t eventq; 78 + atomic_t event_nr; 79 + atomic_t uevent_seq; 80 + struct list_head uevent_list; 81 + spinlock_t uevent_lock; /* Protect access to uevent_list */ 82 + 83 + /* the number of internal suspends */ 84 + unsigned internal_suspend_count; 85 + 86 + /* 87 + * Processing queue (flush) 88 + */ 89 + struct workqueue_struct *wq; 90 + 91 + /* 92 + * io objects are allocated from here. 93 + */ 94 + mempool_t *io_pool; 95 + mempool_t *rq_pool; 96 + 97 + struct bio_set *bs; 98 + 99 + /* 100 + * freeze/thaw support requires holding onto a super block 101 + */ 102 + struct super_block *frozen_sb; 103 + 104 + /* forced geometry settings */ 105 + struct hd_geometry geometry; 106 + 107 + struct block_device *bdev; 108 + 109 + /* kobject and completion */ 110 + struct dm_kobject_holder kobj_holder; 111 + 112 + /* zero-length flush that will be cloned and submitted to targets */ 113 + struct bio flush_bio; 114 + 115 + struct dm_stats stats; 116 + 117 + struct kthread_worker kworker; 118 + struct task_struct *kworker_task; 119 + 120 + /* for request-based merge heuristic in dm_request_fn() */ 121 + unsigned seq_rq_merge_deadline_usecs; 122 + int last_rq_rw; 123 + sector_t last_rq_pos; 124 + ktime_t last_rq_start_time; 125 + 126 + /* for blk-mq request-based DM support */ 127 + struct blk_mq_tag_set *tag_set; 128 + bool use_blk_mq:1; 129 + bool init_tio_pdu:1; 130 + }; 131 + 132 + void dm_init_md_queue(struct mapped_device *md); 133 + void dm_init_normal_md_queue(struct mapped_device *md); 134 + int md_in_flight(struct mapped_device *md); 135 + void disable_write_same(struct mapped_device *md); 136 + 137 + static inline struct completion *dm_get_completion_from_kobject(struct kobject *kobj) 138 + { 139 + return &container_of(kobj, struct dm_kobject_holder, kobj)->completion; 140 + } 141 + 142 + unsigned __dm_get_module_param(unsigned *module_param, unsigned def, unsigned max); 143 + 144 + static inline bool dm_message_test_buffer_overflow(char 
*result, unsigned maxlen) 145 + { 146 + return !maxlen || strlen(result) + 1 >= maxlen; 147 + } 148 + 149 + #endif
+2 -2
drivers/md/dm-crypt.c
··· 683 683 u8 *data) 684 684 { 685 685 struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw; 686 - u64 sector = cpu_to_le64((u64)dmreq->iv_sector); 686 + __le64 sector = cpu_to_le64(dmreq->iv_sector); 687 687 u8 buf[TCW_WHITENING_SIZE]; 688 688 SHASH_DESC_ON_STACK(desc, tcw->crc32_tfm); 689 689 int i, r; ··· 722 722 struct dm_crypt_request *dmreq) 723 723 { 724 724 struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw; 725 - u64 sector = cpu_to_le64((u64)dmreq->iv_sector); 725 + __le64 sector = cpu_to_le64(dmreq->iv_sector); 726 726 u8 *src; 727 727 int r = 0; 728 728
+1 -1
drivers/md/dm-io.c
··· 5 5 * This file is released under the GPL. 6 6 */ 7 7 8 - #include "dm.h" 8 + #include "dm-core.h" 9 9 10 10 #include <linux/device-mapper.h> 11 11
+17 -14
drivers/md/dm-ioctl.c
··· 5 5 * This file is released under the GPL. 6 6 */ 7 7 8 - #include "dm.h" 8 + #include "dm-core.h" 9 9 10 10 #include <linux/module.h> 11 11 #include <linux/vmalloc.h> ··· 1267 1267 return dm_table_complete(table); 1268 1268 } 1269 1269 1270 + static bool is_valid_type(unsigned cur, unsigned new) 1271 + { 1272 + if (cur == new || 1273 + (cur == DM_TYPE_BIO_BASED && new == DM_TYPE_DAX_BIO_BASED)) 1274 + return true; 1275 + 1276 + return false; 1277 + } 1278 + 1270 1279 static int table_load(struct dm_ioctl *param, size_t param_size) 1271 1280 { 1272 1281 int r; ··· 1318 1309 DMWARN("unable to set up device queue for new table."); 1319 1310 goto err_unlock_md_type; 1320 1311 } 1321 - } else if (dm_get_md_type(md) != dm_table_get_type(t)) { 1312 + } else if (!is_valid_type(dm_get_md_type(md), dm_table_get_type(t))) { 1322 1313 DMWARN("can't change device type after initial table load."); 1323 1314 r = -EINVAL; 1324 1315 goto err_unlock_md_type; ··· 1679 1670 return r; 1680 1671 } 1681 1672 1682 - #define DM_PARAMS_KMALLOC 0x0001 /* Params alloced with kmalloc */ 1683 - #define DM_PARAMS_VMALLOC 0x0002 /* Params alloced with vmalloc */ 1673 + #define DM_PARAMS_MALLOC 0x0001 /* Params allocated with kvmalloc() */ 1684 1674 #define DM_WIPE_BUFFER 0x0010 /* Wipe input buffer before returning from ioctl */ 1685 1675 1686 1676 static void free_params(struct dm_ioctl *param, size_t param_size, int param_flags) ··· 1687 1679 if (param_flags & DM_WIPE_BUFFER) 1688 1680 memset(param, 0, param_size); 1689 1681 1690 - if (param_flags & DM_PARAMS_KMALLOC) 1691 - kfree(param); 1692 - if (param_flags & DM_PARAMS_VMALLOC) 1693 - vfree(param); 1682 + if (param_flags & DM_PARAMS_MALLOC) 1683 + kvfree(param); 1694 1684 } 1695 1685 1696 1686 static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl *param_kernel, ··· 1720 1714 * Use kmalloc() rather than vmalloc() when we can. 
1721 1715 */ 1722 1716 dmi = NULL; 1723 - if (param_kernel->data_size <= KMALLOC_MAX_SIZE) { 1717 + if (param_kernel->data_size <= KMALLOC_MAX_SIZE) 1724 1718 dmi = kmalloc(param_kernel->data_size, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); 1725 - if (dmi) 1726 - *param_flags |= DM_PARAMS_KMALLOC; 1727 - } 1728 1719 1729 1720 if (!dmi) { 1730 1721 unsigned noio_flag; 1731 1722 noio_flag = memalloc_noio_save(); 1732 1723 dmi = __vmalloc(param_kernel->data_size, GFP_NOIO | __GFP_HIGH | __GFP_HIGHMEM, PAGE_KERNEL); 1733 1724 memalloc_noio_restore(noio_flag); 1734 - if (dmi) 1735 - *param_flags |= DM_PARAMS_VMALLOC; 1736 1725 } 1737 1726 1738 1727 if (!dmi) { ··· 1735 1734 return -EFAULT; 1736 1735 return -ENOMEM; 1737 1736 } 1737 + 1738 + *param_flags |= DM_PARAMS_MALLOC; 1738 1739 1739 1740 if (copy_from_user(dmi, user, param_kernel->data_size)) 1740 1741 goto bad;
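The new is_valid_type() helper in the dm-ioctl diff encodes a one-way upgrade rule: after the initial table load, a device's type may stay the same or move from plain bio-based to DAX-capable bio-based, and nothing else. A minimal Python sketch of that rule (the string constants are stand-ins for the kernel's DM_TYPE_* values):

```python
# Sketch of the table-type check added to table_load() in dm-ioctl.c.
# String constants stand in for DM_TYPE_BIO_BASED and friends.
BIO_BASED = "bio-based"
DAX_BIO_BASED = "dax-bio-based"
REQUEST_BASED = "request-based"

def is_valid_type(cur, new):
    """A table type may stay the same or gain DAX support; nothing else."""
    return cur == new or (cur == BIO_BASED and new == DAX_BIO_BASED)

if __name__ == "__main__":
    assert is_valid_type(BIO_BASED, DAX_BIO_BASED)      # upgrade allowed
    assert not is_valid_type(DAX_BIO_BASED, BIO_BASED)  # downgrade rejected
    assert not is_valid_type(REQUEST_BASED, BIO_BASED)  # type change rejected
```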
+1 -1
drivers/md/dm-kcopyd.c
··· 26 26 #include <linux/device-mapper.h> 27 27 #include <linux/dm-kcopyd.h> 28 28 29 - #include "dm.h" 29 + #include "dm-core.h" 30 30 31 31 #define SUB_JOB_SIZE 128 32 32 #define SPLIT_COUNT 8
+20 -1
drivers/md/dm-linear.c
··· 141 141 return fn(ti, lc->dev, lc->start, ti->len, data); 142 142 } 143 143 144 + static long linear_direct_access(struct dm_target *ti, sector_t sector, 145 + void __pmem **kaddr, pfn_t *pfn, long size) 146 + { 147 + struct linear_c *lc = ti->private; 148 + struct block_device *bdev = lc->dev->bdev; 149 + struct blk_dax_ctl dax = { 150 + .sector = linear_map_sector(ti, sector), 151 + .size = size, 152 + }; 153 + long ret; 154 + 155 + ret = bdev_direct_access(bdev, &dax); 156 + *kaddr = dax.addr; 157 + *pfn = dax.pfn; 158 + 159 + return ret; 160 + } 161 + 144 162 static struct target_type linear_target = { 145 163 .name = "linear", 146 - .version = {1, 2, 1}, 164 + .version = {1, 3, 0}, 147 165 .module = THIS_MODULE, 148 166 .ctr = linear_ctr, 149 167 .dtr = linear_dtr, ··· 169 151 .status = linear_status, 170 152 .prepare_ioctl = linear_prepare_ioctl, 171 153 .iterate_devices = linear_iterate_devices, 154 + .direct_access = linear_direct_access, 172 155 }; 173 156 174 157 int __init dm_linear_init(void)
+318 -36
drivers/md/dm-mpath.c
··· 7 7 8 8 #include <linux/device-mapper.h> 9 9 10 - #include "dm.h" 10 + #include "dm-rq.h" 11 + #include "dm-bio-record.h" 11 12 #include "dm-path-selector.h" 12 13 #include "dm-uevent.h" 13 14 ··· 90 89 atomic_t pg_init_in_progress; /* Only one pg_init allowed at once */ 91 90 atomic_t pg_init_count; /* Number of times pg_init called */ 92 91 92 + unsigned queue_mode; 93 + 93 94 /* 94 95 * We must use a mempool of dm_mpath_io structs so that we 95 96 * can resubmit bios on error. ··· 100 97 101 98 struct mutex work_mutex; 102 99 struct work_struct trigger_event; 100 + 101 + struct work_struct process_queued_bios; 102 + struct bio_list queued_bios; 103 103 }; 104 104 105 105 /* 106 - * Context information attached to each bio we process. 106 + * Context information attached to each io we process. 107 107 */ 108 108 struct dm_mpath_io { 109 109 struct pgpath *pgpath; ··· 120 114 static struct workqueue_struct *kmultipathd, *kmpath_handlerd; 121 115 static void trigger_event(struct work_struct *work); 122 116 static void activate_path(struct work_struct *work); 117 + static void process_queued_bios(struct work_struct *work); 123 118 124 119 /*----------------------------------------------- 125 120 * Multipath state flags. 
··· 192 185 kfree(pg); 193 186 } 194 187 195 - static struct multipath *alloc_multipath(struct dm_target *ti, bool use_blk_mq) 188 + static struct multipath *alloc_multipath(struct dm_target *ti) 196 189 { 197 190 struct multipath *m; 198 191 ··· 210 203 mutex_init(&m->work_mutex); 211 204 212 205 m->mpio_pool = NULL; 213 - if (!use_blk_mq) { 214 - unsigned min_ios = dm_get_reserved_rq_based_ios(); 215 - 216 - m->mpio_pool = mempool_create_slab_pool(min_ios, _mpio_cache); 217 - if (!m->mpio_pool) { 218 - kfree(m); 219 - return NULL; 220 - } 221 - } 206 + m->queue_mode = DM_TYPE_NONE; 222 207 223 208 m->ti = ti; 224 209 ti->private = m; 225 210 } 226 211 227 212 return m; 213 + } 214 + 215 + static int alloc_multipath_stage2(struct dm_target *ti, struct multipath *m) 216 + { 217 + if (m->queue_mode == DM_TYPE_NONE) { 218 + /* 219 + * Default to request-based. 220 + */ 221 + if (dm_use_blk_mq(dm_table_get_md(ti->table))) 222 + m->queue_mode = DM_TYPE_MQ_REQUEST_BASED; 223 + else 224 + m->queue_mode = DM_TYPE_REQUEST_BASED; 225 + } 226 + 227 + if (m->queue_mode == DM_TYPE_REQUEST_BASED) { 228 + unsigned min_ios = dm_get_reserved_rq_based_ios(); 229 + 230 + m->mpio_pool = mempool_create_slab_pool(min_ios, _mpio_cache); 231 + if (!m->mpio_pool) 232 + return -ENOMEM; 233 + } 234 + else if (m->queue_mode == DM_TYPE_BIO_BASED) { 235 + INIT_WORK(&m->process_queued_bios, process_queued_bios); 236 + /* 237 + * bio-based doesn't support any direct scsi_dh management; 238 + * it just discovers if a scsi_dh is attached. 
239 + */ 240 + set_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags); 241 + } 242 + 243 + dm_table_set_type(ti->table, m->queue_mode); 244 + 245 + return 0; 228 246 } 229 247 230 248 static void free_multipath(struct multipath *m) ··· 302 270 info->ptr = NULL; 303 271 mempool_free(mpio, m->mpio_pool); 304 272 } 273 + } 274 + 275 + static size_t multipath_per_bio_data_size(void) 276 + { 277 + return sizeof(struct dm_mpath_io) + sizeof(struct dm_bio_details); 278 + } 279 + 280 + static struct dm_mpath_io *get_mpio_from_bio(struct bio *bio) 281 + { 282 + return dm_per_bio_data(bio, multipath_per_bio_data_size()); 283 + } 284 + 285 + static struct dm_bio_details *get_bio_details_from_bio(struct bio *bio) 286 + { 287 + /* dm_bio_details is immediately after the dm_mpath_io in bio's per-bio-data */ 288 + struct dm_mpath_io *mpio = get_mpio_from_bio(bio); 289 + void *bio_details = mpio + 1; 290 + 291 + return bio_details; 292 + } 293 + 294 + static void multipath_init_per_bio_data(struct bio *bio, struct dm_mpath_io **mpio_p, 295 + struct dm_bio_details **bio_details_p) 296 + { 297 + struct dm_mpath_io *mpio = get_mpio_from_bio(bio); 298 + struct dm_bio_details *bio_details = get_bio_details_from_bio(bio); 299 + 300 + memset(mpio, 0, sizeof(*mpio)); 301 + memset(bio_details, 0, sizeof(*bio_details)); 302 + dm_bio_record(bio_details, bio); 303 + 304 + if (mpio_p) 305 + *mpio_p = mpio; 306 + if (bio_details_p) 307 + *bio_details_p = bio_details; 305 308 } 306 309 307 310 /*----------------------------------------------- ··· 498 431 * and multipath_resume() calls and we have no need to check 499 432 * for the DMF_NOFLUSH_SUSPENDING flag. 
500 433 */ 501 - static int must_push_back(struct multipath *m) 434 + static bool __must_push_back(struct multipath *m) 435 + { 436 + return ((test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) != 437 + test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags)) && 438 + dm_noflush_suspending(m->ti)); 439 + } 440 + 441 + static bool must_push_back_rq(struct multipath *m) 502 442 { 503 443 return (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) || 504 - ((test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) != 505 - test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags)) && 506 - dm_noflush_suspending(m->ti))); 444 + __must_push_back(m)); 445 + } 446 + 447 + static bool must_push_back_bio(struct multipath *m) 448 + { 449 + return __must_push_back(m); 507 450 } 508 451 509 452 /* 510 - * Map cloned requests 453 + * Map cloned requests (request-based multipath) 511 454 */ 512 455 static int __multipath_map(struct dm_target *ti, struct request *clone, 513 456 union map_info *map_context, ··· 536 459 pgpath = choose_pgpath(m, nr_bytes); 537 460 538 461 if (!pgpath) { 539 - if (!must_push_back(m)) 462 + if (!must_push_back_rq(m)) 540 463 r = -EIO; /* Failed */ 541 464 return r; 542 465 } else if (test_bit(MPATHF_QUEUE_IO, &m->flags) || ··· 607 530 } 608 531 609 532 /* 533 + * Map cloned bios (bio-based multipath) 534 + */ 535 + static int __multipath_map_bio(struct multipath *m, struct bio *bio, struct dm_mpath_io *mpio) 536 + { 537 + size_t nr_bytes = bio->bi_iter.bi_size; 538 + struct pgpath *pgpath; 539 + unsigned long flags; 540 + bool queue_io; 541 + 542 + /* Do we need to select a new pgpath? 
*/ 543 + pgpath = lockless_dereference(m->current_pgpath); 544 + queue_io = test_bit(MPATHF_QUEUE_IO, &m->flags); 545 + if (!pgpath || !queue_io) 546 + pgpath = choose_pgpath(m, nr_bytes); 547 + 548 + if ((pgpath && queue_io) || 549 + (!pgpath && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))) { 550 + /* Queue for the daemon to resubmit */ 551 + spin_lock_irqsave(&m->lock, flags); 552 + bio_list_add(&m->queued_bios, bio); 553 + spin_unlock_irqrestore(&m->lock, flags); 554 + /* PG_INIT_REQUIRED cannot be set without QUEUE_IO */ 555 + if (queue_io || test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) 556 + pg_init_all_paths(m); 557 + else if (!queue_io) 558 + queue_work(kmultipathd, &m->process_queued_bios); 559 + return DM_MAPIO_SUBMITTED; 560 + } 561 + 562 + if (!pgpath) { 563 + if (!must_push_back_bio(m)) 564 + return -EIO; 565 + return DM_MAPIO_REQUEUE; 566 + } 567 + 568 + mpio->pgpath = pgpath; 569 + mpio->nr_bytes = nr_bytes; 570 + 571 + bio->bi_error = 0; 572 + bio->bi_bdev = pgpath->path.dev->bdev; 573 + bio->bi_rw |= REQ_FAILFAST_TRANSPORT; 574 + 575 + if (pgpath->pg->ps.type->start_io) 576 + pgpath->pg->ps.type->start_io(&pgpath->pg->ps, 577 + &pgpath->path, 578 + nr_bytes); 579 + return DM_MAPIO_REMAPPED; 580 + } 581 + 582 + static int multipath_map_bio(struct dm_target *ti, struct bio *bio) 583 + { 584 + struct multipath *m = ti->private; 585 + struct dm_mpath_io *mpio = NULL; 586 + 587 + multipath_init_per_bio_data(bio, &mpio, NULL); 588 + 589 + return __multipath_map_bio(m, bio, mpio); 590 + } 591 + 592 + static void process_queued_bios_list(struct multipath *m) 593 + { 594 + if (m->queue_mode == DM_TYPE_BIO_BASED) 595 + queue_work(kmultipathd, &m->process_queued_bios); 596 + } 597 + 598 + static void process_queued_bios(struct work_struct *work) 599 + { 600 + int r; 601 + unsigned long flags; 602 + struct bio *bio; 603 + struct bio_list bios; 604 + struct blk_plug plug; 605 + struct multipath *m = 606 + container_of(work, struct multipath, 
process_queued_bios); 607 + 608 + bio_list_init(&bios); 609 + 610 + spin_lock_irqsave(&m->lock, flags); 611 + 612 + if (bio_list_empty(&m->queued_bios)) { 613 + spin_unlock_irqrestore(&m->lock, flags); 614 + return; 615 + } 616 + 617 + bio_list_merge(&bios, &m->queued_bios); 618 + bio_list_init(&m->queued_bios); 619 + 620 + spin_unlock_irqrestore(&m->lock, flags); 621 + 622 + blk_start_plug(&plug); 623 + while ((bio = bio_list_pop(&bios))) { 624 + r = __multipath_map_bio(m, bio, get_mpio_from_bio(bio)); 625 + if (r < 0 || r == DM_MAPIO_REQUEUE) { 626 + bio->bi_error = r; 627 + bio_endio(bio); 628 + } else if (r == DM_MAPIO_REMAPPED) 629 + generic_make_request(bio); 630 + } 631 + blk_finish_plug(&plug); 632 + } 633 + 634 + /* 610 635 * If we run out of usable paths, should we queue I/O or error it? 611 636 */ 612 637 static int queue_if_no_path(struct multipath *m, bool queue_if_no_path, ··· 736 557 737 558 spin_unlock_irqrestore(&m->lock, flags); 738 559 739 - if (!queue_if_no_path) 560 + if (!queue_if_no_path) { 740 561 dm_table_run_md_queue_async(m->ti->table); 562 + process_queued_bios_list(m); 563 + } 741 564 742 565 return 0; 743 566 } ··· 979 798 if (!hw_argc) 980 799 return 0; 981 800 801 + if (m->queue_mode == DM_TYPE_BIO_BASED) { 802 + dm_consume_args(as, hw_argc); 803 + DMERR("bio-based multipath doesn't allow hardware handler args"); 804 + return 0; 805 + } 806 + 982 807 m->hw_handler_name = kstrdup(dm_shift_arg(as), GFP_KERNEL); 983 808 984 809 if (hw_argc > 1) { ··· 1020 833 const char *arg_name; 1021 834 1022 835 static struct dm_arg _args[] = { 1023 - {0, 6, "invalid number of feature args"}, 836 + {0, 8, "invalid number of feature args"}, 1024 837 {1, 50, "pg_init_retries must be between 1 and 50"}, 1025 838 {0, 60000, "pg_init_delay_msecs must be between 0 and 60000"}, 1026 839 }; ··· 1060 873 continue; 1061 874 } 1062 875 876 + if (!strcasecmp(arg_name, "queue_mode") && 877 + (argc >= 1)) { 878 + const char *queue_mode_name = dm_shift_arg(as); 879 
+ 880 + if (!strcasecmp(queue_mode_name, "bio")) 881 + m->queue_mode = DM_TYPE_BIO_BASED; 882 + else if (!strcasecmp(queue_mode_name, "rq")) 883 + m->queue_mode = DM_TYPE_REQUEST_BASED; 884 + else if (!strcasecmp(queue_mode_name, "mq")) 885 + m->queue_mode = DM_TYPE_MQ_REQUEST_BASED; 886 + else { 887 + ti->error = "Unknown 'queue_mode' requested"; 888 + r = -EINVAL; 889 + } 890 + argc--; 891 + continue; 892 + } 893 + 1063 894 ti->error = "Unrecognised multipath feature request"; 1064 895 r = -EINVAL; 1065 896 } while (argc && !r); ··· 1085 880 return r; 1086 881 } 1087 882 1088 - static int multipath_ctr(struct dm_target *ti, unsigned int argc, 1089 - char **argv) 883 + static int multipath_ctr(struct dm_target *ti, unsigned argc, char **argv) 1090 884 { 1091 885 /* target arguments */ 1092 886 static struct dm_arg _args[] = { ··· 1098 894 struct dm_arg_set as; 1099 895 unsigned pg_count = 0; 1100 896 unsigned next_pg_num; 1101 - bool use_blk_mq = dm_use_blk_mq(dm_table_get_md(ti->table)); 1102 897 1103 898 as.argc = argc; 1104 899 as.argv = argv; 1105 900 1106 - m = alloc_multipath(ti, use_blk_mq); 901 + m = alloc_multipath(ti); 1107 902 if (!m) { 1108 903 ti->error = "can't allocate multipath"; 1109 904 return -EINVAL; 1110 905 } 1111 906 1112 907 r = parse_features(&as, m); 908 + if (r) 909 + goto bad; 910 + 911 + r = alloc_multipath_stage2(ti, m); 1113 912 if (r) 1114 913 goto bad; 1115 914 ··· 1165 958 ti->num_flush_bios = 1; 1166 959 ti->num_discard_bios = 1; 1167 960 ti->num_write_same_bios = 1; 1168 - if (use_blk_mq) 961 + if (m->queue_mode == DM_TYPE_BIO_BASED) 962 + ti->per_io_data_size = multipath_per_bio_data_size(); 963 + else if (m->queue_mode == DM_TYPE_MQ_REQUEST_BASED) 1169 964 ti->per_io_data_size = sizeof(struct dm_mpath_io); 1170 965 1171 966 return 0; ··· 1292 1083 1293 1084 out: 1294 1085 spin_unlock_irqrestore(&m->lock, flags); 1295 - if (run_queue) 1086 + if (run_queue) { 1296 1087 dm_table_run_md_queue_async(m->ti->table); 1088 + 
+		process_queued_bios_list(m);
	}

	return r;
}
···
	}
	clear_bit(MPATHF_QUEUE_IO, &m->flags);

+	process_queued_bios_list(m);
+
	/*
	 * Wake up any thread waiting to suspend.
	 */
···
	 * during end I/O handling, since those clone requests don't have
	 * bio clones.  If we queue them inside the multipath target,
	 * we need to make bio clones, that requires memory allocation.
-	 * (See drivers/md/dm.c:end_clone_bio() about why the clone requests
+	 * (See drivers/md/dm-rq.c:end_clone_bio() about why the clone requests
	 * don't have bio clones.)
	 * Instead of queueing the clone request here, we queue the original
	 * request into dm core, which will remake a clone request and
···
	if (!atomic_read(&m->nr_valid_paths)) {
		if (!test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
-			if (!must_push_back(m))
+			if (!must_push_back_rq(m))
				r = -EIO;
		} else {
			if (error == -EBADE)
···
			ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes);
	}
	clear_request_fn_mpio(m, map_context);
+
+	return r;
+}
+
+static int do_end_io_bio(struct multipath *m, struct bio *clone,
+			 int error, struct dm_mpath_io *mpio)
+{
+	unsigned long flags;
+
+	if (!error)
+		return 0;	/* I/O complete */
+
+	if (noretry_error(error))
+		return error;
+
+	if (mpio->pgpath)
+		fail_path(mpio->pgpath);
+
+	if (!atomic_read(&m->nr_valid_paths)) {
+		if (!test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
+			if (!must_push_back_bio(m))
+				return -EIO;
+			return DM_ENDIO_REQUEUE;
+		} else {
+			if (error == -EBADE)
+				return error;
+		}
+	}
+
+	/* Queue for the daemon to resubmit */
+	dm_bio_restore(get_bio_details_from_bio(clone), clone);
+
+	spin_lock_irqsave(&m->lock, flags);
+	bio_list_add(&m->queued_bios, clone);
+	spin_unlock_irqrestore(&m->lock, flags);
+	if (!test_bit(MPATHF_QUEUE_IO, &m->flags))
+		queue_work(kmultipathd, &m->process_queued_bios);
+
+	return DM_ENDIO_INCOMPLETE;
+}
+
+static int multipath_end_io_bio(struct dm_target *ti, struct bio *clone, int error)
+{
+	struct multipath *m = ti->private;
+	struct dm_mpath_io *mpio = get_mpio_from_bio(clone);
+	struct pgpath *pgpath;
+	struct path_selector *ps;
+	int r;
+
+	BUG_ON(!mpio);
+
+	r = do_end_io_bio(m, clone, error, mpio);
+	pgpath = mpio->pgpath;
+	if (pgpath) {
+		ps = &pgpath->pg->ps;
+		if (ps->type->end_io)
+			ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes);
+	}

	return r;
}
···
		DMEMIT("%u ", test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) +
			      (m->pg_init_retries > 0) * 2 +
			      (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) * 2 +
-			      test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags));
+			      test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags) +
+			      (m->queue_mode != DM_TYPE_REQUEST_BASED) * 2);
+
		if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
			DMEMIT("queue_if_no_path ");
		if (m->pg_init_retries)
···
			DMEMIT("pg_init_delay_msecs %u ", m->pg_init_delay_msecs);
		if (test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags))
			DMEMIT("retain_attached_hw_handler ");
+		if (m->queue_mode != DM_TYPE_REQUEST_BASED) {
+			switch(m->queue_mode) {
+			case DM_TYPE_BIO_BASED:
+				DMEMIT("queue_mode bio ");
+				break;
+			case DM_TYPE_MQ_REQUEST_BASED:
+				DMEMIT("queue_mode mq ");
+				break;
+			}
+		}
	}

	if (!m->hw_handler_name || type == STATUSTYPE_INFO)
···
	if (test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags))
		pg_init_all_paths(m);
	dm_table_run_md_queue_async(m->ti->table);
+	process_queued_bios_list(m);
}

/*
···
 *---------------------------------------------------------------*/
static struct target_type multipath_target = {
	.name = "multipath",
-	.version = {1, 11, 0},
+	.version = {1, 12, 0},
	.features = DM_TARGET_SINGLETON | DM_TARGET_IMMUTABLE,
	.module = THIS_MODULE,
	.ctr = multipath_ctr,
···
	.clone_and_map_rq = multipath_clone_and_map,
	.release_clone_rq = multipath_release_clone,
	.rq_end_io = multipath_end_io,
+	.map = multipath_map_bio,
+	.end_io = multipath_end_io_bio,
	.presuspend = multipath_presuspend,
	.postsuspend = multipath_postsuspend,
	.resume = multipath_resume,
···
{
	int r;

-	/* allocate a slab for the dm_ios */
+	/* allocate a slab for the dm_mpath_ios */
	_mpio_cache = KMEM_CACHE(dm_mpath_io, 0);
	if (!_mpio_cache)
		return -ENOMEM;

	r = dm_register_target(&multipath_target);
	if (r < 0) {
-		DMERR("register failed %d", r);
+		DMERR("request-based register failed %d", r);
		r = -EINVAL;
		goto bad_register_target;
	}
···
		r = -ENOMEM;
		goto bad_alloc_kmpath_handlerd;
	}
-
-	DMINFO("version %u.%u.%u loaded",
-	       multipath_target.version[0], multipath_target.version[1],
-	       multipath_target.version[2]);

	return 0;
drivers/md/dm-raid.c (+2505, -592)
···
/*
 * Copyright (C) 2010-2011 Neil Brown
- * Copyright (C) 2010-2015 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2010-2016 Red Hat, Inc. All rights reserved.
 *
 * This file is released under the GPL.
 */
···
#include <linux/device-mapper.h>

#define DM_MSG_PREFIX "raid"
-#define MAX_RAID_DEVICES	253 /* raid4/5/6 limit */
+#define MAX_RAID_DEVICES	253 /* md-raid kernel limit */
+
+/*
+ * Minimum sectors of free reshape space per raid device
+ */
+#define MIN_FREE_RESHAPE_SPACE to_sector(4*4096)

static bool devices_handle_discard_safely = false;
···
 * The following flags are used by dm-raid.c to set up the array state.
 * They must be cleared before md_run is called.
 */
#define FirstUse 10		/* rdev flag */

struct raid_dev {
	/*
	 * Two DM devices, one to hold metadata and one to hold the
	 * actual data/parity.  The reason for this is to not confuse
	 * ti->len and give more flexibility in altering size and
	 * characteristics.
	 *
···
};

/*
+ * Bits for establishing rs->ctr_flags
+ *
+ * 1 = no flag value
+ * 2 = flag with value
+ */
+#define __CTR_FLAG_SYNC			0  /* 1 */ /* Not with raid0! */
+#define __CTR_FLAG_NOSYNC		1  /* 1 */ /* Not with raid0! */
+#define __CTR_FLAG_REBUILD		2  /* 2 */ /* Not with raid0! */
+#define __CTR_FLAG_DAEMON_SLEEP		3  /* 2 */ /* Not with raid0! */
+#define __CTR_FLAG_MIN_RECOVERY_RATE	4  /* 2 */ /* Not with raid0! */
+#define __CTR_FLAG_MAX_RECOVERY_RATE	5  /* 2 */ /* Not with raid0! */
+#define __CTR_FLAG_MAX_WRITE_BEHIND	6  /* 2 */ /* Only with raid1! */
+#define __CTR_FLAG_WRITE_MOSTLY		7  /* 2 */ /* Only with raid1! */
+#define __CTR_FLAG_STRIPE_CACHE		8  /* 2 */ /* Only with raid4/5/6! */
+#define __CTR_FLAG_REGION_SIZE		9  /* 2 */ /* Not with raid0! */
+#define __CTR_FLAG_RAID10_COPIES	10 /* 2 */ /* Only with raid10 */
+#define __CTR_FLAG_RAID10_FORMAT	11 /* 2 */ /* Only with raid10 */
+/* New for v1.9.0 */
+#define __CTR_FLAG_DELTA_DISKS		12 /* 2 */ /* Only with reshapable raid1/4/5/6/10! */
+#define __CTR_FLAG_DATA_OFFSET		13 /* 2 */ /* Only with reshapable raid4/5/6/10! */
+#define __CTR_FLAG_RAID10_USE_NEAR_SETS 14 /* 2 */ /* Only with raid10! */
+
+/*
 * Flags for rs->ctr_flags field.
 */
-#define CTR_FLAG_SYNC              0x1
-#define CTR_FLAG_NOSYNC            0x2
-#define CTR_FLAG_REBUILD           0x4
-#define CTR_FLAG_DAEMON_SLEEP      0x8
-#define CTR_FLAG_MIN_RECOVERY_RATE 0x10
-#define CTR_FLAG_MAX_RECOVERY_RATE 0x20
-#define CTR_FLAG_MAX_WRITE_BEHIND  0x40
-#define CTR_FLAG_STRIPE_CACHE      0x80
-#define CTR_FLAG_REGION_SIZE       0x100
-#define CTR_FLAG_RAID10_COPIES     0x200
-#define CTR_FLAG_RAID10_FORMAT     0x400
+#define CTR_FLAG_SYNC			(1 << __CTR_FLAG_SYNC)
+#define CTR_FLAG_NOSYNC			(1 << __CTR_FLAG_NOSYNC)
+#define CTR_FLAG_REBUILD		(1 << __CTR_FLAG_REBUILD)
+#define CTR_FLAG_DAEMON_SLEEP		(1 << __CTR_FLAG_DAEMON_SLEEP)
+#define CTR_FLAG_MIN_RECOVERY_RATE	(1 << __CTR_FLAG_MIN_RECOVERY_RATE)
+#define CTR_FLAG_MAX_RECOVERY_RATE	(1 << __CTR_FLAG_MAX_RECOVERY_RATE)
+#define CTR_FLAG_MAX_WRITE_BEHIND	(1 << __CTR_FLAG_MAX_WRITE_BEHIND)
+#define CTR_FLAG_WRITE_MOSTLY		(1 << __CTR_FLAG_WRITE_MOSTLY)
+#define CTR_FLAG_STRIPE_CACHE		(1 << __CTR_FLAG_STRIPE_CACHE)
+#define CTR_FLAG_REGION_SIZE		(1 << __CTR_FLAG_REGION_SIZE)
+#define CTR_FLAG_RAID10_COPIES		(1 << __CTR_FLAG_RAID10_COPIES)
+#define CTR_FLAG_RAID10_FORMAT		(1 << __CTR_FLAG_RAID10_FORMAT)
+#define CTR_FLAG_DELTA_DISKS		(1 << __CTR_FLAG_DELTA_DISKS)
+#define CTR_FLAG_DATA_OFFSET		(1 << __CTR_FLAG_DATA_OFFSET)
+#define CTR_FLAG_RAID10_USE_NEAR_SETS	(1 << __CTR_FLAG_RAID10_USE_NEAR_SETS)
+
+/*
+ * Definitions of various constructor flags to
+ * be used in checks of valid / invalid flags
+ * per raid level.
+ */
+/* Define all any sync flags */
+#define CTR_FLAGS_ANY_SYNC		(CTR_FLAG_SYNC | CTR_FLAG_NOSYNC)
+
+/* Define flags for options without argument (e.g. 'nosync') */
+#define CTR_FLAG_OPTIONS_NO_ARGS	(CTR_FLAGS_ANY_SYNC | \
+					 CTR_FLAG_RAID10_USE_NEAR_SETS)
+
+/* Define flags for options with one argument (e.g. 'delta_disks +2') */
+#define CTR_FLAG_OPTIONS_ONE_ARG (CTR_FLAG_REBUILD | \
+				  CTR_FLAG_WRITE_MOSTLY | \
+				  CTR_FLAG_DAEMON_SLEEP | \
+				  CTR_FLAG_MIN_RECOVERY_RATE | \
+				  CTR_FLAG_MAX_RECOVERY_RATE | \
+				  CTR_FLAG_MAX_WRITE_BEHIND | \
+				  CTR_FLAG_STRIPE_CACHE | \
+				  CTR_FLAG_REGION_SIZE | \
+				  CTR_FLAG_RAID10_COPIES | \
+				  CTR_FLAG_RAID10_FORMAT | \
+				  CTR_FLAG_DELTA_DISKS | \
+				  CTR_FLAG_DATA_OFFSET)
+
+/* Valid options definitions per raid level... */
+
+/* "raid0" does only accept data offset */
+#define RAID0_VALID_FLAGS	(CTR_FLAG_DATA_OFFSET)
+
+/* "raid1" does not accept stripe cache, data offset, delta_disks or any raid10 options */
+#define RAID1_VALID_FLAGS	(CTR_FLAGS_ANY_SYNC | \
+				 CTR_FLAG_REBUILD | \
+				 CTR_FLAG_WRITE_MOSTLY | \
+				 CTR_FLAG_DAEMON_SLEEP | \
+				 CTR_FLAG_MIN_RECOVERY_RATE | \
+				 CTR_FLAG_MAX_RECOVERY_RATE | \
+				 CTR_FLAG_MAX_WRITE_BEHIND | \
+				 CTR_FLAG_REGION_SIZE | \
+				 CTR_FLAG_DELTA_DISKS | \
+				 CTR_FLAG_DATA_OFFSET)
+
+/* "raid10" does not accept any raid1 or stripe cache options */
+#define RAID10_VALID_FLAGS	(CTR_FLAGS_ANY_SYNC | \
+				 CTR_FLAG_REBUILD | \
+				 CTR_FLAG_DAEMON_SLEEP | \
+				 CTR_FLAG_MIN_RECOVERY_RATE | \
+				 CTR_FLAG_MAX_RECOVERY_RATE | \
+				 CTR_FLAG_REGION_SIZE | \
+				 CTR_FLAG_RAID10_COPIES | \
+				 CTR_FLAG_RAID10_FORMAT | \
+				 CTR_FLAG_DELTA_DISKS | \
+				 CTR_FLAG_DATA_OFFSET | \
+				 CTR_FLAG_RAID10_USE_NEAR_SETS)
+
+/*
+ * "raid4/5/6" do not accept any raid1 or raid10 specific options
+ *
+ * "raid6" does not accept "nosync", because it is not guaranteed
+ * that both parity and q-syndrome are being written properly with
+ * any writes
+ */
+#define RAID45_VALID_FLAGS	(CTR_FLAGS_ANY_SYNC | \
+				 CTR_FLAG_REBUILD | \
+				 CTR_FLAG_DAEMON_SLEEP | \
+				 CTR_FLAG_MIN_RECOVERY_RATE | \
+				 CTR_FLAG_MAX_RECOVERY_RATE | \
+				 CTR_FLAG_MAX_WRITE_BEHIND | \
+				 CTR_FLAG_STRIPE_CACHE | \
+				 CTR_FLAG_REGION_SIZE | \
+				 CTR_FLAG_DELTA_DISKS | \
+				 CTR_FLAG_DATA_OFFSET)
+
+#define RAID6_VALID_FLAGS	(CTR_FLAG_SYNC | \
+				 CTR_FLAG_REBUILD | \
+				 CTR_FLAG_DAEMON_SLEEP | \
+				 CTR_FLAG_MIN_RECOVERY_RATE | \
+				 CTR_FLAG_MAX_RECOVERY_RATE | \
+				 CTR_FLAG_MAX_WRITE_BEHIND | \
+				 CTR_FLAG_STRIPE_CACHE | \
+				 CTR_FLAG_REGION_SIZE | \
+				 CTR_FLAG_DELTA_DISKS | \
+				 CTR_FLAG_DATA_OFFSET)
+/* ...valid options definitions per raid level */
+
+/*
+ * Flags for rs->runtime_flags field
+ * (RT_FLAG prefix meaning "runtime flag")
+ *
+ * These are all internal and used to define runtime state,
+ * e.g. to prevent another resume from preresume processing
+ * the raid set all over again.
+ */
+#define RT_FLAG_RS_PRERESUMED		0
+#define RT_FLAG_RS_RESUMED		1
+#define RT_FLAG_RS_BITMAP_LOADED	2
+#define RT_FLAG_UPDATE_SBS		3
+#define RT_FLAG_RESHAPE_RS		4
+#define RT_FLAG_KEEP_RS_FROZEN		5
+
+/* Array elements of 64 bit needed for rebuild/failed disk bits */
+#define DISKS_ARRAY_ELEMS ((MAX_RAID_DEVICES + (sizeof(uint64_t) * 8 - 1)) / sizeof(uint64_t) / 8)
+
+/*
+ * raid set level, layout and chunk sectors backup/restore
+ */
+struct rs_layout {
+	int new_level;
+	int new_layout;
+	int new_chunk_sectors;
+};

struct raid_set {
	struct dm_target *ti;

	uint32_t bitmap_loaded;
-	uint32_t ctr_flags;
+	uint32_t stripe_cache_entries;
+	unsigned long ctr_flags;
+	unsigned long runtime_flags;
+
+	uint64_t rebuild_disks[DISKS_ARRAY_ELEMS];
+
+	int raid_disks;
+	int delta_disks;
+	int data_offset;
+	int raid10_copies;
+	int requested_bitmap_chunk_sectors;

	struct mddev md;
	struct raid_type *raid_type;
···
	struct raid_dev dev[0];
};

+static void rs_config_backup(struct raid_set *rs, struct rs_layout *l)
+{
+	struct mddev *mddev = &rs->md;
+
+	l->new_level = mddev->new_level;
+	l->new_layout = mddev->new_layout;
+	l->new_chunk_sectors = mddev->new_chunk_sectors;
+}
+
+static void rs_config_restore(struct raid_set *rs, struct rs_layout *l)
+{
+	struct mddev *mddev = &rs->md;
+
+	mddev->new_level = l->new_level;
+	mddev->new_layout = l->new_layout;
+	mddev->new_chunk_sectors = l->new_chunk_sectors;
+}
+
+/* raid10 algorithms (i.e. formats) */
+#define ALGORITHM_RAID10_DEFAULT	0
+#define ALGORITHM_RAID10_NEAR		1
+#define ALGORITHM_RAID10_OFFSET		2
+#define ALGORITHM_RAID10_FAR		3
+
/* Supported raid types and properties. */
static struct raid_type {
	const char *name;		/* RAID algorithm. */
	const char *descr;		/* Descriptor text for logging. */
-	const unsigned parity_devs;	/* # of parity devices. */
-	const unsigned minimal_devs;	/* minimal # of devices in set. */
-	const unsigned level;		/* RAID level. */
-	const unsigned algorithm;	/* RAID algorithm. */
+	const unsigned int parity_devs; /* # of parity devices. */
+	const unsigned int minimal_devs;/* minimal # of devices in set. */
+	const unsigned int level;	/* RAID level. */
+	const unsigned int algorithm;	/* RAID algorithm. */
} raid_types[] = {
-	{"raid0",    "RAID0 (striping)",              0, 2, 0, 0 /* NONE */},
-	{"raid1",    "RAID1 (mirroring)",             0, 2, 1, 0 /* NONE */},
-	{"raid10",   "RAID10 (striped mirrors)",      0, 2, 10, UINT_MAX /* Varies */},
-	{"raid4",    "RAID4 (dedicated parity disk)", 1, 2, 5, ALGORITHM_PARITY_0},
-	{"raid5_la", "RAID5 (left asymmetric)",       1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC},
-	{"raid5_ra", "RAID5 (right asymmetric)",      1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC},
-	{"raid5_ls", "RAID5 (left symmetric)",        1, 2, 5, ALGORITHM_LEFT_SYMMETRIC},
-	{"raid5_rs", "RAID5 (right symmetric)",       1, 2, 5, ALGORITHM_RIGHT_SYMMETRIC},
-	{"raid6_zr", "RAID6 (zero restart)",          2, 4, 6, ALGORITHM_ROTATING_ZERO_RESTART},
-	{"raid6_nr", "RAID6 (N restart)",             2, 4, 6, ALGORITHM_ROTATING_N_RESTART},
-	{"raid6_nc", "RAID6 (N continue)",            2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE}
+	{"raid0",	  "raid0 (striping)",			    0, 2, 0,  0 /* NONE */},
+	{"raid1",	  "raid1 (mirroring)",			    0, 2, 1,  0 /* NONE */},
+	{"raid10_far",	  "raid10 far (striped mirrors)",	    0, 2, 10, ALGORITHM_RAID10_FAR},
+	{"raid10_offset", "raid10 offset (striped mirrors)",	    0, 2, 10, ALGORITHM_RAID10_OFFSET},
+	{"raid10_near",	  "raid10 near (striped mirrors)",	    0, 2, 10, ALGORITHM_RAID10_NEAR},
+	{"raid10",	  "raid10 (striped mirrors)",		    0, 2, 10, ALGORITHM_RAID10_DEFAULT},
+	{"raid4",	  "raid4 (dedicated last parity disk)",	    1, 2, 4,  ALGORITHM_PARITY_N}, /* raid4 layout = raid5_n */
+	{"raid5_n",	  "raid5 (dedicated last parity disk)",	    1, 2, 5,  ALGORITHM_PARITY_N},
+	{"raid5_ls",	  "raid5 (left symmetric)",		    1, 2, 5,  ALGORITHM_LEFT_SYMMETRIC},
+	{"raid5_rs",	  "raid5 (right symmetric)",		    1, 2, 5,  ALGORITHM_RIGHT_SYMMETRIC},
+	{"raid5_la",	  "raid5 (left asymmetric)",		    1, 2, 5,  ALGORITHM_LEFT_ASYMMETRIC},
+	{"raid5_ra",	  "raid5 (right asymmetric)",		    1, 2, 5,  ALGORITHM_RIGHT_ASYMMETRIC},
+	{"raid6_zr",	  "raid6 (zero restart)",		    2, 4, 6,  ALGORITHM_ROTATING_ZERO_RESTART},
+	{"raid6_nr",	  "raid6 (N restart)",			    2, 4, 6,  ALGORITHM_ROTATING_N_RESTART},
+	{"raid6_nc",	  "raid6 (N continue)",			    2, 4, 6,  ALGORITHM_ROTATING_N_CONTINUE},
+	{"raid6_n_6",	  "raid6 (dedicated parity/Q n/6)",	    2, 4, 6,  ALGORITHM_PARITY_N_6},
+	{"raid6_ls_6",	  "raid6 (left symmetric dedicated Q 6)",   2, 4, 6,  ALGORITHM_LEFT_SYMMETRIC_6},
+	{"raid6_rs_6",	  "raid6 (right symmetric dedicated Q 6)",  2, 4, 6,  ALGORITHM_RIGHT_SYMMETRIC_6},
+	{"raid6_la_6",	  "raid6 (left asymmetric dedicated Q 6)",  2, 4, 6,  ALGORITHM_LEFT_ASYMMETRIC_6},
+	{"raid6_ra_6",	  "raid6 (right asymmetric dedicated Q 6)", 2, 4, 6,  ALGORITHM_RIGHT_ASYMMETRIC_6}
};

-static char *raid10_md_layout_to_format(int layout)
+/* True, if @v is in inclusive range [@min, @max] */
+static bool __within_range(long v, long min, long max)
{
-	/*
-	 * Bit 16 and 17 stand for "offset" and "use_far_sets"
-	 * Refer to MD's raid10.c for details
-	 */
-	if ((layout & 0x10000) && (layout & 0x20000))
-		return "offset";
-
-	if ((layout & 0xFF) > 1)
-		return "near";
-
-	return "far";
+	return v >= min && v <= max;
}

-static unsigned raid10_md_layout_to_copies(int layout)
+/* All table line arguments are defined here */
+static struct arg_name_flag {
+	const unsigned long flag;
+	const char *name;
+} __arg_name_flags[] = {
+	{ CTR_FLAG_SYNC, "sync"},
+	{ CTR_FLAG_NOSYNC, "nosync"},
+	{ CTR_FLAG_REBUILD, "rebuild"},
+	{ CTR_FLAG_DAEMON_SLEEP, "daemon_sleep"},
+	{ CTR_FLAG_MIN_RECOVERY_RATE, "min_recovery_rate"},
+	{ CTR_FLAG_MAX_RECOVERY_RATE, "max_recovery_rate"},
+	{ CTR_FLAG_MAX_WRITE_BEHIND, "max_write_behind"},
+	{ CTR_FLAG_WRITE_MOSTLY, "write_mostly"},
+	{ CTR_FLAG_STRIPE_CACHE, "stripe_cache"},
+	{ CTR_FLAG_REGION_SIZE, "region_size"},
+	{ CTR_FLAG_RAID10_COPIES, "raid10_copies"},
+	{ CTR_FLAG_RAID10_FORMAT, "raid10_format"},
+	{ CTR_FLAG_DATA_OFFSET, "data_offset"},
+	{ CTR_FLAG_DELTA_DISKS, "delta_disks"},
+	{ CTR_FLAG_RAID10_USE_NEAR_SETS, "raid10_use_near_sets"},
+};
+
+/* Return argument name string for given @flag */
+static const char *dm_raid_arg_name_by_flag(const uint32_t flag)
{
-	if ((layout & 0xFF) > 1)
-		return layout & 0xFF;
-	return (layout >> 8) & 0xFF;
-}
+	if (hweight32(flag) == 1) {
+		struct arg_name_flag *anf = __arg_name_flags + ARRAY_SIZE(__arg_name_flags);

-static int raid10_format_to_md_layout(char *format, unsigned copies)
-{
-	unsigned n = 1, f = 1;
+		while (anf-- > __arg_name_flags)
+			if (flag & anf->flag)
+				return anf->name;

-	if (!strcasecmp("near", format))
-		n = copies;
-	else
-		f = copies;
-
-	if (!strcasecmp("offset", format))
-		return 0x30000 | (f << 8) | n;
-
-	if (!strcasecmp("far", format))
-		return 0x20000 | (f << 8) | n;
-
-	return (f << 8) | n;
-}
-
-static struct raid_type *get_raid_type(char *name)
-{
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(raid_types); i++)
-		if (!strcmp(raid_types[i].name, name))
-			return &raid_types[i];
+	} else
+		DMERR("%s called with more than one flag!", __func__);

	return NULL;
}

-static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *raid_type, unsigned raid_devs)
+/*
+ * Bool helpers to test for various raid levels of a raid set.
+ * It's level as reported by the superblock rather than
+ * the requested raid_type passed to the constructor.
+ */
+/* Return true, if raid set in @rs is raid0 */
+static bool rs_is_raid0(struct raid_set *rs)
{
-	unsigned i;
+	return !rs->md.level;
+}
+
+/* Return true, if raid set in @rs is raid1 */
+static bool rs_is_raid1(struct raid_set *rs)
+{
+	return rs->md.level == 1;
+}
+
+/* Return true, if raid set in @rs is raid10 */
+static bool rs_is_raid10(struct raid_set *rs)
+{
+	return rs->md.level == 10;
+}
+
+/* Return true, if raid set in @rs is level 6 */
+static bool rs_is_raid6(struct raid_set *rs)
+{
+	return rs->md.level == 6;
+}
+
+/* Return true, if raid set in @rs is level 4, 5 or 6 */
+static bool rs_is_raid456(struct raid_set *rs)
+{
+	return __within_range(rs->md.level, 4, 6);
+}
+
+/* Return true, if raid set in @rs is reshapable */
+static bool __is_raid10_far(int layout);
+static bool rs_is_reshapable(struct raid_set *rs)
+{
+	return rs_is_raid456(rs) ||
+	       (rs_is_raid10(rs) && !__is_raid10_far(rs->md.new_layout));
+}
+
+/* Return true, if raid set in @rs is recovering */
+static bool rs_is_recovering(struct raid_set *rs)
+{
+	return rs->md.recovery_cp < rs->dev[0].rdev.sectors;
+}
+
+/* Return true, if raid set in @rs is reshaping */
+static bool rs_is_reshaping(struct raid_set *rs)
+{
+	return rs->md.reshape_position != MaxSector;
+}
+
+/*
+ * bool helpers to test for various raid levels of a raid type @rt
+ */
+
+/* Return true, if raid type in @rt is raid0 */
+static bool rt_is_raid0(struct raid_type *rt)
+{
+	return !rt->level;
+}
+
+/* Return true, if raid type in @rt is raid1 */
+static bool rt_is_raid1(struct raid_type *rt)
+{
+	return rt->level == 1;
+}
+
+/* Return true, if raid type in @rt is raid10 */
+static bool rt_is_raid10(struct raid_type *rt)
+{
+	return rt->level == 10;
+}
+
+/* Return true, if raid type in @rt is raid4/5 */
+static bool rt_is_raid45(struct raid_type *rt)
+{
+	return __within_range(rt->level, 4, 5);
+}
+
+/* Return true, if raid type in @rt is raid6 */
+static bool rt_is_raid6(struct raid_type *rt)
+{
+	return rt->level == 6;
+}
+
+/* Return true, if raid type in @rt is raid4/5/6 */
+static bool rt_is_raid456(struct raid_type *rt)
+{
+	return __within_range(rt->level, 4, 6);
+}
+/* END: raid level bools */
+
+/* Return valid ctr flags for the raid level of @rs */
+static unsigned long __valid_flags(struct raid_set *rs)
+{
+	if (rt_is_raid0(rs->raid_type))
+		return RAID0_VALID_FLAGS;
+	else if (rt_is_raid1(rs->raid_type))
+		return RAID1_VALID_FLAGS;
+	else if (rt_is_raid10(rs->raid_type))
+		return RAID10_VALID_FLAGS;
+	else if (rt_is_raid45(rs->raid_type))
+		return RAID45_VALID_FLAGS;
+	else if (rt_is_raid6(rs->raid_type))
+		return RAID6_VALID_FLAGS;
+
+	return 0;
+}
+
+/*
+ * Check for valid flags set on @rs
+ *
+ * Has to be called after parsing of the ctr flags!
+ */
+static int rs_check_for_valid_flags(struct raid_set *rs)
+{
+	if (rs->ctr_flags & ~__valid_flags(rs)) {
+		rs->ti->error = "Invalid flags combination";
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/* MD raid10 bit definitions and helpers */
+#define RAID10_OFFSET			(1 << 16) /* stripes with data copies area adjacent on devices */
+#define RAID10_BROCKEN_USE_FAR_SETS	(1 << 17) /* Broken in raid10.c: use sets instead of whole stripe rotation */
+#define RAID10_USE_FAR_SETS		(1 << 18) /* Use sets instead of whole stripe rotation */
+#define RAID10_FAR_COPIES_SHIFT		8	  /* raid10 # far copies shift (2nd byte of layout) */
+
+/* Return md raid10 near copies for @layout */
+static unsigned int __raid10_near_copies(int layout)
+{
+	return layout & 0xFF;
+}
+
+/* Return md raid10 far copies for @layout */
+static unsigned int __raid10_far_copies(int layout)
+{
+	return __raid10_near_copies(layout >> RAID10_FAR_COPIES_SHIFT);
+}
+
+/* Return true if md raid10 offset for @layout */
+static bool __is_raid10_offset(int layout)
+{
+	return !!(layout & RAID10_OFFSET);
+}
+
+/* Return true if md raid10 near for @layout */
+static bool __is_raid10_near(int layout)
+{
+	return !__is_raid10_offset(layout) && __raid10_near_copies(layout) > 1;
+}
+
+/* Return true if md raid10 far for @layout */
+static bool __is_raid10_far(int layout)
+{
+	return !__is_raid10_offset(layout) && __raid10_far_copies(layout) > 1;
+}
+
+/* Return md raid10 layout string for @layout */
+static const char *raid10_md_layout_to_format(int layout)
+{
+	/*
+	 * Bit 16 stands for "offset"
+	 * (i.e. adjacent stripes hold copies)
+	 *
+	 * Refer to MD's raid10.c for details
+	 */
+	if (__is_raid10_offset(layout))
+		return "offset";
+
+	if (__raid10_near_copies(layout) > 1)
+		return "near";
+
+	WARN_ON(__raid10_far_copies(layout) < 2);
+
+	return "far";
+}
+
+/* Return md raid10 algorithm for @name */
+static int raid10_name_to_format(const char *name)
+{
+	if (!strcasecmp(name, "near"))
+		return ALGORITHM_RAID10_NEAR;
+	else if (!strcasecmp(name, "offset"))
+		return ALGORITHM_RAID10_OFFSET;
+	else if (!strcasecmp(name, "far"))
+		return ALGORITHM_RAID10_FAR;
+
+	return -EINVAL;
+}
+
+/* Return md raid10 copies for @layout */
+static unsigned int raid10_md_layout_to_copies(int layout)
+{
+	return max(__raid10_near_copies(layout), __raid10_far_copies(layout));
+}
+
+/* Return md raid10 format id for @format string */
+static int raid10_format_to_md_layout(struct raid_set *rs,
+				      unsigned int algorithm,
+				      unsigned int copies)
+{
+	unsigned int n = 1, f = 1, r = 0;
+
+	/*
+	 * MD resilienece flaw:
+	 *
+	 * enabling use_far_sets for far/offset formats causes copies
+	 * to be colocated on the same devs together with their origins!
+	 *
+	 * -> disable it for now in the definition above
+	 */
+	if (algorithm == ALGORITHM_RAID10_DEFAULT ||
+	    algorithm == ALGORITHM_RAID10_NEAR)
+		n = copies;
+
+	else if (algorithm == ALGORITHM_RAID10_OFFSET) {
+		f = copies;
+		r = RAID10_OFFSET;
+		if (!test_bit(__CTR_FLAG_RAID10_USE_NEAR_SETS, &rs->ctr_flags))
+			r |= RAID10_USE_FAR_SETS;
+
+	} else if (algorithm == ALGORITHM_RAID10_FAR) {
+		f = copies;
+		r = !RAID10_OFFSET;
+		if (!test_bit(__CTR_FLAG_RAID10_USE_NEAR_SETS, &rs->ctr_flags))
+			r |= RAID10_USE_FAR_SETS;
+
+	} else
+		return -EINVAL;
+
+	return r | (f << RAID10_FAR_COPIES_SHIFT) | n;
+}
+/* END: MD raid10 bit definitions and helpers */
+
+/* Check for any of the raid10 algorithms */
+static bool __got_raid10(struct raid_type *rtp, const int layout)
+{
+	if (rtp->level == 10) {
+		switch (rtp->algorithm) {
+		case ALGORITHM_RAID10_DEFAULT:
+		case ALGORITHM_RAID10_NEAR:
+			return __is_raid10_near(layout);
+		case ALGORITHM_RAID10_OFFSET:
+			return __is_raid10_offset(layout);
+		case ALGORITHM_RAID10_FAR:
+			return __is_raid10_far(layout);
+		default:
+			break;
+		}
+	}
+
+	return false;
+}
+
+/* Return raid_type for @name */
+static struct raid_type *get_raid_type(const char *name)
+{
+	struct raid_type *rtp = raid_types + ARRAY_SIZE(raid_types);
+
+	while (rtp-- > raid_types)
+		if (!strcasecmp(rtp->name, name))
+			return rtp;
+
+	return NULL;
+}
+
+/* Return raid_type for @name based derived from @level and @layout */
+static struct raid_type *get_raid_type_by_ll(const int level, const int layout)
+{
+	struct raid_type *rtp = raid_types + ARRAY_SIZE(raid_types);
+
+	while (rtp-- > raid_types) {
+		/* RAID10 special checks based on @layout flags/properties */
+		if (rtp->level == level &&
+		    (__got_raid10(rtp, layout) || rtp->algorithm == layout))
+			return rtp;
+	}
+
+	return NULL;
+}
+
+/*
+ * Conditionally change bdev capacity of @rs
+ * in case of a disk add/remove reshape
+ */
+static void rs_set_capacity(struct raid_set *rs)
+{
+	struct mddev *mddev = &rs->md;
+	struct md_rdev *rdev;
+	struct gendisk *gendisk = dm_disk(dm_table_get_md(rs->ti->table));
+
+	/*
+	 * raid10 sets rdev->sector to the device size, which
+	 * is unintended in case of out-of-place reshaping
+	 */
+	rdev_for_each(rdev, mddev)
+		rdev->sectors = mddev->dev_sectors;
+
+	set_capacity(gendisk, mddev->array_sectors);
+	revalidate_disk(gendisk);
+}
+
+/*
+ * Set the mddev properties in @rs to the current
+ * ones retrieved from the freshest superblock
+ */
+static void rs_set_cur(struct raid_set *rs)
+{
+	struct mddev *mddev = &rs->md;
+
+	mddev->new_level = mddev->level;
+	mddev->new_layout = mddev->layout;
+	mddev->new_chunk_sectors = mddev->chunk_sectors;
+}
+
+/*
+ * Set the mddev properties in @rs to the new
+ * ones requested by the ctr
+ */
+static void rs_set_new(struct raid_set *rs)
+{
+	struct mddev *mddev = &rs->md;
+
+	mddev->level = mddev->new_level;
+	mddev->layout = mddev->new_layout;
+	mddev->chunk_sectors = mddev->new_chunk_sectors;
+	mddev->raid_disks = rs->raid_disks;
+	mddev->delta_disks = 0;
+}
+
+static struct raid_set *raid_set_alloc(struct dm_target *ti, struct raid_type *raid_type,
+				       unsigned int raid_devs)
+{
+	unsigned int i;
	struct raid_set *rs;

	if (raid_devs <= raid_type->parity_devs) {
···
	mddev_init(&rs->md);

+	rs->raid_disks = raid_devs;
+	rs->delta_disks = 0;
+
	rs->ti = ti;
	rs->raid_type = raid_type;
+	rs->stripe_cache_entries = 256;
	rs->md.raid_disks = raid_devs;
	rs->md.level = raid_type->level;
	rs->md.new_level = rs->md.level;
	rs->md.layout = raid_type->algorithm;
	rs->md.new_layout = rs->md.layout;
	rs->md.delta_disks = 0;
-	rs->md.recovery_cp = 0;
+	rs->md.recovery_cp = MaxSector;

	for (i = 0; i < raid_devs; i++)
		md_rdev_init(&rs->dev[i].rdev);
···
	return rs;
}

-static void context_free(struct raid_set *rs)
+static void raid_set_free(struct raid_set *rs)
{
	int i;

-	for (i = 0; i < rs->md.raid_disks; i++) {
+	for (i = 0; i < rs->raid_disks; i++) {
		if (rs->dev[i].meta_dev)
			dm_put_device(rs->ti, rs->dev[i].meta_dev);
		md_rdev_clear(&rs->dev[i].rdev);
···
 * <meta_dev> -
 *
 * This code parses those words.  If there is a failure,
- * the caller must use context_free to unwind the operations.
+ * the caller must use raid_set_free() to unwind the operations.
 */
745 222 */ 746 - static int dev_parms(struct raid_set *rs, char **argv) 223 + static int parse_dev_params(struct raid_set *rs, struct dm_arg_set *as) 747 224 { 748 225 int i; 749 226 int rebuild = 0; 750 227 int metadata_available = 0; 751 - int ret = 0; 228 + int r = 0; 229 + const char *arg; 752 230 753 - for (i = 0; i < rs->md.raid_disks; i++, argv += 2) { 231 + /* Put off the number of raid devices argument to get to dev pairs */ 232 + arg = dm_shift_arg(as); 233 + if (!arg) 234 + return -EINVAL; 235 + 236 + for (i = 0; i < rs->raid_disks; i++) { 754 237 rs->dev[i].rdev.raid_disk = i; 755 238 756 239 rs->dev[i].meta_dev = NULL; ··· 769 240 rs->dev[i].rdev.data_offset = 0; 770 241 rs->dev[i].rdev.mddev = &rs->md; 771 242 772 - if (strcmp(argv[0], "-")) { 773 - ret = dm_get_device(rs->ti, argv[0], 774 - dm_table_get_mode(rs->ti->table), 775 - &rs->dev[i].meta_dev); 776 - rs->ti->error = "RAID metadata device lookup failure"; 777 - if (ret) 778 - return ret; 243 + arg = dm_shift_arg(as); 244 + if (!arg) 245 + return -EINVAL; 246 + 247 + if (strcmp(arg, "-")) { 248 + r = dm_get_device(rs->ti, arg, dm_table_get_mode(rs->ti->table), 249 + &rs->dev[i].meta_dev); 250 + if (r) { 251 + rs->ti->error = "RAID metadata device lookup failure"; 252 + return r; 253 + } 779 254 780 255 rs->dev[i].rdev.sb_page = alloc_page(GFP_KERNEL); 781 - if (!rs->dev[i].rdev.sb_page) 256 + if (!rs->dev[i].rdev.sb_page) { 257 + rs->ti->error = "Failed to allocate superblock page"; 782 258 return -ENOMEM; 259 + } 783 260 } 784 261 785 - if (!strcmp(argv[1], "-")) { 262 + arg = dm_shift_arg(as); 263 + if (!arg) 264 + return -EINVAL; 265 + 266 + if (!strcmp(arg, "-")) { 786 267 if (!test_bit(In_sync, &rs->dev[i].rdev.flags) && 787 268 (!rs->dev[i].rdev.recovery_offset)) { 788 269 rs->ti->error = "Drive designated for rebuild not specified"; 789 270 return -EINVAL; 790 271 } 791 272 792 - rs->ti->error = "No data device supplied with metadata device"; 793 - if (rs->dev[i].meta_dev) 273 + if 
(rs->dev[i].meta_dev) { 274 + rs->ti->error = "No data device supplied with metadata device"; 794 275 return -EINVAL; 276 + } 795 277 796 278 continue; 797 279 } 798 280 799 - ret = dm_get_device(rs->ti, argv[1], 800 - dm_table_get_mode(rs->ti->table), 801 - &rs->dev[i].data_dev); 802 - if (ret) { 281 + r = dm_get_device(rs->ti, arg, dm_table_get_mode(rs->ti->table), 282 + &rs->dev[i].data_dev); 283 + if (r) { 803 284 rs->ti->error = "RAID device lookup failure"; 804 - return ret; 285 + return r; 805 286 } 806 287 807 288 if (rs->dev[i].meta_dev) { ··· 819 280 rs->dev[i].rdev.meta_bdev = rs->dev[i].meta_dev->bdev; 820 281 } 821 282 rs->dev[i].rdev.bdev = rs->dev[i].data_dev->bdev; 822 - list_add(&rs->dev[i].rdev.same_set, &rs->md.disks); 283 + list_add_tail(&rs->dev[i].rdev.same_set, &rs->md.disks); 823 284 if (!test_bit(In_sync, &rs->dev[i].rdev.flags)) 824 285 rebuild++; 825 286 } ··· 840 301 * 841 302 * User could specify 'nosync' option if desperate. 842 303 */ 843 - DMERR("Unable to rebuild drive while array is not in-sync"); 844 - rs->ti->error = "RAID device lookup failure"; 304 + rs->ti->error = "Unable to rebuild drive while array is not in-sync"; 845 305 return -EINVAL; 846 306 } 847 307 ··· 863 325 864 326 if (!region_size) { 865 327 /* 866 - * Choose a reasonable default. All figures in sectors. 328 + * Choose a reasonable default. All figures in sectors. 867 329 */ 868 330 if (min_region_size > (1 << 13)) { 869 331 /* If not a power of 2, make it the next power of 2 */ ··· 904 366 /* 905 367 * Convert sectors to bytes. 
     */
-   rs->md.bitmap_info.chunksize = (region_size << 9);
+   rs->md.bitmap_info.chunksize = to_bytes(region_size);

    return 0;
}
···
 */
static int validate_raid_redundancy(struct raid_set *rs)
{
-   unsigned i, rebuild_cnt = 0;
-   unsigned rebuilds_per_group = 0, copies, d;
-   unsigned group_size, last_group_start;
+   unsigned int i, rebuild_cnt = 0;
+   unsigned int rebuilds_per_group = 0, copies;
+   unsigned int group_size, last_group_start;

    for (i = 0; i < rs->md.raid_disks; i++)
        if (!test_bit(In_sync, &rs->dev[i].rdev.flags) ||
···
            goto too_many;
        break;
    case 10:
-       copies = raid10_md_layout_to_copies(rs->md.layout);
+       copies = raid10_md_layout_to_copies(rs->md.new_layout);
        if (rebuild_cnt < copies)
            break;
···
         * simple case where the number of devices is a multiple of the
         * number of copies, we must also handle cases where the number
         * of devices is not a multiple of the number of copies.
-        * E.g.  dev1 dev2 dev3 dev4 dev5
-        *       A    A    B    B    C
-        *       C    D    D    E    E
+        * E.g. dev1 dev2 dev3 dev4 dev5
+        *      A    A    B    B    C
+        *      C    D    D    E    E
         */
-       if (!strcmp("near", raid10_md_layout_to_format(rs->md.layout))) {
-           for (i = 0; i < rs->md.raid_disks * copies; i++) {
+       if (__is_raid10_near(rs->md.new_layout)) {
+           for (i = 0; i < rs->md.raid_disks; i++) {
                if (!(i % copies))
                    rebuilds_per_group = 0;
-               d = i % rs->md.raid_disks;
-               if ((!rs->dev[d].rdev.sb_page ||
-                    !test_bit(In_sync, &rs->dev[d].rdev.flags)) &&
+               if ((!rs->dev[i].rdev.sb_page ||
+                    !test_bit(In_sync, &rs->dev[i].rdev.flags)) &&
                    (++rebuilds_per_group >= copies))
                    goto too_many;
            }
···
         * use the 'use_far_sets' variant.)
         *
         * This check is somewhat complicated by the need to account
-        * for arrays that are not a multiple of (far) copies.  This
+        * for arrays that are not a multiple of (far) copies. This
         * results in the need to treat the last (potentially larger)
         * set differently.
         */
···
 *
 * Argument definitions
 *    <chunk_size>                      The number of sectors per disk that
-*                                      will form the "stripe"
+*                                      will form the "stripe"
 *    [[no]sync]                        Force or prevent recovery of the
-*                                      entire array
+*                                      entire array
 *    [rebuild <idx>]                   Rebuild the drive indicated by the index
 *    [daemon_sleep <ms>]               Time between bitmap daemon work to
-*                                      clear bits
+*                                      clear bits
 *    [min_recovery_rate <kB/sec/disk>] Throttle RAID initialization
 *    [max_recovery_rate <kB/sec/disk>] Throttle RAID initialization
 *    [write_mostly <idx>]              Indicate a write mostly drive via index
 *    [max_write_behind <sectors>]      See '-write-behind=' (man mdadm)
 *    [stripe_cache <sectors>]          Stripe cache size for higher RAIDs
-*    [region_size <sectors>]           Defines granularity of bitmap
+*    [region_size <sectors>]           Defines granularity of bitmap
 *
 * RAID10-only options:
-*    [raid10_copies <# copies>]        Number of copies.  (Default: 2)
+*    [raid10_copies <# copies>]        Number of copies. (Default: 2)
 *    [raid10_format <near|far|offset>] Layout algorithm. (Default: near)
 */
-static int parse_raid_params(struct raid_set *rs, char **argv,
-                 unsigned num_raid_params)
+static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
+                 unsigned int num_raid_params)
{
-   char *raid10_format = "near";
-   unsigned raid10_copies = 2;
-   unsigned i;
-   unsigned long value, region_size = 0;
-   sector_t sectors_per_dev = rs->ti->len;
+   int value, raid10_format = ALGORITHM_RAID10_DEFAULT;
+   unsigned int raid10_copies = 2;
+   unsigned int i, write_mostly = 0;
+   unsigned int region_size = 0;
    sector_t max_io_len;
-   char *key;
+   const char *arg, *key;
+   struct raid_dev *rd;
+   struct raid_type *rt = rs->raid_type;
+
+   arg = dm_shift_arg(as);
+   num_raid_params--; /* Account for chunk_size argument */
+
+   if (kstrtoint(arg, 10, &value) < 0) {
+       rs->ti->error = "Bad numerical argument given for chunk_size";
+       return -EINVAL;
+   }

    /*
     * First, parse the in-order required arguments
     * "chunk_size" is the only argument of this type.
     */
-   if ((kstrtoul(argv[0], 10, &value) < 0)) {
-       rs->ti->error = "Bad chunk size";
-       return -EINVAL;
-   } else if (rs->raid_type->level == 1) {
+   if (rt_is_raid1(rt)) {
        if (value)
            DMERR("Ignoring chunk size parameter for RAID 1");
        value = 0;
···
    }

    rs->md.new_chunk_sectors = rs->md.chunk_sectors = value;
-   argv++;
-   num_raid_params--;

    /*
     * We set each individual device as In_sync with a completed
···
     * replacement then one of the following cases applies:
     *
     * 1) User specifies 'rebuild'.
-    *    - Device is reset when param is read.
+    *    - Device is reset when param is read.
     * 2) A new device is supplied.
-    *    - No matching superblock found, resets device.
+    *    - No matching superblock found, resets device.
     * 3) Device failure was transient and returns on reload.
-    *    - Failure noticed, resets device for bitmap replay.
+    *    - Failure noticed, resets device for bitmap replay.
     * 4) Device hadn't completed recovery after previous failure.
-    *    - Superblock is read and overrides recovery_offset.
+    *    - Superblock is read and overrides recovery_offset.
     *
     * What is found in the superblocks of the devices is always
     * authoritative, unless 'rebuild' or '[no]sync' was specified.
     */
-   for (i = 0; i < rs->md.raid_disks; i++) {
+   for (i = 0; i < rs->raid_disks; i++) {
        set_bit(In_sync, &rs->dev[i].rdev.flags);
        rs->dev[i].rdev.recovery_offset = MaxSector;
    }
···
     * Second, parse the unordered optional arguments
     */
    for (i = 0; i < num_raid_params; i++) {
-       if (!strcasecmp(argv[i], "nosync")) {
-           rs->md.recovery_cp = MaxSector;
-           rs->ctr_flags |= CTR_FLAG_NOSYNC;
+       key = dm_shift_arg(as);
+       if (!key) {
+           rs->ti->error = "Not enough raid parameters given";
+           return -EINVAL;
+       }
+
+       if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_NOSYNC))) {
+           if (test_and_set_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags)) {
+               rs->ti->error = "Only one 'nosync' argument allowed";
+               return -EINVAL;
+           }
            continue;
        }
-       if (!strcasecmp(argv[i], "sync")) {
-           rs->md.recovery_cp = 0;
-           rs->ctr_flags |= CTR_FLAG_SYNC;
+       if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_SYNC))) {
+           if (test_and_set_bit(__CTR_FLAG_SYNC, &rs->ctr_flags)) {
+               rs->ti->error = "Only one 'sync' argument allowed";
+               return -EINVAL;
+           }
+           continue;
+       }
+       if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_USE_NEAR_SETS))) {
+           if (test_and_set_bit(__CTR_FLAG_RAID10_USE_NEAR_SETS, &rs->ctr_flags)) {
+               rs->ti->error = "Only one 'raid10_use_near_sets' argument allowed";
+               return -EINVAL;
+           }
            continue;
        }

-       /* The rest of the optional arguments come in key/value pairs */
-       if ((i + 1) >= num_raid_params) {
+       arg = dm_shift_arg(as);
+       i++; /* Account for the argument pairs */
+       if (!arg) {
            rs->ti->error = "Wrong number of raid parameters given";
            return -EINVAL;
        }

-       key = argv[i++];
+       /*
+        * Parameters that take a string value are checked here.
+        */

-       /* Parameters that take a string value are checked here. */
-       if (!strcasecmp(key, "raid10_format")) {
-           if (rs->raid_type->level != 10) {
+       if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_FORMAT))) {
+           if (test_and_set_bit(__CTR_FLAG_RAID10_FORMAT, &rs->ctr_flags)) {
+               rs->ti->error = "Only one 'raid10_format' argument pair allowed";
+               return -EINVAL;
+           }
+           if (!rt_is_raid10(rt)) {
                rs->ti->error = "'raid10_format' is an invalid parameter for this RAID type";
                return -EINVAL;
            }
-           if (strcmp("near", argv[i]) &&
-               strcmp("far", argv[i]) &&
-               strcmp("offset", argv[i])) {
+           raid10_format = raid10_name_to_format(arg);
+           if (raid10_format < 0) {
                rs->ti->error = "Invalid 'raid10_format' value given";
-               return -EINVAL;
+               return raid10_format;
            }
-           raid10_format = argv[i];
-           rs->ctr_flags |= CTR_FLAG_RAID10_FORMAT;
            continue;
        }

-       if (kstrtoul(argv[i], 10, &value) < 0) {
+       if (kstrtoint(arg, 10, &value) < 0) {
            rs->ti->error = "Bad numerical argument given in raid params";
            return -EINVAL;
        }

-       /* Parameters that take a numeric value are checked here */
-       if (!strcasecmp(key, "rebuild")) {
-           if (value >= rs->md.raid_disks) {
+       if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_REBUILD))) {
+           /*
+            * "rebuild" is being passed in by userspace to provide
+            * indexes of replaced devices and to set up additional
+            * devices on raid level takeover.
+            */
+           if (!__within_range(value, 0, rs->raid_disks - 1)) {
                rs->ti->error = "Invalid rebuild index given";
                return -EINVAL;
            }
-           clear_bit(In_sync, &rs->dev[value].rdev.flags);
-           rs->dev[value].rdev.recovery_offset = 0;
-           rs->ctr_flags |= CTR_FLAG_REBUILD;
-       } else if (!strcasecmp(key, "write_mostly")) {
-           if (rs->raid_type->level != 1) {
+
+           if (test_and_set_bit(value, (void *) rs->rebuild_disks)) {
+               rs->ti->error = "rebuild for this index already given";
+               return -EINVAL;
+           }
+
+           rd = rs->dev + value;
+           clear_bit(In_sync, &rd->rdev.flags);
+           clear_bit(Faulty, &rd->rdev.flags);
+           rd->rdev.recovery_offset = 0;
+           set_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags);
+       } else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_WRITE_MOSTLY))) {
+           if (!rt_is_raid1(rt)) {
                rs->ti->error = "write_mostly option is only valid for RAID1";
                return -EINVAL;
            }
-           if (value >= rs->md.raid_disks) {
-               rs->ti->error = "Invalid write_mostly drive index given";
+
+           if (!__within_range(value, 0, rs->md.raid_disks - 1)) {
+               rs->ti->error = "Invalid write_mostly index given";
                return -EINVAL;
            }
+
+           write_mostly++;
            set_bit(WriteMostly, &rs->dev[value].rdev.flags);
-       } else if (!strcasecmp(key, "max_write_behind")) {
-           if (rs->raid_type->level != 1) {
+           set_bit(__CTR_FLAG_WRITE_MOSTLY, &rs->ctr_flags);
+       } else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_MAX_WRITE_BEHIND))) {
+           if (!rt_is_raid1(rt)) {
                rs->ti->error = "max_write_behind option is only valid for RAID1";
                return -EINVAL;
            }
-           rs->ctr_flags |= CTR_FLAG_MAX_WRITE_BEHIND;
+
+           if (test_and_set_bit(__CTR_FLAG_MAX_WRITE_BEHIND, &rs->ctr_flags)) {
+               rs->ti->error = "Only one max_write_behind argument pair allowed";
+               return -EINVAL;
+           }

            /*
             * In device-mapper, we specify things in sectors, but
···
                rs->ti->error = "Max write-behind limit out of range";
                return -EINVAL;
            }
+
            rs->md.bitmap_info.max_write_behind = value;
-       } else if (!strcasecmp(key, "daemon_sleep")) {
-           rs->ctr_flags |= CTR_FLAG_DAEMON_SLEEP;
+       } else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_DAEMON_SLEEP))) {
+           if (test_and_set_bit(__CTR_FLAG_DAEMON_SLEEP, &rs->ctr_flags)) {
+               rs->ti->error = "Only one daemon_sleep argument pair allowed";
+               return -EINVAL;
+           }
            if (!value || (value > MAX_SCHEDULE_TIMEOUT)) {
                rs->ti->error = "daemon sleep period out of range";
                return -EINVAL;
            }
            rs->md.bitmap_info.daemon_sleep = value;
-       } else if (!strcasecmp(key, "stripe_cache")) {
-           rs->ctr_flags |= CTR_FLAG_STRIPE_CACHE;
+       } else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_DATA_OFFSET))) {
+           /* Userspace passes new data_offset after having extended the data image LV */
+           if (test_and_set_bit(__CTR_FLAG_DATA_OFFSET, &rs->ctr_flags)) {
+               rs->ti->error = "Only one data_offset argument pair allowed";
+               return -EINVAL;
+           }
+           /* Ensure sensible data offset */
+           if (value < 0 ||
+               (value && (value < MIN_FREE_RESHAPE_SPACE || value % to_sector(PAGE_SIZE)))) {
+               rs->ti->error = "Bogus data_offset value";
+               return -EINVAL;
+           }
+           rs->data_offset = value;
+       } else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_DELTA_DISKS))) {
+           /* Define the +/-# of disks to add to/remove from the given raid set */
+           if (test_and_set_bit(__CTR_FLAG_DELTA_DISKS, &rs->ctr_flags)) {
+               rs->ti->error = "Only one delta_disks argument pair allowed";
+               return -EINVAL;
+           }
+           /* Ensure MAX_RAID_DEVICES and raid type minimal_devs! */
+           if (!__within_range(abs(value), 1, MAX_RAID_DEVICES - rt->minimal_devs)) {
+               rs->ti->error = "Too many delta_disks requested";
+               return -EINVAL;
+           }

-           /*
-            * In device-mapper, we specify things in sectors, but
-            * MD records this value in kB
-            */
-           value /= 2;
+           rs->delta_disks = value;
+       } else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_STRIPE_CACHE))) {
+           if (test_and_set_bit(__CTR_FLAG_STRIPE_CACHE, &rs->ctr_flags)) {
+               rs->ti->error = "Only one stripe_cache argument pair allowed";
+               return -EINVAL;
+           }

-           if ((rs->raid_type->level != 5) &&
-               (rs->raid_type->level != 6)) {
+           if (!rt_is_raid456(rt)) {
                rs->ti->error = "Inappropriate argument: stripe_cache";
                return -EINVAL;
            }
-           if (raid5_set_cache_size(&rs->md, (int)value)) {
-               rs->ti->error = "Bad stripe_cache size";
+
+           rs->stripe_cache_entries = value;
+       } else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_MIN_RECOVERY_RATE))) {
+           if (test_and_set_bit(__CTR_FLAG_MIN_RECOVERY_RATE, &rs->ctr_flags)) {
+               rs->ti->error = "Only one min_recovery_rate argument pair allowed";
                return -EINVAL;
            }
-       } else if (!strcasecmp(key, "min_recovery_rate")) {
-           rs->ctr_flags |= CTR_FLAG_MIN_RECOVERY_RATE;
            if (value > INT_MAX) {
                rs->ti->error = "min_recovery_rate out of range";
                return -EINVAL;
            }
            rs->md.sync_speed_min = (int)value;
-       } else if (!strcasecmp(key, "max_recovery_rate")) {
-           rs->ctr_flags |= CTR_FLAG_MAX_RECOVERY_RATE;
+       } else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_MAX_RECOVERY_RATE))) {
+           if (test_and_set_bit(__CTR_FLAG_MAX_RECOVERY_RATE, &rs->ctr_flags)) {
+               rs->ti->error = "Only one max_recovery_rate argument pair allowed";
+               return -EINVAL;
+           }
            if (value > INT_MAX) {
                rs->ti->error = "max_recovery_rate out of range";
                return -EINVAL;
            }
            rs->md.sync_speed_max = (int)value;
-       } else if (!strcasecmp(key, "region_size")) {
-           rs->ctr_flags |= CTR_FLAG_REGION_SIZE;
+       } else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_REGION_SIZE))) {
+           if (test_and_set_bit(__CTR_FLAG_REGION_SIZE, &rs->ctr_flags)) {
+               rs->ti->error = "Only one region_size argument pair allowed";
+               return -EINVAL;
+           }
+
            region_size = value;
-       } else if (!strcasecmp(key, "raid10_copies") &&
-              (rs->raid_type->level == 10)) {
-           if ((value < 2) || (value > 0xFF)) {
+           rs->requested_bitmap_chunk_sectors = value;
+       } else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_COPIES))) {
+           if (test_and_set_bit(__CTR_FLAG_RAID10_COPIES, &rs->ctr_flags)) {
+               rs->ti->error = "Only one raid10_copies argument pair allowed";
+               return -EINVAL;
+           }
+
+           if (!__within_range(value, 2, rs->md.raid_disks)) {
                rs->ti->error = "Bad value for 'raid10_copies'";
                return -EINVAL;
            }
-           rs->ctr_flags |= CTR_FLAG_RAID10_COPIES;
+
            raid10_copies = value;
        } else {
            DMERR("Unable to parse RAID parameter: %s", key);
-           rs->ti->error = "Unable to parse RAID parameters";
+           rs->ti->error = "Unable to parse RAID parameter";
            return -EINVAL;
        }
+   }
+
+   if (test_bit(__CTR_FLAG_SYNC, &rs->ctr_flags) &&
+       test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags)) {
+       rs->ti->error = "sync and nosync are mutually exclusive";
+       return -EINVAL;
+   }
+
+   if (test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags) &&
+       (test_bit(__CTR_FLAG_SYNC, &rs->ctr_flags) ||
+        test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags))) {
+       rs->ti->error = "sync/nosync and rebuild are mutually exclusive";
+       return -EINVAL;
+   }
+
+   if (write_mostly >= rs->md.raid_disks) {
+       rs->ti->error = "Can't set all raid1 devices to write_mostly";
+       return -EINVAL;
    }

    if (validate_region_size(rs, region_size))
···
    if (dm_set_target_max_io_len(rs->ti, max_io_len))
        return -EINVAL;

-   if (rs->raid_type->level == 10) {
+   if (rt_is_raid10(rt)) {
        if (raid10_copies > rs->md.raid_disks) {
            rs->ti->error = "Not enough devices to satisfy specification";
            return -EINVAL;
        }

-       /*
-        * If the format is not "near", we only support
-        * two copies at the moment.
-        */
-       if (strcmp("near", raid10_format) && (raid10_copies > 2)) {
-           rs->ti->error = "Too many copies for given RAID10 format.";
+       rs->md.new_layout = raid10_format_to_md_layout(rs, raid10_format, raid10_copies);
+       if (rs->md.new_layout < 0) {
+           rs->ti->error = "Error getting raid10 format";
+           return rs->md.new_layout;
+       }
+
+       rt = get_raid_type_by_ll(10, rs->md.new_layout);
+       if (!rt) {
+           rs->ti->error = "Failed to recognize new raid10 layout";
            return -EINVAL;
        }

-       /* (Len * #mirrors) / #devices */
-       sectors_per_dev = rs->ti->len * raid10_copies;
-       sector_div(sectors_per_dev, rs->md.raid_disks);
-
-       rs->md.layout = raid10_format_to_md_layout(raid10_format,
-                              raid10_copies);
-       rs->md.new_layout = rs->md.layout;
-   } else if ((!rs->raid_type->level || rs->raid_type->level > 1) &&
-          sector_div(sectors_per_dev,
-                 (rs->md.raid_disks - rs->raid_type->parity_devs))) {
-       rs->ti->error = "Target length not divisible by number of data devices";
-       return -EINVAL;
+       if ((rt->algorithm == ALGORITHM_RAID10_DEFAULT ||
+            rt->algorithm == ALGORITHM_RAID10_NEAR) &&
+           test_bit(__CTR_FLAG_RAID10_USE_NEAR_SETS, &rs->ctr_flags)) {
+           rs->ti->error = "RAID10 format 'near' and 'raid10_use_near_sets' are incompatible";
+           return -EINVAL;
+       }
    }
-   rs->md.dev_sectors = sectors_per_dev;
+
+   rs->raid10_copies = raid10_copies;

    /* Assume there are no metadata devices until the drives are parsed */
    rs->md.persistent = 0;
    rs->md.external = 1;

+   /* Check if any invalid ctr arguments have been passed in for the raid level */
+   return rs_check_for_valid_flags(rs);
+}
+
+/* Set raid4/5/6 cache size */
+static int rs_set_raid456_stripe_cache(struct raid_set *rs)
+{
+   int r;
+   struct r5conf *conf;
+   struct mddev *mddev = &rs->md;
+   uint32_t min_stripes = max(mddev->chunk_sectors, mddev->new_chunk_sectors) / 2;
+   uint32_t nr_stripes = rs->stripe_cache_entries;
+
+   if (!rt_is_raid456(rs->raid_type)) {
+       rs->ti->error = "Inappropriate raid level; cannot change stripe_cache size";
+       return -EINVAL;
+   }
+
+   if (nr_stripes < min_stripes) {
+       DMINFO("Adjusting requested %u stripe cache entries to %u to suit stripe size",
+              nr_stripes, min_stripes);
+       nr_stripes = min_stripes;
+   }
+
+   conf = mddev->private;
+   if (!conf) {
+       rs->ti->error = "Cannot change stripe_cache size on inactive RAID set";
+       return -EINVAL;
+   }
+
+   /* Try setting number of stripes in raid456 stripe cache */
+   if (conf->min_nr_stripes != nr_stripes) {
+       r = raid5_set_cache_size(mddev, nr_stripes);
+       if (r) {
+           rs->ti->error = "Failed to set raid4/5/6 stripe cache size";
+           return r;
+       }
+
+       DMINFO("%u stripe cache entries", nr_stripes);
+   }
+
    return 0;
+}
+
+/* Return # of data stripes as kept in mddev as of @rs (i.e. as of superblock) */
+static unsigned int mddev_data_stripes(struct raid_set *rs)
+{
+   return rs->md.raid_disks - rs->raid_type->parity_devs;
+}
+
+/* Return # of data stripes of @rs (i.e. as of ctr) */
+static unsigned int rs_data_stripes(struct raid_set *rs)
+{
+   return rs->raid_disks - rs->raid_type->parity_devs;
+}
+
+/* Calculate the sectors per device and per array used for @rs */
+static int rs_set_dev_and_array_sectors(struct raid_set *rs, bool use_mddev)
+{
+   int delta_disks;
+   unsigned int data_stripes;
+   struct mddev *mddev = &rs->md;
+   struct md_rdev *rdev;
+   sector_t array_sectors = rs->ti->len, dev_sectors = rs->ti->len;
+
+   if (use_mddev) {
+       delta_disks = mddev->delta_disks;
+       data_stripes = mddev_data_stripes(rs);
+   } else {
+       delta_disks = rs->delta_disks;
+       data_stripes = rs_data_stripes(rs);
+   }
+
+   /* Special raid1 case w/o delta_disks support (yet) */
+   if (rt_is_raid1(rs->raid_type))
+       ;
+   else if (rt_is_raid10(rs->raid_type)) {
+       if (rs->raid10_copies < 2 ||
+           delta_disks < 0) {
+           rs->ti->error = "Bogus raid10 data copies or delta disks";
+           return -EINVAL;
+       }
+
+       dev_sectors *= rs->raid10_copies;
+       if (sector_div(dev_sectors, data_stripes))
+           goto bad;
+
+       array_sectors = (data_stripes + delta_disks) * dev_sectors;
+       if (sector_div(array_sectors, rs->raid10_copies))
+           goto bad;
+
+   } else if (sector_div(dev_sectors, data_stripes))
+       goto bad;
+
+   else
+       /* Striped layouts */
+       array_sectors = (data_stripes + delta_disks) * dev_sectors;
+
+   rdev_for_each(rdev, mddev)
+       rdev->sectors = dev_sectors;
+
+   mddev->array_sectors = array_sectors;
+   mddev->dev_sectors = dev_sectors;
+
+   return 0;
+bad:
+   rs->ti->error = "Target length not divisible by number of data devices";
+   return -EINVAL;
+}
+
+/* Setup recovery on @rs */
+static void __rs_setup_recovery(struct raid_set *rs, sector_t dev_sectors)
+{
+   /* raid0 does not recover */
+   if (rs_is_raid0(rs))
+       rs->md.recovery_cp = MaxSector;
+   /*
+    * A raid6 set has to be recovered either
+    * completely or for the grown part to
+    * ensure proper parity and Q-Syndrome
+    */
+   else if (rs_is_raid6(rs))
+       rs->md.recovery_cp = dev_sectors;
+   /*
+    * Other raid set types may skip recovery
+    * depending on the 'nosync' flag.
+    */
+   else
+       rs->md.recovery_cp = test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags)
+                    ? MaxSector : dev_sectors;
+}
+
+/* Setup recovery on @rs based on raid type, device size and 'nosync' flag */
+static void rs_setup_recovery(struct raid_set *rs, sector_t dev_sectors)
+{
+   if (!dev_sectors)
+       /* New raid set or 'sync' flag provided */
+       __rs_setup_recovery(rs, 0);
+   else if (dev_sectors == MaxSector)
+       /* Prevent recovery */
+       __rs_setup_recovery(rs, MaxSector);
+   else if (rs->dev[0].rdev.sectors < dev_sectors)
+       /* Grown raid set */
+       __rs_setup_recovery(rs, rs->dev[0].rdev.sectors);
+   else
+       __rs_setup_recovery(rs, MaxSector);
}

static void do_table_event(struct work_struct *ws)
{
    struct raid_set *rs = container_of(ws, struct raid_set, md.event_work);

+   smp_rmb(); /* Make sure we access the most recent mddev properties */
+   if (!rs_is_reshaping(rs))
+       rs_set_capacity(rs);
    dm_table_event(rs->ti->table);
}

···
}

/*
+ * Make sure a valid takeover (level switch) is being requested on @rs
+ *
+ * Conversions of raid sets from one MD personality to another
+ * have to conform to restrictions which are enforced here.
+ */
+static int rs_check_takeover(struct raid_set *rs)
+{
+   struct mddev *mddev = &rs->md;
+   unsigned int near_copies;
+
+   if (rs->md.degraded) {
+       rs->ti->error = "Can't takeover degraded raid set";
+       return -EPERM;
+   }
+
+   if (rs_is_reshaping(rs)) {
+       rs->ti->error = "Can't takeover reshaping raid set";
+       return -EPERM;
+   }
+
+   switch (mddev->level) {
+   case 0:
+       /* raid0 -> raid1/5 with one disk */
+       if ((mddev->new_level == 1 || mddev->new_level == 5) &&
+           mddev->raid_disks == 1)
+           return 0;
+
+       /* raid0 -> raid10 */
+       if (mddev->new_level == 10 &&
+           !(rs->raid_disks % mddev->raid_disks))
+           return 0;
+
+       /* raid0 with multiple disks -> raid4/5/6 */
+       if (__within_range(mddev->new_level, 4, 6) &&
+           mddev->new_layout == ALGORITHM_PARITY_N &&
+           mddev->raid_disks > 1)
+           return 0;
+
+       break;
+
+   case 10:
+       /* Can't takeover raid10_offset! */
+       if (__is_raid10_offset(mddev->layout))
+           break;
+
+       near_copies = __raid10_near_copies(mddev->layout);
+
+       /* raid10* -> raid0 */
+       if (mddev->new_level == 0) {
+           /* Can takeover raid10_near with raid disks divisible by data copies! */
+           if (near_copies > 1 &&
+               !(mddev->raid_disks % near_copies)) {
+               mddev->raid_disks /= near_copies;
+               mddev->delta_disks = mddev->raid_disks;
+               return 0;
+           }
+
+           /* Can takeover raid10_far */
+           if (near_copies == 1 &&
+               __raid10_far_copies(mddev->layout) > 1)
+               return 0;
+
+           break;
+       }
+
+       /* raid10_{near,far} -> raid1 */
+       if (mddev->new_level == 1 &&
+           max(near_copies, __raid10_far_copies(mddev->layout)) == mddev->raid_disks)
+           return 0;
+
+       /* raid10_{near,far} with 2 disks -> raid4/5 */
+       if (__within_range(mddev->new_level, 4, 5) &&
+           mddev->raid_disks == 2)
+           return 0;
+       break;
+
+   case 1:
+       /* raid1 with 2 disks -> raid4/5 */
+       if (__within_range(mddev->new_level, 4, 5) &&
+           mddev->raid_disks == 2) {
+           mddev->degraded = 1;
+           return 0;
+       }
+
+       /* raid1 -> raid0 */
+       if (mddev->new_level == 0 &&
+           mddev->raid_disks == 1)
+           return 0;
+
+       /* raid1 -> raid10 */
+       if (mddev->new_level == 10)
+           return 0;
+       break;
+
+   case 4:
+       /* raid4 -> raid0 */
+       if (mddev->new_level == 0)
+           return 0;
+
+       /* raid4 -> raid1/5 with 2 disks */
+       if ((mddev->new_level == 1 || mddev->new_level == 5) &&
+           mddev->raid_disks == 2)
+           return 0;
+
+       /* raid4 -> raid5/6 with parity N */
+       if (__within_range(mddev->new_level, 5, 6) &&
+           mddev->layout == ALGORITHM_PARITY_N)
+           return 0;
+       break;
+
+   case 5:
+       /* raid5 with parity N -> raid0 */
+       if (mddev->new_level == 0 &&
+           mddev->layout == ALGORITHM_PARITY_N)
+           return 0;
+
+       /* raid5 with parity N -> raid4 */
+       if (mddev->new_level == 4 &&
+           mddev->layout == ALGORITHM_PARITY_N)
+           return 0;
+
+       /* raid5 with 2 disks -> raid1/4/10 */
+       if ((mddev->new_level == 1 || mddev->new_level == 4 || mddev->new_level == 10) &&
+           mddev->raid_disks == 2)
+           return 0;
+
+       /* raid5_* -> raid6_*_6 with Q-Syndrome N (e.g. raid5_ra -> raid6_ra_6) */
+       if (mddev->new_level == 6 &&
+           ((mddev->layout == ALGORITHM_PARITY_N && mddev->new_layout == ALGORITHM_PARITY_N) ||
+            __within_range(mddev->new_layout, ALGORITHM_LEFT_ASYMMETRIC_6, ALGORITHM_RIGHT_SYMMETRIC_6)))
+           return 0;
+       break;
+
+   case 6:
+       /* raid6 with parity N -> raid0 */
+       if (mddev->new_level == 0 &&
+           mddev->layout == ALGORITHM_PARITY_N)
+           return 0;
+
+       /* raid6 with parity N -> raid4 */
+       if (mddev->new_level == 4 &&
+           mddev->layout == ALGORITHM_PARITY_N)
+           return 0;
+
+       /* raid6_*_n with Q-Syndrome N -> raid5_* */
+       if (mddev->new_level == 5 &&
+           ((mddev->layout == ALGORITHM_PARITY_N && mddev->new_layout == ALGORITHM_PARITY_N) ||
+            __within_range(mddev->new_layout, ALGORITHM_LEFT_ASYMMETRIC, ALGORITHM_RIGHT_SYMMETRIC)))
+           return 0;
+
+   default:
+       break;
+   }
+
+   rs->ti->error = "takeover not possible";
+   return -EINVAL;
+}
+
+/* True if @rs requested to be taken over */
+static bool rs_takeover_requested(struct raid_set *rs)
+{
+   return rs->md.new_level != rs->md.level;
+}
+
+/* True if @rs is requested to reshape by ctr */
+static bool rs_reshape_requested(struct raid_set *rs)
+{
+   bool change;
+   struct mddev *mddev = &rs->md;
+
+   if (rs_takeover_requested(rs))
+       return false;
+
+   if (!mddev->level)
+       return false;
+
+   change = mddev->new_layout != mddev->layout ||
+        mddev->new_chunk_sectors != mddev->chunk_sectors ||
+        rs->delta_disks;
+
+   /* Historical case to support raid1 reshape without delta disks */
+   if (mddev->level == 1) {
+       if (rs->delta_disks)
+           return !!rs->delta_disks;
+
+       return !change &&
+              mddev->raid_disks != rs->raid_disks;
+   }
+
+   if (mddev->level == 10)
+       return change &&
+              !__is_raid10_far(mddev->new_layout) &&
+              rs->delta_disks >= 0;
+
+   return change;
+}
+
+/* Features */
+#define FEATURE_FLAG_SUPPORTS_V190 0x1 /* Supports extended superblock */
+
+/* State flags for sb->flags */
+#define SB_FLAG_RESHAPE_ACTIVE     0x1
+#define SB_FLAG_RESHAPE_BACKWARDS  0x2
+
+/*
 * This structure is never routinely used by userspace, unlike md superblocks.
 * Devices with this superblock should only ever be accessed via device-mapper.
 */
#define DM_RAID_MAGIC 0x64526D44
struct dm_raid_superblock {
    __le32 magic;       /* "DmRd" */
-   __le32 features;    /* Used to indicate possible future changes */
+   __le32 compat_features; /* Used to indicate compatible features (like 1.9.0 ondisk metadata extension) */

-   __le32 num_devices; /* Number of devices in this array. (Max 64) */
-   __le32 array_position;  /* The position of this drive in the array */
+   __le32 num_devices; /* Number of devices in this raid set. (Max 64) */
+   __le32 array_position;  /* The position of this drive in the raid set */

    __le64 events;      /* Incremented by md when superblock updated */
-   __le64 failed_devices;  /* Bit field of devices to indicate failures */
+   __le64 failed_devices;  /* Pre 1.9.0 part of bit field of devices to */
+               /* indicate failures (see extension below) */

    /*
     * This offset tracks the progress of the repair or replacement of
···
    __le64 disk_recovery_offset;

    /*
-    * This offset tracks the progress of the initial array
+    * This offset tracks the progress of the initial raid set
     * synchronisation/parity calculation.
     */
    __le64 array_resync_offset;

    /*
-    * RAID characteristics
+    * raid characteristics
     */
    __le32 level;
    __le32 layout;
    __le32 stripe_sectors;

-   /* Remainder of a logical block is zero-filled when writing (see super_sync()). */
+   /********************************************************************
+    * BELOW FOLLOW V1.9.0 EXTENSIONS TO THE PRISTINE SUPERBLOCK FORMAT!!!
+    *
+    * FEATURE_FLAG_SUPPORTS_V190 in the features member indicates that those exist
+    */
+
+   __le32 flags; /* Flags defining array states for reshaping */
+
+   /*
+    * This offset tracks the progress of a raid
+    * set reshape in order to be able to restart it
+    */
+   __le64 reshape_position;
+
+   /*
+    * These define the properties of the array in case of an interrupted reshape
+    */
+   __le32 new_level;
+   __le32 new_layout;
+   __le32 new_stripe_sectors;
+   __le32 delta_disks;
+
+   __le64 array_sectors; /* Array size in sectors */
+
+   /*
+    * Sector offsets to data on devices (reshaping).
+    * Needed to support out of place reshaping, thus
+    * not writing over any stripes whilst converting
+    * them from old to new layout
+    */
+   __le64 data_offset;
+   __le64 new_data_offset;
+
+   __le64 sectors; /* Used device size in sectors */
+
+   /*
+    * Additional bit field of devices indicating failures to support
+    * up to 256 devices with the 1.9.0 on-disk metadata format
+    */
+   __le64 extended_failed_devices[DISKS_ARRAY_ELEMS - 1];
+
+   __le32 incompat_features;   /* Used to indicate any incompatible features */
+
+   /* Always set rest up to logical block size to 0 when writing (see get_metadata_device() below). */
} __packed;
+
+/*
+ * Check for reshape constraints on raid set @rs:
+ *
+ * - reshape function non-existent
+ * - degraded set
+ * - ongoing recovery
+ * - ongoing reshape
+ *
+ * Returns 0 if none apply, otherwise -EPERM with the
+ * error message set in rs->ti->error
+ */
+static int rs_check_reshape(struct raid_set *rs)
+{
+   struct mddev *mddev = &rs->md;
+
+   if (!mddev->pers || !mddev->pers->check_reshape)
+       rs->ti->error = "Reshape not supported";
+   else if (mddev->degraded)
+       rs->ti->error = "Can't reshape degraded raid set";
+   else if (rs_is_recovering(rs))
+       rs->ti->error = "Convert request on recovering raid set prohibited";
+   else if (rs_is_reshaping(rs))
+       rs->ti->error = "raid set already reshaping!";
+   else if (!(rs_is_raid1(rs) || rs_is_raid10(rs) || rs_is_raid456(rs)))
+       rs->ti->error = "Reshaping only supported for raid1/4/5/6/10";
+   else
+       return 0;
+
+   return -EPERM;
+}

static int read_disk_sb(struct md_rdev *rdev, int size)
{
···
    if (rdev->sb_loaded)
        return 0;

-   if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, 0, 1)) {
+   if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, 0, true)) {
        DMERR("Failed to read superblock of device at position %d",
              rdev->raid_disk);
        md_error(rdev->mddev, rdev);
···
    return 0;
}

+static void sb_retrieve_failed_devices(struct dm_raid_superblock *sb, uint64_t *failed_devices)
+{
+   failed_devices[0] = le64_to_cpu(sb->failed_devices);
+   memset(failed_devices + 1, 0, sizeof(sb->extended_failed_devices));
+
+   if (le32_to_cpu(sb->compat_features) & FEATURE_FLAG_SUPPORTS_V190) {
+       int i = ARRAY_SIZE(sb->extended_failed_devices);
+
+       while (i--)
+           failed_devices[i+1] = le64_to_cpu(sb->extended_failed_devices[i]);
817 + } 818 + } 819 + 820 + static void sb_update_failed_devices(struct dm_raid_superblock *sb, uint64_t *failed_devices) 821 + { 822 + int i = ARRAY_SIZE(sb->extended_failed_devices); 823 + 824 + sb->failed_devices = cpu_to_le64(failed_devices[0]); 825 + while (i--) 826 + sb->extended_failed_devices[i] = cpu_to_le64(failed_devices[i+1]); 827 + } 828 + 829 + /* 830 + * Synchronize the superblock members with the raid set properties 831 + * 832 + * All superblock data is little endian. 833 + */ 1871 834 static void super_sync(struct mddev *mddev, struct md_rdev *rdev) 1872 835 { 1873 - int i; 1874 - uint64_t failed_devices; 836 + bool update_failed_devices = false; 837 + unsigned int i; 838 + uint64_t failed_devices[DISKS_ARRAY_ELEMS]; 1875 839 struct dm_raid_superblock *sb; 1876 840 struct raid_set *rs = container_of(mddev, struct raid_set, md); 1877 841 842 + /* No metadata device, no superblock */ 843 + if (!rdev->meta_bdev) 844 + return; 845 + 846 + BUG_ON(!rdev->sb_page); 847 + 1878 848 sb = page_address(rdev->sb_page); 1879 - failed_devices = le64_to_cpu(sb->failed_devices); 1880 849 1881 - for (i = 0; i < mddev->raid_disks; i++) 1882 - if (!rs->dev[i].data_dev || 1883 - test_bit(Faulty, &(rs->dev[i].rdev.flags))) 1884 - failed_devices |= (1ULL << i); 850 + sb_retrieve_failed_devices(sb, failed_devices); 1885 851 1886 - memset(sb + 1, 0, rdev->sb_size - sizeof(*sb)); 852 + for (i = 0; i < rs->raid_disks; i++) 853 + if (!rs->dev[i].data_dev || test_bit(Faulty, &rs->dev[i].rdev.flags)) { 854 + update_failed_devices = true; 855 + set_bit(i, (void *) failed_devices); 856 + } 857 + 858 + if (update_failed_devices) 859 + sb_update_failed_devices(sb, failed_devices); 1887 860 1888 861 sb->magic = cpu_to_le32(DM_RAID_MAGIC); 1889 - sb->features = cpu_to_le32(0); /* No features yet */ 862 + sb->compat_features = cpu_to_le32(FEATURE_FLAG_SUPPORTS_V190); 1890 863 1891 864 sb->num_devices = cpu_to_le32(mddev->raid_disks); 1892 865 sb->array_position = 
cpu_to_le32(rdev->raid_disk); 1893 866 1894 867 sb->events = cpu_to_le64(mddev->events); 1895 - sb->failed_devices = cpu_to_le64(failed_devices); 1896 868 1897 869 sb->disk_recovery_offset = cpu_to_le64(rdev->recovery_offset); 1898 870 sb->array_resync_offset = cpu_to_le64(mddev->recovery_cp); ··· 1936 836 sb->level = cpu_to_le32(mddev->level); 1937 837 sb->layout = cpu_to_le32(mddev->layout); 1938 838 sb->stripe_sectors = cpu_to_le32(mddev->chunk_sectors); 839 + 840 + sb->new_level = cpu_to_le32(mddev->new_level); 841 + sb->new_layout = cpu_to_le32(mddev->new_layout); 842 + sb->new_stripe_sectors = cpu_to_le32(mddev->new_chunk_sectors); 843 + 844 + sb->delta_disks = cpu_to_le32(mddev->delta_disks); 845 + 846 + smp_rmb(); /* Make sure we access most recent reshape position */ 847 + sb->reshape_position = cpu_to_le64(mddev->reshape_position); 848 + if (le64_to_cpu(sb->reshape_position) != MaxSector) { 849 + /* Flag ongoing reshape */ 850 + sb->flags |= cpu_to_le32(SB_FLAG_RESHAPE_ACTIVE); 851 + 852 + if (mddev->delta_disks < 0 || mddev->reshape_backwards) 853 + sb->flags |= cpu_to_le32(SB_FLAG_RESHAPE_BACKWARDS); 854 + } else { 855 + /* Clear reshape flags */ 856 + sb->flags &= ~(cpu_to_le32(SB_FLAG_RESHAPE_ACTIVE|SB_FLAG_RESHAPE_BACKWARDS)); 857 + } 858 + 859 + sb->array_sectors = cpu_to_le64(mddev->array_sectors); 860 + sb->data_offset = cpu_to_le64(rdev->data_offset); 861 + sb->new_data_offset = cpu_to_le64(rdev->new_data_offset); 862 + sb->sectors = cpu_to_le64(rdev->sectors); 863 + 864 + /* Zero out the rest of the payload after the size of the superblock */ 865 + memset(sb + 1, 0, rdev->sb_size - sizeof(*sb)); 1939 866 } 1940 867 1941 868 /* ··· 1975 848 */ 1976 849 static int super_load(struct md_rdev *rdev, struct md_rdev *refdev) 1977 850 { 1978 - int ret; 851 + int r; 1979 852 struct dm_raid_superblock *sb; 1980 853 struct dm_raid_superblock *refsb; 1981 854 uint64_t events_sb, events_refsb; ··· 1987 860 return -EINVAL; 1988 861 } 1989 862 1990 - ret = 
read_disk_sb(rdev, rdev->sb_size); 1991 - if (ret) 1992 - return ret; 863 + r = read_disk_sb(rdev, rdev->sb_size); 864 + if (r) 865 + return r; 1993 866 1994 867 sb = page_address(rdev->sb_page); 1995 868 ··· 2003 876 super_sync(rdev->mddev, rdev); 2004 877 2005 878 set_bit(FirstUse, &rdev->flags); 879 + sb->compat_features = cpu_to_le32(FEATURE_FLAG_SUPPORTS_V190); 2006 880 2007 881 /* Force writing of superblocks to disk */ 2008 882 set_bit(MD_CHANGE_DEVS, &rdev->mddev->flags); ··· 2023 895 return (events_sb > events_refsb) ? 1 : 0; 2024 896 } 2025 897 2026 - static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev) 898 + static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev) 2027 899 { 2028 900 int role; 2029 - struct raid_set *rs = container_of(mddev, struct raid_set, md); 901 + unsigned int d; 902 + struct mddev *mddev = &rs->md; 2030 903 uint64_t events_sb; 2031 - uint64_t failed_devices; 904 + uint64_t failed_devices[DISKS_ARRAY_ELEMS]; 2032 905 struct dm_raid_superblock *sb; 2033 - uint32_t new_devs = 0; 2034 - uint32_t rebuilds = 0; 906 + uint32_t new_devs = 0, rebuild_and_new = 0, rebuilds = 0; 2035 907 struct md_rdev *r; 2036 908 struct dm_raid_superblock *sb2; 2037 909 2038 910 sb = page_address(rdev->sb_page); 2039 911 events_sb = le64_to_cpu(sb->events); 2040 - failed_devices = le64_to_cpu(sb->failed_devices); 2041 912 2042 913 /* 2043 914 * Initialise to 1 if this is a new superblock. 2044 915 */ 2045 916 mddev->events = events_sb ? : 1; 2046 917 918 + mddev->reshape_position = MaxSector; 919 + 2047 920 /* 2048 - * Reshaping is not currently allowed 921 + * Reshaping is supported, e.g. reshape_position is valid 922 + * in superblock and superblock content is authoritative. 2049 923 */ 2050 - if (le32_to_cpu(sb->level) != mddev->level) { 2051 - DMERR("Reshaping arrays not yet supported. 
(RAID level change)"); 2052 - return -EINVAL; 2053 - } 2054 - if (le32_to_cpu(sb->layout) != mddev->layout) { 2055 - DMERR("Reshaping arrays not yet supported. (RAID layout change)"); 2056 - DMERR(" 0x%X vs 0x%X", le32_to_cpu(sb->layout), mddev->layout); 2057 - DMERR(" Old layout: %s w/ %d copies", 2058 - raid10_md_layout_to_format(le32_to_cpu(sb->layout)), 2059 - raid10_md_layout_to_copies(le32_to_cpu(sb->layout))); 2060 - DMERR(" New layout: %s w/ %d copies", 2061 - raid10_md_layout_to_format(mddev->layout), 2062 - raid10_md_layout_to_copies(mddev->layout)); 2063 - return -EINVAL; 2064 - } 2065 - if (le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors) { 2066 - DMERR("Reshaping arrays not yet supported. (stripe sectors change)"); 2067 - return -EINVAL; 924 + if (le32_to_cpu(sb->compat_features) & FEATURE_FLAG_SUPPORTS_V190) { 925 + /* Superblock is authoritative wrt given raid set layout! */ 926 + mddev->raid_disks = le32_to_cpu(sb->num_devices); 927 + mddev->level = le32_to_cpu(sb->level); 928 + mddev->layout = le32_to_cpu(sb->layout); 929 + mddev->chunk_sectors = le32_to_cpu(sb->stripe_sectors); 930 + mddev->new_level = le32_to_cpu(sb->new_level); 931 + mddev->new_layout = le32_to_cpu(sb->new_layout); 932 + mddev->new_chunk_sectors = le32_to_cpu(sb->new_stripe_sectors); 933 + mddev->delta_disks = le32_to_cpu(sb->delta_disks); 934 + mddev->array_sectors = le64_to_cpu(sb->array_sectors); 935 + 936 + /* raid was reshaping and got interrupted */ 937 + if (le32_to_cpu(sb->flags) & SB_FLAG_RESHAPE_ACTIVE) { 938 + if (test_bit(__CTR_FLAG_DELTA_DISKS, &rs->ctr_flags)) { 939 + DMERR("Reshape requested but raid set is still reshaping"); 940 + return -EINVAL; 941 + } 942 + 943 + if (mddev->delta_disks < 0 || 944 + (!mddev->delta_disks && (le32_to_cpu(sb->flags) & SB_FLAG_RESHAPE_BACKWARDS))) 945 + mddev->reshape_backwards = 1; 946 + else 947 + mddev->reshape_backwards = 0; 948 + 949 + mddev->reshape_position = le64_to_cpu(sb->reshape_position); 950 + rs->raid_type = 
get_raid_type_by_ll(mddev->level, mddev->layout); 951 + } 952 + 953 + } else { 954 + /* 955 + * No takeover/reshaping, because we don't have the extended v1.9.0 metadata 956 + */ 957 + if (le32_to_cpu(sb->level) != mddev->level) { 958 + DMERR("Reshaping/takeover raid sets not yet supported. (raid level/stripes/size change)"); 959 + return -EINVAL; 960 + } 961 + if (le32_to_cpu(sb->layout) != mddev->layout) { 962 + DMERR("Reshaping raid sets not yet supported. (raid layout change)"); 963 + DMERR(" 0x%X vs 0x%X", le32_to_cpu(sb->layout), mddev->layout); 964 + DMERR(" Old layout: %s w/ %d copies", 965 + raid10_md_layout_to_format(le32_to_cpu(sb->layout)), 966 + raid10_md_layout_to_copies(le32_to_cpu(sb->layout))); 967 + DMERR(" New layout: %s w/ %d copies", 968 + raid10_md_layout_to_format(mddev->layout), 969 + raid10_md_layout_to_copies(mddev->layout)); 970 + return -EINVAL; 971 + } 972 + if (le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors) { 973 + DMERR("Reshaping raid sets not yet supported. (stripe sectors change)"); 974 + return -EINVAL; 975 + } 976 + 977 + /* We can only change the number of devices in raid1 with old (i.e. pre 1.0.7) metadata */ 978 + if (!rt_is_raid1(rs->raid_type) && 979 + (le32_to_cpu(sb->num_devices) != mddev->raid_disks)) { 980 + DMERR("Reshaping raid sets not yet supported. (device count change from %u to %u)", 981 + sb->num_devices, mddev->raid_disks); 982 + return -EINVAL; 983 + } 984 + 985 + /* Table line is checked vs. authoritative superblock */ 986 + rs_set_new(rs); 2068 987 } 2069 988 2070 - /* We can only change the number of devices in RAID1 right now */ 2071 - if ((rs->raid_type->level != 1) && 2072 - (le32_to_cpu(sb->num_devices) != mddev->raid_disks)) { 2073 - DMERR("Reshaping arrays not yet supported. 
(device count change)"); 2074 - return -EINVAL; 2075 - } 2076 - 2077 - if (!(rs->ctr_flags & (CTR_FLAG_SYNC | CTR_FLAG_NOSYNC))) 989 + if (!test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags)) 2078 990 mddev->recovery_cp = le64_to_cpu(sb->array_resync_offset); 2079 991 2080 992 /* 2081 993 * During load, we set FirstUse if a new superblock was written. 2082 994 * There are two reasons we might not have a superblock: 2083 - * 1) The array is brand new - in which case, all of the 2084 - * devices must have their In_sync bit set. Also, 995 + * 1) The raid set is brand new - in which case, all of the 996 + * devices must have their In_sync bit set. Also, 2085 997 * recovery_cp must be 0, unless forced. 2086 - * 2) This is a new device being added to an old array 998 + * 2) This is a new device being added to an old raid set 2087 999 * and the new device needs to be rebuilt - in which 2088 1000 * case the In_sync bit will /not/ be set and 2089 1001 * recovery_cp must be MaxSector. 1002 + * 3) This is/are a new device(s) being added to an old 1003 + * raid set during takeover to a higher raid level 1004 + * to provide capacity for redundancy or during reshape 1005 + * to add capacity to grow the raid set. 
2090 1006 */ 1007 + d = 0; 2091 1008 rdev_for_each(r, mddev) { 2092 - if (!test_bit(In_sync, &r->flags)) { 2093 - DMINFO("Device %d specified for rebuild: " 2094 - "Clearing superblock", r->raid_disk); 2095 - rebuilds++; 2096 - } else if (test_bit(FirstUse, &r->flags)) 1009 + if (test_bit(FirstUse, &r->flags)) 2097 1010 new_devs++; 1011 + 1012 + if (!test_bit(In_sync, &r->flags)) { 1013 + DMINFO("Device %d specified for rebuild; clearing superblock", 1014 + r->raid_disk); 1015 + rebuilds++; 1016 + 1017 + if (test_bit(FirstUse, &r->flags)) 1018 + rebuild_and_new++; 1019 + } 1020 + 1021 + d++; 2098 1022 } 2099 1023 2100 - if (!rebuilds) { 2101 - if (new_devs == mddev->raid_disks) { 2102 - DMINFO("Superblocks created for new array"); 1024 + if (new_devs == rs->raid_disks || !rebuilds) { 1025 + /* Replace a broken device */ 1026 + if (new_devs == 1 && !rs->delta_disks) 1027 + ; 1028 + if (new_devs == rs->raid_disks) { 1029 + DMINFO("Superblocks created for new raid set"); 2103 1030 set_bit(MD_ARRAY_FIRST_USE, &mddev->flags); 2104 - } else if (new_devs) { 2105 - DMERR("New device injected " 2106 - "into existing array without 'rebuild' " 2107 - "parameter specified"); 1031 + } else if (new_devs != rebuilds && 1032 + new_devs != rs->delta_disks) { 1033 + DMERR("New device injected into existing raid set without " 1034 + "'delta_disks' or 'rebuild' parameter specified"); 2108 1035 return -EINVAL; 2109 1036 } 2110 - } else if (new_devs) { 2111 - DMERR("'rebuild' devices cannot be " 2112 - "injected into an array with other first-time devices"); 1037 + } else if (new_devs && new_devs != rebuilds) { 1038 + DMERR("%u 'rebuild' devices cannot be injected into" 1039 + " a raid set with %u other first-time devices", 1040 + rebuilds, new_devs); 2113 1041 return -EINVAL; 2114 - } else if (mddev->recovery_cp != MaxSector) { 2115 - DMERR("'rebuild' specified while array is not in-sync"); 2116 - return -EINVAL; 1042 + } else if (rebuilds) { 1043 + if (rebuild_and_new && rebuilds != 
rebuild_and_new) { 1044 + DMERR("new device%s provided without 'rebuild'", 1045 + new_devs > 1 ? "s" : ""); 1046 + return -EINVAL; 1047 + } else if (rs_is_recovering(rs)) { 1048 + DMERR("'rebuild' specified while raid set is not in-sync (recovery_cp=%llu)", 1049 + (unsigned long long) mddev->recovery_cp); 1050 + return -EINVAL; 1051 + } else if (rs_is_reshaping(rs)) { 1052 + DMERR("'rebuild' specified while raid set is being reshaped (reshape_position=%llu)", 1053 + (unsigned long long) mddev->reshape_position); 1054 + return -EINVAL; 1055 + } 2117 1056 } 2118 1057 2119 1058 /* 2120 1059 * Now we set the Faulty bit for those devices that are 2121 1060 * recorded in the superblock as failed. 2122 1061 */ 1062 + sb_retrieve_failed_devices(sb, failed_devices); 2123 1063 rdev_for_each(r, mddev) { 2124 1064 if (!r->sb_page) 2125 1065 continue; 2126 1066 sb2 = page_address(r->sb_page); 2127 1067 sb2->failed_devices = 0; 1068 + memset(sb2->extended_failed_devices, 0, sizeof(sb2->extended_failed_devices)); 2128 1069 2129 1070 /* 2130 1071 * Check for any device re-ordering. 
2131 1072 */ 2132 1073 if (!test_bit(FirstUse, &r->flags) && (r->raid_disk >= 0)) { 2133 1074 role = le32_to_cpu(sb2->array_position); 1075 + if (role < 0) 1076 + continue; 1077 + 2134 1078 if (role != r->raid_disk) { 2135 - if (rs->raid_type->level != 1) { 2136 - rs->ti->error = "Cannot change device " 2137 - "positions in RAID array"; 1079 + if (__is_raid10_near(mddev->layout)) { 1080 + if (mddev->raid_disks % __raid10_near_copies(mddev->layout) || 1081 + rs->raid_disks % rs->raid10_copies) { 1082 + rs->ti->error = 1083 + "Cannot change raid10 near set to odd # of devices!"; 1084 + return -EINVAL; 1085 + } 1086 + 1087 + sb2->array_position = cpu_to_le32(r->raid_disk); 1088 + 1089 + } else if (!(rs_is_raid10(rs) && rt_is_raid0(rs->raid_type)) && 1090 + !(rs_is_raid0(rs) && rt_is_raid10(rs->raid_type)) && 1091 + !rt_is_raid1(rs->raid_type)) { 1092 + rs->ti->error = "Cannot change device positions in raid set"; 2138 1093 return -EINVAL; 2139 1094 } 2140 - DMINFO("RAID1 device #%d now at position #%d", 2141 - role, r->raid_disk); 1095 + 1096 + DMINFO("raid device #%d now at position #%d", role, r->raid_disk); 2142 1097 } 2143 1098 2144 1099 /* 2145 1100 * Partial recovery is performed on 2146 1101 * returning failed devices. 2147 1102 */ 2148 - if (failed_devices & (1 << role)) 1103 + if (test_bit(role, (void *) failed_devices)) 2149 1104 set_bit(Faulty, &r->flags); 2150 1105 } 2151 1106 } ··· 2239 1028 static int super_validate(struct raid_set *rs, struct md_rdev *rdev) 2240 1029 { 2241 1030 struct mddev *mddev = &rs->md; 2242 - struct dm_raid_superblock *sb = page_address(rdev->sb_page); 1031 + struct dm_raid_superblock *sb; 1032 + 1033 + if (rs_is_raid0(rs) || !rdev->sb_page) 1034 + return 0; 1035 + 1036 + sb = page_address(rdev->sb_page); 2243 1037 2244 1038 /* 2245 1039 * If mddev->events is not set, we know we have not yet initialized 2246 1040 * the array. 
2247 1041 */ 2248 - if (!mddev->events && super_init_validation(mddev, rdev)) 1042 + if (!mddev->events && super_init_validation(rs, rdev)) 2249 1043 return -EINVAL; 2250 1044 2251 - if (le32_to_cpu(sb->features)) { 2252 - rs->ti->error = "Unable to assemble array: No feature flags supported yet"; 1045 + if (le32_to_cpu(sb->compat_features) != FEATURE_FLAG_SUPPORTS_V190) { 1046 + rs->ti->error = "Unable to assemble array: Unknown flag(s) in compatible feature flags"; 1047 + return -EINVAL; 1048 + } 1049 + 1050 + if (sb->incompat_features) { 1051 + rs->ti->error = "Unable to assemble array: No incompatible feature flags supported yet"; 2253 1052 return -EINVAL; 2254 1053 } 2255 1054 2256 1055 /* Enable bitmap creation for RAID levels != 0 */ 2257 - mddev->bitmap_info.offset = (rs->raid_type->level) ? to_sector(4096) : 0; 1056 + mddev->bitmap_info.offset = rt_is_raid0(rs->raid_type) ? 0 : to_sector(4096); 2258 1057 rdev->mddev->bitmap_info.default_offset = mddev->bitmap_info.offset; 2259 1058 2260 - if (!test_bit(FirstUse, &rdev->flags)) { 1059 + if (!test_and_clear_bit(FirstUse, &rdev->flags)) { 1060 + /* Retrieve device size stored in superblock to be prepared for shrink */ 1061 + rdev->sectors = le64_to_cpu(sb->sectors); 2261 1062 rdev->recovery_offset = le64_to_cpu(sb->disk_recovery_offset); 2262 - if (rdev->recovery_offset != MaxSector) 2263 - clear_bit(In_sync, &rdev->flags); 1063 + if (rdev->recovery_offset == MaxSector) 1064 + set_bit(In_sync, &rdev->flags); 1065 + /* 1066 + * If no reshape in progress -> we're recovering single 1067 + * disk(s) and have to set the device(s) to out-of-sync 1068 + */ 1069 + else if (!rs_is_reshaping(rs)) 1070 + clear_bit(In_sync, &rdev->flags); /* Mandatory for recovery */ 2264 1071 } 2265 1072 2266 1073 /* 2267 1074 * If a device comes back, set it as not In_sync and no longer faulty. 
2268 1075 */ 2269 - if (test_bit(Faulty, &rdev->flags)) { 2270 - clear_bit(Faulty, &rdev->flags); 1076 + if (test_and_clear_bit(Faulty, &rdev->flags)) { 1077 + rdev->recovery_offset = 0; 2271 1078 clear_bit(In_sync, &rdev->flags); 2272 1079 rdev->saved_raid_disk = rdev->raid_disk; 2273 - rdev->recovery_offset = 0; 2274 1080 } 2275 1081 2276 - clear_bit(FirstUse, &rdev->flags); 1082 + /* Reshape support -> restore repective data offsets */ 1083 + rdev->data_offset = le64_to_cpu(sb->data_offset); 1084 + rdev->new_data_offset = le64_to_cpu(sb->new_data_offset); 2277 1085 2278 1086 return 0; 2279 1087 } ··· 2302 1072 */ 2303 1073 static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) 2304 1074 { 2305 - int ret; 1075 + int r; 2306 1076 struct raid_dev *dev; 2307 1077 struct md_rdev *rdev, *tmp, *freshest; 2308 1078 struct mddev *mddev = &rs->md; ··· 2312 1082 /* 2313 1083 * Skipping super_load due to CTR_FLAG_SYNC will cause 2314 1084 * the array to undergo initialization again as 2315 - * though it were new. This is the intended effect 1085 + * though it were new. This is the intended effect 2316 1086 * of the "sync" directive. 2317 1087 * 2318 1088 * When reshaping capability is added, we must ensure 2319 1089 * that the "sync" directive is disallowed during the 2320 1090 * reshape. 2321 1091 */ 2322 - rdev->sectors = to_sector(i_size_read(rdev->bdev->bd_inode)); 2323 - 2324 - if (rs->ctr_flags & CTR_FLAG_SYNC) 1092 + if (test_bit(__CTR_FLAG_SYNC, &rs->ctr_flags)) 2325 1093 continue; 2326 1094 2327 1095 if (!rdev->meta_bdev) 2328 1096 continue; 2329 1097 2330 - ret = super_load(rdev, freshest); 1098 + r = super_load(rdev, freshest); 2331 1099 2332 - switch (ret) { 1100 + switch (r) { 2333 1101 case 1: 2334 1102 freshest = rdev; 2335 1103 break; ··· 2376 1148 * Validation of the freshest device provides the source of 2377 1149 * validation for the remaining devices. 
2378 1150 */ 2379 - ti->error = "Unable to assemble array: Invalid superblocks"; 1151 + rs->ti->error = "Unable to assemble array: Invalid superblocks"; 2380 1152 if (super_validate(rs, freshest)) 2381 1153 return -EINVAL; 2382 1154 2383 1155 rdev_for_each(rdev, mddev) 2384 1156 if ((rdev != freshest) && super_validate(rs, rdev)) 2385 1157 return -EINVAL; 1158 + return 0; 1159 + } 1160 + 1161 + /* 1162 + * Adjust data_offset and new_data_offset on all disk members of @rs 1163 + * for out of place reshaping if requested by contructor 1164 + * 1165 + * We need free space at the beginning of each raid disk for forward 1166 + * and at the end for backward reshapes which userspace has to provide 1167 + * via remapping/reordering of space. 1168 + */ 1169 + static int rs_adjust_data_offsets(struct raid_set *rs) 1170 + { 1171 + sector_t data_offset = 0, new_data_offset = 0; 1172 + struct md_rdev *rdev; 1173 + 1174 + /* Constructor did not request data offset change */ 1175 + if (!test_bit(__CTR_FLAG_DATA_OFFSET, &rs->ctr_flags)) { 1176 + if (!rs_is_reshapable(rs)) 1177 + goto out; 1178 + 1179 + return 0; 1180 + } 1181 + 1182 + /* HM FIXME: get InSync raid_dev? 
*/ 1183 + rdev = &rs->dev[0].rdev; 1184 + 1185 + if (rs->delta_disks < 0) { 1186 + /* 1187 + * Removing disks (reshaping backwards): 1188 + * 1189 + * - before reshape: data is at offset 0 and free space 1190 + * is at end of each component LV 1191 + * 1192 + * - after reshape: data is at offset rs->data_offset != 0 on each component LV 1193 + */ 1194 + data_offset = 0; 1195 + new_data_offset = rs->data_offset; 1196 + 1197 + } else if (rs->delta_disks > 0) { 1198 + /* 1199 + * Adding disks (reshaping forwards): 1200 + * 1201 + * - before reshape: data is at offset rs->data_offset != 0 and 1202 + * free space is at begin of each component LV 1203 + * 1204 + * - after reshape: data is at offset 0 on each component LV 1205 + */ 1206 + data_offset = rs->data_offset; 1207 + new_data_offset = 0; 1208 + 1209 + } else { 1210 + /* 1211 + * User space passes in 0 for data offset after having removed reshape space 1212 + * 1213 + * - or - (data offset != 0) 1214 + * 1215 + * Changing RAID layout or chunk size -> toggle offsets 1216 + * 1217 + * - before reshape: data is at offset rs->data_offset 0 and 1218 + * free space is at end of each component LV 1219 + * -or- 1220 + * data is at offset rs->data_offset != 0 and 1221 + * free space is at begin of each component LV 1222 + * 1223 + * - after reshape: data is at offset 0 if it was at offset != 0 1224 + * or at offset != 0 if it was at offset 0 1225 + * on each component LV 1226 + * 1227 + */ 1228 + data_offset = rs->data_offset ? rdev->data_offset : 0; 1229 + new_data_offset = data_offset ? 0 : rs->data_offset; 1230 + set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags); 1231 + } 1232 + 1233 + /* 1234 + * Make sure we got a minimum amount of free sectors per device 1235 + */ 1236 + if (rs->data_offset && 1237 + to_sector(i_size_read(rdev->bdev->bd_inode)) - rdev->sectors < MIN_FREE_RESHAPE_SPACE) { 1238 + rs->ti->error = data_offset ? 
"No space for forward reshape" : 1239 + "No space for backward reshape"; 1240 + return -ENOSPC; 1241 + } 1242 + out: 1243 + /* Adjust data offsets on all rdevs */ 1244 + rdev_for_each(rdev, &rs->md) { 1245 + rdev->data_offset = data_offset; 1246 + rdev->new_data_offset = new_data_offset; 1247 + } 2386 1248 2387 1249 return 0; 1250 + } 1251 + 1252 + /* Userpace reordered disks -> adjust raid_disk indexes in @rs */ 1253 + static void __reorder_raid_disk_indexes(struct raid_set *rs) 1254 + { 1255 + int i = 0; 1256 + struct md_rdev *rdev; 1257 + 1258 + rdev_for_each(rdev, &rs->md) { 1259 + rdev->raid_disk = i++; 1260 + rdev->saved_raid_disk = rdev->new_raid_disk = -1; 1261 + } 1262 + } 1263 + 1264 + /* 1265 + * Setup @rs for takeover by a different raid level 1266 + */ 1267 + static int rs_setup_takeover(struct raid_set *rs) 1268 + { 1269 + struct mddev *mddev = &rs->md; 1270 + struct md_rdev *rdev; 1271 + unsigned int d = mddev->raid_disks = rs->raid_disks; 1272 + sector_t new_data_offset = rs->dev[0].rdev.data_offset ? 
0 : rs->data_offset; 1273 + 1274 + if (rt_is_raid10(rs->raid_type)) { 1275 + if (mddev->level == 0) { 1276 + /* Userpace reordered disks -> adjust raid_disk indexes */ 1277 + __reorder_raid_disk_indexes(rs); 1278 + 1279 + /* raid0 -> raid10_far layout */ 1280 + mddev->layout = raid10_format_to_md_layout(rs, ALGORITHM_RAID10_FAR, 1281 + rs->raid10_copies); 1282 + } else if (mddev->level == 1) 1283 + /* raid1 -> raid10_near layout */ 1284 + mddev->layout = raid10_format_to_md_layout(rs, ALGORITHM_RAID10_NEAR, 1285 + rs->raid_disks); 1286 + else 1287 + return -EINVAL; 1288 + 1289 + } 1290 + 1291 + clear_bit(MD_ARRAY_FIRST_USE, &mddev->flags); 1292 + mddev->recovery_cp = MaxSector; 1293 + 1294 + while (d--) { 1295 + rdev = &rs->dev[d].rdev; 1296 + 1297 + if (test_bit(d, (void *) rs->rebuild_disks)) { 1298 + clear_bit(In_sync, &rdev->flags); 1299 + clear_bit(Faulty, &rdev->flags); 1300 + mddev->recovery_cp = rdev->recovery_offset = 0; 1301 + /* Bitmap has to be created when we do an "up" takeover */ 1302 + set_bit(MD_ARRAY_FIRST_USE, &mddev->flags); 1303 + } 1304 + 1305 + rdev->new_data_offset = new_data_offset; 1306 + } 1307 + 1308 + return 0; 1309 + } 1310 + 1311 + /* Prepare @rs for reshape */ 1312 + static int rs_prepare_reshape(struct raid_set *rs) 1313 + { 1314 + bool reshape; 1315 + struct mddev *mddev = &rs->md; 1316 + 1317 + if (rs_is_raid10(rs)) { 1318 + if (rs->raid_disks != mddev->raid_disks && 1319 + __is_raid10_near(mddev->layout) && 1320 + rs->raid10_copies && 1321 + rs->raid10_copies != __raid10_near_copies(mddev->layout)) { 1322 + /* 1323 + * raid disk have to be multiple of data copies to allow this conversion, 1324 + * 1325 + * This is actually not a reshape it is a 1326 + * rebuild of any additional mirrors per group 1327 + */ 1328 + if (rs->raid_disks % rs->raid10_copies) { 1329 + rs->ti->error = "Can't reshape raid10 mirror groups"; 1330 + return -EINVAL; 1331 + } 1332 + 1333 + /* Userpace reordered disks to add/remove mirrors -> adjust raid_disk 
indexes */ 1334 + __reorder_raid_disk_indexes(rs); 1335 + mddev->layout = raid10_format_to_md_layout(rs, ALGORITHM_RAID10_NEAR, 1336 + rs->raid10_copies); 1337 + mddev->new_layout = mddev->layout; 1338 + reshape = false; 1339 + } else 1340 + reshape = true; 1341 + 1342 + } else if (rs_is_raid456(rs)) 1343 + reshape = true; 1344 + 1345 + else if (rs_is_raid1(rs)) { 1346 + if (rs->delta_disks) { 1347 + /* Process raid1 via delta_disks */ 1348 + mddev->degraded = rs->delta_disks < 0 ? -rs->delta_disks : rs->delta_disks; 1349 + reshape = true; 1350 + } else { 1351 + /* Process raid1 without delta_disks */ 1352 + mddev->raid_disks = rs->raid_disks; 1353 + set_bit(RT_FLAG_KEEP_RS_FROZEN, &rs->runtime_flags); 1354 + reshape = false; 1355 + } 1356 + } else { 1357 + rs->ti->error = "Called with bogus raid type"; 1358 + return -EINVAL; 1359 + } 1360 + 1361 + if (reshape) { 1362 + set_bit(RT_FLAG_RESHAPE_RS, &rs->runtime_flags); 1363 + set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags); 1364 + set_bit(RT_FLAG_KEEP_RS_FROZEN, &rs->runtime_flags); 1365 + } else if (mddev->raid_disks < rs->raid_disks) 1366 + /* Create new superblocks and bitmaps, if any new disks */ 1367 + set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags); 1368 + 1369 + return 0; 1370 + } 1371 + 1372 + /* 1373 + * 1374 + * - change raid layout 1375 + * - change chunk size 1376 + * - add disks 1377 + * - remove disks 1378 + */ 1379 + static int rs_setup_reshape(struct raid_set *rs) 1380 + { 1381 + int r = 0; 1382 + unsigned int cur_raid_devs, d; 1383 + struct mddev *mddev = &rs->md; 1384 + struct md_rdev *rdev; 1385 + 1386 + mddev->delta_disks = rs->delta_disks; 1387 + cur_raid_devs = mddev->raid_disks; 1388 + 1389 + /* Ignore impossible layout change whilst adding/removing disks */ 1390 + if (mddev->delta_disks && 1391 + mddev->layout != mddev->new_layout) { 1392 + DMINFO("Ignoring invalid layout change with delta_disks=%d", rs->delta_disks); 1393 + mddev->new_layout = mddev->layout; 1394 + } 1395 + 1396 + /* 1397 + 
* Adjust array size: 1398 + * 1399 + * - in case of adding disks, array size has 1400 + * to grow after the disk adding reshape, 1401 + * which'll hapen in the event handler; 1402 + * reshape will happen forward, so space has to 1403 + * be available at the beginning of each disk 1404 + * 1405 + * - in case of removing disks, array size 1406 + * has to shrink before starting the reshape, 1407 + * which'll happen here; 1408 + * reshape will happen backward, so space has to 1409 + * be available at the end of each disk 1410 + * 1411 + * - data_offset and new_data_offset are 1412 + * adjusted for aforementioned out of place 1413 + * reshaping based on userspace passing in 1414 + * the "data_offset <sectors>" key/value 1415 + * pair via the constructor 1416 + */ 1417 + 1418 + /* Add disk(s) */ 1419 + if (rs->delta_disks > 0) { 1420 + /* Prepare disks for check in raid4/5/6/10 {check|start}_reshape */ 1421 + for (d = cur_raid_devs; d < rs->raid_disks; d++) { 1422 + rdev = &rs->dev[d].rdev; 1423 + clear_bit(In_sync, &rdev->flags); 1424 + 1425 + /* 1426 + * save_raid_disk needs to be -1, or recovery_offset will be set to 0 1427 + * by md, which'll store that erroneously in the superblock on reshape 1428 + */ 1429 + rdev->saved_raid_disk = -1; 1430 + rdev->raid_disk = d; 1431 + 1432 + rdev->sectors = mddev->dev_sectors; 1433 + rdev->recovery_offset = rs_is_raid1(rs) ? 0 : MaxSector; 1434 + } 1435 + 1436 + mddev->reshape_backwards = 0; /* adding disks -> forward reshape */ 1437 + 1438 + /* Remove disk(s) */ 1439 + } else if (rs->delta_disks < 0) { 1440 + r = rs_set_dev_and_array_sectors(rs, true); 1441 + mddev->reshape_backwards = 1; /* removing disk(s) -> backward reshape */ 1442 + 1443 + /* Change layout and/or chunk size */ 1444 + } else { 1445 + /* 1446 + * Reshape layout (e.g. 
raid5_ls -> raid5_n) and/or chunk size: 1447 + * 1448 + * keeping number of disks and do layout change -> 1449 + * 1450 + * toggle reshape_backward depending on data_offset: 1451 + * 1452 + * - free space upfront -> reshape forward 1453 + * 1454 + * - free space at the end -> reshape backward 1455 + * 1456 + * 1457 + * This utilizes free reshape space avoiding the need 1458 + * for userspace to move (parts of) LV segments in 1459 + * case of layout/chunksize change (for disk 1460 + * adding/removing reshape space has to be at 1461 + * the proper address (see above with delta_disks): 1462 + * 1463 + * add disk(s) -> begin 1464 + * remove disk(s)-> end 1465 + */ 1466 + mddev->reshape_backwards = rs->dev[0].rdev.data_offset ? 0 : 1; 1467 + } 1468 + 1469 + return r; 2388 1470 } 2389 1471 2390 1472 /* 2391 1473 * Enable/disable discard support on RAID set depending on 2392 1474 * RAID level and discard properties of underlying RAID members. 2393 1475 */ 2394 - static void configure_discard_support(struct dm_target *ti, struct raid_set *rs) 1476 + static void configure_discard_support(struct raid_set *rs) 2395 1477 { 2396 1478 int i; 2397 1479 bool raid456; 1480 + struct dm_target *ti = rs->ti; 2398 1481 2399 1482 /* Assume discards not supported until after checks below. */ 2400 1483 ti->discards_supported = false; ··· 2713 1174 /* RAID level 4,5,6 require discard_zeroes_data for data integrity! */ 2714 1175 raid456 = (rs->md.level == 4 || rs->md.level == 5 || rs->md.level == 6); 2715 1176 2716 - for (i = 0; i < rs->md.raid_disks; i++) { 1177 + for (i = 0; i < rs->raid_disks; i++) { 2717 1178 struct request_queue *q; 2718 1179 2719 1180 if (!rs->dev[i].rdev.bdev) ··· 2746 1207 } 2747 1208 2748 1209 /* 2749 - * Construct a RAID4/5/6 mapping: 1210 + * Construct a RAID0/1/10/4/5/6 mapping: 2750 1211 * Args: 2751 - * <raid_type> <#raid_params> <raid_params> \ 2752 - * <#raid_devs> { <meta_dev1> <dev1> .. 
<meta_devN> <devN> } 1212 + * <raid_type> <#raid_params> <raid_params>{0,} \ 1213 + * <#raid_devs> [<meta_dev1> <dev1>]{1,} 2753 1214 * 2754 - * <raid_params> varies by <raid_type>. See 'parse_raid_params' for 1215 + * <raid_params> varies by <raid_type>. See 'parse_raid_params' for 2755 1216 * details on possible <raid_params>. 1217 + * 1218 + * Userspace is free to initialize the metadata devices, hence the superblocks to 1219 + * enforce recreation based on the passed in table parameters. 1220 + * 2756 1221 */ 2757 - static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv) 1222 + static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) 2758 1223 { 2759 - int ret; 1224 + int r; 1225 + bool resize; 2760 1226 struct raid_type *rt; 2761 - unsigned long num_raid_params, num_raid_devs; 1227 + unsigned int num_raid_params, num_raid_devs; 1228 + sector_t calculated_dev_sectors; 2762 1229 struct raid_set *rs = NULL; 1230 + const char *arg; 1231 + struct rs_layout rs_layout; 1232 + struct dm_arg_set as = { argc, argv }, as_nrd; 1233 + struct dm_arg _args[] = { 1234 + { 0, as.argc, "Cannot understand number of raid parameters" }, 1235 + { 1, 254, "Cannot understand number of raid devices parameters" } 1236 + }; 2763 1237 2764 - /* Must have at least <raid_type> <#raid_params> */ 2765 - if (argc < 2) { 2766 - ti->error = "Too few arguments"; 1238 + /* Must have <raid_type> */ 1239 + arg = dm_shift_arg(&as); 1240 + if (!arg) { 1241 + ti->error = "No arguments"; 2767 1242 return -EINVAL; 2768 1243 } 2769 1244 2770 - /* raid type */ 2771 - rt = get_raid_type(argv[0]); 1245 + rt = get_raid_type(arg); 2772 1246 if (!rt) { 2773 1247 ti->error = "Unrecognised raid_type"; 2774 1248 return -EINVAL; 2775 1249 } 2776 - argc--; 2777 - argv++; 2778 1250 2779 - /* number of RAID parameters */ 2780 - if (kstrtoul(argv[0], 10, &num_raid_params) < 0) { 2781 - ti->error = "Cannot understand number of RAID parameters"; 1251 + /* Must have <#raid_params> */ 1252 + 
if (dm_read_arg_group(_args, &as, &num_raid_params, &ti->error)) 2782 1253 return -EINVAL; 2783 - } 2784 - argc--; 2785 - argv++; 2786 1254 2787 - /* Skip over RAID params for now and find out # of devices */ 2788 - if (num_raid_params >= argc) { 2789 - ti->error = "Arguments do not agree with counts given"; 1255 + /* number of raid device tuples <meta_dev data_dev> */ 1256 + as_nrd = as; 1257 + dm_consume_args(&as_nrd, num_raid_params); 1258 + _args[1].max = (as_nrd.argc - 1) / 2; 1259 + if (dm_read_arg(_args + 1, &as_nrd, &num_raid_devs, &ti->error)) 2790 1260 return -EINVAL; 2791 - } 2792 1261 2793 - if ((kstrtoul(argv[num_raid_params], 10, &num_raid_devs) < 0) || 2794 - (num_raid_devs > MAX_RAID_DEVICES)) { 2795 - ti->error = "Cannot understand number of raid devices"; 1262 + if (!__within_range(num_raid_devs, 1, MAX_RAID_DEVICES)) { 1263 + ti->error = "Invalid number of supplied raid devices"; 2796 1264 return -EINVAL; 2797 1265 } 2798 1266 2799 - argc -= num_raid_params + 1; /* +1: we already have num_raid_devs */ 2800 - if (argc != (num_raid_devs * 2)) { 2801 - ti->error = "Supplied RAID devices does not match the count given"; 2802 - return -EINVAL; 2803 - } 2804 - 2805 - rs = context_alloc(ti, rt, (unsigned)num_raid_devs); 1267 + rs = raid_set_alloc(ti, rt, num_raid_devs); 2806 1268 if (IS_ERR(rs)) 2807 1269 return PTR_ERR(rs); 2808 1270 2809 - ret = parse_raid_params(rs, argv, (unsigned)num_raid_params); 2810 - if (ret) 1271 + r = parse_raid_params(rs, &as, num_raid_params); 1272 + if (r) 2811 1273 goto bad; 2812 1274 2813 - argv += num_raid_params + 1; 2814 - 2815 - ret = dev_parms(rs, argv); 2816 - if (ret) 1275 + r = parse_dev_params(rs, &as); 1276 + if (r) 2817 1277 goto bad; 2818 1278 2819 1279 rs->md.sync_super = super_sync; 2820 - ret = analyse_superblocks(ti, rs); 2821 - if (ret) 1280 + 1281 + /* 1282 + * Calculate ctr requested array and device sizes to allow 1283 + * for superblock analysis needing device sizes defined. 
1284 + * 1285 + * Any existing superblock will overwrite the array and device sizes 1286 + */ 1287 + r = rs_set_dev_and_array_sectors(rs, false); 1288 + if (r) 2822 1289 goto bad; 1290 + 1291 + calculated_dev_sectors = rs->dev[0].rdev.sectors; 1292 + 1293 + /* 1294 + * Backup any new raid set level, layout, ... 1295 + * requested to be able to compare to superblock 1296 + * members for conversion decisions. 1297 + */ 1298 + rs_config_backup(rs, &rs_layout); 1299 + 1300 + r = analyse_superblocks(ti, rs); 1301 + if (r) 1302 + goto bad; 1303 + 1304 + resize = calculated_dev_sectors != rs->dev[0].rdev.sectors; 2823 1305 2824 1306 INIT_WORK(&rs->md.event_work, do_table_event); 2825 1307 ti->private = rs; 2826 1308 ti->num_flush_bios = 1; 2827 1309 1310 + /* Restore any requested new layout for conversion decision */ 1311 + rs_config_restore(rs, &rs_layout); 1312 + 2828 1313 /* 2829 - * Disable/enable discard support on RAID set. 1314 + * Now that we have any superblock metadata available, 1315 + * check for new, recovering, reshaping, to be taken over, 1316 + * to be reshaped or an existing, unchanged raid set to 1317 + * run in sequence. 
2830 1318 */ 2831 - configure_discard_support(ti, rs); 1319 + if (test_bit(MD_ARRAY_FIRST_USE, &rs->md.flags)) { 1320 + /* A new raid6 set has to be recovered to ensure proper parity and Q-Syndrome */ 1321 + if (rs_is_raid6(rs) && 1322 + test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags)) { 1323 + ti->error = "'nosync' not allowed for new raid6 set"; 1324 + r = -EINVAL; 1325 + goto bad; 1326 + } 1327 + rs_setup_recovery(rs, 0); 1328 + set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags); 1329 + rs_set_new(rs); 1330 + } else if (rs_is_recovering(rs)) { 1331 + /* A recovering raid set may be resized */ 1332 + ; /* skip setup rs */ 1333 + } else if (rs_is_reshaping(rs)) { 1334 + /* Have to reject size change request during reshape */ 1335 + if (resize) { 1336 + ti->error = "Can't resize a reshaping raid set"; 1337 + r = -EPERM; 1338 + goto bad; 1339 + } 1340 + /* skip setup rs */ 1341 + } else if (rs_takeover_requested(rs)) { 1342 + if (rs_is_reshaping(rs)) { 1343 + ti->error = "Can't takeover a reshaping raid set"; 1344 + r = -EPERM; 1345 + goto bad; 1346 + } 1347 + 1348 + /* 1349 + * If a takeover is needed, userspace sets any additional 1350 + * devices to rebuild and we can check for a valid request here. 1351 + * 1352 + * If acceptable, set the level to the new requested 1353 + * one, prohibit requesting recovery, allow the raid 1354 + * set to run and store superblocks during resume. 
1355 + */ 1356 + r = rs_check_takeover(rs); 1357 + if (r) 1358 + goto bad; 1359 + 1360 + r = rs_setup_takeover(rs); 1361 + if (r) 1362 + goto bad; 1363 + 1364 + set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags); 1365 + set_bit(RT_FLAG_KEEP_RS_FROZEN, &rs->runtime_flags); 1366 + /* Takeover ain't recovery, so disable recovery */ 1367 + rs_setup_recovery(rs, MaxSector); 1368 + rs_set_new(rs); 1369 + } else if (rs_reshape_requested(rs)) { 1370 + /* 1371 + * We can only prepare for a reshape here, because the 1372 + * raid set needs to run to provide the repective reshape 1373 + * check functions via its MD personality instance. 1374 + * 1375 + * So do the reshape check after md_run() succeeded. 1376 + */ 1377 + r = rs_prepare_reshape(rs); 1378 + if (r) 1379 + return r; 1380 + 1381 + /* Reshaping ain't recovery, so disable recovery */ 1382 + rs_setup_recovery(rs, MaxSector); 1383 + rs_set_cur(rs); 1384 + } else { 1385 + /* May not set recovery when a device rebuild is requested */ 1386 + if (test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags)) { 1387 + rs_setup_recovery(rs, MaxSector); 1388 + set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags); 1389 + } else 1390 + rs_setup_recovery(rs, test_bit(__CTR_FLAG_SYNC, &rs->ctr_flags) ? 1391 + 0 : (resize ? 
calculated_dev_sectors : MaxSector)); 1392 + rs_set_cur(rs); 1393 + } 1394 + 1395 + /* If constructor requested it, change data and new_data offsets */ 1396 + r = rs_adjust_data_offsets(rs); 1397 + if (r) 1398 + goto bad; 1399 + 1400 + /* Start raid set read-only and assumed clean to change in raid_resume() */ 1401 + rs->md.ro = 1; 1402 + rs->md.in_sync = 1; 1403 + set_bit(MD_RECOVERY_FROZEN, &rs->md.recovery); 2832 1404 2833 1405 /* Has to be held on running the array */ 2834 1406 mddev_lock_nointr(&rs->md); 2835 - ret = md_run(&rs->md); 1407 + r = md_run(&rs->md); 2836 1408 rs->md.in_sync = 0; /* Assume already marked dirty */ 2837 - mddev_unlock(&rs->md); 2838 1409 2839 - if (ret) { 2840 - ti->error = "Fail to run raid array"; 1410 + if (r) { 1411 + ti->error = "Failed to run raid array"; 1412 + mddev_unlock(&rs->md); 2841 1413 goto bad; 2842 1414 } 2843 1415 2844 - if (ti->len != rs->md.array_sectors) { 2845 - ti->error = "Array size does not match requested target length"; 2846 - ret = -EINVAL; 2847 - goto size_mismatch; 2848 - } 2849 1416 rs->callbacks.congested_fn = raid_is_congested; 2850 1417 dm_table_add_target_callbacks(ti->table, &rs->callbacks); 2851 1418 2852 1419 mddev_suspend(&rs->md); 1420 + 1421 + /* Try to adjust the raid4/5/6 stripe cache size to the stripe size */ 1422 + if (rs_is_raid456(rs)) { 1423 + r = rs_set_raid456_stripe_cache(rs); 1424 + if (r) 1425 + goto bad_stripe_cache; 1426 + } 1427 + 1428 + /* Now do an early reshape check */ 1429 + if (test_bit(RT_FLAG_RESHAPE_RS, &rs->runtime_flags)) { 1430 + r = rs_check_reshape(rs); 1431 + if (r) 1432 + goto bad_check_reshape; 1433 + 1434 + /* Restore new, ctr requested layout to perform check */ 1435 + rs_config_restore(rs, &rs_layout); 1436 + 1437 + if (rs->md.pers->start_reshape) { 1438 + r = rs->md.pers->check_reshape(&rs->md); 1439 + if (r) { 1440 + ti->error = "Reshape check failed"; 1441 + goto bad_check_reshape; 1442 + } 1443 + } 1444 + } 1445 + 1446 + mddev_unlock(&rs->md); 2853 1447 
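The adjust-array-size comment earlier fixes the reshape direction: added disks reshape forward into space at the start of each disk, removed disks reshape backward into space at the end, and a pure layout/chunk-size change picks the direction from where the free space sits (a non-zero data_offset means it was reserved up front). A condensed, illustrative predicate, not the kernel code:

```c
#include <assert.h>
#include <stdbool.h>

/*
 * Illustrative only: returns true when the reshape must run backwards.
 * Mirrors the delta_disks/data_offset decisions described in the
 * adjust-array-size comment; a non-zero data_offset means free space
 * was reserved in front of the data, so the reshape proceeds forward.
 */
static bool reshape_backwards(int delta_disks, unsigned long long data_offset)
{
	if (delta_disks > 0)
		return false;	/* adding disks -> forward reshape */
	if (delta_disks < 0)
		return true;	/* removing disks -> backward reshape */
	return data_offset ? false : true;	/* layout/chunk size change */
}
```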
return 0; 2854 1448 2855 - size_mismatch: 1449 + bad_stripe_cache: 1450 + bad_check_reshape: 2856 1451 md_stop(&rs->md); 2857 1452 bad: 2858 - context_free(rs); 1453 + raid_set_free(rs); 2859 1454 2860 - return ret; 1455 + return r; 2861 1456 } 2862 1457 2863 1458 static void raid_dtr(struct dm_target *ti) ··· 3000 1327 3001 1328 list_del_init(&rs->callbacks.list); 3002 1329 md_stop(&rs->md); 3003 - context_free(rs); 1330 + raid_set_free(rs); 3004 1331 } 3005 1332 3006 1333 static int raid_map(struct dm_target *ti, struct bio *bio) ··· 3008 1335 struct raid_set *rs = ti->private; 3009 1336 struct mddev *mddev = &rs->md; 3010 1337 1338 + /* 1339 + * If we're reshaping to add disk(s), ti->len and 1340 + * mddev->array_sectors will differ during the process 1341 + * (ti->len > mddev->array_sectors), so we have to requeue 1342 + * bios with addresses > mddev->array_sectors here or 1343 + * accesses past the EOD of the component 1344 + * data images will occur, erroring the raid set. 1345 + */ 1346 + if (unlikely(bio_end_sector(bio) > mddev->array_sectors)) 1347 + return DM_MAPIO_REQUEUE; 1348 + 3011 1349 mddev->pers->make_request(mddev, bio); 3012 1350 3013 1351 return DM_MAPIO_SUBMITTED; 3014 1352 } 3015 1353 1354 + /* Return string describing the current sync action of @mddev */ 3016 1355 static const char *decipher_sync_action(struct mddev *mddev) 3017 1356 { 3018 1357 if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) ··· 3050 1365 return "idle"; 3051 1366 } 3052 1367 3053 - static void raid_status(struct dm_target *ti, status_type_t type, 3054 - unsigned status_flags, char *result, unsigned maxlen) 1368 + /* 1369 + * Return status string for @rdev 1370 + * 1371 + * Status characters: 1372 + * 1373 + * 'D' = Dead/Failed device 1374 + * 'a' = Alive but not in-sync 1375 + * 'A' = Alive and in-sync 1376 + */ 1377 + static const char *__raid_dev_status(struct md_rdev *rdev, bool array_in_sync) 3055 1378 { 3056 - struct raid_set *rs = ti->private; 3057 - unsigned 
raid_param_cnt = 1; /* at least 1 for chunksize */ 3058 - unsigned sz = 0; 3059 - int i, array_in_sync = 0; 3060 - sector_t sync; 3061 - 3062 - switch (type) { 3063 - case STATUSTYPE_INFO: 3064 - DMEMIT("%s %d ", rs->raid_type->name, rs->md.raid_disks); 3065 - 3066 - if (rs->raid_type->level) { 3067 - if (test_bit(MD_RECOVERY_RUNNING, &rs->md.recovery)) 3068 - sync = rs->md.curr_resync_completed; 3069 - else 3070 - sync = rs->md.recovery_cp; 3071 - 3072 - if (sync >= rs->md.resync_max_sectors) { 3073 - /* 3074 - * Sync complete. 3075 - */ 3076 - array_in_sync = 1; 3077 - sync = rs->md.resync_max_sectors; 3078 - } else if (test_bit(MD_RECOVERY_REQUESTED, &rs->md.recovery)) { 3079 - /* 3080 - * If "check" or "repair" is occurring, the array has 3081 - * undergone and initial sync and the health characters 3082 - * should not be 'a' anymore. 3083 - */ 3084 - array_in_sync = 1; 3085 - } else { 3086 - /* 3087 - * The array may be doing an initial sync, or it may 3088 - * be rebuilding individual components. If all the 3089 - * devices are In_sync, then it is the array that is 3090 - * being initialized. 
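The new __raid_dev_status helper reduces the per-device health character to two flag tests. The same mapping as a standalone sketch, with plain booleans standing in for test_bit(Faulty, ...) and test_bit(In_sync, ...) on the rdev flag word:

```c
#include <assert.h>
#include <stdbool.h>

/*
 * 'D' = Dead/Failed device
 * 'a' = Alive but not in-sync (or the whole raid set is still syncing)
 * 'A' = Alive and in-sync
 */
static char dev_status_char(bool faulty, bool in_sync, bool array_in_sync)
{
	if (faulty)
		return 'D';
	if (!array_in_sync || !in_sync)
		return 'a';
	return 'A';
}
```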
3091 - */ 3092 - for (i = 0; i < rs->md.raid_disks; i++) 3093 - if (!test_bit(In_sync, &rs->dev[i].rdev.flags)) 3094 - array_in_sync = 1; 3095 - } 3096 - } else { 3097 - /* RAID0 */ 3098 - array_in_sync = 1; 3099 - sync = rs->md.resync_max_sectors; 3100 - } 3101 - 3102 - /* 3103 - * Status characters: 3104 - * 'D' = Dead/Failed device 3105 - * 'a' = Alive but not in-sync 3106 - * 'A' = Alive and in-sync 3107 - */ 3108 - for (i = 0; i < rs->md.raid_disks; i++) { 3109 - if (test_bit(Faulty, &rs->dev[i].rdev.flags)) 3110 - DMEMIT("D"); 3111 - else if (!array_in_sync || 3112 - !test_bit(In_sync, &rs->dev[i].rdev.flags)) 3113 - DMEMIT("a"); 3114 - else 3115 - DMEMIT("A"); 3116 - } 3117 - 3118 - /* 3119 - * In-sync ratio: 3120 - * The in-sync ratio shows the progress of: 3121 - * - Initializing the array 3122 - * - Rebuilding a subset of devices of the array 3123 - * The user can distinguish between the two by referring 3124 - * to the status characters. 3125 - */ 3126 - DMEMIT(" %llu/%llu", 3127 - (unsigned long long) sync, 3128 - (unsigned long long) rs->md.resync_max_sectors); 3129 - 3130 - /* 3131 - * Sync action: 3132 - * See Documentation/device-mapper/dm-raid.c for 3133 - * information on each of these states. 3134 - */ 3135 - DMEMIT(" %s", decipher_sync_action(&rs->md)); 3136 - 3137 - /* 3138 - * resync_mismatches/mismatch_cnt 3139 - * This field shows the number of discrepancies found when 3140 - * performing a "check" of the array. 3141 - */ 3142 - DMEMIT(" %llu", 3143 - (strcmp(rs->md.last_sync_action, "check")) ? 
0 : 3144 - (unsigned long long) 3145 - atomic64_read(&rs->md.resync_mismatches)); 3146 - break; 3147 - case STATUSTYPE_TABLE: 3148 - /* The string you would use to construct this array */ 3149 - for (i = 0; i < rs->md.raid_disks; i++) { 3150 - if ((rs->ctr_flags & CTR_FLAG_REBUILD) && 3151 - rs->dev[i].data_dev && 3152 - !test_bit(In_sync, &rs->dev[i].rdev.flags)) 3153 - raid_param_cnt += 2; /* for rebuilds */ 3154 - if (rs->dev[i].data_dev && 3155 - test_bit(WriteMostly, &rs->dev[i].rdev.flags)) 3156 - raid_param_cnt += 2; 3157 - } 3158 - 3159 - raid_param_cnt += (hweight32(rs->ctr_flags & ~CTR_FLAG_REBUILD) * 2); 3160 - if (rs->ctr_flags & (CTR_FLAG_SYNC | CTR_FLAG_NOSYNC)) 3161 - raid_param_cnt--; 3162 - 3163 - DMEMIT("%s %u %u", rs->raid_type->name, 3164 - raid_param_cnt, rs->md.chunk_sectors); 3165 - 3166 - if ((rs->ctr_flags & CTR_FLAG_SYNC) && 3167 - (rs->md.recovery_cp == MaxSector)) 3168 - DMEMIT(" sync"); 3169 - if (rs->ctr_flags & CTR_FLAG_NOSYNC) 3170 - DMEMIT(" nosync"); 3171 - 3172 - for (i = 0; i < rs->md.raid_disks; i++) 3173 - if ((rs->ctr_flags & CTR_FLAG_REBUILD) && 3174 - rs->dev[i].data_dev && 3175 - !test_bit(In_sync, &rs->dev[i].rdev.flags)) 3176 - DMEMIT(" rebuild %u", i); 3177 - 3178 - if (rs->ctr_flags & CTR_FLAG_DAEMON_SLEEP) 3179 - DMEMIT(" daemon_sleep %lu", 3180 - rs->md.bitmap_info.daemon_sleep); 3181 - 3182 - if (rs->ctr_flags & CTR_FLAG_MIN_RECOVERY_RATE) 3183 - DMEMIT(" min_recovery_rate %d", rs->md.sync_speed_min); 3184 - 3185 - if (rs->ctr_flags & CTR_FLAG_MAX_RECOVERY_RATE) 3186 - DMEMIT(" max_recovery_rate %d", rs->md.sync_speed_max); 3187 - 3188 - for (i = 0; i < rs->md.raid_disks; i++) 3189 - if (rs->dev[i].data_dev && 3190 - test_bit(WriteMostly, &rs->dev[i].rdev.flags)) 3191 - DMEMIT(" write_mostly %u", i); 3192 - 3193 - if (rs->ctr_flags & CTR_FLAG_MAX_WRITE_BEHIND) 3194 - DMEMIT(" max_write_behind %lu", 3195 - rs->md.bitmap_info.max_write_behind); 3196 - 3197 - if (rs->ctr_flags & CTR_FLAG_STRIPE_CACHE) { 3198 - struct 
r5conf *conf = rs->md.private; 3199 - 3200 - /* convert from kiB to sectors */ 3201 - DMEMIT(" stripe_cache %d", 3202 - conf ? conf->max_nr_stripes * 2 : 0); 3203 - } 3204 - 3205 - if (rs->ctr_flags & CTR_FLAG_REGION_SIZE) 3206 - DMEMIT(" region_size %lu", 3207 - rs->md.bitmap_info.chunksize >> 9); 3208 - 3209 - if (rs->ctr_flags & CTR_FLAG_RAID10_COPIES) 3210 - DMEMIT(" raid10_copies %u", 3211 - raid10_md_layout_to_copies(rs->md.layout)); 3212 - 3213 - if (rs->ctr_flags & CTR_FLAG_RAID10_FORMAT) 3214 - DMEMIT(" raid10_format %s", 3215 - raid10_md_layout_to_format(rs->md.layout)); 3216 - 3217 - DMEMIT(" %d", rs->md.raid_disks); 3218 - for (i = 0; i < rs->md.raid_disks; i++) { 3219 - if (rs->dev[i].meta_dev) 3220 - DMEMIT(" %s", rs->dev[i].meta_dev->name); 3221 - else 3222 - DMEMIT(" -"); 3223 - 3224 - if (rs->dev[i].data_dev) 3225 - DMEMIT(" %s", rs->dev[i].data_dev->name); 3226 - else 3227 - DMEMIT(" -"); 3228 - } 3229 - } 1379 + if (test_bit(Faulty, &rdev->flags)) 1380 + return "D"; 1381 + else if (!array_in_sync || !test_bit(In_sync, &rdev->flags)) 1382 + return "a"; 1383 + else 1384 + return "A"; 3230 1385 } 3231 1386 3232 - static int raid_message(struct dm_target *ti, unsigned argc, char **argv) 1387 + /* Helper to return resync/reshape progress for @rs and @array_in_sync */ 1388 + static sector_t rs_get_progress(struct raid_set *rs, 1389 + sector_t resync_max_sectors, bool *array_in_sync) 1390 + { 1391 + sector_t r, recovery_cp, curr_resync_completed; 1392 + struct mddev *mddev = &rs->md; 1393 + 1394 + curr_resync_completed = mddev->curr_resync_completed ?: mddev->recovery_cp; 1395 + recovery_cp = mddev->recovery_cp; 1396 + *array_in_sync = false; 1397 + 1398 + if (rs_is_raid0(rs)) { 1399 + r = resync_max_sectors; 1400 + *array_in_sync = true; 1401 + 1402 + } else { 1403 + r = mddev->reshape_position; 1404 + 1405 + /* Reshape is relative to the array size */ 1406 + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) || 1407 + r != MaxSector) { 1408 + if (r 
== MaxSector) { 1409 + *array_in_sync = true; 1410 + r = resync_max_sectors; 1411 + } else { 1412 + /* Got to reverse on backward reshape */ 1413 + if (mddev->reshape_backwards) 1414 + r = mddev->array_sectors - r; 1415 + 1416 + /* Divide by # of data stripes */ 1417 + sector_div(r, mddev_data_stripes(rs)); 1418 + } 1419 + 1420 + /* Sync is relative to the component device size */ 1421 + } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 1422 + r = curr_resync_completed; 1423 + else 1424 + r = recovery_cp; 1425 + 1426 + if (r == MaxSector) { 1427 + /* 1428 + * Sync complete. 1429 + */ 1430 + *array_in_sync = true; 1431 + r = resync_max_sectors; 1432 + } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { 1433 + /* 1434 + * If "check" or "repair" is occurring, the raid set has 1435 + * undergone an initial sync and the health characters 1436 + * should not be 'a' anymore. 1437 + */ 1438 + *array_in_sync = true; 1439 + } else { 1440 + struct md_rdev *rdev; 1441 + 1442 + /* 1443 + * The raid set may be doing an initial sync, or it may 1444 + * be rebuilding individual components. If all the 1445 + * devices are In_sync, then it is the raid set that is 1446 + * being initialized. 1447 + */ 1448 + rdev_for_each(rdev, mddev) 1449 + if (!test_bit(In_sync, &rdev->flags)) 1450 + *array_in_sync = true; 1451 + #if 0 1452 + r = 0; /* HM FIXME: TESTME: https://bugzilla.redhat.com/show_bug.cgi?id=1210637 ? */ 1453 + #endif 1454 + } 1455 + } 1456 + 1457 + return r; 1458 + } 1459 + 1460 + /* Helper to return @dev name or "-" if !@dev */ 1461 + static const char *__get_dev_name(struct dm_dev *dev) 1462 + { 1463 + return dev ? 
dev->name : "-"; 1464 + } 1465 + 1466 + static void raid_status(struct dm_target *ti, status_type_t type, 1467 + unsigned int status_flags, char *result, unsigned int maxlen) 3233 1468 { 3234 1469 struct raid_set *rs = ti->private; 3235 1470 struct mddev *mddev = &rs->md; 1471 + struct r5conf *conf = mddev->private; 1472 + int i, max_nr_stripes = conf ? conf->max_nr_stripes : 0; 1473 + bool array_in_sync; 1474 + unsigned int raid_param_cnt = 1; /* at least 1 for chunksize */ 1475 + unsigned int sz = 0; 1476 + unsigned int rebuild_disks; 1477 + unsigned int write_mostly_params = 0; 1478 + sector_t progress, resync_max_sectors, resync_mismatches; 1479 + const char *sync_action; 1480 + struct raid_type *rt; 1481 + struct md_rdev *rdev; 3236 1482 3237 - if (!strcasecmp(argv[0], "reshape")) { 3238 - DMERR("Reshape not supported."); 3239 - return -EINVAL; 1483 + switch (type) { 1484 + case STATUSTYPE_INFO: 1485 + /* *Should* always succeed */ 1486 + rt = get_raid_type_by_ll(mddev->new_level, mddev->new_layout); 1487 + if (!rt) 1488 + return; 1489 + 1490 + DMEMIT("%s %d ", rt->name, mddev->raid_disks); 1491 + 1492 + /* Access most recent mddev properties for status output */ 1493 + smp_rmb(); 1494 + /* Get sensible max sectors even if raid set not yet started */ 1495 + resync_max_sectors = test_bit(RT_FLAG_RS_PRERESUMED, &rs->runtime_flags) ? 1496 + mddev->resync_max_sectors : mddev->dev_sectors; 1497 + progress = rs_get_progress(rs, resync_max_sectors, &array_in_sync); 1498 + resync_mismatches = (mddev->last_sync_action && !strcasecmp(mddev->last_sync_action, "check")) ? 1499 + atomic64_read(&mddev->resync_mismatches) : 0; 1500 + sync_action = decipher_sync_action(&rs->md); 1501 + 1502 + /* HM FIXME: do we want another state char for raid0? 
It shows 'D' or 'A' now */ 1503 + rdev_for_each(rdev, mddev) 1504 + DMEMIT(__raid_dev_status(rdev, array_in_sync)); 1505 + 1506 + /* 1507 + * In-sync/Reshape ratio: 1508 + * The in-sync ratio shows the progress of: 1509 + * - Initializing the raid set 1510 + * - Rebuilding a subset of devices of the raid set 1511 + * The user can distinguish between the two by referring 1512 + * to the status characters. 1513 + * 1514 + * The reshape ratio shows the progress of 1515 + * changing the raid layout or the number of 1516 + * disks of a raid set 1517 + */ 1518 + DMEMIT(" %llu/%llu", (unsigned long long) progress, 1519 + (unsigned long long) resync_max_sectors); 1520 + 1521 + /* 1522 + * v1.5.0+: 1523 + * 1524 + * Sync action: 1525 + * See Documentation/device-mapper/dm-raid.txt for 1526 + * information on each of these states. 1527 + */ 1528 + DMEMIT(" %s", sync_action); 1529 + 1530 + /* 1531 + * v1.5.0+: 1532 + * 1533 + * resync_mismatches/mismatch_cnt 1534 + * This field shows the number of discrepancies found when 1535 + * performing a "check" of the raid set. 1536 + */ 1537 + DMEMIT(" %llu", (unsigned long long) resync_mismatches); 1538 + 1539 + /* 1540 + * v1.9.0+: 1541 + * 1542 + * data_offset (needed for out of space reshaping) 1543 + * This field shows the data offset into the data 1544 + * image LV where the first stripes data starts. 1545 + * 1546 + * We keep data_offset equal on all raid disks of the set, 1547 + * so retrieving it from the first raid disk is sufficient. 
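The STATUSTYPE_INFO output is thus a fixed-order line: health characters, in-sync ratio, sync action, mismatch count, and (v1.9.0+) the data offset. A hedged sketch of assembling such a line with made-up values; the helper name and signature are invented for illustration:

```c
#include <assert.h>
#include <stdio.h>
#include <string.h>

/* Illustrative: "<health> <progress>/<max> <action> <mismatches> <data_offset>" */
static int format_status(char *buf, size_t len, const char *health,
			 unsigned long long progress, unsigned long long max,
			 const char *action, unsigned long long mismatches,
			 unsigned long long data_offset)
{
	return snprintf(buf, len, "%s %llu/%llu %s %llu %llu",
			health, progress, max, action, mismatches, data_offset);
}
```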
1548 + */ 1549 + DMEMIT(" %llu", (unsigned long long) rs->dev[0].rdev.data_offset); 1550 + break; 1551 + 1552 + case STATUSTYPE_TABLE: 1553 + /* Report the table line string you would use to construct this raid set */ 1554 + 1555 + /* Calculate raid parameter count */ 1556 + for (i = 0; i < rs->raid_disks; i++) 1557 + if (test_bit(WriteMostly, &rs->dev[i].rdev.flags)) 1558 + write_mostly_params += 2; 1559 + rebuild_disks = memweight(rs->rebuild_disks, DISKS_ARRAY_ELEMS * sizeof(*rs->rebuild_disks)); 1560 + raid_param_cnt += rebuild_disks * 2 + 1561 + write_mostly_params + 1562 + hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_NO_ARGS) + 1563 + hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_ONE_ARG) * 2; 1564 + /* Emit table line */ 1565 + DMEMIT("%s %u %u", rs->raid_type->name, raid_param_cnt, mddev->new_chunk_sectors); 1566 + if (test_bit(__CTR_FLAG_RAID10_FORMAT, &rs->ctr_flags)) 1567 + DMEMIT(" %s %s", dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_FORMAT), 1568 + raid10_md_layout_to_format(mddev->layout)); 1569 + if (test_bit(__CTR_FLAG_RAID10_COPIES, &rs->ctr_flags)) 1570 + DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_COPIES), 1571 + raid10_md_layout_to_copies(mddev->layout)); 1572 + if (test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags)) 1573 + DMEMIT(" %s", dm_raid_arg_name_by_flag(CTR_FLAG_NOSYNC)); 1574 + if (test_bit(__CTR_FLAG_SYNC, &rs->ctr_flags)) 1575 + DMEMIT(" %s", dm_raid_arg_name_by_flag(CTR_FLAG_SYNC)); 1576 + if (test_bit(__CTR_FLAG_REGION_SIZE, &rs->ctr_flags)) 1577 + DMEMIT(" %s %llu", dm_raid_arg_name_by_flag(CTR_FLAG_REGION_SIZE), 1578 + (unsigned long long) to_sector(mddev->bitmap_info.chunksize)); 1579 + if (test_bit(__CTR_FLAG_DATA_OFFSET, &rs->ctr_flags)) 1580 + DMEMIT(" %s %llu", dm_raid_arg_name_by_flag(CTR_FLAG_DATA_OFFSET), 1581 + (unsigned long long) rs->data_offset); 1582 + if (test_bit(__CTR_FLAG_DAEMON_SLEEP, &rs->ctr_flags)) 1583 + DMEMIT(" %s %lu", dm_raid_arg_name_by_flag(CTR_FLAG_DAEMON_SLEEP), 1584 + mddev->bitmap_info.daemon_sleep); 
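Before emitting the table line, the code sizes raid_param_cnt: one slot for the chunk size, two per rebuild or write_mostly device, one per set no-argument flag and two per one-argument flag. The same accounting in isolation, with a portable popcount standing in for the kernel's hweight32 (function names invented):

```c
#include <assert.h>

/* Portable stand-in for the kernel's hweight32() */
static unsigned int popcount32(unsigned int v)
{
	unsigned int n = 0;

	for (; v; v &= v - 1)	/* clear lowest set bit per iteration */
		n++;
	return n;
}

/* Illustrative: compute <#raid_params> for the emitted table line */
static unsigned int count_raid_params(unsigned int rebuild_disks,
				      unsigned int write_mostly_disks,
				      unsigned int no_arg_flags,
				      unsigned int one_arg_flags)
{
	return 1 /* chunk size */ +
	       rebuild_disks * 2 +
	       write_mostly_disks * 2 +
	       popcount32(no_arg_flags) +
	       popcount32(one_arg_flags) * 2;
}
```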
1585 + if (test_bit(__CTR_FLAG_DELTA_DISKS, &rs->ctr_flags)) 1586 + DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_DELTA_DISKS), 1587 + max(rs->delta_disks, mddev->delta_disks)); 1588 + if (test_bit(__CTR_FLAG_STRIPE_CACHE, &rs->ctr_flags)) 1589 + DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_STRIPE_CACHE), 1590 + max_nr_stripes); 1591 + if (rebuild_disks) 1592 + for (i = 0; i < rs->raid_disks; i++) 1593 + if (test_bit(rs->dev[i].rdev.raid_disk, (void *) rs->rebuild_disks)) 1594 + DMEMIT(" %s %u", dm_raid_arg_name_by_flag(CTR_FLAG_REBUILD), 1595 + rs->dev[i].rdev.raid_disk); 1596 + if (write_mostly_params) 1597 + for (i = 0; i < rs->raid_disks; i++) 1598 + if (test_bit(WriteMostly, &rs->dev[i].rdev.flags)) 1599 + DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_WRITE_MOSTLY), 1600 + rs->dev[i].rdev.raid_disk); 1601 + if (test_bit(__CTR_FLAG_MAX_WRITE_BEHIND, &rs->ctr_flags)) 1602 + DMEMIT(" %s %lu", dm_raid_arg_name_by_flag(CTR_FLAG_MAX_WRITE_BEHIND), 1603 + mddev->bitmap_info.max_write_behind); 1604 + if (test_bit(__CTR_FLAG_MAX_RECOVERY_RATE, &rs->ctr_flags)) 1605 + DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MAX_RECOVERY_RATE), 1606 + mddev->sync_speed_max); 1607 + if (test_bit(__CTR_FLAG_MIN_RECOVERY_RATE, &rs->ctr_flags)) 1608 + DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MIN_RECOVERY_RATE), 1609 + mddev->sync_speed_min); 1610 + DMEMIT(" %d", rs->raid_disks); 1611 + for (i = 0; i < rs->raid_disks; i++) 1612 + DMEMIT(" %s %s", __get_dev_name(rs->dev[i].meta_dev), 1613 + __get_dev_name(rs->dev[i].data_dev)); 3240 1614 } 1615 + } 1616 + 1617 + static int raid_message(struct dm_target *ti, unsigned int argc, char **argv) 1618 + { 1619 + struct raid_set *rs = ti->private; 1620 + struct mddev *mddev = &rs->md; 3241 1621 3242 1622 if (!mddev->pers || !mddev->pers->sync_request) 3243 1623 return -EINVAL; ··· 3321 1571 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) 3322 1572 return -EBUSY; 3323 1573 else if (!strcasecmp(argv[0], "resync")) 
3324 - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3325 - else if (!strcasecmp(argv[0], "recover")) { 1574 + ; /* MD_RECOVERY_NEEDED set below */ 1575 + else if (!strcasecmp(argv[0], "recover")) 3326 1576 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 3327 - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3328 - } else { 1577 + else { 3329 1578 if (!strcasecmp(argv[0], "check")) 3330 1579 set_bit(MD_RECOVERY_CHECK, &mddev->recovery); 3331 1580 else if (!!strcasecmp(argv[0], "repair")) ··· 3337 1588 * canceling read-auto mode 3338 1589 */ 3339 1590 mddev->ro = 0; 3340 - if (!mddev->suspended) 1591 + if (!mddev->suspended && mddev->sync_thread) 3341 1592 md_wakeup_thread(mddev->sync_thread); 3342 1593 } 3343 1594 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3344 - if (!mddev->suspended) 1595 + if (!mddev->suspended && mddev->thread) 3345 1596 md_wakeup_thread(mddev->thread); 3346 1597 3347 1598 return 0; ··· 3351 1602 iterate_devices_callout_fn fn, void *data) 3352 1603 { 3353 1604 struct raid_set *rs = ti->private; 3354 - unsigned i; 3355 - int ret = 0; 1605 + unsigned int i; 1606 + int r = 0; 3356 1607 3357 - for (i = 0; !ret && i < rs->md.raid_disks; i++) 1608 + for (i = 0; !r && i < rs->md.raid_disks; i++) 3358 1609 if (rs->dev[i].data_dev) 3359 - ret = fn(ti, 1610 + r = fn(ti, 3360 1611 rs->dev[i].data_dev, 3361 1612 0, /* No offset on data devs */ 3362 1613 rs->md.dev_sectors, 3363 1614 data); 3364 1615 3365 - return ret; 1616 + return r; 3366 1617 } 3367 1618 3368 1619 static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits) 3369 1620 { 3370 1621 struct raid_set *rs = ti->private; 3371 - unsigned chunk_size = rs->md.chunk_sectors << 9; 3372 - struct r5conf *conf = rs->md.private; 1622 + unsigned int chunk_size = to_bytes(rs->md.chunk_sectors); 3373 1623 3374 1624 blk_limits_io_min(limits, chunk_size); 3375 - blk_limits_io_opt(limits, chunk_size * (conf->raid_disks - conf->max_degraded)); 1625 + blk_limits_io_opt(limits, chunk_size * 
mddev_data_stripes(rs)); 3376 1626 } 3377 1627 3378 1628 static void raid_presuspend(struct dm_target *ti) ··· 3385 1637 { 3386 1638 struct raid_set *rs = ti->private; 3387 1639 3388 - mddev_suspend(&rs->md); 1640 + if (test_and_clear_bit(RT_FLAG_RS_RESUMED, &rs->runtime_flags)) { 1641 + if (!rs->md.suspended) 1642 + mddev_suspend(&rs->md); 1643 + rs->md.ro = 1; 1644 + } 3389 1645 } 3390 1646 3391 1647 static void attempt_restore_of_faulty_devices(struct raid_set *rs) ··· 3403 1651 for (i = 0; i < rs->md.raid_disks; i++) { 3404 1652 r = &rs->dev[i].rdev; 3405 1653 if (test_bit(Faulty, &r->flags) && r->sb_page && 3406 - sync_page_io(r, 0, r->sb_size, r->sb_page, REQ_OP_READ, 0, 3407 - 1)) { 1654 + sync_page_io(r, 0, r->sb_size, r->sb_page, 1655 + REQ_OP_READ, 0, true)) { 3408 1656 DMINFO("Faulty %s device #%d has readable super block." 3409 1657 " Attempting to revive it.", 3410 1658 rs->raid_type->name, i); ··· 3413 1661 * Faulty bit may be set, but sometimes the array can 3414 1662 * be suspended before the personalities can respond 3415 1663 * by removing the device from the array (i.e. calling 3416 - * 'hot_remove_disk'). If they haven't yet removed 1664 + * 'hot_remove_disk'). If they haven't yet removed 3417 1665 * the failed device, its 'raid_disk' number will be 3418 1666 * '>= 0' - meaning we must call this function 3419 1667 * ourselves. 
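raid_io_hints above advertises io_min as the chunk size and io_opt as one full data stripe; for raid4/5/6, mddev_data_stripes works out to the disk count less the parity devices. A sketch under that assumption (function name invented):

```c
#include <assert.h>

/* Illustrative: optimal I/O size is one full data stripe, in bytes */
static unsigned int io_opt_bytes(unsigned int chunk_sectors,
				 unsigned int raid_disks,
				 unsigned int parity_disks)
{
	unsigned int chunk_bytes = chunk_sectors << 9;	/* 512-byte sectors -> bytes */

	return chunk_bytes * (raid_disks - parity_disks);
}
```

For example, a 6-disk raid6 set (two parity devices) with 64KiB chunks yields a 256KiB optimal I/O size.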
··· 3449 1697 } 3450 1698 } 3451 1699 1700 + static int __load_dirty_region_bitmap(struct raid_set *rs) 1701 + { 1702 + int r = 0; 1703 + 1704 + /* Try loading the bitmap unless "raid0", which does not have one */ 1705 + if (!rs_is_raid0(rs) && 1706 + !test_and_set_bit(RT_FLAG_RS_BITMAP_LOADED, &rs->runtime_flags)) { 1707 + r = bitmap_load(&rs->md); 1708 + if (r) 1709 + DMERR("Failed to load bitmap"); 1710 + } 1711 + 1712 + return r; 1713 + } 1714 + 1715 + /* Enforce updating all superblocks */ 1716 + static void rs_update_sbs(struct raid_set *rs) 1717 + { 1718 + struct mddev *mddev = &rs->md; 1719 + int ro = mddev->ro; 1720 + 1721 + set_bit(MD_CHANGE_DEVS, &mddev->flags); 1722 + mddev->ro = 0; 1723 + md_update_sb(mddev, 1); 1724 + mddev->ro = ro; 1725 + } 1726 + 1727 + /* 1728 + * Reshape changes raid algorithm of @rs to new one within personality 1729 + * (e.g. raid6_zr -> raid6_nc), changes stripe size, adds/removes 1730 + * disks from a raid set thus growing/shrinking it or resizes the set 1731 + * 1732 + * Call mddev_lock_nointr() before! 1733 + */ 1734 + static int rs_start_reshape(struct raid_set *rs) 1735 + { 1736 + int r; 1737 + struct mddev *mddev = &rs->md; 1738 + struct md_personality *pers = mddev->pers; 1739 + 1740 + r = rs_setup_reshape(rs); 1741 + if (r) 1742 + return r; 1743 + 1744 + /* Need to be resumed to be able to start reshape, recovery is frozen until raid_resume() though */ 1745 + if (mddev->suspended) 1746 + mddev_resume(mddev); 1747 + 1748 + /* 1749 + * Check any reshape constraints enforced by the personality 1750 + * 1751 + * May as well already kick the reshape off so that pers->start_reshape() becomes optional. 
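rs_start_reshape tolerates personalities that lack a start_reshape method: check_reshape always runs, start_reshape only when provided. The NULL-method pattern in isolation; the struct and helper functions are invented for illustration:

```c
#include <assert.h>
#include <stddef.h>

/* Invented mini "personality" with an optional start_reshape method */
struct pers {
	int (*check_reshape)(void *ctx);
	int (*start_reshape)(void *ctx);	/* may be NULL */
};

/* Check constraints first; kick off the reshape only if a method exists */
static int start_reshape(const struct pers *p, void *ctx)
{
	int r = p->check_reshape(ctx);	/* always enforced */

	if (r)
		return r;
	/* No start_reshape method: check_reshape already covered everything */
	return p->start_reshape ? p->start_reshape(ctx) : 0;
}

/* Trivial method implementations for demonstration */
static int reshape_ok(void *ctx) { (void)ctx; return 0; }
static int reshape_fail(void *ctx) { (void)ctx; return -22; }
```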
1752 + */ 1753 + r = pers->check_reshape(mddev); 1754 + if (r) { 1755 + rs->ti->error = "pers->check_reshape() failed"; 1756 + return r; 1757 + } 1758 + 1759 + /* 1760 + * Personality may not provide start reshape method in which 1761 + * case check_reshape above has already covered everything 1762 + */ 1763 + if (pers->start_reshape) { 1764 + r = pers->start_reshape(mddev); 1765 + if (r) { 1766 + rs->ti->error = "pers->start_reshape() failed"; 1767 + return r; 1768 + } 1769 + } 1770 + 1771 + /* Suspend because a resume will happen in raid_resume() */ 1772 + if (!mddev->suspended) 1773 + mddev_suspend(mddev); 1774 + 1775 + /* 1776 + * Now reshape got set up, update superblocks to 1777 + * reflect the fact so that a table reload will 1778 + * access proper superblock content in the ctr. 1779 + */ 1780 + rs_update_sbs(rs); 1781 + 1782 + return 0; 1783 + } 1784 + 1785 + static int raid_preresume(struct dm_target *ti) 1786 + { 1787 + int r; 1788 + struct raid_set *rs = ti->private; 1789 + struct mddev *mddev = &rs->md; 1790 + 1791 + /* This is a resume after a suspend of the set -> it's already started */ 1792 + if (test_and_set_bit(RT_FLAG_RS_PRERESUMED, &rs->runtime_flags)) 1793 + return 0; 1794 + 1795 + /* 1796 + * The superblocks need to be updated on disk if the 1797 + * array is new or new devices got added (thus zeroed 1798 + * out by userspace) or __load_dirty_region_bitmap 1799 + * will overwrite them in core with old data or fail. 
1800 + */ 1801 + if (test_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags)) 1802 + rs_update_sbs(rs); 1803 + 1804 + /* 1805 + * Disable/enable discard support on raid set after any 1806 + * conversion, because devices can have been added 1807 + */ 1808 + configure_discard_support(rs); 1809 + 1810 + /* Load the bitmap from disk unless raid0 */ 1811 + r = __load_dirty_region_bitmap(rs); 1812 + if (r) 1813 + return r; 1814 + 1815 + /* Resize bitmap to adjust to changed region size (aka MD bitmap chunksize) */ 1816 + if (test_bit(RT_FLAG_RS_BITMAP_LOADED, &rs->runtime_flags) && 1817 + mddev->bitmap_info.chunksize != to_bytes(rs->requested_bitmap_chunk_sectors)) { 1818 + r = bitmap_resize(mddev->bitmap, mddev->dev_sectors, 1819 + to_bytes(rs->requested_bitmap_chunk_sectors), 0); 1820 + if (r) 1821 + DMERR("Failed to resize bitmap"); 1822 + } 1823 + 1824 + /* Check for any resize/reshape on @rs and adjust/initiate */ 1825 + /* Be prepared for mddev_resume() in raid_resume() */ 1826 + set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 1827 + if (mddev->recovery_cp && mddev->recovery_cp < MaxSector) { 1828 + set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 1829 + set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 1830 + mddev->resync_min = mddev->recovery_cp; 1831 + } 1832 + 1833 + rs_set_capacity(rs); 1834 + 1835 + /* Check for any reshape request unless new raid set */ 1836 + if (test_and_clear_bit(RT_FLAG_RESHAPE_RS, &rs->runtime_flags)) { 1837 + /* Initiate a reshape. 
*/ 1838 + mddev_lock_nointr(mddev); 1839 + r = rs_start_reshape(rs); 1840 + mddev_unlock(mddev); 1841 + if (r) 1842 + DMWARN("Failed to check/start reshape, continuing without change"); 1843 + r = 0; 1844 + } 1845 + 1846 + return r; 1847 + } 1848 + 3452 1849 static void raid_resume(struct dm_target *ti) 3453 1850 { 3454 1851 struct raid_set *rs = ti->private; 1852 + struct mddev *mddev = &rs->md; 3455 1853 3456 - if (rs->raid_type->level) { 3457 - set_bit(MD_CHANGE_DEVS, &rs->md.flags); 1854 + if (test_and_set_bit(RT_FLAG_RS_RESUMED, &rs->runtime_flags)) { 1855 + /* 1856 + * A secondary resume while the device is active. 1857 + * Take this opportunity to check whether any failed 1858 + * devices are reachable again. 1859 + */ 1860 + attempt_restore_of_faulty_devices(rs); 1861 + } else { 1862 + mddev->ro = 0; 1863 + mddev->in_sync = 0; 3458 1864 3459 - if (!rs->bitmap_loaded) { 3460 - bitmap_load(&rs->md); 3461 - rs->bitmap_loaded = 1; 3462 - } else { 3463 - /* 3464 - * A secondary resume while the device is active. 3465 - * Take this opportunity to check whether any failed 3466 - * devices are reachable again. 3467 - */ 3468 - attempt_restore_of_faulty_devices(rs); 3469 - } 1865 + /* 1866 + * When passing in flags to the ctr, we expect userspace 1867 + * to reset them because they made it to the superblocks 1868 + * and reload the mapping anyway. 1869 + * 1870 + * -> only unfreeze recovery in case of a table reload or 1871 + * we'll have a bogus recovery/reshape position 1872 + * retrieved from the superblock by the ctr because 1873 + * the ongoing recovery/reshape will change it after read. 
1874 + */ 1875 + if (!test_bit(RT_FLAG_KEEP_RS_FROZEN, &rs->runtime_flags)) 1876 + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 3470 1877 3471 - clear_bit(MD_RECOVERY_FROZEN, &rs->md.recovery); 1878 + if (mddev->suspended) 1879 + mddev_resume(mddev); 3472 1880 } 3473 - 3474 - mddev_resume(&rs->md); 3475 1881 } 3476 1882 3477 1883 static struct target_type raid_target = { 3478 1884 .name = "raid", 3479 - .version = {1, 8, 0}, 1885 + .version = {1, 9, 0}, 3480 1886 .module = THIS_MODULE, 3481 1887 .ctr = raid_ctr, 3482 1888 .dtr = raid_dtr, ··· 3645 1735 .io_hints = raid_io_hints, 3646 1736 .presuspend = raid_presuspend, 3647 1737 .postsuspend = raid_postsuspend, 1738 + .preresume = raid_preresume, 3648 1739 .resume = raid_resume, 3649 1740 }; 3650 1741 ··· 3670 1759 MODULE_PARM_DESC(devices_handle_discard_safely, 3671 1760 "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions"); 3672 1761 3673 - MODULE_DESCRIPTION(DM_NAME " raid4/5/6 target"); 1762 + MODULE_DESCRIPTION(DM_NAME " raid0/1/10/4/5/6 target"); 1763 + MODULE_ALIAS("dm-raid0"); 3674 1764 MODULE_ALIAS("dm-raid1"); 3675 1765 MODULE_ALIAS("dm-raid10"); 3676 1766 MODULE_ALIAS("dm-raid4"); 3677 1767 MODULE_ALIAS("dm-raid5"); 3678 1768 MODULE_ALIAS("dm-raid6"); 3679 1769 MODULE_AUTHOR("Neil Brown <dm-devel@redhat.com>"); 1770 + MODULE_AUTHOR("Heinz Mauelshagen <dm-devel@redhat.com>"); 3680 1771 MODULE_LICENSE("GPL");
+970
drivers/md/dm-rq.c
··· 1 + /* 2 + * Copyright (C) 2016 Red Hat, Inc. All rights reserved. 3 + * 4 + * This file is released under the GPL. 5 + */ 6 + 7 + #include "dm-core.h" 8 + #include "dm-rq.h" 9 + 10 + #include <linux/elevator.h> /* for rq_end_sector() */ 11 + #include <linux/blk-mq.h> 12 + 13 + #define DM_MSG_PREFIX "core-rq" 14 + 15 + #define DM_MQ_NR_HW_QUEUES 1 16 + #define DM_MQ_QUEUE_DEPTH 2048 17 + static unsigned dm_mq_nr_hw_queues = DM_MQ_NR_HW_QUEUES; 18 + static unsigned dm_mq_queue_depth = DM_MQ_QUEUE_DEPTH; 19 + 20 + /* 21 + * Request-based DM's mempools' reserved IOs set by the user. 22 + */ 23 + #define RESERVED_REQUEST_BASED_IOS 256 24 + static unsigned reserved_rq_based_ios = RESERVED_REQUEST_BASED_IOS; 25 + 26 + #ifdef CONFIG_DM_MQ_DEFAULT 27 + static bool use_blk_mq = true; 28 + #else 29 + static bool use_blk_mq = false; 30 + #endif 31 + 32 + bool dm_use_blk_mq_default(void) 33 + { 34 + return use_blk_mq; 35 + } 36 + 37 + bool dm_use_blk_mq(struct mapped_device *md) 38 + { 39 + return md->use_blk_mq; 40 + } 41 + EXPORT_SYMBOL_GPL(dm_use_blk_mq); 42 + 43 + unsigned dm_get_reserved_rq_based_ios(void) 44 + { 45 + return __dm_get_module_param(&reserved_rq_based_ios, 46 + RESERVED_REQUEST_BASED_IOS, DM_RESERVED_MAX_IOS); 47 + } 48 + EXPORT_SYMBOL_GPL(dm_get_reserved_rq_based_ios); 49 + 50 + static unsigned dm_get_blk_mq_nr_hw_queues(void) 51 + { 52 + return __dm_get_module_param(&dm_mq_nr_hw_queues, 1, 32); 53 + } 54 + 55 + static unsigned dm_get_blk_mq_queue_depth(void) 56 + { 57 + return __dm_get_module_param(&dm_mq_queue_depth, 58 + DM_MQ_QUEUE_DEPTH, BLK_MQ_MAX_DEPTH); 59 + } 60 + 61 + int dm_request_based(struct mapped_device *md) 62 + { 63 + return blk_queue_stackable(md->queue); 64 + } 65 + 66 + static void dm_old_start_queue(struct request_queue *q) 67 + { 68 + unsigned long flags; 69 + 70 + spin_lock_irqsave(q->queue_lock, flags); 71 + if (blk_queue_stopped(q)) 72 + blk_start_queue(q); 73 + spin_unlock_irqrestore(q->queue_lock, flags); 74 + } 75 + 76 + 
void dm_start_queue(struct request_queue *q) 77 + { 78 + if (!q->mq_ops) 79 + dm_old_start_queue(q); 80 + else { 81 + blk_mq_start_stopped_hw_queues(q, true); 82 + blk_mq_kick_requeue_list(q); 83 + } 84 + } 85 + 86 + static void dm_old_stop_queue(struct request_queue *q) 87 + { 88 + unsigned long flags; 89 + 90 + spin_lock_irqsave(q->queue_lock, flags); 91 + if (blk_queue_stopped(q)) { 92 + spin_unlock_irqrestore(q->queue_lock, flags); 93 + return; 94 + } 95 + 96 + blk_stop_queue(q); 97 + spin_unlock_irqrestore(q->queue_lock, flags); 98 + } 99 + 100 + void dm_stop_queue(struct request_queue *q) 101 + { 102 + if (!q->mq_ops) 103 + dm_old_stop_queue(q); 104 + else 105 + blk_mq_stop_hw_queues(q); 106 + } 107 + 108 + static struct dm_rq_target_io *alloc_old_rq_tio(struct mapped_device *md, 109 + gfp_t gfp_mask) 110 + { 111 + return mempool_alloc(md->io_pool, gfp_mask); 112 + } 113 + 114 + static void free_old_rq_tio(struct dm_rq_target_io *tio) 115 + { 116 + mempool_free(tio, tio->md->io_pool); 117 + } 118 + 119 + static struct request *alloc_old_clone_request(struct mapped_device *md, 120 + gfp_t gfp_mask) 121 + { 122 + return mempool_alloc(md->rq_pool, gfp_mask); 123 + } 124 + 125 + static void free_old_clone_request(struct mapped_device *md, struct request *rq) 126 + { 127 + mempool_free(rq, md->rq_pool); 128 + } 129 + 130 + /* 131 + * Partial completion handling for request-based dm 132 + */ 133 + static void end_clone_bio(struct bio *clone) 134 + { 135 + struct dm_rq_clone_bio_info *info = 136 + container_of(clone, struct dm_rq_clone_bio_info, clone); 137 + struct dm_rq_target_io *tio = info->tio; 138 + struct bio *bio = info->orig; 139 + unsigned int nr_bytes = info->orig->bi_iter.bi_size; 140 + int error = clone->bi_error; 141 + 142 + bio_put(clone); 143 + 144 + if (tio->error) 145 + /* 146 + * An error has already been detected on the request. 147 + * Once error occurred, just let clone->end_io() handle 148 + * the remainder. 
149 + */ 150 + return; 151 + else if (error) { 152 + /* 153 + * Don't report the error to the upper layer yet. 154 + * The error handling decision is made by the target driver, 155 + * when the request is completed. 156 + */ 157 + tio->error = error; 158 + return; 159 + } 160 + 161 + /* 162 + * I/O for the bio successfully completed. 163 + * Report the data completion to the upper layer. 164 + */ 165 + 166 + /* 167 + * bios are processed from the head of the list. 168 + * So the completing bio should always be rq->bio. 169 + * If it's not, something wrong is happening. 170 + */ 171 + if (tio->orig->bio != bio) 172 + DMERR("bio completion is going in the middle of the request"); 173 + 174 + /* 175 + * Update the original request. 176 + * Do not use blk_end_request() here, because it may complete 177 + * the original request before the clone, and break the ordering. 178 + */ 179 + blk_update_request(tio->orig, 0, nr_bytes); 180 + } 181 + 182 + static struct dm_rq_target_io *tio_from_request(struct request *rq) 183 + { 184 + return (rq->q->mq_ops ? blk_mq_rq_to_pdu(rq) : rq->special); 185 + } 186 + 187 + static void rq_end_stats(struct mapped_device *md, struct request *orig) 188 + { 189 + if (unlikely(dm_stats_used(&md->stats))) { 190 + struct dm_rq_target_io *tio = tio_from_request(orig); 191 + tio->duration_jiffies = jiffies - tio->duration_jiffies; 192 + dm_stats_account_io(&md->stats, rq_data_dir(orig), 193 + blk_rq_pos(orig), tio->n_sectors, true, 194 + tio->duration_jiffies, &tio->stats_aux); 195 + } 196 + } 197 + 198 + /* 199 + * Don't touch any member of the md after calling this function because 200 + * the md may be freed in dm_put() at the end of this function. 201 + * Or do dm_get() before calling this function and dm_put() later. 
202 + */ 203 + static void rq_completed(struct mapped_device *md, int rw, bool run_queue) 204 + { 205 + atomic_dec(&md->pending[rw]); 206 + 207 + /* nudge anyone waiting on suspend queue */ 208 + if (!md_in_flight(md)) 209 + wake_up(&md->wait); 210 + 211 + /* 212 + * Run this off this callpath, as drivers could invoke end_io while 213 + * inside their request_fn (and holding the queue lock). Calling 214 + * back into ->request_fn() could deadlock attempting to grab the 215 + * queue lock again. 216 + */ 217 + if (!md->queue->mq_ops && run_queue) 218 + blk_run_queue_async(md->queue); 219 + 220 + /* 221 + * dm_put() must be at the end of this function. See the comment above 222 + */ 223 + dm_put(md); 224 + } 225 + 226 + static void free_rq_clone(struct request *clone) 227 + { 228 + struct dm_rq_target_io *tio = clone->end_io_data; 229 + struct mapped_device *md = tio->md; 230 + 231 + blk_rq_unprep_clone(clone); 232 + 233 + /* 234 + * It is possible for a clone_old_rq() allocated clone to 235 + * get passed in -- it may not yet have a request_queue. 236 + * This is known to occur if the error target replaces 237 + * a multipath target that has a request_fn queue stacked 238 + * on blk-mq queue(s). 239 + */ 240 + if (clone->q && clone->q->mq_ops) 241 + /* stacked on blk-mq queue(s) */ 242 + tio->ti->type->release_clone_rq(clone); 243 + else if (!md->queue->mq_ops) 244 + /* request_fn queue stacked on request_fn queue(s) */ 245 + free_old_clone_request(md, clone); 246 + 247 + if (!md->queue->mq_ops) 248 + free_old_rq_tio(tio); 249 + } 250 + 251 + /* 252 + * Complete the clone and the original request. 253 + * Must be called without clone's queue lock held, 254 + * see end_clone_request() for more details. 
255 + */ 256 + static void dm_end_request(struct request *clone, int error) 257 + { 258 + int rw = rq_data_dir(clone); 259 + struct dm_rq_target_io *tio = clone->end_io_data; 260 + struct mapped_device *md = tio->md; 261 + struct request *rq = tio->orig; 262 + 263 + if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { 264 + rq->errors = clone->errors; 265 + rq->resid_len = clone->resid_len; 266 + 267 + if (rq->sense) 268 + /* 269 + * We are using the sense buffer of the original 270 + * request. 271 + * So setting the length of the sense data is enough. 272 + */ 273 + rq->sense_len = clone->sense_len; 274 + } 275 + 276 + free_rq_clone(clone); 277 + rq_end_stats(md, rq); 278 + if (!rq->q->mq_ops) 279 + blk_end_request_all(rq, error); 280 + else 281 + blk_mq_end_request(rq, error); 282 + rq_completed(md, rw, true); 283 + } 284 + 285 + static void dm_unprep_request(struct request *rq) 286 + { 287 + struct dm_rq_target_io *tio = tio_from_request(rq); 288 + struct request *clone = tio->clone; 289 + 290 + if (!rq->q->mq_ops) { 291 + rq->special = NULL; 292 + rq->cmd_flags &= ~REQ_DONTPREP; 293 + } 294 + 295 + if (clone) 296 + free_rq_clone(clone); 297 + else if (!tio->md->queue->mq_ops) 298 + free_old_rq_tio(tio); 299 + } 300 + 301 + /* 302 + * Requeue the original request of a clone. 
303 + */ 304 + static void dm_old_requeue_request(struct request *rq) 305 + { 306 + struct request_queue *q = rq->q; 307 + unsigned long flags; 308 + 309 + spin_lock_irqsave(q->queue_lock, flags); 310 + blk_requeue_request(q, rq); 311 + blk_run_queue_async(q); 312 + spin_unlock_irqrestore(q->queue_lock, flags); 313 + } 314 + 315 + static void dm_mq_requeue_request(struct request *rq) 316 + { 317 + struct request_queue *q = rq->q; 318 + unsigned long flags; 319 + 320 + blk_mq_requeue_request(rq); 321 + spin_lock_irqsave(q->queue_lock, flags); 322 + if (!blk_queue_stopped(q)) 323 + blk_mq_kick_requeue_list(q); 324 + spin_unlock_irqrestore(q->queue_lock, flags); 325 + } 326 + 327 + static void dm_requeue_original_request(struct mapped_device *md, 328 + struct request *rq) 329 + { 330 + int rw = rq_data_dir(rq); 331 + 332 + rq_end_stats(md, rq); 333 + dm_unprep_request(rq); 334 + 335 + if (!rq->q->mq_ops) 336 + dm_old_requeue_request(rq); 337 + else 338 + dm_mq_requeue_request(rq); 339 + 340 + rq_completed(md, rw, false); 341 + } 342 + 343 + static void dm_done(struct request *clone, int error, bool mapped) 344 + { 345 + int r = error; 346 + struct dm_rq_target_io *tio = clone->end_io_data; 347 + dm_request_endio_fn rq_end_io = NULL; 348 + 349 + if (tio->ti) { 350 + rq_end_io = tio->ti->type->rq_end_io; 351 + 352 + if (mapped && rq_end_io) 353 + r = rq_end_io(tio->ti, clone, error, &tio->info); 354 + } 355 + 356 + if (unlikely(r == -EREMOTEIO && (req_op(clone) == REQ_OP_WRITE_SAME) && 357 + !clone->q->limits.max_write_same_sectors)) 358 + disable_write_same(tio->md); 359 + 360 + if (r <= 0) 361 + /* The target wants to complete the I/O */ 362 + dm_end_request(clone, r); 363 + else if (r == DM_ENDIO_INCOMPLETE) 364 + /* The target will handle the I/O */ 365 + return; 366 + else if (r == DM_ENDIO_REQUEUE) 367 + /* The target wants to requeue the I/O */ 368 + dm_requeue_original_request(tio->md, tio->orig); 369 + else { 370 + DMWARN("unimplemented target endio return 
value: %d", r); 371 + BUG(); 372 + } 373 + } 374 + 375 + /* 376 + * Request completion handler for request-based dm 377 + */ 378 + static void dm_softirq_done(struct request *rq) 379 + { 380 + bool mapped = true; 381 + struct dm_rq_target_io *tio = tio_from_request(rq); 382 + struct request *clone = tio->clone; 383 + int rw; 384 + 385 + if (!clone) { 386 + rq_end_stats(tio->md, rq); 387 + rw = rq_data_dir(rq); 388 + if (!rq->q->mq_ops) { 389 + blk_end_request_all(rq, tio->error); 390 + rq_completed(tio->md, rw, false); 391 + free_old_rq_tio(tio); 392 + } else { 393 + blk_mq_end_request(rq, tio->error); 394 + rq_completed(tio->md, rw, false); 395 + } 396 + return; 397 + } 398 + 399 + if (rq->cmd_flags & REQ_FAILED) 400 + mapped = false; 401 + 402 + dm_done(clone, tio->error, mapped); 403 + } 404 + 405 + /* 406 + * Complete the clone and the original request with the error status 407 + * through softirq context. 408 + */ 409 + static void dm_complete_request(struct request *rq, int error) 410 + { 411 + struct dm_rq_target_io *tio = tio_from_request(rq); 412 + 413 + tio->error = error; 414 + if (!rq->q->mq_ops) 415 + blk_complete_request(rq); 416 + else 417 + blk_mq_complete_request(rq, error); 418 + } 419 + 420 + /* 421 + * Complete the not-mapped clone and the original request with the error status 422 + * through softirq context. 423 + * Target's rq_end_io() function isn't called. 424 + * This may be used when the target's map_rq() or clone_and_map_rq() functions fail. 
425 + */ 426 + static void dm_kill_unmapped_request(struct request *rq, int error) 427 + { 428 + rq->cmd_flags |= REQ_FAILED; 429 + dm_complete_request(rq, error); 430 + } 431 + 432 + /* 433 + * Called with the clone's queue lock held (in the case of .request_fn) 434 + */ 435 + static void end_clone_request(struct request *clone, int error) 436 + { 437 + struct dm_rq_target_io *tio = clone->end_io_data; 438 + 439 + if (!clone->q->mq_ops) { 440 + /* 441 + * For just cleaning up the information of the queue in which 442 + * the clone was dispatched. 443 + * The clone is *NOT* freed actually here because it is alloced 444 + * from dm own mempool (REQ_ALLOCED isn't set). 445 + */ 446 + __blk_put_request(clone->q, clone); 447 + } 448 + 449 + /* 450 + * Actual request completion is done in a softirq context which doesn't 451 + * hold the clone's queue lock. Otherwise, deadlock could occur because: 452 + * - another request may be submitted by the upper level driver 453 + * of the stacking during the completion 454 + * - the submission which requires queue lock may be done 455 + * against this clone's queue 456 + */ 457 + dm_complete_request(tio->orig, error); 458 + } 459 + 460 + static void dm_dispatch_clone_request(struct request *clone, struct request *rq) 461 + { 462 + int r; 463 + 464 + if (blk_queue_io_stat(clone->q)) 465 + clone->cmd_flags |= REQ_IO_STAT; 466 + 467 + clone->start_time = jiffies; 468 + r = blk_insert_cloned_request(clone->q, clone); 469 + if (r) 470 + /* must complete clone in terms of original request */ 471 + dm_complete_request(rq, r); 472 + } 473 + 474 + static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig, 475 + void *data) 476 + { 477 + struct dm_rq_target_io *tio = data; 478 + struct dm_rq_clone_bio_info *info = 479 + container_of(bio, struct dm_rq_clone_bio_info, clone); 480 + 481 + info->orig = bio_orig; 482 + info->tio = tio; 483 + bio->bi_end_io = end_clone_bio; 484 + 485 + return 0; 486 + } 487 + 488 + static int 
setup_clone(struct request *clone, struct request *rq, 489 + struct dm_rq_target_io *tio, gfp_t gfp_mask) 490 + { 491 + int r; 492 + 493 + r = blk_rq_prep_clone(clone, rq, tio->md->bs, gfp_mask, 494 + dm_rq_bio_constructor, tio); 495 + if (r) 496 + return r; 497 + 498 + clone->cmd = rq->cmd; 499 + clone->cmd_len = rq->cmd_len; 500 + clone->sense = rq->sense; 501 + clone->end_io = end_clone_request; 502 + clone->end_io_data = tio; 503 + 504 + tio->clone = clone; 505 + 506 + return 0; 507 + } 508 + 509 + static struct request *clone_old_rq(struct request *rq, struct mapped_device *md, 510 + struct dm_rq_target_io *tio, gfp_t gfp_mask) 511 + { 512 + /* 513 + * Create clone for use with .request_fn request_queue 514 + */ 515 + struct request *clone; 516 + 517 + clone = alloc_old_clone_request(md, gfp_mask); 518 + if (!clone) 519 + return NULL; 520 + 521 + blk_rq_init(NULL, clone); 522 + if (setup_clone(clone, rq, tio, gfp_mask)) { 523 + /* -ENOMEM */ 524 + free_old_clone_request(md, clone); 525 + return NULL; 526 + } 527 + 528 + return clone; 529 + } 530 + 531 + static void map_tio_request(struct kthread_work *work); 532 + 533 + static void init_tio(struct dm_rq_target_io *tio, struct request *rq, 534 + struct mapped_device *md) 535 + { 536 + tio->md = md; 537 + tio->ti = NULL; 538 + tio->clone = NULL; 539 + tio->orig = rq; 540 + tio->error = 0; 541 + /* 542 + * Avoid initializing info for blk-mq; it passes 543 + * target-specific data through info.ptr 544 + * (see: dm_mq_init_request) 545 + */ 546 + if (!md->init_tio_pdu) 547 + memset(&tio->info, 0, sizeof(tio->info)); 548 + if (md->kworker_task) 549 + init_kthread_work(&tio->work, map_tio_request); 550 + } 551 + 552 + static struct dm_rq_target_io *dm_old_prep_tio(struct request *rq, 553 + struct mapped_device *md, 554 + gfp_t gfp_mask) 555 + { 556 + struct dm_rq_target_io *tio; 557 + int srcu_idx; 558 + struct dm_table *table; 559 + 560 + tio = alloc_old_rq_tio(md, gfp_mask); 561 + if (!tio) 562 + return NULL; 563 + 
564 + init_tio(tio, rq, md); 565 + 566 + table = dm_get_live_table(md, &srcu_idx); 567 + /* 568 + * Must clone a request if this .request_fn DM device 569 + * is stacked on .request_fn device(s). 570 + */ 571 + if (!dm_table_all_blk_mq_devices(table)) { 572 + if (!clone_old_rq(rq, md, tio, gfp_mask)) { 573 + dm_put_live_table(md, srcu_idx); 574 + free_old_rq_tio(tio); 575 + return NULL; 576 + } 577 + } 578 + dm_put_live_table(md, srcu_idx); 579 + 580 + return tio; 581 + } 582 + 583 + /* 584 + * Called with the queue lock held. 585 + */ 586 + static int dm_old_prep_fn(struct request_queue *q, struct request *rq) 587 + { 588 + struct mapped_device *md = q->queuedata; 589 + struct dm_rq_target_io *tio; 590 + 591 + if (unlikely(rq->special)) { 592 + DMWARN("Already has something in rq->special."); 593 + return BLKPREP_KILL; 594 + } 595 + 596 + tio = dm_old_prep_tio(rq, md, GFP_ATOMIC); 597 + if (!tio) 598 + return BLKPREP_DEFER; 599 + 600 + rq->special = tio; 601 + rq->cmd_flags |= REQ_DONTPREP; 602 + 603 + return BLKPREP_OK; 604 + } 605 + 606 + /* 607 + * Returns: 608 + * 0 : the request has been processed 609 + * DM_MAPIO_REQUEUE : the original request needs to be requeued 610 + * < 0 : the request was completed due to failure 611 + */ 612 + static int map_request(struct dm_rq_target_io *tio, struct request *rq, 613 + struct mapped_device *md) 614 + { 615 + int r; 616 + struct dm_target *ti = tio->ti; 617 + struct request *clone = NULL; 618 + 619 + if (tio->clone) { 620 + clone = tio->clone; 621 + r = ti->type->map_rq(ti, clone, &tio->info); 622 + } else { 623 + r = ti->type->clone_and_map_rq(ti, rq, &tio->info, &clone); 624 + if (r < 0) { 625 + /* The target wants to complete the I/O */ 626 + dm_kill_unmapped_request(rq, r); 627 + return r; 628 + } 629 + if (r != DM_MAPIO_REMAPPED) 630 + return r; 631 + if (setup_clone(clone, rq, tio, GFP_ATOMIC)) { 632 + /* -ENOMEM */ 633 + ti->type->release_clone_rq(clone); 634 + return DM_MAPIO_REQUEUE; 635 + } 636 + } 637 + 638 
+ switch (r) { 639 + case DM_MAPIO_SUBMITTED: 640 + /* The target has taken the I/O to submit by itself later */ 641 + break; 642 + case DM_MAPIO_REMAPPED: 643 + /* The target has remapped the I/O so dispatch it */ 644 + trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)), 645 + blk_rq_pos(rq)); 646 + dm_dispatch_clone_request(clone, rq); 647 + break; 648 + case DM_MAPIO_REQUEUE: 649 + /* The target wants to requeue the I/O */ 650 + dm_requeue_original_request(md, tio->orig); 651 + break; 652 + default: 653 + if (r > 0) { 654 + DMWARN("unimplemented target map return value: %d", r); 655 + BUG(); 656 + } 657 + 658 + /* The target wants to complete the I/O */ 659 + dm_kill_unmapped_request(rq, r); 660 + return r; 661 + } 662 + 663 + return 0; 664 + } 665 + 666 + static void dm_start_request(struct mapped_device *md, struct request *orig) 667 + { 668 + if (!orig->q->mq_ops) 669 + blk_start_request(orig); 670 + else 671 + blk_mq_start_request(orig); 672 + atomic_inc(&md->pending[rq_data_dir(orig)]); 673 + 674 + if (md->seq_rq_merge_deadline_usecs) { 675 + md->last_rq_pos = rq_end_sector(orig); 676 + md->last_rq_rw = rq_data_dir(orig); 677 + md->last_rq_start_time = ktime_get(); 678 + } 679 + 680 + if (unlikely(dm_stats_used(&md->stats))) { 681 + struct dm_rq_target_io *tio = tio_from_request(orig); 682 + tio->duration_jiffies = jiffies; 683 + tio->n_sectors = blk_rq_sectors(orig); 684 + dm_stats_account_io(&md->stats, rq_data_dir(orig), 685 + blk_rq_pos(orig), tio->n_sectors, false, 0, 686 + &tio->stats_aux); 687 + } 688 + 689 + /* 690 + * Hold the md reference here for the in-flight I/O. 691 + * We can't rely on the reference count by device opener, 692 + * because the device may be closed during the request completion 693 + * when all bios are completed. 694 + * See the comment in rq_completed() too. 
695 + */ 696 + dm_get(md); 697 + } 698 + 699 + static void map_tio_request(struct kthread_work *work) 700 + { 701 + struct dm_rq_target_io *tio = container_of(work, struct dm_rq_target_io, work); 702 + struct request *rq = tio->orig; 703 + struct mapped_device *md = tio->md; 704 + 705 + if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE) 706 + dm_requeue_original_request(md, rq); 707 + } 708 + 709 + ssize_t dm_attr_rq_based_seq_io_merge_deadline_show(struct mapped_device *md, char *buf) 710 + { 711 + return sprintf(buf, "%u\n", md->seq_rq_merge_deadline_usecs); 712 + } 713 + 714 + #define MAX_SEQ_RQ_MERGE_DEADLINE_USECS 100000 715 + 716 + ssize_t dm_attr_rq_based_seq_io_merge_deadline_store(struct mapped_device *md, 717 + const char *buf, size_t count) 718 + { 719 + unsigned deadline; 720 + 721 + if (dm_get_md_type(md) != DM_TYPE_REQUEST_BASED) 722 + return count; 723 + 724 + if (kstrtouint(buf, 10, &deadline)) 725 + return -EINVAL; 726 + 727 + if (deadline > MAX_SEQ_RQ_MERGE_DEADLINE_USECS) 728 + deadline = MAX_SEQ_RQ_MERGE_DEADLINE_USECS; 729 + 730 + md->seq_rq_merge_deadline_usecs = deadline; 731 + 732 + return count; 733 + } 734 + 735 + static bool dm_old_request_peeked_before_merge_deadline(struct mapped_device *md) 736 + { 737 + ktime_t kt_deadline; 738 + 739 + if (!md->seq_rq_merge_deadline_usecs) 740 + return false; 741 + 742 + kt_deadline = ns_to_ktime((u64)md->seq_rq_merge_deadline_usecs * NSEC_PER_USEC); 743 + kt_deadline = ktime_add_safe(md->last_rq_start_time, kt_deadline); 744 + 745 + return !ktime_after(ktime_get(), kt_deadline); 746 + } 747 + 748 + /* 749 + * q->request_fn for old request-based dm. 750 + * Called with the queue lock held. 
751 + */ 752 + static void dm_old_request_fn(struct request_queue *q) 753 + { 754 + struct mapped_device *md = q->queuedata; 755 + struct dm_target *ti = md->immutable_target; 756 + struct request *rq; 757 + struct dm_rq_target_io *tio; 758 + sector_t pos = 0; 759 + 760 + if (unlikely(!ti)) { 761 + int srcu_idx; 762 + struct dm_table *map = dm_get_live_table(md, &srcu_idx); 763 + 764 + ti = dm_table_find_target(map, pos); 765 + dm_put_live_table(md, srcu_idx); 766 + } 767 + 768 + /* 769 + * For suspend, check blk_queue_stopped() and increment 770 + * ->pending within a single queue_lock not to increment the 771 + * number of in-flight I/Os after the queue is stopped in 772 + * dm_suspend(). 773 + */ 774 + while (!blk_queue_stopped(q)) { 775 + rq = blk_peek_request(q); 776 + if (!rq) 777 + return; 778 + 779 + /* always use block 0 to find the target for flushes for now */ 780 + pos = 0; 781 + if (req_op(rq) != REQ_OP_FLUSH) 782 + pos = blk_rq_pos(rq); 783 + 784 + if ((dm_old_request_peeked_before_merge_deadline(md) && 785 + md_in_flight(md) && rq->bio && rq->bio->bi_vcnt == 1 && 786 + md->last_rq_pos == pos && md->last_rq_rw == rq_data_dir(rq)) || 787 + (ti->type->busy && ti->type->busy(ti))) { 788 + blk_delay_queue(q, 10); 789 + return; 790 + } 791 + 792 + dm_start_request(md, rq); 793 + 794 + tio = tio_from_request(rq); 795 + /* Establish tio->ti before queuing work (map_tio_request) */ 796 + tio->ti = ti; 797 + queue_kthread_work(&md->kworker, &tio->work); 798 + BUG_ON(!irqs_disabled()); 799 + } 800 + } 801 + 802 + /* 803 + * Fully initialize a .request_fn request-based queue. 
804 + */ 805 + int dm_old_init_request_queue(struct mapped_device *md) 806 + { 807 + /* Fully initialize the queue */ 808 + if (!blk_init_allocated_queue(md->queue, dm_old_request_fn, NULL)) 809 + return -EINVAL; 810 + 811 + /* disable dm_old_request_fn's merge heuristic by default */ 812 + md->seq_rq_merge_deadline_usecs = 0; 813 + 814 + dm_init_normal_md_queue(md); 815 + blk_queue_softirq_done(md->queue, dm_softirq_done); 816 + blk_queue_prep_rq(md->queue, dm_old_prep_fn); 817 + 818 + /* Initialize the request-based DM worker thread */ 819 + init_kthread_worker(&md->kworker); 820 + md->kworker_task = kthread_run(kthread_worker_fn, &md->kworker, 821 + "kdmwork-%s", dm_device_name(md)); 822 + if (IS_ERR(md->kworker_task)) 823 + return PTR_ERR(md->kworker_task); 824 + 825 + elv_register_queue(md->queue); 826 + 827 + return 0; 828 + } 829 + 830 + static int dm_mq_init_request(void *data, struct request *rq, 831 + unsigned int hctx_idx, unsigned int request_idx, 832 + unsigned int numa_node) 833 + { 834 + struct mapped_device *md = data; 835 + struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq); 836 + 837 + /* 838 + * Must initialize md member of tio, otherwise it won't 839 + * be available in dm_mq_queue_rq. 
840 + */ 841 + tio->md = md; 842 + 843 + if (md->init_tio_pdu) { 844 + /* target-specific per-io data is immediately after the tio */ 845 + tio->info.ptr = tio + 1; 846 + } 847 + 848 + return 0; 849 + } 850 + 851 + static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx, 852 + const struct blk_mq_queue_data *bd) 853 + { 854 + struct request *rq = bd->rq; 855 + struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq); 856 + struct mapped_device *md = tio->md; 857 + struct dm_target *ti = md->immutable_target; 858 + 859 + if (unlikely(!ti)) { 860 + int srcu_idx; 861 + struct dm_table *map = dm_get_live_table(md, &srcu_idx); 862 + 863 + ti = dm_table_find_target(map, 0); 864 + dm_put_live_table(md, srcu_idx); 865 + } 866 + 867 + if (ti->type->busy && ti->type->busy(ti)) 868 + return BLK_MQ_RQ_QUEUE_BUSY; 869 + 870 + dm_start_request(md, rq); 871 + 872 + /* Init tio using md established in .init_request */ 873 + init_tio(tio, rq, md); 874 + 875 + /* 876 + * Establish tio->ti before calling map_request(). 
+	 */
+	tio->ti = ti;
+
+	/* Direct call is fine since .queue_rq allows allocations */
+	if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE) {
+		/* Undo dm_start_request() before requeuing */
+		rq_end_stats(md, rq);
+		rq_completed(md, rq_data_dir(rq), false);
+		return BLK_MQ_RQ_QUEUE_BUSY;
+	}
+
+	return BLK_MQ_RQ_QUEUE_OK;
+}
+
+static struct blk_mq_ops dm_mq_ops = {
+	.queue_rq = dm_mq_queue_rq,
+	.map_queue = blk_mq_map_queue,
+	.complete = dm_softirq_done,
+	.init_request = dm_mq_init_request,
+};
+
+int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t)
+{
+	struct request_queue *q;
+	struct dm_target *immutable_tgt;
+	int err;
+
+	if (!dm_table_all_blk_mq_devices(t)) {
+		DMERR("request-based dm-mq may only be stacked on blk-mq device(s)");
+		return -EINVAL;
+	}
+
+	md->tag_set = kzalloc_node(sizeof(struct blk_mq_tag_set), GFP_KERNEL, md->numa_node_id);
+	if (!md->tag_set)
+		return -ENOMEM;
+
+	md->tag_set->ops = &dm_mq_ops;
+	md->tag_set->queue_depth = dm_get_blk_mq_queue_depth();
+	md->tag_set->numa_node = md->numa_node_id;
+	md->tag_set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
+	md->tag_set->nr_hw_queues = dm_get_blk_mq_nr_hw_queues();
+	md->tag_set->driver_data = md;
+
+	md->tag_set->cmd_size = sizeof(struct dm_rq_target_io);
+	immutable_tgt = dm_table_get_immutable_target(t);
+	if (immutable_tgt && immutable_tgt->per_io_data_size) {
+		/* any target-specific per-io data is immediately after the tio */
+		md->tag_set->cmd_size += immutable_tgt->per_io_data_size;
+		md->init_tio_pdu = true;
+	}
+
+	err = blk_mq_alloc_tag_set(md->tag_set);
+	if (err)
+		goto out_kfree_tag_set;
+
+	q = blk_mq_init_allocated_queue(md->tag_set, md->queue);
+	if (IS_ERR(q)) {
+		err = PTR_ERR(q);
+		goto out_tag_set;
+	}
+	dm_init_md_queue(md);
+
+	/* backfill 'mq' sysfs registration normally done in blk_register_queue */
+	blk_mq_register_disk(md->disk);
+
+	return 0;
+
+out_tag_set:
+	blk_mq_free_tag_set(md->tag_set);
+out_kfree_tag_set:
+	kfree(md->tag_set);
+
+	return err;
+}
+
+void dm_mq_cleanup_mapped_device(struct mapped_device *md)
+{
+	if (md->tag_set) {
+		blk_mq_free_tag_set(md->tag_set);
+		kfree(md->tag_set);
+	}
+}
+
+module_param(reserved_rq_based_ios, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(reserved_rq_based_ios, "Reserved IOs in request-based mempools");
+
+module_param(use_blk_mq, bool, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(use_blk_mq, "Use block multiqueue for request-based DM devices");
+
+module_param(dm_mq_nr_hw_queues, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(dm_mq_nr_hw_queues, "Number of hardware queues for request-based dm-mq devices");
+
+module_param(dm_mq_queue_depth, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(dm_mq_queue_depth, "Queue depth for request-based dm-mq devices");
drivers/md/dm-rq.h (+64)
···
+/*
+ * Internal header file for device mapper
+ *
+ * Copyright (C) 2016 Red Hat, Inc. All rights reserved.
+ *
+ * This file is released under the LGPL.
+ */
+
+#ifndef DM_RQ_INTERNAL_H
+#define DM_RQ_INTERNAL_H
+
+#include <linux/bio.h>
+#include <linux/kthread.h>
+
+#include "dm-stats.h"
+
+struct mapped_device;
+
+/*
+ * One of these is allocated per request.
+ */
+struct dm_rq_target_io {
+	struct mapped_device *md;
+	struct dm_target *ti;
+	struct request *orig, *clone;
+	struct kthread_work work;
+	int error;
+	union map_info info;
+	struct dm_stats_aux stats_aux;
+	unsigned long duration_jiffies;
+	unsigned n_sectors;
+};
+
+/*
+ * For request-based dm - the bio clones we allocate are embedded in these
+ * structs.
+ *
+ * We allocate these with bio_alloc_bioset, using the front_pad parameter when
+ * the bioset is created - this means the bio has to come at the end of the
+ * struct.
+ */
+struct dm_rq_clone_bio_info {
+	struct bio *orig;
+	struct dm_rq_target_io *tio;
+	struct bio clone;
+};
+
+bool dm_use_blk_mq_default(void);
+bool dm_use_blk_mq(struct mapped_device *md);
+
+int dm_old_init_request_queue(struct mapped_device *md);
+int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t);
+void dm_mq_cleanup_mapped_device(struct mapped_device *md);
+
+void dm_start_queue(struct request_queue *q);
+void dm_stop_queue(struct request_queue *q);
+
+unsigned dm_get_reserved_rq_based_ios(void);
+
+ssize_t dm_attr_rq_based_seq_io_merge_deadline_show(struct mapped_device *md, char *buf);
+ssize_t dm_attr_rq_based_seq_io_merge_deadline_store(struct mapped_device *md,
+						     const char *buf, size_t count);
+
+#endif
drivers/md/dm-snap.c (+8)
···
 	return do_origin(o->dev, bio);
 }
 
+static long origin_direct_access(struct dm_target *ti, sector_t sector,
+				 void __pmem **kaddr, pfn_t *pfn, long size)
+{
+	DMWARN("device does not support dax.");
+	return -EIO;
+}
+
 /*
  * Set the target "max_io_len" field to the minimum of all the snapshots'
  * chunk sizes.
···
 	.postsuspend = origin_postsuspend,
 	.status = origin_status,
 	.iterate_devices = origin_iterate_devices,
+	.direct_access = origin_direct_access,
 };
 
 static struct target_type snapshot_target = {
drivers/md/dm-stats.c (+1 -1)
···
 #include <linux/module.h>
 #include <linux/device-mapper.h>
 
-#include "dm.h"
+#include "dm-core.h"
 #include "dm-stats.h"
 
 #define DM_MSG_PREFIX "stats"
drivers/md/dm-stripe.c (+25 -1)
···
 	return DM_MAPIO_REMAPPED;
 }
 
+static long stripe_direct_access(struct dm_target *ti, sector_t sector,
+				 void __pmem **kaddr, pfn_t *pfn, long size)
+{
+	struct stripe_c *sc = ti->private;
+	uint32_t stripe;
+	struct block_device *bdev;
+	struct blk_dax_ctl dax = {
+		.size = size,
+	};
+	long ret;
+
+	stripe_map_sector(sc, sector, &stripe, &dax.sector);
+
+	dax.sector += sc->stripe[stripe].physical_start;
+	bdev = sc->stripe[stripe].dev->bdev;
+
+	ret = bdev_direct_access(bdev, &dax);
+	*kaddr = dax.addr;
+	*pfn = dax.pfn;
+
+	return ret;
+}
+
 /*
  * Stripe status:
  *
···
 
 static struct target_type stripe_target = {
 	.name = "striped",
-	.version = {1, 5, 1},
+	.version = {1, 6, 0},
 	.module = THIS_MODULE,
 	.ctr = stripe_ctr,
 	.dtr = stripe_dtr,
···
 	.status = stripe_status,
 	.iterate_devices = stripe_iterate_devices,
 	.io_hints = stripe_io_hints,
+	.direct_access = stripe_direct_access,
 };
 
 int __init dm_stripe_init(void)
drivers/md/dm-sysfs.c (+2 -1)
···
 
 #include <linux/sysfs.h>
 #include <linux/dm-ioctl.h>
-#include "dm.h"
+#include "dm-core.h"
+#include "dm-rq.h"
 
 struct dm_sysfs_attr {
 	struct attribute attr;
drivers/md/dm-table.c (+90 -24)
···
  * This file is released under the GPL.
  */
 
-#include "dm.h"
+#include "dm-core.h"
 
 #include <linux/module.h>
 #include <linux/vmalloc.h>
···
 	struct dm_target *targets;
 
 	struct target_type *immutable_target_type;
-	unsigned integrity_supported:1;
-	unsigned singleton:1;
+
+	bool integrity_supported:1;
+	bool singleton:1;
+	bool all_blk_mq:1;
 
 	/*
 	 * Indicates the rw permissions for the new logical
···
 		return -ENOMEM;
 	}
 
+	t->type = DM_TYPE_NONE;
 	t->mode = mode;
 	t->md = md;
 	*result = t;
···
 			dm_device_name(t->md), type);
 		return -EINVAL;
 	}
-	t->singleton = 1;
+	t->singleton = true;
 }
 
 if (dm_target_always_writeable(tgt->type) && !(t->mode & FMODE_WRITE)) {
···
 }
 EXPORT_SYMBOL(dm_consume_args);
 
+static bool __table_type_bio_based(unsigned table_type)
+{
+	return (table_type == DM_TYPE_BIO_BASED ||
+		table_type == DM_TYPE_DAX_BIO_BASED);
+}
+
 static bool __table_type_request_based(unsigned table_type)
 {
 	return (table_type == DM_TYPE_REQUEST_BASED ||
 		table_type == DM_TYPE_MQ_REQUEST_BASED);
 }
 
-static int dm_table_set_type(struct dm_table *t)
+void dm_table_set_type(struct dm_table *t, unsigned type)
+{
+	t->type = type;
+}
+EXPORT_SYMBOL_GPL(dm_table_set_type);
+
+static int device_supports_dax(struct dm_target *ti, struct dm_dev *dev,
+			       sector_t start, sector_t len, void *data)
+{
+	struct request_queue *q = bdev_get_queue(dev->bdev);
+
+	return q && blk_queue_dax(q);
+}
+
+static bool dm_table_supports_dax(struct dm_table *t)
+{
+	struct dm_target *ti;
+	unsigned i = 0;
+
+	/* Ensure that all targets support DAX. */
+	while (i < dm_table_get_num_targets(t)) {
+		ti = dm_table_get_target(t, i++);
+
+		if (!ti->type->direct_access)
+			return false;
+
+		if (!ti->type->iterate_devices ||
+		    !ti->type->iterate_devices(ti, device_supports_dax, NULL))
+			return false;
+	}
+
+	return true;
+}
+
+static int dm_table_determine_type(struct dm_table *t)
 {
 	unsigned i;
 	unsigned bio_based = 0, request_based = 0, hybrid = 0;
-	bool use_blk_mq = false;
+	bool verify_blk_mq = false;
 	struct dm_target *tgt;
 	struct dm_dev_internal *dd;
-	struct list_head *devices;
+	struct list_head *devices = dm_table_get_devices(t);
 	unsigned live_md_type = dm_get_md_type(t->md);
+
+	if (t->type != DM_TYPE_NONE) {
+		/* target already set the table's type */
+		if (t->type == DM_TYPE_BIO_BASED)
+			return 0;
+		BUG_ON(t->type == DM_TYPE_DAX_BIO_BASED);
+		goto verify_rq_based;
+	}
 
 	for (i = 0; i < t->num_targets; i++) {
 		tgt = t->targets + i;
···
 	if (bio_based) {
 		/* We must use this table as bio-based */
 		t->type = DM_TYPE_BIO_BASED;
+		if (dm_table_supports_dax(t) ||
+		    (list_empty(devices) && live_md_type == DM_TYPE_DAX_BIO_BASED))
+			t->type = DM_TYPE_DAX_BIO_BASED;
 		return 0;
 	}
 
 	BUG_ON(!request_based); /* No targets in this table */
 
+	if (list_empty(devices) && __table_type_request_based(live_md_type)) {
+		/* inherit live MD type */
+		t->type = live_md_type;
+		return 0;
+	}
+
+	/*
+	 * The only way to establish DM_TYPE_MQ_REQUEST_BASED is by
+	 * having a compatible target use dm_table_set_type.
+	 */
+	t->type = DM_TYPE_REQUEST_BASED;
+
+verify_rq_based:
 	/*
 	 * Request-based dm supports only tables that have a single target now.
 	 * To support multiple targets, request splitting support is needed,
···
 	}
 
 	/* Non-request-stackable devices can't be used for request-based dm */
-	devices = dm_table_get_devices(t);
 	list_for_each_entry(dd, devices, list) {
 		struct request_queue *q = bdev_get_queue(dd->dm_dev->bdev);
···
 		}
 
 		if (q->mq_ops)
-			use_blk_mq = true;
+			verify_blk_mq = true;
 	}
 
-	if (use_blk_mq) {
+	if (verify_blk_mq) {
 		/* verify _all_ devices in the table are blk-mq devices */
 		list_for_each_entry(dd, devices, list)
 			if (!bdev_get_queue(dd->dm_dev->bdev)->mq_ops) {
···
 				       " are blk-mq request-stackable");
 				return -EINVAL;
 			}
-		t->type = DM_TYPE_MQ_REQUEST_BASED;
 
-	} else if (list_empty(devices) && __table_type_request_based(live_md_type)) {
-		/* inherit live MD type */
-		t->type = live_md_type;
-
-	} else
-		t->type = DM_TYPE_REQUEST_BASED;
+		t->all_blk_mq = true;
+	}
 
 	return 0;
 }
···
 	return NULL;
 }
 
+bool dm_table_bio_based(struct dm_table *t)
+{
+	return __table_type_bio_based(dm_table_get_type(t));
+}
+
 bool dm_table_request_based(struct dm_table *t)
 {
 	return __table_type_request_based(dm_table_get_type(t));
 }
 
-bool dm_table_mq_request_based(struct dm_table *t)
+bool dm_table_all_blk_mq_devices(struct dm_table *t)
 {
-	return dm_table_get_type(t) == DM_TYPE_MQ_REQUEST_BASED;
+	return t->all_blk_mq;
 }
 
 static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *md)
···
 		return -EINVAL;
 	}
 
-	if (type == DM_TYPE_BIO_BASED)
+	if (__table_type_bio_based(type))
 		for (i = 0; i < t->num_targets; i++) {
 			tgt = t->targets + i;
 			per_io_data_size = max(per_io_data_size, tgt->per_io_data_size);
···
 		return 0;
 
 	if (!integrity_profile_exists(dm_disk(md))) {
-		t->integrity_supported = 1;
+		t->integrity_supported = true;
 		/*
 		 * Register integrity profile during table load; we can do
 		 * this because the final profile must match during resume.
···
 	}
 
 	/* Preserve existing integrity profile */
-	t->integrity_supported = 1;
+	t->integrity_supported = true;
 	return 0;
 }
···
 {
 	int r;
 
-	r = dm_table_set_type(t);
+	r = dm_table_determine_type(t);
 	if (r) {
-		DMERR("unable to set table type");
+		DMERR("unable to determine table type");
 		return r;
 	}
drivers/md/dm-target.c (+9 -2)
···
  * This file is released under the GPL.
  */
 
-#include "dm.h"
+#include "dm-core.h"
 
 #include <linux/module.h>
 #include <linux/init.h>
···
 {
 }
 
+static long io_err_direct_access(struct dm_target *ti, sector_t sector,
+				 void __pmem **kaddr, pfn_t *pfn, long size)
+{
+	return -EIO;
+}
+
 static struct target_type error_target = {
 	.name = "error",
-	.version = {1, 4, 0},
+	.version = {1, 5, 0},
 	.features = DM_TARGET_WILDCARD,
 	.ctr = io_err_ctr,
 	.dtr = io_err_dtr,
···
 	.map_rq = io_err_map_rq,
 	.clone_and_map_rq = io_err_clone_and_map_rq,
 	.release_clone_rq = io_err_release_clone_rq,
+	.direct_access = io_err_direct_access,
 };
 
 int __init dm_target_init(void)
drivers/md/dm-thin-metadata.c (+30)
···
 	return r;
 }
 
+int dm_pool_inc_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_t e)
+{
+	int r = 0;
+
+	down_write(&pmd->root_lock);
+	for (; b != e; b++) {
+		r = dm_sm_inc_block(pmd->data_sm, b);
+		if (r)
+			break;
+	}
+	up_write(&pmd->root_lock);
+
+	return r;
+}
+
+int dm_pool_dec_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_t e)
+{
+	int r = 0;
+
+	down_write(&pmd->root_lock);
+	for (; b != e; b++) {
+		r = dm_sm_dec_block(pmd->data_sm, b);
+		if (r)
+			break;
+	}
+	up_write(&pmd->root_lock);
+
+	return r;
+}
+
 bool dm_thin_changed_this_transaction(struct dm_thin_device *td)
 {
 	int r;
drivers/md/dm-thin-metadata.h (+3)
···
 
 int dm_pool_block_is_used(struct dm_pool_metadata *pmd, dm_block_t b, bool *result);
 
+int dm_pool_inc_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_t e);
+int dm_pool_dec_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_t e);
+
 /*
  * Returns -ENOSPC if the new size is too small and already allocated
  * blocks would be lost.
drivers/md/dm-thin.c (+97 -17)
···
 	struct bio_list deferred_flush_bios;
 	struct list_head prepared_mappings;
 	struct list_head prepared_discards;
+	struct list_head prepared_discards_pt2;
 	struct list_head active_thins;
 
 	struct dm_deferred_set *shared_read_ds;
···
 
 	process_mapping_fn process_prepared_mapping;
 	process_mapping_fn process_prepared_discard;
+	process_mapping_fn process_prepared_discard_pt2;
 
 	struct dm_bio_prison_cell **cell_sort_array;
 };
···
 
 /*----------------------------------------------------------------*/
 
-static void passdown_double_checking_shared_status(struct dm_thin_new_mapping *m)
+static void passdown_double_checking_shared_status(struct dm_thin_new_mapping *m,
+						   struct bio *discard_parent)
 {
 	/*
 	 * We've already unmapped this range of blocks, but before we
···
 	dm_block_t b = m->data_block, e, end = m->data_block + m->virt_end - m->virt_begin;
 	struct discard_op op;
 
-	begin_discard(&op, tc, m->bio);
+	begin_discard(&op, tc, discard_parent);
 	while (b != end) {
 		/* find start of unmapped run */
 		for (; b < end; b++) {
···
 	end_discard(&op, r);
 }
 
-static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m)
+static void queue_passdown_pt2(struct dm_thin_new_mapping *m)
+{
+	unsigned long flags;
+	struct pool *pool = m->tc->pool;
+
+	spin_lock_irqsave(&pool->lock, flags);
+	list_add_tail(&m->list, &pool->prepared_discards_pt2);
+	spin_unlock_irqrestore(&pool->lock, flags);
+	wake_worker(pool);
+}
+
+static void passdown_endio(struct bio *bio)
+{
+	/*
+	 * It doesn't matter if the passdown discard failed, we still want
+	 * to unmap (we ignore err).
+	 */
+	queue_passdown_pt2(bio->bi_private);
+}
+
+static void process_prepared_discard_passdown_pt1(struct dm_thin_new_mapping *m)
+{
+	int r;
+	struct thin_c *tc = m->tc;
+	struct pool *pool = tc->pool;
+	struct bio *discard_parent;
+	dm_block_t data_end = m->data_block + (m->virt_end - m->virt_begin);
+
+	/*
+	 * Only this thread allocates blocks, so we can be sure that the
+	 * newly unmapped blocks will not be allocated before the end of
+	 * the function.
+	 */
+	r = dm_thin_remove_range(tc->td, m->virt_begin, m->virt_end);
+	if (r) {
+		metadata_operation_failed(pool, "dm_thin_remove_range", r);
+		bio_io_error(m->bio);
+		cell_defer_no_holder(tc, m->cell);
+		mempool_free(m, pool->mapping_pool);
+		return;
+	}
+
+	discard_parent = bio_alloc(GFP_NOIO, 1);
+	if (!discard_parent) {
+		DMWARN("%s: unable to allocate top level discard bio for passdown. Skipping passdown.",
+		       dm_device_name(tc->pool->pool_md));
+		queue_passdown_pt2(m);
+
+	} else {
+		discard_parent->bi_end_io = passdown_endio;
+		discard_parent->bi_private = m;
+
+		if (m->maybe_shared)
+			passdown_double_checking_shared_status(m, discard_parent);
+		else {
+			struct discard_op op;
+
+			begin_discard(&op, tc, discard_parent);
+			r = issue_discard(&op, m->data_block, data_end);
+			end_discard(&op, r);
+		}
+	}
+
+	/*
+	 * Increment the unmapped blocks.  This prevents a race between the
+	 * passdown io and reallocation of freed blocks.
+	 */
+	r = dm_pool_inc_data_range(pool->pmd, m->data_block, data_end);
+	if (r) {
+		metadata_operation_failed(pool, "dm_pool_inc_data_range", r);
+		bio_io_error(m->bio);
+		cell_defer_no_holder(tc, m->cell);
+		mempool_free(m, pool->mapping_pool);
+		return;
+	}
+}
+
+static void process_prepared_discard_passdown_pt2(struct dm_thin_new_mapping *m)
 {
 	int r;
 	struct thin_c *tc = m->tc;
 	struct pool *pool = tc->pool;
 
-	r = dm_thin_remove_range(tc->td, m->virt_begin, m->virt_end);
+	/*
+	 * The passdown has completed, so now we can decrement all those
+	 * unmapped blocks.
+	 */
+	r = dm_pool_dec_data_range(pool->pmd, m->data_block,
+				   m->data_block + (m->virt_end - m->virt_begin));
 	if (r) {
-		metadata_operation_failed(pool, "dm_thin_remove_range", r);
+		metadata_operation_failed(pool, "dm_pool_dec_data_range", r);
 		bio_io_error(m->bio);
-
-	} else if (m->maybe_shared) {
-		passdown_double_checking_shared_status(m);
-
-	} else {
-		struct discard_op op;
-		begin_discard(&op, tc, m->bio);
-		r = issue_discard(&op, m->data_block,
-				  m->data_block + (m->virt_end - m->virt_begin));
-		end_discard(&op, r);
-	}
+	} else
+		bio_endio(m->bio);
 
 	cell_defer_no_holder(tc, m->cell);
 	mempool_free(m, pool->mapping_pool);
···
 	throttle_work_update(&pool->throttle);
 	process_prepared(pool, &pool->prepared_discards, &pool->process_prepared_discard);
 	throttle_work_update(&pool->throttle);
+	process_prepared(pool, &pool->prepared_discards_pt2, &pool->process_prepared_discard_pt2);
+	throttle_work_update(&pool->throttle);
 	process_deferred_bios(pool);
 	throttle_work_complete(&pool->throttle);
 }
···
 
 	if (passdown_enabled(pt)) {
 		pool->process_discard_cell = process_discard_cell_passdown;
-		pool->process_prepared_discard = process_prepared_discard_passdown;
+		pool->process_prepared_discard = process_prepared_discard_passdown_pt1;
+		pool->process_prepared_discard_pt2 = process_prepared_discard_passdown_pt2;
 	} else {
 		pool->process_discard_cell = process_discard_cell_no_passdown;
 		pool->process_prepared_discard = process_prepared_discard_no_passdown;
···
 	bio_list_init(&pool->deferred_flush_bios);
 	INIT_LIST_HEAD(&pool->prepared_mappings);
 	INIT_LIST_HEAD(&pool->prepared_discards);
+	INIT_LIST_HEAD(&pool->prepared_discards_pt2);
 	INIT_LIST_HEAD(&pool->active_thins);
 	pool->low_water_triggered = false;
 	pool->suspended = true;
drivers/md/dm-verity-fec.c (+1 -3)
···
  */
 
 	offset = block << v->data_dev_block_bits;
-
-	res = offset;
-	div64_u64(res, v->fec->rounds << v->data_dev_block_bits);
+	res = div64_u64(offset, v->fec->rounds << v->data_dev_block_bits);
 
 	/*
 	 * The base RS block we can feed to the interleaver to find out all
drivers/md/dm.c (+115 -1126)
···
  * This file is released under the GPL.
  */
 
-#include "dm.h"
+#include "dm-core.h"
+#include "dm-rq.h"
 #include "dm-uevent.h"
 
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/mutex.h>
-#include <linux/moduleparam.h>
 #include <linux/blkpg.h>
 #include <linux/bio.h>
 #include <linux/mempool.h>
···
 #include <linux/hdreg.h>
 #include <linux/delay.h>
 #include <linux/wait.h>
-#include <linux/kthread.h>
-#include <linux/ktime.h>
-#include <linux/elevator.h> /* for rq_end_sector() */
-#include <linux/blk-mq.h>
 #include <linux/pr.h>
-
-#include <trace/events/block.h>
 
 #define DM_MSG_PREFIX "core"
···
 static struct workqueue_struct *deferred_remove_workqueue;
 
 /*
- * For bio-based dm.
  * One of these is allocated per bio.
  */
 struct dm_io {
···
 	unsigned long start_time;
 	spinlock_t endio_lock;
 	struct dm_stats_aux stats_aux;
-};
-
-/*
- * For request-based dm.
- * One of these is allocated per request.
- */
-struct dm_rq_target_io {
-	struct mapped_device *md;
-	struct dm_target *ti;
-	struct request *orig, *clone;
-	struct kthread_work work;
-	int error;
-	union map_info info;
-	struct dm_stats_aux stats_aux;
-	unsigned long duration_jiffies;
-	unsigned n_sectors;
-};
-
-/*
- * For request-based dm - the bio clones we allocate are embedded in these
- * structs.
- *
- * We allocate these with bio_alloc_bioset, using the front_pad parameter when
- * the bioset is created - this means the bio has to come at the end of the
- * struct.
- */
-struct dm_rq_clone_bio_info {
-	struct bio *orig;
-	struct dm_rq_target_io *tio;
-	struct bio clone;
 };
 
 #define MINOR_ALLOCED ((void *)-1)
···
 #define DMF_DEFERRED_REMOVE 6
 #define DMF_SUSPENDED_INTERNALLY 7
 
-/*
- * Work processed by per-device workqueue.
- */
-struct mapped_device {
-	struct srcu_struct io_barrier;
-	struct mutex suspend_lock;
-
-	/*
-	 * The current mapping (struct dm_table *).
-	 * Use dm_get_live_table{_fast} or take suspend_lock for
-	 * dereference.
-	 */
-	void __rcu *map;
-
-	struct list_head table_devices;
-	struct mutex table_devices_lock;
-
-	unsigned long flags;
-
-	struct request_queue *queue;
-	int numa_node_id;
-
-	unsigned type;
-	/* Protect queue and type against concurrent access. */
-	struct mutex type_lock;
-
-	atomic_t holders;
-	atomic_t open_count;
-
-	struct dm_target *immutable_target;
-	struct target_type *immutable_target_type;
-
-	struct gendisk *disk;
-	char name[16];
-
-	void *interface_ptr;
-
-	/*
-	 * A list of ios that arrived while we were suspended.
-	 */
-	atomic_t pending[2];
-	wait_queue_head_t wait;
-	struct work_struct work;
-	spinlock_t deferred_lock;
-	struct bio_list deferred;
-
-	/*
-	 * Event handling.
-	 */
-	wait_queue_head_t eventq;
-	atomic_t event_nr;
-	atomic_t uevent_seq;
-	struct list_head uevent_list;
-	spinlock_t uevent_lock; /* Protect access to uevent_list */
-
-	/* the number of internal suspends */
-	unsigned internal_suspend_count;
-
-	/*
-	 * Processing queue (flush)
-	 */
-	struct workqueue_struct *wq;
-
-	/*
-	 * io objects are allocated from here.
-	 */
-	mempool_t *io_pool;
-	mempool_t *rq_pool;
-
-	struct bio_set *bs;
-
-	/*
-	 * freeze/thaw support require holding onto a super block
-	 */
-	struct super_block *frozen_sb;
-
-	/* forced geometry settings */
-	struct hd_geometry geometry;
-
-	struct block_device *bdev;
-
-	/* kobject and completion */
-	struct dm_kobject_holder kobj_holder;
-
-	/* zero-length flush that will be cloned and submitted to targets */
-	struct bio flush_bio;
-
-	struct dm_stats stats;
-
-	struct kthread_worker kworker;
-	struct task_struct *kworker_task;
-
-	/* for request-based merge heuristic in dm_request_fn() */
-	unsigned seq_rq_merge_deadline_usecs;
-	int last_rq_rw;
-	sector_t last_rq_pos;
-	ktime_t last_rq_start_time;
-
-	/* for blk-mq request-based DM support */
-	struct blk_mq_tag_set *tag_set;
-	bool use_blk_mq:1;
-	bool init_tio_pdu:1;
-};
-
-#ifdef CONFIG_DM_MQ_DEFAULT
-static bool use_blk_mq = true;
-#else
-static bool use_blk_mq = false;
-#endif
-
-#define DM_MQ_NR_HW_QUEUES 1
-#define DM_MQ_QUEUE_DEPTH 2048
 #define DM_NUMA_NODE NUMA_NO_NODE
-
-static unsigned dm_mq_nr_hw_queues = DM_MQ_NR_HW_QUEUES;
-static unsigned dm_mq_queue_depth = DM_MQ_QUEUE_DEPTH;
 static int dm_numa_node = DM_NUMA_NODE;
-
-bool dm_use_blk_mq(struct mapped_device *md)
-{
-	return md->use_blk_mq;
-}
-EXPORT_SYMBOL_GPL(dm_use_blk_mq);
 
 /*
  * For mempools pre-allocation at the table loading time.
···
 	struct dm_dev dm_dev;
 };
 
-#define RESERVED_BIO_BASED_IOS		16
-#define RESERVED_REQUEST_BASED_IOS	256
-#define RESERVED_MAX_IOS		1024
 static struct kmem_cache *_io_cache;
 static struct kmem_cache *_rq_tio_cache;
 static struct kmem_cache *_rq_cache;
···
 /*
  * Bio-based DM's mempools' reserved IOs set by the user.
  */
+#define RESERVED_BIO_BASED_IOS	16
 static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS;
-
-/*
- * Request-based DM's mempools' reserved IOs set by the user.
- */
-static unsigned reserved_rq_based_ios = RESERVED_REQUEST_BASED_IOS;
 
 static int __dm_get_module_param_int(int *module_param, int min, int max)
 {
···
 	return param;
 }
 
-static unsigned __dm_get_module_param(unsigned *module_param,
-				      unsigned def, unsigned max)
+unsigned __dm_get_module_param(unsigned *module_param,
+			       unsigned def, unsigned max)
 {
 	unsigned param = ACCESS_ONCE(*module_param);
 	unsigned modified_param = 0;
···
 unsigned dm_get_reserved_bio_based_ios(void)
 {
 	return __dm_get_module_param(&reserved_bio_based_ios,
-				     RESERVED_BIO_BASED_IOS, RESERVED_MAX_IOS);
+				     RESERVED_BIO_BASED_IOS, DM_RESERVED_MAX_IOS);
 }
 EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios);
-
-unsigned dm_get_reserved_rq_based_ios(void)
-{
-	return __dm_get_module_param(&reserved_rq_based_ios,
-				     RESERVED_REQUEST_BASED_IOS, RESERVED_MAX_IOS);
-}
-EXPORT_SYMBOL_GPL(dm_get_reserved_rq_based_ios);
-
-static unsigned dm_get_blk_mq_nr_hw_queues(void)
-{
-	return __dm_get_module_param(&dm_mq_nr_hw_queues, 1, 32);
-}
-
-static unsigned dm_get_blk_mq_queue_depth(void)
-{
-	return __dm_get_module_param(&dm_mq_queue_depth,
-				     DM_MQ_QUEUE_DEPTH, BLK_MQ_MAX_DEPTH);
-}
 
 static unsigned dm_get_numa_node(void)
 {
···
 	bio_put(&tio->clone);
 }
 
-static struct dm_rq_target_io *alloc_old_rq_tio(struct mapped_device *md,
-						gfp_t gfp_mask)
-{
-	return mempool_alloc(md->io_pool, gfp_mask);
-}
-
-static void free_old_rq_tio(struct dm_rq_target_io *tio)
-{
-	mempool_free(tio, tio->md->io_pool);
-}
-
-static struct request *alloc_old_clone_request(struct mapped_device *md,
-					       gfp_t gfp_mask)
-{
-	return mempool_alloc(md->rq_pool, gfp_mask);
-}
-
-static void free_old_clone_request(struct mapped_device *md, struct request *rq)
-{
-	mempool_free(rq, md->rq_pool);
-}
-
-static int md_in_flight(struct mapped_device *md)
+int md_in_flight(struct mapped_device *md)
 {
 	return atomic_read(&md->pending[READ]) +
 	       atomic_read(&md->pending[WRITE]);
···
 	}
 }
 
-static void disable_write_same(struct mapped_device *md)
+void disable_write_same(struct mapped_device *md)
 {
 	struct queue_limits *limits = dm_get_queue_limits(md);
···
 
 	free_tio(tio);
 	dec_pending(io, error);
-}
-
-/*
- * Partial completion handling for request-based dm
- */
-static void end_clone_bio(struct bio *clone)
-{
-	struct dm_rq_clone_bio_info *info =
-		container_of(clone, struct dm_rq_clone_bio_info, clone);
-	struct dm_rq_target_io *tio = info->tio;
-	struct bio *bio = info->orig;
-	unsigned int nr_bytes = info->orig->bi_iter.bi_size;
-	int error = clone->bi_error;
-
-	bio_put(clone);
-
-	if (tio->error)
-		/*
-		 * An error has already been detected on the request.
-		 * Once error occurred, just let clone->end_io() handle
-		 * the remainder.
-		 */
-		return;
-	else if (error) {
-		/*
-		 * Don't notice the error to the upper layer yet.
-		 * The error handling decision is made by the target driver,
-		 * when the request is completed.
-		 */
-		tio->error = error;
-		return;
-	}
-
-	/*
-	 * I/O for the bio successfully completed.
-	 * Notice the data completion to the upper layer.
-	 */
-
-	/*
-	 * bios are processed from the head of the list.
-	 * So the completing bio should always be rq->bio.
-	 * If it's not, something wrong is happening.
-	 */
-	if (tio->orig->bio != bio)
-		DMERR("bio completion is going in the middle of the request");
-
-	/*
-	 * Update the original request.
-	 * Do not use blk_end_request() here, because it may complete
-	 * the original request before the clone, and break the ordering.
-	 */
-	blk_update_request(tio->orig, 0, nr_bytes);
-}
-
-static struct dm_rq_target_io *tio_from_request(struct request *rq)
-{
-	return (rq->q->mq_ops ? blk_mq_rq_to_pdu(rq) : rq->special);
-}
-
-static void rq_end_stats(struct mapped_device *md, struct request *orig)
-{
-	if (unlikely(dm_stats_used(&md->stats))) {
-		struct dm_rq_target_io *tio = tio_from_request(orig);
-		tio->duration_jiffies = jiffies - tio->duration_jiffies;
-		dm_stats_account_io(&md->stats, rq_data_dir(orig),
-				    blk_rq_pos(orig), tio->n_sectors, true,
-				    tio->duration_jiffies, &tio->stats_aux);
-	}
-}
-
-/*
- * Don't touch any member of the md after calling this function because
- * the md may be freed in dm_put() at the end of this function.
- * Or do dm_get() before calling this function and dm_put() later.
- */
-static void rq_completed(struct mapped_device *md, int rw, bool run_queue)
-{
-	atomic_dec(&md->pending[rw]);
-
-	/* nudge anyone waiting on suspend queue */
-	if (!md_in_flight(md))
-		wake_up(&md->wait);
-
-	/*
-	 * Run this off this callpath, as drivers could invoke end_io while
-	 * inside their request_fn (and holding the queue lock). Calling
-	 * back into ->request_fn() could deadlock attempting to grab the
-	 * queue lock again.
-	 */
-	if (!md->queue->mq_ops && run_queue)
-		blk_run_queue_async(md->queue);
-
-	/*
-	 * dm_put() must be at the end of this function. See the comment above
-	 */
-	dm_put(md);
-}
-
-static void free_rq_clone(struct request *clone)
-{
-	struct dm_rq_target_io *tio = clone->end_io_data;
-	struct mapped_device *md = tio->md;
-
-	blk_rq_unprep_clone(clone);
-
-	if (md->type == DM_TYPE_MQ_REQUEST_BASED)
-		/* stacked on blk-mq queue(s) */
-		tio->ti->type->release_clone_rq(clone);
-	else if (!md->queue->mq_ops)
-		/* request_fn queue stacked on request_fn queue(s) */
-		free_old_clone_request(md, clone);
-
-	if (!md->queue->mq_ops)
-		free_old_rq_tio(tio);
-}
-
-/*
- * Complete the clone and the original request.
- * Must be called without clone's queue lock held,
- * see end_clone_request() for more details.
- */
-static void dm_end_request(struct request *clone, int error)
-{
-	int rw = rq_data_dir(clone);
-	struct dm_rq_target_io *tio = clone->end_io_data;
-	struct mapped_device *md = tio->md;
-	struct request *rq = tio->orig;
-
-	if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
-		rq->errors = clone->errors;
-		rq->resid_len = clone->resid_len;
-
-		if (rq->sense)
-			/*
-			 * We are using the sense buffer of the original
-			 * request.
-			 * So setting the length of the sense data is enough.
-			 */
-			rq->sense_len = clone->sense_len;
-	}
-
-	free_rq_clone(clone);
-	rq_end_stats(md, rq);
-	if (!rq->q->mq_ops)
-		blk_end_request_all(rq, error);
-	else
-		blk_mq_end_request(rq, error);
-	rq_completed(md, rw, true);
-}
-
-static void dm_unprep_request(struct request *rq)
-{
-	struct dm_rq_target_io *tio = tio_from_request(rq);
-	struct request *clone = tio->clone;
-
-	if (!rq->q->mq_ops) {
-		rq->special = NULL;
-		rq->cmd_flags &= ~REQ_DONTPREP;
-	}
-
-	if (clone)
-		free_rq_clone(clone);
-	else if (!tio->md->queue->mq_ops)
-		free_old_rq_tio(tio);
-}
-
-/*
- * Requeue the original request of a clone.
- */
-static void dm_old_requeue_request(struct request *rq)
-{
-	struct request_queue *q = rq->q;
-	unsigned long flags;
-
-	spin_lock_irqsave(q->queue_lock, flags);
-	blk_requeue_request(q, rq);
-	blk_run_queue_async(q);
-	spin_unlock_irqrestore(q->queue_lock, flags);
-}
-
-static void dm_mq_requeue_request(struct request *rq)
-{
-	struct request_queue *q = rq->q;
-	unsigned long flags;
-
-	blk_mq_requeue_request(rq);
-	spin_lock_irqsave(q->queue_lock, flags);
-	if (!blk_queue_stopped(q))
-		blk_mq_kick_requeue_list(q);
-	spin_unlock_irqrestore(q->queue_lock, flags);
-}
-
-static void dm_requeue_original_request(struct mapped_device *md,
-					struct request *rq)
-{
-	int rw = rq_data_dir(rq);
-
-	rq_end_stats(md, rq);
-	dm_unprep_request(rq);
-
-	if (!rq->q->mq_ops)
-		dm_old_requeue_request(rq);
-	else
-		dm_mq_requeue_request(rq);
-
-	rq_completed(md, rw, false);
-}
-
-static void dm_old_stop_queue(struct request_queue *q)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(q->queue_lock,
994 - */ 995 - rq->sense_len = clone->sense_len; 996 - } 997 - 998 - free_rq_clone(clone); 999 - rq_end_stats(md, rq); 1000 - if (!rq->q->mq_ops) 1001 - blk_end_request_all(rq, error); 1002 - else 1003 - blk_mq_end_request(rq, error); 1004 - rq_completed(md, rw, true); 1005 - } 1006 - 1007 - static void dm_unprep_request(struct request *rq) 1008 - { 1009 - struct dm_rq_target_io *tio = tio_from_request(rq); 1010 - struct request *clone = tio->clone; 1011 - 1012 - if (!rq->q->mq_ops) { 1013 - rq->special = NULL; 1014 - rq->cmd_flags &= ~REQ_DONTPREP; 1015 - } 1016 - 1017 - if (clone) 1018 - free_rq_clone(clone); 1019 - else if (!tio->md->queue->mq_ops) 1020 - free_old_rq_tio(tio); 1021 - } 1022 - 1023 - /* 1024 - * Requeue the original request of a clone. 1025 - */ 1026 - static void dm_old_requeue_request(struct request *rq) 1027 - { 1028 - struct request_queue *q = rq->q; 1029 - unsigned long flags; 1030 - 1031 - spin_lock_irqsave(q->queue_lock, flags); 1032 - blk_requeue_request(q, rq); 1033 - blk_run_queue_async(q); 1034 - spin_unlock_irqrestore(q->queue_lock, flags); 1035 - } 1036 - 1037 - static void dm_mq_requeue_request(struct request *rq) 1038 - { 1039 - struct request_queue *q = rq->q; 1040 - unsigned long flags; 1041 - 1042 - blk_mq_requeue_request(rq); 1043 - spin_lock_irqsave(q->queue_lock, flags); 1044 - if (!blk_queue_stopped(q)) 1045 - blk_mq_kick_requeue_list(q); 1046 - spin_unlock_irqrestore(q->queue_lock, flags); 1047 - } 1048 - 1049 - static void dm_requeue_original_request(struct mapped_device *md, 1050 - struct request *rq) 1051 - { 1052 - int rw = rq_data_dir(rq); 1053 - 1054 - rq_end_stats(md, rq); 1055 - dm_unprep_request(rq); 1056 - 1057 - if (!rq->q->mq_ops) 1058 - dm_old_requeue_request(rq); 1059 - else 1060 - dm_mq_requeue_request(rq); 1061 - 1062 - rq_completed(md, rw, false); 1063 - } 1064 - 1065 - static void dm_old_stop_queue(struct request_queue *q) 1066 - { 1067 - unsigned long flags; 1068 - 1069 - spin_lock_irqsave(q->queue_lock, 
flags); 1070 - if (blk_queue_stopped(q)) { 1071 - spin_unlock_irqrestore(q->queue_lock, flags); 1072 - return; 1073 - } 1074 - 1075 - blk_stop_queue(q); 1076 - spin_unlock_irqrestore(q->queue_lock, flags); 1077 - } 1078 - 1079 - static void dm_stop_queue(struct request_queue *q) 1080 - { 1081 - if (!q->mq_ops) 1082 - dm_old_stop_queue(q); 1083 - else 1084 - blk_mq_stop_hw_queues(q); 1085 - } 1086 - 1087 - static void dm_old_start_queue(struct request_queue *q) 1088 - { 1089 - unsigned long flags; 1090 - 1091 - spin_lock_irqsave(q->queue_lock, flags); 1092 - if (blk_queue_stopped(q)) 1093 - blk_start_queue(q); 1094 - spin_unlock_irqrestore(q->queue_lock, flags); 1095 - } 1096 - 1097 - static void dm_start_queue(struct request_queue *q) 1098 - { 1099 - if (!q->mq_ops) 1100 - dm_old_start_queue(q); 1101 - else { 1102 - blk_mq_start_stopped_hw_queues(q, true); 1103 - blk_mq_kick_requeue_list(q); 1104 - } 1105 - } 1106 - 1107 - static void dm_done(struct request *clone, int error, bool mapped) 1108 - { 1109 - int r = error; 1110 - struct dm_rq_target_io *tio = clone->end_io_data; 1111 - dm_request_endio_fn rq_end_io = NULL; 1112 - 1113 - if (tio->ti) { 1114 - rq_end_io = tio->ti->type->rq_end_io; 1115 - 1116 - if (mapped && rq_end_io) 1117 - r = rq_end_io(tio->ti, clone, error, &tio->info); 1118 - } 1119 - 1120 - if (unlikely(r == -EREMOTEIO && (req_op(clone) == REQ_OP_WRITE_SAME) && 1121 - !clone->q->limits.max_write_same_sectors)) 1122 - disable_write_same(tio->md); 1123 - 1124 - if (r <= 0) 1125 - /* The target wants to complete the I/O */ 1126 - dm_end_request(clone, r); 1127 - else if (r == DM_ENDIO_INCOMPLETE) 1128 - /* The target will handle the I/O */ 1129 - return; 1130 - else if (r == DM_ENDIO_REQUEUE) 1131 - /* The target wants to requeue the I/O */ 1132 - dm_requeue_original_request(tio->md, tio->orig); 1133 - else { 1134 - DMWARN("unimplemented target endio return value: %d", r); 1135 - BUG(); 1136 - } 1137 - } 1138 - 1139 - /* 1140 - * Request completion 
handler for request-based dm 1141 - */ 1142 - static void dm_softirq_done(struct request *rq) 1143 - { 1144 - bool mapped = true; 1145 - struct dm_rq_target_io *tio = tio_from_request(rq); 1146 - struct request *clone = tio->clone; 1147 - int rw; 1148 - 1149 - if (!clone) { 1150 - rq_end_stats(tio->md, rq); 1151 - rw = rq_data_dir(rq); 1152 - if (!rq->q->mq_ops) { 1153 - blk_end_request_all(rq, tio->error); 1154 - rq_completed(tio->md, rw, false); 1155 - free_old_rq_tio(tio); 1156 - } else { 1157 - blk_mq_end_request(rq, tio->error); 1158 - rq_completed(tio->md, rw, false); 1159 - } 1160 - return; 1161 - } 1162 - 1163 - if (rq->cmd_flags & REQ_FAILED) 1164 - mapped = false; 1165 - 1166 - dm_done(clone, tio->error, mapped); 1167 - } 1168 - 1169 - /* 1170 - * Complete the clone and the original request with the error status 1171 - * through softirq context. 1172 - */ 1173 - static void dm_complete_request(struct request *rq, int error) 1174 - { 1175 - struct dm_rq_target_io *tio = tio_from_request(rq); 1176 - 1177 - tio->error = error; 1178 - if (!rq->q->mq_ops) 1179 - blk_complete_request(rq); 1180 - else 1181 - blk_mq_complete_request(rq, error); 1182 - } 1183 - 1184 - /* 1185 - * Complete the not-mapped clone and the original request with the error status 1186 - * through softirq context. 1187 - * Target's rq_end_io() function isn't called. 1188 - * This may be used when the target's map_rq() or clone_and_map_rq() functions fail. 
1189 - */ 1190 - static void dm_kill_unmapped_request(struct request *rq, int error) 1191 - { 1192 - rq->cmd_flags |= REQ_FAILED; 1193 - dm_complete_request(rq, error); 1194 - } 1195 - 1196 - /* 1197 - * Called with the clone's queue lock held (in the case of .request_fn) 1198 - */ 1199 - static void end_clone_request(struct request *clone, int error) 1200 - { 1201 - struct dm_rq_target_io *tio = clone->end_io_data; 1202 - 1203 - if (!clone->q->mq_ops) { 1204 - /* 1205 - * For just cleaning up the information of the queue in which 1206 - * the clone was dispatched. 1207 - * The clone is *NOT* freed actually here because it is alloced 1208 - * from dm own mempool (REQ_ALLOCED isn't set). 1209 - */ 1210 - __blk_put_request(clone->q, clone); 1211 - } 1212 - 1213 - /* 1214 - * Actual request completion is done in a softirq context which doesn't 1215 - * hold the clone's queue lock. Otherwise, deadlock could occur because: 1216 - * - another request may be submitted by the upper level driver 1217 - * of the stacking during the completion 1218 - * - the submission which requires queue lock may be done 1219 - * against this clone's queue 1220 - */ 1221 - dm_complete_request(tio->orig, error); 1222 1062 } 1223 1063 1224 1064 /* ··· 904 1474 return 0; 905 1475 } 906 1476 EXPORT_SYMBOL_GPL(dm_set_target_max_io_len); 1477 + 1478 + static long dm_blk_direct_access(struct block_device *bdev, sector_t sector, 1479 + void __pmem **kaddr, pfn_t *pfn, long size) 1480 + { 1481 + struct mapped_device *md = bdev->bd_disk->private_data; 1482 + struct dm_table *map; 1483 + struct dm_target *ti; 1484 + int srcu_idx; 1485 + long len, ret = -EIO; 1486 + 1487 + map = dm_get_live_table(md, &srcu_idx); 1488 + if (!map) 1489 + goto out; 1490 + 1491 + ti = dm_table_find_target(map, sector); 1492 + if (!dm_target_is_valid(ti)) 1493 + goto out; 1494 + 1495 + len = max_io_len(sector, ti) << SECTOR_SHIFT; 1496 + size = min(len, size); 1497 + 1498 + if (ti->type->direct_access) 1499 + ret = 
ti->type->direct_access(ti, sector, kaddr, pfn, size); 1500 + out: 1501 + dm_put_live_table(md, srcu_idx); 1502 + return min(ret, size); 1503 + } 907 1504 908 1505 /* 909 1506 * A target may call dm_accept_partial_bio only from the map routine. It is ··· 1302 1845 return BLK_QC_T_NONE; 1303 1846 } 1304 1847 1305 - int dm_request_based(struct mapped_device *md) 1306 - { 1307 - return blk_queue_stackable(md->queue); 1308 - } 1309 - 1310 - static void dm_dispatch_clone_request(struct request *clone, struct request *rq) 1311 - { 1312 - int r; 1313 - 1314 - if (blk_queue_io_stat(clone->q)) 1315 - clone->cmd_flags |= REQ_IO_STAT; 1316 - 1317 - clone->start_time = jiffies; 1318 - r = blk_insert_cloned_request(clone->q, clone); 1319 - if (r) 1320 - /* must complete clone in terms of original request */ 1321 - dm_complete_request(rq, r); 1322 - } 1323 - 1324 - static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig, 1325 - void *data) 1326 - { 1327 - struct dm_rq_target_io *tio = data; 1328 - struct dm_rq_clone_bio_info *info = 1329 - container_of(bio, struct dm_rq_clone_bio_info, clone); 1330 - 1331 - info->orig = bio_orig; 1332 - info->tio = tio; 1333 - bio->bi_end_io = end_clone_bio; 1334 - 1335 - return 0; 1336 - } 1337 - 1338 - static int setup_clone(struct request *clone, struct request *rq, 1339 - struct dm_rq_target_io *tio, gfp_t gfp_mask) 1340 - { 1341 - int r; 1342 - 1343 - r = blk_rq_prep_clone(clone, rq, tio->md->bs, gfp_mask, 1344 - dm_rq_bio_constructor, tio); 1345 - if (r) 1346 - return r; 1347 - 1348 - clone->cmd = rq->cmd; 1349 - clone->cmd_len = rq->cmd_len; 1350 - clone->sense = rq->sense; 1351 - clone->end_io = end_clone_request; 1352 - clone->end_io_data = tio; 1353 - 1354 - tio->clone = clone; 1355 - 1356 - return 0; 1357 - } 1358 - 1359 - static struct request *clone_old_rq(struct request *rq, struct mapped_device *md, 1360 - struct dm_rq_target_io *tio, gfp_t gfp_mask) 1361 - { 1362 - /* 1363 - * Create clone for use with .request_fn 
request_queue 1364 - */ 1365 - struct request *clone; 1366 - 1367 - clone = alloc_old_clone_request(md, gfp_mask); 1368 - if (!clone) 1369 - return NULL; 1370 - 1371 - blk_rq_init(NULL, clone); 1372 - if (setup_clone(clone, rq, tio, gfp_mask)) { 1373 - /* -ENOMEM */ 1374 - free_old_clone_request(md, clone); 1375 - return NULL; 1376 - } 1377 - 1378 - return clone; 1379 - } 1380 - 1381 - static void map_tio_request(struct kthread_work *work); 1382 - 1383 - static void init_tio(struct dm_rq_target_io *tio, struct request *rq, 1384 - struct mapped_device *md) 1385 - { 1386 - tio->md = md; 1387 - tio->ti = NULL; 1388 - tio->clone = NULL; 1389 - tio->orig = rq; 1390 - tio->error = 0; 1391 - /* 1392 - * Avoid initializing info for blk-mq; it passes 1393 - * target-specific data through info.ptr 1394 - * (see: dm_mq_init_request) 1395 - */ 1396 - if (!md->init_tio_pdu) 1397 - memset(&tio->info, 0, sizeof(tio->info)); 1398 - if (md->kworker_task) 1399 - init_kthread_work(&tio->work, map_tio_request); 1400 - } 1401 - 1402 - static struct dm_rq_target_io *dm_old_prep_tio(struct request *rq, 1403 - struct mapped_device *md, 1404 - gfp_t gfp_mask) 1405 - { 1406 - struct dm_rq_target_io *tio; 1407 - int srcu_idx; 1408 - struct dm_table *table; 1409 - 1410 - tio = alloc_old_rq_tio(md, gfp_mask); 1411 - if (!tio) 1412 - return NULL; 1413 - 1414 - init_tio(tio, rq, md); 1415 - 1416 - table = dm_get_live_table(md, &srcu_idx); 1417 - /* 1418 - * Must clone a request if this .request_fn DM device 1419 - * is stacked on .request_fn device(s). 1420 - */ 1421 - if (!dm_table_mq_request_based(table)) { 1422 - if (!clone_old_rq(rq, md, tio, gfp_mask)) { 1423 - dm_put_live_table(md, srcu_idx); 1424 - free_old_rq_tio(tio); 1425 - return NULL; 1426 - } 1427 - } 1428 - dm_put_live_table(md, srcu_idx); 1429 - 1430 - return tio; 1431 - } 1432 - 1433 - /* 1434 - * Called with the queue lock held. 
1435 - */ 1436 - static int dm_old_prep_fn(struct request_queue *q, struct request *rq) 1437 - { 1438 - struct mapped_device *md = q->queuedata; 1439 - struct dm_rq_target_io *tio; 1440 - 1441 - if (unlikely(rq->special)) { 1442 - DMWARN("Already has something in rq->special."); 1443 - return BLKPREP_KILL; 1444 - } 1445 - 1446 - tio = dm_old_prep_tio(rq, md, GFP_ATOMIC); 1447 - if (!tio) 1448 - return BLKPREP_DEFER; 1449 - 1450 - rq->special = tio; 1451 - rq->cmd_flags |= REQ_DONTPREP; 1452 - 1453 - return BLKPREP_OK; 1454 - } 1455 - 1456 - /* 1457 - * Returns: 1458 - * 0 : the request has been processed 1459 - * DM_MAPIO_REQUEUE : the original request needs to be requeued 1460 - * < 0 : the request was completed due to failure 1461 - */ 1462 - static int map_request(struct dm_rq_target_io *tio, struct request *rq, 1463 - struct mapped_device *md) 1464 - { 1465 - int r; 1466 - struct dm_target *ti = tio->ti; 1467 - struct request *clone = NULL; 1468 - 1469 - if (tio->clone) { 1470 - clone = tio->clone; 1471 - r = ti->type->map_rq(ti, clone, &tio->info); 1472 - } else { 1473 - r = ti->type->clone_and_map_rq(ti, rq, &tio->info, &clone); 1474 - if (r < 0) { 1475 - /* The target wants to complete the I/O */ 1476 - dm_kill_unmapped_request(rq, r); 1477 - return r; 1478 - } 1479 - if (r != DM_MAPIO_REMAPPED) 1480 - return r; 1481 - if (setup_clone(clone, rq, tio, GFP_ATOMIC)) { 1482 - /* -ENOMEM */ 1483 - ti->type->release_clone_rq(clone); 1484 - return DM_MAPIO_REQUEUE; 1485 - } 1486 - } 1487 - 1488 - switch (r) { 1489 - case DM_MAPIO_SUBMITTED: 1490 - /* The target has taken the I/O to submit by itself later */ 1491 - break; 1492 - case DM_MAPIO_REMAPPED: 1493 - /* The target has remapped the I/O so dispatch it */ 1494 - trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)), 1495 - blk_rq_pos(rq)); 1496 - dm_dispatch_clone_request(clone, rq); 1497 - break; 1498 - case DM_MAPIO_REQUEUE: 1499 - /* The target wants to requeue the I/O */ 1500 - 
dm_requeue_original_request(md, tio->orig); 1501 - break; 1502 - default: 1503 - if (r > 0) { 1504 - DMWARN("unimplemented target map return value: %d", r); 1505 - BUG(); 1506 - } 1507 - 1508 - /* The target wants to complete the I/O */ 1509 - dm_kill_unmapped_request(rq, r); 1510 - return r; 1511 - } 1512 - 1513 - return 0; 1514 - } 1515 - 1516 - static void map_tio_request(struct kthread_work *work) 1517 - { 1518 - struct dm_rq_target_io *tio = container_of(work, struct dm_rq_target_io, work); 1519 - struct request *rq = tio->orig; 1520 - struct mapped_device *md = tio->md; 1521 - 1522 - if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE) 1523 - dm_requeue_original_request(md, rq); 1524 - } 1525 - 1526 - static void dm_start_request(struct mapped_device *md, struct request *orig) 1527 - { 1528 - if (!orig->q->mq_ops) 1529 - blk_start_request(orig); 1530 - else 1531 - blk_mq_start_request(orig); 1532 - atomic_inc(&md->pending[rq_data_dir(orig)]); 1533 - 1534 - if (md->seq_rq_merge_deadline_usecs) { 1535 - md->last_rq_pos = rq_end_sector(orig); 1536 - md->last_rq_rw = rq_data_dir(orig); 1537 - md->last_rq_start_time = ktime_get(); 1538 - } 1539 - 1540 - if (unlikely(dm_stats_used(&md->stats))) { 1541 - struct dm_rq_target_io *tio = tio_from_request(orig); 1542 - tio->duration_jiffies = jiffies; 1543 - tio->n_sectors = blk_rq_sectors(orig); 1544 - dm_stats_account_io(&md->stats, rq_data_dir(orig), 1545 - blk_rq_pos(orig), tio->n_sectors, false, 0, 1546 - &tio->stats_aux); 1547 - } 1548 - 1549 - /* 1550 - * Hold the md reference here for the in-flight I/O. 1551 - * We can't rely on the reference count by device opener, 1552 - * because the device may be closed during the request completion 1553 - * when all bios are completed. 1554 - * See the comment in rq_completed() too. 
1555 - */ 1556 - dm_get(md); 1557 - } 1558 - 1559 - #define MAX_SEQ_RQ_MERGE_DEADLINE_USECS 100000 1560 - 1561 - ssize_t dm_attr_rq_based_seq_io_merge_deadline_show(struct mapped_device *md, char *buf) 1562 - { 1563 - return sprintf(buf, "%u\n", md->seq_rq_merge_deadline_usecs); 1564 - } 1565 - 1566 - ssize_t dm_attr_rq_based_seq_io_merge_deadline_store(struct mapped_device *md, 1567 - const char *buf, size_t count) 1568 - { 1569 - unsigned deadline; 1570 - 1571 - if (!dm_request_based(md) || md->use_blk_mq) 1572 - return count; 1573 - 1574 - if (kstrtouint(buf, 10, &deadline)) 1575 - return -EINVAL; 1576 - 1577 - if (deadline > MAX_SEQ_RQ_MERGE_DEADLINE_USECS) 1578 - deadline = MAX_SEQ_RQ_MERGE_DEADLINE_USECS; 1579 - 1580 - md->seq_rq_merge_deadline_usecs = deadline; 1581 - 1582 - return count; 1583 - } 1584 - 1585 - static bool dm_request_peeked_before_merge_deadline(struct mapped_device *md) 1586 - { 1587 - ktime_t kt_deadline; 1588 - 1589 - if (!md->seq_rq_merge_deadline_usecs) 1590 - return false; 1591 - 1592 - kt_deadline = ns_to_ktime((u64)md->seq_rq_merge_deadline_usecs * NSEC_PER_USEC); 1593 - kt_deadline = ktime_add_safe(md->last_rq_start_time, kt_deadline); 1594 - 1595 - return !ktime_after(ktime_get(), kt_deadline); 1596 - } 1597 - 1598 - /* 1599 - * q->request_fn for request-based dm. 1600 - * Called with the queue lock held. 
1601 - */ 1602 - static void dm_request_fn(struct request_queue *q) 1603 - { 1604 - struct mapped_device *md = q->queuedata; 1605 - struct dm_target *ti = md->immutable_target; 1606 - struct request *rq; 1607 - struct dm_rq_target_io *tio; 1608 - sector_t pos = 0; 1609 - 1610 - if (unlikely(!ti)) { 1611 - int srcu_idx; 1612 - struct dm_table *map = dm_get_live_table(md, &srcu_idx); 1613 - 1614 - ti = dm_table_find_target(map, pos); 1615 - dm_put_live_table(md, srcu_idx); 1616 - } 1617 - 1618 - /* 1619 - * For suspend, check blk_queue_stopped() and increment 1620 - * ->pending within a single queue_lock not to increment the 1621 - * number of in-flight I/Os after the queue is stopped in 1622 - * dm_suspend(). 1623 - */ 1624 - while (!blk_queue_stopped(q)) { 1625 - rq = blk_peek_request(q); 1626 - if (!rq) 1627 - return; 1628 - 1629 - /* always use block 0 to find the target for flushes for now */ 1630 - pos = 0; 1631 - if (req_op(rq) != REQ_OP_FLUSH) 1632 - pos = blk_rq_pos(rq); 1633 - 1634 - if ((dm_request_peeked_before_merge_deadline(md) && 1635 - md_in_flight(md) && rq->bio && rq->bio->bi_vcnt == 1 && 1636 - md->last_rq_pos == pos && md->last_rq_rw == rq_data_dir(rq)) || 1637 - (ti->type->busy && ti->type->busy(ti))) { 1638 - blk_delay_queue(q, HZ / 100); 1639 - return; 1640 - } 1641 - 1642 - dm_start_request(md, rq); 1643 - 1644 - tio = tio_from_request(rq); 1645 - /* Establish tio->ti before queuing work (map_tio_request) */ 1646 - tio->ti = ti; 1647 - queue_kthread_work(&md->kworker, &tio->work); 1648 - BUG_ON(!irqs_disabled()); 1649 - } 1650 - } 1651 - 1652 1848 static int dm_any_congested(void *congested_data, int bdi_bits) 1653 1849 { 1654 1850 int r = bdi_bits; ··· 1379 2269 1380 2270 static void dm_wq_work(struct work_struct *work); 1381 2271 1382 - static void dm_init_md_queue(struct mapped_device *md) 2272 + void dm_init_md_queue(struct mapped_device *md) 1383 2273 { 1384 2274 /* 1385 2275 * Request-based dm devices cannot be stacked on top of 
bio-based dm ··· 1400 2290 md->queue->backing_dev_info.congested_data = md; 1401 2291 } 1402 2292 1403 - static void dm_init_normal_md_queue(struct mapped_device *md) 2293 + void dm_init_normal_md_queue(struct mapped_device *md) 1404 2294 { 1405 2295 md->use_blk_mq = false; 1406 2296 dm_init_md_queue(md); ··· 1440 2330 bdput(md->bdev); 1441 2331 md->bdev = NULL; 1442 2332 } 2333 + 2334 + dm_mq_cleanup_mapped_device(md); 1443 2335 } 1444 2336 1445 2337 /* ··· 1475 2363 goto bad_io_barrier; 1476 2364 1477 2365 md->numa_node_id = numa_node_id; 1478 - md->use_blk_mq = use_blk_mq; 2366 + md->use_blk_mq = dm_use_blk_mq_default(); 1479 2367 md->init_tio_pdu = false; 1480 2368 md->type = DM_TYPE_NONE; 1481 2369 mutex_init(&md->suspend_lock); ··· 1560 2448 unlock_fs(md); 1561 2449 1562 2450 cleanup_mapped_device(md); 1563 - if (md->tag_set) { 1564 - blk_mq_free_tag_set(md->tag_set); 1565 - kfree(md->tag_set); 1566 - } 1567 2451 1568 2452 free_table_devices(&md->table_devices); 1569 2453 dm_stats_cleanup(&md->stats); ··· 1575 2467 1576 2468 if (md->bs) { 1577 2469 /* The md already has necessary mempools. */ 1578 - if (dm_table_get_type(t) == DM_TYPE_BIO_BASED) { 2470 + if (dm_table_bio_based(t)) { 1579 2471 /* 1580 2472 * Reload bioset because front_pad may have changed 1581 2473 * because a different table was loaded. ··· 1765 2657 } 1766 2658 EXPORT_SYMBOL_GPL(dm_get_queue_limits); 1767 2659 1768 - static void dm_old_init_rq_based_worker_thread(struct mapped_device *md) 1769 - { 1770 - /* Initialize the request-based DM worker thread */ 1771 - init_kthread_worker(&md->kworker); 1772 - md->kworker_task = kthread_run(kthread_worker_fn, &md->kworker, 1773 - "kdmwork-%s", dm_device_name(md)); 1774 - } 1775 - 1776 - /* 1777 - * Fully initialize a .request_fn request-based queue. 
1778 - */ 1779 - static int dm_old_init_request_queue(struct mapped_device *md) 1780 - { 1781 - /* Fully initialize the queue */ 1782 - if (!blk_init_allocated_queue(md->queue, dm_request_fn, NULL)) 1783 - return -EINVAL; 1784 - 1785 - /* disable dm_request_fn's merge heuristic by default */ 1786 - md->seq_rq_merge_deadline_usecs = 0; 1787 - 1788 - dm_init_normal_md_queue(md); 1789 - blk_queue_softirq_done(md->queue, dm_softirq_done); 1790 - blk_queue_prep_rq(md->queue, dm_old_prep_fn); 1791 - 1792 - dm_old_init_rq_based_worker_thread(md); 1793 - 1794 - elv_register_queue(md->queue); 1795 - 1796 - return 0; 1797 - } 1798 - 1799 - static int dm_mq_init_request(void *data, struct request *rq, 1800 - unsigned int hctx_idx, unsigned int request_idx, 1801 - unsigned int numa_node) 1802 - { 1803 - struct mapped_device *md = data; 1804 - struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq); 1805 - 1806 - /* 1807 - * Must initialize md member of tio, otherwise it won't 1808 - * be available in dm_mq_queue_rq. 
1809 - */ 1810 - tio->md = md; 1811 - 1812 - if (md->init_tio_pdu) { 1813 - /* target-specific per-io data is immediately after the tio */ 1814 - tio->info.ptr = tio + 1; 1815 - } 1816 - 1817 - return 0; 1818 - } 1819 - 1820 - static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx, 1821 - const struct blk_mq_queue_data *bd) 1822 - { 1823 - struct request *rq = bd->rq; 1824 - struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq); 1825 - struct mapped_device *md = tio->md; 1826 - struct dm_target *ti = md->immutable_target; 1827 - 1828 - if (unlikely(!ti)) { 1829 - int srcu_idx; 1830 - struct dm_table *map = dm_get_live_table(md, &srcu_idx); 1831 - 1832 - ti = dm_table_find_target(map, 0); 1833 - dm_put_live_table(md, srcu_idx); 1834 - } 1835 - 1836 - if (ti->type->busy && ti->type->busy(ti)) 1837 - return BLK_MQ_RQ_QUEUE_BUSY; 1838 - 1839 - dm_start_request(md, rq); 1840 - 1841 - /* Init tio using md established in .init_request */ 1842 - init_tio(tio, rq, md); 1843 - 1844 - /* 1845 - * Establish tio->ti before queuing work (map_tio_request) 1846 - * or making direct call to map_request(). 
1847 - */ 1848 - tio->ti = ti; 1849 - 1850 - /* Direct call is fine since .queue_rq allows allocations */ 1851 - if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE) { 1852 - /* Undo dm_start_request() before requeuing */ 1853 - rq_end_stats(md, rq); 1854 - rq_completed(md, rq_data_dir(rq), false); 1855 - return BLK_MQ_RQ_QUEUE_BUSY; 1856 - } 1857 - 1858 - return BLK_MQ_RQ_QUEUE_OK; 1859 - } 1860 - 1861 - static struct blk_mq_ops dm_mq_ops = { 1862 - .queue_rq = dm_mq_queue_rq, 1863 - .map_queue = blk_mq_map_queue, 1864 - .complete = dm_softirq_done, 1865 - .init_request = dm_mq_init_request, 1866 - }; 1867 - 1868 - static int dm_mq_init_request_queue(struct mapped_device *md, 1869 - struct dm_target *immutable_tgt) 1870 - { 1871 - struct request_queue *q; 1872 - int err; 1873 - 1874 - if (dm_get_md_type(md) == DM_TYPE_REQUEST_BASED) { 1875 - DMERR("request-based dm-mq may only be stacked on blk-mq device(s)"); 1876 - return -EINVAL; 1877 - } 1878 - 1879 - md->tag_set = kzalloc_node(sizeof(struct blk_mq_tag_set), GFP_KERNEL, md->numa_node_id); 1880 - if (!md->tag_set) 1881 - return -ENOMEM; 1882 - 1883 - md->tag_set->ops = &dm_mq_ops; 1884 - md->tag_set->queue_depth = dm_get_blk_mq_queue_depth(); 1885 - md->tag_set->numa_node = md->numa_node_id; 1886 - md->tag_set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; 1887 - md->tag_set->nr_hw_queues = dm_get_blk_mq_nr_hw_queues(); 1888 - md->tag_set->driver_data = md; 1889 - 1890 - md->tag_set->cmd_size = sizeof(struct dm_rq_target_io); 1891 - if (immutable_tgt && immutable_tgt->per_io_data_size) { 1892 - /* any target-specific per-io data is immediately after the tio */ 1893 - md->tag_set->cmd_size += immutable_tgt->per_io_data_size; 1894 - md->init_tio_pdu = true; 1895 - } 1896 - 1897 - err = blk_mq_alloc_tag_set(md->tag_set); 1898 - if (err) 1899 - goto out_kfree_tag_set; 1900 - 1901 - q = blk_mq_init_allocated_queue(md->tag_set, md->queue); 1902 - if (IS_ERR(q)) { 1903 - err = PTR_ERR(q); 1904 - goto out_tag_set; 
1905 - } 1906 - dm_init_md_queue(md); 1907 - 1908 - /* backfill 'mq' sysfs registration normally done in blk_register_queue */ 1909 - blk_mq_register_disk(md->disk); 1910 - 1911 - return 0; 1912 - 1913 - out_tag_set: 1914 - blk_mq_free_tag_set(md->tag_set); 1915 - out_kfree_tag_set: 1916 - kfree(md->tag_set); 1917 - 1918 - return err; 1919 - } 1920 - 1921 - static unsigned filter_md_type(unsigned type, struct mapped_device *md) 1922 - { 1923 - if (type == DM_TYPE_BIO_BASED) 1924 - return type; 1925 - 1926 - return !md->use_blk_mq ? DM_TYPE_REQUEST_BASED : DM_TYPE_MQ_REQUEST_BASED; 1927 - } 1928 - 1929 2660 /* 1930 2661 * Setup the DM device's queue based on md's type 1931 2662 */ 1932 2663 int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) 1933 2664 { 1934 2665 int r; 1935 - unsigned md_type = filter_md_type(dm_get_md_type(md), md); 2666 + unsigned type = dm_get_md_type(md); 1936 2667 1937 - switch (md_type) { 2668 + switch (type) { 1938 2669 case DM_TYPE_REQUEST_BASED: 1939 2670 r = dm_old_init_request_queue(md); 1940 2671 if (r) { ··· 1782 2835 } 1783 2836 break; 1784 2837 case DM_TYPE_MQ_REQUEST_BASED: 1785 - r = dm_mq_init_request_queue(md, dm_table_get_immutable_target(t)); 2838 + r = dm_mq_init_request_queue(md, t); 1786 2839 if (r) { 1787 2840 DMERR("Cannot initialize queue for request-based dm-mq mapped device"); 1788 2841 return r; 1789 2842 } 1790 2843 break; 1791 2844 case DM_TYPE_BIO_BASED: 2845 + case DM_TYPE_DAX_BIO_BASED: 1792 2846 dm_init_normal_md_queue(md); 1793 2847 blk_queue_make_request(md->queue, dm_make_request); 1794 2848 /* ··· 1798 2850 */ 1799 2851 bioset_free(md->queue->bio_split); 1800 2852 md->queue->bio_split = NULL; 2853 + 2854 + if (type == DM_TYPE_DAX_BIO_BASED) 2855 + queue_flag_set_unlocked(QUEUE_FLAG_DAX, md->queue); 1801 2856 break; 1802 2857 } 1803 2858 ··· 2495 3544 if (!pools) 2496 3545 return NULL; 2497 3546 2498 - type = filter_md_type(type, md); 2499 - 2500 3547 switch (type) { 2501 3548 case 
DM_TYPE_BIO_BASED: 3549 + case DM_TYPE_DAX_BIO_BASED: 2502 3550 cachep = _io_cache; 2503 3551 pool_size = dm_get_reserved_bio_based_ios(); 2504 3552 front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone); ··· 2554 3604 kfree(pools); 2555 3605 } 2556 3606 3607 + struct dm_pr { 3608 + u64 old_key; 3609 + u64 new_key; 3610 + u32 flags; 3611 + bool fail_early; 3612 + }; 3613 + 3614 + static int dm_call_pr(struct block_device *bdev, iterate_devices_callout_fn fn, 3615 + void *data) 3616 + { 3617 + struct mapped_device *md = bdev->bd_disk->private_data; 3618 + struct dm_table *table; 3619 + struct dm_target *ti; 3620 + int ret = -ENOTTY, srcu_idx; 3621 + 3622 + table = dm_get_live_table(md, &srcu_idx); 3623 + if (!table || !dm_table_get_size(table)) 3624 + goto out; 3625 + 3626 + /* We only support devices that have a single target */ 3627 + if (dm_table_get_num_targets(table) != 1) 3628 + goto out; 3629 + ti = dm_table_get_target(table, 0); 3630 + 3631 + ret = -EINVAL; 3632 + if (!ti->type->iterate_devices) 3633 + goto out; 3634 + 3635 + ret = ti->type->iterate_devices(ti, fn, data); 3636 + out: 3637 + dm_put_live_table(md, srcu_idx); 3638 + return ret; 3639 + } 3640 + 3641 + /* 3642 + * For register / unregister we need to manually call out to every path. 
3643 + */ 3644 + static int __dm_pr_register(struct dm_target *ti, struct dm_dev *dev, 3645 + sector_t start, sector_t len, void *data) 3646 + { 3647 + struct dm_pr *pr = data; 3648 + const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops; 3649 + 3650 + if (!ops || !ops->pr_register) 3651 + return -EOPNOTSUPP; 3652 + return ops->pr_register(dev->bdev, pr->old_key, pr->new_key, pr->flags); 3653 + } 3654 + 2557 3655 static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key, 2558 3656 u32 flags) 2559 3657 { 2560 - struct mapped_device *md = bdev->bd_disk->private_data; 2561 - const struct pr_ops *ops; 2562 - fmode_t mode; 2563 - int r; 3658 + struct dm_pr pr = { 3659 + .old_key = old_key, 3660 + .new_key = new_key, 3661 + .flags = flags, 3662 + .fail_early = true, 3663 + }; 3664 + int ret; 2564 3665 2565 - r = dm_grab_bdev_for_ioctl(md, &bdev, &mode); 2566 - if (r < 0) 2567 - return r; 3666 + ret = dm_call_pr(bdev, __dm_pr_register, &pr); 3667 + if (ret && new_key) { 3668 + /* unregister all paths if we failed to register any path */ 3669 + pr.old_key = new_key; 3670 + pr.new_key = 0; 3671 + pr.flags = 0; 3672 + pr.fail_early = false; 3673 + dm_call_pr(bdev, __dm_pr_register, &pr); 3674 + } 2568 3675 2569 - ops = bdev->bd_disk->fops->pr_ops; 2570 - if (ops && ops->pr_register) 2571 - r = ops->pr_register(bdev, old_key, new_key, flags); 2572 - else 2573 - r = -EOPNOTSUPP; 2574 - 2575 - bdput(bdev); 2576 - return r; 3676 + return ret; 2577 3677 } 2578 3678 2579 3679 static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type, ··· 2724 3724 .open = dm_blk_open, 2725 3725 .release = dm_blk_close, 2726 3726 .ioctl = dm_blk_ioctl, 3727 + .direct_access = dm_blk_direct_access, 2727 3728 .getgeo = dm_blk_getgeo, 2728 3729 .pr_ops = &dm_pr_ops, 2729 3730 .owner = THIS_MODULE ··· 2741 3740 2742 3741 module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR); 2743 3742 MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in 
bio-based mempools"); 2744 - 2745 - module_param(reserved_rq_based_ios, uint, S_IRUGO | S_IWUSR); 2746 - MODULE_PARM_DESC(reserved_rq_based_ios, "Reserved IOs in request-based mempools"); 2747 - 2748 - module_param(use_blk_mq, bool, S_IRUGO | S_IWUSR); 2749 - MODULE_PARM_DESC(use_blk_mq, "Use block multiqueue for request-based DM devices"); 2750 - 2751 - module_param(dm_mq_nr_hw_queues, uint, S_IRUGO | S_IWUSR); 2752 - MODULE_PARM_DESC(dm_mq_nr_hw_queues, "Number of hardware queues for request-based dm-mq devices"); 2753 - 2754 - module_param(dm_mq_queue_depth, uint, S_IRUGO | S_IWUSR); 2755 - MODULE_PARM_DESC(dm_mq_queue_depth, "Queue depth for request-based dm-mq devices"); 2756 3743 2757 3744 module_param(dm_numa_node, int, S_IRUGO | S_IWUSR); 2758 3745 MODULE_PARM_DESC(dm_numa_node, "NUMA node for DM device memory allocations");
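The new dm_pr_register() above registers the key on every underlying path and, if any path fails, rolls back by re-registering new_key back to zero on all paths. That rollback logic can be sketched as a userspace model; the `struct path` type, `path_pr_register()` stand-in, and the simulated error code are illustrative, not kernel API, and the sketch registers all paths unconditionally rather than short-circuiting the way iterate_devices with fail_early would.

```c
#include <assert.h>
#include <stddef.h>

/* Illustrative stand-in for one path's pr_register(); registering
 * new_key == 0 clears the registration.  'fail' simulates a path
 * whose PERSISTENT RESERVE OUT command errors. */
struct path { unsigned long long key; int fail; };

int path_pr_register(struct path *p, unsigned long long old_key,
		     unsigned long long new_key)
{
	(void)old_key;
	if (p->fail)
		return -5;		/* stand-in for -EIO */
	p->key = new_key;
	return 0;
}

/* Mirror of the dm_pr_register() flow: try new_key on every path;
 * on any failure, unregister (old_key = new_key, new_key = 0) on
 * all paths, ignoring further errors, and report the first error. */
int dm_pr_register_sketch(struct path *paths, size_t n,
			  unsigned long long new_key)
{
	int ret = 0;
	size_t i;

	for (i = 0; i < n; i++) {
		int r = path_pr_register(&paths[i], 0, new_key);
		if (r && !ret)
			ret = r;
	}
	if (ret && new_key)
		for (i = 0; i < n; i++)
			path_pr_register(&paths[i], new_key, 0);
	return ret;
}
```

With one bad path out of three, the call fails and the paths that did register are cleared again, so no path is left with a half-applied registration.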
drivers/md/dm.h (+4 -32)

 #include <linux/fs.h>
 #include <linux/device-mapper.h>
 #include <linux/list.h>
+#include <linux/moduleparam.h>
 #include <linux/blkdev.h>
 #include <linux/backing-dev.h>
 #include <linux/hdreg.h>
···
  * Status feature flags
  */
 #define DM_STATUS_NOFLUSH_FLAG		(1 << 0)
-
-/*
- * Type of table and mapped_device's mempool
- */
-#define DM_TYPE_NONE			0
-#define DM_TYPE_BIO_BASED		1
-#define DM_TYPE_REQUEST_BASED		2
-#define DM_TYPE_MQ_REQUEST_BASED	3
 
 /*
  * List of devices that a metadevice uses and should open/close.
···
 struct target_type *dm_table_get_immutable_target_type(struct dm_table *t);
 struct dm_target *dm_table_get_immutable_target(struct dm_table *t);
 struct dm_target *dm_table_get_wildcard_target(struct dm_table *t);
+bool dm_table_bio_based(struct dm_table *t);
 bool dm_table_request_based(struct dm_table *t);
-bool dm_table_mq_request_based(struct dm_table *t);
+bool dm_table_all_blk_mq_devices(struct dm_table *t);
 void dm_table_free_md_mempools(struct dm_table *t);
 struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t);
···
 /*
  * sysfs interface
  */
-struct dm_kobject_holder {
-	struct kobject kobj;
-	struct completion completion;
-};
-
-static inline struct completion *dm_get_completion_from_kobject(struct kobject *kobj)
-{
-	return &container_of(kobj, struct dm_kobject_holder, kobj)->completion;
-}
-
 int dm_sysfs_init(struct mapped_device *md);
 void dm_sysfs_exit(struct mapped_device *md);
 struct kobject *dm_kobject(struct mapped_device *md);
···
 void dm_internal_suspend(struct mapped_device *md);
 void dm_internal_resume(struct mapped_device *md);
 
-bool dm_use_blk_mq(struct mapped_device *md);
-
 int dm_io_init(void);
 void dm_io_exit(void);
···
 void dm_free_md_mempools(struct dm_md_mempools *pools);
 
 /*
- * Helpers that are used by DM core
+ * Various helpers
  */
 unsigned dm_get_reserved_bio_based_ios(void);
-unsigned dm_get_reserved_rq_based_ios(void);
-
-static inline bool dm_message_test_buffer_overflow(char *result, unsigned maxlen)
-{
-	return !maxlen || strlen(result) + 1 >= maxlen;
-}
-
-ssize_t dm_attr_rq_based_seq_io_merge_deadline_show(struct mapped_device *md, char *buf);
-ssize_t dm_attr_rq_based_seq_io_merge_deadline_store(struct mapped_device *md,
-						     const char *buf, size_t count);
 
 #endif
+8 -1
drivers/md/persistent-data/dm-btree.c
@@ -429,7 +429,14 @@
 
 	if (flags & INTERNAL_NODE) {
 		i = lower_bound(n, key);
-		if (i < 0 || i >= nr_entries) {
+		if (i < 0) {
+			/*
+			 * avoid early -ENODATA return when all entries are
+			 * higher than the search @key.
+			 */
+			i = 0;
+		}
+		if (i >= nr_entries) {
 			r = -ENODATA;
 			goto out;
 		}
+1 -2
drivers/scsi/sd.c
@@ -1619,8 +1619,7 @@
 		return -EOPNOTSUPP;
 	return sd_pr_command(bdev, (flags & PR_FL_IGNORE_KEY) ? 0x06 : 0x00,
 			old_key, new_key, 0,
-			(1 << 0) /* APTPL */ |
-			(1 << 2) /* ALL_TG_PT */);
+			(1 << 0) /* APTPL */);
 }
 
 static int sd_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type,
+26
include/linux/device-mapper.h
@@ -19,6 +19,15 @@
 struct mapped_device;
 struct bio_vec;
 
+/*
+ * Type of table, mapped_device's mempool and request_queue
+ */
+#define DM_TYPE_NONE			0
+#define DM_TYPE_BIO_BASED		1
+#define DM_TYPE_REQUEST_BASED		2
+#define DM_TYPE_MQ_REQUEST_BASED	3
+#define DM_TYPE_DAX_BIO_BASED		4
+
 typedef enum { STATUSTYPE_INFO, STATUSTYPE_TABLE } status_type_t;
 
 union map_info {
@@ -116,6 +125,14 @@
  */
 typedef int (*dm_busy_fn) (struct dm_target *ti);
 
+/*
+ * Returns:
+ *    < 0 : error
+ *    >= 0 : the number of bytes accessible at the address
+ */
+typedef long (*dm_direct_access_fn) (struct dm_target *ti, sector_t sector,
+				     void __pmem **kaddr, pfn_t *pfn, long size);
+
 void dm_error(const char *message);
 
 struct dm_dev {
@@ -162,7 +179,8 @@
 	dm_busy_fn busy;
 	dm_iterate_devices_fn iterate_devices;
 	dm_io_hints_fn io_hints;
+	dm_direct_access_fn direct_access;
 
 	/* For internal device-mapper use. */
 	struct list_head list;
@@ -442,6 +460,14 @@
  * Target_ctr should call this if it needs to add any callbacks.
  */
 void dm_table_add_target_callbacks(struct dm_table *t, struct dm_target_callbacks *cb);
+
+/*
+ * Target can use this to set the table's type.
+ * Can only ever be called from a target's ctr.
+ * Useful for "hybrid" target (supports both bio-based
+ * and request-based).
+ */
+void dm_table_set_type(struct dm_table *t, unsigned type);
 
 /*
  * Finally call this to make the table ready for use.
+2 -2
include/uapi/linux/dm-ioctl.h
@@ -267,9 +267,9 @@
 #define DM_DEV_SET_GEOMETRY	_IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl)
 
 #define DM_VERSION_MAJOR	4
-#define DM_VERSION_MINOR	34
+#define DM_VERSION_MINOR	35
 #define DM_VERSION_PATCHLEVEL	0
-#define DM_VERSION_EXTRA	"-ioctl (2015-10-28)"
+#define DM_VERSION_EXTRA	"-ioctl (2016-06-23)"
 
 /* Status bits */
 #define DM_READONLY_FLAG	(1 << 0) /* In/Out */