Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
drivers/md/dm-switch.c at v3.14 (538 lines, 14 kB)

/*
 * Copyright (C) 2010-2012 by Dell Inc. All rights reserved.
 * Copyright (C) 2011-2013 Red Hat, Inc.
 *
 * This file is released under the GPL.
 *
 * dm-switch is a device-mapper target that maps IO to underlying block
 * devices efficiently when there are a large number of fixed-sized
 * address regions but there is no simple pattern to allow for a compact
 * mapping representation such as dm-stripe.
 */

#include <linux/device-mapper.h>

#include <linux/module.h>
#include <linux/init.h>
#include <linux/vmalloc.h>

#define DM_MSG_PREFIX "switch"

/*
 * One region_table_slot_t holds <region_entries_per_slot> region table
 * entries each of which is <region_table_entry_bits> in size.
 */
typedef unsigned long region_table_slot_t;
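/*
 * Worked packing example (assuming a 64-bit unsigned long): with
 * nr_paths == 4, the smallest width satisfying (1 << bits) >= nr_paths
 * is region_table_entry_bits == 2, giving region_entries_per_slot == 32.
 * Region 70 is then stored in slot 70 >> 5 == 2 at bit offset
 * (70 & 31) * 2 == 12, exactly as switch_get_position() computes below.
 */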
/*
 * A device with the offset to its start sector.
 */
struct switch_path {
	struct dm_dev *dmdev;
	sector_t start;
};

/*
 * Context block for a dm switch device.
 */
struct switch_ctx {
	struct dm_target *ti;

	unsigned nr_paths;		/* Number of paths in path_list. */

	unsigned region_size;		/* Region size in 512-byte sectors */
	unsigned long nr_regions;	/* Number of regions making up the device */
	signed char region_size_bits;	/* log2 of region_size or -1 */

	unsigned char region_table_entry_bits;	/* Number of bits in one region table entry */
	unsigned char region_entries_per_slot;	/* Number of entries in one region table slot */
	signed char region_entries_per_slot_bits;	/* log2 of region_entries_per_slot or -1 */

	region_table_slot_t *region_table;	/* Region table */

	/*
	 * Array of dm devices to switch between.
	 */
	struct switch_path path_list[0];
};

static struct switch_ctx *alloc_switch_ctx(struct dm_target *ti, unsigned nr_paths,
					   unsigned region_size)
{
	struct switch_ctx *sctx;

	sctx = kzalloc(sizeof(struct switch_ctx) + nr_paths * sizeof(struct switch_path),
		       GFP_KERNEL);
	if (!sctx)
		return NULL;

	sctx->ti = ti;
	sctx->region_size = region_size;

	ti->private = sctx;

	return sctx;
}

static int alloc_region_table(struct dm_target *ti, unsigned nr_paths)
{
	struct switch_ctx *sctx = ti->private;
	sector_t nr_regions = ti->len;
	sector_t nr_slots;

	if (!(sctx->region_size & (sctx->region_size - 1)))
		sctx->region_size_bits = __ffs(sctx->region_size);
	else
		sctx->region_size_bits = -1;

	sctx->region_table_entry_bits = 1;
	while (sctx->region_table_entry_bits < sizeof(region_table_slot_t) * 8 &&
	       (region_table_slot_t)1 << sctx->region_table_entry_bits < nr_paths)
		sctx->region_table_entry_bits++;

	sctx->region_entries_per_slot = (sizeof(region_table_slot_t) * 8) / sctx->region_table_entry_bits;
	if (!(sctx->region_entries_per_slot & (sctx->region_entries_per_slot - 1)))
		sctx->region_entries_per_slot_bits = __ffs(sctx->region_entries_per_slot);
	else
		sctx->region_entries_per_slot_bits = -1;

	if (sector_div(nr_regions, sctx->region_size))
		nr_regions++;

	sctx->nr_regions = nr_regions;
	if (sctx->nr_regions != nr_regions || sctx->nr_regions >= ULONG_MAX) {
		ti->error = "Region table too large";
		return -EINVAL;
	}

	nr_slots = nr_regions;
	if (sector_div(nr_slots, sctx->region_entries_per_slot))
		nr_slots++;

	if (nr_slots > ULONG_MAX / sizeof(region_table_slot_t)) {
		ti->error = "Region table too large";
		return -EINVAL;
	}

	sctx->region_table = vmalloc(nr_slots * sizeof(region_table_slot_t));
	if (!sctx->region_table) {
		ti->error = "Cannot allocate region table";
		return -ENOMEM;
	}

	return 0;
}
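/*
 * Sizing sketch (again assuming a 64-bit unsigned long): a 2 TiB
 * target has ti->len == 2^32 sectors; with region_size == 2048
 * sectors (1 MiB) that is 2^21 regions.  With two paths, one bit per
 * entry and 64 entries per slot, the table needs 2^15 slots, i.e.
 * 256 KiB of vmalloc'ed memory.
 */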
static void switch_get_position(struct switch_ctx *sctx, unsigned long region_nr,
				unsigned long *region_index, unsigned *bit)
{
	if (sctx->region_entries_per_slot_bits >= 0) {
		*region_index = region_nr >> sctx->region_entries_per_slot_bits;
		*bit = region_nr & (sctx->region_entries_per_slot - 1);
	} else {
		*region_index = region_nr / sctx->region_entries_per_slot;
		*bit = region_nr % sctx->region_entries_per_slot;
	}

	*bit *= sctx->region_table_entry_bits;
}

/*
 * Find which path to use at given offset.
 */
static unsigned switch_get_path_nr(struct switch_ctx *sctx, sector_t offset)
{
	unsigned long region_index;
	unsigned bit, path_nr;
	sector_t p;

	p = offset;
	if (sctx->region_size_bits >= 0)
		p >>= sctx->region_size_bits;
	else
		sector_div(p, sctx->region_size);

	switch_get_position(sctx, p, &region_index, &bit);
	path_nr = (ACCESS_ONCE(sctx->region_table[region_index]) >> bit) &
		((1 << sctx->region_table_entry_bits) - 1);

	/* This can only happen if the processor uses non-atomic stores. */
	if (unlikely(path_nr >= sctx->nr_paths))
		path_nr = 0;

	return path_nr;
}

static void switch_region_table_write(struct switch_ctx *sctx, unsigned long region_nr,
				      unsigned value)
{
	unsigned long region_index;
	unsigned bit;
	region_table_slot_t pte;

	switch_get_position(sctx, region_nr, &region_index, &bit);

	pte = sctx->region_table[region_index];
	pte &= ~((((region_table_slot_t)1 << sctx->region_table_entry_bits) - 1) << bit);
	pte |= (region_table_slot_t)value << bit;
	sctx->region_table[region_index] = pte;
}

/*
 * Fill the region table with an initial round robin pattern.
 */
static void initialise_region_table(struct switch_ctx *sctx)
{
	unsigned path_nr = 0;
	unsigned long region_nr;

	for (region_nr = 0; region_nr < sctx->nr_regions; region_nr++) {
		switch_region_table_write(sctx, region_nr, path_nr);
		if (++path_nr >= sctx->nr_paths)
			path_nr = 0;
	}
}

static int parse_path(struct dm_arg_set *as, struct dm_target *ti)
{
	struct switch_ctx *sctx = ti->private;
	unsigned long long start;
	int r;

	r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table),
			  &sctx->path_list[sctx->nr_paths].dmdev);
	if (r) {
		ti->error = "Device lookup failed";
		return r;
	}

	if (kstrtoull(dm_shift_arg(as), 10, &start) || start != (sector_t)start) {
		ti->error = "Invalid device starting offset";
		dm_put_device(ti, sctx->path_list[sctx->nr_paths].dmdev);
		return -EINVAL;
	}

	sctx->path_list[sctx->nr_paths].start = start;

	sctx->nr_paths++;

	return 0;
}

/*
 * Destructor: Don't free the dm_target, just the ti->private data (if any).
 */
static void switch_dtr(struct dm_target *ti)
{
	struct switch_ctx *sctx = ti->private;

	while (sctx->nr_paths--)
		dm_put_device(ti, sctx->path_list[sctx->nr_paths].dmdev);

	vfree(sctx->region_table);
	kfree(sctx);
}

/*
 * Constructor arguments:
 *   <num_paths> <region_size> <num_optional_args> [<optional_args>...]
 *   [<dev_path> <offset>]+
 *
 * Optional args are to allow for future extension: currently this
 * parameter must be 0.
 */
static int switch_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
	static struct dm_arg _args[] = {
		{1, (KMALLOC_MAX_SIZE - sizeof(struct switch_ctx)) / sizeof(struct switch_path), "Invalid number of paths"},
		{1, UINT_MAX, "Invalid region size"},
		{0, 0, "Invalid number of optional args"},
	};

	struct switch_ctx *sctx;
	struct dm_arg_set as;
	unsigned nr_paths, region_size, nr_optional_args;
	int r;

	as.argc = argc;
	as.argv = argv;

	r = dm_read_arg(_args, &as, &nr_paths, &ti->error);
	if (r)
		return -EINVAL;

	r = dm_read_arg(_args + 1, &as, &region_size, &ti->error);
	if (r)
		return r;

	r = dm_read_arg_group(_args + 2, &as, &nr_optional_args, &ti->error);
	if (r)
		return r;
	/* parse optional arguments here, if we add any */

	if (as.argc != nr_paths * 2) {
		ti->error = "Incorrect number of path arguments";
		return -EINVAL;
	}

	sctx = alloc_switch_ctx(ti, nr_paths, region_size);
	if (!sctx) {
		ti->error = "Cannot allocate redirection context";
		return -ENOMEM;
	}

	r = dm_set_target_max_io_len(ti, region_size);
	if (r)
		goto error;

	while (as.argc) {
		r = parse_path(&as, ti);
		if (r)
			goto error;
	}

	r = alloc_region_table(ti, nr_paths);
	if (r)
		goto error;

	initialise_region_table(sctx);

	/* For UNMAP, sending the request down any path is sufficient */
	ti->num_discard_bios = 1;

	return 0;

error:
	switch_dtr(ti);

	return r;
}
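/*
 * Example table line (device names are hypothetical):
 *
 *	dmsetup create switchdev --table \
 *		"0 4194304 switch 2 2048 0 /dev/sdb 0 /dev/sdc 0"
 *
 * creates a 2 GiB (4194304-sector) device that switches between two
 * paths in 1 MiB (2048-sector) regions, initially mapped round-robin
 * by initialise_region_table() above.
 */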
static int switch_map(struct dm_target *ti, struct bio *bio)
{
	struct switch_ctx *sctx = ti->private;
	sector_t offset = dm_target_offset(ti, bio->bi_iter.bi_sector);
	unsigned path_nr = switch_get_path_nr(sctx, offset);

	bio->bi_bdev = sctx->path_list[path_nr].dmdev->bdev;
	bio->bi_iter.bi_sector = sctx->path_list[path_nr].start + offset;

	return DM_MAPIO_REMAPPED;
}

/*
 * We need to parse hex numbers in the message as quickly as possible.
 *
 * This table-based hex parser improves performance: it reduces the time
 * to load 1000000 entries compared to the condition-based parser.
 *
 *		table-based parser	condition-based parser
 * PA-RISC	0.29s			0.31s
 * Opteron	0.0495s			0.0498s
 */
static const unsigned char hex_table[256] = {
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 255, 255, 255, 255, 255, 255,
255, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
};

static __always_inline unsigned long parse_hex(const char **string)
{
	unsigned char d;
	unsigned long r = 0;

	while ((d = hex_table[(unsigned char)**string]) < 16) {
		r = (r << 4) | d;
		(*string)++;
	}

	return r;
}

static int process_set_region_mappings(struct switch_ctx *sctx,
				       unsigned argc, char **argv)
{
	unsigned i;
	unsigned long region_index = 0;

	for (i = 1; i < argc; i++) {
		unsigned long path_nr;
		const char *string = argv[i];

		if (*string == ':')
			region_index++;
		else {
			region_index = parse_hex(&string);
			if (unlikely(*string != ':')) {
				DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
				return -EINVAL;
			}
		}

		string++;
		if (unlikely(!*string)) {
			DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
			return -EINVAL;
		}

		path_nr = parse_hex(&string);
		if (unlikely(*string)) {
			DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
			return -EINVAL;
		}
		if (unlikely(region_index >= sctx->nr_regions)) {
			DMWARN("invalid set_region_mappings region number: %lu >= %lu", region_index, sctx->nr_regions);
			return -EINVAL;
		}
		if (unlikely(path_nr >= sctx->nr_paths)) {
			DMWARN("invalid set_region_mappings device: %lu >= %u", path_nr, sctx->nr_paths);
			return -EINVAL;
		}

		switch_region_table_write(sctx, region_index, path_nr);
	}

	return 0;
}
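/*
 * Message syntax example (device name hypothetical): each argument is
 * either <region>:<path> or :<path> for "the next region", both in
 * hex, so
 *
 *	dmsetup message switchdev 0 set_region_mappings 0:1 :0 10:1
 *
 * maps region 0 to path 1, region 1 to path 0 and region 0x10 to
 * path 1.
 */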
/*
 * Messages are processed one-at-a-time.
 *
 * Only set_region_mappings is supported.
 */
static int switch_message(struct dm_target *ti, unsigned argc, char **argv)
{
	static DEFINE_MUTEX(message_mutex);

	struct switch_ctx *sctx = ti->private;
	int r = -EINVAL;

	mutex_lock(&message_mutex);

	if (!strcasecmp(argv[0], "set_region_mappings"))
		r = process_set_region_mappings(sctx, argc, argv);
	else
		DMWARN("Unrecognised message received.");

	mutex_unlock(&message_mutex);

	return r;
}

static void switch_status(struct dm_target *ti, status_type_t type,
			  unsigned status_flags, char *result, unsigned maxlen)
{
	struct switch_ctx *sctx = ti->private;
	unsigned sz = 0;
	int path_nr;

	switch (type) {
	case STATUSTYPE_INFO:
		result[0] = '\0';
		break;

	case STATUSTYPE_TABLE:
		DMEMIT("%u %u 0", sctx->nr_paths, sctx->region_size);
		for (path_nr = 0; path_nr < sctx->nr_paths; path_nr++)
			DMEMIT(" %s %llu", sctx->path_list[path_nr].dmdev->name,
			       (unsigned long long)sctx->path_list[path_nr].start);
		break;
	}
}

/*
 * Switch ioctl:
 *
 * Pass through all ioctls to the path for sector 0.
 */
static int switch_ioctl(struct dm_target *ti, unsigned cmd,
			unsigned long arg)
{
	struct switch_ctx *sctx = ti->private;
	struct block_device *bdev;
	fmode_t mode;
	unsigned path_nr;
	int r = 0;

	path_nr = switch_get_path_nr(sctx, 0);

	bdev = sctx->path_list[path_nr].dmdev->bdev;
	mode = sctx->path_list[path_nr].dmdev->mode;

	/*
	 * Only pass ioctls through if the device sizes match exactly.
	 */
	if (ti->len + sctx->path_list[path_nr].start != i_size_read(bdev->bd_inode) >> SECTOR_SHIFT)
		r = scsi_verify_blk_ioctl(NULL, cmd);

	return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg);
}

static int switch_iterate_devices(struct dm_target *ti,
				  iterate_devices_callout_fn fn, void *data)
{
	struct switch_ctx *sctx = ti->private;
	int path_nr;
	int r;

	for (path_nr = 0; path_nr < sctx->nr_paths; path_nr++) {
		r = fn(ti, sctx->path_list[path_nr].dmdev,
		       sctx->path_list[path_nr].start, ti->len, data);
		if (r)
			return r;
	}

	return 0;
}

static struct target_type switch_target = {
	.name = "switch",
	.version = {1, 0, 0},
	.module = THIS_MODULE,
	.ctr = switch_ctr,
	.dtr = switch_dtr,
	.map = switch_map,
	.message = switch_message,
	.status = switch_status,
	.ioctl = switch_ioctl,
	.iterate_devices = switch_iterate_devices,
};

static int __init dm_switch_init(void)
{
	int r;

	r = dm_register_target(&switch_target);
	if (r < 0)
		DMERR("dm_register_target() failed %d", r);

	return r;
}

static void __exit dm_switch_exit(void)
{
	dm_unregister_target(&switch_target);
}

module_init(dm_switch_init);
module_exit(dm_switch_exit);

MODULE_DESCRIPTION(DM_NAME " dynamic path switching target");
MODULE_AUTHOR("Kevin D. O'Kelley <Kevin_OKelley@dell.com>");
MODULE_AUTHOR("Narendran Ganapathy <Narendran_Ganapathy@dell.com>");
MODULE_AUTHOR("Jim Ramsay <Jim_Ramsay@dell.com>");
MODULE_AUTHOR("Mikulas Patocka <mpatocka@redhat.com>");
MODULE_LICENSE("GPL");
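As a usage illustration, the same message can be sent from a C program through libdevmapper rather than by shelling out to dmsetup. The sketch below is not part of the driver: the device name "switchdev" is hypothetical, and it assumes libdevmapper's task API (build with -ldevmapper, run with root privileges).

/*
 * Sketch: remap region 0 to path 1 and region 1 to path 0 on a
 * hypothetical dm-switch device named "switchdev".  Equivalent to:
 *	dmsetup message switchdev 0 set_region_mappings 0:1 :0
 */
#include <stdio.h>
#include <libdevmapper.h>

int main(void)
{
	struct dm_task *dmt = dm_task_create(DM_DEVICE_TARGET_MSG);
	int ok;

	if (!dmt)
		return 1;

	ok = dm_task_set_name(dmt, "switchdev") &&
	     dm_task_set_sector(dmt, 0) &&	/* sector addressed by the message */
	     dm_task_set_message(dmt, "set_region_mappings 0:1 :0") &&
	     dm_task_run(dmt);

	if (!ok)
		fprintf(stderr, "set_region_mappings failed\n");

	dm_task_destroy(dmt);
	return ok ? 0 : 1;
}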