Linux kernel mirror (for testing)
git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel
os
linux
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Copyright (C) 2001 Sistina Software (UK) Limited.
4 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
5 *
6 * This file is released under the GPL.
7 */
8
9#include "dm-core.h"
10#include "dm-rq.h"
11
12#include <linux/module.h>
13#include <linux/vmalloc.h>
14#include <linux/blkdev.h>
15#include <linux/blk-integrity.h>
16#include <linux/namei.h>
17#include <linux/ctype.h>
18#include <linux/string.h>
19#include <linux/slab.h>
20#include <linux/interrupt.h>
21#include <linux/mutex.h>
22#include <linux/delay.h>
23#include <linux/atomic.h>
24#include <linux/blk-mq.h>
25#include <linux/mount.h>
26#include <linux/dax.h>
27
28#define DM_MSG_PREFIX "table"
29
30#define NODE_SIZE L1_CACHE_BYTES
31#define KEYS_PER_NODE (NODE_SIZE / sizeof(sector_t))
32#define CHILDREN_PER_NODE (KEYS_PER_NODE + 1)
33
/*
 * Count how many times n has to be divided by base (rounding up each
 * time) before it is no larger than 1; roughly ceiling(log_base(n)).
 */
static unsigned int int_log(unsigned int n, unsigned int base)
{
	unsigned int steps;

	for (steps = 0; n > 1; steps++)
		n = dm_div_up(n, base);

	return steps;
}
48
49/*
50 * Calculate the index of the child node of the n'th node k'th key.
51 */
52static inline unsigned int get_child(unsigned int n, unsigned int k)
53{
54 return (n * CHILDREN_PER_NODE) + k;
55}
56
57/*
58 * Return the n'th node of level l from table t.
59 */
60static inline sector_t *get_node(struct dm_table *t,
61 unsigned int l, unsigned int n)
62{
63 return t->index[l] + (n * KEYS_PER_NODE);
64}
65
66/*
67 * Return the highest key that you could lookup from the n'th
68 * node on level l of the btree.
69 */
70static sector_t high(struct dm_table *t, unsigned int l, unsigned int n)
71{
72 for (; l < t->depth - 1; l++)
73 n = get_child(n, CHILDREN_PER_NODE - 1);
74
75 if (n >= t->counts[l])
76 return (sector_t) -1;
77
78 return get_node(t, l, n)[KEYS_PER_NODE - 1];
79}
80
81/*
82 * Fills in a level of the btree based on the highs of the level
83 * below it.
84 */
85static int setup_btree_index(unsigned int l, struct dm_table *t)
86{
87 unsigned int n, k;
88 sector_t *node;
89
90 for (n = 0U; n < t->counts[l]; n++) {
91 node = get_node(t, l, n);
92
93 for (k = 0U; k < KEYS_PER_NODE; k++)
94 node[k] = high(t, l + 1, get_child(n, k));
95 }
96
97 return 0;
98}
99
100/*
101 * highs, and targets are managed as dynamic arrays during a
102 * table load.
103 */
104static int alloc_targets(struct dm_table *t, unsigned int num)
105{
106 sector_t *n_highs;
107 struct dm_target *n_targets;
108
109 /*
110 * Allocate both the target array and offset array at once.
111 */
112 n_highs = kvcalloc(num, sizeof(struct dm_target) + sizeof(sector_t),
113 GFP_KERNEL);
114 if (!n_highs)
115 return -ENOMEM;
116
117 n_targets = (struct dm_target *) (n_highs + num);
118
119 memset(n_highs, -1, sizeof(*n_highs) * num);
120 kvfree(t->highs);
121
122 t->num_allocated = num;
123 t->highs = n_highs;
124 t->targets = n_targets;
125
126 return 0;
127}
128
129int dm_table_create(struct dm_table **result, blk_mode_t mode,
130 unsigned int num_targets, struct mapped_device *md)
131{
132 struct dm_table *t = kzalloc(sizeof(*t), GFP_KERNEL);
133
134 if (!t)
135 return -ENOMEM;
136
137 INIT_LIST_HEAD(&t->devices);
138
139 if (!num_targets)
140 num_targets = KEYS_PER_NODE;
141
142 num_targets = dm_round_up(num_targets, KEYS_PER_NODE);
143
144 if (!num_targets) {
145 kfree(t);
146 return -ENOMEM;
147 }
148
149 if (alloc_targets(t, num_targets)) {
150 kfree(t);
151 return -ENOMEM;
152 }
153
154 t->type = DM_TYPE_NONE;
155 t->mode = mode;
156 t->md = md;
157 *result = t;
158 return 0;
159}
160
161static void free_devices(struct list_head *devices, struct mapped_device *md)
162{
163 struct list_head *tmp, *next;
164
165 list_for_each_safe(tmp, next, devices) {
166 struct dm_dev_internal *dd =
167 list_entry(tmp, struct dm_dev_internal, list);
168 DMWARN("%s: dm_table_destroy: dm_put_device call missing for %s",
169 dm_device_name(md), dd->dm_dev->name);
170 dm_put_table_device(md, dd->dm_dev);
171 kfree(dd);
172 }
173}
174
175static void dm_table_destroy_crypto_profile(struct dm_table *t);
176
177void dm_table_destroy(struct dm_table *t)
178{
179 if (!t)
180 return;
181
182 /* free the indexes */
183 if (t->depth >= 2)
184 kvfree(t->index[t->depth - 2]);
185
186 /* free the targets */
187 for (unsigned int i = 0; i < t->num_targets; i++) {
188 struct dm_target *ti = dm_table_get_target(t, i);
189
190 if (ti->type->dtr)
191 ti->type->dtr(ti);
192
193 dm_put_target_type(ti->type);
194 }
195
196 kvfree(t->highs);
197
198 /* free the device list */
199 free_devices(&t->devices, t->md);
200
201 dm_free_md_mempools(t->mempools);
202
203 dm_table_destroy_crypto_profile(t);
204
205 kfree(t);
206}
207
208/*
209 * See if we've already got a device in the list.
210 */
211static struct dm_dev_internal *find_device(struct list_head *l, dev_t dev)
212{
213 struct dm_dev_internal *dd;
214
215 list_for_each_entry(dd, l, list)
216 if (dd->dm_dev->bdev->bd_dev == dev)
217 return dd;
218
219 return NULL;
220}
221
222/*
223 * If possible, this checks an area of a destination device is invalid.
224 */
225static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev,
226 sector_t start, sector_t len, void *data)
227{
228 struct queue_limits *limits = data;
229 struct block_device *bdev = dev->bdev;
230 sector_t dev_size = bdev_nr_sectors(bdev);
231 unsigned short logical_block_size_sectors =
232 limits->logical_block_size >> SECTOR_SHIFT;
233
234 if (!dev_size)
235 return 0;
236
237 if ((start >= dev_size) || (start + len > dev_size)) {
238 DMERR("%s: %pg too small for target: start=%llu, len=%llu, dev_size=%llu",
239 dm_device_name(ti->table->md), bdev,
240 (unsigned long long)start,
241 (unsigned long long)len,
242 (unsigned long long)dev_size);
243 return 1;
244 }
245
246 /*
247 * If the target is mapped to zoned block device(s), check
248 * that the zones are not partially mapped.
249 */
250 if (bdev_is_zoned(bdev)) {
251 unsigned int zone_sectors = bdev_zone_sectors(bdev);
252
253 if (start & (zone_sectors - 1)) {
254 DMERR("%s: start=%llu not aligned to h/w zone size %u of %pg",
255 dm_device_name(ti->table->md),
256 (unsigned long long)start,
257 zone_sectors, bdev);
258 return 1;
259 }
260
261 /*
262 * Note: The last zone of a zoned block device may be smaller
263 * than other zones. So for a target mapping the end of a
264 * zoned block device with such a zone, len would not be zone
265 * aligned. We do not allow such last smaller zone to be part
266 * of the mapping here to ensure that mappings with multiple
267 * devices do not end up with a smaller zone in the middle of
268 * the sector range.
269 */
270 if (len & (zone_sectors - 1)) {
271 DMERR("%s: len=%llu not aligned to h/w zone size %u of %pg",
272 dm_device_name(ti->table->md),
273 (unsigned long long)len,
274 zone_sectors, bdev);
275 return 1;
276 }
277 }
278
279 if (logical_block_size_sectors <= 1)
280 return 0;
281
282 if (start & (logical_block_size_sectors - 1)) {
283 DMERR("%s: start=%llu not aligned to h/w logical block size %u of %pg",
284 dm_device_name(ti->table->md),
285 (unsigned long long)start,
286 limits->logical_block_size, bdev);
287 return 1;
288 }
289
290 if (len & (logical_block_size_sectors - 1)) {
291 DMERR("%s: len=%llu not aligned to h/w logical block size %u of %pg",
292 dm_device_name(ti->table->md),
293 (unsigned long long)len,
294 limits->logical_block_size, bdev);
295 return 1;
296 }
297
298 return 0;
299}
300
301/*
302 * This upgrades the mode on an already open dm_dev, being
303 * careful to leave things as they were if we fail to reopen the
304 * device and not to touch the existing bdev field in case
305 * it is accessed concurrently.
306 */
307static int upgrade_mode(struct dm_dev_internal *dd, blk_mode_t new_mode,
308 struct mapped_device *md)
309{
310 int r;
311 struct dm_dev *old_dev, *new_dev;
312
313 old_dev = dd->dm_dev;
314
315 r = dm_get_table_device(md, dd->dm_dev->bdev->bd_dev,
316 dd->dm_dev->mode | new_mode, &new_dev);
317 if (r)
318 return r;
319
320 dd->dm_dev = new_dev;
321 dm_put_table_device(md, old_dev);
322
323 return 0;
324}
325
326/*
327 * Add a device to the list, or just increment the usage count if
328 * it's already present.
329 *
330 * Note: the __ref annotation is because this function can call the __init
331 * marked early_lookup_bdev when called during early boot code from dm-init.c.
332 */
333int __ref dm_get_device(struct dm_target *ti, const char *path, blk_mode_t mode,
334 struct dm_dev **result)
335{
336 int r;
337 dev_t dev;
338 unsigned int major, minor;
339 char dummy;
340 struct dm_dev_internal *dd;
341 struct dm_table *t = ti->table;
342
343 BUG_ON(!t);
344
345 if (sscanf(path, "%u:%u%c", &major, &minor, &dummy) == 2) {
346 /* Extract the major/minor numbers */
347 dev = MKDEV(major, minor);
348 if (MAJOR(dev) != major || MINOR(dev) != minor)
349 return -EOVERFLOW;
350 } else {
351 r = lookup_bdev(path, &dev);
352#ifndef MODULE
353 if (r && system_state < SYSTEM_RUNNING)
354 r = early_lookup_bdev(path, &dev);
355#endif
356 if (r)
357 return r;
358 }
359 if (dev == disk_devt(t->md->disk))
360 return -EINVAL;
361
362 dd = find_device(&t->devices, dev);
363 if (!dd) {
364 dd = kmalloc(sizeof(*dd), GFP_KERNEL);
365 if (!dd)
366 return -ENOMEM;
367
368 r = dm_get_table_device(t->md, dev, mode, &dd->dm_dev);
369 if (r) {
370 kfree(dd);
371 return r;
372 }
373
374 refcount_set(&dd->count, 1);
375 list_add(&dd->list, &t->devices);
376 goto out;
377
378 } else if (dd->dm_dev->mode != (mode | dd->dm_dev->mode)) {
379 r = upgrade_mode(dd, mode, t->md);
380 if (r)
381 return r;
382 }
383 refcount_inc(&dd->count);
384out:
385 *result = dd->dm_dev;
386 return 0;
387}
388EXPORT_SYMBOL(dm_get_device);
389
390static int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
391 sector_t start, sector_t len, void *data)
392{
393 struct queue_limits *limits = data;
394 struct block_device *bdev = dev->bdev;
395 struct request_queue *q = bdev_get_queue(bdev);
396
397 if (unlikely(!q)) {
398 DMWARN("%s: Cannot set limits for nonexistent device %pg",
399 dm_device_name(ti->table->md), bdev);
400 return 0;
401 }
402
403 if (blk_stack_limits(limits, &q->limits,
404 get_start_sect(bdev) + start) < 0)
405 DMWARN("%s: adding target device %pg caused an alignment inconsistency: "
406 "physical_block_size=%u, logical_block_size=%u, "
407 "alignment_offset=%u, start=%llu",
408 dm_device_name(ti->table->md), bdev,
409 q->limits.physical_block_size,
410 q->limits.logical_block_size,
411 q->limits.alignment_offset,
412 (unsigned long long) start << SECTOR_SHIFT);
413 return 0;
414}
415
416/*
417 * Decrement a device's use count and remove it if necessary.
418 */
419void dm_put_device(struct dm_target *ti, struct dm_dev *d)
420{
421 int found = 0;
422 struct list_head *devices = &ti->table->devices;
423 struct dm_dev_internal *dd;
424
425 list_for_each_entry(dd, devices, list) {
426 if (dd->dm_dev == d) {
427 found = 1;
428 break;
429 }
430 }
431 if (!found) {
432 DMERR("%s: device %s not in table devices list",
433 dm_device_name(ti->table->md), d->name);
434 return;
435 }
436 if (refcount_dec_and_test(&dd->count)) {
437 dm_put_table_device(ti->table->md, d);
438 list_del(&dd->list);
439 kfree(dd);
440 }
441}
442EXPORT_SYMBOL(dm_put_device);
443
444/*
445 * Checks to see if the target joins onto the end of the table.
446 */
447static int adjoin(struct dm_table *t, struct dm_target *ti)
448{
449 struct dm_target *prev;
450
451 if (!t->num_targets)
452 return !ti->begin;
453
454 prev = &t->targets[t->num_targets - 1];
455 return (ti->begin == (prev->begin + prev->len));
456}
457
458/*
459 * Used to dynamically allocate the arg array.
460 *
461 * We do first allocation with GFP_NOIO because dm-mpath and dm-thin must
462 * process messages even if some device is suspended. These messages have a
463 * small fixed number of arguments.
464 *
465 * On the other hand, dm-switch needs to process bulk data using messages and
466 * excessive use of GFP_NOIO could cause trouble.
467 */
468static char **realloc_argv(unsigned int *size, char **old_argv)
469{
470 char **argv;
471 unsigned int new_size;
472 gfp_t gfp;
473
474 if (*size) {
475 new_size = *size * 2;
476 gfp = GFP_KERNEL;
477 } else {
478 new_size = 8;
479 gfp = GFP_NOIO;
480 }
481 argv = kmalloc_array(new_size, sizeof(*argv), gfp);
482 if (argv && old_argv) {
483 memcpy(argv, old_argv, *size * sizeof(*argv));
484 *size = new_size;
485 }
486
487 kfree(old_argv);
488 return argv;
489}
490
491/*
492 * Destructively splits up the argument list to pass to ctr.
493 */
494int dm_split_args(int *argc, char ***argvp, char *input)
495{
496 char *start, *end = input, *out, **argv = NULL;
497 unsigned int array_size = 0;
498
499 *argc = 0;
500
501 if (!input) {
502 *argvp = NULL;
503 return 0;
504 }
505
506 argv = realloc_argv(&array_size, argv);
507 if (!argv)
508 return -ENOMEM;
509
510 while (1) {
511 /* Skip whitespace */
512 start = skip_spaces(end);
513
514 if (!*start)
515 break; /* success, we hit the end */
516
517 /* 'out' is used to remove any back-quotes */
518 end = out = start;
519 while (*end) {
520 /* Everything apart from '\0' can be quoted */
521 if (*end == '\\' && *(end + 1)) {
522 *out++ = *(end + 1);
523 end += 2;
524 continue;
525 }
526
527 if (isspace(*end))
528 break; /* end of token */
529
530 *out++ = *end++;
531 }
532
533 /* have we already filled the array ? */
534 if ((*argc + 1) > array_size) {
535 argv = realloc_argv(&array_size, argv);
536 if (!argv)
537 return -ENOMEM;
538 }
539
540 /* we know this is whitespace */
541 if (*end)
542 end++;
543
544 /* terminate the string and put it in the array */
545 *out = '\0';
546 argv[*argc] = start;
547 (*argc)++;
548 }
549
550 *argvp = argv;
551 return 0;
552}
553
554/*
555 * Impose necessary and sufficient conditions on a devices's table such
556 * that any incoming bio which respects its logical_block_size can be
557 * processed successfully. If it falls across the boundary between
558 * two or more targets, the size of each piece it gets split into must
559 * be compatible with the logical_block_size of the target processing it.
560 */
561static int validate_hardware_logical_block_alignment(struct dm_table *t,
562 struct queue_limits *limits)
563{
564 /*
565 * This function uses arithmetic modulo the logical_block_size
566 * (in units of 512-byte sectors).
567 */
568 unsigned short device_logical_block_size_sects =
569 limits->logical_block_size >> SECTOR_SHIFT;
570
571 /*
572 * Offset of the start of the next table entry, mod logical_block_size.
573 */
574 unsigned short next_target_start = 0;
575
576 /*
577 * Given an aligned bio that extends beyond the end of a
578 * target, how many sectors must the next target handle?
579 */
580 unsigned short remaining = 0;
581
582 struct dm_target *ti;
583 struct queue_limits ti_limits;
584 unsigned int i;
585
586 /*
587 * Check each entry in the table in turn.
588 */
589 for (i = 0; i < t->num_targets; i++) {
590 ti = dm_table_get_target(t, i);
591
592 blk_set_stacking_limits(&ti_limits);
593
594 /* combine all target devices' limits */
595 if (ti->type->iterate_devices)
596 ti->type->iterate_devices(ti, dm_set_device_limits,
597 &ti_limits);
598
599 /*
600 * If the remaining sectors fall entirely within this
601 * table entry are they compatible with its logical_block_size?
602 */
603 if (remaining < ti->len &&
604 remaining & ((ti_limits.logical_block_size >>
605 SECTOR_SHIFT) - 1))
606 break; /* Error */
607
608 next_target_start =
609 (unsigned short) ((next_target_start + ti->len) &
610 (device_logical_block_size_sects - 1));
611 remaining = next_target_start ?
612 device_logical_block_size_sects - next_target_start : 0;
613 }
614
615 if (remaining) {
616 DMERR("%s: table line %u (start sect %llu len %llu) "
617 "not aligned to h/w logical block size %u",
618 dm_device_name(t->md), i,
619 (unsigned long long) ti->begin,
620 (unsigned long long) ti->len,
621 limits->logical_block_size);
622 return -EINVAL;
623 }
624
625 return 0;
626}
627
628int dm_table_add_target(struct dm_table *t, const char *type,
629 sector_t start, sector_t len, char *params)
630{
631 int r = -EINVAL, argc;
632 char **argv;
633 struct dm_target *ti;
634
635 if (t->singleton) {
636 DMERR("%s: target type %s must appear alone in table",
637 dm_device_name(t->md), t->targets->type->name);
638 return -EINVAL;
639 }
640
641 BUG_ON(t->num_targets >= t->num_allocated);
642
643 ti = t->targets + t->num_targets;
644 memset(ti, 0, sizeof(*ti));
645
646 if (!len) {
647 DMERR("%s: zero-length target", dm_device_name(t->md));
648 return -EINVAL;
649 }
650
651 ti->type = dm_get_target_type(type);
652 if (!ti->type) {
653 DMERR("%s: %s: unknown target type", dm_device_name(t->md), type);
654 return -EINVAL;
655 }
656
657 if (dm_target_needs_singleton(ti->type)) {
658 if (t->num_targets) {
659 ti->error = "singleton target type must appear alone in table";
660 goto bad;
661 }
662 t->singleton = true;
663 }
664
665 if (dm_target_always_writeable(ti->type) &&
666 !(t->mode & BLK_OPEN_WRITE)) {
667 ti->error = "target type may not be included in a read-only table";
668 goto bad;
669 }
670
671 if (t->immutable_target_type) {
672 if (t->immutable_target_type != ti->type) {
673 ti->error = "immutable target type cannot be mixed with other target types";
674 goto bad;
675 }
676 } else if (dm_target_is_immutable(ti->type)) {
677 if (t->num_targets) {
678 ti->error = "immutable target type cannot be mixed with other target types";
679 goto bad;
680 }
681 t->immutable_target_type = ti->type;
682 }
683
684 if (dm_target_has_integrity(ti->type))
685 t->integrity_added = 1;
686
687 ti->table = t;
688 ti->begin = start;
689 ti->len = len;
690 ti->error = "Unknown error";
691
692 /*
693 * Does this target adjoin the previous one ?
694 */
695 if (!adjoin(t, ti)) {
696 ti->error = "Gap in table";
697 goto bad;
698 }
699
700 r = dm_split_args(&argc, &argv, params);
701 if (r) {
702 ti->error = "couldn't split parameters";
703 goto bad;
704 }
705
706 r = ti->type->ctr(ti, argc, argv);
707 kfree(argv);
708 if (r)
709 goto bad;
710
711 t->highs[t->num_targets++] = ti->begin + ti->len - 1;
712
713 if (!ti->num_discard_bios && ti->discards_supported)
714 DMWARN("%s: %s: ignoring discards_supported because num_discard_bios is zero.",
715 dm_device_name(t->md), type);
716
717 if (ti->limit_swap_bios && !static_key_enabled(&swap_bios_enabled.key))
718 static_branch_enable(&swap_bios_enabled);
719
720 return 0;
721
722 bad:
723 DMERR("%s: %s: %s (%pe)", dm_device_name(t->md), type, ti->error, ERR_PTR(r));
724 dm_put_target_type(ti->type);
725 return r;
726}
727
728/*
729 * Target argument parsing helpers.
730 */
731static int validate_next_arg(const struct dm_arg *arg, struct dm_arg_set *arg_set,
732 unsigned int *value, char **error, unsigned int grouped)
733{
734 const char *arg_str = dm_shift_arg(arg_set);
735 char dummy;
736
737 if (!arg_str ||
738 (sscanf(arg_str, "%u%c", value, &dummy) != 1) ||
739 (*value < arg->min) ||
740 (*value > arg->max) ||
741 (grouped && arg_set->argc < *value)) {
742 *error = arg->error;
743 return -EINVAL;
744 }
745
746 return 0;
747}
748
/*
 * Read and range-check the next numeric argument.  No check is made on
 * how many arguments remain afterwards.
 * Returns 0 on success or -EINVAL with *error set to arg->error.
 */
int dm_read_arg(const struct dm_arg *arg, struct dm_arg_set *arg_set,
		unsigned int *value, char **error)
{
	const unsigned int grouped = 0;

	return validate_next_arg(arg, arg_set, value, error, grouped);
}
EXPORT_SYMBOL(dm_read_arg);
755
/*
 * Read and range-check the next numeric argument, which announces a
 * group of that many following arguments; fails if fewer remain.
 * Returns 0 on success or -EINVAL with *error set to arg->error.
 */
int dm_read_arg_group(const struct dm_arg *arg, struct dm_arg_set *arg_set,
		      unsigned int *value, char **error)
{
	const unsigned int grouped = 1;

	return validate_next_arg(arg, arg_set, value, error, grouped);
}
EXPORT_SYMBOL(dm_read_arg_group);
762
763const char *dm_shift_arg(struct dm_arg_set *as)
764{
765 char *r;
766
767 if (as->argc) {
768 as->argc--;
769 r = *as->argv;
770 as->argv++;
771 return r;
772 }
773
774 return NULL;
775}
776EXPORT_SYMBOL(dm_shift_arg);
777
778void dm_consume_args(struct dm_arg_set *as, unsigned int num_args)
779{
780 BUG_ON(as->argc < num_args);
781 as->argc -= num_args;
782 as->argv += num_args;
783}
784EXPORT_SYMBOL(dm_consume_args);
785
786static bool __table_type_bio_based(enum dm_queue_mode table_type)
787{
788 return (table_type == DM_TYPE_BIO_BASED ||
789 table_type == DM_TYPE_DAX_BIO_BASED);
790}
791
792static bool __table_type_request_based(enum dm_queue_mode table_type)
793{
794 return table_type == DM_TYPE_REQUEST_BASED;
795}
796
797void dm_table_set_type(struct dm_table *t, enum dm_queue_mode type)
798{
799 t->type = type;
800}
801EXPORT_SYMBOL_GPL(dm_table_set_type);
802
803/* validate the dax capability of the target device span */
804static int device_not_dax_capable(struct dm_target *ti, struct dm_dev *dev,
805 sector_t start, sector_t len, void *data)
806{
807 if (dev->dax_dev)
808 return false;
809
810 DMDEBUG("%pg: error: dax unsupported by block device", dev->bdev);
811 return true;
812}
813
814/* Check devices support synchronous DAX */
815static int device_not_dax_synchronous_capable(struct dm_target *ti, struct dm_dev *dev,
816 sector_t start, sector_t len, void *data)
817{
818 return !dev->dax_dev || !dax_synchronous(dev->dax_dev);
819}
820
821static bool dm_table_supports_dax(struct dm_table *t,
822 iterate_devices_callout_fn iterate_fn)
823{
824 /* Ensure that all targets support DAX. */
825 for (unsigned int i = 0; i < t->num_targets; i++) {
826 struct dm_target *ti = dm_table_get_target(t, i);
827
828 if (!ti->type->direct_access)
829 return false;
830
831 if (!ti->type->iterate_devices ||
832 ti->type->iterate_devices(ti, iterate_fn, NULL))
833 return false;
834 }
835
836 return true;
837}
838
839static int device_is_rq_stackable(struct dm_target *ti, struct dm_dev *dev,
840 sector_t start, sector_t len, void *data)
841{
842 struct block_device *bdev = dev->bdev;
843 struct request_queue *q = bdev_get_queue(bdev);
844
845 /* request-based cannot stack on partitions! */
846 if (bdev_is_partition(bdev))
847 return false;
848
849 return queue_is_mq(q);
850}
851
852static int dm_table_determine_type(struct dm_table *t)
853{
854 unsigned int bio_based = 0, request_based = 0, hybrid = 0;
855 struct dm_target *ti;
856 struct list_head *devices = dm_table_get_devices(t);
857 enum dm_queue_mode live_md_type = dm_get_md_type(t->md);
858
859 if (t->type != DM_TYPE_NONE) {
860 /* target already set the table's type */
861 if (t->type == DM_TYPE_BIO_BASED) {
862 /* possibly upgrade to a variant of bio-based */
863 goto verify_bio_based;
864 }
865 BUG_ON(t->type == DM_TYPE_DAX_BIO_BASED);
866 goto verify_rq_based;
867 }
868
869 for (unsigned int i = 0; i < t->num_targets; i++) {
870 ti = dm_table_get_target(t, i);
871 if (dm_target_hybrid(ti))
872 hybrid = 1;
873 else if (dm_target_request_based(ti))
874 request_based = 1;
875 else
876 bio_based = 1;
877
878 if (bio_based && request_based) {
879 DMERR("Inconsistent table: different target types can't be mixed up");
880 return -EINVAL;
881 }
882 }
883
884 if (hybrid && !bio_based && !request_based) {
885 /*
886 * The targets can work either way.
887 * Determine the type from the live device.
888 * Default to bio-based if device is new.
889 */
890 if (__table_type_request_based(live_md_type))
891 request_based = 1;
892 else
893 bio_based = 1;
894 }
895
896 if (bio_based) {
897verify_bio_based:
898 /* We must use this table as bio-based */
899 t->type = DM_TYPE_BIO_BASED;
900 if (dm_table_supports_dax(t, device_not_dax_capable) ||
901 (list_empty(devices) && live_md_type == DM_TYPE_DAX_BIO_BASED)) {
902 t->type = DM_TYPE_DAX_BIO_BASED;
903 }
904 return 0;
905 }
906
907 BUG_ON(!request_based); /* No targets in this table */
908
909 t->type = DM_TYPE_REQUEST_BASED;
910
911verify_rq_based:
912 /*
913 * Request-based dm supports only tables that have a single target now.
914 * To support multiple targets, request splitting support is needed,
915 * and that needs lots of changes in the block-layer.
916 * (e.g. request completion process for partial completion.)
917 */
918 if (t->num_targets > 1) {
919 DMERR("request-based DM doesn't support multiple targets");
920 return -EINVAL;
921 }
922
923 if (list_empty(devices)) {
924 int srcu_idx;
925 struct dm_table *live_table = dm_get_live_table(t->md, &srcu_idx);
926
927 /* inherit live table's type */
928 if (live_table)
929 t->type = live_table->type;
930 dm_put_live_table(t->md, srcu_idx);
931 return 0;
932 }
933
934 ti = dm_table_get_immutable_target(t);
935 if (!ti) {
936 DMERR("table load rejected: immutable target is required");
937 return -EINVAL;
938 } else if (ti->max_io_len) {
939 DMERR("table load rejected: immutable target that splits IO is not supported");
940 return -EINVAL;
941 }
942
943 /* Non-request-stackable devices can't be used for request-based dm */
944 if (!ti->type->iterate_devices ||
945 !ti->type->iterate_devices(ti, device_is_rq_stackable, NULL)) {
946 DMERR("table load rejected: including non-request-stackable devices");
947 return -EINVAL;
948 }
949
950 return 0;
951}
952
953enum dm_queue_mode dm_table_get_type(struct dm_table *t)
954{
955 return t->type;
956}
957
958struct target_type *dm_table_get_immutable_target_type(struct dm_table *t)
959{
960 return t->immutable_target_type;
961}
962
963struct dm_target *dm_table_get_immutable_target(struct dm_table *t)
964{
965 /* Immutable target is implicitly a singleton */
966 if (t->num_targets > 1 ||
967 !dm_target_is_immutable(t->targets[0].type))
968 return NULL;
969
970 return t->targets;
971}
972
973struct dm_target *dm_table_get_wildcard_target(struct dm_table *t)
974{
975 for (unsigned int i = 0; i < t->num_targets; i++) {
976 struct dm_target *ti = dm_table_get_target(t, i);
977
978 if (dm_target_is_wildcard(ti->type))
979 return ti;
980 }
981
982 return NULL;
983}
984
985bool dm_table_bio_based(struct dm_table *t)
986{
987 return __table_type_bio_based(dm_table_get_type(t));
988}
989
990bool dm_table_request_based(struct dm_table *t)
991{
992 return __table_type_request_based(dm_table_get_type(t));
993}
994
995static bool dm_table_supports_poll(struct dm_table *t);
996
997static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *md)
998{
999 enum dm_queue_mode type = dm_table_get_type(t);
1000 unsigned int per_io_data_size = 0, front_pad, io_front_pad;
1001 unsigned int min_pool_size = 0, pool_size;
1002 struct dm_md_mempools *pools;
1003
1004 if (unlikely(type == DM_TYPE_NONE)) {
1005 DMERR("no table type is set, can't allocate mempools");
1006 return -EINVAL;
1007 }
1008
1009 pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id);
1010 if (!pools)
1011 return -ENOMEM;
1012
1013 if (type == DM_TYPE_REQUEST_BASED) {
1014 pool_size = dm_get_reserved_rq_based_ios();
1015 front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
1016 goto init_bs;
1017 }
1018
1019 for (unsigned int i = 0; i < t->num_targets; i++) {
1020 struct dm_target *ti = dm_table_get_target(t, i);
1021
1022 per_io_data_size = max(per_io_data_size, ti->per_io_data_size);
1023 min_pool_size = max(min_pool_size, ti->num_flush_bios);
1024 }
1025 pool_size = max(dm_get_reserved_bio_based_ios(), min_pool_size);
1026 front_pad = roundup(per_io_data_size,
1027 __alignof__(struct dm_target_io)) + DM_TARGET_IO_BIO_OFFSET;
1028
1029 io_front_pad = roundup(per_io_data_size,
1030 __alignof__(struct dm_io)) + DM_IO_BIO_OFFSET;
1031 if (bioset_init(&pools->io_bs, pool_size, io_front_pad,
1032 dm_table_supports_poll(t) ? BIOSET_PERCPU_CACHE : 0))
1033 goto out_free_pools;
1034 if (t->integrity_supported &&
1035 bioset_integrity_create(&pools->io_bs, pool_size))
1036 goto out_free_pools;
1037init_bs:
1038 if (bioset_init(&pools->bs, pool_size, front_pad, 0))
1039 goto out_free_pools;
1040 if (t->integrity_supported &&
1041 bioset_integrity_create(&pools->bs, pool_size))
1042 goto out_free_pools;
1043
1044 t->mempools = pools;
1045 return 0;
1046
1047out_free_pools:
1048 dm_free_md_mempools(pools);
1049 return -ENOMEM;
1050}
1051
1052static int setup_indexes(struct dm_table *t)
1053{
1054 int i;
1055 unsigned int total = 0;
1056 sector_t *indexes;
1057
1058 /* allocate the space for *all* the indexes */
1059 for (i = t->depth - 2; i >= 0; i--) {
1060 t->counts[i] = dm_div_up(t->counts[i + 1], CHILDREN_PER_NODE);
1061 total += t->counts[i];
1062 }
1063
1064 indexes = kvcalloc(total, NODE_SIZE, GFP_KERNEL);
1065 if (!indexes)
1066 return -ENOMEM;
1067
1068 /* set up internal nodes, bottom-up */
1069 for (i = t->depth - 2; i >= 0; i--) {
1070 t->index[i] = indexes;
1071 indexes += (KEYS_PER_NODE * t->counts[i]);
1072 setup_btree_index(i, t);
1073 }
1074
1075 return 0;
1076}
1077
1078/*
1079 * Builds the btree to index the map.
1080 */
1081static int dm_table_build_index(struct dm_table *t)
1082{
1083 int r = 0;
1084 unsigned int leaf_nodes;
1085
1086 /* how many indexes will the btree have ? */
1087 leaf_nodes = dm_div_up(t->num_targets, KEYS_PER_NODE);
1088 t->depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE);
1089
1090 /* leaf layer has already been set up */
1091 t->counts[t->depth - 1] = leaf_nodes;
1092 t->index[t->depth - 1] = t->highs;
1093
1094 if (t->depth >= 2)
1095 r = setup_indexes(t);
1096
1097 return r;
1098}
1099
/* Does blk_get_integrity() report an integrity profile for this disk? */
static bool integrity_profile_exists(struct gendisk *disk)
{
	if (blk_get_integrity(disk))
		return true;

	return false;
}
1104
1105/*
1106 * Get a disk whose integrity profile reflects the table's profile.
1107 * Returns NULL if integrity support was inconsistent or unavailable.
1108 */
1109static struct gendisk *dm_table_get_integrity_disk(struct dm_table *t)
1110{
1111 struct list_head *devices = dm_table_get_devices(t);
1112 struct dm_dev_internal *dd = NULL;
1113 struct gendisk *prev_disk = NULL, *template_disk = NULL;
1114
1115 for (unsigned int i = 0; i < t->num_targets; i++) {
1116 struct dm_target *ti = dm_table_get_target(t, i);
1117
1118 if (!dm_target_passes_integrity(ti->type))
1119 goto no_integrity;
1120 }
1121
1122 list_for_each_entry(dd, devices, list) {
1123 template_disk = dd->dm_dev->bdev->bd_disk;
1124 if (!integrity_profile_exists(template_disk))
1125 goto no_integrity;
1126 else if (prev_disk &&
1127 blk_integrity_compare(prev_disk, template_disk) < 0)
1128 goto no_integrity;
1129 prev_disk = template_disk;
1130 }
1131
1132 return template_disk;
1133
1134no_integrity:
1135 if (prev_disk)
1136 DMWARN("%s: integrity not set: %s and %s profile mismatch",
1137 dm_device_name(t->md),
1138 prev_disk->disk_name,
1139 template_disk->disk_name);
1140 return NULL;
1141}
1142
1143/*
1144 * Register the mapped device for blk_integrity support if the
1145 * underlying devices have an integrity profile. But all devices may
1146 * not have matching profiles (checking all devices isn't reliable
1147 * during table load because this table may use other DM device(s) which
1148 * must be resumed before they will have an initialized integity
1149 * profile). Consequently, stacked DM devices force a 2 stage integrity
1150 * profile validation: First pass during table load, final pass during
1151 * resume.
1152 */
1153static int dm_table_register_integrity(struct dm_table *t)
1154{
1155 struct mapped_device *md = t->md;
1156 struct gendisk *template_disk = NULL;
1157
1158 /* If target handles integrity itself do not register it here. */
1159 if (t->integrity_added)
1160 return 0;
1161
1162 template_disk = dm_table_get_integrity_disk(t);
1163 if (!template_disk)
1164 return 0;
1165
1166 if (!integrity_profile_exists(dm_disk(md))) {
1167 t->integrity_supported = true;
1168 /*
1169 * Register integrity profile during table load; we can do
1170 * this because the final profile must match during resume.
1171 */
1172 blk_integrity_register(dm_disk(md),
1173 blk_get_integrity(template_disk));
1174 return 0;
1175 }
1176
1177 /*
1178 * If DM device already has an initialized integrity
1179 * profile the new profile should not conflict.
1180 */
1181 if (blk_integrity_compare(dm_disk(md), template_disk) < 0) {
1182 DMERR("%s: conflict with existing integrity profile: %s profile mismatch",
1183 dm_device_name(t->md),
1184 template_disk->disk_name);
1185 return 1;
1186 }
1187
1188 /* Preserve existing integrity profile */
1189 t->integrity_supported = true;
1190 return 0;
1191}
1192
1193#ifdef CONFIG_BLK_INLINE_ENCRYPTION
1194
/*
 * Couples a blk_crypto_profile with the mapped_device it belongs to, so
 * that code handed only the embedded profile can get back to the
 * mapped_device with container_of().
 */
struct dm_crypto_profile {
	struct blk_crypto_profile profile;	/* embedded profile; container_of() anchor */
	struct mapped_device *md;		/* mapped_device owning this profile */
};
1199
1200static int dm_keyslot_evict_callback(struct dm_target *ti, struct dm_dev *dev,
1201 sector_t start, sector_t len, void *data)
1202{
1203 const struct blk_crypto_key *key = data;
1204
1205 blk_crypto_evict_key(dev->bdev, key);
1206 return 0;
1207}
1208
1209/*
1210 * When an inline encryption key is evicted from a device-mapper device, evict
1211 * it from all the underlying devices.
1212 */
1213static int dm_keyslot_evict(struct blk_crypto_profile *profile,
1214 const struct blk_crypto_key *key, unsigned int slot)
1215{
1216 struct mapped_device *md =
1217 container_of(profile, struct dm_crypto_profile, profile)->md;
1218 struct dm_table *t;
1219 int srcu_idx;
1220
1221 t = dm_get_live_table(md, &srcu_idx);
1222 if (!t)
1223 return 0;
1224
1225 for (unsigned int i = 0; i < t->num_targets; i++) {
1226 struct dm_target *ti = dm_table_get_target(t, i);
1227
1228 if (!ti->type->iterate_devices)
1229 continue;
1230 ti->type->iterate_devices(ti, dm_keyslot_evict_callback,
1231 (void *)key);
1232 }
1233
1234 dm_put_live_table(md, srcu_idx);
1235 return 0;
1236}
1237
1238static int
1239device_intersect_crypto_capabilities(struct dm_target *ti, struct dm_dev *dev,
1240 sector_t start, sector_t len, void *data)
1241{
1242 struct blk_crypto_profile *parent = data;
1243 struct blk_crypto_profile *child =
1244 bdev_get_queue(dev->bdev)->crypto_profile;
1245
1246 blk_crypto_intersect_capabilities(parent, child);
1247 return 0;
1248}
1249
1250void dm_destroy_crypto_profile(struct blk_crypto_profile *profile)
1251{
1252 struct dm_crypto_profile *dmcp = container_of(profile,
1253 struct dm_crypto_profile,
1254 profile);
1255
1256 if (!profile)
1257 return;
1258
1259 blk_crypto_profile_destroy(profile);
1260 kfree(dmcp);
1261}
1262
1263static void dm_table_destroy_crypto_profile(struct dm_table *t)
1264{
1265 dm_destroy_crypto_profile(t->crypto_profile);
1266 t->crypto_profile = NULL;
1267}
1268
1269/*
1270 * Constructs and initializes t->crypto_profile with a crypto profile that
1271 * represents the common set of crypto capabilities of the devices described by
1272 * the dm_table. However, if the constructed crypto profile doesn't support all
1273 * crypto capabilities that are supported by the current mapped_device, it
1274 * returns an error instead, since we don't support removing crypto capabilities
1275 * on table changes. Finally, if the constructed crypto profile is "empty" (has
1276 * no crypto capabilities at all), it just sets t->crypto_profile to NULL.
1277 */
1278static int dm_table_construct_crypto_profile(struct dm_table *t)
1279{
1280 struct dm_crypto_profile *dmcp;
1281 struct blk_crypto_profile *profile;
1282 unsigned int i;
1283 bool empty_profile = true;
1284
1285 dmcp = kmalloc(sizeof(*dmcp), GFP_KERNEL);
1286 if (!dmcp)
1287 return -ENOMEM;
1288 dmcp->md = t->md;
1289
1290 profile = &dmcp->profile;
1291 blk_crypto_profile_init(profile, 0);
1292 profile->ll_ops.keyslot_evict = dm_keyslot_evict;
1293 profile->max_dun_bytes_supported = UINT_MAX;
1294 memset(profile->modes_supported, 0xFF,
1295 sizeof(profile->modes_supported));
1296
1297 for (i = 0; i < t->num_targets; i++) {
1298 struct dm_target *ti = dm_table_get_target(t, i);
1299
1300 if (!dm_target_passes_crypto(ti->type)) {
1301 blk_crypto_intersect_capabilities(profile, NULL);
1302 break;
1303 }
1304 if (!ti->type->iterate_devices)
1305 continue;
1306 ti->type->iterate_devices(ti,
1307 device_intersect_crypto_capabilities,
1308 profile);
1309 }
1310
1311 if (t->md->queue &&
1312 !blk_crypto_has_capabilities(profile,
1313 t->md->queue->crypto_profile)) {
1314 DMERR("Inline encryption capabilities of new DM table were more restrictive than the old table's. This is not supported!");
1315 dm_destroy_crypto_profile(profile);
1316 return -EINVAL;
1317 }
1318
1319 /*
1320 * If the new profile doesn't actually support any crypto capabilities,
1321 * we may as well represent it with a NULL profile.
1322 */
1323 for (i = 0; i < ARRAY_SIZE(profile->modes_supported); i++) {
1324 if (profile->modes_supported[i]) {
1325 empty_profile = false;
1326 break;
1327 }
1328 }
1329
1330 if (empty_profile) {
1331 dm_destroy_crypto_profile(profile);
1332 profile = NULL;
1333 }
1334
1335 /*
1336 * t->crypto_profile is only set temporarily while the table is being
1337 * set up, and it gets set to NULL after the profile has been
1338 * transferred to the request_queue.
1339 */
1340 t->crypto_profile = profile;
1341
1342 return 0;
1343}
1344
1345static void dm_update_crypto_profile(struct request_queue *q,
1346 struct dm_table *t)
1347{
1348 if (!t->crypto_profile)
1349 return;
1350
1351 /* Make the crypto profile less restrictive. */
1352 if (!q->crypto_profile) {
1353 blk_crypto_register(t->crypto_profile, q);
1354 } else {
1355 blk_crypto_update_capabilities(q->crypto_profile,
1356 t->crypto_profile);
1357 dm_destroy_crypto_profile(t->crypto_profile);
1358 }
1359 t->crypto_profile = NULL;
1360}
1361
1362#else /* CONFIG_BLK_INLINE_ENCRYPTION */
1363
/*
 * Stub for builds without CONFIG_BLK_INLINE_ENCRYPTION: there is no crypto
 * profile to construct, so always report success.
 */
static int dm_table_construct_crypto_profile(struct dm_table *t)
{
	return 0;
}
1368
/* Stub for builds without CONFIG_BLK_INLINE_ENCRYPTION: nothing to destroy. */
void dm_destroy_crypto_profile(struct blk_crypto_profile *profile)
{
}
1372
/* Stub for builds without CONFIG_BLK_INLINE_ENCRYPTION: intentionally empty. */
static void dm_table_destroy_crypto_profile(struct dm_table *t)
{
}
1376
/* Stub for builds without CONFIG_BLK_INLINE_ENCRYPTION: intentionally empty. */
static void dm_update_crypto_profile(struct request_queue *q,
				     struct dm_table *t)
{
}
1381
1382#endif /* !CONFIG_BLK_INLINE_ENCRYPTION */
1383
1384/*
1385 * Prepares the table for use by building the indices,
1386 * setting the type, and allocating mempools.
1387 */
1388int dm_table_complete(struct dm_table *t)
1389{
1390 int r;
1391
1392 r = dm_table_determine_type(t);
1393 if (r) {
1394 DMERR("unable to determine table type");
1395 return r;
1396 }
1397
1398 r = dm_table_build_index(t);
1399 if (r) {
1400 DMERR("unable to build btrees");
1401 return r;
1402 }
1403
1404 r = dm_table_register_integrity(t);
1405 if (r) {
1406 DMERR("could not register integrity profile.");
1407 return r;
1408 }
1409
1410 r = dm_table_construct_crypto_profile(t);
1411 if (r) {
1412 DMERR("could not construct crypto profile.");
1413 return r;
1414 }
1415
1416 r = dm_table_alloc_md_mempools(t, t->md);
1417 if (r)
1418 DMERR("unable to allocate mempools");
1419
1420 return r;
1421}
1422
/*
 * Taken around both the installation of a table's event callback and its
 * invocation from dm_table_event().
 */
static DEFINE_MUTEX(_event_lock);
/*
 * Install @fn, together with the @context it will be called with, as the
 * event callback of table @t. Both fields are written under _event_lock.
 */
void dm_table_event_callback(struct dm_table *t,
			     void (*fn)(void *), void *context)
{
	mutex_lock(&_event_lock);
	t->event_fn = fn;
	t->event_context = context;
	mutex_unlock(&_event_lock);
}
1432
1433void dm_table_event(struct dm_table *t)
1434{
1435 mutex_lock(&_event_lock);
1436 if (t->event_fn)
1437 t->event_fn(t->event_context);
1438 mutex_unlock(&_event_lock);
1439}
1440EXPORT_SYMBOL(dm_table_event);
1441
1442inline sector_t dm_table_get_size(struct dm_table *t)
1443{
1444 return t->num_targets ? (t->highs[t->num_targets - 1] + 1) : 0;
1445}
1446EXPORT_SYMBOL(dm_table_get_size);
1447
1448/*
1449 * Search the btree for the correct target.
1450 *
1451 * Caller should check returned pointer for NULL
1452 * to trap I/O beyond end of device.
1453 */
1454struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector)
1455{
1456 unsigned int l, n = 0, k = 0;
1457 sector_t *node;
1458
1459 if (unlikely(sector >= dm_table_get_size(t)))
1460 return NULL;
1461
1462 for (l = 0; l < t->depth; l++) {
1463 n = get_child(n, k);
1464 node = get_node(t, l, n);
1465
1466 for (k = 0; k < KEYS_PER_NODE; k++)
1467 if (node[k] >= sector)
1468 break;
1469 }
1470
1471 return &t->targets[(KEYS_PER_NODE * n) + k];
1472}
1473
1474static int device_not_poll_capable(struct dm_target *ti, struct dm_dev *dev,
1475 sector_t start, sector_t len, void *data)
1476{
1477 struct request_queue *q = bdev_get_queue(dev->bdev);
1478
1479 return !test_bit(QUEUE_FLAG_POLL, &q->queue_flags);
1480}
1481
1482/*
 * type->iterate_devices() should be called when the sanity check needs to
 * iterate and check all underlying data devices. iterate_devices() will
 * iterate all underlying data devices until it encounters a non-zero return
 * code, returned either by the input iterate_devices_callout_fn or by
 * iterate_devices() itself internally.
1488 *
 * For some target types (e.g. dm-stripe), one call of iterate_devices() may
 * iterate multiple underlying devices internally, in which case a non-zero
 * return code returned by iterate_devices_callout_fn will stop the iteration
 * early.
1493 *
1494 * Cases requiring _any_ underlying device supporting some kind of attribute,
1495 * should use the iteration structure like dm_table_any_dev_attr(), or call
1496 * it directly. @func should handle semantics of positive examples, e.g.
1497 * capable of something.
1498 *
 * Cases requiring _all_ underlying devices supporting some kind of attribute,
 * should use the iteration structure like dm_table_supports_nowait() or
 * dm_table_supports_discards(). Or introduce dm_table_all_devs_attr() that
 * uses an @anti_func that handles semantics of counter examples, e.g. not
 * capable of something. So: return !dm_table_any_dev_attr(t, anti_func, data);
1504 */
1505static bool dm_table_any_dev_attr(struct dm_table *t,
1506 iterate_devices_callout_fn func, void *data)
1507{
1508 for (unsigned int i = 0; i < t->num_targets; i++) {
1509 struct dm_target *ti = dm_table_get_target(t, i);
1510
1511 if (ti->type->iterate_devices &&
1512 ti->type->iterate_devices(ti, func, data))
1513 return true;
1514 }
1515
1516 return false;
1517}
1518
1519static int count_device(struct dm_target *ti, struct dm_dev *dev,
1520 sector_t start, sector_t len, void *data)
1521{
1522 unsigned int *num_devices = data;
1523
1524 (*num_devices)++;
1525
1526 return 0;
1527}
1528
1529static bool dm_table_supports_poll(struct dm_table *t)
1530{
1531 for (unsigned int i = 0; i < t->num_targets; i++) {
1532 struct dm_target *ti = dm_table_get_target(t, i);
1533
1534 if (!ti->type->iterate_devices ||
1535 ti->type->iterate_devices(ti, device_not_poll_capable, NULL))
1536 return false;
1537 }
1538
1539 return true;
1540}
1541
1542/*
1543 * Check whether a table has no data devices attached using each
1544 * target's iterate_devices method.
1545 * Returns false if the result is unknown because a target doesn't
1546 * support iterate_devices.
1547 */
1548bool dm_table_has_no_data_devices(struct dm_table *t)
1549{
1550 for (unsigned int i = 0; i < t->num_targets; i++) {
1551 struct dm_target *ti = dm_table_get_target(t, i);
1552 unsigned int num_devices = 0;
1553
1554 if (!ti->type->iterate_devices)
1555 return false;
1556
1557 ti->type->iterate_devices(ti, count_device, &num_devices);
1558 if (num_devices)
1559 return false;
1560 }
1561
1562 return true;
1563}
1564
1565static int device_not_zoned_model(struct dm_target *ti, struct dm_dev *dev,
1566 sector_t start, sector_t len, void *data)
1567{
1568 struct request_queue *q = bdev_get_queue(dev->bdev);
1569 enum blk_zoned_model *zoned_model = data;
1570
1571 return blk_queue_zoned_model(q) != *zoned_model;
1572}
1573
1574/*
1575 * Check the device zoned model based on the target feature flag. If the target
1576 * has the DM_TARGET_ZONED_HM feature flag set, host-managed zoned devices are
1577 * also accepted but all devices must have the same zoned model. If the target
1578 * has the DM_TARGET_MIXED_ZONED_MODEL feature set, the devices can have any
1579 * zoned model with all zoned devices having the same zone size.
1580 */
1581static bool dm_table_supports_zoned_model(struct dm_table *t,
1582 enum blk_zoned_model zoned_model)
1583{
1584 for (unsigned int i = 0; i < t->num_targets; i++) {
1585 struct dm_target *ti = dm_table_get_target(t, i);
1586
1587 if (dm_target_supports_zoned_hm(ti->type)) {
1588 if (!ti->type->iterate_devices ||
1589 ti->type->iterate_devices(ti, device_not_zoned_model,
1590 &zoned_model))
1591 return false;
1592 } else if (!dm_target_supports_mixed_zoned_model(ti->type)) {
1593 if (zoned_model == BLK_ZONED_HM)
1594 return false;
1595 }
1596 }
1597
1598 return true;
1599}
1600
1601static int device_not_matches_zone_sectors(struct dm_target *ti, struct dm_dev *dev,
1602 sector_t start, sector_t len, void *data)
1603{
1604 unsigned int *zone_sectors = data;
1605
1606 if (!bdev_is_zoned(dev->bdev))
1607 return 0;
1608 return bdev_zone_sectors(dev->bdev) != *zone_sectors;
1609}
1610
1611/*
1612 * Check consistency of zoned model and zone sectors across all targets. For
1613 * zone sectors, if the destination device is a zoned block device, it shall
1614 * have the specified zone_sectors.
1615 */
1616static int validate_hardware_zoned_model(struct dm_table *t,
1617 enum blk_zoned_model zoned_model,
1618 unsigned int zone_sectors)
1619{
1620 if (zoned_model == BLK_ZONED_NONE)
1621 return 0;
1622
1623 if (!dm_table_supports_zoned_model(t, zoned_model)) {
1624 DMERR("%s: zoned model is not consistent across all devices",
1625 dm_device_name(t->md));
1626 return -EINVAL;
1627 }
1628
1629 /* Check zone size validity and compatibility */
1630 if (!zone_sectors || !is_power_of_2(zone_sectors))
1631 return -EINVAL;
1632
1633 if (dm_table_any_dev_attr(t, device_not_matches_zone_sectors, &zone_sectors)) {
1634 DMERR("%s: zone sectors is not consistent across all zoned devices",
1635 dm_device_name(t->md));
1636 return -EINVAL;
1637 }
1638
1639 return 0;
1640}
1641
1642/*
1643 * Establish the new table's queue_limits and validate them.
1644 */
1645int dm_calculate_queue_limits(struct dm_table *t,
1646 struct queue_limits *limits)
1647{
1648 struct queue_limits ti_limits;
1649 enum blk_zoned_model zoned_model = BLK_ZONED_NONE;
1650 unsigned int zone_sectors = 0;
1651
1652 blk_set_stacking_limits(limits);
1653
1654 for (unsigned int i = 0; i < t->num_targets; i++) {
1655 struct dm_target *ti = dm_table_get_target(t, i);
1656
1657 blk_set_stacking_limits(&ti_limits);
1658
1659 if (!ti->type->iterate_devices) {
1660 /* Set I/O hints portion of queue limits */
1661 if (ti->type->io_hints)
1662 ti->type->io_hints(ti, &ti_limits);
1663 goto combine_limits;
1664 }
1665
1666 /*
1667 * Combine queue limits of all the devices this target uses.
1668 */
1669 ti->type->iterate_devices(ti, dm_set_device_limits,
1670 &ti_limits);
1671
1672 if (zoned_model == BLK_ZONED_NONE && ti_limits.zoned != BLK_ZONED_NONE) {
1673 /*
1674 * After stacking all limits, validate all devices
1675 * in table support this zoned model and zone sectors.
1676 */
1677 zoned_model = ti_limits.zoned;
1678 zone_sectors = ti_limits.chunk_sectors;
1679 }
1680
1681 /* Set I/O hints portion of queue limits */
1682 if (ti->type->io_hints)
1683 ti->type->io_hints(ti, &ti_limits);
1684
1685 /*
1686 * Check each device area is consistent with the target's
1687 * overall queue limits.
1688 */
1689 if (ti->type->iterate_devices(ti, device_area_is_invalid,
1690 &ti_limits))
1691 return -EINVAL;
1692
1693combine_limits:
1694 /*
1695 * Merge this target's queue limits into the overall limits
1696 * for the table.
1697 */
1698 if (blk_stack_limits(limits, &ti_limits, 0) < 0)
1699 DMWARN("%s: adding target device (start sect %llu len %llu) "
1700 "caused an alignment inconsistency",
1701 dm_device_name(t->md),
1702 (unsigned long long) ti->begin,
1703 (unsigned long long) ti->len);
1704 }
1705
1706 /*
1707 * Verify that the zoned model and zone sectors, as determined before
1708 * any .io_hints override, are the same across all devices in the table.
1709 * - this is especially relevant if .io_hints is emulating a disk-managed
1710 * zoned model (aka BLK_ZONED_NONE) on host-managed zoned block devices.
1711 * BUT...
1712 */
1713 if (limits->zoned != BLK_ZONED_NONE) {
1714 /*
1715 * ...IF the above limits stacking determined a zoned model
1716 * validate that all of the table's devices conform to it.
1717 */
1718 zoned_model = limits->zoned;
1719 zone_sectors = limits->chunk_sectors;
1720 }
1721 if (validate_hardware_zoned_model(t, zoned_model, zone_sectors))
1722 return -EINVAL;
1723
1724 return validate_hardware_logical_block_alignment(t, limits);
1725}
1726
1727/*
1728 * Verify that all devices have an integrity profile that matches the
1729 * DM device's registered integrity profile. If the profiles don't
1730 * match then unregister the DM device's integrity profile.
1731 */
1732static void dm_table_verify_integrity(struct dm_table *t)
1733{
1734 struct gendisk *template_disk = NULL;
1735
1736 if (t->integrity_added)
1737 return;
1738
1739 if (t->integrity_supported) {
1740 /*
1741 * Verify that the original integrity profile
1742 * matches all the devices in this table.
1743 */
1744 template_disk = dm_table_get_integrity_disk(t);
1745 if (template_disk &&
1746 blk_integrity_compare(dm_disk(t->md), template_disk) >= 0)
1747 return;
1748 }
1749
1750 if (integrity_profile_exists(dm_disk(t->md))) {
1751 DMWARN("%s: unable to establish an integrity profile",
1752 dm_device_name(t->md));
1753 blk_integrity_unregister(dm_disk(t->md));
1754 }
1755}
1756
1757static int device_flush_capable(struct dm_target *ti, struct dm_dev *dev,
1758 sector_t start, sector_t len, void *data)
1759{
1760 unsigned long flush = (unsigned long) data;
1761 struct request_queue *q = bdev_get_queue(dev->bdev);
1762
1763 return (q->queue_flags & flush);
1764}
1765
1766static bool dm_table_supports_flush(struct dm_table *t, unsigned long flush)
1767{
1768 /*
1769 * Require at least one underlying device to support flushes.
1770 * t->devices includes internal dm devices such as mirror logs
1771 * so we need to use iterate_devices here, which targets
1772 * supporting flushes must provide.
1773 */
1774 for (unsigned int i = 0; i < t->num_targets; i++) {
1775 struct dm_target *ti = dm_table_get_target(t, i);
1776
1777 if (!ti->num_flush_bios)
1778 continue;
1779
1780 if (ti->flush_supported)
1781 return true;
1782
1783 if (ti->type->iterate_devices &&
1784 ti->type->iterate_devices(ti, device_flush_capable, (void *) flush))
1785 return true;
1786 }
1787
1788 return false;
1789}
1790
1791static int device_dax_write_cache_enabled(struct dm_target *ti,
1792 struct dm_dev *dev, sector_t start,
1793 sector_t len, void *data)
1794{
1795 struct dax_device *dax_dev = dev->dax_dev;
1796
1797 if (!dax_dev)
1798 return false;
1799
1800 if (dax_write_cache_enabled(dax_dev))
1801 return true;
1802 return false;
1803}
1804
1805static int device_is_rotational(struct dm_target *ti, struct dm_dev *dev,
1806 sector_t start, sector_t len, void *data)
1807{
1808 return !bdev_nonrot(dev->bdev);
1809}
1810
1811static int device_is_not_random(struct dm_target *ti, struct dm_dev *dev,
1812 sector_t start, sector_t len, void *data)
1813{
1814 struct request_queue *q = bdev_get_queue(dev->bdev);
1815
1816 return !blk_queue_add_random(q);
1817}
1818
1819static int device_not_write_zeroes_capable(struct dm_target *ti, struct dm_dev *dev,
1820 sector_t start, sector_t len, void *data)
1821{
1822 struct request_queue *q = bdev_get_queue(dev->bdev);
1823
1824 return !q->limits.max_write_zeroes_sectors;
1825}
1826
1827static bool dm_table_supports_write_zeroes(struct dm_table *t)
1828{
1829 for (unsigned int i = 0; i < t->num_targets; i++) {
1830 struct dm_target *ti = dm_table_get_target(t, i);
1831
1832 if (!ti->num_write_zeroes_bios)
1833 return false;
1834
1835 if (!ti->type->iterate_devices ||
1836 ti->type->iterate_devices(ti, device_not_write_zeroes_capable, NULL))
1837 return false;
1838 }
1839
1840 return true;
1841}
1842
1843static int device_not_nowait_capable(struct dm_target *ti, struct dm_dev *dev,
1844 sector_t start, sector_t len, void *data)
1845{
1846 return !bdev_nowait(dev->bdev);
1847}
1848
1849static bool dm_table_supports_nowait(struct dm_table *t)
1850{
1851 for (unsigned int i = 0; i < t->num_targets; i++) {
1852 struct dm_target *ti = dm_table_get_target(t, i);
1853
1854 if (!dm_target_supports_nowait(ti->type))
1855 return false;
1856
1857 if (!ti->type->iterate_devices ||
1858 ti->type->iterate_devices(ti, device_not_nowait_capable, NULL))
1859 return false;
1860 }
1861
1862 return true;
1863}
1864
1865static int device_not_discard_capable(struct dm_target *ti, struct dm_dev *dev,
1866 sector_t start, sector_t len, void *data)
1867{
1868 return !bdev_max_discard_sectors(dev->bdev);
1869}
1870
1871static bool dm_table_supports_discards(struct dm_table *t)
1872{
1873 for (unsigned int i = 0; i < t->num_targets; i++) {
1874 struct dm_target *ti = dm_table_get_target(t, i);
1875
1876 if (!ti->num_discard_bios)
1877 return false;
1878
1879 /*
1880 * Either the target provides discard support (as implied by setting
1881 * 'discards_supported') or it relies on _all_ data devices having
1882 * discard support.
1883 */
1884 if (!ti->discards_supported &&
1885 (!ti->type->iterate_devices ||
1886 ti->type->iterate_devices(ti, device_not_discard_capable, NULL)))
1887 return false;
1888 }
1889
1890 return true;
1891}
1892
1893static int device_not_secure_erase_capable(struct dm_target *ti,
1894 struct dm_dev *dev, sector_t start,
1895 sector_t len, void *data)
1896{
1897 return !bdev_max_secure_erase_sectors(dev->bdev);
1898}
1899
1900static bool dm_table_supports_secure_erase(struct dm_table *t)
1901{
1902 for (unsigned int i = 0; i < t->num_targets; i++) {
1903 struct dm_target *ti = dm_table_get_target(t, i);
1904
1905 if (!ti->num_secure_erase_bios)
1906 return false;
1907
1908 if (!ti->type->iterate_devices ||
1909 ti->type->iterate_devices(ti, device_not_secure_erase_capable, NULL))
1910 return false;
1911 }
1912
1913 return true;
1914}
1915
1916static int device_requires_stable_pages(struct dm_target *ti,
1917 struct dm_dev *dev, sector_t start,
1918 sector_t len, void *data)
1919{
1920 return bdev_stable_writes(dev->bdev);
1921}
1922
1923int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
1924 struct queue_limits *limits)
1925{
1926 bool wc = false, fua = false;
1927 int r;
1928
1929 /*
1930 * Copy table's limits to the DM device's request_queue
1931 */
1932 q->limits = *limits;
1933
1934 if (dm_table_supports_nowait(t))
1935 blk_queue_flag_set(QUEUE_FLAG_NOWAIT, q);
1936 else
1937 blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, q);
1938
1939 if (!dm_table_supports_discards(t)) {
1940 q->limits.max_discard_sectors = 0;
1941 q->limits.max_hw_discard_sectors = 0;
1942 q->limits.discard_granularity = 0;
1943 q->limits.discard_alignment = 0;
1944 q->limits.discard_misaligned = 0;
1945 }
1946
1947 if (!dm_table_supports_secure_erase(t))
1948 q->limits.max_secure_erase_sectors = 0;
1949
1950 if (dm_table_supports_flush(t, (1UL << QUEUE_FLAG_WC))) {
1951 wc = true;
1952 if (dm_table_supports_flush(t, (1UL << QUEUE_FLAG_FUA)))
1953 fua = true;
1954 }
1955 blk_queue_write_cache(q, wc, fua);
1956
1957 if (dm_table_supports_dax(t, device_not_dax_capable)) {
1958 blk_queue_flag_set(QUEUE_FLAG_DAX, q);
1959 if (dm_table_supports_dax(t, device_not_dax_synchronous_capable))
1960 set_dax_synchronous(t->md->dax_dev);
1961 } else
1962 blk_queue_flag_clear(QUEUE_FLAG_DAX, q);
1963
1964 if (dm_table_any_dev_attr(t, device_dax_write_cache_enabled, NULL))
1965 dax_write_cache(t->md->dax_dev, true);
1966
1967 /* Ensure that all underlying devices are non-rotational. */
1968 if (dm_table_any_dev_attr(t, device_is_rotational, NULL))
1969 blk_queue_flag_clear(QUEUE_FLAG_NONROT, q);
1970 else
1971 blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
1972
1973 if (!dm_table_supports_write_zeroes(t))
1974 q->limits.max_write_zeroes_sectors = 0;
1975
1976 dm_table_verify_integrity(t);
1977
1978 /*
1979 * Some devices don't use blk_integrity but still want stable pages
1980 * because they do their own checksumming.
1981 * If any underlying device requires stable pages, a table must require
1982 * them as well. Only targets that support iterate_devices are considered:
1983 * don't want error, zero, etc to require stable pages.
1984 */
1985 if (dm_table_any_dev_attr(t, device_requires_stable_pages, NULL))
1986 blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q);
1987 else
1988 blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, q);
1989
1990 /*
1991 * Determine whether or not this queue's I/O timings contribute
1992 * to the entropy pool, Only request-based targets use this.
1993 * Clear QUEUE_FLAG_ADD_RANDOM if any underlying device does not
1994 * have it set.
1995 */
1996 if (blk_queue_add_random(q) &&
1997 dm_table_any_dev_attr(t, device_is_not_random, NULL))
1998 blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, q);
1999
2000 /*
2001 * For a zoned target, setup the zones related queue attributes
2002 * and resources necessary for zone append emulation if necessary.
2003 */
2004 if (blk_queue_is_zoned(q)) {
2005 r = dm_set_zones_restrictions(t, q);
2006 if (r)
2007 return r;
2008 if (!static_key_enabled(&zoned_enabled.key))
2009 static_branch_enable(&zoned_enabled);
2010 }
2011
2012 dm_update_crypto_profile(q, t);
2013 disk_update_readahead(t->md->disk);
2014
2015 /*
2016 * Check for request-based device is left to
2017 * dm_mq_init_request_queue()->blk_mq_init_allocated_queue().
2018 *
2019 * For bio-based device, only set QUEUE_FLAG_POLL when all
2020 * underlying devices supporting polling.
2021 */
2022 if (__table_type_bio_based(t->type)) {
2023 if (dm_table_supports_poll(t))
2024 blk_queue_flag_set(QUEUE_FLAG_POLL, q);
2025 else
2026 blk_queue_flag_clear(QUEUE_FLAG_POLL, q);
2027 }
2028
2029 return 0;
2030}
2031
/* Return a pointer to the table's device list head (t->devices). */
struct list_head *dm_table_get_devices(struct dm_table *t)
{
	return &t->devices;
}
2036
/* Return the blk_mode_t stored in the table (t->mode). */
blk_mode_t dm_table_get_mode(struct dm_table *t)
{
	return t->mode;
}
EXPORT_SYMBOL(dm_table_get_mode);
2042
/* Selects which per-target hook suspend_targets() invokes. */
enum suspend_mode {
	PRESUSPEND,		/* call the target type's ->presuspend */
	PRESUSPEND_UNDO,	/* call the target type's ->presuspend_undo */
	POSTSUSPEND,		/* call the target type's ->postsuspend */
};
2048
2049static void suspend_targets(struct dm_table *t, enum suspend_mode mode)
2050{
2051 lockdep_assert_held(&t->md->suspend_lock);
2052
2053 for (unsigned int i = 0; i < t->num_targets; i++) {
2054 struct dm_target *ti = dm_table_get_target(t, i);
2055
2056 switch (mode) {
2057 case PRESUSPEND:
2058 if (ti->type->presuspend)
2059 ti->type->presuspend(ti);
2060 break;
2061 case PRESUSPEND_UNDO:
2062 if (ti->type->presuspend_undo)
2063 ti->type->presuspend_undo(ti);
2064 break;
2065 case POSTSUSPEND:
2066 if (ti->type->postsuspend)
2067 ti->type->postsuspend(ti);
2068 break;
2069 }
2070 }
2071}
2072
2073void dm_table_presuspend_targets(struct dm_table *t)
2074{
2075 if (!t)
2076 return;
2077
2078 suspend_targets(t, PRESUSPEND);
2079}
2080
2081void dm_table_presuspend_undo_targets(struct dm_table *t)
2082{
2083 if (!t)
2084 return;
2085
2086 suspend_targets(t, PRESUSPEND_UNDO);
2087}
2088
2089void dm_table_postsuspend_targets(struct dm_table *t)
2090{
2091 if (!t)
2092 return;
2093
2094 suspend_targets(t, POSTSUSPEND);
2095}
2096
2097int dm_table_resume_targets(struct dm_table *t)
2098{
2099 unsigned int i;
2100 int r = 0;
2101
2102 lockdep_assert_held(&t->md->suspend_lock);
2103
2104 for (i = 0; i < t->num_targets; i++) {
2105 struct dm_target *ti = dm_table_get_target(t, i);
2106
2107 if (!ti->type->preresume)
2108 continue;
2109
2110 r = ti->type->preresume(ti);
2111 if (r) {
2112 DMERR("%s: %s: preresume failed, error = %d",
2113 dm_device_name(t->md), ti->type->name, r);
2114 return r;
2115 }
2116 }
2117
2118 for (i = 0; i < t->num_targets; i++) {
2119 struct dm_target *ti = dm_table_get_target(t, i);
2120
2121 if (ti->type->resume)
2122 ti->type->resume(ti);
2123 }
2124
2125 return 0;
2126}
2127
/* Return the mapped_device this table belongs to (t->md). */
struct mapped_device *dm_table_get_md(struct dm_table *t)
{
	return t->md;
}
EXPORT_SYMBOL(dm_table_get_md);
2133
/* Return dm_device_name() of the mapped_device this table belongs to. */
const char *dm_table_device_name(struct dm_table *t)
{
	return dm_device_name(t->md);
}
EXPORT_SYMBOL_GPL(dm_table_device_name);
2139
2140void dm_table_run_md_queue_async(struct dm_table *t)
2141{
2142 if (!dm_table_request_based(t))
2143 return;
2144
2145 if (t->md->queue)
2146 blk_mq_run_hw_queues(t->md->queue, true);
2147}
2148EXPORT_SYMBOL(dm_table_run_md_queue_async);
2149