drivers/md/dm.c at v6.4-rc3

tjh.dev / kernel
fork
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
fork
kernel / drivers / md / dm.c
at v6.4-rc3 3422 lines 81 kB view raw
wrap content
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
   4 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
   5 *
   6 * This file is released under the GPL.
   7 */
   8
   9#include "dm-core.h"
  10#include "dm-rq.h"
  11#include "dm-uevent.h"
  12#include "dm-ima.h"
  13
  14#include <linux/init.h>
  15#include <linux/module.h>
  16#include <linux/mutex.h>
  17#include <linux/sched/mm.h>
  18#include <linux/sched/signal.h>
  19#include <linux/blkpg.h>
  20#include <linux/bio.h>
  21#include <linux/mempool.h>
  22#include <linux/dax.h>
  23#include <linux/slab.h>
  24#include <linux/idr.h>
  25#include <linux/uio.h>
  26#include <linux/hdreg.h>
  27#include <linux/delay.h>
  28#include <linux/wait.h>
  29#include <linux/pr.h>
  30#include <linux/refcount.h>
  31#include <linux/part_stat.h>
  32#include <linux/blk-crypto.h>
  33#include <linux/blk-crypto-profile.h>
  34
/*
 * Subsystem tag for this file; presumably picked up by the DM* logging
 * macros (DMINFO, DMERR, ...) - TODO confirm against their definition.
 */
#define DM_MSG_PREFIX "core"

/*
 * Cookies are numeric values sent with CHANGE and REMOVE
 * uevents while resuming, removing or renaming the device.
 */
#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
#define DM_COOKIE_LENGTH 24

/*
 * For REQ_POLLED fs bio, this flag is set if we link mapped underlying
 * dm_io into one list, and reuse bio->bi_private as the list head. Before
 * ending this fs bio, we will recover its ->bi_private.
 *
 * alloc_tio() clears this flag on freshly cloned bios so that clones do
 * not inherit it.
 */
#define REQ_DM_POLL_LIST	REQ_DRV
  50
/* Name handed to register_blkdev()/unregister_blkdev(). */
static const char *_name = DM_NAME;

/*
 * 'major' is the requested major number (presumably settable from outside
 * this view, e.g. as a module parameter - TODO confirm).  '_major' is the
 * value in use: local_init() copies 'major' into it and, when that was 0,
 * replaces it with the value returned by register_blkdev().
 */
static unsigned int major;
static unsigned int _major;

/* IDR destroyed in dm_exit(); presumably tracks allocated minors. */
static DEFINE_IDR(_minor_idr);

/*
 * Held by dm_blk_open(), dm_blk_close(), dm_lock_for_deletion() and
 * dm_cancel_deferred_remove() around their flag/open-count updates.
 */
static DEFINE_SPINLOCK(_minor_lock);

static void do_deferred_remove(struct work_struct *w);

/* Queued by dm_blk_close() on the last close of a deferred-remove device. */
static DECLARE_WORK(deferred_remove_work, do_deferred_remove);

/* "kdmremove" workqueue, created in local_init(), destroyed in local_exit(). */
static struct workqueue_struct *deferred_remove_workqueue;

/* Incremented by dm_issue_global_event(), which then wakes dm_global_eventq. */
atomic_t dm_global_event_nr = ATOMIC_INIT(0);
DECLARE_WAIT_QUEUE_HEAD(dm_global_eventq);
  68
/*
 * Record a global event: bump dm_global_event_nr, then wake anything
 * sleeping on dm_global_eventq.  The counter is updated before the wakeup.
 */
void dm_issue_global_event(void)
{
	atomic_inc(&dm_global_event_nr);
	wake_up(&dm_global_eventq);
}
  74
/*
 * Static branch keys, all initially false, gating optional work:
 *  - stats_enabled:     dm-stats calls in alloc_io() and dm_io_acct()
 *  - swap_bios_enabled: swap-bio semaphore release in clone_endio()
 *  - zoned_enabled:     zoned-device handling in clone_endio()
 * Where they get enabled is outside this view.
 */
DEFINE_STATIC_KEY_FALSE(stats_enabled);
DEFINE_STATIC_KEY_FALSE(swap_bios_enabled);
DEFINE_STATIC_KEY_FALSE(zoned_enabled);
  78
/*
 * One of these is allocated (on-stack) per original bio.
 */
struct clone_info {
	struct dm_table *map;		/* presumably the live table being mapped against - TODO confirm */
	struct bio *bio;		/* bio that alloc_tio() clones from */
	struct dm_io *io;		/* per-original-bio dm_io; supplies md and the embedded tio */
	sector_t sector;		/* presumably the current position in the bio - TODO confirm */
	unsigned int sector_count;	/* presumably the sectors still to map - TODO confirm */
	bool is_abnormal_io:1;		/* users are outside this view */
	bool submit_as_polled:1;	/* users are outside this view */
};
  91
  92static inline struct dm_target_io *clone_to_tio(struct bio *clone)
  93{
  94	return container_of(clone, struct dm_target_io, clone);
  95}
  96
  97void *dm_per_bio_data(struct bio *bio, size_t data_size)
  98{
  99	if (!dm_tio_flagged(clone_to_tio(bio), DM_TIO_INSIDE_DM_IO))
 100		return (char *)bio - DM_TARGET_IO_BIO_OFFSET - data_size;
 101	return (char *)bio - DM_IO_BIO_OFFSET - data_size;
 102}
 103EXPORT_SYMBOL_GPL(dm_per_bio_data);
 104
 105struct bio *dm_bio_from_per_bio_data(void *data, size_t data_size)
 106{
 107	struct dm_io *io = (struct dm_io *)((char *)data + data_size);
 108
 109	if (io->magic == DM_IO_MAGIC)
 110		return (struct bio *)((char *)io + DM_IO_BIO_OFFSET);
 111	BUG_ON(io->magic != DM_TIO_MAGIC);
 112	return (struct bio *)((char *)io + DM_TARGET_IO_BIO_OFFSET);
 113}
 114EXPORT_SYMBOL_GPL(dm_bio_from_per_bio_data);
 115
 116unsigned int dm_bio_get_target_bio_nr(const struct bio *bio)
 117{
 118	return container_of(bio, struct dm_target_io, clone)->target_bio_nr;
 119}
 120EXPORT_SYMBOL_GPL(dm_bio_get_target_bio_nr);
 121
/* Sentinel pointer value; its users are outside this view. */
#define MINOR_ALLOCED ((void *)-1)

/* Default and current NUMA node setting, read via dm_get_numa_node(). */
#define DM_NUMA_NODE NUMA_NO_NODE
static int dm_numa_node = DM_NUMA_NODE;

/* Default swap-bio limit: 8 MiB expressed in pages. */
#define DEFAULT_SWAP_BIOS	(8 * 1048576 / PAGE_SIZE)
static int swap_bios = DEFAULT_SWAP_BIOS;
 129static int get_swap_bios(void)
 130{
 131	int latch = READ_ONCE(swap_bios);
 132
 133	if (unlikely(latch <= 0))
 134		latch = DEFAULT_SWAP_BIOS;
 135	return latch;
 136}
 137
/*
 * A block device opened on behalf of a mapped_device's table.
 */
struct table_device {
	struct list_head list;	/* entry on md->table_devices */
	refcount_t count;	/* 1 on open; +1 per extra dm_get_table_device() */
	struct dm_dev dm_dev;	/* handle returned to callers of dm_get_table_device() */
};
 143
/*
 * Bio-based DM's mempools' reserved IOs set by the user.
 * Read (and sanitised) through dm_get_reserved_bio_based_ios().
 */
#define RESERVED_BIO_BASED_IOS		16
static unsigned int reserved_bio_based_ios = RESERVED_BIO_BASED_IOS;
 149
 150static int __dm_get_module_param_int(int *module_param, int min, int max)
 151{
 152	int param = READ_ONCE(*module_param);
 153	int modified_param = 0;
 154	bool modified = true;
 155
 156	if (param < min)
 157		modified_param = min;
 158	else if (param > max)
 159		modified_param = max;
 160	else
 161		modified = false;
 162
 163	if (modified) {
 164		(void)cmpxchg(module_param, param, modified_param);
 165		param = modified_param;
 166	}
 167
 168	return param;
 169}
 170
 171unsigned int __dm_get_module_param(unsigned int *module_param, unsigned int def, unsigned int max)
 172{
 173	unsigned int param = READ_ONCE(*module_param);
 174	unsigned int modified_param = 0;
 175
 176	if (!param)
 177		modified_param = def;
 178	else if (param > max)
 179		modified_param = max;
 180
 181	if (modified_param) {
 182		(void)cmpxchg(module_param, param, modified_param);
 183		param = modified_param;
 184	}
 185
 186	return param;
 187}
 188
 189unsigned int dm_get_reserved_bio_based_ios(void)
 190{
 191	return __dm_get_module_param(&reserved_bio_based_ios,
 192				     RESERVED_BIO_BASED_IOS, DM_RESERVED_MAX_IOS);
 193}
 194EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios);
 195
 196static unsigned int dm_get_numa_node(void)
 197{
 198	return __dm_get_module_param_int(&dm_numa_node,
 199					 DM_NUMA_NODE, num_online_nodes() - 1);
 200}
 201
 202static int __init local_init(void)
 203{
 204	int r;
 205
 206	r = dm_uevent_init();
 207	if (r)
 208		return r;
 209
 210	deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1);
 211	if (!deferred_remove_workqueue) {
 212		r = -ENOMEM;
 213		goto out_uevent_exit;
 214	}
 215
 216	_major = major;
 217	r = register_blkdev(_major, _name);
 218	if (r < 0)
 219		goto out_free_workqueue;
 220
 221	if (!_major)
 222		_major = r;
 223
 224	return 0;
 225
 226out_free_workqueue:
 227	destroy_workqueue(deferred_remove_workqueue);
 228out_uevent_exit:
 229	dm_uevent_exit();
 230
 231	return r;
 232}
 233
/*
 * Undo local_init(): destroy the deferred-remove workqueue, unregister
 * the block major, shut down uevent support and reset _major to 0.
 */
static void local_exit(void)
{
	destroy_workqueue(deferred_remove_workqueue);

	unregister_blkdev(_major, _name);
	dm_uevent_exit();

	_major = 0;

	DMINFO("cleaned up");
}
 245
/*
 * Sub-component constructors, called in order by dm_init().
 */
static int (*_inits[])(void) __initdata = {
	local_init,
	dm_target_init,
	dm_linear_init,
	dm_stripe_init,
	dm_io_init,
	dm_kcopyd_init,
	dm_interface_init,
	dm_statistics_init,
};

/*
 * Matching destructors.  Entry i must undo _inits[i]: dm_init() unwinds a
 * failed start-up by index and dm_exit() walks this array in reverse.
 */
static void (*_exits[])(void) = {
	local_exit,
	dm_target_exit,
	dm_linear_exit,
	dm_stripe_exit,
	dm_io_exit,
	dm_kcopyd_exit,
	dm_interface_exit,
	dm_statistics_exit,
};
 267
 268static int __init dm_init(void)
 269{
 270	const int count = ARRAY_SIZE(_inits);
 271	int r, i;
 272
 273#if (IS_ENABLED(CONFIG_IMA) && !IS_ENABLED(CONFIG_IMA_DISABLE_HTABLE))
 274	DMWARN("CONFIG_IMA_DISABLE_HTABLE is disabled."
 275	       " Duplicate IMA measurements will not be recorded in the IMA log.");
 276#endif
 277
 278	for (i = 0; i < count; i++) {
 279		r = _inits[i]();
 280		if (r)
 281			goto bad;
 282	}
 283
 284	return 0;
 285bad:
 286	while (i--)
 287		_exits[i]();
 288
 289	return r;
 290}
 291
 292static void __exit dm_exit(void)
 293{
 294	int i = ARRAY_SIZE(_exits);
 295
 296	while (i--)
 297		_exits[i]();
 298
 299	/*
 300	 * Should be empty by this point.
 301	 */
 302	idr_destroy(&_minor_idr);
 303}
 304
 305/*
 306 * Block device functions
 307 */
 308int dm_deleting_md(struct mapped_device *md)
 309{
 310	return test_bit(DMF_DELETING, &md->flags);
 311}
 312
 313static int dm_blk_open(struct block_device *bdev, fmode_t mode)
 314{
 315	struct mapped_device *md;
 316
 317	spin_lock(&_minor_lock);
 318
 319	md = bdev->bd_disk->private_data;
 320	if (!md)
 321		goto out;
 322
 323	if (test_bit(DMF_FREEING, &md->flags) ||
 324	    dm_deleting_md(md)) {
 325		md = NULL;
 326		goto out;
 327	}
 328
 329	dm_get(md);
 330	atomic_inc(&md->open_count);
 331out:
 332	spin_unlock(&_minor_lock);
 333
 334	return md ? 0 : -ENXIO;
 335}
 336
 337static void dm_blk_close(struct gendisk *disk, fmode_t mode)
 338{
 339	struct mapped_device *md;
 340
 341	spin_lock(&_minor_lock);
 342
 343	md = disk->private_data;
 344	if (WARN_ON(!md))
 345		goto out;
 346
 347	if (atomic_dec_and_test(&md->open_count) &&
 348	    (test_bit(DMF_DEFERRED_REMOVE, &md->flags)))
 349		queue_work(deferred_remove_workqueue, &deferred_remove_work);
 350
 351	dm_put(md);
 352out:
 353	spin_unlock(&_minor_lock);
 354}
 355
 356int dm_open_count(struct mapped_device *md)
 357{
 358	return atomic_read(&md->open_count);
 359}
 360
 361/*
 362 * Guarantees nothing is using the device before it's deleted.
 363 */
 364int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred)
 365{
 366	int r = 0;
 367
 368	spin_lock(&_minor_lock);
 369
 370	if (dm_open_count(md)) {
 371		r = -EBUSY;
 372		if (mark_deferred)
 373			set_bit(DMF_DEFERRED_REMOVE, &md->flags);
 374	} else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags))
 375		r = -EEXIST;
 376	else
 377		set_bit(DMF_DELETING, &md->flags);
 378
 379	spin_unlock(&_minor_lock);
 380
 381	return r;
 382}
 383
 384int dm_cancel_deferred_remove(struct mapped_device *md)
 385{
 386	int r = 0;
 387
 388	spin_lock(&_minor_lock);
 389
 390	if (test_bit(DMF_DELETING, &md->flags))
 391		r = -EBUSY;
 392	else
 393		clear_bit(DMF_DEFERRED_REMOVE, &md->flags);
 394
 395	spin_unlock(&_minor_lock);
 396
 397	return r;
 398}
 399
/*
 * Work function of deferred_remove_work: hands off to
 * dm_deferred_remove().  The work_struct argument is unused.
 */
static void do_deferred_remove(struct work_struct *w)
{
	dm_deferred_remove();
}
 404
 405static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 406{
 407	struct mapped_device *md = bdev->bd_disk->private_data;
 408
 409	return dm_get_geometry(md, geo);
 410}
 411
 412static int dm_prepare_ioctl(struct mapped_device *md, int *srcu_idx,
 413			    struct block_device **bdev)
 414{
 415	struct dm_target *ti;
 416	struct dm_table *map;
 417	int r;
 418
 419retry:
 420	r = -ENOTTY;
 421	map = dm_get_live_table(md, srcu_idx);
 422	if (!map || !dm_table_get_size(map))
 423		return r;
 424
 425	/* We only support devices that have a single target */
 426	if (map->num_targets != 1)
 427		return r;
 428
 429	ti = dm_table_get_target(map, 0);
 430	if (!ti->type->prepare_ioctl)
 431		return r;
 432
 433	if (dm_suspended_md(md))
 434		return -EAGAIN;
 435
 436	r = ti->type->prepare_ioctl(ti, bdev);
 437	if (r == -ENOTCONN && !fatal_signal_pending(current)) {
 438		dm_put_live_table(md, *srcu_idx);
 439		fsleep(10000);
 440		goto retry;
 441	}
 442
 443	return r;
 444}
 445
/*
 * Counterpart of dm_prepare_ioctl(): release the live table reference
 * identified by @srcu_idx.
 */
static void dm_unprepare_ioctl(struct mapped_device *md, int srcu_idx)
{
	dm_put_live_table(md, srcu_idx);
}
 450
 451static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
 452			unsigned int cmd, unsigned long arg)
 453{
 454	struct mapped_device *md = bdev->bd_disk->private_data;
 455	int r, srcu_idx;
 456
 457	r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
 458	if (r < 0)
 459		goto out;
 460
 461	if (r > 0) {
 462		/*
 463		 * Target determined this ioctl is being issued against a
 464		 * subset of the parent bdev; require extra privileges.
 465		 */
 466		if (!capable(CAP_SYS_RAWIO)) {
 467			DMDEBUG_LIMIT(
 468	"%s: sending ioctl %x to DM device without required privilege.",
 469				current->comm, cmd);
 470			r = -ENOIOCTLCMD;
 471			goto out;
 472		}
 473	}
 474
 475	if (!bdev->bd_disk->fops->ioctl)
 476		r = -ENOTTY;
 477	else
 478		r = bdev->bd_disk->fops->ioctl(bdev, mode, cmd, arg);
 479out:
 480	dm_unprepare_ioctl(md, srcu_idx);
 481	return r;
 482}
 483
 484u64 dm_start_time_ns_from_clone(struct bio *bio)
 485{
 486	return jiffies_to_nsecs(clone_to_tio(bio)->io->start_time);
 487}
 488EXPORT_SYMBOL_GPL(dm_start_time_ns_from_clone);
 489
 490static bool bio_is_flush_with_data(struct bio *bio)
 491{
 492	return ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size);
 493}
 494
/*
 * Account the start (@end == false) or completion (@end == true) of @io.
 *
 * Block-layer accounting is always done against the original bio's
 * device.  dm-stats accounting is done in addition when the
 * stats_enabled key is on and the device's stats are in use.
 */
static void dm_io_acct(struct dm_io *io, bool end)
{
	struct dm_stats_aux *stats_aux = &io->stats_aux;
	unsigned long start_time = io->start_time;
	struct mapped_device *md = io->md;
	struct bio *bio = io->orig_bio;
	unsigned int sectors;

	/*
	 * If REQ_PREFLUSH set, don't account payload, it will be
	 * submitted (and accounted) after this flush completes.
	 */
	if (bio_is_flush_with_data(bio))
		sectors = 0;
	else if (likely(!(dm_io_flagged(io, DM_IO_WAS_SPLIT))))
		sectors = bio_sectors(bio);
	else
		/* split io: use the size recorded on the dm_io instead */
		sectors = io->sectors;

	if (!end)
		bdev_start_io_acct(bio->bi_bdev, bio_op(bio), start_time);
	else
		bdev_end_io_acct(bio->bi_bdev, bio_op(bio), sectors,
				 start_time);

	if (static_branch_unlikely(&stats_enabled) &&
	    unlikely(dm_stats_used(&md->stats))) {
		sector_t sector;

		/* split io: derive the start sector from the bio end and the recorded offset */
		if (likely(!dm_io_flagged(io, DM_IO_WAS_SPLIT)))
			sector = bio->bi_iter.bi_sector;
		else
			sector = bio_end_sector(bio) - io->sector_offset;

		dm_stats_account_io(&md->stats, bio_data_dir(bio),
				    sector, sectors,
				    end, start_time, stats_aux);
	}
}
 534
/*
 * Unconditionally account the start of @io.  Does not look at or set
 * DM_IO_ACCOUNTED; see dm_start_io_acct() for the guarded variant.
 */
static void __dm_start_io_acct(struct dm_io *io)
{
	dm_io_acct(io, false);
}
 539
/*
 * Account the start of @io at most once, using DM_IO_ACCOUNTED as the
 * "already done" marker.
 *
 * @clone may be NULL.  When it is NULL or its tio is "normal", the flag is
 * set without locking; otherwise the flag is re-checked and set under
 * io->lock.
 */
static void dm_start_io_acct(struct dm_io *io, struct bio *clone)
{
	/*
	 * Ensure IO accounting is only ever started once.
	 */
	if (dm_io_flagged(io, DM_IO_ACCOUNTED))
		return;

	/* Expect no possibility for race unless DM_TIO_IS_DUPLICATE_BIO. */
	if (!clone || likely(dm_tio_is_normal(clone_to_tio(clone)))) {
		dm_io_set_flag(io, DM_IO_ACCOUNTED);
	} else {
		unsigned long flags;
		/* Can afford locking given DM_TIO_IS_DUPLICATE_BIO */
		spin_lock_irqsave(&io->lock, flags);
		if (dm_io_flagged(io, DM_IO_ACCOUNTED)) {
			/* another duplicate bio got here first */
			spin_unlock_irqrestore(&io->lock, flags);
			return;
		}
		dm_io_set_flag(io, DM_IO_ACCOUNTED);
		spin_unlock_irqrestore(&io->lock, flags);
	}

	__dm_start_io_acct(io);
}
 565
/* Account the completion of @io. */
static void dm_end_io_acct(struct dm_io *io)
{
	dm_io_acct(io, true);
}
 570
/*
 * Allocate and initialise the dm_io that tracks the original bio @bio.
 *
 * The dm_io is not allocated directly: a clone of @bio is taken from
 * md->mempools->io_bs and the dm_io is the structure that embeds that
 * clone (through its embedded dm_target_io).  Release with free_io().
 *
 * NOTE(review): the result of bio_alloc_clone() is used unchecked;
 * presumably GFP_NOIO on this bio_set cannot fail - confirm.
 */
static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio)
{
	struct dm_io *io;
	struct dm_target_io *tio;
	struct bio *clone;

	clone = bio_alloc_clone(NULL, bio, GFP_NOIO, &md->mempools->io_bs);
	tio = clone_to_tio(clone);
	tio->flags = 0;
	dm_tio_set_flag(tio, DM_TIO_INSIDE_DM_IO);
	/* NULL marks the embedded tio as still unused; see alloc_tio() */
	tio->io = NULL;

	io = container_of(tio, struct dm_io, tio);
	io->magic = DM_IO_MAGIC;
	io->status = BLK_STS_OK;

	/* one ref is for submission, the other is for completion */
	atomic_set(&io->io_count, 2);
	this_cpu_inc(*md->pending_io);
	io->orig_bio = bio;
	io->md = md;
	spin_lock_init(&io->lock);
	io->start_time = jiffies;
	io->flags = 0;

	if (static_branch_unlikely(&stats_enabled))
		dm_stats_record_start(&md->stats, &io->stats_aux);

	return io;
}
 601
/*
 * Release a dm_io obtained from alloc_io() by dropping the reference on
 * its embedded clone bio.
 */
static void free_io(struct dm_io *io)
{
	bio_put(&io->tio.clone);
}
 606
/*
 * Get a clone bio (and its dm_target_io) for sending part of ci->bio to
 * target @ti.
 *
 * The first request for a given dm_io reuses the dm_target_io embedded in
 * it; later requests clone ci->bio from md->mempools->bs using @gfp_mask.
 *
 * @target_bio_nr is stored in the tio.  @len, when non-NULL, is stored as
 * tio->len_ptr and *@len (in sectors) becomes the clone's size.
 *
 * Returns the clone, or NULL if a new clone could not be allocated.
 */
static struct bio *alloc_tio(struct clone_info *ci, struct dm_target *ti,
			     unsigned int target_bio_nr, unsigned int *len, gfp_t gfp_mask)
{
	struct mapped_device *md = ci->io->md;
	struct dm_target_io *tio;
	struct bio *clone;

	if (!ci->io->tio.io) {
		/* the dm_target_io embedded in ci->io is available */
		tio = &ci->io->tio;
		/* alloc_io() already initialized embedded clone */
		clone = &tio->clone;
	} else {
		clone = bio_alloc_clone(NULL, ci->bio, gfp_mask,
					&md->mempools->bs);
		if (!clone)
			return NULL;

		/* REQ_DM_POLL_LIST shouldn't be inherited */
		clone->bi_opf &= ~REQ_DM_POLL_LIST;

		tio = clone_to_tio(clone);
		tio->flags = 0; /* also clears DM_TIO_INSIDE_DM_IO */
	}

	tio->magic = DM_TIO_MAGIC;
	/* a non-NULL tio->io also marks the embedded tio as taken */
	tio->io = ci->io;
	tio->ti = ti;
	tio->target_bio_nr = target_bio_nr;
	tio->len_ptr = len;
	tio->old_sector = 0;

	/* Set default bdev, but target must bio_set_dev() before issuing IO */
	clone->bi_bdev = md->disk->part0;
	if (unlikely(ti->needs_bio_set_dev))
		bio_set_dev(clone, md->disk->part0);

	if (len) {
		clone->bi_iter.bi_size = to_bytes(*len);
		if (bio_integrity(clone))
			bio_integrity_trim(clone);
	}

	return clone;
}
 652
 653static void free_tio(struct bio *clone)
 654{
 655	if (dm_tio_flagged(clone_to_tio(clone), DM_TIO_INSIDE_DM_IO))
 656		return;
 657	bio_put(clone);
 658}
 659
 660/*
 661 * Add the bio to the list of deferred io.
 662 */
 663static void queue_io(struct mapped_device *md, struct bio *bio)
 664{
 665	unsigned long flags;
 666
 667	spin_lock_irqsave(&md->deferred_lock, flags);
 668	bio_list_add(&md->deferred, bio);
 669	spin_unlock_irqrestore(&md->deferred_lock, flags);
 670	queue_work(md->wq, &md->work);
 671}
 672
/*
 * Everyone (including functions in this file), should use this
 * function to access the md->map field, and make sure they call
 * dm_put_live_table() when finished.
 *
 * Enters an SRCU read-side section on md->io_barrier and stores its index
 * in *@srcu_idx.  The returned table may be NULL; the read-side section
 * is held either way.
 */
struct dm_table *dm_get_live_table(struct mapped_device *md,
				   int *srcu_idx) __acquires(md->io_barrier)
{
	*srcu_idx = srcu_read_lock(&md->io_barrier);

	return srcu_dereference(md->map, &md->io_barrier);
}
 685
/*
 * Leave the SRCU read-side section entered by dm_get_live_table().
 * @srcu_idx must be the index that call stored.
 */
void dm_put_live_table(struct mapped_device *md,
		       int srcu_idx) __releases(md->io_barrier)
{
	srcu_read_unlock(&md->io_barrier, srcu_idx);
}
 691
/*
 * Wait for an SRCU grace period on md->io_barrier and then for an
 * expedited RCU grace period.  dm_get_live_table() readers use the
 * former, dm_get_live_table_fast() readers the latter.
 */
void dm_sync_table(struct mapped_device *md)
{
	synchronize_srcu(&md->io_barrier);
	synchronize_rcu_expedited();
}
 697
/*
 * A fast alternative to dm_get_live_table/dm_put_live_table.
 * The caller must not block between these two functions.
 *
 * Uses plain RCU (rcu_read_lock()) instead of SRCU.  The returned table
 * may be NULL; the RCU read-side section is held either way.
 */
static struct dm_table *dm_get_live_table_fast(struct mapped_device *md) __acquires(RCU)
{
	rcu_read_lock();
	return rcu_dereference(md->map);
}
 707
/*
 * Leave the RCU read-side section entered by dm_get_live_table_fast().
 * @md is unused.
 */
static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU)
{
	rcu_read_unlock();
}
 712
 713static inline struct dm_table *dm_get_live_table_bio(struct mapped_device *md,
 714					int *srcu_idx, blk_opf_t bio_opf)
 715{
 716	if (bio_opf & REQ_NOWAIT)
 717		return dm_get_live_table_fast(md);
 718	else
 719		return dm_get_live_table(md, srcu_idx);
 720}
 721
 722static inline void dm_put_live_table_bio(struct mapped_device *md, int srcu_idx,
 723					 blk_opf_t bio_opf)
 724{
 725	if (bio_opf & REQ_NOWAIT)
 726		dm_put_live_table_fast(md);
 727	else
 728		dm_put_live_table(md, srcu_idx);
 729}
 730
/* Holder cookie passed to blkdev_get_by_dev() for FMODE_EXCL opens. */
static char *_dm_claim_ptr = "I belong to device-mapper";
 732
/*
 * Open a table device so we can use it as a map destination.
 *
 * Opens @dev exclusively with @mode, links it as a holder of md's disk
 * when possible, looks up its DAX device and adds the new entry (with a
 * reference count of 1) to md->table_devices.
 *
 * Returns the new table_device or an ERR_PTR(); on error nothing is left
 * open or allocated.  The only caller in view, dm_get_table_device(),
 * holds md->table_devices_lock.
 */
static struct table_device *open_table_device(struct mapped_device *md,
		dev_t dev, fmode_t mode)
{
	struct table_device *td;
	struct block_device *bdev;
	u64 part_off;
	int r;

	td = kmalloc_node(sizeof(*td), GFP_KERNEL, md->numa_node_id);
	if (!td)
		return ERR_PTR(-ENOMEM);
	refcount_set(&td->count, 1);

	bdev = blkdev_get_by_dev(dev, mode | FMODE_EXCL, _dm_claim_ptr);
	if (IS_ERR(bdev)) {
		r = PTR_ERR(bdev);
		goto out_free_td;
	}

	/*
	 * We can be called before the dm disk is added.  In that case we can't
	 * register the holder relation here.  It will be done once add_disk was
	 * called.
	 */
	if (md->disk->slave_dir) {
		r = bd_link_disk_holder(bdev, md->disk);
		if (r)
			goto out_blkdev_put;
	}

	td->dm_dev.mode = mode;
	td->dm_dev.bdev = bdev;
	/* part_off is required by the call but its value is not used here */
	td->dm_dev.dax_dev = fs_dax_get_by_bdev(bdev, &part_off, NULL, NULL);
	format_dev_t(td->dm_dev.name, dev);
	list_add(&td->list, &md->table_devices);
	return td;

out_blkdev_put:
	blkdev_put(bdev, mode | FMODE_EXCL);
out_free_td:
	kfree(td);
	return ERR_PTR(r);
}
 779
/*
 * Close a table device that we've been using.
 *
 * Reverses open_table_device(): unlink the holder relation (if the disk
 * has a slave_dir), release the exclusive open, drop the DAX device,
 * remove @td from md->table_devices and free it.
 */
static void close_table_device(struct table_device *td, struct mapped_device *md)
{
	if (md->disk->slave_dir)
		bd_unlink_disk_holder(td->dm_dev.bdev, md->disk);
	blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL);
	put_dax(td->dm_dev.dax_dev);
	list_del(&td->list);
	kfree(td);
}
 792
 793static struct table_device *find_table_device(struct list_head *l, dev_t dev,
 794					      fmode_t mode)
 795{
 796	struct table_device *td;
 797
 798	list_for_each_entry(td, l, list)
 799		if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode == mode)
 800			return td;
 801
 802	return NULL;
 803}
 804
 805int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode,
 806			struct dm_dev **result)
 807{
 808	struct table_device *td;
 809
 810	mutex_lock(&md->table_devices_lock);
 811	td = find_table_device(&md->table_devices, dev, mode);
 812	if (!td) {
 813		td = open_table_device(md, dev, mode);
 814		if (IS_ERR(td)) {
 815			mutex_unlock(&md->table_devices_lock);
 816			return PTR_ERR(td);
 817		}
 818	} else {
 819		refcount_inc(&td->count);
 820	}
 821	mutex_unlock(&md->table_devices_lock);
 822
 823	*result = &td->dm_dev;
 824	return 0;
 825}
 826
 827void dm_put_table_device(struct mapped_device *md, struct dm_dev *d)
 828{
 829	struct table_device *td = container_of(d, struct table_device, dm_dev);
 830
 831	mutex_lock(&md->table_devices_lock);
 832	if (refcount_dec_and_test(&td->count))
 833		close_table_device(td, md);
 834	mutex_unlock(&md->table_devices_lock);
 835}
 836
/*
 * Get the geometry associated with a dm device
 *
 * Copies md->geometry into *@geo.  Always returns 0.
 */
int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
{
	*geo = md->geometry;

	return 0;
}
 846
 847/*
 848 * Set the geometry of a device.
 849 */
 850int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
 851{
 852	sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;
 853
 854	if (geo->start > sz) {
 855		DMERR("Start sector is beyond the geometry limits.");
 856		return -EINVAL;
 857	}
 858
 859	md->geometry = *geo;
 860
 861	return 0;
 862}
 863
/* Return non-zero if @md has DMF_NOFLUSH_SUSPENDING set. */
static int __noflush_suspending(struct mapped_device *md)
{
	return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
}
 868
 869static void dm_requeue_add_io(struct dm_io *io, bool first_stage)
 870{
 871	struct mapped_device *md = io->md;
 872
 873	if (first_stage) {
 874		struct dm_io *next = md->requeue_list;
 875
 876		md->requeue_list = io;
 877		io->next = next;
 878	} else {
 879		bio_list_add_head(&md->deferred, io->orig_bio);
 880	}
 881}
 882
 883static void dm_kick_requeue(struct mapped_device *md, bool first_stage)
 884{
 885	if (first_stage)
 886		queue_work(md->wq, &md->requeue_work);
 887	else
 888		queue_work(md->wq, &md->work);
 889}
 890
/*
 * Return true if the dm_io's original bio is requeued.
 * io->status is updated with error if requeue disallowed.
 *
 * Only acts when io->status is BLK_STS_DM_REQUEUE, or BLK_STS_AGAIN on a
 * REQ_POLLED bio; any other status returns false with nothing changed.
 * @first_stage selects which list the io is recorded on and which worker
 * is kicked (see dm_requeue_add_io() and dm_kick_requeue()).
 */
static bool dm_handle_requeue(struct dm_io *io, bool first_stage)
{
	struct bio *bio = io->orig_bio;
	bool handle_requeue = (io->status == BLK_STS_DM_REQUEUE);
	bool handle_polled_eagain = ((io->status == BLK_STS_AGAIN) &&
				     (bio->bi_opf & REQ_POLLED));
	struct mapped_device *md = io->md;
	bool requeued = false;

	if (handle_requeue || handle_polled_eagain) {
		unsigned long flags;

		if (bio->bi_opf & REQ_POLLED) {
			/*
			 * Upper layer won't help us poll split bio
			 * (io->orig_bio may only reflect a subset of the
			 * pre-split original) so clear REQ_POLLED.
			 */
			bio_clear_polled(bio);
		}

		/*
		 * Target requested pushing back the I/O or
		 * polled IO hit BLK_STS_AGAIN.
		 */
		spin_lock_irqsave(&md->deferred_lock, flags);
		if ((__noflush_suspending(md) &&
		     !WARN_ON_ONCE(dm_is_zone_write(md, bio))) ||
		    handle_polled_eagain || first_stage) {
			dm_requeue_add_io(io, first_stage);
			requeued = true;
		} else {
			/*
			 * noflush suspend was interrupted or this is
			 * a write to a zoned target.
			 */
			io->status = BLK_STS_IOERR;
		}
		spin_unlock_irqrestore(&md->deferred_lock, flags);
	}

	/* kick the worker only after deferred_lock has been dropped */
	if (requeued)
		dm_kick_requeue(md, first_stage);

	return requeued;
}
 941
/*
 * Finish @io once its last reference is gone.
 *
 * In order: try to requeue (a first-stage requeue keeps the dm_io alive
 * and returns immediately), finish I/O accounting, free the dm_io, drop
 * the device's pending count and wake suspend waiters, and finally either
 * reissue a flush-with-data bio without REQ_PREFLUSH or end the original
 * bio with the recorded status.
 */
static void __dm_io_complete(struct dm_io *io, bool first_stage)
{
	/* cache these now: io is freed before they are last used */
	struct bio *bio = io->orig_bio;
	struct mapped_device *md = io->md;
	blk_status_t io_error;
	bool requeued;

	requeued = dm_handle_requeue(io, first_stage);
	if (requeued && first_stage)
		return;

	io_error = io->status;
	if (dm_io_flagged(io, DM_IO_ACCOUNTED))
		dm_end_io_acct(io);
	else if (!io_error) {
		/*
		 * Must handle target that DM_MAPIO_SUBMITTED only to
		 * then bio_endio() rather than dm_submit_bio_remap()
		 */
		__dm_start_io_acct(io);
		dm_end_io_acct(io);
	}
	free_io(io);
	/* NOTE(review): presumably orders the above before the pending_io decrement seen by a suspend waiter - confirm */
	smp_wmb();
	this_cpu_dec(*md->pending_io);

	/* nudge anyone waiting on suspend queue */
	if (unlikely(wq_has_sleeper(&md->wait)))
		wake_up(&md->wait);

	/* Return early if the original bio was requeued */
	if (requeued)
		return;

	if (bio_is_flush_with_data(bio)) {
		/*
		 * Preflush done for flush with data, reissue
		 * without REQ_PREFLUSH.
		 */
		bio->bi_opf &= ~REQ_PREFLUSH;
		queue_io(md, bio);
	} else {
		/* done with normal IO or empty flush */
		if (io_error)
			bio->bi_status = io_error;
		bio_endio(bio);
	}
}
 990
 991static void dm_wq_requeue_work(struct work_struct *work)
 992{
 993	struct mapped_device *md = container_of(work, struct mapped_device,
 994						requeue_work);
 995	unsigned long flags;
 996	struct dm_io *io;
 997
 998	/* reuse deferred lock to simplify dm_handle_requeue */
 999	spin_lock_irqsave(&md->deferred_lock, flags);
1000	io = md->requeue_list;
1001	md->requeue_list = NULL;
1002	spin_unlock_irqrestore(&md->deferred_lock, flags);
1003
1004	while (io) {
1005		struct dm_io *next = io->next;
1006
1007		dm_io_rewind(io, &md->disk->bio_split);
1008
1009		io->next = NULL;
1010		__dm_io_complete(io, false);
1011		io = next;
1012		cond_resched();
1013	}
1014}
1015
1016/*
1017 * Two staged requeue:
1018 *
1019 * 1) io->orig_bio points to the real original bio, and the part mapped to
1020 *    this io must be requeued, instead of other parts of the original bio.
1021 *
1022 * 2) io->orig_bio points to new cloned bio which matches the requeued dm_io.
1023 */
1024static void dm_io_complete(struct dm_io *io)
1025{
1026	bool first_requeue;
1027
1028	/*
1029	 * Only dm_io that has been split needs two stage requeue, otherwise
1030	 * we may run into long bio clone chain during suspend and OOM could
1031	 * be triggered.
1032	 *
1033	 * Also flush data dm_io won't be marked as DM_IO_WAS_SPLIT, so they
1034	 * also aren't handled via the first stage requeue.
1035	 */
1036	if (dm_io_flagged(io, DM_IO_WAS_SPLIT))
1037		first_requeue = true;
1038	else
1039		first_requeue = false;
1040
1041	__dm_io_complete(io, first_requeue);
1042}
1043
1044/*
1045 * Decrements the number of outstanding ios that a bio has been
1046 * cloned into, completing the original io if necc.
1047 */
1048static inline void __dm_io_dec_pending(struct dm_io *io)
1049{
1050	if (atomic_dec_and_test(&io->io_count))
1051		dm_io_complete(io);
1052}
1053
1054static void dm_io_set_error(struct dm_io *io, blk_status_t error)
1055{
1056	unsigned long flags;
1057
1058	/* Push-back supersedes any I/O errors */
1059	spin_lock_irqsave(&io->lock, flags);
1060	if (!(io->status == BLK_STS_DM_REQUEUE &&
1061	      __noflush_suspending(io->md))) {
1062		io->status = error;
1063	}
1064	spin_unlock_irqrestore(&io->lock, flags);
1065}
1066
/*
 * Drop one reference on @io, first recording @error on it when @error is
 * non-zero.  Dropping the last reference completes the io.
 */
static void dm_io_dec_pending(struct dm_io *io, blk_status_t error)
{
	if (unlikely(error))
		dm_io_set_error(io, error);

	__dm_io_dec_pending(io);
}
1074
/*
 * The queue_limits are only valid as long as you have a reference
 * count on 'md'.  This is deliberately _not_ verified here, to avoid an
 * atomic_read().
 */
static inline struct queue_limits *dm_get_queue_limits(struct mapped_device *md)
{
	return &md->queue->limits;
}
1083
1084void disable_discard(struct mapped_device *md)
1085{
1086	struct queue_limits *limits = dm_get_queue_limits(md);
1087
1088	/* device doesn't really support DISCARD, disable it */
1089	limits->max_discard_sectors = 0;
1090}
1091
1092void disable_write_zeroes(struct mapped_device *md)
1093{
1094	struct queue_limits *limits = dm_get_queue_limits(md);
1095
1096	/* device doesn't really support WRITE ZEROES, disable it */
1097	limits->max_write_zeroes_sectors = 0;
1098}
1099
1100static bool swap_bios_limit(struct dm_target *ti, struct bio *bio)
1101{
1102	return unlikely((bio->bi_opf & REQ_SWAP) != 0) && unlikely(ti->limit_swap_bios);
1103}
1104
/*
 * Completion handler for a clone bio.
 *
 * Steps, in order:
 *  - on BLK_STS_TARGET for a DISCARD / WRITE ZEROES whose underlying
 *    device reports no support, disable that operation on the dm device;
 *  - run zoned-device completion handling when applicable;
 *  - give the target's end_io method (if any) the chance to change the
 *    status, request a requeue, or take over the bio entirely;
 *  - release the swap-bio semaphore slot if this bio was counted;
 *  - free the clone and drop its reference on the dm_io.
 */
static void clone_endio(struct bio *bio)
{
	blk_status_t error = bio->bi_status;
	struct dm_target_io *tio = clone_to_tio(bio);
	struct dm_target *ti = tio->ti;
	dm_endio_fn endio = ti->type->end_io;
	struct dm_io *io = tio->io;
	struct mapped_device *md = io->md;

	if (unlikely(error == BLK_STS_TARGET)) {
		if (bio_op(bio) == REQ_OP_DISCARD &&
		    !bdev_max_discard_sectors(bio->bi_bdev))
			disable_discard(md);
		else if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
			 !bdev_write_zeroes_sectors(bio->bi_bdev))
			disable_write_zeroes(md);
	}

	if (static_branch_unlikely(&zoned_enabled) &&
	    unlikely(bdev_is_zoned(bio->bi_bdev)))
		dm_zone_endio(io, bio);

	if (endio) {
		/* the target may rewrite 'error' through the pointer */
		int r = endio(ti, bio, &error);

		switch (r) {
		case DM_ENDIO_REQUEUE:
			if (static_branch_unlikely(&zoned_enabled)) {
				/*
				 * Requeuing writes to a sequential zone of a zoned
				 * target will break the sequential write pattern:
				 * fail such IO.
				 */
				if (WARN_ON_ONCE(dm_is_zone_write(md, bio)))
					error = BLK_STS_IOERR;
				else
					error = BLK_STS_DM_REQUEUE;
			} else
				error = BLK_STS_DM_REQUEUE;
			fallthrough;
		case DM_ENDIO_DONE:
			break;
		case DM_ENDIO_INCOMPLETE:
			/* The target will handle the io */
			return;
		default:
			DMCRIT("unimplemented target endio return value: %d", r);
			BUG();
		}
	}

	if (static_branch_unlikely(&swap_bios_enabled) &&
	    unlikely(swap_bios_limit(ti, bio)))
		up(&md->swap_bios_semaphore);

	/* io and error were read out above, so freeing the clone first is fine */
	free_tio(bio);
	dm_io_dec_pending(io, error);
}
1163
/*
 * Return maximum size of I/O possible at the supplied sector up to the current
 * target boundary.
 *
 * @target_offset is relative to the start of @ti; the result is the
 * number of sectors from there to the end of the target (ti->len).
 */
static inline sector_t max_io_len_target_boundary(struct dm_target *ti,
						  sector_t target_offset)
{
	return ti->len - target_offset;
}
1173
/*
 * Return how many sectors an I/O starting at @sector may cover within
 * target @ti.
 *
 * With @max_granularity == 0 this is simply the distance to the end of
 * the target.  Otherwise it is the smallest of: that distance, the dm
 * queue's max sectors, and the sectors left in the current
 * @max_granularity-sized chunk (measured from the target-relative
 * offset).
 */
static sector_t __max_io_len(struct dm_target *ti, sector_t sector,
			     unsigned int max_granularity)
{
	sector_t target_offset = dm_target_offset(ti, sector);
	sector_t len = max_io_len_target_boundary(ti, target_offset);

	/*
	 * Does the target need to split IO even further?
	 * - varied (per target) IO splitting is a tenet of DM; this
	 *   explains why stacked chunk_sectors based splitting via
	 *   bio_split_to_limits() isn't possible here.
	 */
	if (!max_granularity)
		return len;
	return min_t(sector_t, len,
		min(queue_max_sectors(ti->table->md->queue),
		    blk_chunk_sectors_left(target_offset, max_granularity)));
}
1192
1193static inline sector_t max_io_len(struct dm_target *ti, sector_t sector)
1194{
1195	return __max_io_len(ti, sector, ti->max_io_len);
1196}
1197
1198int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
1199{
1200	if (len > UINT_MAX) {
1201		DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)",
1202		      (unsigned long long)len, UINT_MAX);
1203		ti->error = "Maximum size of target IO is too large";
1204		return -EINVAL;
1205	}
1206
1207	ti->max_io_len = (uint32_t) len;
1208
1209	return 0;
1210}
1211EXPORT_SYMBOL_GPL(dm_set_target_max_io_len);
1212
1213static struct dm_target *dm_dax_get_live_target(struct mapped_device *md,
1214						sector_t sector, int *srcu_idx)
1215	__acquires(md->io_barrier)
1216{
1217	struct dm_table *map;
1218	struct dm_target *ti;
1219
1220	map = dm_get_live_table(md, srcu_idx);
1221	if (!map)
1222		return NULL;
1223
1224	ti = dm_table_find_target(map, sector);
1225	if (!ti)
1226		return NULL;
1227
1228	return ti;
1229}
1230
1231static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
1232		long nr_pages, enum dax_access_mode mode, void **kaddr,
1233		pfn_t *pfn)
1234{
1235	struct mapped_device *md = dax_get_private(dax_dev);
1236	sector_t sector = pgoff * PAGE_SECTORS;
1237	struct dm_target *ti;
1238	long len, ret = -EIO;
1239	int srcu_idx;
1240
1241	ti = dm_dax_get_live_target(md, sector, &srcu_idx);
1242
1243	if (!ti)
1244		goto out;
1245	if (!ti->type->direct_access)
1246		goto out;
1247	len = max_io_len(ti, sector) / PAGE_SECTORS;
1248	if (len < 1)
1249		goto out;
1250	nr_pages = min(len, nr_pages);
1251	ret = ti->type->direct_access(ti, pgoff, nr_pages, mode, kaddr, pfn);
1252
1253 out:
1254	dm_put_live_table(md, srcu_idx);
1255
1256	return ret;
1257}
1258
1259static int dm_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
1260				  size_t nr_pages)
1261{
1262	struct mapped_device *md = dax_get_private(dax_dev);
1263	sector_t sector = pgoff * PAGE_SECTORS;
1264	struct dm_target *ti;
1265	int ret = -EIO;
1266	int srcu_idx;
1267
1268	ti = dm_dax_get_live_target(md, sector, &srcu_idx);
1269
1270	if (!ti)
1271		goto out;
1272	if (WARN_ON(!ti->type->dax_zero_page_range)) {
1273		/*
1274		 * ->zero_page_range() is mandatory dax operation. If we are
1275		 *  here, something is wrong.
1276		 */
1277		goto out;
1278	}
1279	ret = ti->type->dax_zero_page_range(ti, pgoff, nr_pages);
1280 out:
1281	dm_put_live_table(md, srcu_idx);
1282
1283	return ret;
1284}
1285
1286static size_t dm_dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff,
1287		void *addr, size_t bytes, struct iov_iter *i)
1288{
1289	struct mapped_device *md = dax_get_private(dax_dev);
1290	sector_t sector = pgoff * PAGE_SECTORS;
1291	struct dm_target *ti;
1292	int srcu_idx;
1293	long ret = 0;
1294
1295	ti = dm_dax_get_live_target(md, sector, &srcu_idx);
1296	if (!ti || !ti->type->dax_recovery_write)
1297		goto out;
1298
1299	ret = ti->type->dax_recovery_write(ti, pgoff, addr, bytes, i);
1300out:
1301	dm_put_live_table(md, srcu_idx);
1302	return ret;
1303}
1304
1305/*
1306 * A target may call dm_accept_partial_bio only from the map routine.  It is
1307 * allowed for all bio types except REQ_PREFLUSH, REQ_OP_ZONE_* zone management
1308 * operations, REQ_OP_ZONE_APPEND (zone append writes) and any bio serviced by
1309 * __send_duplicate_bios().
1310 *
1311 * dm_accept_partial_bio informs the dm that the target only wants to process
1312 * additional n_sectors sectors of the bio and the rest of the data should be
1313 * sent in a next bio.
1314 *
1315 * A diagram that explains the arithmetics:
1316 * +--------------------+---------------+-------+
1317 * |         1          |       2       |   3   |
1318 * +--------------------+---------------+-------+
1319 *
1320 * <-------------- *tio->len_ptr --------------->
1321 *                      <----- bio_sectors ----->
1322 *                      <-- n_sectors -->
1323 *
1324 * Region 1 was already iterated over with bio_advance or similar function.
1325 *	(it may be empty if the target doesn't use bio_advance)
1326 * Region 2 is the remaining bio size that the target wants to process.
1327 *	(it may be empty if region 1 is non-empty, although there is no reason
1328 *	 to make it empty)
1329 * The target requires that region 3 is to be sent in the next bio.
1330 *
1331 * If the target wants to receive multiple copies of the bio (via num_*bios, etc),
1332 * the partially processed part (the sum of regions 1+2) must be the same for all
1333 * copies of the bio.
1334 */
1335void dm_accept_partial_bio(struct bio *bio, unsigned int n_sectors)
1336{
1337	struct dm_target_io *tio = clone_to_tio(bio);
1338	struct dm_io *io = tio->io;
1339	unsigned int bio_sectors = bio_sectors(bio);
1340
1341	BUG_ON(dm_tio_flagged(tio, DM_TIO_IS_DUPLICATE_BIO));
1342	BUG_ON(op_is_zone_mgmt(bio_op(bio)));
1343	BUG_ON(bio_op(bio) == REQ_OP_ZONE_APPEND);
1344	BUG_ON(bio_sectors > *tio->len_ptr);
1345	BUG_ON(n_sectors > bio_sectors);
1346
1347	*tio->len_ptr -= bio_sectors - n_sectors;
1348	bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT;
1349
1350	/*
1351	 * __split_and_process_bio() may have already saved mapped part
1352	 * for accounting but it is being reduced so update accordingly.
1353	 */
1354	dm_io_set_flag(io, DM_IO_WAS_SPLIT);
1355	io->sectors = n_sectors;
1356	io->sector_offset = bio_sectors(io->orig_bio);
1357}
1358EXPORT_SYMBOL_GPL(dm_accept_partial_bio);
1359
1360/*
1361 * @clone: clone bio that DM core passed to target's .map function
1362 * @tgt_clone: clone of @clone bio that target needs submitted
1363 *
1364 * Targets should use this interface to submit bios they take
1365 * ownership of when returning DM_MAPIO_SUBMITTED.
1366 *
1367 * Target should also enable ti->accounts_remapped_io
1368 */
1369void dm_submit_bio_remap(struct bio *clone, struct bio *tgt_clone)
1370{
1371	struct dm_target_io *tio = clone_to_tio(clone);
1372	struct dm_io *io = tio->io;
1373
1374	/* establish bio that will get submitted */
1375	if (!tgt_clone)
1376		tgt_clone = clone;
1377
1378	/*
1379	 * Account io->origin_bio to DM dev on behalf of target
1380	 * that took ownership of IO with DM_MAPIO_SUBMITTED.
1381	 */
1382	dm_start_io_acct(io, clone);
1383
1384	trace_block_bio_remap(tgt_clone, disk_devt(io->md->disk),
1385			      tio->old_sector);
1386	submit_bio_noacct(tgt_clone);
1387}
1388EXPORT_SYMBOL_GPL(dm_submit_bio_remap);
1389
1390static noinline void __set_swap_bios_limit(struct mapped_device *md, int latch)
1391{
1392	mutex_lock(&md->swap_bios_lock);
1393	while (latch < md->swap_bios) {
1394		cond_resched();
1395		down(&md->swap_bios_semaphore);
1396		md->swap_bios--;
1397	}
1398	while (latch > md->swap_bios) {
1399		cond_resched();
1400		up(&md->swap_bios_semaphore);
1401		md->swap_bios++;
1402	}
1403	mutex_unlock(&md->swap_bios_lock);
1404}
1405
1406static void __map_bio(struct bio *clone)
1407{
1408	struct dm_target_io *tio = clone_to_tio(clone);
1409	struct dm_target *ti = tio->ti;
1410	struct dm_io *io = tio->io;
1411	struct mapped_device *md = io->md;
1412	int r;
1413
1414	clone->bi_end_io = clone_endio;
1415
1416	/*
1417	 * Map the clone.
1418	 */
1419	tio->old_sector = clone->bi_iter.bi_sector;
1420
1421	if (static_branch_unlikely(&swap_bios_enabled) &&
1422	    unlikely(swap_bios_limit(ti, clone))) {
1423		int latch = get_swap_bios();
1424
1425		if (unlikely(latch != md->swap_bios))
1426			__set_swap_bios_limit(md, latch);
1427		down(&md->swap_bios_semaphore);
1428	}
1429
1430	if (static_branch_unlikely(&zoned_enabled)) {
1431		/*
1432		 * Check if the IO needs a special mapping due to zone append
1433		 * emulation on zoned target. In this case, dm_zone_map_bio()
1434		 * calls the target map operation.
1435		 */
1436		if (unlikely(dm_emulate_zone_append(md)))
1437			r = dm_zone_map_bio(tio);
1438		else
1439			r = ti->type->map(ti, clone);
1440	} else
1441		r = ti->type->map(ti, clone);
1442
1443	switch (r) {
1444	case DM_MAPIO_SUBMITTED:
1445		/* target has assumed ownership of this io */
1446		if (!ti->accounts_remapped_io)
1447			dm_start_io_acct(io, clone);
1448		break;
1449	case DM_MAPIO_REMAPPED:
1450		dm_submit_bio_remap(clone, NULL);
1451		break;
1452	case DM_MAPIO_KILL:
1453	case DM_MAPIO_REQUEUE:
1454		if (static_branch_unlikely(&swap_bios_enabled) &&
1455		    unlikely(swap_bios_limit(ti, clone)))
1456			up(&md->swap_bios_semaphore);
1457		free_tio(clone);
1458		if (r == DM_MAPIO_KILL)
1459			dm_io_dec_pending(io, BLK_STS_IOERR);
1460		else
1461			dm_io_dec_pending(io, BLK_STS_DM_REQUEUE);
1462		break;
1463	default:
1464		DMCRIT("unimplemented target map return value: %d", r);
1465		BUG();
1466	}
1467}
1468
1469static void setup_split_accounting(struct clone_info *ci, unsigned int len)
1470{
1471	struct dm_io *io = ci->io;
1472
1473	if (ci->sector_count > len) {
1474		/*
1475		 * Split needed, save the mapped part for accounting.
1476		 * NOTE: dm_accept_partial_bio() will update accordingly.
1477		 */
1478		dm_io_set_flag(io, DM_IO_WAS_SPLIT);
1479		io->sectors = len;
1480		io->sector_offset = bio_sectors(ci->bio);
1481	}
1482}
1483
1484static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci,
1485				struct dm_target *ti, unsigned int num_bios,
1486				unsigned *len)
1487{
1488	struct bio *bio;
1489	int try;
1490
1491	for (try = 0; try < 2; try++) {
1492		int bio_nr;
1493
1494		if (try)
1495			mutex_lock(&ci->io->md->table_devices_lock);
1496		for (bio_nr = 0; bio_nr < num_bios; bio_nr++) {
1497			bio = alloc_tio(ci, ti, bio_nr, len,
1498					try ? GFP_NOIO : GFP_NOWAIT);
1499			if (!bio)
1500				break;
1501
1502			bio_list_add(blist, bio);
1503		}
1504		if (try)
1505			mutex_unlock(&ci->io->md->table_devices_lock);
1506		if (bio_nr == num_bios)
1507			return;
1508
1509		while ((bio = bio_list_pop(blist)))
1510			free_tio(bio);
1511	}
1512}
1513
1514static int __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
1515				 unsigned int num_bios, unsigned int *len)
1516{
1517	struct bio_list blist = BIO_EMPTY_LIST;
1518	struct bio *clone;
1519	unsigned int ret = 0;
1520
1521	switch (num_bios) {
1522	case 0:
1523		break;
1524	case 1:
1525		if (len)
1526			setup_split_accounting(ci, *len);
1527		clone = alloc_tio(ci, ti, 0, len, GFP_NOIO);
1528		__map_bio(clone);
1529		ret = 1;
1530		break;
1531	default:
1532		if (len)
1533			setup_split_accounting(ci, *len);
1534		/* dm_accept_partial_bio() is not supported with shared tio->len_ptr */
1535		alloc_multiple_bios(&blist, ci, ti, num_bios, len);
1536		while ((clone = bio_list_pop(&blist))) {
1537			dm_tio_set_flag(clone_to_tio(clone), DM_TIO_IS_DUPLICATE_BIO);
1538			__map_bio(clone);
1539			ret += 1;
1540		}
1541		break;
1542	}
1543
1544	return ret;
1545}
1546
1547static void __send_empty_flush(struct clone_info *ci)
1548{
1549	struct dm_table *t = ci->map;
1550	struct bio flush_bio;
1551
1552	/*
1553	 * Use an on-stack bio for this, it's safe since we don't
1554	 * need to reference it after submit. It's just used as
1555	 * the basis for the clone(s).
1556	 */
1557	bio_init(&flush_bio, ci->io->md->disk->part0, NULL, 0,
1558		 REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC);
1559
1560	ci->bio = &flush_bio;
1561	ci->sector_count = 0;
1562	ci->io->tio.clone.bi_iter.bi_size = 0;
1563
1564	for (unsigned int i = 0; i < t->num_targets; i++) {
1565		unsigned int bios;
1566		struct dm_target *ti = dm_table_get_target(t, i);
1567
1568		atomic_add(ti->num_flush_bios, &ci->io->io_count);
1569		bios = __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL);
1570		atomic_sub(ti->num_flush_bios - bios, &ci->io->io_count);
1571	}
1572
1573	/*
1574	 * alloc_io() takes one extra reference for submission, so the
1575	 * reference won't reach 0 without the following subtraction
1576	 */
1577	atomic_sub(1, &ci->io->io_count);
1578
1579	bio_uninit(ci->bio);
1580}
1581
1582static void __send_changing_extent_only(struct clone_info *ci, struct dm_target *ti,
1583					unsigned int num_bios,
1584					unsigned int max_granularity)
1585{
1586	unsigned int len, bios;
1587
1588	len = min_t(sector_t, ci->sector_count,
1589		    __max_io_len(ti, ci->sector, max_granularity));
1590
1591	atomic_add(num_bios, &ci->io->io_count);
1592	bios = __send_duplicate_bios(ci, ti, num_bios, &len);
1593	/*
1594	 * alloc_io() takes one extra reference for submission, so the
1595	 * reference won't reach 0 without the following (+1) subtraction
1596	 */
1597	atomic_sub(num_bios - bios + 1, &ci->io->io_count);
1598
1599	ci->sector += len;
1600	ci->sector_count -= len;
1601}
1602
1603static bool is_abnormal_io(struct bio *bio)
1604{
1605	enum req_op op = bio_op(bio);
1606
1607	if (op != REQ_OP_READ && op != REQ_OP_WRITE && op != REQ_OP_FLUSH) {
1608		switch (op) {
1609		case REQ_OP_DISCARD:
1610		case REQ_OP_SECURE_ERASE:
1611		case REQ_OP_WRITE_ZEROES:
1612			return true;
1613		default:
1614			break;
1615		}
1616	}
1617
1618	return false;
1619}
1620
1621static blk_status_t __process_abnormal_io(struct clone_info *ci,
1622					  struct dm_target *ti)
1623{
1624	unsigned int num_bios = 0;
1625	unsigned int max_granularity = 0;
1626	struct queue_limits *limits = dm_get_queue_limits(ti->table->md);
1627
1628	switch (bio_op(ci->bio)) {
1629	case REQ_OP_DISCARD:
1630		num_bios = ti->num_discard_bios;
1631		if (ti->max_discard_granularity)
1632			max_granularity = limits->max_discard_sectors;
1633		break;
1634	case REQ_OP_SECURE_ERASE:
1635		num_bios = ti->num_secure_erase_bios;
1636		if (ti->max_secure_erase_granularity)
1637			max_granularity = limits->max_secure_erase_sectors;
1638		break;
1639	case REQ_OP_WRITE_ZEROES:
1640		num_bios = ti->num_write_zeroes_bios;
1641		if (ti->max_write_zeroes_granularity)
1642			max_granularity = limits->max_write_zeroes_sectors;
1643		break;
1644	default:
1645		break;
1646	}
1647
1648	/*
1649	 * Even though the device advertised support for this type of
1650	 * request, that does not mean every target supports it, and
1651	 * reconfiguration might also have changed that since the
1652	 * check was performed.
1653	 */
1654	if (unlikely(!num_bios))
1655		return BLK_STS_NOTSUPP;
1656
1657	__send_changing_extent_only(ci, ti, num_bios, max_granularity);
1658	return BLK_STS_OK;
1659}
1660
1661/*
1662 * Reuse ->bi_private as dm_io list head for storing all dm_io instances
1663 * associated with this bio, and this bio's bi_private needs to be
1664 * stored in dm_io->data before the reuse.
1665 *
1666 * bio->bi_private is owned by fs or upper layer, so block layer won't
1667 * touch it after splitting. Meantime it won't be changed by anyone after
1668 * bio is submitted. So this reuse is safe.
1669 */
1670static inline struct dm_io **dm_poll_list_head(struct bio *bio)
1671{
1672	return (struct dm_io **)&bio->bi_private;
1673}
1674
1675static void dm_queue_poll_io(struct bio *bio, struct dm_io *io)
1676{
1677	struct dm_io **head = dm_poll_list_head(bio);
1678
1679	if (!(bio->bi_opf & REQ_DM_POLL_LIST)) {
1680		bio->bi_opf |= REQ_DM_POLL_LIST;
1681		/*
1682		 * Save .bi_private into dm_io, so that we can reuse
1683		 * .bi_private as dm_io list head for storing dm_io list
1684		 */
1685		io->data = bio->bi_private;
1686
1687		/* tell block layer to poll for completion */
1688		bio->bi_cookie = ~BLK_QC_T_NONE;
1689
1690		io->next = NULL;
1691	} else {
1692		/*
1693		 * bio recursed due to split, reuse original poll list,
1694		 * and save bio->bi_private too.
1695		 */
1696		io->data = (*head)->data;
1697		io->next = *head;
1698	}
1699
1700	*head = io;
1701}
1702
1703/*
1704 * Select the correct strategy for processing a non-flush bio.
1705 */
1706static blk_status_t __split_and_process_bio(struct clone_info *ci)
1707{
1708	struct bio *clone;
1709	struct dm_target *ti;
1710	unsigned int len;
1711
1712	ti = dm_table_find_target(ci->map, ci->sector);
1713	if (unlikely(!ti))
1714		return BLK_STS_IOERR;
1715
1716	if (unlikely((ci->bio->bi_opf & REQ_NOWAIT) != 0) &&
1717	    unlikely(!dm_target_supports_nowait(ti->type)))
1718		return BLK_STS_NOTSUPP;
1719
1720	if (unlikely(ci->is_abnormal_io))
1721		return __process_abnormal_io(ci, ti);
1722
1723	/*
1724	 * Only support bio polling for normal IO, and the target io is
1725	 * exactly inside the dm_io instance (verified in dm_poll_dm_io)
1726	 */
1727	ci->submit_as_polled = !!(ci->bio->bi_opf & REQ_POLLED);
1728
1729	len = min_t(sector_t, max_io_len(ti, ci->sector), ci->sector_count);
1730	setup_split_accounting(ci, len);
1731	clone = alloc_tio(ci, ti, 0, &len, GFP_NOIO);
1732	__map_bio(clone);
1733
1734	ci->sector += len;
1735	ci->sector_count -= len;
1736
1737	return BLK_STS_OK;
1738}
1739
1740static void init_clone_info(struct clone_info *ci, struct mapped_device *md,
1741			    struct dm_table *map, struct bio *bio, bool is_abnormal)
1742{
1743	ci->map = map;
1744	ci->io = alloc_io(md, bio);
1745	ci->bio = bio;
1746	ci->is_abnormal_io = is_abnormal;
1747	ci->submit_as_polled = false;
1748	ci->sector = bio->bi_iter.bi_sector;
1749	ci->sector_count = bio_sectors(bio);
1750
1751	/* Shouldn't happen but sector_count was being set to 0 so... */
1752	if (static_branch_unlikely(&zoned_enabled) &&
1753	    WARN_ON_ONCE(op_is_zone_mgmt(bio_op(bio)) && ci->sector_count))
1754		ci->sector_count = 0;
1755}
1756
1757/*
1758 * Entry point to split a bio into clones and submit them to the targets.
1759 */
1760static void dm_split_and_process_bio(struct mapped_device *md,
1761				     struct dm_table *map, struct bio *bio)
1762{
1763	struct clone_info ci;
1764	struct dm_io *io;
1765	blk_status_t error = BLK_STS_OK;
1766	bool is_abnormal;
1767
1768	is_abnormal = is_abnormal_io(bio);
1769	if (unlikely(is_abnormal)) {
1770		/*
1771		 * Use bio_split_to_limits() for abnormal IO (e.g. discard, etc)
1772		 * otherwise associated queue_limits won't be imposed.
1773		 */
1774		bio = bio_split_to_limits(bio);
1775		if (!bio)
1776			return;
1777	}
1778
1779	init_clone_info(&ci, md, map, bio, is_abnormal);
1780	io = ci.io;
1781
1782	if (bio->bi_opf & REQ_PREFLUSH) {
1783		__send_empty_flush(&ci);
1784		/* dm_io_complete submits any data associated with flush */
1785		goto out;
1786	}
1787
1788	error = __split_and_process_bio(&ci);
1789	if (error || !ci.sector_count)
1790		goto out;
1791	/*
1792	 * Remainder must be passed to submit_bio_noacct() so it gets handled
1793	 * *after* bios already submitted have been completely processed.
1794	 */
1795	bio_trim(bio, io->sectors, ci.sector_count);
1796	trace_block_split(bio, bio->bi_iter.bi_sector);
1797	bio_inc_remaining(bio);
1798	submit_bio_noacct(bio);
1799out:
1800	/*
1801	 * Drop the extra reference count for non-POLLED bio, and hold one
1802	 * reference for POLLED bio, which will be released in dm_poll_bio
1803	 *
1804	 * Add every dm_io instance into the dm_io list head which is stored
1805	 * in bio->bi_private, so that dm_poll_bio can poll them all.
1806	 */
1807	if (error || !ci.submit_as_polled) {
1808		/*
1809		 * In case of submission failure, the extra reference for
1810		 * submitting io isn't consumed yet
1811		 */
1812		if (error)
1813			atomic_dec(&io->io_count);
1814		dm_io_dec_pending(io, error);
1815	} else
1816		dm_queue_poll_io(bio, io);
1817}
1818
1819static void dm_submit_bio(struct bio *bio)
1820{
1821	struct mapped_device *md = bio->bi_bdev->bd_disk->private_data;
1822	int srcu_idx;
1823	struct dm_table *map;
1824	blk_opf_t bio_opf = bio->bi_opf;
1825
1826	map = dm_get_live_table_bio(md, &srcu_idx, bio_opf);
1827
1828	/* If suspended, or map not yet available, queue this IO for later */
1829	if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) ||
1830	    unlikely(!map)) {
1831		if (bio->bi_opf & REQ_NOWAIT)
1832			bio_wouldblock_error(bio);
1833		else if (bio->bi_opf & REQ_RAHEAD)
1834			bio_io_error(bio);
1835		else
1836			queue_io(md, bio);
1837		goto out;
1838	}
1839
1840	dm_split_and_process_bio(md, map, bio);
1841out:
1842	dm_put_live_table_bio(md, srcu_idx, bio_opf);
1843}
1844
1845static bool dm_poll_dm_io(struct dm_io *io, struct io_comp_batch *iob,
1846			  unsigned int flags)
1847{
1848	WARN_ON_ONCE(!dm_tio_is_normal(&io->tio));
1849
1850	/* don't poll if the mapped io is done */
1851	if (atomic_read(&io->io_count) > 1)
1852		bio_poll(&io->tio.clone, iob, flags);
1853
1854	/* bio_poll holds the last reference */
1855	return atomic_read(&io->io_count) == 1;
1856}
1857
1858static int dm_poll_bio(struct bio *bio, struct io_comp_batch *iob,
1859		       unsigned int flags)
1860{
1861	struct dm_io **head = dm_poll_list_head(bio);
1862	struct dm_io *list = *head;
1863	struct dm_io *tmp = NULL;
1864	struct dm_io *curr, *next;
1865
1866	/* Only poll normal bio which was marked as REQ_DM_POLL_LIST */
1867	if (!(bio->bi_opf & REQ_DM_POLL_LIST))
1868		return 0;
1869
1870	WARN_ON_ONCE(!list);
1871
1872	/*
1873	 * Restore .bi_private before possibly completing dm_io.
1874	 *
1875	 * bio_poll() is only possible once @bio has been completely
1876	 * submitted via submit_bio_noacct()'s depth-first submission.
1877	 * So there is no dm_queue_poll_io() race associated with
1878	 * clearing REQ_DM_POLL_LIST here.
1879	 */
1880	bio->bi_opf &= ~REQ_DM_POLL_LIST;
1881	bio->bi_private = list->data;
1882
1883	for (curr = list, next = curr->next; curr; curr = next, next =
1884			curr ? curr->next : NULL) {
1885		if (dm_poll_dm_io(curr, iob, flags)) {
1886			/*
1887			 * clone_endio() has already occurred, so no
1888			 * error handling is needed here.
1889			 */
1890			__dm_io_dec_pending(curr);
1891		} else {
1892			curr->next = tmp;
1893			tmp = curr;
1894		}
1895	}
1896
1897	/* Not done? */
1898	if (tmp) {
1899		bio->bi_opf |= REQ_DM_POLL_LIST;
1900		/* Reset bio->bi_private to dm_io list head */
1901		*head = tmp;
1902		return 0;
1903	}
1904	return 1;
1905}
1906
1907/*
1908 *---------------------------------------------------------------
1909 * An IDR is used to keep track of allocated minor numbers.
1910 *---------------------------------------------------------------
1911 */
1912static void free_minor(int minor)
1913{
1914	spin_lock(&_minor_lock);
1915	idr_remove(&_minor_idr, minor);
1916	spin_unlock(&_minor_lock);
1917}
1918
1919/*
1920 * See if the device with a specific minor # is free.
1921 */
1922static int specific_minor(int minor)
1923{
1924	int r;
1925
1926	if (minor >= (1 << MINORBITS))
1927		return -EINVAL;
1928
1929	idr_preload(GFP_KERNEL);
1930	spin_lock(&_minor_lock);
1931
1932	r = idr_alloc(&_minor_idr, MINOR_ALLOCED, minor, minor + 1, GFP_NOWAIT);
1933
1934	spin_unlock(&_minor_lock);
1935	idr_preload_end();
1936	if (r < 0)
1937		return r == -ENOSPC ? -EBUSY : r;
1938	return 0;
1939}
1940
1941static int next_free_minor(int *minor)
1942{
1943	int r;
1944
1945	idr_preload(GFP_KERNEL);
1946	spin_lock(&_minor_lock);
1947
1948	r = idr_alloc(&_minor_idr, MINOR_ALLOCED, 0, 1 << MINORBITS, GFP_NOWAIT);
1949
1950	spin_unlock(&_minor_lock);
1951	idr_preload_end();
1952	if (r < 0)
1953		return r;
1954	*minor = r;
1955	return 0;
1956}
1957
1958static const struct block_device_operations dm_blk_dops;
1959static const struct block_device_operations dm_rq_blk_dops;
1960static const struct dax_operations dm_dax_ops;
1961
1962static void dm_wq_work(struct work_struct *work);
1963
#ifdef CONFIG_BLK_INLINE_ENCRYPTION
/* Destroy the crypto profile attached to @q. */
static void dm_queue_destroy_crypto_profile(struct request_queue *q)
{
	struct blk_crypto_profile *profile = q->crypto_profile;

	dm_destroy_crypto_profile(profile);
}

#else /* CONFIG_BLK_INLINE_ENCRYPTION */

/* Without inline encryption support there is nothing to destroy. */
static inline void dm_queue_destroy_crypto_profile(struct request_queue *q)
{
}
#endif /* !CONFIG_BLK_INLINE_ENCRYPTION */
1976
1977static void cleanup_mapped_device(struct mapped_device *md)
1978{
1979	if (md->wq)
1980		destroy_workqueue(md->wq);
1981	dm_free_md_mempools(md->mempools);
1982
1983	if (md->dax_dev) {
1984		dax_remove_host(md->disk);
1985		kill_dax(md->dax_dev);
1986		put_dax(md->dax_dev);
1987		md->dax_dev = NULL;
1988	}
1989
1990	dm_cleanup_zoned_dev(md);
1991	if (md->disk) {
1992		spin_lock(&_minor_lock);
1993		md->disk->private_data = NULL;
1994		spin_unlock(&_minor_lock);
1995		if (dm_get_md_type(md) != DM_TYPE_NONE) {
1996			struct table_device *td;
1997
1998			dm_sysfs_exit(md);
1999			list_for_each_entry(td, &md->table_devices, list) {
2000				bd_unlink_disk_holder(td->dm_dev.bdev,
2001						      md->disk);
2002			}
2003
2004			/*
2005			 * Hold lock to make sure del_gendisk() won't concurrent
2006			 * with open/close_table_device().
2007			 */
2008			mutex_lock(&md->table_devices_lock);
2009			del_gendisk(md->disk);
2010			mutex_unlock(&md->table_devices_lock);
2011		}
2012		dm_queue_destroy_crypto_profile(md->queue);
2013		put_disk(md->disk);
2014	}
2015
2016	if (md->pending_io) {
2017		free_percpu(md->pending_io);
2018		md->pending_io = NULL;
2019	}
2020
2021	cleanup_srcu_struct(&md->io_barrier);
2022
2023	mutex_destroy(&md->suspend_lock);
2024	mutex_destroy(&md->type_lock);
2025	mutex_destroy(&md->table_devices_lock);
2026	mutex_destroy(&md->swap_bios_lock);
2027
2028	dm_mq_cleanup_mapped_device(md);
2029}
2030
2031/*
2032 * Allocate and initialise a blank device with a given minor.
2033 */
2034static struct mapped_device *alloc_dev(int minor)
2035{
2036	int r, numa_node_id = dm_get_numa_node();
2037	struct mapped_device *md;
2038	void *old_md;
2039
2040	md = kvzalloc_node(sizeof(*md), GFP_KERNEL, numa_node_id);
2041	if (!md) {
2042		DMERR("unable to allocate device, out of memory.");
2043		return NULL;
2044	}
2045
2046	if (!try_module_get(THIS_MODULE))
2047		goto bad_module_get;
2048
2049	/* get a minor number for the dev */
2050	if (minor == DM_ANY_MINOR)
2051		r = next_free_minor(&minor);
2052	else
2053		r = specific_minor(minor);
2054	if (r < 0)
2055		goto bad_minor;
2056
2057	r = init_srcu_struct(&md->io_barrier);
2058	if (r < 0)
2059		goto bad_io_barrier;
2060
2061	md->numa_node_id = numa_node_id;
2062	md->init_tio_pdu = false;
2063	md->type = DM_TYPE_NONE;
2064	mutex_init(&md->suspend_lock);
2065	mutex_init(&md->type_lock);
2066	mutex_init(&md->table_devices_lock);
2067	spin_lock_init(&md->deferred_lock);
2068	atomic_set(&md->holders, 1);
2069	atomic_set(&md->open_count, 0);
2070	atomic_set(&md->event_nr, 0);
2071	atomic_set(&md->uevent_seq, 0);
2072	INIT_LIST_HEAD(&md->uevent_list);
2073	INIT_LIST_HEAD(&md->table_devices);
2074	spin_lock_init(&md->uevent_lock);
2075
2076	/*
2077	 * default to bio-based until DM table is loaded and md->type
2078	 * established. If request-based table is loaded: blk-mq will
2079	 * override accordingly.
2080	 */
2081	md->disk = blk_alloc_disk(md->numa_node_id);
2082	if (!md->disk)
2083		goto bad;
2084	md->queue = md->disk->queue;
2085
2086	init_waitqueue_head(&md->wait);
2087	INIT_WORK(&md->work, dm_wq_work);
2088	INIT_WORK(&md->requeue_work, dm_wq_requeue_work);
2089	init_waitqueue_head(&md->eventq);
2090	init_completion(&md->kobj_holder.completion);
2091
2092	md->requeue_list = NULL;
2093	md->swap_bios = get_swap_bios();
2094	sema_init(&md->swap_bios_semaphore, md->swap_bios);
2095	mutex_init(&md->swap_bios_lock);
2096
2097	md->disk->major = _major;
2098	md->disk->first_minor = minor;
2099	md->disk->minors = 1;
2100	md->disk->flags |= GENHD_FL_NO_PART;
2101	md->disk->fops = &dm_blk_dops;
2102	md->disk->private_data = md;
2103	sprintf(md->disk->disk_name, "dm-%d", minor);
2104
2105	if (IS_ENABLED(CONFIG_FS_DAX)) {
2106		md->dax_dev = alloc_dax(md, &dm_dax_ops);
2107		if (IS_ERR(md->dax_dev)) {
2108			md->dax_dev = NULL;
2109			goto bad;
2110		}
2111		set_dax_nocache(md->dax_dev);
2112		set_dax_nomc(md->dax_dev);
2113		if (dax_add_host(md->dax_dev, md->disk))
2114			goto bad;
2115	}
2116
2117	format_dev_t(md->name, MKDEV(_major, minor));
2118
2119	md->wq = alloc_workqueue("kdmflush/%s", WQ_MEM_RECLAIM, 0, md->name);
2120	if (!md->wq)
2121		goto bad;
2122
2123	md->pending_io = alloc_percpu(unsigned long);
2124	if (!md->pending_io)
2125		goto bad;
2126
2127	r = dm_stats_init(&md->stats);
2128	if (r < 0)
2129		goto bad;
2130
2131	/* Populate the mapping, nobody knows we exist yet */
2132	spin_lock(&_minor_lock);
2133	old_md = idr_replace(&_minor_idr, md, minor);
2134	spin_unlock(&_minor_lock);
2135
2136	BUG_ON(old_md != MINOR_ALLOCED);
2137
2138	return md;
2139
2140bad:
2141	cleanup_mapped_device(md);
2142bad_io_barrier:
2143	free_minor(minor);
2144bad_minor:
2145	module_put(THIS_MODULE);
2146bad_module_get:
2147	kvfree(md);
2148	return NULL;
2149}
2150
2151static void unlock_fs(struct mapped_device *md);
2152
2153static void free_dev(struct mapped_device *md)
2154{
2155	int minor = MINOR(disk_devt(md->disk));
2156
2157	unlock_fs(md);
2158
2159	cleanup_mapped_device(md);
2160
2161	WARN_ON_ONCE(!list_empty(&md->table_devices));
2162	dm_stats_cleanup(&md->stats);
2163	free_minor(minor);
2164
2165	module_put(THIS_MODULE);
2166	kvfree(md);
2167}
2168
2169/*
2170 * Bind a table to the device.
2171 */
2172static void event_callback(void *context)
2173{
2174	unsigned long flags;
2175	LIST_HEAD(uevents);
2176	struct mapped_device *md = context;
2177
2178	spin_lock_irqsave(&md->uevent_lock, flags);
2179	list_splice_init(&md->uevent_list, &uevents);
2180	spin_unlock_irqrestore(&md->uevent_lock, flags);
2181
2182	dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj);
2183
2184	atomic_inc(&md->event_nr);
2185	wake_up(&md->eventq);
2186	dm_issue_global_event();
2187}
2188
2189/*
2190 * Returns old map, which caller must destroy.
2191 */
2192static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
2193			       struct queue_limits *limits)
2194{
2195	struct dm_table *old_map;
2196	sector_t size;
2197	int ret;
2198
2199	lockdep_assert_held(&md->suspend_lock);
2200
2201	size = dm_table_get_size(t);
2202
2203	/*
2204	 * Wipe any geometry if the size of the table changed.
2205	 */
2206	if (size != dm_get_size(md))
2207		memset(&md->geometry, 0, sizeof(md->geometry));
2208
2209	set_capacity(md->disk, size);
2210
2211	dm_table_event_callback(t, event_callback, md);
2212
2213	if (dm_table_request_based(t)) {
2214		/*
2215		 * Leverage the fact that request-based DM targets are
2216		 * immutable singletons - used to optimize dm_mq_queue_rq.
2217		 */
2218		md->immutable_target = dm_table_get_immutable_target(t);
2219
2220		/*
2221		 * There is no need to reload with request-based dm because the
2222		 * size of front_pad doesn't change.
2223		 *
2224		 * Note for future: If you are to reload bioset, prep-ed
2225		 * requests in the queue may refer to bio from the old bioset,
2226		 * so you must walk through the queue to unprep.
2227		 */
2228		if (!md->mempools) {
2229			md->mempools = t->mempools;
2230			t->mempools = NULL;
2231		}
2232	} else {
2233		/*
2234		 * The md may already have mempools that need changing.
2235		 * If so, reload bioset because front_pad may have changed
2236		 * because a different table was loaded.
2237		 */
2238		dm_free_md_mempools(md->mempools);
2239		md->mempools = t->mempools;
2240		t->mempools = NULL;
2241	}
2242
2243	ret = dm_table_set_restrictions(t, md->queue, limits);
2244	if (ret) {
2245		old_map = ERR_PTR(ret);
2246		goto out;
2247	}
2248
2249	old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2250	rcu_assign_pointer(md->map, (void *)t);
2251	md->immutable_target_type = dm_table_get_immutable_target_type(t);
2252
2253	if (old_map)
2254		dm_sync_table(md);
2255out:
2256	return old_map;
2257}
2258
2259/*
2260 * Returns unbound table for the caller to free.
2261 */
2262static struct dm_table *__unbind(struct mapped_device *md)
2263{
2264	struct dm_table *map = rcu_dereference_protected(md->map, 1);
2265
2266	if (!map)
2267		return NULL;
2268
2269	dm_table_event_callback(map, NULL, NULL);
2270	RCU_INIT_POINTER(md->map, NULL);
2271	dm_sync_table(md);
2272
2273	return map;
2274}
2275
2276/*
2277 * Constructor for a new device.
2278 */
2279int dm_create(int minor, struct mapped_device **result)
2280{
2281	struct mapped_device *md;
2282
2283	md = alloc_dev(minor);
2284	if (!md)
2285		return -ENXIO;
2286
2287	dm_ima_reset_data(md);
2288
2289	*result = md;
2290	return 0;
2291}
2292
2293/*
2294 * Functions to manage md->type.
2295 * All are required to hold md->type_lock.
2296 */
2297void dm_lock_md_type(struct mapped_device *md)
2298{
2299	mutex_lock(&md->type_lock);
2300}
2301
2302void dm_unlock_md_type(struct mapped_device *md)
2303{
2304	mutex_unlock(&md->type_lock);
2305}
2306
2307void dm_set_md_type(struct mapped_device *md, enum dm_queue_mode type)
2308{
2309	BUG_ON(!mutex_is_locked(&md->type_lock));
2310	md->type = type;
2311}
2312
2313enum dm_queue_mode dm_get_md_type(struct mapped_device *md)
2314{
2315	return md->type;
2316}
2317
2318struct target_type *dm_get_immutable_target_type(struct mapped_device *md)
2319{
2320	return md->immutable_target_type;
2321}
2322
2323/*
2324 * Setup the DM device's queue based on md's type
2325 */
2326int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
2327{
2328	enum dm_queue_mode type = dm_table_get_type(t);
2329	struct queue_limits limits;
2330	struct table_device *td;
2331	int r;
2332
2333	switch (type) {
2334	case DM_TYPE_REQUEST_BASED:
2335		md->disk->fops = &dm_rq_blk_dops;
2336		r = dm_mq_init_request_queue(md, t);
2337		if (r) {
2338			DMERR("Cannot initialize queue for request-based dm mapped device");
2339			return r;
2340		}
2341		break;
2342	case DM_TYPE_BIO_BASED:
2343	case DM_TYPE_DAX_BIO_BASED:
2344		break;
2345	case DM_TYPE_NONE:
2346		WARN_ON_ONCE(true);
2347		break;
2348	}
2349
2350	r = dm_calculate_queue_limits(t, &limits);
2351	if (r) {
2352		DMERR("Cannot calculate initial queue limits");
2353		return r;
2354	}
2355	r = dm_table_set_restrictions(t, md->queue, &limits);
2356	if (r)
2357		return r;
2358
2359	/*
2360	 * Hold lock to make sure add_disk() and del_gendisk() won't concurrent
2361	 * with open_table_device() and close_table_device().
2362	 */
2363	mutex_lock(&md->table_devices_lock);
2364	r = add_disk(md->disk);
2365	mutex_unlock(&md->table_devices_lock);
2366	if (r)
2367		return r;
2368
2369	/*
2370	 * Register the holder relationship for devices added before the disk
2371	 * was live.
2372	 */
2373	list_for_each_entry(td, &md->table_devices, list) {
2374		r = bd_link_disk_holder(td->dm_dev.bdev, md->disk);
2375		if (r)
2376			goto out_undo_holders;
2377	}
2378
2379	r = dm_sysfs_init(md);
2380	if (r)
2381		goto out_undo_holders;
2382
2383	md->type = type;
2384	return 0;
2385
2386out_undo_holders:
2387	list_for_each_entry_continue_reverse(td, &md->table_devices, list)
2388		bd_unlink_disk_holder(td->dm_dev.bdev, md->disk);
2389	mutex_lock(&md->table_devices_lock);
2390	del_gendisk(md->disk);
2391	mutex_unlock(&md->table_devices_lock);
2392	return r;
2393}
2394
2395struct mapped_device *dm_get_md(dev_t dev)
2396{
2397	struct mapped_device *md;
2398	unsigned int minor = MINOR(dev);
2399
2400	if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
2401		return NULL;
2402
2403	spin_lock(&_minor_lock);
2404
2405	md = idr_find(&_minor_idr, minor);
2406	if (!md || md == MINOR_ALLOCED || (MINOR(disk_devt(dm_disk(md))) != minor) ||
2407	    test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) {
2408		md = NULL;
2409		goto out;
2410	}
2411	dm_get(md);
2412out:
2413	spin_unlock(&_minor_lock);
2414
2415	return md;
2416}
2417EXPORT_SYMBOL_GPL(dm_get_md);
2418
2419void *dm_get_mdptr(struct mapped_device *md)
2420{
2421	return md->interface_ptr;
2422}
2423
2424void dm_set_mdptr(struct mapped_device *md, void *ptr)
2425{
2426	md->interface_ptr = ptr;
2427}
2428
2429void dm_get(struct mapped_device *md)
2430{
2431	atomic_inc(&md->holders);
2432	BUG_ON(test_bit(DMF_FREEING, &md->flags));
2433}
2434
2435int dm_hold(struct mapped_device *md)
2436{
2437	spin_lock(&_minor_lock);
2438	if (test_bit(DMF_FREEING, &md->flags)) {
2439		spin_unlock(&_minor_lock);
2440		return -EBUSY;
2441	}
2442	dm_get(md);
2443	spin_unlock(&_minor_lock);
2444	return 0;
2445}
2446EXPORT_SYMBOL_GPL(dm_hold);
2447
2448const char *dm_device_name(struct mapped_device *md)
2449{
2450	return md->name;
2451}
2452EXPORT_SYMBOL_GPL(dm_device_name);
2453
2454static void __dm_destroy(struct mapped_device *md, bool wait)
2455{
2456	struct dm_table *map;
2457	int srcu_idx;
2458
2459	might_sleep();
2460
2461	spin_lock(&_minor_lock);
2462	idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md))));
2463	set_bit(DMF_FREEING, &md->flags);
2464	spin_unlock(&_minor_lock);
2465
2466	blk_mark_disk_dead(md->disk);
2467
2468	/*
2469	 * Take suspend_lock so that presuspend and postsuspend methods
2470	 * do not race with internal suspend.
2471	 */
2472	mutex_lock(&md->suspend_lock);
2473	map = dm_get_live_table(md, &srcu_idx);
2474	if (!dm_suspended_md(md)) {
2475		dm_table_presuspend_targets(map);
2476		set_bit(DMF_SUSPENDED, &md->flags);
2477		set_bit(DMF_POST_SUSPENDING, &md->flags);
2478		dm_table_postsuspend_targets(map);
2479	}
2480	/* dm_put_live_table must be before fsleep, otherwise deadlock is possible */
2481	dm_put_live_table(md, srcu_idx);
2482	mutex_unlock(&md->suspend_lock);
2483
2484	/*
2485	 * Rare, but there may be I/O requests still going to complete,
2486	 * for example.  Wait for all references to disappear.
2487	 * No one should increment the reference count of the mapped_device,
2488	 * after the mapped_device state becomes DMF_FREEING.
2489	 */
2490	if (wait)
2491		while (atomic_read(&md->holders))
2492			fsleep(1000);
2493	else if (atomic_read(&md->holders))
2494		DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)",
2495		       dm_device_name(md), atomic_read(&md->holders));
2496
2497	dm_table_destroy(__unbind(md));
2498	free_dev(md);
2499}
2500
2501void dm_destroy(struct mapped_device *md)
2502{
2503	__dm_destroy(md, true);
2504}
2505
2506void dm_destroy_immediate(struct mapped_device *md)
2507{
2508	__dm_destroy(md, false);
2509}
2510
2511void dm_put(struct mapped_device *md)
2512{
2513	atomic_dec(&md->holders);
2514}
2515EXPORT_SYMBOL_GPL(dm_put);
2516
2517static bool dm_in_flight_bios(struct mapped_device *md)
2518{
2519	int cpu;
2520	unsigned long sum = 0;
2521
2522	for_each_possible_cpu(cpu)
2523		sum += *per_cpu_ptr(md->pending_io, cpu);
2524
2525	return sum != 0;
2526}
2527
2528static int dm_wait_for_bios_completion(struct mapped_device *md, unsigned int task_state)
2529{
2530	int r = 0;
2531	DEFINE_WAIT(wait);
2532
2533	while (true) {
2534		prepare_to_wait(&md->wait, &wait, task_state);
2535
2536		if (!dm_in_flight_bios(md))
2537			break;
2538
2539		if (signal_pending_state(task_state, current)) {
2540			r = -EINTR;
2541			break;
2542		}
2543
2544		io_schedule();
2545	}
2546	finish_wait(&md->wait, &wait);
2547
2548	smp_rmb();
2549
2550	return r;
2551}
2552
2553static int dm_wait_for_completion(struct mapped_device *md, unsigned int task_state)
2554{
2555	int r = 0;
2556
2557	if (!queue_is_mq(md->queue))
2558		return dm_wait_for_bios_completion(md, task_state);
2559
2560	while (true) {
2561		if (!blk_mq_queue_inflight(md->queue))
2562			break;
2563
2564		if (signal_pending_state(task_state, current)) {
2565			r = -EINTR;
2566			break;
2567		}
2568
2569		fsleep(5000);
2570	}
2571
2572	return r;
2573}
2574
2575/*
2576 * Process the deferred bios
2577 */
2578static void dm_wq_work(struct work_struct *work)
2579{
2580	struct mapped_device *md = container_of(work, struct mapped_device, work);
2581	struct bio *bio;
2582
2583	while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
2584		spin_lock_irq(&md->deferred_lock);
2585		bio = bio_list_pop(&md->deferred);
2586		spin_unlock_irq(&md->deferred_lock);
2587
2588		if (!bio)
2589			break;
2590
2591		submit_bio_noacct(bio);
2592		cond_resched();
2593	}
2594}
2595
2596static void dm_queue_flush(struct mapped_device *md)
2597{
2598	clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2599	smp_mb__after_atomic();
2600	queue_work(md->wq, &md->work);
2601}
2602
2603/*
2604 * Swap in a new table, returning the old one for the caller to destroy.
2605 */
2606struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
2607{
2608	struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL);
2609	struct queue_limits limits;
2610	int r;
2611
2612	mutex_lock(&md->suspend_lock);
2613
2614	/* device must be suspended */
2615	if (!dm_suspended_md(md))
2616		goto out;
2617
2618	/*
2619	 * If the new table has no data devices, retain the existing limits.
2620	 * This helps multipath with queue_if_no_path if all paths disappear,
2621	 * then new I/O is queued based on these limits, and then some paths
2622	 * reappear.
2623	 */
2624	if (dm_table_has_no_data_devices(table)) {
2625		live_map = dm_get_live_table_fast(md);
2626		if (live_map)
2627			limits = md->queue->limits;
2628		dm_put_live_table_fast(md);
2629	}
2630
2631	if (!live_map) {
2632		r = dm_calculate_queue_limits(table, &limits);
2633		if (r) {
2634			map = ERR_PTR(r);
2635			goto out;
2636		}
2637	}
2638
2639	map = __bind(md, table, &limits);
2640	dm_issue_global_event();
2641
2642out:
2643	mutex_unlock(&md->suspend_lock);
2644	return map;
2645}
2646
2647/*
2648 * Functions to lock and unlock any filesystem running on the
2649 * device.
2650 */
2651static int lock_fs(struct mapped_device *md)
2652{
2653	int r;
2654
2655	WARN_ON(test_bit(DMF_FROZEN, &md->flags));
2656
2657	r = freeze_bdev(md->disk->part0);
2658	if (!r)
2659		set_bit(DMF_FROZEN, &md->flags);
2660	return r;
2661}
2662
2663static void unlock_fs(struct mapped_device *md)
2664{
2665	if (!test_bit(DMF_FROZEN, &md->flags))
2666		return;
2667	thaw_bdev(md->disk->part0);
2668	clear_bit(DMF_FROZEN, &md->flags);
2669}
2670
2671/*
2672 * @suspend_flags: DM_SUSPEND_LOCKFS_FLAG and/or DM_SUSPEND_NOFLUSH_FLAG
2673 * @task_state: e.g. TASK_INTERRUPTIBLE or TASK_UNINTERRUPTIBLE
2674 * @dmf_suspended_flag: DMF_SUSPENDED or DMF_SUSPENDED_INTERNALLY
2675 *
2676 * If __dm_suspend returns 0, the device is completely quiescent
2677 * now. There is no request-processing activity. All new requests
2678 * are being added to md->deferred list.
2679 */
2680static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
2681			unsigned int suspend_flags, unsigned int task_state,
2682			int dmf_suspended_flag)
2683{
2684	bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG;
2685	bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG;
2686	int r;
2687
2688	lockdep_assert_held(&md->suspend_lock);
2689
2690	/*
2691	 * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
2692	 * This flag is cleared before dm_suspend returns.
2693	 */
2694	if (noflush)
2695		set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
2696	else
2697		DMDEBUG("%s: suspending with flush", dm_device_name(md));
2698
2699	/*
2700	 * This gets reverted if there's an error later and the targets
2701	 * provide the .presuspend_undo hook.
2702	 */
2703	dm_table_presuspend_targets(map);
2704
2705	/*
2706	 * Flush I/O to the device.
2707	 * Any I/O submitted after lock_fs() may not be flushed.
2708	 * noflush takes precedence over do_lockfs.
2709	 * (lock_fs() flushes I/Os and waits for them to complete.)
2710	 */
2711	if (!noflush && do_lockfs) {
2712		r = lock_fs(md);
2713		if (r) {
2714			dm_table_presuspend_undo_targets(map);
2715			return r;
2716		}
2717	}
2718
2719	/*
2720	 * Here we must make sure that no processes are submitting requests
2721	 * to target drivers i.e. no one may be executing
2722	 * dm_split_and_process_bio from dm_submit_bio.
2723	 *
2724	 * To get all processes out of dm_split_and_process_bio in dm_submit_bio,
2725	 * we take the write lock. To prevent any process from reentering
2726	 * dm_split_and_process_bio from dm_submit_bio and quiesce the thread
2727	 * (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND and call
2728	 * flush_workqueue(md->wq).
2729	 */
2730	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2731	if (map)
2732		synchronize_srcu(&md->io_barrier);
2733
2734	/*
2735	 * Stop md->queue before flushing md->wq in case request-based
2736	 * dm defers requests to md->wq from md->queue.
2737	 */
2738	if (dm_request_based(md))
2739		dm_stop_queue(md->queue);
2740
2741	flush_workqueue(md->wq);
2742
2743	/*
2744	 * At this point no more requests are entering target request routines.
2745	 * We call dm_wait_for_completion to wait for all existing requests
2746	 * to finish.
2747	 */
2748	r = dm_wait_for_completion(md, task_state);
2749	if (!r)
2750		set_bit(dmf_suspended_flag, &md->flags);
2751
2752	if (noflush)
2753		clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
2754	if (map)
2755		synchronize_srcu(&md->io_barrier);
2756
2757	/* were we interrupted ? */
2758	if (r < 0) {
2759		dm_queue_flush(md);
2760
2761		if (dm_request_based(md))
2762			dm_start_queue(md->queue);
2763
2764		unlock_fs(md);
2765		dm_table_presuspend_undo_targets(map);
2766		/* pushback list is already flushed, so skip flush */
2767	}
2768
2769	return r;
2770}
2771
2772/*
2773 * We need to be able to change a mapping table under a mounted
2774 * filesystem.  For example we might want to move some data in
2775 * the background.  Before the table can be swapped with
2776 * dm_bind_table, dm_suspend must be called to flush any in
2777 * flight bios and ensure that any further io gets deferred.
2778 */
2779/*
2780 * Suspend mechanism in request-based dm.
2781 *
2782 * 1. Flush all I/Os by lock_fs() if needed.
2783 * 2. Stop dispatching any I/O by stopping the request_queue.
2784 * 3. Wait for all in-flight I/Os to be completed or requeued.
2785 *
2786 * To abort suspend, start the request_queue.
2787 */
2788int dm_suspend(struct mapped_device *md, unsigned int suspend_flags)
2789{
2790	struct dm_table *map = NULL;
2791	int r = 0;
2792
2793retry:
2794	mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
2795
2796	if (dm_suspended_md(md)) {
2797		r = -EINVAL;
2798		goto out_unlock;
2799	}
2800
2801	if (dm_suspended_internally_md(md)) {
2802		/* already internally suspended, wait for internal resume */
2803		mutex_unlock(&md->suspend_lock);
2804		r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
2805		if (r)
2806			return r;
2807		goto retry;
2808	}
2809
2810	map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2811
2812	r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE, DMF_SUSPENDED);
2813	if (r)
2814		goto out_unlock;
2815
2816	set_bit(DMF_POST_SUSPENDING, &md->flags);
2817	dm_table_postsuspend_targets(map);
2818	clear_bit(DMF_POST_SUSPENDING, &md->flags);
2819
2820out_unlock:
2821	mutex_unlock(&md->suspend_lock);
2822	return r;
2823}
2824
2825static int __dm_resume(struct mapped_device *md, struct dm_table *map)
2826{
2827	if (map) {
2828		int r = dm_table_resume_targets(map);
2829
2830		if (r)
2831			return r;
2832	}
2833
2834	dm_queue_flush(md);
2835
2836	/*
2837	 * Flushing deferred I/Os must be done after targets are resumed
2838	 * so that mapping of targets can work correctly.
2839	 * Request-based dm is queueing the deferred I/Os in its request_queue.
2840	 */
2841	if (dm_request_based(md))
2842		dm_start_queue(md->queue);
2843
2844	unlock_fs(md);
2845
2846	return 0;
2847}
2848
2849int dm_resume(struct mapped_device *md)
2850{
2851	int r;
2852	struct dm_table *map = NULL;
2853
2854retry:
2855	r = -EINVAL;
2856	mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
2857
2858	if (!dm_suspended_md(md))
2859		goto out;
2860
2861	if (dm_suspended_internally_md(md)) {
2862		/* already internally suspended, wait for internal resume */
2863		mutex_unlock(&md->suspend_lock);
2864		r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
2865		if (r)
2866			return r;
2867		goto retry;
2868	}
2869
2870	map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2871	if (!map || !dm_table_get_size(map))
2872		goto out;
2873
2874	r = __dm_resume(md, map);
2875	if (r)
2876		goto out;
2877
2878	clear_bit(DMF_SUSPENDED, &md->flags);
2879out:
2880	mutex_unlock(&md->suspend_lock);
2881
2882	return r;
2883}
2884
2885/*
2886 * Internal suspend/resume works like userspace-driven suspend. It waits
2887 * until all bios finish and prevents issuing new bios to the target drivers.
2888 * It may be used only from the kernel.
2889 */
2890
2891static void __dm_internal_suspend(struct mapped_device *md, unsigned int suspend_flags)
2892{
2893	struct dm_table *map = NULL;
2894
2895	lockdep_assert_held(&md->suspend_lock);
2896
2897	if (md->internal_suspend_count++)
2898		return; /* nested internal suspend */
2899
2900	if (dm_suspended_md(md)) {
2901		set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
2902		return; /* nest suspend */
2903	}
2904
2905	map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2906
2907	/*
2908	 * Using TASK_UNINTERRUPTIBLE because only NOFLUSH internal suspend is
2909	 * supported.  Properly supporting a TASK_INTERRUPTIBLE internal suspend
2910	 * would require changing .presuspend to return an error -- avoid this
2911	 * until there is a need for more elaborate variants of internal suspend.
2912	 */
2913	(void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE,
2914			    DMF_SUSPENDED_INTERNALLY);
2915
2916	set_bit(DMF_POST_SUSPENDING, &md->flags);
2917	dm_table_postsuspend_targets(map);
2918	clear_bit(DMF_POST_SUSPENDING, &md->flags);
2919}
2920
2921static void __dm_internal_resume(struct mapped_device *md)
2922{
2923	BUG_ON(!md->internal_suspend_count);
2924
2925	if (--md->internal_suspend_count)
2926		return; /* resume from nested internal suspend */
2927
2928	if (dm_suspended_md(md))
2929		goto done; /* resume from nested suspend */
2930
2931	/*
2932	 * NOTE: existing callers don't need to call dm_table_resume_targets
2933	 * (which may fail -- so best to avoid it for now by passing NULL map)
2934	 */
2935	(void) __dm_resume(md, NULL);
2936
2937done:
2938	clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
2939	smp_mb__after_atomic();
2940	wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY);
2941}
2942
2943void dm_internal_suspend_noflush(struct mapped_device *md)
2944{
2945	mutex_lock(&md->suspend_lock);
2946	__dm_internal_suspend(md, DM_SUSPEND_NOFLUSH_FLAG);
2947	mutex_unlock(&md->suspend_lock);
2948}
2949EXPORT_SYMBOL_GPL(dm_internal_suspend_noflush);
2950
2951void dm_internal_resume(struct mapped_device *md)
2952{
2953	mutex_lock(&md->suspend_lock);
2954	__dm_internal_resume(md);
2955	mutex_unlock(&md->suspend_lock);
2956}
2957EXPORT_SYMBOL_GPL(dm_internal_resume);
2958
2959/*
2960 * Fast variants of internal suspend/resume hold md->suspend_lock,
2961 * which prevents interaction with userspace-driven suspend.
2962 */
2963
2964void dm_internal_suspend_fast(struct mapped_device *md)
2965{
2966	mutex_lock(&md->suspend_lock);
2967	if (dm_suspended_md(md) || dm_suspended_internally_md(md))
2968		return;
2969
2970	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2971	synchronize_srcu(&md->io_barrier);
2972	flush_workqueue(md->wq);
2973	dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
2974}
2975EXPORT_SYMBOL_GPL(dm_internal_suspend_fast);
2976
2977void dm_internal_resume_fast(struct mapped_device *md)
2978{
2979	if (dm_suspended_md(md) || dm_suspended_internally_md(md))
2980		goto done;
2981
2982	dm_queue_flush(md);
2983
2984done:
2985	mutex_unlock(&md->suspend_lock);
2986}
2987EXPORT_SYMBOL_GPL(dm_internal_resume_fast);
2988
2989/*
2990 *---------------------------------------------------------------
2991 * Event notification.
2992 *---------------------------------------------------------------
2993 */
2994int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
2995		      unsigned int cookie, bool need_resize_uevent)
2996{
2997	int r;
2998	unsigned int noio_flag;
2999	char udev_cookie[DM_COOKIE_LENGTH];
3000	char *envp[3] = { NULL, NULL, NULL };
3001	char **envpp = envp;
3002	if (cookie) {
3003		snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
3004			 DM_COOKIE_ENV_VAR_NAME, cookie);
3005		*envpp++ = udev_cookie;
3006	}
3007	if (need_resize_uevent) {
3008		*envpp++ = "RESIZE=1";
3009	}
3010
3011	noio_flag = memalloc_noio_save();
3012
3013	r = kobject_uevent_env(&disk_to_dev(md->disk)->kobj, action, envp);
3014
3015	memalloc_noio_restore(noio_flag);
3016
3017	return r;
3018}
3019
3020uint32_t dm_next_uevent_seq(struct mapped_device *md)
3021{
3022	return atomic_add_return(1, &md->uevent_seq);
3023}
3024
3025uint32_t dm_get_event_nr(struct mapped_device *md)
3026{
3027	return atomic_read(&md->event_nr);
3028}
3029
3030int dm_wait_event(struct mapped_device *md, int event_nr)
3031{
3032	return wait_event_interruptible(md->eventq,
3033			(event_nr != atomic_read(&md->event_nr)));
3034}
3035
3036void dm_uevent_add(struct mapped_device *md, struct list_head *elist)
3037{
3038	unsigned long flags;
3039
3040	spin_lock_irqsave(&md->uevent_lock, flags);
3041	list_add(elist, &md->uevent_list);
3042	spin_unlock_irqrestore(&md->uevent_lock, flags);
3043}
3044
3045/*
3046 * The gendisk is only valid as long as you have a reference
3047 * count on 'md'.
3048 */
3049struct gendisk *dm_disk(struct mapped_device *md)
3050{
3051	return md->disk;
3052}
3053EXPORT_SYMBOL_GPL(dm_disk);
3054
3055struct kobject *dm_kobject(struct mapped_device *md)
3056{
3057	return &md->kobj_holder.kobj;
3058}
3059
3060struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
3061{
3062	struct mapped_device *md;
3063
3064	md = container_of(kobj, struct mapped_device, kobj_holder.kobj);
3065
3066	spin_lock(&_minor_lock);
3067	if (test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) {
3068		md = NULL;
3069		goto out;
3070	}
3071	dm_get(md);
3072out:
3073	spin_unlock(&_minor_lock);
3074
3075	return md;
3076}
3077
3078int dm_suspended_md(struct mapped_device *md)
3079{
3080	return test_bit(DMF_SUSPENDED, &md->flags);
3081}
3082
3083static int dm_post_suspending_md(struct mapped_device *md)
3084{
3085	return test_bit(DMF_POST_SUSPENDING, &md->flags);
3086}
3087
3088int dm_suspended_internally_md(struct mapped_device *md)
3089{
3090	return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
3091}
3092
3093int dm_test_deferred_remove_flag(struct mapped_device *md)
3094{
3095	return test_bit(DMF_DEFERRED_REMOVE, &md->flags);
3096}
3097
3098int dm_suspended(struct dm_target *ti)
3099{
3100	return dm_suspended_md(ti->table->md);
3101}
3102EXPORT_SYMBOL_GPL(dm_suspended);
3103
3104int dm_post_suspending(struct dm_target *ti)
3105{
3106	return dm_post_suspending_md(ti->table->md);
3107}
3108EXPORT_SYMBOL_GPL(dm_post_suspending);
3109
3110int dm_noflush_suspending(struct dm_target *ti)
3111{
3112	return __noflush_suspending(ti->table->md);
3113}
3114EXPORT_SYMBOL_GPL(dm_noflush_suspending);
3115
3116void dm_free_md_mempools(struct dm_md_mempools *pools)
3117{
3118	if (!pools)
3119		return;
3120
3121	bioset_exit(&pools->bs);
3122	bioset_exit(&pools->io_bs);
3123
3124	kfree(pools);
3125}
3126
3127struct dm_pr {
3128	u64	old_key;
3129	u64	new_key;
3130	u32	flags;
3131	bool	abort;
3132	bool	fail_early;
3133	int	ret;
3134	enum pr_type type;
3135};
3136
3137static int dm_call_pr(struct block_device *bdev, iterate_devices_callout_fn fn,
3138		      struct dm_pr *pr)
3139{
3140	struct mapped_device *md = bdev->bd_disk->private_data;
3141	struct dm_table *table;
3142	struct dm_target *ti;
3143	int ret = -ENOTTY, srcu_idx;
3144
3145	table = dm_get_live_table(md, &srcu_idx);
3146	if (!table || !dm_table_get_size(table))
3147		goto out;
3148
3149	/* We only support devices that have a single target */
3150	if (table->num_targets != 1)
3151		goto out;
3152	ti = dm_table_get_target(table, 0);
3153
3154	if (dm_suspended_md(md)) {
3155		ret = -EAGAIN;
3156		goto out;
3157	}
3158
3159	ret = -EINVAL;
3160	if (!ti->type->iterate_devices)
3161		goto out;
3162
3163	ti->type->iterate_devices(ti, fn, pr);
3164	ret = 0;
3165out:
3166	dm_put_live_table(md, srcu_idx);
3167	return ret;
3168}
3169
3170/*
3171 * For register / unregister we need to manually call out to every path.
3172 */
3173static int __dm_pr_register(struct dm_target *ti, struct dm_dev *dev,
3174			    sector_t start, sector_t len, void *data)
3175{
3176	struct dm_pr *pr = data;
3177	const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops;
3178	int ret;
3179
3180	if (!ops || !ops->pr_register) {
3181		pr->ret = -EOPNOTSUPP;
3182		return -1;
3183	}
3184
3185	ret = ops->pr_register(dev->bdev, pr->old_key, pr->new_key, pr->flags);
3186	if (!ret)
3187		return 0;
3188
3189	if (!pr->ret)
3190		pr->ret = ret;
3191
3192	if (pr->fail_early)
3193		return -1;
3194
3195	return 0;
3196}
3197
3198static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key,
3199			  u32 flags)
3200{
3201	struct dm_pr pr = {
3202		.old_key	= old_key,
3203		.new_key	= new_key,
3204		.flags		= flags,
3205		.fail_early	= true,
3206		.ret		= 0,
3207	};
3208	int ret;
3209
3210	ret = dm_call_pr(bdev, __dm_pr_register, &pr);
3211	if (ret) {
3212		/* Didn't even get to register a path */
3213		return ret;
3214	}
3215
3216	if (!pr.ret)
3217		return 0;
3218	ret = pr.ret;
3219
3220	if (!new_key)
3221		return ret;
3222
3223	/* unregister all paths if we failed to register any path */
3224	pr.old_key = new_key;
3225	pr.new_key = 0;
3226	pr.flags = 0;
3227	pr.fail_early = false;
3228	(void) dm_call_pr(bdev, __dm_pr_register, &pr);
3229	return ret;
3230}
3231
3232
3233static int __dm_pr_reserve(struct dm_target *ti, struct dm_dev *dev,
3234			   sector_t start, sector_t len, void *data)
3235{
3236	struct dm_pr *pr = data;
3237	const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops;
3238
3239	if (!ops || !ops->pr_reserve) {
3240		pr->ret = -EOPNOTSUPP;
3241		return -1;
3242	}
3243
3244	pr->ret = ops->pr_reserve(dev->bdev, pr->old_key, pr->type, pr->flags);
3245	if (!pr->ret)
3246		return -1;
3247
3248	return 0;
3249}
3250
3251static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type,
3252			 u32 flags)
3253{
3254	struct dm_pr pr = {
3255		.old_key	= key,
3256		.flags		= flags,
3257		.type		= type,
3258		.fail_early	= false,
3259		.ret		= 0,
3260	};
3261	int ret;
3262
3263	ret = dm_call_pr(bdev, __dm_pr_reserve, &pr);
3264	if (ret)
3265		return ret;
3266
3267	return pr.ret;
3268}
3269
3270/*
3271 * If there is a non-All Registrants type of reservation, the release must be
3272 * sent down the holding path. For the cases where there is no reservation or
3273 * the path is not the holder the device will also return success, so we must
3274 * try each path to make sure we got the correct path.
3275 */
3276static int __dm_pr_release(struct dm_target *ti, struct dm_dev *dev,
3277			   sector_t start, sector_t len, void *data)
3278{
3279	struct dm_pr *pr = data;
3280	const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops;
3281
3282	if (!ops || !ops->pr_release) {
3283		pr->ret = -EOPNOTSUPP;
3284		return -1;
3285	}
3286
3287	pr->ret = ops->pr_release(dev->bdev, pr->old_key, pr->type);
3288	if (pr->ret)
3289		return -1;
3290
3291	return 0;
3292}
3293
3294static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
3295{
3296	struct dm_pr pr = {
3297		.old_key	= key,
3298		.type		= type,
3299		.fail_early	= false,
3300	};
3301	int ret;
3302
3303	ret = dm_call_pr(bdev, __dm_pr_release, &pr);
3304	if (ret)
3305		return ret;
3306
3307	return pr.ret;
3308}
3309
3310static int __dm_pr_preempt(struct dm_target *ti, struct dm_dev *dev,
3311			   sector_t start, sector_t len, void *data)
3312{
3313	struct dm_pr *pr = data;
3314	const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops;
3315
3316	if (!ops || !ops->pr_preempt) {
3317		pr->ret = -EOPNOTSUPP;
3318		return -1;
3319	}
3320
3321	pr->ret = ops->pr_preempt(dev->bdev, pr->old_key, pr->new_key, pr->type,
3322				  pr->abort);
3323	if (!pr->ret)
3324		return -1;
3325
3326	return 0;
3327}
3328
3329static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key,
3330			 enum pr_type type, bool abort)
3331{
3332	struct dm_pr pr = {
3333		.new_key	= new_key,
3334		.old_key	= old_key,
3335		.type		= type,
3336		.fail_early	= false,
3337	};
3338	int ret;
3339
3340	ret = dm_call_pr(bdev, __dm_pr_preempt, &pr);
3341	if (ret)
3342		return ret;
3343
3344	return pr.ret;
3345}
3346
3347static int dm_pr_clear(struct block_device *bdev, u64 key)
3348{
3349	struct mapped_device *md = bdev->bd_disk->private_data;
3350	const struct pr_ops *ops;
3351	int r, srcu_idx;
3352
3353	r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
3354	if (r < 0)
3355		goto out;
3356
3357	ops = bdev->bd_disk->fops->pr_ops;
3358	if (ops && ops->pr_clear)
3359		r = ops->pr_clear(bdev, key);
3360	else
3361		r = -EOPNOTSUPP;
3362out:
3363	dm_unprepare_ioctl(md, srcu_idx);
3364	return r;
3365}
3366
3367static const struct pr_ops dm_pr_ops = {
3368	.pr_register	= dm_pr_register,
3369	.pr_reserve	= dm_pr_reserve,
3370	.pr_release	= dm_pr_release,
3371	.pr_preempt	= dm_pr_preempt,
3372	.pr_clear	= dm_pr_clear,
3373};
3374
3375static const struct block_device_operations dm_blk_dops = {
3376	.submit_bio = dm_submit_bio,
3377	.poll_bio = dm_poll_bio,
3378	.open = dm_blk_open,
3379	.release = dm_blk_close,
3380	.ioctl = dm_blk_ioctl,
3381	.getgeo = dm_blk_getgeo,
3382	.report_zones = dm_blk_report_zones,
3383	.pr_ops = &dm_pr_ops,
3384	.owner = THIS_MODULE
3385};
3386
3387static const struct block_device_operations dm_rq_blk_dops = {
3388	.open = dm_blk_open,
3389	.release = dm_blk_close,
3390	.ioctl = dm_blk_ioctl,
3391	.getgeo = dm_blk_getgeo,
3392	.pr_ops = &dm_pr_ops,
3393	.owner = THIS_MODULE
3394};
3395
3396static const struct dax_operations dm_dax_ops = {
3397	.direct_access = dm_dax_direct_access,
3398	.zero_page_range = dm_dax_zero_page_range,
3399	.recovery_write = dm_dax_recovery_write,
3400};
3401
3402/*
3403 * module hooks
3404 */
3405module_init(dm_init);
3406module_exit(dm_exit);
3407
3408module_param(major, uint, 0);
3409MODULE_PARM_DESC(major, "The major number of the device mapper");
3410
3411module_param(reserved_bio_based_ios, uint, 0644);
3412MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");
3413
3414module_param(dm_numa_node, int, 0644);
3415MODULE_PARM_DESC(dm_numa_node, "NUMA node for DM device memory allocations");
3416
3417module_param(swap_bios, int, 0644);
3418MODULE_PARM_DESC(swap_bios, "Maximum allowed inflight swap IOs");
3419
3420MODULE_DESCRIPTION(DM_NAME " driver");
3421MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
3422MODULE_LICENSE("GPL");
Configure Feed

Configure Feed