drivers/md/dm.c at v4.12 · tjh.dev/kernel

tjh.dev / kernel
fork
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
fork
kernel / drivers / md / dm.c
at v4.12 2881 lines 67 kB view raw
wrap content
   1/*
   2 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
   3 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
   4 *
   5 * This file is released under the GPL.
   6 */
   7
   8#include "dm-core.h"
   9#include "dm-rq.h"
  10#include "dm-uevent.h"
  11
  12#include <linux/init.h>
  13#include <linux/module.h>
  14#include <linux/mutex.h>
  15#include <linux/sched/signal.h>
  16#include <linux/blkpg.h>
  17#include <linux/bio.h>
  18#include <linux/mempool.h>
  19#include <linux/dax.h>
  20#include <linux/slab.h>
  21#include <linux/idr.h>
  22#include <linux/hdreg.h>
  23#include <linux/delay.h>
  24#include <linux/wait.h>
  25#include <linux/pr.h>
  26
  27#define DM_MSG_PREFIX "core"
  28
  29#ifdef CONFIG_PRINTK
  30/*
  31 * ratelimit state to be used in DMXXX_LIMIT().
  32 */
  33DEFINE_RATELIMIT_STATE(dm_ratelimit_state,
  34		       DEFAULT_RATELIMIT_INTERVAL,
  35		       DEFAULT_RATELIMIT_BURST);
  36EXPORT_SYMBOL(dm_ratelimit_state);
  37#endif
  38
  39/*
  40 * Cookies are numeric values sent with CHANGE and REMOVE
  41 * uevents while resuming, removing or renaming the device.
  42 */
  43#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
  44#define DM_COOKIE_LENGTH 24
  45
  46static const char *_name = DM_NAME;
  47
  48static unsigned int major = 0;
  49static unsigned int _major = 0;
  50
  51static DEFINE_IDR(_minor_idr);
  52
  53static DEFINE_SPINLOCK(_minor_lock);
  54
  55static void do_deferred_remove(struct work_struct *w);
  56
  57static DECLARE_WORK(deferred_remove_work, do_deferred_remove);
  58
  59static struct workqueue_struct *deferred_remove_workqueue;
  60
  61/*
  62 * One of these is allocated per bio.
  63 */
  64struct dm_io {
  65	struct mapped_device *md;
  66	int error;
  67	atomic_t io_count;
  68	struct bio *bio;
  69	unsigned long start_time;
  70	spinlock_t endio_lock;
  71	struct dm_stats_aux stats_aux;
  72};
  73
  74#define MINOR_ALLOCED ((void *)-1)
  75
  76/*
  77 * Bits for the md->flags field.
  78 */
  79#define DMF_BLOCK_IO_FOR_SUSPEND 0
  80#define DMF_SUSPENDED 1
  81#define DMF_FROZEN 2
  82#define DMF_FREEING 3
  83#define DMF_DELETING 4
  84#define DMF_NOFLUSH_SUSPENDING 5
  85#define DMF_DEFERRED_REMOVE 6
  86#define DMF_SUSPENDED_INTERNALLY 7
  87
  88#define DM_NUMA_NODE NUMA_NO_NODE
  89static int dm_numa_node = DM_NUMA_NODE;
  90
  91/*
  92 * For mempools pre-allocation at the table loading time.
  93 */
  94struct dm_md_mempools {
  95	mempool_t *io_pool;
  96	struct bio_set *bs;
  97};
  98
  99struct table_device {
 100	struct list_head list;
 101	atomic_t count;
 102	struct dm_dev dm_dev;
 103};
 104
 105static struct kmem_cache *_io_cache;
 106static struct kmem_cache *_rq_tio_cache;
 107static struct kmem_cache *_rq_cache;
 108
 109/*
 110 * Bio-based DM's mempools' reserved IOs set by the user.
 111 */
 112#define RESERVED_BIO_BASED_IOS		16
 113static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS;
 114
 115static int __dm_get_module_param_int(int *module_param, int min, int max)
 116{
 117	int param = ACCESS_ONCE(*module_param);
 118	int modified_param = 0;
 119	bool modified = true;
 120
 121	if (param < min)
 122		modified_param = min;
 123	else if (param > max)
 124		modified_param = max;
 125	else
 126		modified = false;
 127
 128	if (modified) {
 129		(void)cmpxchg(module_param, param, modified_param);
 130		param = modified_param;
 131	}
 132
 133	return param;
 134}
 135
 136unsigned __dm_get_module_param(unsigned *module_param,
 137			       unsigned def, unsigned max)
 138{
 139	unsigned param = ACCESS_ONCE(*module_param);
 140	unsigned modified_param = 0;
 141
 142	if (!param)
 143		modified_param = def;
 144	else if (param > max)
 145		modified_param = max;
 146
 147	if (modified_param) {
 148		(void)cmpxchg(module_param, param, modified_param);
 149		param = modified_param;
 150	}
 151
 152	return param;
 153}
 154
 155unsigned dm_get_reserved_bio_based_ios(void)
 156{
 157	return __dm_get_module_param(&reserved_bio_based_ios,
 158				     RESERVED_BIO_BASED_IOS, DM_RESERVED_MAX_IOS);
 159}
 160EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios);
 161
 162static unsigned dm_get_numa_node(void)
 163{
 164	return __dm_get_module_param_int(&dm_numa_node,
 165					 DM_NUMA_NODE, num_online_nodes() - 1);
 166}
 167
 168static int __init local_init(void)
 169{
 170	int r = -ENOMEM;
 171
 172	/* allocate a slab for the dm_ios */
 173	_io_cache = KMEM_CACHE(dm_io, 0);
 174	if (!_io_cache)
 175		return r;
 176
 177	_rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0);
 178	if (!_rq_tio_cache)
 179		goto out_free_io_cache;
 180
 181	_rq_cache = kmem_cache_create("dm_old_clone_request", sizeof(struct request),
 182				      __alignof__(struct request), 0, NULL);
 183	if (!_rq_cache)
 184		goto out_free_rq_tio_cache;
 185
 186	r = dm_uevent_init();
 187	if (r)
 188		goto out_free_rq_cache;
 189
 190	deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1);
 191	if (!deferred_remove_workqueue) {
 192		r = -ENOMEM;
 193		goto out_uevent_exit;
 194	}
 195
 196	_major = major;
 197	r = register_blkdev(_major, _name);
 198	if (r < 0)
 199		goto out_free_workqueue;
 200
 201	if (!_major)
 202		_major = r;
 203
 204	return 0;
 205
 206out_free_workqueue:
 207	destroy_workqueue(deferred_remove_workqueue);
 208out_uevent_exit:
 209	dm_uevent_exit();
 210out_free_rq_cache:
 211	kmem_cache_destroy(_rq_cache);
 212out_free_rq_tio_cache:
 213	kmem_cache_destroy(_rq_tio_cache);
 214out_free_io_cache:
 215	kmem_cache_destroy(_io_cache);
 216
 217	return r;
 218}
 219
 220static void local_exit(void)
 221{
 222	flush_scheduled_work();
 223	destroy_workqueue(deferred_remove_workqueue);
 224
 225	kmem_cache_destroy(_rq_cache);
 226	kmem_cache_destroy(_rq_tio_cache);
 227	kmem_cache_destroy(_io_cache);
 228	unregister_blkdev(_major, _name);
 229	dm_uevent_exit();
 230
 231	_major = 0;
 232
 233	DMINFO("cleaned up");
 234}
 235
 236static int (*_inits[])(void) __initdata = {
 237	local_init,
 238	dm_target_init,
 239	dm_linear_init,
 240	dm_stripe_init,
 241	dm_io_init,
 242	dm_kcopyd_init,
 243	dm_interface_init,
 244	dm_statistics_init,
 245};
 246
 247static void (*_exits[])(void) = {
 248	local_exit,
 249	dm_target_exit,
 250	dm_linear_exit,
 251	dm_stripe_exit,
 252	dm_io_exit,
 253	dm_kcopyd_exit,
 254	dm_interface_exit,
 255	dm_statistics_exit,
 256};
 257
 258static int __init dm_init(void)
 259{
 260	const int count = ARRAY_SIZE(_inits);
 261
 262	int r, i;
 263
 264	for (i = 0; i < count; i++) {
 265		r = _inits[i]();
 266		if (r)
 267			goto bad;
 268	}
 269
 270	return 0;
 271
 272      bad:
 273	while (i--)
 274		_exits[i]();
 275
 276	return r;
 277}
 278
 279static void __exit dm_exit(void)
 280{
 281	int i = ARRAY_SIZE(_exits);
 282
 283	while (i--)
 284		_exits[i]();
 285
 286	/*
 287	 * Should be empty by this point.
 288	 */
 289	idr_destroy(&_minor_idr);
 290}
 291
 292/*
 293 * Block device functions
 294 */
 295int dm_deleting_md(struct mapped_device *md)
 296{
 297	return test_bit(DMF_DELETING, &md->flags);
 298}
 299
 300static int dm_blk_open(struct block_device *bdev, fmode_t mode)
 301{
 302	struct mapped_device *md;
 303
 304	spin_lock(&_minor_lock);
 305
 306	md = bdev->bd_disk->private_data;
 307	if (!md)
 308		goto out;
 309
 310	if (test_bit(DMF_FREEING, &md->flags) ||
 311	    dm_deleting_md(md)) {
 312		md = NULL;
 313		goto out;
 314	}
 315
 316	dm_get(md);
 317	atomic_inc(&md->open_count);
 318out:
 319	spin_unlock(&_minor_lock);
 320
 321	return md ? 0 : -ENXIO;
 322}
 323
 324static void dm_blk_close(struct gendisk *disk, fmode_t mode)
 325{
 326	struct mapped_device *md;
 327
 328	spin_lock(&_minor_lock);
 329
 330	md = disk->private_data;
 331	if (WARN_ON(!md))
 332		goto out;
 333
 334	if (atomic_dec_and_test(&md->open_count) &&
 335	    (test_bit(DMF_DEFERRED_REMOVE, &md->flags)))
 336		queue_work(deferred_remove_workqueue, &deferred_remove_work);
 337
 338	dm_put(md);
 339out:
 340	spin_unlock(&_minor_lock);
 341}
 342
 343int dm_open_count(struct mapped_device *md)
 344{
 345	return atomic_read(&md->open_count);
 346}
 347
 348/*
 349 * Guarantees nothing is using the device before it's deleted.
 350 */
 351int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred)
 352{
 353	int r = 0;
 354
 355	spin_lock(&_minor_lock);
 356
 357	if (dm_open_count(md)) {
 358		r = -EBUSY;
 359		if (mark_deferred)
 360			set_bit(DMF_DEFERRED_REMOVE, &md->flags);
 361	} else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags))
 362		r = -EEXIST;
 363	else
 364		set_bit(DMF_DELETING, &md->flags);
 365
 366	spin_unlock(&_minor_lock);
 367
 368	return r;
 369}
 370
 371int dm_cancel_deferred_remove(struct mapped_device *md)
 372{
 373	int r = 0;
 374
 375	spin_lock(&_minor_lock);
 376
 377	if (test_bit(DMF_DELETING, &md->flags))
 378		r = -EBUSY;
 379	else
 380		clear_bit(DMF_DEFERRED_REMOVE, &md->flags);
 381
 382	spin_unlock(&_minor_lock);
 383
 384	return r;
 385}
 386
 387static void do_deferred_remove(struct work_struct *w)
 388{
 389	dm_deferred_remove();
 390}
 391
 392sector_t dm_get_size(struct mapped_device *md)
 393{
 394	return get_capacity(md->disk);
 395}
 396
 397struct request_queue *dm_get_md_queue(struct mapped_device *md)
 398{
 399	return md->queue;
 400}
 401
 402struct dm_stats *dm_get_stats(struct mapped_device *md)
 403{
 404	return &md->stats;
 405}
 406
 407static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 408{
 409	struct mapped_device *md = bdev->bd_disk->private_data;
 410
 411	return dm_get_geometry(md, geo);
 412}
 413
 414static int dm_grab_bdev_for_ioctl(struct mapped_device *md,
 415				  struct block_device **bdev,
 416				  fmode_t *mode)
 417{
 418	struct dm_target *tgt;
 419	struct dm_table *map;
 420	int srcu_idx, r;
 421
 422retry:
 423	r = -ENOTTY;
 424	map = dm_get_live_table(md, &srcu_idx);
 425	if (!map || !dm_table_get_size(map))
 426		goto out;
 427
 428	/* We only support devices that have a single target */
 429	if (dm_table_get_num_targets(map) != 1)
 430		goto out;
 431
 432	tgt = dm_table_get_target(map, 0);
 433	if (!tgt->type->prepare_ioctl)
 434		goto out;
 435
 436	if (dm_suspended_md(md)) {
 437		r = -EAGAIN;
 438		goto out;
 439	}
 440
 441	r = tgt->type->prepare_ioctl(tgt, bdev, mode);
 442	if (r < 0)
 443		goto out;
 444
 445	bdgrab(*bdev);
 446	dm_put_live_table(md, srcu_idx);
 447	return r;
 448
 449out:
 450	dm_put_live_table(md, srcu_idx);
 451	if (r == -ENOTCONN && !fatal_signal_pending(current)) {
 452		msleep(10);
 453		goto retry;
 454	}
 455	return r;
 456}
 457
 458static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
 459			unsigned int cmd, unsigned long arg)
 460{
 461	struct mapped_device *md = bdev->bd_disk->private_data;
 462	int r;
 463
 464	r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
 465	if (r < 0)
 466		return r;
 467
 468	if (r > 0) {
 469		/*
 470		 * Target determined this ioctl is being issued against a
 471		 * subset of the parent bdev; require extra privileges.
 472		 */
 473		if (!capable(CAP_SYS_RAWIO)) {
 474			DMWARN_LIMIT(
 475	"%s: sending ioctl %x to DM device without required privilege.",
 476				current->comm, cmd);
 477			r = -ENOIOCTLCMD;
 478			goto out;
 479		}
 480	}
 481
 482	r =  __blkdev_driver_ioctl(bdev, mode, cmd, arg);
 483out:
 484	bdput(bdev);
 485	return r;
 486}
 487
 488static struct dm_io *alloc_io(struct mapped_device *md)
 489{
 490	return mempool_alloc(md->io_pool, GFP_NOIO);
 491}
 492
 493static void free_io(struct mapped_device *md, struct dm_io *io)
 494{
 495	mempool_free(io, md->io_pool);
 496}
 497
 498static void free_tio(struct dm_target_io *tio)
 499{
 500	bio_put(&tio->clone);
 501}
 502
 503int md_in_flight(struct mapped_device *md)
 504{
 505	return atomic_read(&md->pending[READ]) +
 506	       atomic_read(&md->pending[WRITE]);
 507}
 508
 509static void start_io_acct(struct dm_io *io)
 510{
 511	struct mapped_device *md = io->md;
 512	struct bio *bio = io->bio;
 513	int cpu;
 514	int rw = bio_data_dir(bio);
 515
 516	io->start_time = jiffies;
 517
 518	cpu = part_stat_lock();
 519	part_round_stats(cpu, &dm_disk(md)->part0);
 520	part_stat_unlock();
 521	atomic_set(&dm_disk(md)->part0.in_flight[rw],
 522		atomic_inc_return(&md->pending[rw]));
 523
 524	if (unlikely(dm_stats_used(&md->stats)))
 525		dm_stats_account_io(&md->stats, bio_data_dir(bio),
 526				    bio->bi_iter.bi_sector, bio_sectors(bio),
 527				    false, 0, &io->stats_aux);
 528}
 529
 530static void end_io_acct(struct dm_io *io)
 531{
 532	struct mapped_device *md = io->md;
 533	struct bio *bio = io->bio;
 534	unsigned long duration = jiffies - io->start_time;
 535	int pending;
 536	int rw = bio_data_dir(bio);
 537
 538	generic_end_io_acct(rw, &dm_disk(md)->part0, io->start_time);
 539
 540	if (unlikely(dm_stats_used(&md->stats)))
 541		dm_stats_account_io(&md->stats, bio_data_dir(bio),
 542				    bio->bi_iter.bi_sector, bio_sectors(bio),
 543				    true, duration, &io->stats_aux);
 544
 545	/*
 546	 * After this is decremented the bio must not be touched if it is
 547	 * a flush.
 548	 */
 549	pending = atomic_dec_return(&md->pending[rw]);
 550	atomic_set(&dm_disk(md)->part0.in_flight[rw], pending);
 551	pending += atomic_read(&md->pending[rw^0x1]);
 552
 553	/* nudge anyone waiting on suspend queue */
 554	if (!pending)
 555		wake_up(&md->wait);
 556}
 557
 558/*
 559 * Add the bio to the list of deferred io.
 560 */
 561static void queue_io(struct mapped_device *md, struct bio *bio)
 562{
 563	unsigned long flags;
 564
 565	spin_lock_irqsave(&md->deferred_lock, flags);
 566	bio_list_add(&md->deferred, bio);
 567	spin_unlock_irqrestore(&md->deferred_lock, flags);
 568	queue_work(md->wq, &md->work);
 569}
 570
 571/*
 572 * Everyone (including functions in this file), should use this
 573 * function to access the md->map field, and make sure they call
 574 * dm_put_live_table() when finished.
 575 */
 576struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx) __acquires(md->io_barrier)
 577{
 578	*srcu_idx = srcu_read_lock(&md->io_barrier);
 579
 580	return srcu_dereference(md->map, &md->io_barrier);
 581}
 582
 583void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier)
 584{
 585	srcu_read_unlock(&md->io_barrier, srcu_idx);
 586}
 587
 588void dm_sync_table(struct mapped_device *md)
 589{
 590	synchronize_srcu(&md->io_barrier);
 591	synchronize_rcu_expedited();
 592}
 593
 594/*
 595 * A fast alternative to dm_get_live_table/dm_put_live_table.
 596 * The caller must not block between these two functions.
 597 */
 598static struct dm_table *dm_get_live_table_fast(struct mapped_device *md) __acquires(RCU)
 599{
 600	rcu_read_lock();
 601	return rcu_dereference(md->map);
 602}
 603
 604static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU)
 605{
 606	rcu_read_unlock();
 607}
 608
 609/*
 610 * Open a table device so we can use it as a map destination.
 611 */
 612static int open_table_device(struct table_device *td, dev_t dev,
 613			     struct mapped_device *md)
 614{
 615	static char *_claim_ptr = "I belong to device-mapper";
 616	struct block_device *bdev;
 617
 618	int r;
 619
 620	BUG_ON(td->dm_dev.bdev);
 621
 622	bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _claim_ptr);
 623	if (IS_ERR(bdev))
 624		return PTR_ERR(bdev);
 625
 626	r = bd_link_disk_holder(bdev, dm_disk(md));
 627	if (r) {
 628		blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL);
 629		return r;
 630	}
 631
 632	td->dm_dev.bdev = bdev;
 633	td->dm_dev.dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
 634	return 0;
 635}
 636
 637/*
 638 * Close a table device that we've been using.
 639 */
 640static void close_table_device(struct table_device *td, struct mapped_device *md)
 641{
 642	if (!td->dm_dev.bdev)
 643		return;
 644
 645	bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md));
 646	blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL);
 647	put_dax(td->dm_dev.dax_dev);
 648	td->dm_dev.bdev = NULL;
 649	td->dm_dev.dax_dev = NULL;
 650}
 651
 652static struct table_device *find_table_device(struct list_head *l, dev_t dev,
 653					      fmode_t mode) {
 654	struct table_device *td;
 655
 656	list_for_each_entry(td, l, list)
 657		if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode == mode)
 658			return td;
 659
 660	return NULL;
 661}
 662
 663int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode,
 664			struct dm_dev **result) {
 665	int r;
 666	struct table_device *td;
 667
 668	mutex_lock(&md->table_devices_lock);
 669	td = find_table_device(&md->table_devices, dev, mode);
 670	if (!td) {
 671		td = kmalloc_node(sizeof(*td), GFP_KERNEL, md->numa_node_id);
 672		if (!td) {
 673			mutex_unlock(&md->table_devices_lock);
 674			return -ENOMEM;
 675		}
 676
 677		td->dm_dev.mode = mode;
 678		td->dm_dev.bdev = NULL;
 679
 680		if ((r = open_table_device(td, dev, md))) {
 681			mutex_unlock(&md->table_devices_lock);
 682			kfree(td);
 683			return r;
 684		}
 685
 686		format_dev_t(td->dm_dev.name, dev);
 687
 688		atomic_set(&td->count, 0);
 689		list_add(&td->list, &md->table_devices);
 690	}
 691	atomic_inc(&td->count);
 692	mutex_unlock(&md->table_devices_lock);
 693
 694	*result = &td->dm_dev;
 695	return 0;
 696}
 697EXPORT_SYMBOL_GPL(dm_get_table_device);
 698
 699void dm_put_table_device(struct mapped_device *md, struct dm_dev *d)
 700{
 701	struct table_device *td = container_of(d, struct table_device, dm_dev);
 702
 703	mutex_lock(&md->table_devices_lock);
 704	if (atomic_dec_and_test(&td->count)) {
 705		close_table_device(td, md);
 706		list_del(&td->list);
 707		kfree(td);
 708	}
 709	mutex_unlock(&md->table_devices_lock);
 710}
 711EXPORT_SYMBOL(dm_put_table_device);
 712
 713static void free_table_devices(struct list_head *devices)
 714{
 715	struct list_head *tmp, *next;
 716
 717	list_for_each_safe(tmp, next, devices) {
 718		struct table_device *td = list_entry(tmp, struct table_device, list);
 719
 720		DMWARN("dm_destroy: %s still exists with %d references",
 721		       td->dm_dev.name, atomic_read(&td->count));
 722		kfree(td);
 723	}
 724}
 725
 726/*
 727 * Get the geometry associated with a dm device
 728 */
 729int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
 730{
 731	*geo = md->geometry;
 732
 733	return 0;
 734}
 735
 736/*
 737 * Set the geometry of a device.
 738 */
 739int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
 740{
 741	sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;
 742
 743	if (geo->start > sz) {
 744		DMWARN("Start sector is beyond the geometry limits.");
 745		return -EINVAL;
 746	}
 747
 748	md->geometry = *geo;
 749
 750	return 0;
 751}
 752
 753/*-----------------------------------------------------------------
 754 * CRUD START:
 755 *   A more elegant soln is in the works that uses the queue
 756 *   merge fn, unfortunately there are a couple of changes to
 757 *   the block layer that I want to make for this.  So in the
 758 *   interests of getting something for people to use I give
 759 *   you this clearly demarcated crap.
 760 *---------------------------------------------------------------*/
 761
 762static int __noflush_suspending(struct mapped_device *md)
 763{
 764	return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
 765}
 766
 767/*
 768 * Decrements the number of outstanding ios that a bio has been
 769 * cloned into, completing the original io if necc.
 770 */
 771static void dec_pending(struct dm_io *io, int error)
 772{
 773	unsigned long flags;
 774	int io_error;
 775	struct bio *bio;
 776	struct mapped_device *md = io->md;
 777
 778	/* Push-back supersedes any I/O errors */
 779	if (unlikely(error)) {
 780		spin_lock_irqsave(&io->endio_lock, flags);
 781		if (!(io->error > 0 && __noflush_suspending(md)))
 782			io->error = error;
 783		spin_unlock_irqrestore(&io->endio_lock, flags);
 784	}
 785
 786	if (atomic_dec_and_test(&io->io_count)) {
 787		if (io->error == DM_ENDIO_REQUEUE) {
 788			/*
 789			 * Target requested pushing back the I/O.
 790			 */
 791			spin_lock_irqsave(&md->deferred_lock, flags);
 792			if (__noflush_suspending(md))
 793				bio_list_add_head(&md->deferred, io->bio);
 794			else
 795				/* noflush suspend was interrupted. */
 796				io->error = -EIO;
 797			spin_unlock_irqrestore(&md->deferred_lock, flags);
 798		}
 799
 800		io_error = io->error;
 801		bio = io->bio;
 802		end_io_acct(io);
 803		free_io(md, io);
 804
 805		if (io_error == DM_ENDIO_REQUEUE)
 806			return;
 807
 808		if ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size) {
 809			/*
 810			 * Preflush done for flush with data, reissue
 811			 * without REQ_PREFLUSH.
 812			 */
 813			bio->bi_opf &= ~REQ_PREFLUSH;
 814			queue_io(md, bio);
 815		} else {
 816			/* done with normal IO or empty flush */
 817			bio->bi_error = io_error;
 818			bio_endio(bio);
 819		}
 820	}
 821}
 822
 823void disable_write_same(struct mapped_device *md)
 824{
 825	struct queue_limits *limits = dm_get_queue_limits(md);
 826
 827	/* device doesn't really support WRITE SAME, disable it */
 828	limits->max_write_same_sectors = 0;
 829}
 830
 831void disable_write_zeroes(struct mapped_device *md)
 832{
 833	struct queue_limits *limits = dm_get_queue_limits(md);
 834
 835	/* device doesn't really support WRITE ZEROES, disable it */
 836	limits->max_write_zeroes_sectors = 0;
 837}
 838
 839static void clone_endio(struct bio *bio)
 840{
 841	int error = bio->bi_error;
 842	int r = error;
 843	struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
 844	struct dm_io *io = tio->io;
 845	struct mapped_device *md = tio->io->md;
 846	dm_endio_fn endio = tio->ti->type->end_io;
 847
 848	if (endio) {
 849		r = endio(tio->ti, bio, error);
 850		if (r < 0 || r == DM_ENDIO_REQUEUE)
 851			/*
 852			 * error and requeue request are handled
 853			 * in dec_pending().
 854			 */
 855			error = r;
 856		else if (r == DM_ENDIO_INCOMPLETE)
 857			/* The target will handle the io */
 858			return;
 859		else if (r) {
 860			DMWARN("unimplemented target endio return value: %d", r);
 861			BUG();
 862		}
 863	}
 864
 865	if (unlikely(r == -EREMOTEIO)) {
 866		if (bio_op(bio) == REQ_OP_WRITE_SAME &&
 867		    !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors)
 868			disable_write_same(md);
 869		if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
 870		    !bdev_get_queue(bio->bi_bdev)->limits.max_write_zeroes_sectors)
 871			disable_write_zeroes(md);
 872	}
 873
 874	free_tio(tio);
 875	dec_pending(io, error);
 876}
 877
 878/*
 879 * Return maximum size of I/O possible at the supplied sector up to the current
 880 * target boundary.
 881 */
 882static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti)
 883{
 884	sector_t target_offset = dm_target_offset(ti, sector);
 885
 886	return ti->len - target_offset;
 887}
 888
 889static sector_t max_io_len(sector_t sector, struct dm_target *ti)
 890{
 891	sector_t len = max_io_len_target_boundary(sector, ti);
 892	sector_t offset, max_len;
 893
 894	/*
 895	 * Does the target need to split even further?
 896	 */
 897	if (ti->max_io_len) {
 898		offset = dm_target_offset(ti, sector);
 899		if (unlikely(ti->max_io_len & (ti->max_io_len - 1)))
 900			max_len = sector_div(offset, ti->max_io_len);
 901		else
 902			max_len = offset & (ti->max_io_len - 1);
 903		max_len = ti->max_io_len - max_len;
 904
 905		if (len > max_len)
 906			len = max_len;
 907	}
 908
 909	return len;
 910}
 911
 912int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
 913{
 914	if (len > UINT_MAX) {
 915		DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)",
 916		      (unsigned long long)len, UINT_MAX);
 917		ti->error = "Maximum size of target IO is too large";
 918		return -EINVAL;
 919	}
 920
 921	ti->max_io_len = (uint32_t) len;
 922
 923	return 0;
 924}
 925EXPORT_SYMBOL_GPL(dm_set_target_max_io_len);
 926
 927static struct dm_target *dm_dax_get_live_target(struct mapped_device *md,
 928		sector_t sector, int *srcu_idx)
 929{
 930	struct dm_table *map;
 931	struct dm_target *ti;
 932
 933	map = dm_get_live_table(md, srcu_idx);
 934	if (!map)
 935		return NULL;
 936
 937	ti = dm_table_find_target(map, sector);
 938	if (!dm_target_is_valid(ti))
 939		return NULL;
 940
 941	return ti;
 942}
 943
 944static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
 945		long nr_pages, void **kaddr, pfn_t *pfn)
 946{
 947	struct mapped_device *md = dax_get_private(dax_dev);
 948	sector_t sector = pgoff * PAGE_SECTORS;
 949	struct dm_target *ti;
 950	long len, ret = -EIO;
 951	int srcu_idx;
 952
 953	ti = dm_dax_get_live_target(md, sector, &srcu_idx);
 954
 955	if (!ti)
 956		goto out;
 957	if (!ti->type->direct_access)
 958		goto out;
 959	len = max_io_len(sector, ti) / PAGE_SECTORS;
 960	if (len < 1)
 961		goto out;
 962	nr_pages = min(len, nr_pages);
 963	if (ti->type->direct_access)
 964		ret = ti->type->direct_access(ti, pgoff, nr_pages, kaddr, pfn);
 965
 966 out:
 967	dm_put_live_table(md, srcu_idx);
 968
 969	return ret;
 970}
 971
 972/*
 973 * A target may call dm_accept_partial_bio only from the map routine.  It is
 974 * allowed for all bio types except REQ_PREFLUSH.
 975 *
 976 * dm_accept_partial_bio informs the dm that the target only wants to process
 977 * additional n_sectors sectors of the bio and the rest of the data should be
 978 * sent in a next bio.
 979 *
 980 * A diagram that explains the arithmetics:
 981 * +--------------------+---------------+-------+
 982 * |         1          |       2       |   3   |
 983 * +--------------------+---------------+-------+
 984 *
 985 * <-------------- *tio->len_ptr --------------->
 986 *                      <------- bi_size ------->
 987 *                      <-- n_sectors -->
 988 *
 989 * Region 1 was already iterated over with bio_advance or similar function.
 990 *	(it may be empty if the target doesn't use bio_advance)
 991 * Region 2 is the remaining bio size that the target wants to process.
 992 *	(it may be empty if region 1 is non-empty, although there is no reason
 993 *	 to make it empty)
 994 * The target requires that region 3 is to be sent in the next bio.
 995 *
 996 * If the target wants to receive multiple copies of the bio (via num_*bios, etc),
 997 * the partially processed part (the sum of regions 1+2) must be the same for all
 998 * copies of the bio.
 999 */
1000void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors)
1001{
1002	struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
1003	unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT;
1004	BUG_ON(bio->bi_opf & REQ_PREFLUSH);
1005	BUG_ON(bi_size > *tio->len_ptr);
1006	BUG_ON(n_sectors > bi_size);
1007	*tio->len_ptr -= bi_size - n_sectors;
1008	bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT;
1009}
1010EXPORT_SYMBOL_GPL(dm_accept_partial_bio);
1011
1012/*
1013 * Flush current->bio_list when the target map method blocks.
1014 * This fixes deadlocks in snapshot and possibly in other targets.
1015 */
1016struct dm_offload {
1017	struct blk_plug plug;
1018	struct blk_plug_cb cb;
1019};
1020
1021static void flush_current_bio_list(struct blk_plug_cb *cb, bool from_schedule)
1022{
1023	struct dm_offload *o = container_of(cb, struct dm_offload, cb);
1024	struct bio_list list;
1025	struct bio *bio;
1026	int i;
1027
1028	INIT_LIST_HEAD(&o->cb.list);
1029
1030	if (unlikely(!current->bio_list))
1031		return;
1032
1033	for (i = 0; i < 2; i++) {
1034		list = current->bio_list[i];
1035		bio_list_init(&current->bio_list[i]);
1036
1037		while ((bio = bio_list_pop(&list))) {
1038			struct bio_set *bs = bio->bi_pool;
1039			if (unlikely(!bs) || bs == fs_bio_set) {
1040				bio_list_add(&current->bio_list[i], bio);
1041				continue;
1042			}
1043
1044			spin_lock(&bs->rescue_lock);
1045			bio_list_add(&bs->rescue_list, bio);
1046			queue_work(bs->rescue_workqueue, &bs->rescue_work);
1047			spin_unlock(&bs->rescue_lock);
1048		}
1049	}
1050}
1051
1052static void dm_offload_start(struct dm_offload *o)
1053{
1054	blk_start_plug(&o->plug);
1055	o->cb.callback = flush_current_bio_list;
1056	list_add(&o->cb.list, &current->plug->cb_list);
1057}
1058
1059static void dm_offload_end(struct dm_offload *o)
1060{
1061	list_del(&o->cb.list);
1062	blk_finish_plug(&o->plug);
1063}
1064
1065static void __map_bio(struct dm_target_io *tio)
1066{
1067	int r;
1068	sector_t sector;
1069	struct dm_offload o;
1070	struct bio *clone = &tio->clone;
1071	struct dm_target *ti = tio->ti;
1072
1073	clone->bi_end_io = clone_endio;
1074
1075	/*
1076	 * Map the clone.  If r == 0 we don't need to do
1077	 * anything, the target has assumed ownership of
1078	 * this io.
1079	 */
1080	atomic_inc(&tio->io->io_count);
1081	sector = clone->bi_iter.bi_sector;
1082
1083	dm_offload_start(&o);
1084	r = ti->type->map(ti, clone);
1085	dm_offload_end(&o);
1086
1087	if (r == DM_MAPIO_REMAPPED) {
1088		/* the bio has been remapped so dispatch it */
1089
1090		trace_block_bio_remap(bdev_get_queue(clone->bi_bdev), clone,
1091				      tio->io->bio->bi_bdev->bd_dev, sector);
1092
1093		generic_make_request(clone);
1094	} else if (r < 0 || r == DM_MAPIO_REQUEUE) {
1095		/* error the io and bail out, or requeue it if needed */
1096		dec_pending(tio->io, r);
1097		free_tio(tio);
1098	} else if (r != DM_MAPIO_SUBMITTED) {
1099		DMWARN("unimplemented target map return value: %d", r);
1100		BUG();
1101	}
1102}
1103
1104struct clone_info {
1105	struct mapped_device *md;
1106	struct dm_table *map;
1107	struct bio *bio;
1108	struct dm_io *io;
1109	sector_t sector;
1110	unsigned sector_count;
1111};
1112
1113static void bio_setup_sector(struct bio *bio, sector_t sector, unsigned len)
1114{
1115	bio->bi_iter.bi_sector = sector;
1116	bio->bi_iter.bi_size = to_bytes(len);
1117}
1118
1119/*
1120 * Creates a bio that consists of range of complete bvecs.
1121 */
1122static int clone_bio(struct dm_target_io *tio, struct bio *bio,
1123		     sector_t sector, unsigned len)
1124{
1125	struct bio *clone = &tio->clone;
1126
1127	__bio_clone_fast(clone, bio);
1128
1129	if (unlikely(bio_integrity(bio) != NULL)) {
1130		int r;
1131
1132		if (unlikely(!dm_target_has_integrity(tio->ti->type) &&
1133			     !dm_target_passes_integrity(tio->ti->type))) {
1134			DMWARN("%s: the target %s doesn't support integrity data.",
1135				dm_device_name(tio->io->md),
1136				tio->ti->type->name);
1137			return -EIO;
1138		}
1139
1140		r = bio_integrity_clone(clone, bio, GFP_NOIO);
1141		if (r < 0)
1142			return r;
1143	}
1144
1145	bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector));
1146	clone->bi_iter.bi_size = to_bytes(len);
1147
1148	if (unlikely(bio_integrity(bio) != NULL))
1149		bio_integrity_trim(clone, 0, len);
1150
1151	return 0;
1152}
1153
1154static struct dm_target_io *alloc_tio(struct clone_info *ci,
1155				      struct dm_target *ti,
1156				      unsigned target_bio_nr)
1157{
1158	struct dm_target_io *tio;
1159	struct bio *clone;
1160
1161	clone = bio_alloc_bioset(GFP_NOIO, 0, ci->md->bs);
1162	tio = container_of(clone, struct dm_target_io, clone);
1163
1164	tio->io = ci->io;
1165	tio->ti = ti;
1166	tio->target_bio_nr = target_bio_nr;
1167
1168	return tio;
1169}
1170
1171static void __clone_and_map_simple_bio(struct clone_info *ci,
1172				       struct dm_target *ti,
1173				       unsigned target_bio_nr, unsigned *len)
1174{
1175	struct dm_target_io *tio = alloc_tio(ci, ti, target_bio_nr);
1176	struct bio *clone = &tio->clone;
1177
1178	tio->len_ptr = len;
1179
1180	__bio_clone_fast(clone, ci->bio);
1181	if (len)
1182		bio_setup_sector(clone, ci->sector, *len);
1183
1184	__map_bio(tio);
1185}
1186
/*
 * Send @num_bios copies of ci->bio to @ti, numbered 0 .. @num_bios - 1.
 * @len is forwarded unchanged to __clone_and_map_simple_bio() and may be
 * NULL.
 */
static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
				  unsigned num_bios, unsigned *len)
{
	unsigned nr = 0;

	while (nr < num_bios) {
		__clone_and_map_simple_bio(ci, ti, nr, len);
		nr++;
	}
}
1195
1196static int __send_empty_flush(struct clone_info *ci)
1197{
1198	unsigned target_nr = 0;
1199	struct dm_target *ti;
1200
1201	BUG_ON(bio_has_data(ci->bio));
1202	while ((ti = dm_table_get_target(ci->map, target_nr++)))
1203		__send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL);
1204
1205	return 0;
1206}
1207
1208static int __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti,
1209				     sector_t sector, unsigned *len)
1210{
1211	struct bio *bio = ci->bio;
1212	struct dm_target_io *tio;
1213	unsigned target_bio_nr;
1214	unsigned num_target_bios = 1;
1215	int r = 0;
1216
1217	/*
1218	 * Does the target want to receive duplicate copies of the bio?
1219	 */
1220	if (bio_data_dir(bio) == WRITE && ti->num_write_bios)
1221		num_target_bios = ti->num_write_bios(ti, bio);
1222
1223	for (target_bio_nr = 0; target_bio_nr < num_target_bios; target_bio_nr++) {
1224		tio = alloc_tio(ci, ti, target_bio_nr);
1225		tio->len_ptr = len;
1226		r = clone_bio(tio, bio, sector, *len);
1227		if (r < 0) {
1228			free_tio(tio);
1229			break;
1230		}
1231		__map_bio(tio);
1232	}
1233
1234	return r;
1235}
1236
/*
 * Accessor type handed to __send_changing_extent_only(): returns how many
 * bios of a given kind the target @ti wants to receive.
 */
typedef unsigned (*get_num_bios_fn)(struct dm_target *ti);

/* get_num_bios_fn for discards: reports ti->num_discard_bios. */
static unsigned get_num_discard_bios(struct dm_target *ti)
{
	return ti->num_discard_bios;
}
1243
/* get_num_bios_fn for WRITE SAME: reports ti->num_write_same_bios. */
static unsigned get_num_write_same_bios(struct dm_target *ti)
{
	return ti->num_write_same_bios;
}
1248
/* get_num_bios_fn for WRITE ZEROES: reports ti->num_write_zeroes_bios. */
static unsigned get_num_write_zeroes_bios(struct dm_target *ti)
{
	return ti->num_write_zeroes_bios;
}
1253
/*
 * Predicate type handed to __send_changing_extent_only(): tells it which
 * length limit to apply for target @ti.
 */
typedef bool (*is_split_required_fn)(struct dm_target *ti);

/* is_split_required_fn for discards: reports ti->split_discard_bios. */
static bool is_split_required_for_discard(struct dm_target *ti)
{
	return ti->split_discard_bios;
}
1260
1261static int __send_changing_extent_only(struct clone_info *ci,
1262				       get_num_bios_fn get_num_bios,
1263				       is_split_required_fn is_split_required)
1264{
1265	struct dm_target *ti;
1266	unsigned len;
1267	unsigned num_bios;
1268
1269	do {
1270		ti = dm_table_find_target(ci->map, ci->sector);
1271		if (!dm_target_is_valid(ti))
1272			return -EIO;
1273
1274		/*
1275		 * Even though the device advertised support for this type of
1276		 * request, that does not mean every target supports it, and
1277		 * reconfiguration might also have changed that since the
1278		 * check was performed.
1279		 */
1280		num_bios = get_num_bios ? get_num_bios(ti) : 0;
1281		if (!num_bios)
1282			return -EOPNOTSUPP;
1283
1284		if (is_split_required && !is_split_required(ti))
1285			len = min((sector_t)ci->sector_count, max_io_len_target_boundary(ci->sector, ti));
1286		else
1287			len = min((sector_t)ci->sector_count, max_io_len(ci->sector, ti));
1288
1289		__send_duplicate_bios(ci, ti, num_bios, &len);
1290
1291		ci->sector += len;
1292	} while (ci->sector_count -= len);
1293
1294	return 0;
1295}
1296
/*
 * Send a discard: __send_changing_extent_only() using each target's
 * num_discard_bios and its split_discard_bios setting.
 */
static int __send_discard(struct clone_info *ci)
{
	return __send_changing_extent_only(ci, get_num_discard_bios,
					   is_split_required_for_discard);
}
1302
/*
 * Send a WRITE SAME: __send_changing_extent_only() using each target's
 * num_write_same_bios; no split predicate, so max_io_len() always applies.
 */
static int __send_write_same(struct clone_info *ci)
{
	return __send_changing_extent_only(ci, get_num_write_same_bios, NULL);
}
1307
/*
 * Send a WRITE ZEROES: __send_changing_extent_only() using each target's
 * num_write_zeroes_bios; no split predicate, so max_io_len() always applies.
 */
static int __send_write_zeroes(struct clone_info *ci)
{
	return __send_changing_extent_only(ci, get_num_write_zeroes_bios, NULL);
}
1312
1313/*
1314 * Select the correct strategy for processing a non-flush bio.
1315 */
1316static int __split_and_process_non_flush(struct clone_info *ci)
1317{
1318	struct bio *bio = ci->bio;
1319	struct dm_target *ti;
1320	unsigned len;
1321	int r;
1322
1323	if (unlikely(bio_op(bio) == REQ_OP_DISCARD))
1324		return __send_discard(ci);
1325	else if (unlikely(bio_op(bio) == REQ_OP_WRITE_SAME))
1326		return __send_write_same(ci);
1327	else if (unlikely(bio_op(bio) == REQ_OP_WRITE_ZEROES))
1328		return __send_write_zeroes(ci);
1329
1330	ti = dm_table_find_target(ci->map, ci->sector);
1331	if (!dm_target_is_valid(ti))
1332		return -EIO;
1333
1334	len = min_t(sector_t, max_io_len(ci->sector, ti), ci->sector_count);
1335
1336	r = __clone_and_map_data_bio(ci, ti, ci->sector, &len);
1337	if (r < 0)
1338		return r;
1339
1340	ci->sector += len;
1341	ci->sector_count -= len;
1342
1343	return 0;
1344}
1345
1346/*
1347 * Entry point to split a bio into clones and submit them to the targets.
1348 */
1349static void __split_and_process_bio(struct mapped_device *md,
1350				    struct dm_table *map, struct bio *bio)
1351{
1352	struct clone_info ci;
1353	int error = 0;
1354
1355	if (unlikely(!map)) {
1356		bio_io_error(bio);
1357		return;
1358	}
1359
1360	ci.map = map;
1361	ci.md = md;
1362	ci.io = alloc_io(md);
1363	ci.io->error = 0;
1364	atomic_set(&ci.io->io_count, 1);
1365	ci.io->bio = bio;
1366	ci.io->md = md;
1367	spin_lock_init(&ci.io->endio_lock);
1368	ci.sector = bio->bi_iter.bi_sector;
1369
1370	start_io_acct(ci.io);
1371
1372	if (bio->bi_opf & REQ_PREFLUSH) {
1373		ci.bio = &ci.md->flush_bio;
1374		ci.sector_count = 0;
1375		error = __send_empty_flush(&ci);
1376		/* dec_pending submits any data associated with flush */
1377	} else {
1378		ci.bio = bio;
1379		ci.sector_count = bio_sectors(bio);
1380		while (ci.sector_count && !error)
1381			error = __split_and_process_non_flush(&ci);
1382	}
1383
1384	/* drop the extra reference count */
1385	dec_pending(ci.io, error);
1386}
1387/*-----------------------------------------------------------------
1388 * CRUD END
1389 *---------------------------------------------------------------*/
1390
1391/*
1392 * The request function that just remaps the bio built up by
1393 * dm_merge_bvec.
1394 */
1395static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio)
1396{
1397	int rw = bio_data_dir(bio);
1398	struct mapped_device *md = q->queuedata;
1399	int srcu_idx;
1400	struct dm_table *map;
1401
1402	map = dm_get_live_table(md, &srcu_idx);
1403
1404	generic_start_io_acct(rw, bio_sectors(bio), &dm_disk(md)->part0);
1405
1406	/* if we're suspended, we have to queue this io for later */
1407	if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
1408		dm_put_live_table(md, srcu_idx);
1409
1410		if (!(bio->bi_opf & REQ_RAHEAD))
1411			queue_io(md, bio);
1412		else
1413			bio_io_error(bio);
1414		return BLK_QC_T_NONE;
1415	}
1416
1417	__split_and_process_bio(md, map, bio);
1418	dm_put_live_table(md, srcu_idx);
1419	return BLK_QC_T_NONE;
1420}
1421
1422static int dm_any_congested(void *congested_data, int bdi_bits)
1423{
1424	int r = bdi_bits;
1425	struct mapped_device *md = congested_data;
1426	struct dm_table *map;
1427
1428	if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
1429		if (dm_request_based(md)) {
1430			/*
1431			 * With request-based DM we only need to check the
1432			 * top-level queue for congestion.
1433			 */
1434			r = md->queue->backing_dev_info->wb.state & bdi_bits;
1435		} else {
1436			map = dm_get_live_table_fast(md);
1437			if (map)
1438				r = dm_table_any_congested(map, bdi_bits);
1439			dm_put_live_table_fast(md);
1440		}
1441	}
1442
1443	return r;
1444}
1445
1446/*-----------------------------------------------------------------
1447 * An IDR is used to keep track of allocated minor numbers.
1448 *---------------------------------------------------------------*/
/* Drop @minor from _minor_idr, under _minor_lock. */
static void free_minor(int minor)
{
	spin_lock(&_minor_lock);
	idr_remove(&_minor_idr, minor);
	spin_unlock(&_minor_lock);
}
1455
1456/*
1457 * See if the device with a specific minor # is free.
1458 */
1459static int specific_minor(int minor)
1460{
1461	int r;
1462
1463	if (minor >= (1 << MINORBITS))
1464		return -EINVAL;
1465
1466	idr_preload(GFP_KERNEL);
1467	spin_lock(&_minor_lock);
1468
1469	r = idr_alloc(&_minor_idr, MINOR_ALLOCED, minor, minor + 1, GFP_NOWAIT);
1470
1471	spin_unlock(&_minor_lock);
1472	idr_preload_end();
1473	if (r < 0)
1474		return r == -ENOSPC ? -EBUSY : r;
1475	return 0;
1476}
1477
1478static int next_free_minor(int *minor)
1479{
1480	int r;
1481
1482	idr_preload(GFP_KERNEL);
1483	spin_lock(&_minor_lock);
1484
1485	r = idr_alloc(&_minor_idr, MINOR_ALLOCED, 0, 1 << MINORBITS, GFP_NOWAIT);
1486
1487	spin_unlock(&_minor_lock);
1488	idr_preload_end();
1489	if (r < 0)
1490		return r;
1491	*minor = r;
1492	return 0;
1493}
1494
/*
 * Forward declarations: alloc_dev() below refers to these before they are
 * defined.
 */
static const struct block_device_operations dm_blk_dops;
static const struct dax_operations dm_dax_ops;

static void dm_wq_work(struct work_struct *work);
1499
1500void dm_init_md_queue(struct mapped_device *md)
1501{
1502	/*
1503	 * Request-based dm devices cannot be stacked on top of bio-based dm
1504	 * devices.  The type of this dm device may not have been decided yet.
1505	 * The type is decided at the first table loading time.
1506	 * To prevent problematic device stacking, clear the queue flag
1507	 * for request stacking support until then.
1508	 *
1509	 * This queue is new, so no concurrency on the queue_flags.
1510	 */
1511	queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue);
1512
1513	/*
1514	 * Initialize data that will only be used by a non-blk-mq DM queue
1515	 * - must do so here (in alloc_dev callchain) before queue is used
1516	 */
1517	md->queue->queuedata = md;
1518	md->queue->backing_dev_info->congested_data = md;
1519}
1520
1521void dm_init_normal_md_queue(struct mapped_device *md)
1522{
1523	md->use_blk_mq = false;
1524	dm_init_md_queue(md);
1525
1526	/*
1527	 * Initialize aspects of queue that aren't relevant for blk-mq
1528	 */
1529	md->queue->backing_dev_info->congested_fn = dm_any_congested;
1530	blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
1531}
1532
/*
 * Release the resources hanging off @md.
 *
 * Used both by alloc_dev()'s error path (on a partially initialised,
 * zero-allocated md) and by free_dev(), which is why most resources are
 * tested before being released.  @md itself and its minor number are NOT
 * freed here; the callers do that.
 */
static void cleanup_mapped_device(struct mapped_device *md)
{
	if (md->wq)
		destroy_workqueue(md->wq);
	if (md->kworker_task)
		kthread_stop(md->kworker_task);
	/* Called unconditionally: io_pool may still be NULL (e.g. from alloc_dev()). */
	mempool_destroy(md->io_pool);
	if (md->bs)
		bioset_free(md->bs);

	if (md->dax_dev) {
		kill_dax(md->dax_dev);
		put_dax(md->dax_dev);
		md->dax_dev = NULL;
	}

	if (md->disk) {
		/*
		 * private_data is cleared under _minor_lock before the disk
		 * goes away.  NOTE(review): presumably so that lookups of md
		 * via disk->private_data under that lock stop finding it --
		 * confirm against the block device open path.
		 */
		spin_lock(&_minor_lock);
		md->disk->private_data = NULL;
		spin_unlock(&_minor_lock);
		del_gendisk(md->disk);
		put_disk(md->disk);
	}

	if (md->queue)
		blk_cleanup_queue(md->queue);

	/* Unconditional: callers only get here after init_srcu_struct() succeeded. */
	cleanup_srcu_struct(&md->io_barrier);

	if (md->bdev) {
		bdput(md->bdev);
		md->bdev = NULL;
	}

	dm_mq_cleanup_mapped_device(md);
}
1569
/*
 * Allocate and initialise a blank device with a given minor.
 *
 * @minor: minor number to claim, or DM_ANY_MINOR to take the next free one.
 *
 * On success the new mapped_device is returned with holders == 1, a
 * module reference taken, its gendisk added, and its _minor_idr slot
 * switched from the MINOR_ALLOCED placeholder to the md pointer.
 *
 * Returns NULL on any failure; everything acquired up to that point is
 * released through the unwinding labels at the bottom.
 */
static struct mapped_device *alloc_dev(int minor)
{
	int r, numa_node_id = dm_get_numa_node();
	struct dax_device *dax_dev;
	struct mapped_device *md;
	void *old_md;

	/* Zeroed allocation: cleanup_mapped_device() relies on unset fields being NULL. */
	md = kzalloc_node(sizeof(*md), GFP_KERNEL, numa_node_id);
	if (!md) {
		DMWARN("unable to allocate device, out of memory.");
		return NULL;
	}

	if (!try_module_get(THIS_MODULE))
		goto bad_module_get;

	/* get a minor number for the dev */
	if (minor == DM_ANY_MINOR)
		r = next_free_minor(&minor);
	else
		r = specific_minor(minor);
	if (r < 0)
		goto bad_minor;

	r = init_srcu_struct(&md->io_barrier);
	if (r < 0)
		goto bad_io_barrier;

	md->numa_node_id = numa_node_id;
	md->use_blk_mq = dm_use_blk_mq_default();
	md->init_tio_pdu = false;
	md->type = DM_TYPE_NONE;
	mutex_init(&md->suspend_lock);
	mutex_init(&md->type_lock);
	mutex_init(&md->table_devices_lock);
	spin_lock_init(&md->deferred_lock);
	/* The creator's reference. */
	atomic_set(&md->holders, 1);
	atomic_set(&md->open_count, 0);
	atomic_set(&md->event_nr, 0);
	atomic_set(&md->uevent_seq, 0);
	INIT_LIST_HEAD(&md->uevent_list);
	INIT_LIST_HEAD(&md->table_devices);
	spin_lock_init(&md->uevent_lock);

	md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id);
	if (!md->queue)
		goto bad;

	dm_init_md_queue(md);

	md->disk = alloc_disk_node(1, numa_node_id);
	if (!md->disk)
		goto bad;

	atomic_set(&md->pending[0], 0);
	atomic_set(&md->pending[1], 0);
	init_waitqueue_head(&md->wait);
	INIT_WORK(&md->work, dm_wq_work);
	init_waitqueue_head(&md->eventq);
	init_completion(&md->kobj_holder.completion);
	md->kworker_task = NULL;

	md->disk->major = _major;
	md->disk->first_minor = minor;
	md->disk->fops = &dm_blk_dops;
	md->disk->queue = md->queue;
	md->disk->private_data = md;
	sprintf(md->disk->disk_name, "dm-%d", minor);

	dax_dev = alloc_dax(md, md->disk->disk_name, &dm_dax_ops);
	if (!dax_dev)
		goto bad;
	md->dax_dev = dax_dev;

	add_disk(md->disk);
	format_dev_t(md->name, MKDEV(_major, minor));

	md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0);
	if (!md->wq)
		goto bad;

	md->bdev = bdget_disk(md->disk, 0);
	if (!md->bdev)
		goto bad;

	/* Pre-built empty flush bio; __split_and_process_bio() sends it for REQ_PREFLUSH. */
	bio_init(&md->flush_bio, NULL, 0);
	md->flush_bio.bi_bdev = md->bdev;
	md->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;

	dm_stats_init(&md->stats);

	/* Populate the mapping, nobody knows we exist yet */
	spin_lock(&_minor_lock);
	old_md = idr_replace(&_minor_idr, md, minor);
	spin_unlock(&_minor_lock);

	/* The slot must still hold the placeholder set when the minor was reserved. */
	BUG_ON(old_md != MINOR_ALLOCED);

	return md;

	/*
	 * Unwinding, in reverse order of acquisition.  'bad' covers every
	 * failure after init_srcu_struct() succeeded; cleanup_mapped_device()
	 * skips whatever was never set up.
	 */
bad:
	cleanup_mapped_device(md);
bad_io_barrier:
	free_minor(minor);
bad_minor:
	module_put(THIS_MODULE);
bad_module_get:
	kfree(md);
	return NULL;
}
1683
/* Defined further down; free_dev() needs it first. */
static void unlock_fs(struct mapped_device *md);

/*
 * Final teardown of @md: the counterpart of alloc_dev().
 *
 * Thaws any frozen filesystem, releases the device's resources, its table
 * devices, stats, minor number and module reference, then frees @md.
 */
static void free_dev(struct mapped_device *md)
{
	/* Read the minor now: cleanup_mapped_device() drops md->disk. */
	int minor = MINOR(disk_devt(md->disk));

	unlock_fs(md);

	cleanup_mapped_device(md);

	free_table_devices(&md->table_devices);
	dm_stats_cleanup(&md->stats);
	free_minor(minor);

	module_put(THIS_MODULE);
	kfree(md);
}
1701
1702static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
1703{
1704	struct dm_md_mempools *p = dm_table_get_md_mempools(t);
1705
1706	if (md->bs) {
1707		/* The md already has necessary mempools. */
1708		if (dm_table_bio_based(t)) {
1709			/*
1710			 * Reload bioset because front_pad may have changed
1711			 * because a different table was loaded.
1712			 */
1713			bioset_free(md->bs);
1714			md->bs = p->bs;
1715			p->bs = NULL;
1716		}
1717		/*
1718		 * There's no need to reload with request-based dm
1719		 * because the size of front_pad doesn't change.
1720		 * Note for future: If you are to reload bioset,
1721		 * prep-ed requests in the queue may refer
1722		 * to bio from the old bioset, so you must walk
1723		 * through the queue to unprep.
1724		 */
1725		goto out;
1726	}
1727
1728	BUG_ON(!p || md->io_pool || md->bs);
1729
1730	md->io_pool = p->io_pool;
1731	p->io_pool = NULL;
1732	md->bs = p->bs;
1733	p->bs = NULL;
1734
1735out:
1736	/* mempool bind completed, no longer need any mempools in the table */
1737	dm_table_free_md_mempools(t);
1738}
1739
1740/*
1741 * Bind a table to the device.
1742 */
1743static void event_callback(void *context)
1744{
1745	unsigned long flags;
1746	LIST_HEAD(uevents);
1747	struct mapped_device *md = (struct mapped_device *) context;
1748
1749	spin_lock_irqsave(&md->uevent_lock, flags);
1750	list_splice_init(&md->uevent_list, &uevents);
1751	spin_unlock_irqrestore(&md->uevent_lock, flags);
1752
1753	dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj);
1754
1755	atomic_inc(&md->event_nr);
1756	wake_up(&md->eventq);
1757}
1758
1759/*
1760 * Protected by md->suspend_lock obtained by dm_swap_table().
1761 */
1762static void __set_size(struct mapped_device *md, sector_t size)
1763{
1764	lockdep_assert_held(&md->suspend_lock);
1765
1766	set_capacity(md->disk, size);
1767
1768	i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
1769}
1770
/*
 * Bind table @t to @md as its live map, applying @limits to the queue.
 *
 * Must be called with md->suspend_lock held.
 *
 * Returns old map, which caller must destroy (NULL if there was none).
 */
static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
			       struct queue_limits *limits)
{
	struct dm_table *old_map;
	struct request_queue *q = md->queue;
	sector_t size;

	lockdep_assert_held(&md->suspend_lock);

	size = dm_table_get_size(t);

	/*
	 * Wipe any geometry if the size of the table changed.
	 */
	if (size != dm_get_size(md))
		memset(&md->geometry, 0, sizeof(md->geometry));

	__set_size(md, size);

	/* Route this table's events to event_callback() with md as context. */
	dm_table_event_callback(t, event_callback, md);

	/*
	 * The queue hasn't been stopped yet, if the old table type wasn't
	 * for request-based during suspension.  So stop it to prevent
	 * I/O mapping before resume.
	 * This must be done before setting the queue restrictions,
	 * because request-based dm may be run just after the setting.
	 */
	if (dm_table_request_based(t)) {
		dm_stop_queue(q);
		/*
		 * Leverage the fact that request-based DM targets are
		 * immutable singletons and establish md->immutable_target
		 * - used to optimize both dm_request_fn and dm_mq_queue_rq
		 */
		md->immutable_target = dm_table_get_immutable_target(t);
	}

	__bind_mempools(md, t);

	/* Publish the new table; suspend_lock makes the plain dereference of the old one safe. */
	old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
	rcu_assign_pointer(md->map, (void *)t);
	md->immutable_target_type = dm_table_get_immutable_target_type(t);

	dm_table_set_restrictions(t, q, limits);
	/* Only sync when a previous table was replaced. */
	if (old_map)
		dm_sync_table(md);

	return old_map;
}
1824
1825/*
1826 * Returns unbound table for the caller to free.
1827 */
1828static struct dm_table *__unbind(struct mapped_device *md)
1829{
1830	struct dm_table *map = rcu_dereference_protected(md->map, 1);
1831
1832	if (!map)
1833		return NULL;
1834
1835	dm_table_event_callback(map, NULL, NULL);
1836	RCU_INIT_POINTER(md->map, NULL);
1837	dm_sync_table(md);
1838
1839	return map;
1840}
1841
1842/*
1843 * Constructor for a new device.
1844 */
1845int dm_create(int minor, struct mapped_device **result)
1846{
1847	struct mapped_device *md;
1848
1849	md = alloc_dev(minor);
1850	if (!md)
1851		return -ENXIO;
1852
1853	dm_sysfs_init(md);
1854
1855	*result = md;
1856	return 0;
1857}
1858
1859/*
1860 * Functions to manage md->type.
1861 * All are required to hold md->type_lock.
1862 */
/* Take md->type_lock; pair with dm_unlock_md_type(). */
void dm_lock_md_type(struct mapped_device *md)
{
	mutex_lock(&md->type_lock);
}
1867
/* Release md->type_lock taken by dm_lock_md_type(). */
void dm_unlock_md_type(struct mapped_device *md)
{
	mutex_unlock(&md->type_lock);
}
1872
/* Set md->type.  It is a BUG to call this without md->type_lock locked. */
void dm_set_md_type(struct mapped_device *md, enum dm_queue_mode type)
{
	BUG_ON(!mutex_is_locked(&md->type_lock));
	md->type = type;
}
1878
/* Return md->type.  No lock is taken or checked here. */
enum dm_queue_mode dm_get_md_type(struct mapped_device *md)
{
	return md->type;
}
1883
/* Return md->immutable_target_type, as recorded by the last __bind(). */
struct target_type *dm_get_immutable_target_type(struct mapped_device *md)
{
	return md->immutable_target_type;
}
1888
/*
 * Return a pointer to the limits of md's request queue.
 *
 * The queue_limits are only valid as long as you have a reference
 * count on 'md'; calling this with a holders count of zero is a BUG.
 */
struct queue_limits *dm_get_queue_limits(struct mapped_device *md)
{
	BUG_ON(!atomic_read(&md->holders));
	return &md->queue->limits;
}
EXPORT_SYMBOL_GPL(dm_get_queue_limits);
1899
1900/*
1901 * Setup the DM device's queue based on md's type
1902 */
1903int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
1904{
1905	int r;
1906	enum dm_queue_mode type = dm_get_md_type(md);
1907
1908	switch (type) {
1909	case DM_TYPE_REQUEST_BASED:
1910		r = dm_old_init_request_queue(md, t);
1911		if (r) {
1912			DMERR("Cannot initialize queue for request-based mapped device");
1913			return r;
1914		}
1915		break;
1916	case DM_TYPE_MQ_REQUEST_BASED:
1917		r = dm_mq_init_request_queue(md, t);
1918		if (r) {
1919			DMERR("Cannot initialize queue for request-based dm-mq mapped device");
1920			return r;
1921		}
1922		break;
1923	case DM_TYPE_BIO_BASED:
1924	case DM_TYPE_DAX_BIO_BASED:
1925		dm_init_normal_md_queue(md);
1926		blk_queue_make_request(md->queue, dm_make_request);
1927		/*
1928		 * DM handles splitting bios as needed.  Free the bio_split bioset
1929		 * since it won't be used (saves 1 process per bio-based DM device).
1930		 */
1931		bioset_free(md->queue->bio_split);
1932		md->queue->bio_split = NULL;
1933
1934		if (type == DM_TYPE_DAX_BIO_BASED)
1935			queue_flag_set_unlocked(QUEUE_FLAG_DAX, md->queue);
1936		break;
1937	case DM_TYPE_NONE:
1938		WARN_ON_ONCE(true);
1939		break;
1940	}
1941
1942	return 0;
1943}
1944
1945struct mapped_device *dm_get_md(dev_t dev)
1946{
1947	struct mapped_device *md;
1948	unsigned minor = MINOR(dev);
1949
1950	if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
1951		return NULL;
1952
1953	spin_lock(&_minor_lock);
1954
1955	md = idr_find(&_minor_idr, minor);
1956	if (md) {
1957		if ((md == MINOR_ALLOCED ||
1958		     (MINOR(disk_devt(dm_disk(md))) != minor) ||
1959		     dm_deleting_md(md) ||
1960		     test_bit(DMF_FREEING, &md->flags))) {
1961			md = NULL;
1962			goto out;
1963		}
1964		dm_get(md);
1965	}
1966
1967out:
1968	spin_unlock(&_minor_lock);
1969
1970	return md;
1971}
1972EXPORT_SYMBOL_GPL(dm_get_md);
1973
/* Return the opaque pointer stored with dm_set_mdptr(). */
void *dm_get_mdptr(struct mapped_device *md)
{
	return md->interface_ptr;
}
1978
/* Store an opaque pointer in md->interface_ptr; read back with dm_get_mdptr(). */
void dm_set_mdptr(struct mapped_device *md, void *ptr)
{
	md->interface_ptr = ptr;
}
1983
/*
 * Take a reference on @md (increments md->holders).
 * It is a BUG to do so once DMF_FREEING is set; callers that cannot rule
 * that out should use dm_hold() instead.
 */
void dm_get(struct mapped_device *md)
{
	atomic_inc(&md->holders);
	BUG_ON(test_bit(DMF_FREEING, &md->flags));
}
1989
1990int dm_hold(struct mapped_device *md)
1991{
1992	spin_lock(&_minor_lock);
1993	if (test_bit(DMF_FREEING, &md->flags)) {
1994		spin_unlock(&_minor_lock);
1995		return -EBUSY;
1996	}
1997	dm_get(md);
1998	spin_unlock(&_minor_lock);
1999	return 0;
2000}
2001EXPORT_SYMBOL_GPL(dm_hold);
2002
/* Return md->name, the string alloc_dev() formats from the device number. */
const char *dm_device_name(struct mapped_device *md)
{
	return md->name;
}
EXPORT_SYMBOL_GPL(dm_device_name);
2008
/*
 * Tear down and free @md.
 *
 * @wait: when true, sleep until md->holders drops to zero before freeing;
 *        when false, only warn if holders remain and free anyway.
 *
 * May sleep.
 */
static void __dm_destroy(struct mapped_device *md, bool wait)
{
	struct request_queue *q = dm_get_md_queue(md);
	struct dm_table *map;
	int srcu_idx;

	might_sleep();

	/*
	 * Put the MINOR_ALLOCED placeholder back in the idr and mark the
	 * device as freeing, so dm_get_md()/dm_hold() refuse it from now on.
	 */
	spin_lock(&_minor_lock);
	idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md))));
	set_bit(DMF_FREEING, &md->flags);
	spin_unlock(&_minor_lock);

	blk_set_queue_dying(q);

	if (dm_request_based(md) && md->kworker_task)
		kthread_flush_worker(&md->kworker);

	/*
	 * Take suspend_lock so that presuspend and postsuspend methods
	 * do not race with internal suspend.
	 */
	mutex_lock(&md->suspend_lock);
	map = dm_get_live_table(md, &srcu_idx);
	/* Give the targets their suspend callbacks if the device was still active. */
	if (!dm_suspended_md(md)) {
		dm_table_presuspend_targets(map);
		dm_table_postsuspend_targets(map);
	}
	/* dm_put_live_table must be before msleep, otherwise deadlock is possible */
	dm_put_live_table(md, srcu_idx);
	mutex_unlock(&md->suspend_lock);

	/*
	 * Rare, but there may be I/O requests still going to complete,
	 * for example.  Wait for all references to disappear.
	 * No one should increment the reference count of the mapped_device,
	 * after the mapped_device state becomes DMF_FREEING.
	 */
	if (wait)
		while (atomic_read(&md->holders))
			msleep(1);
	else if (atomic_read(&md->holders))
		DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)",
		       dm_device_name(md), atomic_read(&md->holders));

	dm_sysfs_exit(md);
	dm_table_destroy(__unbind(md));
	free_dev(md);
}
2058
/* Destroy @md, waiting for all holders to drop their references first. */
void dm_destroy(struct mapped_device *md)
{
	__dm_destroy(md, true);
}
2063
/* Destroy @md without waiting; remaining holders only produce a warning. */
void dm_destroy_immediate(struct mapped_device *md)
{
	__dm_destroy(md, false);
}
2068
/*
 * Drop a reference taken with dm_get()/dm_hold().
 * Only decrements md->holders; nothing is freed here -- __dm_destroy()
 * polls the counter when asked to wait.
 */
void dm_put(struct mapped_device *md)
{
	atomic_dec(&md->holders);
}
EXPORT_SYMBOL_GPL(dm_put);
2074
/*
 * Sleep on md->wait until md_in_flight() reports no I/O in flight.
 *
 * @task_state: the task state to sleep in (the callers in this file pass
 *              TASK_INTERRUPTIBLE; see __dm_suspend()).
 *
 * Returns 0 once nothing is in flight, or -EINTR if a signal relevant to
 * @task_state is pending while I/O is still outstanding.
 */
static int dm_wait_for_completion(struct mapped_device *md, long task_state)
{
	int r = 0;
	DEFINE_WAIT(wait);

	while (1) {
		/* Queue ourselves before testing the condition so a wake-up is not missed. */
		prepare_to_wait(&md->wait, &wait, task_state);

		if (!md_in_flight(md))
			break;

		if (signal_pending_state(task_state, current)) {
			r = -EINTR;
			break;
		}

		io_schedule();
	}
	finish_wait(&md->wait, &wait);

	return r;
}
2097
2098/*
2099 * Process the deferred bios
2100 */
2101static void dm_wq_work(struct work_struct *work)
2102{
2103	struct mapped_device *md = container_of(work, struct mapped_device,
2104						work);
2105	struct bio *c;
2106	int srcu_idx;
2107	struct dm_table *map;
2108
2109	map = dm_get_live_table(md, &srcu_idx);
2110
2111	while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
2112		spin_lock_irq(&md->deferred_lock);
2113		c = bio_list_pop(&md->deferred);
2114		spin_unlock_irq(&md->deferred_lock);
2115
2116		if (!c)
2117			break;
2118
2119		if (dm_request_based(md))
2120			generic_make_request(c);
2121		else
2122			__split_and_process_bio(md, map, c);
2123	}
2124
2125	dm_put_live_table(md, srcu_idx);
2126}
2127
/*
 * Stop blocking I/O for suspend and kick dm_wq_work() to resubmit whatever
 * was deferred in the meantime.
 */
static void dm_queue_flush(struct mapped_device *md)
{
	clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
	/* Order the flag clear before the work is queued. */
	smp_mb__after_atomic();
	queue_work(md->wq, &md->work);
}
2134
2135/*
2136 * Swap in a new table, returning the old one for the caller to destroy.
2137 */
2138struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
2139{
2140	struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL);
2141	struct queue_limits limits;
2142	int r;
2143
2144	mutex_lock(&md->suspend_lock);
2145
2146	/* device must be suspended */
2147	if (!dm_suspended_md(md))
2148		goto out;
2149
2150	/*
2151	 * If the new table has no data devices, retain the existing limits.
2152	 * This helps multipath with queue_if_no_path if all paths disappear,
2153	 * then new I/O is queued based on these limits, and then some paths
2154	 * reappear.
2155	 */
2156	if (dm_table_has_no_data_devices(table)) {
2157		live_map = dm_get_live_table_fast(md);
2158		if (live_map)
2159			limits = md->queue->limits;
2160		dm_put_live_table_fast(md);
2161	}
2162
2163	if (!live_map) {
2164		r = dm_calculate_queue_limits(table, &limits);
2165		if (r) {
2166			map = ERR_PTR(r);
2167			goto out;
2168		}
2169	}
2170
2171	map = __bind(md, table, &limits);
2172
2173out:
2174	mutex_unlock(&md->suspend_lock);
2175	return map;
2176}
2177
2178/*
2179 * Functions to lock and unlock any filesystem running on the
2180 * device.
2181 */
2182static int lock_fs(struct mapped_device *md)
2183{
2184	int r;
2185
2186	WARN_ON(md->frozen_sb);
2187
2188	md->frozen_sb = freeze_bdev(md->bdev);
2189	if (IS_ERR(md->frozen_sb)) {
2190		r = PTR_ERR(md->frozen_sb);
2191		md->frozen_sb = NULL;
2192		return r;
2193	}
2194
2195	set_bit(DMF_FROZEN, &md->flags);
2196
2197	return 0;
2198}
2199
2200static void unlock_fs(struct mapped_device *md)
2201{
2202	if (!test_bit(DMF_FROZEN, &md->flags))
2203		return;
2204
2205	thaw_bdev(md->bdev, md->frozen_sb);
2206	md->frozen_sb = NULL;
2207	clear_bit(DMF_FROZEN, &md->flags);
2208}
2209
/*
 * @md: the device to suspend; md->suspend_lock must be held
 * @map: the live table (may be NULL)
 * @suspend_flags: DM_SUSPEND_LOCKFS_FLAG and/or DM_SUSPEND_NOFLUSH_FLAG
 * @task_state: e.g. TASK_INTERRUPTIBLE or TASK_UNINTERRUPTIBLE
 * @dmf_suspended_flag: DMF_SUSPENDED or DMF_SUSPENDED_INTERNALLY
 *
 * If __dm_suspend returns 0, the device is completely quiescent
 * now. There is no request-processing activity. All new requests
 * are being added to md->deferred list.
 *
 * On failure (lock_fs() error, or the wait for in-flight I/O being
 * interrupted) the suspend is rolled back and the error is returned.
 */
static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
			unsigned suspend_flags, long task_state,
			int dmf_suspended_flag)
{
	bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG;
	bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG;
	int r;

	lockdep_assert_held(&md->suspend_lock);

	/*
	 * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
	 * This flag is cleared before dm_suspend returns.
	 */
	if (noflush)
		set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
	else
		pr_debug("%s: suspending with flush\n", dm_device_name(md));

	/*
	 * This gets reverted if there's an error later and the targets
	 * provide the .presuspend_undo hook.
	 */
	dm_table_presuspend_targets(map);

	/*
	 * Flush I/O to the device.
	 * Any I/O submitted after lock_fs() may not be flushed.
	 * noflush takes precedence over do_lockfs.
	 * (lock_fs() flushes I/Os and waits for them to complete.)
	 */
	if (!noflush && do_lockfs) {
		r = lock_fs(md);
		if (r) {
			dm_table_presuspend_undo_targets(map);
			return r;
		}
	}

	/*
	 * Here we must make sure that no processes are submitting requests
	 * to target drivers i.e. no one may be executing
	 * __split_and_process_bio. This is called from dm_make_request and
	 * dm_wq_work.
	 *
	 * To get all processes out of __split_and_process_bio in
	 * dm_make_request, we wait for the SRCU grace period. To prevent any
	 * process from reentering __split_and_process_bio from
	 * dm_make_request and quiesce the thread (dm_wq_work), we set
	 * DMF_BLOCK_IO_FOR_SUSPEND and call flush_workqueue(md->wq).
	 */
	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
	if (map)
		synchronize_srcu(&md->io_barrier);

	/*
	 * Stop md->queue before flushing md->wq in case request-based
	 * dm defers requests to md->wq from md->queue.
	 */
	if (dm_request_based(md)) {
		dm_stop_queue(md->queue);
		if (md->kworker_task)
			kthread_flush_worker(&md->kworker);
	}

	flush_workqueue(md->wq);

	/*
	 * At this point no more requests are entering target request routines.
	 * We call dm_wait_for_completion to wait for all existing requests
	 * to finish.
	 */
	r = dm_wait_for_completion(md, task_state);
	if (!r)
		set_bit(dmf_suspended_flag, &md->flags);

	if (noflush)
		clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
	if (map)
		synchronize_srcu(&md->io_barrier);

	/* were we interrupted ? */
	if (r < 0) {
		/* Roll back: unblock I/O, restart the queue, thaw the fs, undo presuspend. */
		dm_queue_flush(md);

		if (dm_request_based(md))
			dm_start_queue(md->queue);

		unlock_fs(md);
		dm_table_presuspend_undo_targets(map);
		/* pushback list is already flushed, so skip flush */
	}

	return r;
}
2314
2315/*
2316 * We need to be able to change a mapping table under a mounted
2317 * filesystem.  For example we might want to move some data in
2318 * the background.  Before the table can be swapped with
2319 * dm_bind_table, dm_suspend must be called to flush any in
2320 * flight bios and ensure that any further io gets deferred.
2321 */
2322/*
2323 * Suspend mechanism in request-based dm.
2324 *
2325 * 1. Flush all I/Os by lock_fs() if needed.
2326 * 2. Stop dispatching any I/O by stopping the request_queue.
2327 * 3. Wait for all in-flight I/Os to be completed or requeued.
2328 *
2329 * To abort suspend, start the request_queue.
2330 */
2331int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2332{
2333	struct dm_table *map = NULL;
2334	int r = 0;
2335
2336retry:
2337	mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
2338
2339	if (dm_suspended_md(md)) {
2340		r = -EINVAL;
2341		goto out_unlock;
2342	}
2343
2344	if (dm_suspended_internally_md(md)) {
2345		/* already internally suspended, wait for internal resume */
2346		mutex_unlock(&md->suspend_lock);
2347		r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
2348		if (r)
2349			return r;
2350		goto retry;
2351	}
2352
2353	map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2354
2355	r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE, DMF_SUSPENDED);
2356	if (r)
2357		goto out_unlock;
2358
2359	dm_table_postsuspend_targets(map);
2360
2361out_unlock:
2362	mutex_unlock(&md->suspend_lock);
2363	return r;
2364}
2365
2366static int __dm_resume(struct mapped_device *md, struct dm_table *map)
2367{
2368	if (map) {
2369		int r = dm_table_resume_targets(map);
2370		if (r)
2371			return r;
2372	}
2373
2374	dm_queue_flush(md);
2375
2376	/*
2377	 * Flushing deferred I/Os must be done after targets are resumed
2378	 * so that mapping of targets can work correctly.
2379	 * Request-based dm is queueing the deferred I/Os in its request_queue.
2380	 */
2381	if (dm_request_based(md))
2382		dm_start_queue(md->queue);
2383
2384	unlock_fs(md);
2385
2386	return 0;
2387}
2388
2389int dm_resume(struct mapped_device *md)
2390{
2391	int r;
2392	struct dm_table *map = NULL;
2393
2394retry:
2395	r = -EINVAL;
2396	mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
2397
2398	if (!dm_suspended_md(md))
2399		goto out;
2400
2401	if (dm_suspended_internally_md(md)) {
2402		/* already internally suspended, wait for internal resume */
2403		mutex_unlock(&md->suspend_lock);
2404		r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
2405		if (r)
2406			return r;
2407		goto retry;
2408	}
2409
2410	map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2411	if (!map || !dm_table_get_size(map))
2412		goto out;
2413
2414	r = __dm_resume(md, map);
2415	if (r)
2416		goto out;
2417
2418	clear_bit(DMF_SUSPENDED, &md->flags);
2419out:
2420	mutex_unlock(&md->suspend_lock);
2421
2422	return r;
2423}
2424
2425/*
2426 * Internal suspend/resume works like userspace-driven suspend. It waits
2427 * until all bios finish and prevents issuing new bios to the target drivers.
2428 * It may be used only from the kernel.
2429 */
2430
2431static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_flags)
2432{
2433	struct dm_table *map = NULL;
2434
2435	lockdep_assert_held(&md->suspend_lock);
2436
2437	if (md->internal_suspend_count++)
2438		return; /* nested internal suspend */
2439
2440	if (dm_suspended_md(md)) {
2441		set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
2442		return; /* nest suspend */
2443	}
2444
2445	map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2446
2447	/*
2448	 * Using TASK_UNINTERRUPTIBLE because only NOFLUSH internal suspend is
2449	 * supported.  Properly supporting a TASK_INTERRUPTIBLE internal suspend
2450	 * would require changing .presuspend to return an error -- avoid this
2451	 * until there is a need for more elaborate variants of internal suspend.
2452	 */
2453	(void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE,
2454			    DMF_SUSPENDED_INTERNALLY);
2455
2456	dm_table_postsuspend_targets(map);
2457}
2458
2459static void __dm_internal_resume(struct mapped_device *md)
2460{
2461	BUG_ON(!md->internal_suspend_count);
2462
2463	if (--md->internal_suspend_count)
2464		return; /* resume from nested internal suspend */
2465
2466	if (dm_suspended_md(md))
2467		goto done; /* resume from nested suspend */
2468
2469	/*
2470	 * NOTE: existing callers don't need to call dm_table_resume_targets
2471	 * (which may fail -- so best to avoid it for now by passing NULL map)
2472	 */
2473	(void) __dm_resume(md, NULL);
2474
2475done:
2476	clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
2477	smp_mb__after_atomic();
2478	wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY);
2479}
2480
2481void dm_internal_suspend_noflush(struct mapped_device *md)
2482{
2483	mutex_lock(&md->suspend_lock);
2484	__dm_internal_suspend(md, DM_SUSPEND_NOFLUSH_FLAG);
2485	mutex_unlock(&md->suspend_lock);
2486}
2487EXPORT_SYMBOL_GPL(dm_internal_suspend_noflush);
2488
2489void dm_internal_resume(struct mapped_device *md)
2490{
2491	mutex_lock(&md->suspend_lock);
2492	__dm_internal_resume(md);
2493	mutex_unlock(&md->suspend_lock);
2494}
2495EXPORT_SYMBOL_GPL(dm_internal_resume);
2496
2497/*
2498 * Fast variants of internal suspend/resume hold md->suspend_lock,
2499 * which prevents interaction with userspace-driven suspend.
2500 */
2501
2502void dm_internal_suspend_fast(struct mapped_device *md)
2503{
2504	mutex_lock(&md->suspend_lock);
2505	if (dm_suspended_md(md) || dm_suspended_internally_md(md))
2506		return;
2507
2508	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2509	synchronize_srcu(&md->io_barrier);
2510	flush_workqueue(md->wq);
2511	dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
2512}
2513EXPORT_SYMBOL_GPL(dm_internal_suspend_fast);
2514
2515void dm_internal_resume_fast(struct mapped_device *md)
2516{
2517	if (dm_suspended_md(md) || dm_suspended_internally_md(md))
2518		goto done;
2519
2520	dm_queue_flush(md);
2521
2522done:
2523	mutex_unlock(&md->suspend_lock);
2524}
2525EXPORT_SYMBOL_GPL(dm_internal_resume_fast);
2526
2527/*-----------------------------------------------------------------
2528 * Event notification.
2529 *---------------------------------------------------------------*/
2530int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
2531		       unsigned cookie)
2532{
2533	char udev_cookie[DM_COOKIE_LENGTH];
2534	char *envp[] = { udev_cookie, NULL };
2535
2536	if (!cookie)
2537		return kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
2538	else {
2539		snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
2540			 DM_COOKIE_ENV_VAR_NAME, cookie);
2541		return kobject_uevent_env(&disk_to_dev(md->disk)->kobj,
2542					  action, envp);
2543	}
2544}
2545
2546uint32_t dm_next_uevent_seq(struct mapped_device *md)
2547{
2548	return atomic_add_return(1, &md->uevent_seq);
2549}
2550
2551uint32_t dm_get_event_nr(struct mapped_device *md)
2552{
2553	return atomic_read(&md->event_nr);
2554}
2555
2556int dm_wait_event(struct mapped_device *md, int event_nr)
2557{
2558	return wait_event_interruptible(md->eventq,
2559			(event_nr != atomic_read(&md->event_nr)));
2560}
2561
2562void dm_uevent_add(struct mapped_device *md, struct list_head *elist)
2563{
2564	unsigned long flags;
2565
2566	spin_lock_irqsave(&md->uevent_lock, flags);
2567	list_add(elist, &md->uevent_list);
2568	spin_unlock_irqrestore(&md->uevent_lock, flags);
2569}
2570
2571/*
2572 * The gendisk is only valid as long as you have a reference
2573 * count on 'md'.
2574 */
2575struct gendisk *dm_disk(struct mapped_device *md)
2576{
2577	return md->disk;
2578}
2579EXPORT_SYMBOL_GPL(dm_disk);
2580
2581struct kobject *dm_kobject(struct mapped_device *md)
2582{
2583	return &md->kobj_holder.kobj;
2584}
2585
2586struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
2587{
2588	struct mapped_device *md;
2589
2590	md = container_of(kobj, struct mapped_device, kobj_holder.kobj);
2591
2592	if (test_bit(DMF_FREEING, &md->flags) ||
2593	    dm_deleting_md(md))
2594		return NULL;
2595
2596	dm_get(md);
2597	return md;
2598}
2599
2600int dm_suspended_md(struct mapped_device *md)
2601{
2602	return test_bit(DMF_SUSPENDED, &md->flags);
2603}
2604
2605int dm_suspended_internally_md(struct mapped_device *md)
2606{
2607	return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
2608}
2609
2610int dm_test_deferred_remove_flag(struct mapped_device *md)
2611{
2612	return test_bit(DMF_DEFERRED_REMOVE, &md->flags);
2613}
2614
2615int dm_suspended(struct dm_target *ti)
2616{
2617	return dm_suspended_md(dm_table_get_md(ti->table));
2618}
2619EXPORT_SYMBOL_GPL(dm_suspended);
2620
2621int dm_noflush_suspending(struct dm_target *ti)
2622{
2623	return __noflush_suspending(dm_table_get_md(ti->table));
2624}
2625EXPORT_SYMBOL_GPL(dm_noflush_suspending);
2626
2627struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_queue_mode type,
2628					    unsigned integrity, unsigned per_io_data_size)
2629{
2630	struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id);
2631	unsigned int pool_size = 0;
2632	unsigned int front_pad;
2633
2634	if (!pools)
2635		return NULL;
2636
2637	switch (type) {
2638	case DM_TYPE_BIO_BASED:
2639	case DM_TYPE_DAX_BIO_BASED:
2640		pool_size = dm_get_reserved_bio_based_ios();
2641		front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
2642	
2643		pools->io_pool = mempool_create_slab_pool(pool_size, _io_cache);
2644		if (!pools->io_pool)
2645			goto out;
2646		break;
2647	case DM_TYPE_REQUEST_BASED:
2648	case DM_TYPE_MQ_REQUEST_BASED:
2649		pool_size = dm_get_reserved_rq_based_ios();
2650		front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
2651		/* per_io_data_size is used for blk-mq pdu at queue allocation */
2652		break;
2653	default:
2654		BUG();
2655	}
2656
2657	pools->bs = bioset_create_nobvec(pool_size, front_pad);
2658	if (!pools->bs)
2659		goto out;
2660
2661	if (integrity && bioset_integrity_create(pools->bs, pool_size))
2662		goto out;
2663
2664	return pools;
2665
2666out:
2667	dm_free_md_mempools(pools);
2668
2669	return NULL;
2670}
2671
2672void dm_free_md_mempools(struct dm_md_mempools *pools)
2673{
2674	if (!pools)
2675		return;
2676
2677	mempool_destroy(pools->io_pool);
2678
2679	if (pools->bs)
2680		bioset_free(pools->bs);
2681
2682	kfree(pools);
2683}
2684
2685struct dm_pr {
2686	u64	old_key;
2687	u64	new_key;
2688	u32	flags;
2689	bool	fail_early;
2690};
2691
2692static int dm_call_pr(struct block_device *bdev, iterate_devices_callout_fn fn,
2693		      void *data)
2694{
2695	struct mapped_device *md = bdev->bd_disk->private_data;
2696	struct dm_table *table;
2697	struct dm_target *ti;
2698	int ret = -ENOTTY, srcu_idx;
2699
2700	table = dm_get_live_table(md, &srcu_idx);
2701	if (!table || !dm_table_get_size(table))
2702		goto out;
2703
2704	/* We only support devices that have a single target */
2705	if (dm_table_get_num_targets(table) != 1)
2706		goto out;
2707	ti = dm_table_get_target(table, 0);
2708
2709	ret = -EINVAL;
2710	if (!ti->type->iterate_devices)
2711		goto out;
2712
2713	ret = ti->type->iterate_devices(ti, fn, data);
2714out:
2715	dm_put_live_table(md, srcu_idx);
2716	return ret;
2717}
2718
2719/*
2720 * For register / unregister we need to manually call out to every path.
2721 */
2722static int __dm_pr_register(struct dm_target *ti, struct dm_dev *dev,
2723			    sector_t start, sector_t len, void *data)
2724{
2725	struct dm_pr *pr = data;
2726	const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops;
2727
2728	if (!ops || !ops->pr_register)
2729		return -EOPNOTSUPP;
2730	return ops->pr_register(dev->bdev, pr->old_key, pr->new_key, pr->flags);
2731}
2732
2733static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key,
2734			  u32 flags)
2735{
2736	struct dm_pr pr = {
2737		.old_key	= old_key,
2738		.new_key	= new_key,
2739		.flags		= flags,
2740		.fail_early	= true,
2741	};
2742	int ret;
2743
2744	ret = dm_call_pr(bdev, __dm_pr_register, &pr);
2745	if (ret && new_key) {
2746		/* unregister all paths if we failed to register any path */
2747		pr.old_key = new_key;
2748		pr.new_key = 0;
2749		pr.flags = 0;
2750		pr.fail_early = false;
2751		dm_call_pr(bdev, __dm_pr_register, &pr);
2752	}
2753
2754	return ret;
2755}
2756
2757static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type,
2758			 u32 flags)
2759{
2760	struct mapped_device *md = bdev->bd_disk->private_data;
2761	const struct pr_ops *ops;
2762	fmode_t mode;
2763	int r;
2764
2765	r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
2766	if (r < 0)
2767		return r;
2768
2769	ops = bdev->bd_disk->fops->pr_ops;
2770	if (ops && ops->pr_reserve)
2771		r = ops->pr_reserve(bdev, key, type, flags);
2772	else
2773		r = -EOPNOTSUPP;
2774
2775	bdput(bdev);
2776	return r;
2777}
2778
2779static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
2780{
2781	struct mapped_device *md = bdev->bd_disk->private_data;
2782	const struct pr_ops *ops;
2783	fmode_t mode;
2784	int r;
2785
2786	r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
2787	if (r < 0)
2788		return r;
2789
2790	ops = bdev->bd_disk->fops->pr_ops;
2791	if (ops && ops->pr_release)
2792		r = ops->pr_release(bdev, key, type);
2793	else
2794		r = -EOPNOTSUPP;
2795
2796	bdput(bdev);
2797	return r;
2798}
2799
2800static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key,
2801			 enum pr_type type, bool abort)
2802{
2803	struct mapped_device *md = bdev->bd_disk->private_data;
2804	const struct pr_ops *ops;
2805	fmode_t mode;
2806	int r;
2807
2808	r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
2809	if (r < 0)
2810		return r;
2811
2812	ops = bdev->bd_disk->fops->pr_ops;
2813	if (ops && ops->pr_preempt)
2814		r = ops->pr_preempt(bdev, old_key, new_key, type, abort);
2815	else
2816		r = -EOPNOTSUPP;
2817
2818	bdput(bdev);
2819	return r;
2820}
2821
2822static int dm_pr_clear(struct block_device *bdev, u64 key)
2823{
2824	struct mapped_device *md = bdev->bd_disk->private_data;
2825	const struct pr_ops *ops;
2826	fmode_t mode;
2827	int r;
2828
2829	r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
2830	if (r < 0)
2831		return r;
2832
2833	ops = bdev->bd_disk->fops->pr_ops;
2834	if (ops && ops->pr_clear)
2835		r = ops->pr_clear(bdev, key);
2836	else
2837		r = -EOPNOTSUPP;
2838
2839	bdput(bdev);
2840	return r;
2841}
2842
2843static const struct pr_ops dm_pr_ops = {
2844	.pr_register	= dm_pr_register,
2845	.pr_reserve	= dm_pr_reserve,
2846	.pr_release	= dm_pr_release,
2847	.pr_preempt	= dm_pr_preempt,
2848	.pr_clear	= dm_pr_clear,
2849};
2850
2851static const struct block_device_operations dm_blk_dops = {
2852	.open = dm_blk_open,
2853	.release = dm_blk_close,
2854	.ioctl = dm_blk_ioctl,
2855	.getgeo = dm_blk_getgeo,
2856	.pr_ops = &dm_pr_ops,
2857	.owner = THIS_MODULE
2858};
2859
2860static const struct dax_operations dm_dax_ops = {
2861	.direct_access = dm_dax_direct_access,
2862};
2863
2864/*
2865 * module hooks
2866 */
2867module_init(dm_init);
2868module_exit(dm_exit);
2869
2870module_param(major, uint, 0);
2871MODULE_PARM_DESC(major, "The major number of the device mapper");
2872
2873module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR);
2874MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");
2875
2876module_param(dm_numa_node, int, S_IRUGO | S_IWUSR);
2877MODULE_PARM_DESC(dm_numa_node, "NUMA node for DM device memory allocations");
2878
2879MODULE_DESCRIPTION(DM_NAME " driver");
2880MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
2881MODULE_LICENSE("GPL");
Configure Feed

Configure Feed