drivers/md/dm.c at v2.6.30-rc6

tjh.dev / kernel
fork
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
fork
kernel / drivers / md / dm.c
at v2.6.30-rc6 1823 lines 38 kB view raw
wrap content
   1/*
   2 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
   3 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
   4 *
   5 * This file is released under the GPL.
   6 */
   7
   8#include "dm.h"
   9#include "dm-uevent.h"
  10
  11#include <linux/init.h>
  12#include <linux/module.h>
  13#include <linux/mutex.h>
  14#include <linux/moduleparam.h>
  15#include <linux/blkpg.h>
  16#include <linux/bio.h>
  17#include <linux/buffer_head.h>
  18#include <linux/mempool.h>
  19#include <linux/slab.h>
  20#include <linux/idr.h>
  21#include <linux/hdreg.h>
  22#include <linux/blktrace_api.h>
  23#include <trace/block.h>
  24
  25#define DM_MSG_PREFIX "core"
  26
/* Name handed to register_blkdev()/unregister_blkdev(). */
static const char *_name = DM_NAME;

/*
 * 'major' is the requested block major; local_init() copies it into
 * '_major' and, when it is 0, replaces '_major' with the value returned
 * by register_blkdev().
 */
static unsigned int major = 0;
static unsigned int _major = 0;

/*
 * Held around accesses to _minor_idr, and around the checks of
 * disk->private_data and the DMF_FREEING/DMF_DELETING flags on the
 * open/delete paths.
 */
static DEFINE_SPINLOCK(_minor_lock);
/*
 * For bio-based dm.
 * One of these is allocated per bio.
 */
struct dm_io {
	struct mapped_device *md;	/* device the original bio was sent to */
	int error;			/* result recorded by dec_pending() */
	atomic_t io_count;		/* references; original bio finishes at 0 */
	struct bio *bio;		/* the original (un-cloned) bio */
	unsigned long start_time;	/* jiffies, set by start_io_acct() */
};
  44
/*
 * For bio-based dm.
 * One of these is allocated per target within a bio.  Hopefully
 * this will be simplified out one day.
 */
struct dm_target_io {
	struct dm_io *io;	/* per-bio state this clone belongs to */
	struct dm_target *ti;	/* target the clone is mapped through */
	union map_info info;	/* passed to the target's map and end_io hooks */
};
  55
/* Tracepoint fired from dec_pending() via trace_block_bio_complete(). */
DEFINE_TRACE(block_bio_complete);
  57
/*
 * For request-based dm.
 * One of these is allocated per request.
 *
 * NOTE(review): in the visible part of this file the type is only used to
 * size _rq_tio_cache; field roles below are read off the names - confirm
 * against the request-based code.
 */
struct dm_rq_target_io {
	struct mapped_device *md;
	struct dm_target *ti;
	struct request *orig, clone;	/* 'orig' is a pointer, 'clone' is embedded */
	int error;
	union map_info info;
};
  69
/*
 * For request-based dm.
 * One of these is allocated per bio.
 *
 * NOTE(review): only used here to size _rq_bio_info_cache; presumably
 * links a cloned bio back to its original bio and owning request - confirm.
 */
struct dm_rq_clone_bio_info {
	struct bio *orig;
	struct request *rq;
};
  78
  79union map_info *dm_get_mapinfo(struct bio *bio)
  80{
  81	if (bio && bio->bi_private)
  82		return &((struct dm_target_io *)bio->bi_private)->info;
  83	return NULL;
  84}
  85
/*
 * Placeholder stored in _minor_idr while a minor is reserved but its
 * mapped_device is not yet published (see alloc_dev()).
 */
#define MINOR_ALLOCED ((void *)-1)

/*
 * Bits for the md->flags field.
 */
#define DMF_BLOCK_IO_FOR_SUSPEND 0	/* tested in dm_request()/dm_any_congested() */
#define DMF_SUSPENDED 1
#define DMF_FROZEN 2
#define DMF_FREEING 3			/* opens and lookups are refused */
#define DMF_DELETING 4			/* set by dm_lock_for_deletion(); opens refused */
#define DMF_NOFLUSH_SUSPENDING 5	/* lets dec_pending() push bios back */
#define DMF_QUEUE_IO_TO_THREAD 6	/* dm_request() defers bios to md->wq */
  98
/*
 * Per-device state for one device-mapper device.  Allocated and
 * initialised by alloc_dev(), released by free_dev().
 */
struct mapped_device {
	struct rw_semaphore io_lock;	/* read: dm_request(); write: queue_io() */
	struct mutex suspend_lock;
	rwlock_t map_lock;		/* protects 'map' */
	atomic_t holders;		/* starts at 1 in alloc_dev() */
	atomic_t open_count;		/* block-device opens, see dm_blk_open() */

	unsigned long flags;		/* DMF_* bits */

	struct request_queue *queue;
	struct gendisk *disk;
	char name[16];			/* filled by format_dev_t() in alloc_dev() */

	void *interface_ptr;

	/*
	 * A list of ios that arrived while we were suspended.
	 */
	atomic_t pending;		/* ios in flight, see start/end_io_acct() */
	wait_queue_head_t wait;		/* woken when 'pending' drops to zero */
	struct work_struct work;	/* runs dm_wq_work() on 'wq' */
	struct bio_list deferred;	/* bios queued by queue_io()/dec_pending() */
	spinlock_t deferred_lock;	/* protects 'deferred' */

	/*
	 * An error from the barrier request currently being processed.
	 */
	int barrier_error;

	/*
	 * Processing queue (flush/barriers)
	 */
	struct workqueue_struct *wq;

	/*
	 * The current mapping.
	 */
	struct dm_table *map;

	/*
	 * io objects are allocated from here.
	 */
	mempool_t *io_pool;
	mempool_t *tio_pool;

	struct bio_set *bs;		/* clone bios are allocated from this set */

	/*
	 * Event handling.
	 */
	atomic_t event_nr;
	wait_queue_head_t eventq;
	atomic_t uevent_seq;
	struct list_head uevent_list;
	spinlock_t uevent_lock; /* Protect access to uevent_list */

	/*
	 * freeze/thaw support require holding onto a super block
	 */
	struct super_block *frozen_sb;
	struct block_device *suspended_bdev;

	/* forced geometry settings */
	struct hd_geometry geometry;

	/* sysfs handle */
	struct kobject kobj;
};
 170
/* Minimum number of objects reserved in each per-device mempool. */
#define MIN_IOS 256
/* Slab caches created in local_init() and destroyed in local_exit(). */
static struct kmem_cache *_io_cache;		/* struct dm_io */
static struct kmem_cache *_tio_cache;		/* struct dm_target_io */
static struct kmem_cache *_rq_tio_cache;	/* struct dm_rq_target_io */
static struct kmem_cache *_rq_bio_info_cache;	/* struct dm_rq_clone_bio_info */
 176
 177static int __init local_init(void)
 178{
 179	int r = -ENOMEM;
 180
 181	/* allocate a slab for the dm_ios */
 182	_io_cache = KMEM_CACHE(dm_io, 0);
 183	if (!_io_cache)
 184		return r;
 185
 186	/* allocate a slab for the target ios */
 187	_tio_cache = KMEM_CACHE(dm_target_io, 0);
 188	if (!_tio_cache)
 189		goto out_free_io_cache;
 190
 191	_rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0);
 192	if (!_rq_tio_cache)
 193		goto out_free_tio_cache;
 194
 195	_rq_bio_info_cache = KMEM_CACHE(dm_rq_clone_bio_info, 0);
 196	if (!_rq_bio_info_cache)
 197		goto out_free_rq_tio_cache;
 198
 199	r = dm_uevent_init();
 200	if (r)
 201		goto out_free_rq_bio_info_cache;
 202
 203	_major = major;
 204	r = register_blkdev(_major, _name);
 205	if (r < 0)
 206		goto out_uevent_exit;
 207
 208	if (!_major)
 209		_major = r;
 210
 211	return 0;
 212
 213out_uevent_exit:
 214	dm_uevent_exit();
 215out_free_rq_bio_info_cache:
 216	kmem_cache_destroy(_rq_bio_info_cache);
 217out_free_rq_tio_cache:
 218	kmem_cache_destroy(_rq_tio_cache);
 219out_free_tio_cache:
 220	kmem_cache_destroy(_tio_cache);
 221out_free_io_cache:
 222	kmem_cache_destroy(_io_cache);
 223
 224	return r;
 225}
 226
 227static void local_exit(void)
 228{
 229	kmem_cache_destroy(_rq_bio_info_cache);
 230	kmem_cache_destroy(_rq_tio_cache);
 231	kmem_cache_destroy(_tio_cache);
 232	kmem_cache_destroy(_io_cache);
 233	unregister_blkdev(_major, _name);
 234	dm_uevent_exit();
 235
 236	_major = 0;
 237
 238	DMINFO("cleaned up");
 239}
 240
/*
 * Subsystem init functions, called in order by dm_init().  Entry i is
 * undone by _exits[i], so the two tables must stay in the same order.
 */
static int (*_inits[])(void) __initdata = {
	local_init,
	dm_target_init,
	dm_linear_init,
	dm_stripe_init,
	dm_kcopyd_init,
	dm_interface_init,
};

/* Matching exit functions; dm_init()/dm_exit() walk this table backwards. */
static void (*_exits[])(void) = {
	local_exit,
	dm_target_exit,
	dm_linear_exit,
	dm_stripe_exit,
	dm_kcopyd_exit,
	dm_interface_exit,
};
 258
 259static int __init dm_init(void)
 260{
 261	const int count = ARRAY_SIZE(_inits);
 262
 263	int r, i;
 264
 265	for (i = 0; i < count; i++) {
 266		r = _inits[i]();
 267		if (r)
 268			goto bad;
 269	}
 270
 271	return 0;
 272
 273      bad:
 274	while (i--)
 275		_exits[i]();
 276
 277	return r;
 278}
 279
 280static void __exit dm_exit(void)
 281{
 282	int i = ARRAY_SIZE(_exits);
 283
 284	while (i--)
 285		_exits[i]();
 286}
 287
 288/*
 289 * Block device functions
 290 */
 291static int dm_blk_open(struct block_device *bdev, fmode_t mode)
 292{
 293	struct mapped_device *md;
 294
 295	spin_lock(&_minor_lock);
 296
 297	md = bdev->bd_disk->private_data;
 298	if (!md)
 299		goto out;
 300
 301	if (test_bit(DMF_FREEING, &md->flags) ||
 302	    test_bit(DMF_DELETING, &md->flags)) {
 303		md = NULL;
 304		goto out;
 305	}
 306
 307	dm_get(md);
 308	atomic_inc(&md->open_count);
 309
 310out:
 311	spin_unlock(&_minor_lock);
 312
 313	return md ? 0 : -ENXIO;
 314}
 315
 316static int dm_blk_close(struct gendisk *disk, fmode_t mode)
 317{
 318	struct mapped_device *md = disk->private_data;
 319	atomic_dec(&md->open_count);
 320	dm_put(md);
 321	return 0;
 322}
 323
 324int dm_open_count(struct mapped_device *md)
 325{
 326	return atomic_read(&md->open_count);
 327}
 328
 329/*
 330 * Guarantees nothing is using the device before it's deleted.
 331 */
 332int dm_lock_for_deletion(struct mapped_device *md)
 333{
 334	int r = 0;
 335
 336	spin_lock(&_minor_lock);
 337
 338	if (dm_open_count(md))
 339		r = -EBUSY;
 340	else
 341		set_bit(DMF_DELETING, &md->flags);
 342
 343	spin_unlock(&_minor_lock);
 344
 345	return r;
 346}
 347
 348static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 349{
 350	struct mapped_device *md = bdev->bd_disk->private_data;
 351
 352	return dm_get_geometry(md, geo);
 353}
 354
 355static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
 356			unsigned int cmd, unsigned long arg)
 357{
 358	struct mapped_device *md = bdev->bd_disk->private_data;
 359	struct dm_table *map = dm_get_table(md);
 360	struct dm_target *tgt;
 361	int r = -ENOTTY;
 362
 363	if (!map || !dm_table_get_size(map))
 364		goto out;
 365
 366	/* We only support devices that have a single target */
 367	if (dm_table_get_num_targets(map) != 1)
 368		goto out;
 369
 370	tgt = dm_table_get_target(map, 0);
 371
 372	if (dm_suspended(md)) {
 373		r = -EAGAIN;
 374		goto out;
 375	}
 376
 377	if (tgt->type->ioctl)
 378		r = tgt->type->ioctl(tgt, cmd, arg);
 379
 380out:
 381	dm_table_put(map);
 382
 383	return r;
 384}
 385
 386static struct dm_io *alloc_io(struct mapped_device *md)
 387{
 388	return mempool_alloc(md->io_pool, GFP_NOIO);
 389}
 390
 391static void free_io(struct mapped_device *md, struct dm_io *io)
 392{
 393	mempool_free(io, md->io_pool);
 394}
 395
 396static struct dm_target_io *alloc_tio(struct mapped_device *md)
 397{
 398	return mempool_alloc(md->tio_pool, GFP_NOIO);
 399}
 400
 401static void free_tio(struct mapped_device *md, struct dm_target_io *tio)
 402{
 403	mempool_free(tio, md->tio_pool);
 404}
 405
/*
 * Start accounting for an io: record its start time, round the disk
 * statistics and bump md->pending, mirroring the new value into
 * part0.in_flight.
 */
static void start_io_acct(struct dm_io *io)
{
	struct mapped_device *md = io->md;
	int cpu;

	/* end_io_acct() subtracts this from jiffies to get the duration */
	io->start_time = jiffies;

	cpu = part_stat_lock();
	part_round_stats(cpu, &dm_disk(md)->part0);
	part_stat_unlock();
	/* in_flight is set from the incremented md->pending */
	dm_disk(md)->part0.in_flight = atomic_inc_return(&md->pending);
}
 418
/*
 * Finish accounting for an io: add its duration to the per-direction tick
 * statistics, drop md->pending (mirrored into part0.in_flight) and wake
 * md->wait when no ios remain pending.
 */
static void end_io_acct(struct dm_io *io)
{
	struct mapped_device *md = io->md;
	struct bio *bio = io->bio;
	unsigned long duration = jiffies - io->start_time;
	int pending, cpu;
	/* read the direction now; the bio may not be touched later (see below) */
	int rw = bio_data_dir(bio);

	cpu = part_stat_lock();
	part_round_stats(cpu, &dm_disk(md)->part0);
	part_stat_add(cpu, &dm_disk(md)->part0, ticks[rw], duration);
	part_stat_unlock();

	/*
	 * After this is decremented the bio must not be touched if it is
	 * a barrier.
	 */
	dm_disk(md)->part0.in_flight = pending =
		atomic_dec_return(&md->pending);

	/* nudge anyone waiting on suspend queue */
	if (!pending)
		wake_up(&md->wait);
}
 443
/*
 * Add the bio to the list of deferred io.
 *
 * Runs with md->io_lock held for writing.  The bio is appended to
 * md->deferred under deferred_lock; md->work is queued on md->wq only when
 * this call is the one that sets DMF_QUEUE_IO_TO_THREAD.
 */
static void queue_io(struct mapped_device *md, struct bio *bio)
{
	down_write(&md->io_lock);

	spin_lock_irq(&md->deferred_lock);
	bio_list_add(&md->deferred, bio);
	spin_unlock_irq(&md->deferred_lock);

	/* flag was clear: start the worker; dm_request() now defers new bios */
	if (!test_and_set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags))
		queue_work(md->wq, &md->work);

	up_write(&md->io_lock);
}
 460
 461/*
 462 * Everyone (including functions in this file), should use this
 463 * function to access the md->map field, and make sure they call
 464 * dm_table_put() when finished.
 465 */
 466struct dm_table *dm_get_table(struct mapped_device *md)
 467{
 468	struct dm_table *t;
 469
 470	read_lock(&md->map_lock);
 471	t = md->map;
 472	if (t)
 473		dm_table_get(t);
 474	read_unlock(&md->map_lock);
 475
 476	return t;
 477}
 478
 479/*
 480 * Get the geometry associated with a dm device
 481 */
 482int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
 483{
 484	*geo = md->geometry;
 485
 486	return 0;
 487}
 488
 489/*
 490 * Set the geometry of a device.
 491 */
 492int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
 493{
 494	sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;
 495
 496	if (geo->start > sz) {
 497		DMWARN("Start sector is beyond the geometry limits.");
 498		return -EINVAL;
 499	}
 500
 501	md->geometry = *geo;
 502
 503	return 0;
 504}
 505
 506/*-----------------------------------------------------------------
 507 * CRUD START:
 508 *   A more elegant soln is in the works that uses the queue
 509 *   merge fn, unfortunately there are a couple of changes to
 510 *   the block layer that I want to make for this.  So in the
 511 *   interests of getting something for people to use I give
 512 *   you this clearly demarcated crap.
 513 *---------------------------------------------------------------*/
 514
 515static int __noflush_suspending(struct mapped_device *md)
 516{
 517	return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
 518}
 519
/*
 * Decrements the number of outstanding ios that a bio has been
 * cloned into, completing the original io if necc.
 *
 * @io:    per-bio state shared by all clones of one original bio
 * @error: 0, a negative errno, or DM_ENDIO_REQUEUE / DM_MAPIO_REQUEUE
 *         from a target
 *
 * When the last reference goes away the original bio is either pushed back
 * onto md->deferred (requeue during a noflush suspend), recorded in
 * md->barrier_error (barriers), or completed with bio_endio().
 */
static void dec_pending(struct dm_io *io, int error)
{
	unsigned long flags;
	int io_error;
	struct bio *bio;
	struct mapped_device *md = io->md;

	/* Push-back supersedes any I/O errors */
	/* (a positive io->error is presumably an earlier requeue - confirm) */
	if (error && !(io->error > 0 && __noflush_suspending(md)))
		io->error = error;

	if (atomic_dec_and_test(&io->io_count)) {
		if (io->error == DM_ENDIO_REQUEUE) {
			/*
			 * Target requested pushing back the I/O.
			 */
			spin_lock_irqsave(&md->deferred_lock, flags);
			if (__noflush_suspending(md))
				bio_list_add_head(&md->deferred, io->bio);
			else
				/* noflush suspend was interrupted. */
				io->error = -EIO;
			spin_unlock_irqrestore(&md->deferred_lock, flags);
		}

		/* snapshot both before end_io_acct(); see the note below */
		io_error = io->error;
		bio = io->bio;

		if (bio_barrier(bio)) {
			/*
			 * There can be just one barrier request so we use
			 * a per-device variable for error reporting.
			 * Note that you can't touch the bio after end_io_acct
			 */
			md->barrier_error = io_error;
			end_io_acct(io);
		} else {
			end_io_acct(io);

			/* a requeued bio now sits on md->deferred; don't end it */
			if (io_error != DM_ENDIO_REQUEUE) {
				trace_block_bio_complete(md->queue, bio);

				bio_endio(bio, io_error);
			}
		}

		free_io(md, io);
	}
}
 573
/*
 * bi_end_io handler for clone bios (installed by __map_bio()).
 *
 * Gives the target's end_io hook, if any, a chance to override the result,
 * then frees the dm_target_io, drops the clone and releases this clone's
 * reference on the parent dm_io.
 */
static void clone_endio(struct bio *bio, int error)
{
	int r = 0;
	struct dm_target_io *tio = bio->bi_private;
	struct dm_io *io = tio->io;
	struct mapped_device *md = tio->io->md;
	dm_endio_fn endio = tio->ti->type->end_io;

	/* a bio that is not up to date counts as failed even with error == 0 */
	if (!bio_flagged(bio, BIO_UPTODATE) && !error)
		error = -EIO;

	if (endio) {
		r = endio(tio->ti, bio, error, &tio->info);
		if (r < 0 || r == DM_ENDIO_REQUEUE)
			/*
			 * error and requeue request are handled
			 * in dec_pending().
			 */
			error = r;
		else if (r == DM_ENDIO_INCOMPLETE)
			/* The target will handle the io */
			return;
		else if (r) {
			DMWARN("unimplemented target endio return value: %d", r);
			BUG();
		}
	}

	/*
	 * Store the bio_set for cleanup (dm_bio_destructor() reads it from
	 * bi_private) instead of tio, which is about to get freed.
	 */
	bio->bi_private = md->bs;

	free_tio(md, tio);
	bio_put(bio);
	dec_pending(io, error);
}
 611
 612static sector_t max_io_len(struct mapped_device *md,
 613			   sector_t sector, struct dm_target *ti)
 614{
 615	sector_t offset = sector - ti->begin;
 616	sector_t len = ti->len - offset;
 617
 618	/*
 619	 * Does the target need to split even further ?
 620	 */
 621	if (ti->split_io) {
 622		sector_t boundary;
 623		boundary = ((offset + ti->split_io) & ~(ti->split_io - 1))
 624			   - offset;
 625		if (len > boundary)
 626			len = boundary;
 627	}
 628
 629	return len;
 630}
 631
/*
 * Hand one clone bio to its target's map function and act on the result.
 *
 * @ti:    target the clone falls into
 * @clone: clone bio; must have a non-zero size
 * @tio:   per-clone state, stored in clone->bi_private
 *
 * Takes a reference on tio->io for the clone.  On DM_MAPIO_REMAPPED the
 * clone is submitted; on a negative value or DM_MAPIO_REQUEUE the
 * reference, the clone and the tio are released here.
 */
static void __map_bio(struct dm_target *ti, struct bio *clone,
		      struct dm_target_io *tio)
{
	int r;
	sector_t sector;
	struct mapped_device *md;

	/*
	 * Sanity checks.
	 */
	BUG_ON(!clone->bi_size);

	clone->bi_end_io = clone_endio;
	clone->bi_private = tio;

	/*
	 * Map the clone.  If r == 0 we don't need to do
	 * anything, the target has assumed ownership of
	 * this io.
	 */
	atomic_inc(&tio->io->io_count);
	/* remember the pre-map sector for the remap trace below */
	sector = clone->bi_sector;
	r = ti->type->map(ti, clone, &tio->info);
	if (r == DM_MAPIO_REMAPPED) {
		/* the bio has been remapped so dispatch it */

		trace_block_remap(bdev_get_queue(clone->bi_bdev), clone,
				    tio->io->bio->bi_bdev->bd_dev,
				    clone->bi_sector, sector);

		generic_make_request(clone);
	} else if (r < 0 || r == DM_MAPIO_REQUEUE) {
		/* error the io and bail out, or requeue it if needed */
		/* fetch md first: dec_pending() may free tio->io */
		md = tio->io->md;
		dec_pending(tio->io, r);
		/*
		 * Store bio_set for cleanup.
		 */
		clone->bi_private = md->bs;
		bio_put(clone);
		free_tio(md, tio);
	} else if (r) {
		DMWARN("unimplemented target map return value: %d", r);
		BUG();
	}
}
 678
/*
 * Cursor used while splitting one bio into clones; set up in
 * __split_and_process_bio() and advanced by __clone_and_map().
 */
struct clone_info {
	struct mapped_device *md;
	struct dm_table *map;		/* table reference held for the split */
	struct bio *bio;		/* original bio being split */
	struct dm_io *io;		/* per-bio state shared by all clones */
	sector_t sector;		/* next sector still to be mapped */
	sector_t sector_count;		/* sectors still to be mapped */
	unsigned short idx;		/* index of the next bvec to map */
};
 688
 689static void dm_bio_destructor(struct bio *bio)
 690{
 691	struct bio_set *bs = bio->bi_private;
 692
 693	bio_free(bio, bs);
 694}
 695
/*
 * Creates a little bio that covers just part of a single bvec.
 *
 * @bio:    original bio
 * @sector: start sector for the clone
 * @idx:    index of the bvec in the original bio
 * @offset: byte offset stored in the clone's bv_offset
 * @len:    length of the clone in sectors
 * @bs:     bio_set to allocate the clone from
 *
 * The barrier flag is stripped from the clone.
 */
static struct bio *split_bvec(struct bio *bio, sector_t sector,
			      unsigned short idx, unsigned int offset,
			      unsigned int len, struct bio_set *bs)
{
	struct bio *clone;
	struct bio_vec *bv = bio->bi_io_vec + idx;

	clone = bio_alloc_bioset(GFP_NOIO, 1, bs);
	clone->bi_destructor = dm_bio_destructor;
	/* copy the whole bvec, then narrow offset/len below */
	*clone->bi_io_vec = *bv;

	clone->bi_sector = sector;
	clone->bi_bdev = bio->bi_bdev;
	clone->bi_rw = bio->bi_rw & ~(1 << BIO_RW_BARRIER);
	clone->bi_vcnt = 1;
	clone->bi_size = to_bytes(len);
	clone->bi_io_vec->bv_offset = offset;
	clone->bi_io_vec->bv_len = clone->bi_size;
	clone->bi_flags |= 1 << BIO_CLONED;

	if (bio_integrity(bio)) {
		bio_integrity_clone(clone, bio, GFP_NOIO);
		bio_integrity_trim(clone,
				   bio_sector_offset(bio, idx, offset), len);
	}

	return clone;
}
 727
/*
 * Creates a bio that consists of range of complete bvecs.
 *
 * @bio:      original bio
 * @sector:   start sector for the clone
 * @idx:      index of the first bvec to include
 * @bv_count: number of bvecs to include
 * @len:      length of the clone in sectors
 * @bs:       bio_set to allocate the clone from
 *
 * The barrier flag is stripped from the clone.
 */
static struct bio *clone_bio(struct bio *bio, sector_t sector,
			     unsigned short idx, unsigned short bv_count,
			     unsigned int len, struct bio_set *bs)
{
	struct bio *clone;

	clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs);
	__bio_clone(clone, bio);
	clone->bi_rw &= ~(1 << BIO_RW_BARRIER);
	clone->bi_destructor = dm_bio_destructor;
	clone->bi_sector = sector;
	/* restrict the clone to bvecs [idx, idx + bv_count) */
	clone->bi_idx = idx;
	clone->bi_vcnt = idx + bv_count;
	clone->bi_size = to_bytes(len);
	clone->bi_flags &= ~(1 << BIO_SEG_VALID);

	if (bio_integrity(bio)) {
		bio_integrity_clone(clone, bio, GFP_NOIO);

		/* trim only when the clone covers less than the original */
		if (idx != bio->bi_idx || clone->bi_size < bio->bi_size)
			bio_integrity_trim(clone,
					   bio_sector_offset(bio, idx, 0), len);
	}

	return clone;
}
 757
/*
 * Map the next piece of ci->bio, advancing the cursor in *ci.
 *
 * Three cases, cheapest first:
 *  1. everything that remains fits in the current target: one clone;
 *  2. at least the next bvec fits: one clone of as many whole bvecs as fit;
 *  3. the next bvec itself straddles a boundary: split that bvec into
 *     several single-bvec clones, looking up the target for each piece.
 *
 * Returns 0 on success, -EIO if no valid target covers ci->sector.
 */
static int __clone_and_map(struct clone_info *ci)
{
	struct bio *clone, *bio = ci->bio;
	struct dm_target *ti;
	sector_t len = 0, max;
	struct dm_target_io *tio;

	ti = dm_table_find_target(ci->map, ci->sector);
	if (!dm_target_is_valid(ti))
		return -EIO;

	max = max_io_len(ci->md, ci->sector, ti);

	/*
	 * Allocate a target io object.
	 */
	tio = alloc_tio(ci->md);
	tio->io = ci->io;
	tio->ti = ti;
	memset(&tio->info, 0, sizeof(tio->info));

	if (ci->sector_count <= max) {
		/*
		 * Optimise for the simple case where we can do all of
		 * the remaining io with a single clone.
		 */
		clone = clone_bio(bio, ci->sector, ci->idx,
				  bio->bi_vcnt - ci->idx, ci->sector_count,
				  ci->md->bs);
		__map_bio(ti, clone, tio);
		ci->sector_count = 0;

	} else if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) {
		/*
		 * There are some bvecs that don't span targets.
		 * Do as many of these as possible.
		 */
		int i;
		sector_t remaining = max;
		sector_t bv_len;

		/* accumulate whole bvecs until the next one would not fit */
		for (i = ci->idx; remaining && (i < bio->bi_vcnt); i++) {
			bv_len = to_sector(bio->bi_io_vec[i].bv_len);

			if (bv_len > remaining)
				break;

			remaining -= bv_len;
			len += bv_len;
		}

		clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len,
				  ci->md->bs);
		__map_bio(ti, clone, tio);

		ci->sector += len;
		ci->sector_count -= len;
		ci->idx = i;

	} else {
		/*
		 * Handle a bvec that must be split between two or more targets.
		 */
		struct bio_vec *bv = bio->bi_io_vec + ci->idx;
		sector_t remaining = to_sector(bv->bv_len);
		unsigned int offset = 0;

		do {
			/*
			 * The first pass reuses ti/max/tio set up above;
			 * later passes look up the target for the new sector.
			 */
			if (offset) {
				ti = dm_table_find_target(ci->map, ci->sector);
				if (!dm_target_is_valid(ti))
					return -EIO;

				max = max_io_len(ci->md, ci->sector, ti);

				tio = alloc_tio(ci->md);
				tio->io = ci->io;
				tio->ti = ti;
				memset(&tio->info, 0, sizeof(tio->info));
			}

			len = min(remaining, max);

			clone = split_bvec(bio, ci->sector, ci->idx,
					   bv->bv_offset + offset, len,
					   ci->md->bs);

			__map_bio(ti, clone, tio);

			ci->sector += len;
			ci->sector_count -= len;
			offset += to_bytes(len);
		} while (remaining -= len);

		ci->idx++;
	}

	return 0;
}
 857
/*
 * Split the bio into several clones and submit it to targets.
 *
 * Without a bound table the bio is failed at once: ordinary bios via
 * bio_io_error(), barriers by recording -EIO in md->barrier_error.
 * Otherwise a dm_io is set up with an initial reference that is dropped
 * after all clones have been issued, so the original bio cannot complete
 * while clones are still being created.
 */
static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
{
	struct clone_info ci;
	int error = 0;

	ci.map = dm_get_table(md);
	if (unlikely(!ci.map)) {
		if (!bio_barrier(bio))
			bio_io_error(bio);
		else
			md->barrier_error = -EIO;
		return;
	}

	ci.md = md;
	ci.bio = bio;
	ci.io = alloc_io(md);
	ci.io->error = 0;
	/* the extra reference, dropped by dec_pending() below */
	atomic_set(&ci.io->io_count, 1);
	ci.io->bio = bio;
	ci.io->md = md;
	ci.sector = bio->bi_sector;
	ci.sector_count = bio_sectors(bio);
	ci.idx = bio->bi_idx;

	start_io_acct(ci.io);
	/* stop at the first error; it is reported through dec_pending() */
	while (ci.sector_count && !error)
		error = __clone_and_map(&ci);

	/* drop the extra reference count */
	dec_pending(ci.io, error);
	dm_table_put(ci.map);
}
 894/*-----------------------------------------------------------------
 895 * CRUD END
 896 *---------------------------------------------------------------*/
 897
/*
 * merge_bvec_fn for the dm queue: report how many bytes may be added to
 * the bio described by 'bvm' at its current position.
 *
 * The limit is the smaller of max_io_len() for the target covering
 * bvm->bi_sector and BIO_MAX_SECTORS, minus what the bio already holds,
 * optionally reduced further by the target's own merge method.  With no
 * table or no valid target the limit is 0, except that a first bvec into
 * an empty bio is always accepted whole.
 */
static int dm_merge_bvec(struct request_queue *q,
			 struct bvec_merge_data *bvm,
			 struct bio_vec *biovec)
{
	struct mapped_device *md = q->queuedata;
	struct dm_table *map = dm_get_table(md);
	struct dm_target *ti;
	sector_t max_sectors;
	int max_size = 0;

	if (unlikely(!map))
		goto out;

	ti = dm_table_find_target(map, bvm->bi_sector);
	if (!dm_target_is_valid(ti))
		goto out_table;

	/*
	 * Find maximum amount of I/O that won't need splitting
	 */
	max_sectors = min(max_io_len(md, bvm->bi_sector, ti),
			  (sector_t) BIO_MAX_SECTORS);
	max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size;
	/* the bio already exceeds the limit: nothing more may be added */
	if (max_size < 0)
		max_size = 0;

	/*
	 * merge_bvec_fn() returns number of bytes
	 * it can accept at this offset
	 * max is precomputed maximal io size
	 */
	if (max_size && ti->type->merge)
		max_size = ti->type->merge(ti, bvm, biovec, max_size);

out_table:
	dm_table_put(map);

out:
	/*
	 * Always allow an entire first page
	 */
	if (max_size <= biovec->bv_len && !(bvm->bi_size >> SECTOR_SHIFT))
		max_size = biovec->bv_len;

	return max_size;
}
 944
/*
 * The request function that just remaps the bio built up by
 * dm_merge_bvec.
 *
 * Updates the per-disk io/sector statistics, then either processes the bio
 * directly under md->io_lock (read) or, if bios are currently being
 * diverted to the worker or the bio is a barrier, defers it via queue_io().
 * Always returns 0.
 */
static int dm_request(struct request_queue *q, struct bio *bio)
{
	int rw = bio_data_dir(bio);
	struct mapped_device *md = q->queuedata;
	int cpu;

	down_read(&md->io_lock);

	cpu = part_stat_lock();
	part_stat_inc(cpu, &dm_disk(md)->part0, ios[rw]);
	part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio));
	part_stat_unlock();

	/*
	 * If we're suspended or the thread is processing barriers
	 * we have to queue this io for later.
	 */
	if (unlikely(test_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) ||
	    unlikely(bio_barrier(bio))) {
		/* queue_io() takes io_lock for writing, so drop it first */
		up_read(&md->io_lock);

		/* read-ahead is failed rather than queued while io is blocked */
		if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) &&
		    bio_rw(bio) == READA) {
			bio_io_error(bio);
			return 0;
		}

		queue_io(md, bio);

		return 0;
	}

	__split_and_process_bio(md, bio);
	up_read(&md->io_lock);
	return 0;
}
 985
 986static void dm_unplug_all(struct request_queue *q)
 987{
 988	struct mapped_device *md = q->queuedata;
 989	struct dm_table *map = dm_get_table(md);
 990
 991	if (map) {
 992		dm_table_unplug_all(map);
 993		dm_table_put(map);
 994	}
 995}
 996
 997static int dm_any_congested(void *congested_data, int bdi_bits)
 998{
 999	int r = bdi_bits;
1000	struct mapped_device *md = congested_data;
1001	struct dm_table *map;
1002
1003	if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
1004		map = dm_get_table(md);
1005		if (map) {
1006			r = dm_table_any_congested(map, bdi_bits);
1007			dm_table_put(map);
1008		}
1009	}
1010
1011	return r;
1012}
1013
/*-----------------------------------------------------------------
 * An IDR is used to keep track of allocated minor numbers.
 *
 * Entries hold MINOR_ALLOCED while a device is being set up and the
 * mapped_device pointer afterwards; access is under _minor_lock.
 *---------------------------------------------------------------*/
static DEFINE_IDR(_minor_idr);
1018
1019static void free_minor(int minor)
1020{
1021	spin_lock(&_minor_lock);
1022	idr_remove(&_minor_idr, minor);
1023	spin_unlock(&_minor_lock);
1024}
1025
1026/*
1027 * See if the device with a specific minor # is free.
1028 */
1029static int specific_minor(int minor)
1030{
1031	int r, m;
1032
1033	if (minor >= (1 << MINORBITS))
1034		return -EINVAL;
1035
1036	r = idr_pre_get(&_minor_idr, GFP_KERNEL);
1037	if (!r)
1038		return -ENOMEM;
1039
1040	spin_lock(&_minor_lock);
1041
1042	if (idr_find(&_minor_idr, minor)) {
1043		r = -EBUSY;
1044		goto out;
1045	}
1046
1047	r = idr_get_new_above(&_minor_idr, MINOR_ALLOCED, minor, &m);
1048	if (r)
1049		goto out;
1050
1051	if (m != minor) {
1052		idr_remove(&_minor_idr, m);
1053		r = -EBUSY;
1054		goto out;
1055	}
1056
1057out:
1058	spin_unlock(&_minor_lock);
1059	return r;
1060}
1061
1062static int next_free_minor(int *minor)
1063{
1064	int r, m;
1065
1066	r = idr_pre_get(&_minor_idr, GFP_KERNEL);
1067	if (!r)
1068		return -ENOMEM;
1069
1070	spin_lock(&_minor_lock);
1071
1072	r = idr_get_new(&_minor_idr, MINOR_ALLOCED, &m);
1073	if (r)
1074		goto out;
1075
1076	if (m >= (1 << MINORBITS)) {
1077		idr_remove(&_minor_idr, m);
1078		r = -ENOSPC;
1079		goto out;
1080	}
1081
1082	*minor = m;
1083
1084out:
1085	spin_unlock(&_minor_lock);
1086	return r;
1087}
1088
/* Forward declarations needed by alloc_dev(). */
static struct block_device_operations dm_blk_dops;

static void dm_wq_work(struct work_struct *work);
1092
1093/*
1094 * Allocate and initialise a blank device with a given minor.
1095 */
1096static struct mapped_device *alloc_dev(int minor)
1097{
1098	int r;
1099	struct mapped_device *md = kzalloc(sizeof(*md), GFP_KERNEL);
1100	void *old_md;
1101
1102	if (!md) {
1103		DMWARN("unable to allocate device, out of memory.");
1104		return NULL;
1105	}
1106
1107	if (!try_module_get(THIS_MODULE))
1108		goto bad_module_get;
1109
1110	/* get a minor number for the dev */
1111	if (minor == DM_ANY_MINOR)
1112		r = next_free_minor(&minor);
1113	else
1114		r = specific_minor(minor);
1115	if (r < 0)
1116		goto bad_minor;
1117
1118	init_rwsem(&md->io_lock);
1119	mutex_init(&md->suspend_lock);
1120	spin_lock_init(&md->deferred_lock);
1121	rwlock_init(&md->map_lock);
1122	atomic_set(&md->holders, 1);
1123	atomic_set(&md->open_count, 0);
1124	atomic_set(&md->event_nr, 0);
1125	atomic_set(&md->uevent_seq, 0);
1126	INIT_LIST_HEAD(&md->uevent_list);
1127	spin_lock_init(&md->uevent_lock);
1128
1129	md->queue = blk_alloc_queue(GFP_KERNEL);
1130	if (!md->queue)
1131		goto bad_queue;
1132
1133	md->queue->queuedata = md;
1134	md->queue->backing_dev_info.congested_fn = dm_any_congested;
1135	md->queue->backing_dev_info.congested_data = md;
1136	blk_queue_make_request(md->queue, dm_request);
1137	blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN, NULL);
1138	blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
1139	md->queue->unplug_fn = dm_unplug_all;
1140	blk_queue_merge_bvec(md->queue, dm_merge_bvec);
1141
1142	md->io_pool = mempool_create_slab_pool(MIN_IOS, _io_cache);
1143	if (!md->io_pool)
1144		goto bad_io_pool;
1145
1146	md->tio_pool = mempool_create_slab_pool(MIN_IOS, _tio_cache);
1147	if (!md->tio_pool)
1148		goto bad_tio_pool;
1149
1150	md->bs = bioset_create(16, 0);
1151	if (!md->bs)
1152		goto bad_no_bioset;
1153
1154	md->disk = alloc_disk(1);
1155	if (!md->disk)
1156		goto bad_disk;
1157
1158	atomic_set(&md->pending, 0);
1159	init_waitqueue_head(&md->wait);
1160	INIT_WORK(&md->work, dm_wq_work);
1161	init_waitqueue_head(&md->eventq);
1162
1163	md->disk->major = _major;
1164	md->disk->first_minor = minor;
1165	md->disk->fops = &dm_blk_dops;
1166	md->disk->queue = md->queue;
1167	md->disk->private_data = md;
1168	sprintf(md->disk->disk_name, "dm-%d", minor);
1169	add_disk(md->disk);
1170	format_dev_t(md->name, MKDEV(_major, minor));
1171
1172	md->wq = create_singlethread_workqueue("kdmflush");
1173	if (!md->wq)
1174		goto bad_thread;
1175
1176	/* Populate the mapping, nobody knows we exist yet */
1177	spin_lock(&_minor_lock);
1178	old_md = idr_replace(&_minor_idr, md, minor);
1179	spin_unlock(&_minor_lock);
1180
1181	BUG_ON(old_md != MINOR_ALLOCED);
1182
1183	return md;
1184
1185bad_thread:
1186	put_disk(md->disk);
1187bad_disk:
1188	bioset_free(md->bs);
1189bad_no_bioset:
1190	mempool_destroy(md->tio_pool);
1191bad_tio_pool:
1192	mempool_destroy(md->io_pool);
1193bad_io_pool:
1194	blk_cleanup_queue(md->queue);
1195bad_queue:
1196	free_minor(minor);
1197bad_minor:
1198	module_put(THIS_MODULE);
1199bad_module_get:
1200	kfree(md);
1201	return NULL;
1202}
1203
/* Forward declaration needed by free_dev(). */
static void unlock_fs(struct mapped_device *md);
1205
/*
 * Tear down a device set up by alloc_dev() and free it: release the
 * suspended bdev if one is held, destroy the workqueue, pools and bio_set,
 * unregister the disk, free the minor, detach md from the disk, then drop
 * the disk, the queue, the module reference and the md itself.
 */
static void free_dev(struct mapped_device *md)
{
	/* read the minor now; the disk is unregistered further down */
	int minor = MINOR(disk_devt(md->disk));

	if (md->suspended_bdev) {
		unlock_fs(md);
		bdput(md->suspended_bdev);
	}
	destroy_workqueue(md->wq);
	mempool_destroy(md->tio_pool);
	mempool_destroy(md->io_pool);
	bioset_free(md->bs);
	blk_integrity_unregister(md->disk);
	del_gendisk(md->disk);
	free_minor(minor);

	/* dm_blk_open() checks private_data under the same lock */
	spin_lock(&_minor_lock);
	md->disk->private_data = NULL;
	spin_unlock(&_minor_lock);

	put_disk(md->disk);
	blk_cleanup_queue(md->queue);
	module_put(THIS_MODULE);
	kfree(md);
}
1231
/*
 * Table event callback, registered for a table by __bind().
 *
 * Detaches the queued uevents from md->uevent_list under uevent_lock,
 * sends them against the disk's kobject, then bumps md->event_nr and
 * wakes anyone sleeping on md->eventq.
 *
 * @context: the mapped_device passed to dm_table_event_callback().
 */
static void event_callback(void *context)
{
	unsigned long flags;
	LIST_HEAD(uevents);
	struct mapped_device *md = (struct mapped_device *) context;

	/* move the list to a private head so it can be sent without the lock */
	spin_lock_irqsave(&md->uevent_lock, flags);
	list_splice_init(&md->uevent_list, &uevents);
	spin_unlock_irqrestore(&md->uevent_lock, flags);

	dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj);

	atomic_inc(&md->event_nr);
	wake_up(&md->eventq);
}
1250
1251static void __set_size(struct mapped_device *md, sector_t size)
1252{
1253	set_capacity(md->disk, size);
1254
1255	mutex_lock(&md->suspended_bdev->bd_inode->i_mutex);
1256	i_size_write(md->suspended_bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
1257	mutex_unlock(&md->suspended_bdev->bd_inode->i_mutex);
1258}
1259
/*
 * Bind a table to the device.
 *
 * Updates the device size when a suspended bdev is held, clears any forced
 * geometry if the size changed, and installs 't' as md->map under
 * map_lock.  A table of size zero is destroyed instead of being bound.
 * Always returns 0.
 */
static int __bind(struct mapped_device *md, struct dm_table *t)
{
	struct request_queue *q = md->queue;
	sector_t size;

	size = dm_table_get_size(t);

	/*
	 * Wipe any geometry if the size of the table changed.
	 */
	if (size != get_capacity(md->disk))
		memset(&md->geometry, 0, sizeof(md->geometry));

	if (md->suspended_bdev)
		__set_size(md, size);

	/* an empty table is dropped; md->map is left untouched */
	if (!size) {
		dm_table_destroy(t);
		return 0;
	}

	dm_table_event_callback(t, event_callback, md);

	write_lock(&md->map_lock);
	md->map = t;
	dm_table_set_restrictions(t, q);
	write_unlock(&md->map_lock);

	return 0;
}
1290
1291static void __unbind(struct mapped_device *md)
1292{
1293	struct dm_table *map = md->map;
1294
1295	if (!map)
1296		return;
1297
1298	dm_table_event_callback(map, NULL, NULL);
1299	write_lock(&md->map_lock);
1300	md->map = NULL;
1301	write_unlock(&md->map_lock);
1302	dm_table_destroy(map);
1303}
1304
1305/*
1306 * Constructor for a new device.
1307 */
1308int dm_create(int minor, struct mapped_device **result)
1309{
1310	struct mapped_device *md;
1311
1312	md = alloc_dev(minor);
1313	if (!md)
1314		return -ENXIO;
1315
1316	dm_sysfs_init(md);
1317
1318	*result = md;
1319	return 0;
1320}
1321
1322static struct mapped_device *dm_find_md(dev_t dev)
1323{
1324	struct mapped_device *md;
1325	unsigned minor = MINOR(dev);
1326
1327	if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
1328		return NULL;
1329
1330	spin_lock(&_minor_lock);
1331
1332	md = idr_find(&_minor_idr, minor);
1333	if (md && (md == MINOR_ALLOCED ||
1334		   (MINOR(disk_devt(dm_disk(md))) != minor) ||
1335		   test_bit(DMF_FREEING, &md->flags))) {
1336		md = NULL;
1337		goto out;
1338	}
1339
1340out:
1341	spin_unlock(&_minor_lock);
1342
1343	return md;
1344}
1345
1346struct mapped_device *dm_get_md(dev_t dev)
1347{
1348	struct mapped_device *md = dm_find_md(dev);
1349
1350	if (md)
1351		dm_get(md);
1352
1353	return md;
1354}
1355
/*
 * Return the opaque pointer previously stored with dm_set_mdptr().
 * No locking is done here.
 */
void *dm_get_mdptr(struct mapped_device *md)
{
	return md->interface_ptr;
}
1360
/*
 * Attach an opaque pointer to the device; dm.c only stores it and hands
 * it back through dm_get_mdptr().  No locking is done here.
 */
void dm_set_mdptr(struct mapped_device *md, void *ptr)
{
	md->interface_ptr = ptr;
}
1365
/*
 * Take an additional reference on the device (md->holders).
 * Each dm_get() must be balanced by a dm_put().
 */
void dm_get(struct mapped_device *md)
{
	atomic_inc(&md->holders);
}
1370
/*
 * Return the device's name string.  The pointer refers to storage inside
 * 'md' and is not copied.
 */
const char *dm_device_name(struct mapped_device *md)
{
	return md->name;
}
EXPORT_SYMBOL_GPL(dm_device_name);
1376
1377void dm_put(struct mapped_device *md)
1378{
1379	struct dm_table *map;
1380
1381	BUG_ON(test_bit(DMF_FREEING, &md->flags));
1382
1383	if (atomic_dec_and_lock(&md->holders, &_minor_lock)) {
1384		map = dm_get_table(md);
1385		idr_replace(&_minor_idr, MINOR_ALLOCED,
1386			    MINOR(disk_devt(dm_disk(md))));
1387		set_bit(DMF_FREEING, &md->flags);
1388		spin_unlock(&_minor_lock);
1389		if (!dm_suspended(md)) {
1390			dm_table_presuspend_targets(map);
1391			dm_table_postsuspend_targets(map);
1392		}
1393		dm_sysfs_exit(md);
1394		dm_table_put(map);
1395		__unbind(md);
1396		free_dev(md);
1397	}
1398}
1399EXPORT_SYMBOL_GPL(dm_put);
1400
/*
 * Sleep until md->pending drops to zero.
 *
 * 'interruptible' is the task state to sleep in; it is passed straight
 * to set_current_state().  When it is TASK_INTERRUPTIBLE a pending
 * signal aborts the wait.
 *
 * Returns 0 once no I/O is pending, or -EINTR if interrupted by a signal.
 */
static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
{
	int r = 0;
	DECLARE_WAITQUEUE(wait, current);

	/* Kick the queue so outstanding I/O actually gets issued. */
	dm_unplug_all(md->queue);

	add_wait_queue(&md->wait, &wait);

	while (1) {
		/* Set the sleep state before testing the condition. */
		set_current_state(interruptible);

		/*
		 * NOTE(review): barrier between publishing the task state and
		 * sampling md->pending; presumably pairs with the completion
		 * side so a wake-up is not lost -- confirm.
		 */
		smp_mb();
		if (!atomic_read(&md->pending))
			break;

		if (interruptible == TASK_INTERRUPTIBLE &&
		    signal_pending(current)) {
			r = -EINTR;
			break;
		}

		io_schedule();
	}
	set_current_state(TASK_RUNNING);

	remove_wait_queue(&md->wait, &wait);

	return r;
}
1431
/*
 * Wait, uninterruptibly, until no I/O is pending on the device.
 * The result of the wait is discarded; this always returns 0.
 */
static int dm_flush(struct mapped_device *md)
{
	dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
	return 0;
}
1437
/*
 * Handle a barrier bio: drain all pending I/O, process the barrier's own
 * payload (if any), drain again, then complete the bio.
 */
static void process_barrier(struct mapped_device *md, struct bio *bio)
{
	/* Everything issued before the barrier must finish first. */
	int error = dm_flush(md);

	if (unlikely(error)) {
		bio_endio(bio, error);
		return;
	}
	/* An empty barrier carries no data: the flush above was all of it. */
	if (bio_empty_barrier(bio)) {
		bio_endio(bio, 0);
		return;
	}

	__split_and_process_bio(md, bio);

	/* Wait for the barrier's own I/O to complete. */
	error = dm_flush(md);

	/* Pick up any error recorded in md->barrier_error meanwhile. */
	if (!error && md->barrier_error)
		error = md->barrier_error;

	/*
	 * The bio is not completed here when a requeue was requested.
	 * NOTE(review): presumably it has been put back for later
	 * processing in that case -- confirm where barrier_error is set.
	 */
	if (md->barrier_error != DM_ENDIO_REQUEUE)
		bio_endio(bio, error);
}
1461
/*
 * Process the deferred bios
 *
 * Work function for md->work.  Pops bios off md->deferred one at a time
 * and processes them until the list is empty or
 * DMF_BLOCK_IO_FOR_SUSPEND is set.  md->io_lock is held for writing
 * while the flag is tested and the list is examined, and is dropped
 * around the actual processing of each bio.
 */
static void dm_wq_work(struct work_struct *work)
{
	struct mapped_device *md = container_of(work, struct mapped_device,
						work);
	struct bio *c;

	down_write(&md->io_lock);

	while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
		spin_lock_irq(&md->deferred_lock);
		c = bio_list_pop(&md->deferred);
		spin_unlock_irq(&md->deferred_lock);

		if (!c) {
			/*
			 * List drained: stop routing new I/O to this thread.
			 * Cleared while io_lock is still held for writing.
			 */
			clear_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
			break;
		}

		/* Do not hold io_lock while the bio is being processed. */
		up_write(&md->io_lock);

		if (bio_barrier(c))
			process_barrier(md, c);
		else
			__split_and_process_bio(md, c);

		down_write(&md->io_lock);
	}

	up_write(&md->io_lock);
}
1495
/*
 * Let deferred I/O flow again: clear DMF_BLOCK_IO_FOR_SUSPEND and queue
 * md->work on md->wq so that dm_wq_work() drains md->deferred.
 */
static void dm_queue_flush(struct mapped_device *md)
{
	clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
	/* Order the flag clear before the work is queued. */
	smp_mb__after_clear_bit();
	queue_work(md->wq, &md->work);
}
1502
1503/*
1504 * Swap in a new table (destroying old one).
1505 */
1506int dm_swap_table(struct mapped_device *md, struct dm_table *table)
1507{
1508	int r = -EINVAL;
1509
1510	mutex_lock(&md->suspend_lock);
1511
1512	/* device must be suspended */
1513	if (!dm_suspended(md))
1514		goto out;
1515
1516	/* without bdev, the device size cannot be changed */
1517	if (!md->suspended_bdev)
1518		if (get_capacity(md->disk) != dm_table_get_size(table))
1519			goto out;
1520
1521	__unbind(md);
1522	r = __bind(md, table);
1523
1524out:
1525	mutex_unlock(&md->suspend_lock);
1526	return r;
1527}
1528
1529/*
1530 * Functions to lock and unlock any filesystem running on the
1531 * device.
1532 */
1533static int lock_fs(struct mapped_device *md)
1534{
1535	int r;
1536
1537	WARN_ON(md->frozen_sb);
1538
1539	md->frozen_sb = freeze_bdev(md->suspended_bdev);
1540	if (IS_ERR(md->frozen_sb)) {
1541		r = PTR_ERR(md->frozen_sb);
1542		md->frozen_sb = NULL;
1543		return r;
1544	}
1545
1546	set_bit(DMF_FROZEN, &md->flags);
1547
1548	/* don't bdput right now, we don't want the bdev
1549	 * to go away while it is locked.
1550	 */
1551	return 0;
1552}
1553
1554static void unlock_fs(struct mapped_device *md)
1555{
1556	if (!test_bit(DMF_FROZEN, &md->flags))
1557		return;
1558
1559	thaw_bdev(md->suspended_bdev, md->frozen_sb);
1560	md->frozen_sb = NULL;
1561	clear_bit(DMF_FROZEN, &md->flags);
1562}
1563
/*
 * We need to be able to change a mapping table under a mounted
 * filesystem.  For example we might want to move some data in
 * the background.  Before the table can be swapped with
 * dm_swap_table, dm_suspend must be called to flush any in
 * flight bios and ensure that any further io gets deferred.
 *
 * suspend_flags may contain DM_SUSPEND_LOCKFS_FLAG (freeze the
 * filesystem on the device) and DM_SUSPEND_NOFLUSH_FLAG (skip the
 * bdev/filesystem step entirely; it overrides the lockfs flag).
 *
 * Returns 0 with DMF_SUSPENDED set on success.  Fails with -EINVAL if
 * the device is already suspended, -ENOMEM if the bdev cannot be
 * obtained, the error from lock_fs(), or the error from the
 * interruptible wait for pending I/O (in which case deferred I/O is
 * restarted and the filesystem is thawed again).
 */
int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
{
	struct dm_table *map = NULL;
	int r = 0;
	int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0;
	int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0;

	mutex_lock(&md->suspend_lock);

	if (dm_suspended(md)) {
		r = -EINVAL;
		goto out_unlock;
	}

	map = dm_get_table(md);

	/*
	 * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
	 * This flag is cleared before dm_suspend returns.
	 */
	if (noflush)
		set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);

	/* This does not get reverted if there's an error later. */
	dm_table_presuspend_targets(map);

	/* bdget() can stall if the pending I/Os are not flushed */
	if (!noflush) {
		md->suspended_bdev = bdget_disk(md->disk, 0);
		if (!md->suspended_bdev) {
			DMWARN("bdget failed in dm_suspend");
			r = -ENOMEM;
			goto out;
		}

		/*
		 * Flush I/O to the device. noflush supersedes do_lockfs,
		 * because lock_fs() needs to flush I/Os.
		 */
		if (do_lockfs) {
			r = lock_fs(md);
			if (r)
				goto out;
		}
	}

	/*
	 * Here we must make sure that no processes are submitting requests
	 * to target drivers i.e. no one may be executing
	 * __split_and_process_bio. This is called from dm_request and
	 * dm_wq_work.
	 *
	 * To get all processes out of __split_and_process_bio in dm_request,
	 * we take the write lock. To prevent any process from reentering
	 * __split_and_process_bio from dm_request, we set
	 * DMF_QUEUE_IO_TO_THREAD.
	 *
	 * To quiesce the thread (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND
	 * and call flush_workqueue(md->wq). flush_workqueue will wait until
	 * dm_wq_work exits and DMF_BLOCK_IO_FOR_SUSPEND will prevent any
	 * further calls to __split_and_process_bio from dm_wq_work.
	 */
	down_write(&md->io_lock);
	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
	set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
	up_write(&md->io_lock);

	flush_workqueue(md->wq);

	/*
	 * At this point no more requests are entering target request routines.
	 * We call dm_wait_for_completion to wait for all existing requests
	 * to finish.
	 */
	r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE);

	/* The noflush marker is only needed while the wait is in progress. */
	down_write(&md->io_lock);
	if (noflush)
		clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
	up_write(&md->io_lock);

	/* were we interrupted ? */
	if (r < 0) {
		/* Back out: let deferred I/O run again and thaw the fs. */
		dm_queue_flush(md);

		unlock_fs(md);
		goto out; /* pushback list is already flushed, so skip flush */
	}

	/*
	 * If dm_wait_for_completion returned 0, the device is completely
	 * quiescent now. There is no request-processing activity. All new
	 * requests are being added to md->deferred list.
	 */

	dm_table_postsuspend_targets(map);

	set_bit(DMF_SUSPENDED, &md->flags);

out:
	/*
	 * On failure drop the bdev reference taken above; on success it is
	 * kept in md->suspended_bdev until dm_resume() releases it.
	 */
	if (r && md->suspended_bdev) {
		bdput(md->suspended_bdev);
		md->suspended_bdev = NULL;
	}

	dm_table_put(map);

out_unlock:
	mutex_unlock(&md->suspend_lock);
	return r;
}
1682
/*
 * Resume a suspended device: resume the table's targets, restart the
 * deferred I/O, thaw the filesystem, drop the bdev held since
 * dm_suspend(), clear DMF_SUSPENDED and send a change uevent.
 *
 * Returns 0 on success, -EINVAL if the device is not suspended or has
 * no table (or a zero-sized one), or the error returned by
 * dm_table_resume_targets().
 */
int dm_resume(struct mapped_device *md)
{
	int r = -EINVAL;
	struct dm_table *map = NULL;

	mutex_lock(&md->suspend_lock);
	if (!dm_suspended(md))
		goto out;

	/* Nothing to resume without a usable table. */
	map = dm_get_table(md);
	if (!map || !dm_table_get_size(map))
		goto out;

	r = dm_table_resume_targets(map);
	if (r)
		goto out;

	dm_queue_flush(md);

	unlock_fs(md);

	if (md->suspended_bdev) {
		bdput(md->suspended_bdev);
		md->suspended_bdev = NULL;
	}

	clear_bit(DMF_SUSPENDED, &md->flags);

	dm_table_unplug_all(map);

	dm_kobject_uevent(md);

	r = 0;

out:
	/* map may still be NULL here when the device was not suspended. */
	dm_table_put(map);
	mutex_unlock(&md->suspend_lock);

	return r;
}
1723
1724/*-----------------------------------------------------------------
1725 * Event notification.
1726 *---------------------------------------------------------------*/
/* Emit a KOBJ_CHANGE uevent on the kobject of the device's gendisk. */
void dm_kobject_uevent(struct mapped_device *md)
{
	kobject_uevent(&disk_to_dev(md->disk)->kobj, KOBJ_CHANGE);
}
1731
/* Atomically increment md->uevent_seq and return the new value. */
uint32_t dm_next_uevent_seq(struct mapped_device *md)
{
	return atomic_add_return(1, &md->uevent_seq);
}
1736
/*
 * Return the current event counter; event_callback() increments it for
 * every table event.
 */
uint32_t dm_get_event_nr(struct mapped_device *md)
{
	return atomic_read(&md->event_nr);
}
1741
/*
 * Sleep interruptibly on md->eventq until the device's event counter
 * differs from 'event_nr'.  Returns the result of
 * wait_event_interruptible().
 */
int dm_wait_event(struct mapped_device *md, int event_nr)
{
	return wait_event_interruptible(md->eventq,
			(event_nr != atomic_read(&md->event_nr)));
}
1747
/*
 * Queue a uevent on the device.  'elist' is linked at the head of
 * md->uevent_list under md->uevent_lock; event_callback() later splices
 * the list off and sends the events.
 */
void dm_uevent_add(struct mapped_device *md, struct list_head *elist)
{
	unsigned long flags;

	spin_lock_irqsave(&md->uevent_lock, flags);
	list_add(elist, &md->uevent_list);
	spin_unlock_irqrestore(&md->uevent_lock, flags);
}
1756
/*
 * Return the gendisk belonging to 'md'.
 *
 * The gendisk is only valid as long as you have a reference
 * count on 'md'.
 */
struct gendisk *dm_disk(struct mapped_device *md)
{
	return md->disk;
}
1765
/* Return the kobject embedded in 'md'. */
struct kobject *dm_kobject(struct mapped_device *md)
{
	return &md->kobj;
}
1770
1771/*
1772 * struct mapped_device should not be exported outside of dm.c
1773 * so use this check to verify that kobj is part of md structure
1774 */
1775struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
1776{
1777	struct mapped_device *md;
1778
1779	md = container_of(kobj, struct mapped_device, kobj);
1780	if (&md->kobj != kobj)
1781		return NULL;
1782
1783	dm_get(md);
1784	return md;
1785}
1786
/* Return non-zero if DMF_SUSPENDED is set on the device, 0 otherwise. */
int dm_suspended(struct mapped_device *md)
{
	return test_bit(DMF_SUSPENDED, &md->flags);
}
1791
/*
 * Report, for the device that owns target 'ti', the result of
 * __noflush_suspending().
 */
int dm_noflush_suspending(struct dm_target *ti)
{
	/*
	 * NOTE(review): dm_table_get_md() presumably returns the device
	 * with a reference held, which the dm_put() below balances --
	 * confirm.
	 */
	struct mapped_device *md = dm_table_get_md(ti->table);
	int r = __noflush_suspending(md);

	dm_put(md);

	return r;
}
EXPORT_SYMBOL_GPL(dm_noflush_suspending);
1802
/* Block device operations for dm devices. */
static struct block_device_operations dm_blk_dops = {
	.open = dm_blk_open,
	.release = dm_blk_close,
	.ioctl = dm_blk_ioctl,
	.getgeo = dm_blk_getgeo,
	.owner = THIS_MODULE
};
1810
/* Make dm_get_mapinfo available to other modules. */
EXPORT_SYMBOL(dm_get_mapinfo);

/*
 * module hooks
 */
module_init(dm_init);
module_exit(dm_exit);

/* 'major' is an unsigned int module parameter; see its description below. */
module_param(major, uint, 0);
MODULE_PARM_DESC(major, "The major number of the device mapper");
MODULE_DESCRIPTION(DM_NAME " driver");
MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");
Configure Feed

Configure Feed