Merge git://git.kernel.org/pub/scm/linux/kernel/git/agk/linux-2.6-dm

* git://git.kernel.org/pub/scm/linux/kernel/git/agk/linux-2.6-dm: (44 commits)
dm raid1: report fault status
dm raid1: handle read failures
dm raid1: fix EIO after log failure
dm raid1: handle recovery failures
dm raid1: handle write failures
dm snapshot: combine consecutive exceptions in memory
dm: stripe enhanced status return
dm: stripe trigger event on failure
dm log: auto load modules
dm: move deferred bio flushing to workqueue
dm crypt: use async crypto
dm crypt: prepare async callback fn
dm crypt: add completion for async
dm crypt: add async request mempool
dm crypt: extract scatterlist processing
dm crypt: tidy io ref counting
dm crypt: introduce crypt_write_io_loop
dm crypt: abstract crypt_write_done
dm crypt: store sector mapping in dm_crypt_io
dm crypt: move queue functions
...

Diffstat (+1378 -489 overall):
 drivers/md/Kconfig              |  +12  -12   drop the EXPERIMENTAL dependency from the device-mapper options
 drivers/md/dm-crypt.c           | +321 -167   switch to the async crypto (ablkcipher) interface: request mempool, completion-driven conversion, async callback
 drivers/md/dm-exception-store.c |   +1   -1   mark new_snapshot with uninitialized_var()
 drivers/md/dm-ioctl.c           |  +23   -9   convert to unlocked_ioctl and add a compat_ioctl handler
 drivers/md/dm-log.c             |  +50   -1   load "dm-log-<type>" modules on demand in get_type()
 drivers/md/dm-mpath.c           |   +1   -1   make kmultipathd static
 drivers/md/dm-raid1.c           | +581  -93   record per-mirror faults, retry failed reads on another mirror, queue failed writes for the worker
 ...
1016 + */ 1017 + while (_do_mirror(work)) 1018 + schedule(); 1019 + } 1020 + 1021 1022 /*----------------------------------------------------------------- 1023 * Target functions ··· 965 ms->nr_mirrors = nr_mirrors; 966 ms->nr_regions = dm_sector_div_up(ti->len, region_size); 967 ms->in_sync = 0; 968 + ms->log_failure = 0; 969 + atomic_set(&ms->suspend, 0); 970 + atomic_set(&ms->default_mirror, DEFAULT_MIRROR); 971 + 972 + len = sizeof(struct dm_raid1_read_record); 973 + ms->read_record_pool = mempool_create_kmalloc_pool(MIN_READ_RECORDS, 974 + len); 975 + if (!ms->read_record_pool) { 976 + ti->error = "Error creating mirror read_record_pool"; 977 + kfree(ms); 978 + return NULL; 979 + } 980 981 ms->io_client = dm_io_client_create(DM_IO_PAGES); 982 if (IS_ERR(ms->io_client)) { 983 ti->error = "Error creating dm_io client"; 984 + mempool_destroy(ms->read_record_pool); 985 kfree(ms); 986 return NULL; 987 } ··· 977 if (rh_init(&ms->rh, ms, dl, region_size, ms->nr_regions)) { 978 ti->error = "Error creating dirty region hash"; 979 dm_io_client_destroy(ms->io_client); 980 + mempool_destroy(ms->read_record_pool); 981 kfree(ms); 982 return NULL; 983 } ··· 992 993 dm_io_client_destroy(ms->io_client); 994 rh_exit(&ms->rh); 995 + mempool_destroy(ms->read_record_pool); 996 kfree(ms); 997 } 998 ··· 1019 } 1020 1021 ms->mirror[mirror].ms = ms; 1022 + atomic_set(&(ms->mirror[mirror].error_count), 0); 1023 + ms->mirror[mirror].error_type = 0; 1024 ms->mirror[mirror].offset = offset; 1025 1026 return 0; ··· 1171 goto err_free_context; 1172 } 1173 INIT_WORK(&ms->kmirrord_work, do_mirror); 1174 + INIT_WORK(&ms->trigger_event, trigger_event); 1175 1176 r = parse_features(ms, argc, argv, &args_used); 1177 if (r) ··· 1220 1221 static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw) 1222 { 1223 + unsigned long flags; 1224 int should_wake = 0; 1225 struct bio_list *bl; 1226 1227 bl = (rw == WRITE) ? &ms->writes : &ms->reads; 1228 + spin_lock_irqsave(&ms->lock, flags); 1229 should_wake = !(bl->head); 1230 bio_list_add(bl, bio); 1231 + spin_unlock_irqrestore(&ms->lock, flags); 1232 1233 if (should_wake) 1234 wake(ms); ··· 1242 int r, rw = bio_rw(bio); 1243 struct mirror *m; 1244 struct mirror_set *ms = ti->private; 1245 + struct dm_raid1_read_record *read_record = NULL; 1246 1247 if (rw == WRITE) { 1248 + /* Save region for mirror_end_io() handler */ 1249 + map_context->ll = bio_to_region(&ms->rh, bio); 1250 queue_bio(ms, bio, rw); 1251 return DM_MAPIO_SUBMITTED; 1252 } ··· 1255 if (r < 0 && r != -EWOULDBLOCK) 1256 return r; 1257 1258 /* 1259 + * If region is not in-sync queue the bio. 1260 */ 1261 + if (!r || (r == -EWOULDBLOCK)) { 1262 + if (rw == READA) 1263 + return -EWOULDBLOCK; 1264 1265 queue_bio(ms, bio, rw); 1266 return DM_MAPIO_SUBMITTED; 1267 } 1268 1269 + /* 1270 + * The region is in-sync and we can perform reads directly. 1271 + * Store enough information so we can retry if it fails. 
1272 + */ 1273 m = choose_mirror(ms, bio->bi_sector); 1274 + if (unlikely(!m)) 1275 return -EIO; 1276 1277 + read_record = mempool_alloc(ms->read_record_pool, GFP_NOIO); 1278 + if (likely(read_record)) { 1279 + dm_bio_record(&read_record->details, bio); 1280 + map_context->ptr = read_record; 1281 + read_record->m = m; 1282 + } 1283 + 1284 + map_bio(m, bio); 1285 + 1286 return DM_MAPIO_REMAPPED; 1287 } 1288 ··· 1285 { 1286 int rw = bio_rw(bio); 1287 struct mirror_set *ms = (struct mirror_set *) ti->private; 1288 + struct mirror *m = NULL; 1289 + struct dm_bio_details *bd = NULL; 1290 + struct dm_raid1_read_record *read_record = map_context->ptr; 1291 1292 /* 1293 * We need to dec pending if this was a write. 1294 */ 1295 + if (rw == WRITE) { 1296 + rh_dec(&ms->rh, map_context->ll); 1297 + return error; 1298 + } 1299 1300 + if (error == -EOPNOTSUPP) 1301 + goto out; 1302 + 1303 + if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio)) 1304 + goto out; 1305 + 1306 + if (unlikely(error)) { 1307 + if (!read_record) { 1308 + /* 1309 + * There wasn't enough memory to record necessary 1310 + * information for a retry or there was no other 1311 + * mirror in-sync. 1312 + */ 1313 + DMERR_LIMIT("Mirror read failed from %s.", 1314 + m->dev->name); 1315 + return -EIO; 1316 + } 1317 + DMERR("Mirror read failed from %s. Trying alternative device.", 1318 + m->dev->name); 1319 + 1320 + m = read_record->m; 1321 + fail_mirror(m, DM_RAID1_READ_ERROR); 1322 + 1323 + /* 1324 + * A failed read is requeued for another attempt using an intact 1325 + * mirror. 1326 + */ 1327 + if (default_ok(m) || mirror_available(ms, bio)) { 1328 + bd = &read_record->details; 1329 + 1330 + dm_bio_restore(bd, bio); 1331 + mempool_free(read_record, ms->read_record_pool); 1332 + map_context->ptr = NULL; 1333 + queue_bio(ms, bio, rw); 1334 + return 1; 1335 + } 1336 + DMERR("All replicated volumes dead, failing I/O"); 1337 + } 1338 + 1339 + out: 1340 + if (read_record) { 1341 + mempool_free(read_record, ms->read_record_pool); 1342 + map_context->ptr = NULL; 1343 + } 1344 + 1345 + return error; 1346 + } 1347 + 1348 + static void mirror_presuspend(struct dm_target *ti) 1349 + { 1350 + struct mirror_set *ms = (struct mirror_set *) ti->private; 1351 + struct dirty_log *log = ms->rh.log; 1352 + 1353 + atomic_set(&ms->suspend, 1); 1354 + 1355 + /* 1356 + * We must finish up all the work that we've 1357 + * generated (i.e. recovery work). 1358 + */ 1359 + rh_stop_recovery(&ms->rh); 1360 + 1361 + wait_event(_kmirrord_recovery_stopped, 1362 + !atomic_read(&ms->rh.recovery_in_flight)); 1363 + 1364 + if (log->type->presuspend && log->type->presuspend(log)) 1365 + /* FIXME: need better error handling */ 1366 + DMWARN("log presuspend failed"); 1367 + 1368 + /* 1369 + * Now that recovery is complete/stopped and the 1370 + * delayed bios are queued, we need to wait for 1371 + * the worker thread to complete. This way, 1372 + * we know that all of our I/O has been pushed. 
1373 + */ 1374 + flush_workqueue(ms->kmirrord_wq); 1375 } 1376 1377 static void mirror_postsuspend(struct dm_target *ti) 1378 { 1379 + struct mirror_set *ms = ti->private; 1380 struct dirty_log *log = ms->rh.log; 1381 1382 if (log->type->postsuspend && log->type->postsuspend(log)) 1383 /* FIXME: need better error handling */ 1384 + DMWARN("log postsuspend failed"); 1385 } 1386 1387 static void mirror_resume(struct dm_target *ti) 1388 { 1389 + struct mirror_set *ms = ti->private; 1390 struct dirty_log *log = ms->rh.log; 1391 + 1392 + atomic_set(&ms->suspend, 0); 1393 if (log->type->resume && log->type->resume(log)) 1394 /* FIXME: need better error handling */ 1395 DMWARN("log resume failed"); 1396 rh_start_recovery(&ms->rh); 1397 } 1398 1399 + /* 1400 + * device_status_char 1401 + * @m: mirror device/leg we want the status of 1402 + * 1403 + * We return one character representing the most severe error 1404 + * we have encountered. 1405 + * A => Alive - No failures 1406 + * D => Dead - A write failure occurred leaving mirror out-of-sync 1407 + * S => Sync - A sychronization failure occurred, mirror out-of-sync 1408 + * R => Read - A read failure occurred, mirror data unaffected 1409 + * 1410 + * Returns: <char> 1411 + */ 1412 + static char device_status_char(struct mirror *m) 1413 + { 1414 + if (!atomic_read(&(m->error_count))) 1415 + return 'A'; 1416 + 1417 + return (test_bit(DM_RAID1_WRITE_ERROR, &(m->error_type))) ? 'D' : 1418 + (test_bit(DM_RAID1_SYNC_ERROR, &(m->error_type))) ? 'S' : 1419 + (test_bit(DM_RAID1_READ_ERROR, &(m->error_type))) ? 'R' : 'U'; 1420 + } 1421 + 1422 + 1423 static int mirror_status(struct dm_target *ti, status_type_t type, 1424 char *result, unsigned int maxlen) 1425 { 1426 unsigned int m, sz = 0; 1427 struct mirror_set *ms = (struct mirror_set *) ti->private; 1428 + struct dirty_log *log = ms->rh.log; 1429 + char buffer[ms->nr_mirrors + 1]; 1430 1431 switch (type) { 1432 case STATUSTYPE_INFO: 1433 DMEMIT("%d ", ms->nr_mirrors); 1434 + for (m = 0; m < ms->nr_mirrors; m++) { 1435 DMEMIT("%s ", ms->mirror[m].dev->name); 1436 + buffer[m] = device_status_char(&(ms->mirror[m])); 1437 + } 1438 + buffer[m] = '\0'; 1439 1440 + DMEMIT("%llu/%llu 1 %s ", 1441 + (unsigned long long)log->type->get_sync_count(ms->rh.log), 1442 + (unsigned long long)ms->nr_regions, buffer); 1443 1444 + sz += log->type->status(ms->rh.log, type, result+sz, maxlen-sz); 1445 1446 break; 1447 1448 case STATUSTYPE_TABLE: 1449 + sz = log->type->status(ms->rh.log, type, result, maxlen); 1450 1451 DMEMIT("%d", ms->nr_mirrors); 1452 for (m = 0; m < ms->nr_mirrors; m++) 1453 DMEMIT(" %s %llu", ms->mirror[m].dev->name, 1454 + (unsigned long long)ms->mirror[m].offset); 1455 1456 if (ms->features & DM_RAID1_HANDLE_ERRORS) 1457 DMEMIT(" 1 handle_errors"); ··· 1360 1361 static struct target_type mirror_target = { 1362 .name = "mirror", 1363 + .version = {1, 0, 20}, 1364 .module = THIS_MODULE, 1365 .ctr = mirror_ctr, 1366 .dtr = mirror_dtr, 1367 .map = mirror_map, 1368 .end_io = mirror_end_io, 1369 + .presuspend = mirror_presuspend, 1370 .postsuspend = mirror_postsuspend, 1371 .resume = mirror_resume, 1372 .status = mirror_status,
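The write completion path in the dm-raid1.c hunks above treats the dm-io completion code as a per-leg bitmap: bit i set means the write to mirror i failed, each failed leg is marked with DM_RAID1_WRITE_ERROR, and the bio only fails outright when no leg succeeded. The following standalone userspace sketch walks the same bitmap; the struct and helper names are illustrative, not the kernel's.

    /* Sketch of the per-leg write-error bitmap walk used by write_callback()
     * above.  Only the bit-testing logic mirrors the diff; everything else is
     * a simplified stand-in. */
    #include <stdio.h>
    #include <errno.h>

    #define NR_MIRRORS 3

    struct mirror_leg {
        int write_error;        /* set when this leg saw a write failure */
    };

    /* Returns 0 if at least one leg took the write, -EIO otherwise. */
    static int handle_write_errors(unsigned long error_bits,
                                   struct mirror_leg *legs, unsigned nr)
    {
        unsigned i;
        int uptodate = 0;

        for (i = 0; i < nr; i++) {
            if (error_bits & (1UL << i))
                legs[i].write_error = 1;   /* fail this mirror leg */
            else
                uptodate = 1;              /* this leg succeeded */
        }

        return uptodate ? 0 : -EIO;
    }

    int main(void)
    {
        struct mirror_leg legs[NR_MIRRORS] = { { 0 } };

        /* bit 1 set: the write to the second leg failed */
        int r = handle_write_errors(0x2UL, legs, NR_MIRRORS);

        printf("result=%d leg states: %d %d %d\n", r,
               legs[0].write_error, legs[1].write_error, legs[2].write_error);
        return 0;
    }

As in the diff, a completely failed write is the only case that returns -EIO immediately; a partial failure is handed off (to the worker thread, in the real code) so the region can be marked out-of-sync instead of silently staying clean.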
+69 -26
drivers/md/dm-snap.c
··· 213 214 /* 215 * Implementation of the exception hash tables. 216 */ 217 - static int init_exception_table(struct exception_table *et, uint32_t size) 218 { 219 unsigned int i; 220 221 et->hash_mask = size - 1; 222 et->table = dm_vcalloc(size, sizeof(struct list_head)); 223 if (!et->table) ··· 252 253 static uint32_t exception_hash(struct exception_table *et, chunk_t chunk) 254 { 255 - return chunk & et->hash_mask; 256 } 257 258 static void insert_exception(struct exception_table *eh, ··· 279 280 slot = &et->table[exception_hash(et, chunk)]; 281 list_for_each_entry (e, slot, hash_list) 282 - if (e->old_chunk == chunk) 283 return e; 284 285 return NULL; ··· 312 mempool_free(pe, pending_pool); 313 } 314 315 int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new) 316 { 317 struct dm_snap_exception *e; ··· 364 return -ENOMEM; 365 366 e->old_chunk = old; 367 e->new_chunk = new; 368 - insert_exception(&s->complete, e); 369 return 0; 370 } 371 ··· 383 mem /= sizeof(struct list_head); 384 385 return mem; 386 - } 387 - 388 - /* 389 - * Rounds a number down to a power of 2. 390 - */ 391 - static uint32_t round_down(uint32_t n) 392 - { 393 - while (n & (n - 1)) 394 - n &= (n - 1); 395 - return n; 396 } 397 398 /* ··· 403 hash_size = min(origin_dev_size, cow_dev_size) >> s->chunk_shift; 404 hash_size = min(hash_size, max_buckets); 405 406 - /* Round it down to a power of 2 */ 407 - hash_size = round_down(hash_size); 408 - if (init_exception_table(&s->complete, hash_size)) 409 return -ENOMEM; 410 411 /* ··· 416 if (hash_size < 64) 417 hash_size = 64; 418 419 - if (init_exception_table(&s->pending, hash_size)) { 420 exit_exception_table(&s->complete, exception_cache); 421 return -ENOMEM; 422 } ··· 775 * Add a proper exception, and remove the 776 * in-flight exception from the list. 777 */ 778 - insert_exception(&s->complete, e); 779 780 out: 781 remove_exception(&pe->e); ··· 909 } 910 911 static void remap_exception(struct dm_snapshot *s, struct dm_snap_exception *e, 912 - struct bio *bio) 913 { 914 bio->bi_bdev = s->cow->bdev; 915 - bio->bi_sector = chunk_to_sector(s, e->new_chunk) + 916 - (bio->bi_sector & s->chunk_mask); 917 } 918 919 static int snapshot_map(struct dm_target *ti, struct bio *bio, ··· 945 /* If the block is already remapped - use that, else remap it */ 946 e = lookup_exception(&s->complete, chunk); 947 if (e) { 948 - remap_exception(s, e, bio); 949 goto out_unlock; 950 } 951 ··· 962 goto out_unlock; 963 } 964 965 - remap_exception(s, &pe->e, bio); 966 bio_list_add(&pe->snapshot_bios, bio); 967 968 r = DM_MAPIO_SUBMITTED; ··· 1250 1251 static struct target_type origin_target = { 1252 .name = "snapshot-origin", 1253 - .version = {1, 5, 0}, 1254 .module = THIS_MODULE, 1255 .ctr = origin_ctr, 1256 .dtr = origin_dtr, ··· 1261 1262 static struct target_type snapshot_target = { 1263 .name = "snapshot", 1264 - .version = {1, 5, 0}, 1265 .module = THIS_MODULE, 1266 .ctr = snapshot_ctr, 1267 .dtr = snapshot_dtr,
··· 213 214 /* 215 * Implementation of the exception hash tables. 216 + * The lowest hash_shift bits of the chunk number are ignored, allowing 217 + * some consecutive chunks to be grouped together. 218 */ 219 + static int init_exception_table(struct exception_table *et, uint32_t size, 220 + unsigned hash_shift) 221 { 222 unsigned int i; 223 224 + et->hash_shift = hash_shift; 225 et->hash_mask = size - 1; 226 et->table = dm_vcalloc(size, sizeof(struct list_head)); 227 if (!et->table) ··· 248 249 static uint32_t exception_hash(struct exception_table *et, chunk_t chunk) 250 { 251 + return (chunk >> et->hash_shift) & et->hash_mask; 252 } 253 254 static void insert_exception(struct exception_table *eh, ··· 275 276 slot = &et->table[exception_hash(et, chunk)]; 277 list_for_each_entry (e, slot, hash_list) 278 + if (chunk >= e->old_chunk && 279 + chunk <= e->old_chunk + dm_consecutive_chunk_count(e)) 280 return e; 281 282 return NULL; ··· 307 mempool_free(pe, pending_pool); 308 } 309 310 + static void insert_completed_exception(struct dm_snapshot *s, 311 + struct dm_snap_exception *new_e) 312 + { 313 + struct exception_table *eh = &s->complete; 314 + struct list_head *l; 315 + struct dm_snap_exception *e = NULL; 316 + 317 + l = &eh->table[exception_hash(eh, new_e->old_chunk)]; 318 + 319 + /* Add immediately if this table doesn't support consecutive chunks */ 320 + if (!eh->hash_shift) 321 + goto out; 322 + 323 + /* List is ordered by old_chunk */ 324 + list_for_each_entry_reverse(e, l, hash_list) { 325 + /* Insert after an existing chunk? */ 326 + if (new_e->old_chunk == (e->old_chunk + 327 + dm_consecutive_chunk_count(e) + 1) && 328 + new_e->new_chunk == (dm_chunk_number(e->new_chunk) + 329 + dm_consecutive_chunk_count(e) + 1)) { 330 + dm_consecutive_chunk_count_inc(e); 331 + free_exception(new_e); 332 + return; 333 + } 334 + 335 + /* Insert before an existing chunk? */ 336 + if (new_e->old_chunk == (e->old_chunk - 1) && 337 + new_e->new_chunk == (dm_chunk_number(e->new_chunk) - 1)) { 338 + dm_consecutive_chunk_count_inc(e); 339 + e->old_chunk--; 340 + e->new_chunk--; 341 + free_exception(new_e); 342 + return; 343 + } 344 + 345 + if (new_e->old_chunk > e->old_chunk) 346 + break; 347 + } 348 + 349 + out: 350 + list_add(&new_e->hash_list, e ? &e->hash_list : l); 351 + } 352 + 353 int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new) 354 { 355 struct dm_snap_exception *e; ··· 316 return -ENOMEM; 317 318 e->old_chunk = old; 319 + 320 + /* Consecutive_count is implicitly initialised to zero */ 321 e->new_chunk = new; 322 + 323 + insert_completed_exception(s, e); 324 + 325 return 0; 326 } 327 ··· 331 mem /= sizeof(struct list_head); 332 333 return mem; 334 } 335 336 /* ··· 361 hash_size = min(origin_dev_size, cow_dev_size) >> s->chunk_shift; 362 hash_size = min(hash_size, max_buckets); 363 364 + hash_size = rounddown_pow_of_two(hash_size); 365 + if (init_exception_table(&s->complete, hash_size, 366 + DM_CHUNK_CONSECUTIVE_BITS)) 367 return -ENOMEM; 368 369 /* ··· 374 if (hash_size < 64) 375 hash_size = 64; 376 377 + if (init_exception_table(&s->pending, hash_size, 0)) { 378 exit_exception_table(&s->complete, exception_cache); 379 return -ENOMEM; 380 } ··· 733 * Add a proper exception, and remove the 734 * in-flight exception from the list. 
735 */ 736 + insert_completed_exception(s, e); 737 738 out: 739 remove_exception(&pe->e); ··· 867 } 868 869 static void remap_exception(struct dm_snapshot *s, struct dm_snap_exception *e, 870 + struct bio *bio, chunk_t chunk) 871 { 872 bio->bi_bdev = s->cow->bdev; 873 + bio->bi_sector = chunk_to_sector(s, dm_chunk_number(e->new_chunk) + 874 + (chunk - e->old_chunk)) + 875 + (bio->bi_sector & s->chunk_mask); 876 } 877 878 static int snapshot_map(struct dm_target *ti, struct bio *bio, ··· 902 /* If the block is already remapped - use that, else remap it */ 903 e = lookup_exception(&s->complete, chunk); 904 if (e) { 905 + remap_exception(s, e, bio, chunk); 906 goto out_unlock; 907 } 908 ··· 919 goto out_unlock; 920 } 921 922 + remap_exception(s, &pe->e, bio, chunk); 923 bio_list_add(&pe->snapshot_bios, bio); 924 925 r = DM_MAPIO_SUBMITTED; ··· 1207 1208 static struct target_type origin_target = { 1209 .name = "snapshot-origin", 1210 + .version = {1, 6, 0}, 1211 .module = THIS_MODULE, 1212 .ctr = origin_ctr, 1213 .dtr = origin_dtr, ··· 1218 1219 static struct target_type snapshot_target = { 1220 .name = "snapshot", 1221 + .version = {1, 6, 0}, 1222 .module = THIS_MODULE, 1223 .ctr = snapshot_ctr, 1224 .dtr = snapshot_dtr,
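The snapshot change above only coalesces a new exception into an existing run when both its origin chunk and its COW chunk are immediately adjacent to that run, either at the front or at the back. A minimal userspace sketch of that adjacency test follows; the `run` struct is an illustrative stand-in for the packed exception entry, not the kernel structure.

    /* Sketch of the adjacency checks insert_completed_exception() applies
     * before extending an existing run of consecutive chunks. */
    #include <stdio.h>
    #include <stdint.h>

    struct run {
        uint64_t old_chunk;   /* first origin chunk of the run */
        uint64_t new_chunk;   /* first COW chunk of the run */
        unsigned count;       /* extra consecutive chunks after the first */
    };

    /* Try to append (old,new) to the end of an existing run. */
    static int merge_after(struct run *r, uint64_t old, uint64_t new)
    {
        if (old == r->old_chunk + r->count + 1 &&
            new == r->new_chunk + r->count + 1) {
            r->count++;
            return 1;
        }
        return 0;
    }

    /* Try to prepend (old,new) in front of an existing run. */
    static int merge_before(struct run *r, uint64_t old, uint64_t new)
    {
        if (old == r->old_chunk - 1 && new == r->new_chunk - 1) {
            r->old_chunk--;
            r->new_chunk--;
            r->count++;
            return 1;
        }
        return 0;
    }

    int main(void)
    {
        struct run r = { .old_chunk = 10, .new_chunk = 100, .count = 0 };

        merge_after(&r, 11, 101);   /* run becomes 10-11 -> 100-101 */
        merge_before(&r, 9, 99);    /* run becomes 9-11 -> 99-101 */
        merge_after(&r, 20, 200);   /* not adjacent: would need a new entry */

        printf("run: old=%llu new=%llu count=%u\n",
               (unsigned long long)r.old_chunk,
               (unsigned long long)r.new_chunk, r.count);
        return 0;
    }

Requiring both chunk numbers to be adjacent is what lets remap_exception() compute the target sector as new_chunk + (chunk - old_chunk) without storing one entry per chunk.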
+48 -2
drivers/md/dm-snap.h
··· 16 17 struct exception_table { 18 uint32_t hash_mask; 19 struct list_head *table; 20 }; 21 22 /* 23 * The snapshot code deals with largish chunks of the disk at a 24 - * time. Typically 64k - 256k. 25 */ 26 - /* FIXME: can we get away with limiting these to a uint32_t ? */ 27 typedef sector_t chunk_t; 28 29 /* 30 * An exception is used where an old chunk of data has been 31 * replaced by a new one. 32 */ 33 struct dm_snap_exception { 34 struct list_head hash_list; ··· 39 chunk_t old_chunk; 40 chunk_t new_chunk; 41 }; 42 43 /* 44 * Abstraction to handle the meta/layout of exception stores (the
··· 16 17 struct exception_table { 18 uint32_t hash_mask; 19 + unsigned hash_shift; 20 struct list_head *table; 21 }; 22 23 /* 24 * The snapshot code deals with largish chunks of the disk at a 25 + * time. Typically 32k - 512k. 26 */ 27 typedef sector_t chunk_t; 28 29 /* 30 * An exception is used where an old chunk of data has been 31 * replaced by a new one. 32 + * If chunk_t is 64 bits in size, the top 8 bits of new_chunk hold the number 33 + * of chunks that follow contiguously. Remaining bits hold the number of the 34 + * chunk within the device. 35 */ 36 struct dm_snap_exception { 37 struct list_head hash_list; ··· 36 chunk_t old_chunk; 37 chunk_t new_chunk; 38 }; 39 + 40 + /* 41 + * Funtions to manipulate consecutive chunks 42 + */ 43 + # if defined(CONFIG_LBD) || (BITS_PER_LONG == 64) 44 + # define DM_CHUNK_CONSECUTIVE_BITS 8 45 + # define DM_CHUNK_NUMBER_BITS 56 46 + 47 + static inline chunk_t dm_chunk_number(chunk_t chunk) 48 + { 49 + return chunk & (chunk_t)((1ULL << DM_CHUNK_NUMBER_BITS) - 1ULL); 50 + } 51 + 52 + static inline unsigned dm_consecutive_chunk_count(struct dm_snap_exception *e) 53 + { 54 + return e->new_chunk >> DM_CHUNK_NUMBER_BITS; 55 + } 56 + 57 + static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e) 58 + { 59 + e->new_chunk += (1ULL << DM_CHUNK_NUMBER_BITS); 60 + 61 + BUG_ON(!dm_consecutive_chunk_count(e)); 62 + } 63 + 64 + # else 65 + # define DM_CHUNK_CONSECUTIVE_BITS 0 66 + 67 + static inline chunk_t dm_chunk_number(chunk_t chunk) 68 + { 69 + return chunk; 70 + } 71 + 72 + static inline unsigned dm_consecutive_chunk_count(struct dm_snap_exception *e) 73 + { 74 + return 0; 75 + } 76 + 77 + static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e) 78 + { 79 + } 80 + 81 + # endif 82 83 /* 84 * Abstraction to handle the meta/layout of exception stores (the
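When chunk_t is 64 bits, the header above steals the top 8 bits of new_chunk for the consecutive-chunk count and leaves 56 bits for the chunk number. The arithmetic can be demonstrated in a few lines of userspace C; the constants below mirror the header, but this is a standalone illustration rather than the kernel code.

    /* Demonstration of the 56/8-bit split used for new_chunk: low 56 bits hold
     * the chunk number, top 8 bits count how many chunks follow contiguously. */
    #include <stdio.h>
    #include <stdint.h>

    #define NUMBER_BITS 56

    static uint64_t chunk_number(uint64_t chunk)
    {
        return chunk & ((1ULL << NUMBER_BITS) - 1ULL);
    }

    static unsigned consecutive_count(uint64_t chunk)
    {
        return chunk >> NUMBER_BITS;
    }

    int main(void)
    {
        uint64_t new_chunk = 12345;            /* count implicitly zero */

        new_chunk += 1ULL << NUMBER_BITS;      /* one more consecutive chunk */
        new_chunk += 1ULL << NUMBER_BITS;      /* and another */

        printf("chunk=%llu consecutive=%u\n",
               (unsigned long long)chunk_number(new_chunk),
               consecutive_count(new_chunk));
        return 0;
    }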
+103 -2
drivers/md/dm-stripe.c
··· 14 #include <linux/log2.h> 15 16 #define DM_MSG_PREFIX "striped" 17 18 struct stripe { 19 struct dm_dev *dev; 20 sector_t physical_start; 21 }; 22 23 struct stripe_c { ··· 33 uint32_t chunk_shift; 34 sector_t chunk_mask; 35 36 struct stripe stripe[0]; 37 }; 38 39 static inline struct stripe_c *alloc_context(unsigned int stripes) 40 { ··· 86 return -ENXIO; 87 88 sc->stripe[stripe].physical_start = start; 89 return 0; 90 } 91 ··· 159 return -ENOMEM; 160 } 161 162 sc->stripes = stripes; 163 sc->stripe_width = width; 164 ti->split_io = chunk_size; ··· 187 kfree(sc); 188 return r; 189 } 190 } 191 192 ti->private = sc; 193 return 0; 194 } 195 ··· 203 for (i = 0; i < sc->stripes; i++) 204 dm_put_device(ti, sc->stripe[i].dev); 205 206 kfree(sc); 207 } 208 ··· 222 return DM_MAPIO_REMAPPED; 223 } 224 225 static int stripe_status(struct dm_target *ti, 226 status_type_t type, char *result, unsigned int maxlen) 227 { 228 struct stripe_c *sc = (struct stripe_c *) ti->private; 229 unsigned int sz = 0; 230 unsigned int i; 231 232 switch (type) { 233 case STATUSTYPE_INFO: 234 - result[0] = '\0'; 235 break; 236 237 case STATUSTYPE_TABLE: ··· 266 return 0; 267 } 268 269 static struct target_type stripe_target = { 270 .name = "striped", 271 - .version= {1, 0, 2}, 272 .module = THIS_MODULE, 273 .ctr = stripe_ctr, 274 .dtr = stripe_dtr, 275 .map = stripe_map, 276 .status = stripe_status, 277 }; 278 ··· 323 if (r < 0) 324 DMWARN("target registration failed"); 325 326 return r; 327 } 328 ··· 337 { 338 if (dm_unregister_target(&stripe_target)) 339 DMWARN("target unregistration failed"); 340 341 return; 342 }
··· 14 #include <linux/log2.h> 15 16 #define DM_MSG_PREFIX "striped" 17 + #define DM_IO_ERROR_THRESHOLD 15 18 19 struct stripe { 20 struct dm_dev *dev; 21 sector_t physical_start; 22 + 23 + atomic_t error_count; 24 }; 25 26 struct stripe_c { ··· 30 uint32_t chunk_shift; 31 sector_t chunk_mask; 32 33 + /* Needed for handling events */ 34 + struct dm_target *ti; 35 + 36 + /* Work struct used for triggering events*/ 37 + struct work_struct kstriped_ws; 38 + 39 struct stripe stripe[0]; 40 }; 41 + 42 + static struct workqueue_struct *kstriped; 43 + 44 + /* 45 + * An event is triggered whenever a drive 46 + * drops out of a stripe volume. 47 + */ 48 + static void trigger_event(struct work_struct *work) 49 + { 50 + struct stripe_c *sc = container_of(work, struct stripe_c, kstriped_ws); 51 + 52 + dm_table_event(sc->ti->table); 53 + 54 + } 55 56 static inline struct stripe_c *alloc_context(unsigned int stripes) 57 { ··· 63 return -ENXIO; 64 65 sc->stripe[stripe].physical_start = start; 66 + 67 return 0; 68 } 69 ··· 135 return -ENOMEM; 136 } 137 138 + INIT_WORK(&sc->kstriped_ws, trigger_event); 139 + 140 + /* Set pointer to dm target; used in trigger_event */ 141 + sc->ti = ti; 142 + 143 sc->stripes = stripes; 144 sc->stripe_width = width; 145 ti->split_io = chunk_size; ··· 158 kfree(sc); 159 return r; 160 } 161 + atomic_set(&(sc->stripe[i].error_count), 0); 162 } 163 164 ti->private = sc; 165 + 166 return 0; 167 } 168 ··· 172 for (i = 0; i < sc->stripes; i++) 173 dm_put_device(ti, sc->stripe[i].dev); 174 175 + flush_workqueue(kstriped); 176 kfree(sc); 177 } 178 ··· 190 return DM_MAPIO_REMAPPED; 191 } 192 193 + /* 194 + * Stripe status: 195 + * 196 + * INFO 197 + * #stripes [stripe_name <stripe_name>] [group word count] 198 + * [error count 'A|D' <error count 'A|D'>] 199 + * 200 + * TABLE 201 + * #stripes [stripe chunk size] 202 + * [stripe_name physical_start <stripe_name physical_start>] 203 + * 204 + */ 205 + 206 static int stripe_status(struct dm_target *ti, 207 status_type_t type, char *result, unsigned int maxlen) 208 { 209 struct stripe_c *sc = (struct stripe_c *) ti->private; 210 + char buffer[sc->stripes + 1]; 211 unsigned int sz = 0; 212 unsigned int i; 213 214 switch (type) { 215 case STATUSTYPE_INFO: 216 + DMEMIT("%d ", sc->stripes); 217 + for (i = 0; i < sc->stripes; i++) { 218 + DMEMIT("%s ", sc->stripe[i].dev->name); 219 + buffer[i] = atomic_read(&(sc->stripe[i].error_count)) ? 220 + 'D' : 'A'; 221 + } 222 + buffer[i] = '\0'; 223 + DMEMIT("1 %s", buffer); 224 break; 225 226 case STATUSTYPE_TABLE: ··· 213 return 0; 214 } 215 216 + static int stripe_end_io(struct dm_target *ti, struct bio *bio, 217 + int error, union map_info *map_context) 218 + { 219 + unsigned i; 220 + char major_minor[16]; 221 + struct stripe_c *sc = ti->private; 222 + 223 + if (!error) 224 + return 0; /* I/O complete */ 225 + 226 + if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio)) 227 + return error; 228 + 229 + if (error == -EOPNOTSUPP) 230 + return error; 231 + 232 + memset(major_minor, 0, sizeof(major_minor)); 233 + sprintf(major_minor, "%d:%d", 234 + bio->bi_bdev->bd_disk->major, 235 + bio->bi_bdev->bd_disk->first_minor); 236 + 237 + /* 238 + * Test to see which stripe drive triggered the event 239 + * and increment error count for all stripes on that device. 240 + * If the error count for a given device exceeds the threshold 241 + * value we will no longer trigger any further events. 
242 + */ 243 + for (i = 0; i < sc->stripes; i++) 244 + if (!strcmp(sc->stripe[i].dev->name, major_minor)) { 245 + atomic_inc(&(sc->stripe[i].error_count)); 246 + if (atomic_read(&(sc->stripe[i].error_count)) < 247 + DM_IO_ERROR_THRESHOLD) 248 + queue_work(kstriped, &sc->kstriped_ws); 249 + } 250 + 251 + return error; 252 + } 253 + 254 static struct target_type stripe_target = { 255 .name = "striped", 256 + .version = {1, 1, 0}, 257 .module = THIS_MODULE, 258 .ctr = stripe_ctr, 259 .dtr = stripe_dtr, 260 .map = stripe_map, 261 + .end_io = stripe_end_io, 262 .status = stripe_status, 263 }; 264 ··· 231 if (r < 0) 232 DMWARN("target registration failed"); 233 234 + kstriped = create_singlethread_workqueue("kstriped"); 235 + if (!kstriped) { 236 + DMERR("failed to create workqueue kstriped"); 237 + dm_unregister_target(&stripe_target); 238 + return -ENOMEM; 239 + } 240 + 241 return r; 242 } 243 ··· 238 { 239 if (dm_unregister_target(&stripe_target)) 240 DMWARN("target unregistration failed"); 241 + 242 + destroy_workqueue(kstriped); 243 244 return; 245 }
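The stripe end_io hunk above identifies the failing leg by formatting the bio's block device as a "major:minor" string and comparing it against each stripe device's name, bumping that leg's error count and only queueing an event while the count is below the threshold. The sketch below reproduces that bookkeeping in userspace; the structs and the event printout are simplified stand-ins.

    /* Sketch of the stripe_end_io() accounting: match the failing device by
     * "major:minor", bump its error count, and stop raising events once the
     * threshold is reached. */
    #include <stdio.h>
    #include <string.h>

    #define ERROR_THRESHOLD 15

    struct stripe_leg {
        char name[16];          /* "major:minor" of the backing device */
        int error_count;
    };

    static void note_io_error(struct stripe_leg *legs, unsigned nr,
                              int major, int minor)
    {
        char major_minor[16];
        unsigned i;

        snprintf(major_minor, sizeof(major_minor), "%d:%d", major, minor);

        for (i = 0; i < nr; i++)
            if (!strcmp(legs[i].name, major_minor)) {
                legs[i].error_count++;
                if (legs[i].error_count < ERROR_THRESHOLD)
                    printf("event: %s degraded\n", legs[i].name);
            }
    }

    int main(void)
    {
        struct stripe_leg legs[2] = { { "8:16", 0 }, { "8:32", 0 } };

        note_io_error(legs, 2, 8, 32);      /* failure on the second leg */

        /* Same 'A'live / 'D'egraded characters the status output emits. */
        printf("status: %c%c\n",
               legs[0].error_count ? 'D' : 'A',
               legs[1].error_count ? 'D' : 'A');
        return 0;
    }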
+9 -11
drivers/md/dm-table.c
··· 287 { 288 struct list_head *tmp, *next; 289 290 - for (tmp = devices->next; tmp != devices; tmp = next) { 291 struct dm_dev *dd = list_entry(tmp, struct dm_dev, list); 292 - next = tmp->next; 293 kfree(dd); 294 } 295 } ··· 475 int mode, struct dm_dev **result) 476 { 477 int r; 478 - dev_t dev; 479 struct dm_dev *dd; 480 unsigned int major, minor; 481 ··· 804 return -ENOMEM; 805 806 /* set up internal nodes, bottom-up */ 807 - for (i = t->depth - 2, total = 0; i >= 0; i--) { 808 t->index[i] = indexes; 809 indexes += (KEYS_PER_NODE * t->counts[i]); 810 setup_btree_index(i, t); ··· 992 993 int dm_table_any_congested(struct dm_table *t, int bdi_bits) 994 { 995 - struct list_head *d, *devices; 996 int r = 0; 997 998 - devices = dm_table_get_devices(t); 999 - for (d = devices->next; d != devices; d = d->next) { 1000 - struct dm_dev *dd = list_entry(d, struct dm_dev, list); 1001 struct request_queue *q = bdev_get_queue(dd->bdev); 1002 r |= bdi_congested(&q->backing_dev_info, bdi_bits); 1003 } ··· 1006 1007 void dm_table_unplug_all(struct dm_table *t) 1008 { 1009 - struct list_head *d, *devices = dm_table_get_devices(t); 1010 1011 - for (d = devices->next; d != devices; d = d->next) { 1012 - struct dm_dev *dd = list_entry(d, struct dm_dev, list); 1013 struct request_queue *q = bdev_get_queue(dd->bdev); 1014 1015 blk_unplug(q);
··· 287 { 288 struct list_head *tmp, *next; 289 290 + list_for_each_safe(tmp, next, devices) { 291 struct dm_dev *dd = list_entry(tmp, struct dm_dev, list); 292 kfree(dd); 293 } 294 } ··· 476 int mode, struct dm_dev **result) 477 { 478 int r; 479 + dev_t uninitialized_var(dev); 480 struct dm_dev *dd; 481 unsigned int major, minor; 482 ··· 805 return -ENOMEM; 806 807 /* set up internal nodes, bottom-up */ 808 + for (i = t->depth - 2; i >= 0; i--) { 809 t->index[i] = indexes; 810 indexes += (KEYS_PER_NODE * t->counts[i]); 811 setup_btree_index(i, t); ··· 993 994 int dm_table_any_congested(struct dm_table *t, int bdi_bits) 995 { 996 + struct dm_dev *dd; 997 + struct list_head *devices = dm_table_get_devices(t); 998 int r = 0; 999 1000 + list_for_each_entry(dd, devices, list) { 1001 struct request_queue *q = bdev_get_queue(dd->bdev); 1002 r |= bdi_congested(&q->backing_dev_info, bdi_bits); 1003 } ··· 1008 1009 void dm_table_unplug_all(struct dm_table *t) 1010 { 1011 + struct dm_dev *dd; 1012 + struct list_head *devices = dm_table_get_devices(t); 1013 1014 + list_for_each_entry(dd, devices, list) { 1015 struct request_queue *q = bdev_get_queue(dd->bdev); 1016 1017 blk_unplug(q);
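The dm-table.c hunks replace open-coded pointer walking with list_for_each_safe() and list_for_each_entry(). The "safe" variant matters in free_devices() because nodes are released mid-traversal, so the next pointer has to be captured before the current node is freed. A simplified analogue with a plain singly linked list (not the kernel's struct list_head):

    /* The same save-next-before-freeing pattern that list_for_each_safe()
     * encapsulates, written out against an ordinary linked list. */
    #include <stdio.h>
    #include <stdlib.h>

    struct dev_node {
        int id;
        struct dev_node *next;
    };

    static void free_devices(struct dev_node *head)
    {
        struct dev_node *cur, *next;

        for (cur = head; cur; cur = next) {
            next = cur->next;       /* grab it before the node is freed */
            printf("freeing device %d\n", cur->id);
            free(cur);
        }
    }

    int main(void)
    {
        struct dev_node *head = NULL;

        for (int i = 3; i > 0; i--) {
            struct dev_node *n = malloc(sizeof(*n));
            n->id = i;
            n->next = head;
            head = n;
        }

        free_devices(head);
        return 0;
    }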
+149 -89
drivers/md/dm.c
··· 71 #define DMF_DELETING 4 72 #define DMF_NOFLUSH_SUSPENDING 5 73 74 struct mapped_device { 75 struct rw_semaphore io_lock; 76 - struct semaphore suspend_lock; 77 spinlock_t pushback_lock; 78 rwlock_t map_lock; 79 atomic_t holders; ··· 107 wait_queue_head_t wait; 108 struct bio_list deferred; 109 struct bio_list pushback; 110 111 /* 112 * The current mapping. ··· 199 DMINFO("cleaned up"); 200 } 201 202 - int (*_inits[])(void) __initdata = { 203 local_init, 204 dm_target_init, 205 dm_linear_init, ··· 207 dm_interface_init, 208 }; 209 210 - void (*_exits[])(void) = { 211 local_exit, 212 dm_target_exit, 213 dm_linear_exit, ··· 1000 } 1001 1002 if (!try_module_get(THIS_MODULE)) 1003 - goto bad0; 1004 1005 /* get a minor number for the dev */ 1006 if (minor == DM_ANY_MINOR) ··· 1008 else 1009 r = specific_minor(md, minor); 1010 if (r < 0) 1011 - goto bad1; 1012 1013 memset(md, 0, sizeof(*md)); 1014 init_rwsem(&md->io_lock); 1015 - init_MUTEX(&md->suspend_lock); 1016 spin_lock_init(&md->pushback_lock); 1017 rwlock_init(&md->map_lock); 1018 atomic_set(&md->holders, 1); ··· 1024 1025 md->queue = blk_alloc_queue(GFP_KERNEL); 1026 if (!md->queue) 1027 - goto bad1_free_minor; 1028 1029 md->queue->queuedata = md; 1030 md->queue->backing_dev_info.congested_fn = dm_any_congested; ··· 1035 1036 md->io_pool = mempool_create_slab_pool(MIN_IOS, _io_cache); 1037 if (!md->io_pool) 1038 - goto bad2; 1039 1040 md->tio_pool = mempool_create_slab_pool(MIN_IOS, _tio_cache); 1041 if (!md->tio_pool) 1042 - goto bad3; 1043 1044 md->bs = bioset_create(16, 16); 1045 if (!md->bs) ··· 1047 1048 md->disk = alloc_disk(1); 1049 if (!md->disk) 1050 - goto bad4; 1051 1052 atomic_set(&md->pending, 0); 1053 init_waitqueue_head(&md->wait); ··· 1062 add_disk(md->disk); 1063 format_dev_t(md->name, MKDEV(_major, minor)); 1064 1065 /* Populate the mapping, nobody knows we exist yet */ 1066 spin_lock(&_minor_lock); 1067 old_md = idr_replace(&_minor_idr, md, minor); ··· 1075 1076 return md; 1077 1078 - bad4: 1079 bioset_free(md->bs); 1080 - bad_no_bioset: 1081 mempool_destroy(md->tio_pool); 1082 - bad3: 1083 mempool_destroy(md->io_pool); 1084 - bad2: 1085 blk_cleanup_queue(md->queue); 1086 - bad1_free_minor: 1087 free_minor(minor); 1088 - bad1: 1089 module_put(THIS_MODULE); 1090 - bad0: 1091 kfree(md); 1092 return NULL; 1093 } ··· 1104 unlock_fs(md); 1105 bdput(md->suspended_bdev); 1106 } 1107 mempool_destroy(md->tio_pool); 1108 mempool_destroy(md->io_pool); 1109 bioset_free(md->bs); ··· 1284 } 1285 EXPORT_SYMBOL_GPL(dm_put); 1286 1287 /* 1288 * Process the deferred bios 1289 */ 1290 - static void __flush_deferred_io(struct mapped_device *md, struct bio *c) 1291 { 1292 - struct bio *n; 1293 1294 - while (c) { 1295 - n = c->bi_next; 1296 - c->bi_next = NULL; 1297 if (__split_bio(md, c)) 1298 bio_io_error(c); 1299 - c = n; 1300 } 1301 } 1302 1303 /* ··· 1378 { 1379 int r = -EINVAL; 1380 1381 - down(&md->suspend_lock); 1382 1383 /* device must be suspended */ 1384 if (!dm_suspended(md)) ··· 1393 r = __bind(md, table); 1394 1395 out: 1396 - up(&md->suspend_lock); 1397 return r; 1398 } 1399 ··· 1442 int dm_suspend(struct mapped_device *md, unsigned suspend_flags) 1443 { 1444 struct dm_table *map = NULL; 1445 - unsigned long flags; 1446 DECLARE_WAITQUEUE(wait, current); 1447 - struct bio *def; 1448 - int r = -EINVAL; 1449 int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0; 1450 int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 
1 : 0; 1451 1452 - down(&md->suspend_lock); 1453 1454 - if (dm_suspended(md)) 1455 goto out_unlock; 1456 1457 map = dm_get_table(md); 1458 ··· 1474 r = -ENOMEM; 1475 goto flush_and_out; 1476 } 1477 - } 1478 1479 - /* 1480 - * Flush I/O to the device. 1481 - * noflush supersedes do_lockfs, because lock_fs() needs to flush I/Os. 1482 - */ 1483 - if (do_lockfs && !noflush) { 1484 - r = lock_fs(md); 1485 - if (r) 1486 - goto out; 1487 } 1488 1489 /* ··· 1500 dm_table_unplug_all(map); 1501 1502 /* 1503 - * Then we wait for the already mapped ios to 1504 - * complete. 1505 */ 1506 - while (1) { 1507 - set_current_state(TASK_INTERRUPTIBLE); 1508 - 1509 - if (!atomic_read(&md->pending) || signal_pending(current)) 1510 - break; 1511 - 1512 - io_schedule(); 1513 - } 1514 - set_current_state(TASK_RUNNING); 1515 1516 down_write(&md->io_lock); 1517 remove_wait_queue(&md->wait, &wait); 1518 1519 - if (noflush) { 1520 - spin_lock_irqsave(&md->pushback_lock, flags); 1521 - clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); 1522 - bio_list_merge_head(&md->deferred, &md->pushback); 1523 - bio_list_init(&md->pushback); 1524 - spin_unlock_irqrestore(&md->pushback_lock, flags); 1525 - } 1526 1527 /* were we interrupted ? */ 1528 - r = -EINTR; 1529 - if (atomic_read(&md->pending)) { 1530 - clear_bit(DMF_BLOCK_IO, &md->flags); 1531 - def = bio_list_get(&md->deferred); 1532 - __flush_deferred_io(md, def); 1533 - up_write(&md->io_lock); 1534 unlock_fs(md); 1535 goto out; /* pushback list is already flushed, so skip flush */ 1536 } 1537 - up_write(&md->io_lock); 1538 1539 dm_table_postsuspend_targets(map); 1540 1541 set_bit(DMF_SUSPENDED, &md->flags); 1542 1543 - r = 0; 1544 - 1545 flush_and_out: 1546 - if (r && noflush) { 1547 /* 1548 * Because there may be already I/Os in the pushback list, 1549 * flush them before return. 1550 */ 1551 - down_write(&md->io_lock); 1552 - 1553 - spin_lock_irqsave(&md->pushback_lock, flags); 1554 - clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); 1555 - bio_list_merge_head(&md->deferred, &md->pushback); 1556 - bio_list_init(&md->pushback); 1557 - spin_unlock_irqrestore(&md->pushback_lock, flags); 1558 - 1559 - def = bio_list_get(&md->deferred); 1560 - __flush_deferred_io(md, def); 1561 - up_write(&md->io_lock); 1562 - } 1563 1564 out: 1565 if (r && md->suspended_bdev) { ··· 1540 dm_table_put(map); 1541 1542 out_unlock: 1543 - up(&md->suspend_lock); 1544 return r; 1545 } 1546 1547 int dm_resume(struct mapped_device *md) 1548 { 1549 int r = -EINVAL; 1550 - struct bio *def; 1551 struct dm_table *map = NULL; 1552 1553 - down(&md->suspend_lock); 1554 if (!dm_suspended(md)) 1555 goto out; 1556 ··· 1561 if (r) 1562 goto out; 1563 1564 - down_write(&md->io_lock); 1565 - clear_bit(DMF_BLOCK_IO, &md->flags); 1566 - 1567 - def = bio_list_get(&md->deferred); 1568 - __flush_deferred_io(md, def); 1569 - up_write(&md->io_lock); 1570 1571 unlock_fs(md); 1572 ··· 1580 1581 out: 1582 dm_table_put(map); 1583 - up(&md->suspend_lock); 1584 1585 return r; 1586 }
··· 71 #define DMF_DELETING 4 72 #define DMF_NOFLUSH_SUSPENDING 5 73 74 + /* 75 + * Work processed by per-device workqueue. 76 + */ 77 + struct dm_wq_req { 78 + enum { 79 + DM_WQ_FLUSH_ALL, 80 + DM_WQ_FLUSH_DEFERRED, 81 + } type; 82 + struct work_struct work; 83 + struct mapped_device *md; 84 + void *context; 85 + }; 86 + 87 struct mapped_device { 88 struct rw_semaphore io_lock; 89 + struct mutex suspend_lock; 90 spinlock_t pushback_lock; 91 rwlock_t map_lock; 92 atomic_t holders; ··· 94 wait_queue_head_t wait; 95 struct bio_list deferred; 96 struct bio_list pushback; 97 + 98 + /* 99 + * Processing queue (flush/barriers) 100 + */ 101 + struct workqueue_struct *wq; 102 103 /* 104 * The current mapping. ··· 181 DMINFO("cleaned up"); 182 } 183 184 + static int (*_inits[])(void) __initdata = { 185 local_init, 186 dm_target_init, 187 dm_linear_init, ··· 189 dm_interface_init, 190 }; 191 192 + static void (*_exits[])(void) = { 193 local_exit, 194 dm_target_exit, 195 dm_linear_exit, ··· 982 } 983 984 if (!try_module_get(THIS_MODULE)) 985 + goto bad_module_get; 986 987 /* get a minor number for the dev */ 988 if (minor == DM_ANY_MINOR) ··· 990 else 991 r = specific_minor(md, minor); 992 if (r < 0) 993 + goto bad_minor; 994 995 memset(md, 0, sizeof(*md)); 996 init_rwsem(&md->io_lock); 997 + mutex_init(&md->suspend_lock); 998 spin_lock_init(&md->pushback_lock); 999 rwlock_init(&md->map_lock); 1000 atomic_set(&md->holders, 1); ··· 1006 1007 md->queue = blk_alloc_queue(GFP_KERNEL); 1008 if (!md->queue) 1009 + goto bad_queue; 1010 1011 md->queue->queuedata = md; 1012 md->queue->backing_dev_info.congested_fn = dm_any_congested; ··· 1017 1018 md->io_pool = mempool_create_slab_pool(MIN_IOS, _io_cache); 1019 if (!md->io_pool) 1020 + goto bad_io_pool; 1021 1022 md->tio_pool = mempool_create_slab_pool(MIN_IOS, _tio_cache); 1023 if (!md->tio_pool) 1024 + goto bad_tio_pool; 1025 1026 md->bs = bioset_create(16, 16); 1027 if (!md->bs) ··· 1029 1030 md->disk = alloc_disk(1); 1031 if (!md->disk) 1032 + goto bad_disk; 1033 1034 atomic_set(&md->pending, 0); 1035 init_waitqueue_head(&md->wait); ··· 1044 add_disk(md->disk); 1045 format_dev_t(md->name, MKDEV(_major, minor)); 1046 1047 + md->wq = create_singlethread_workqueue("kdmflush"); 1048 + if (!md->wq) 1049 + goto bad_thread; 1050 + 1051 /* Populate the mapping, nobody knows we exist yet */ 1052 spin_lock(&_minor_lock); 1053 old_md = idr_replace(&_minor_idr, md, minor); ··· 1053 1054 return md; 1055 1056 + bad_thread: 1057 + put_disk(md->disk); 1058 + bad_disk: 1059 bioset_free(md->bs); 1060 + bad_no_bioset: 1061 mempool_destroy(md->tio_pool); 1062 + bad_tio_pool: 1063 mempool_destroy(md->io_pool); 1064 + bad_io_pool: 1065 blk_cleanup_queue(md->queue); 1066 + bad_queue: 1067 free_minor(minor); 1068 + bad_minor: 1069 module_put(THIS_MODULE); 1070 + bad_module_get: 1071 kfree(md); 1072 return NULL; 1073 } ··· 1080 unlock_fs(md); 1081 bdput(md->suspended_bdev); 1082 } 1083 + destroy_workqueue(md->wq); 1084 mempool_destroy(md->tio_pool); 1085 mempool_destroy(md->io_pool); 1086 bioset_free(md->bs); ··· 1259 } 1260 EXPORT_SYMBOL_GPL(dm_put); 1261 1262 + static int dm_wait_for_completion(struct mapped_device *md) 1263 + { 1264 + int r = 0; 1265 + 1266 + while (1) { 1267 + set_current_state(TASK_INTERRUPTIBLE); 1268 + 1269 + smp_mb(); 1270 + if (!atomic_read(&md->pending)) 1271 + break; 1272 + 1273 + if (signal_pending(current)) { 1274 + r = -EINTR; 1275 + break; 1276 + } 1277 + 1278 + io_schedule(); 1279 + } 1280 + set_current_state(TASK_RUNNING); 1281 + 1282 + return r; 
1283 + } 1284 + 1285 /* 1286 * Process the deferred bios 1287 */ 1288 + static void __flush_deferred_io(struct mapped_device *md) 1289 { 1290 + struct bio *c; 1291 1292 + while ((c = bio_list_pop(&md->deferred))) { 1293 if (__split_bio(md, c)) 1294 bio_io_error(c); 1295 } 1296 + 1297 + clear_bit(DMF_BLOCK_IO, &md->flags); 1298 + } 1299 + 1300 + static void __merge_pushback_list(struct mapped_device *md) 1301 + { 1302 + unsigned long flags; 1303 + 1304 + spin_lock_irqsave(&md->pushback_lock, flags); 1305 + clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); 1306 + bio_list_merge_head(&md->deferred, &md->pushback); 1307 + bio_list_init(&md->pushback); 1308 + spin_unlock_irqrestore(&md->pushback_lock, flags); 1309 + } 1310 + 1311 + static void dm_wq_work(struct work_struct *work) 1312 + { 1313 + struct dm_wq_req *req = container_of(work, struct dm_wq_req, work); 1314 + struct mapped_device *md = req->md; 1315 + 1316 + down_write(&md->io_lock); 1317 + switch (req->type) { 1318 + case DM_WQ_FLUSH_ALL: 1319 + __merge_pushback_list(md); 1320 + /* pass through */ 1321 + case DM_WQ_FLUSH_DEFERRED: 1322 + __flush_deferred_io(md); 1323 + break; 1324 + default: 1325 + DMERR("dm_wq_work: unrecognised work type %d", req->type); 1326 + BUG(); 1327 + } 1328 + up_write(&md->io_lock); 1329 + } 1330 + 1331 + static void dm_wq_queue(struct mapped_device *md, int type, void *context, 1332 + struct dm_wq_req *req) 1333 + { 1334 + req->type = type; 1335 + req->md = md; 1336 + req->context = context; 1337 + INIT_WORK(&req->work, dm_wq_work); 1338 + queue_work(md->wq, &req->work); 1339 + } 1340 + 1341 + static void dm_queue_flush(struct mapped_device *md, int type, void *context) 1342 + { 1343 + struct dm_wq_req req; 1344 + 1345 + dm_wq_queue(md, type, context, &req); 1346 + flush_workqueue(md->wq); 1347 } 1348 1349 /* ··· 1282 { 1283 int r = -EINVAL; 1284 1285 + mutex_lock(&md->suspend_lock); 1286 1287 /* device must be suspended */ 1288 if (!dm_suspended(md)) ··· 1297 r = __bind(md, table); 1298 1299 out: 1300 + mutex_unlock(&md->suspend_lock); 1301 return r; 1302 } 1303 ··· 1346 int dm_suspend(struct mapped_device *md, unsigned suspend_flags) 1347 { 1348 struct dm_table *map = NULL; 1349 DECLARE_WAITQUEUE(wait, current); 1350 + int r = 0; 1351 int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0; 1352 int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0; 1353 1354 + mutex_lock(&md->suspend_lock); 1355 1356 + if (dm_suspended(md)) { 1357 + r = -EINVAL; 1358 goto out_unlock; 1359 + } 1360 1361 map = dm_get_table(md); 1362 ··· 1378 r = -ENOMEM; 1379 goto flush_and_out; 1380 } 1381 1382 + /* 1383 + * Flush I/O to the device. noflush supersedes do_lockfs, 1384 + * because lock_fs() needs to flush I/Os. 1385 + */ 1386 + if (do_lockfs) { 1387 + r = lock_fs(md); 1388 + if (r) 1389 + goto out; 1390 + } 1391 } 1392 1393 /* ··· 1404 dm_table_unplug_all(map); 1405 1406 /* 1407 + * Wait for the already-mapped ios to complete. 1408 */ 1409 + r = dm_wait_for_completion(md); 1410 1411 down_write(&md->io_lock); 1412 remove_wait_queue(&md->wait, &wait); 1413 1414 + if (noflush) 1415 + __merge_pushback_list(md); 1416 + up_write(&md->io_lock); 1417 1418 /* were we interrupted ? 
*/ 1419 + if (r < 0) { 1420 + dm_queue_flush(md, DM_WQ_FLUSH_DEFERRED, NULL); 1421 + 1422 unlock_fs(md); 1423 goto out; /* pushback list is already flushed, so skip flush */ 1424 } 1425 1426 dm_table_postsuspend_targets(map); 1427 1428 set_bit(DMF_SUSPENDED, &md->flags); 1429 1430 flush_and_out: 1431 + if (r && noflush) 1432 /* 1433 * Because there may be already I/Os in the pushback list, 1434 * flush them before return. 1435 */ 1436 + dm_queue_flush(md, DM_WQ_FLUSH_ALL, NULL); 1437 1438 out: 1439 if (r && md->suspended_bdev) { ··· 1474 dm_table_put(map); 1475 1476 out_unlock: 1477 + mutex_unlock(&md->suspend_lock); 1478 return r; 1479 } 1480 1481 int dm_resume(struct mapped_device *md) 1482 { 1483 int r = -EINVAL; 1484 struct dm_table *map = NULL; 1485 1486 + mutex_lock(&md->suspend_lock); 1487 if (!dm_suspended(md)) 1488 goto out; 1489 ··· 1496 if (r) 1497 goto out; 1498 1499 + dm_queue_flush(md, DM_WQ_FLUSH_DEFERRED, NULL); 1500 1501 unlock_fs(md); 1502 ··· 1520 1521 out: 1522 dm_table_put(map); 1523 + mutex_unlock(&md->suspend_lock); 1524 1525 return r; 1526 }
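In the dm.c hunks above, deferred-bio flushing moves onto a per-device workqueue: dm_queue_flush() queues a request and then flush_workqueue() makes it synchronous, and a DM_WQ_FLUSH_ALL request first merges the pushback list into the deferred list before deliberately falling through to the DM_WQ_FLUSH_DEFERRED handling. The single-threaded model below shows only that dispatch order; the arrays stand in for bio lists and there is no real workqueue here.

    /* Model of the dm_wq_work() dispatch: FLUSH_ALL folds pushback into
     * deferred, then falls through to the FLUSH_DEFERRED resubmission. */
    #include <stdio.h>

    enum { WQ_FLUSH_ALL, WQ_FLUSH_DEFERRED };

    #define MAX 8

    struct md_model {
        int deferred[MAX], nr_deferred;
        int pushback[MAX], nr_pushback;
    };

    static void flush_deferred(struct md_model *md)
    {
        for (int i = 0; i < md->nr_deferred; i++)
            printf("resubmitting bio %d\n", md->deferred[i]);
        md->nr_deferred = 0;
    }

    static void merge_pushback(struct md_model *md)
    {
        for (int i = 0; i < md->nr_pushback; i++)
            md->deferred[md->nr_deferred++] = md->pushback[i];
        md->nr_pushback = 0;
    }

    static void wq_work(struct md_model *md, int type)
    {
        switch (type) {
        case WQ_FLUSH_ALL:
            merge_pushback(md);
            /* fall through */
        case WQ_FLUSH_DEFERRED:
            flush_deferred(md);
            break;
        }
    }

    int main(void)
    {
        struct md_model md = {
            .deferred = { 1, 2 }, .nr_deferred = 2,
            .pushback = { 3 },    .nr_pushback = 1,
        };

        wq_work(&md, WQ_FLUSH_ALL);
        return 0;
    }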
+0 -34
fs/compat_ioctl.c
··· 78 #include <linux/mii.h> 79 #include <linux/if_bonding.h> 80 #include <linux/watchdog.h> 81 - #include <linux/dm-ioctl.h> 82 83 #include <linux/soundcard.h> 84 #include <linux/lp.h> ··· 1992 COMPATIBLE_IOCTL(RESTART_ARRAY_RW) 1993 COMPATIBLE_IOCTL(GET_BITMAP_FILE) 1994 ULONG_IOCTL(SET_BITMAP_FILE) 1995 - /* DM */ 1996 - COMPATIBLE_IOCTL(DM_VERSION_32) 1997 - COMPATIBLE_IOCTL(DM_REMOVE_ALL_32) 1998 - COMPATIBLE_IOCTL(DM_LIST_DEVICES_32) 1999 - COMPATIBLE_IOCTL(DM_DEV_CREATE_32) 2000 - COMPATIBLE_IOCTL(DM_DEV_REMOVE_32) 2001 - COMPATIBLE_IOCTL(DM_DEV_RENAME_32) 2002 - COMPATIBLE_IOCTL(DM_DEV_SUSPEND_32) 2003 - COMPATIBLE_IOCTL(DM_DEV_STATUS_32) 2004 - COMPATIBLE_IOCTL(DM_DEV_WAIT_32) 2005 - COMPATIBLE_IOCTL(DM_TABLE_LOAD_32) 2006 - COMPATIBLE_IOCTL(DM_TABLE_CLEAR_32) 2007 - COMPATIBLE_IOCTL(DM_TABLE_DEPS_32) 2008 - COMPATIBLE_IOCTL(DM_TABLE_STATUS_32) 2009 - COMPATIBLE_IOCTL(DM_LIST_VERSIONS_32) 2010 - COMPATIBLE_IOCTL(DM_TARGET_MSG_32) 2011 - COMPATIBLE_IOCTL(DM_DEV_SET_GEOMETRY_32) 2012 - COMPATIBLE_IOCTL(DM_VERSION) 2013 - COMPATIBLE_IOCTL(DM_REMOVE_ALL) 2014 - COMPATIBLE_IOCTL(DM_LIST_DEVICES) 2015 - COMPATIBLE_IOCTL(DM_DEV_CREATE) 2016 - COMPATIBLE_IOCTL(DM_DEV_REMOVE) 2017 - COMPATIBLE_IOCTL(DM_DEV_RENAME) 2018 - COMPATIBLE_IOCTL(DM_DEV_SUSPEND) 2019 - COMPATIBLE_IOCTL(DM_DEV_STATUS) 2020 - COMPATIBLE_IOCTL(DM_DEV_WAIT) 2021 - COMPATIBLE_IOCTL(DM_TABLE_LOAD) 2022 - COMPATIBLE_IOCTL(DM_TABLE_CLEAR) 2023 - COMPATIBLE_IOCTL(DM_TABLE_DEPS) 2024 - COMPATIBLE_IOCTL(DM_TABLE_STATUS) 2025 - COMPATIBLE_IOCTL(DM_LIST_VERSIONS) 2026 - COMPATIBLE_IOCTL(DM_TARGET_MSG) 2027 - COMPATIBLE_IOCTL(DM_DEV_SET_GEOMETRY) 2028 /* Big K */ 2029 COMPATIBLE_IOCTL(PIO_FONT) 2030 COMPATIBLE_IOCTL(GIO_FONT)
··· 78 #include <linux/mii.h> 79 #include <linux/if_bonding.h> 80 #include <linux/watchdog.h> 81 82 #include <linux/soundcard.h> 83 #include <linux/lp.h> ··· 1993 COMPATIBLE_IOCTL(RESTART_ARRAY_RW) 1994 COMPATIBLE_IOCTL(GET_BITMAP_FILE) 1995 ULONG_IOCTL(SET_BITMAP_FILE) 1996 /* Big K */ 1997 COMPATIBLE_IOCTL(PIO_FONT) 1998 COMPATIBLE_IOCTL(GIO_FONT)
+9 -9
include/linux/device-mapper.h
··· 110 }; 111 112 struct io_restrictions { 113 - unsigned int max_sectors; 114 - unsigned short max_phys_segments; 115 - unsigned short max_hw_segments; 116 - unsigned short hardsect_size; 117 - unsigned int max_segment_size; 118 - unsigned int max_hw_sectors; 119 - unsigned long seg_boundary_mask; 120 - unsigned long bounce_pfn; 121 - unsigned char no_cluster; /* inverted so that 0 is default */ 122 }; 123 124 struct dm_target {
··· 110 }; 111 112 struct io_restrictions { 113 + unsigned long bounce_pfn; 114 + unsigned long seg_boundary_mask; 115 + unsigned max_hw_sectors; 116 + unsigned max_sectors; 117 + unsigned max_segment_size; 118 + unsigned short hardsect_size; 119 + unsigned short max_hw_segments; 120 + unsigned short max_phys_segments; 121 + unsigned char no_cluster; /* inverted so that 0 is default */ 122 }; 123 124 struct dm_target {
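The io_restrictions change above only reorders the members, placing the widest fields first; on 64-bit ABIs that ordering typically removes alignment holes and shrinks the structure. The comparison below uses hypothetical field names and types of the same widths; the exact sizes printed depend on the ABI.

    /* Compare a narrow/wide interleaved layout with a widest-first layout. */
    #include <stdio.h>

    struct mixed {                  /* narrow and wide members interleaved */
        unsigned int   a;
        unsigned short b, c, d;
        unsigned int   e, f;
        unsigned long  g, h;
        unsigned char  i;
    };

    struct sorted {                 /* widest members first */
        unsigned long  g, h;
        unsigned int   a, e, f;
        unsigned short b, c, d;
        unsigned char  i;
    };

    int main(void)
    {
        printf("mixed  = %zu bytes\n", sizeof(struct mixed));
        printf("sorted = %zu bytes\n", sizeof(struct sorted));
        return 0;
    }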
+2 -32
include/linux/dm-ioctl.h
··· 232 DM_DEV_SET_GEOMETRY_CMD 233 }; 234 235 - /* 236 - * The dm_ioctl struct passed into the ioctl is just the header 237 - * on a larger chunk of memory. On x86-64 and other 238 - * architectures the dm-ioctl struct will be padded to an 8 byte 239 - * boundary so the size will be different, which would change the 240 - * ioctl code - yes I really messed up. This hack forces these 241 - * architectures to have the correct ioctl code. 242 - */ 243 - #ifdef CONFIG_COMPAT 244 - typedef char ioctl_struct[308]; 245 - #define DM_VERSION_32 _IOWR(DM_IOCTL, DM_VERSION_CMD, ioctl_struct) 246 - #define DM_REMOVE_ALL_32 _IOWR(DM_IOCTL, DM_REMOVE_ALL_CMD, ioctl_struct) 247 - #define DM_LIST_DEVICES_32 _IOWR(DM_IOCTL, DM_LIST_DEVICES_CMD, ioctl_struct) 248 - 249 - #define DM_DEV_CREATE_32 _IOWR(DM_IOCTL, DM_DEV_CREATE_CMD, ioctl_struct) 250 - #define DM_DEV_REMOVE_32 _IOWR(DM_IOCTL, DM_DEV_REMOVE_CMD, ioctl_struct) 251 - #define DM_DEV_RENAME_32 _IOWR(DM_IOCTL, DM_DEV_RENAME_CMD, ioctl_struct) 252 - #define DM_DEV_SUSPEND_32 _IOWR(DM_IOCTL, DM_DEV_SUSPEND_CMD, ioctl_struct) 253 - #define DM_DEV_STATUS_32 _IOWR(DM_IOCTL, DM_DEV_STATUS_CMD, ioctl_struct) 254 - #define DM_DEV_WAIT_32 _IOWR(DM_IOCTL, DM_DEV_WAIT_CMD, ioctl_struct) 255 - 256 - #define DM_TABLE_LOAD_32 _IOWR(DM_IOCTL, DM_TABLE_LOAD_CMD, ioctl_struct) 257 - #define DM_TABLE_CLEAR_32 _IOWR(DM_IOCTL, DM_TABLE_CLEAR_CMD, ioctl_struct) 258 - #define DM_TABLE_DEPS_32 _IOWR(DM_IOCTL, DM_TABLE_DEPS_CMD, ioctl_struct) 259 - #define DM_TABLE_STATUS_32 _IOWR(DM_IOCTL, DM_TABLE_STATUS_CMD, ioctl_struct) 260 - #define DM_LIST_VERSIONS_32 _IOWR(DM_IOCTL, DM_LIST_VERSIONS_CMD, ioctl_struct) 261 - #define DM_TARGET_MSG_32 _IOWR(DM_IOCTL, DM_TARGET_MSG_CMD, ioctl_struct) 262 - #define DM_DEV_SET_GEOMETRY_32 _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, ioctl_struct) 263 - #endif 264 - 265 #define DM_IOCTL 0xfd 266 267 #define DM_VERSION _IOWR(DM_IOCTL, DM_VERSION_CMD, struct dm_ioctl) ··· 256 #define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) 257 258 #define DM_VERSION_MAJOR 4 259 - #define DM_VERSION_MINOR 12 260 #define DM_VERSION_PATCHLEVEL 0 261 - #define DM_VERSION_EXTRA "-ioctl (2007-10-02)" 262 263 /* Status bits */ 264 #define DM_READONLY_FLAG (1 << 0) /* In/Out */
··· 232 DM_DEV_SET_GEOMETRY_CMD 233 }; 234 235 #define DM_IOCTL 0xfd 236 237 #define DM_VERSION _IOWR(DM_IOCTL, DM_VERSION_CMD, struct dm_ioctl) ··· 286 #define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) 287 288 #define DM_VERSION_MAJOR 4 289 + #define DM_VERSION_MINOR 13 290 #define DM_VERSION_PATCHLEVEL 0 291 + #define DM_VERSION_EXTRA "-ioctl (2007-10-18)" 292 293 /* Status bits */ 294 #define DM_READONLY_FLAG (1 << 0) /* In/Out */
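The deleted DM_*_32 definitions (and the matching fs/compat_ioctl.c table entries removed earlier) existed because _IOWR() folds sizeof() of its argument into the command number, so a struct whose padding differs between 32-bit and 64-bit builds produces two different ioctl numbers for the same command. The userspace demonstration below assumes the Linux uapi headers are installed; the payload structs and the request number 0 are made up for illustration, while 0xfd matches the DM_IOCTL magic from the header above.

    /* Show that the struct size is baked into the _IOWR() command value. */
    #include <stdio.h>
    #include <linux/ioctl.h>

    struct payload_a { int x; char pad[304]; };     /* 308 bytes */
    struct payload_b { int x; char pad[308]; };     /* 312 bytes */

    #define CMD_A _IOWR(0xfd, 0, struct payload_a)
    #define CMD_B _IOWR(0xfd, 0, struct payload_b)

    int main(void)
    {
        printf("CMD_A = 0x%lx\n", (unsigned long)CMD_A);
        printf("CMD_B = 0x%lx\n", (unsigned long)CMD_B);
        return 0;
    }

With device-mapper now handling 32-bit callers itself, the fixed-size ioctl_struct[308] workaround and the duplicate compat table entries are no longer needed, which is also why the ioctl interface version moves to 4.13.0.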