Merge tag 'for-6.15/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm

Pull device mapper updates from Mikulas Patocka:

- dm-crypt: switch to using the crc32 library

- dm-verity, dm-integrity, dm-crypt: documentation improvements

- dm-vdo fixes

- dm-stripe: enable inline crypto passthrough

- dm-integrity: set ti->error on memory allocation failure

- dm-bufio: remove unused return value

- dm-verity: do forward error correction on metadata I/O errors

- dm: fix unconditional IO throttle caused by REQ_PREFLUSH

- dm cache: prevent BUG_ON by blocking retries on failed device resumes

- dm cache: support shrinking the origin device

- dm: restrict dm device size to 2^63-512 bytes

- dm-delay: support zoned devices

- dm-verity: support block number limits for different ioprio classes

- dm-integrity: fix non-constant-time tag verification (security bug)

- dm-verity, dm-ebs: fix prefetch-vs-suspend race

* tag 'for-6.15/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: (27 commits)
dm-ebs: fix prefetch-vs-suspend race
dm-verity: fix prefetch-vs-suspend race
dm-integrity: fix non-constant-time tag verification
dm-verity: support block number limits for different ioprio classes
dm-delay: support zoned devices
dm: restrict dm device size to 2^63-512 bytes
dm cache: support shrinking the origin device
dm cache: prevent BUG_ON by blocking retries on failed device resumes
dm vdo indexer: reorder uds_request to reduce padding
dm: fix unconditional IO throttle caused by REQ_PREFLUSH
dm vdo: rework processing of loaded refcount byte arrays
dm vdo: remove remaining ring references
dm-verity: do forward error correction on metadata I/O errors
dm-bufio: remove unused return value
dm-integrity: set ti->error on memory allocation failure
dm: Enable inline crypto passthrough for striped target
dm vdo slab-depot: read refcount blocks in large chunks at load time
dm vdo vio-pool: allow variable-sized metadata vios
dm vdo vio-pool: support pools with multiple data blocks per vio
dm vdo vio-pool: add a pool pointer to pooled_vio
...

+522 -246
+5
Documentation/admin-guide/device-mapper/dm-crypt.rst
··· 146 integrity for the encrypted device. The additional space is then 147 used for storing authentication tag (and persistent IV if needed). 148 149 sector_size:<bytes> 150 Use <bytes> as the encryption unit instead of 512 bytes sectors. 151 This option can be in range 512 - 4096 bytes and must be power of two.
··· 146 integrity for the encrypted device. The additional space is then 147 used for storing authentication tag (and persistent IV if needed). 148 149 + integrity_key_size:<bytes> 150 + Optionally set the integrity key size if it differs from the digest size. 151 + It allows the use of wrapped key algorithms where the key size is 152 + independent of the cryptographic key size. 153 + 154 sector_size:<bytes> 155 Use <bytes> as the encryption unit instead of 512 bytes sectors. 156 This option can be in range 512 - 4096 bytes and must be power of two.
+5
Documentation/admin-guide/device-mapper/dm-integrity.rst
··· 92 allowed. This mode is useful for data recovery if the 93 device cannot be activated in any of the other standard 94 modes. 95 96 5. the number of additional arguments 97
··· 92 allowed. This mode is useful for data recovery if the 93 device cannot be activated in any of the other standard 94 modes. 95 + I - inline mode - in this mode, dm-integrity will store integrity 96 + data directly in the underlying device sectors. 97 + The underlying device must have an integrity profile that 98 + allows storing user integrity data and provides enough 99 + space for the selected integrity tag. 100 101 5. the number of additional arguments 102
+18 -2
Documentation/admin-guide/device-mapper/verity.rst
··· 87 Panic the device when a corrupted block is discovered. This option is 88 not compatible with ignore_corruption and restart_on_corruption. 89 90 ignore_zero_blocks 91 Do not verify blocks that are expected to contain zeroes and always return 92 zeroes instead. This may be useful if the partition contains unused blocks ··· 151 already in the secondary trusted keyring. 152 153 try_verify_in_tasklet 154 - If verity hashes are in cache, verify data blocks in kernel tasklet instead 155 - of workqueue. This option can reduce IO latency. 156 157 Theory of operation 158 ===================
··· 87 Panic the device when a corrupted block is discovered. This option is 88 not compatible with ignore_corruption and restart_on_corruption. 89 90 + restart_on_error 91 + Restart the system when an I/O error is detected. 92 + This option can be combined with the restart_on_corruption option. 93 + 94 + panic_on_error 95 + Panic the device when an I/O error is detected. This option is 96 + not compatible with the restart_on_error option but can be combined 97 + with the panic_on_corruption option. 98 + 99 ignore_zero_blocks 100 Do not verify blocks that are expected to contain zeroes and always return 101 zeroes instead. This may be useful if the partition contains unused blocks ··· 142 already in the secondary trusted keyring. 143 144 try_verify_in_tasklet 145 + If verity hashes are in cache and the IO size does not exceed the limit, 146 + verify data blocks in bottom half instead of workqueue. This option can 147 + reduce IO latency. The size limits can be configured via 148 + /sys/module/dm_verity/parameters/use_bh_bytes. The four parameters 149 + correspond to limits for IOPRIO_CLASS_NONE, IOPRIO_CLASS_RT, 150 + IOPRIO_CLASS_BE and IOPRIO_CLASS_IDLE in turn. 151 + For example: 152 + <none>,<rt>,<be>,<idle> 153 + 4096,4096,4096,4096 154 155 Theory of operation 156 ===================
+1
drivers/md/Kconfig
··· 267 depends on BLK_DEV_DM 268 depends on (ENCRYPTED_KEYS || ENCRYPTED_KEYS=n) 269 depends on (TRUSTED_KEYS || TRUSTED_KEYS=n) 270 select CRYPTO 271 select CRYPTO_CBC 272 select CRYPTO_ESSIV
··· 267 depends on BLK_DEV_DM 268 depends on (ENCRYPTED_KEYS || ENCRYPTED_KEYS=n) 269 depends on (TRUSTED_KEYS || TRUSTED_KEYS=n) 270 + select CRC32 271 select CRYPTO 272 select CRYPTO_CBC 273 select CRYPTO_ESSIV
+1 -3
drivers/md/dm-bufio.c
··· 2234 } 2235 EXPORT_SYMBOL_GPL(dm_bufio_issue_discard); 2236 2237 - static bool forget_buffer(struct dm_bufio_client *c, sector_t block) 2238 { 2239 struct dm_buffer *b; 2240 ··· 2249 cache_put_and_wake(c, b); 2250 } 2251 } 2252 - 2253 - return b ? true : false; 2254 } 2255 2256 /*
··· 2234 } 2235 EXPORT_SYMBOL_GPL(dm_bufio_issue_discard); 2236 2237 + static void forget_buffer(struct dm_bufio_client *c, sector_t block) 2238 { 2239 struct dm_buffer *b; 2240 ··· 2249 cache_put_and_wake(c, b); 2250 } 2251 } 2252 } 2253 2254 /*
+93 -3
drivers/md/dm-cache-target.c
··· 406 mempool_t migration_pool; 407 408 struct bio_set bs; 409 }; 410 411 struct per_bio_data { ··· 1928 if (cache->discard_bitset) 1929 free_bitset(cache->discard_bitset); 1930 1931 if (cache->copier) 1932 dm_kcopyd_client_destroy(cache->copier); 1933 ··· 2519 } 2520 clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks)); 2521 2522 cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle); 2523 if (IS_ERR(cache->copier)) { 2524 *error = "could not create kcopyd client"; ··· 2824 return policy_load_mapping(cache->policy, oblock, cblock, dirty, hint, hint_valid); 2825 } 2826 2827 /* 2828 * The discard block size in the on disk metadata is not 2829 * necessarily the same as we're currently using. So we have to ··· 2933 return to_cblock(size); 2934 } 2935 2936 static bool can_resize(struct cache *cache, dm_cblock_t new_size) 2937 { 2938 if (from_cblock(new_size) > from_cblock(cache->cache_size)) { ··· 2996 return 0; 2997 } 2998 2999 static int cache_preresume(struct dm_target *ti) 3000 { 3001 int r = 0; 3002 struct cache *cache = ti->private; 3003 dm_cblock_t csize = get_cache_dev_size(cache); 3004 3005 /* 3006 * Check to see if the cache has resized. ··· 3038 } 3039 3040 if (!cache->loaded_mappings) { 3041 r = dm_cache_load_mappings(cache->cmd, cache->policy, 3042 - load_mapping, cache); 3043 if (r) { 3044 DMERR("%s: could not load cache mappings", cache_device_name(cache)); 3045 - metadata_operation_failed(cache, "dm_cache_load_mappings", r); 3046 return r; 3047 } 3048 ··· 3516 3517 static struct target_type cache_target = { 3518 .name = "cache", 3519 - .version = {2, 2, 0}, 3520 .module = THIS_MODULE, 3521 .ctr = cache_ctr, 3522 .dtr = cache_dtr,
··· 406 mempool_t migration_pool; 407 408 struct bio_set bs; 409 + 410 + /* 411 + * Cache_size entries. Set bits indicate blocks mapped beyond the 412 + * target length, which are marked for invalidation. 413 + */ 414 + unsigned long *invalid_bitset; 415 }; 416 417 struct per_bio_data { ··· 1922 if (cache->discard_bitset) 1923 free_bitset(cache->discard_bitset); 1924 1925 + if (cache->invalid_bitset) 1926 + free_bitset(cache->invalid_bitset); 1927 + 1928 if (cache->copier) 1929 dm_kcopyd_client_destroy(cache->copier); 1930 ··· 2510 } 2511 clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks)); 2512 2513 + cache->invalid_bitset = alloc_bitset(from_cblock(cache->cache_size)); 2514 + if (!cache->invalid_bitset) { 2515 + *error = "could not allocate bitset for invalid blocks"; 2516 + goto bad; 2517 + } 2518 + clear_bitset(cache->invalid_bitset, from_cblock(cache->cache_size)); 2519 + 2520 cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle); 2521 if (IS_ERR(cache->copier)) { 2522 *error = "could not create kcopyd client"; ··· 2808 return policy_load_mapping(cache->policy, oblock, cblock, dirty, hint, hint_valid); 2809 } 2810 2811 + static int load_filtered_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock, 2812 + bool dirty, uint32_t hint, bool hint_valid) 2813 + { 2814 + struct cache *cache = context; 2815 + 2816 + if (from_oblock(oblock) >= from_oblock(cache->origin_blocks)) { 2817 + if (dirty) { 2818 + DMERR("%s: unable to shrink origin; cache block %u is dirty", 2819 + cache_device_name(cache), from_cblock(cblock)); 2820 + return -EFBIG; 2821 + } 2822 + set_bit(from_cblock(cblock), cache->invalid_bitset); 2823 + return 0; 2824 + } 2825 + 2826 + return load_mapping(context, oblock, cblock, dirty, hint, hint_valid); 2827 + } 2828 + 2829 /* 2830 * The discard block size in the on disk metadata is not 2831 * necessarily the same as we're currently using. So we have to ··· 2899 return to_cblock(size); 2900 } 2901 2902 + static bool can_resume(struct cache *cache) 2903 + { 2904 + /* 2905 + * Disallow retrying the resume operation for devices that failed the 2906 + * first resume attempt, as the failure leaves the policy object partially 2907 + * initialized. Retrying could trigger BUG_ON when loading cache mappings 2908 + * into the incomplete policy object. 
2909 + */ 2910 + if (cache->sized && !cache->loaded_mappings) { 2911 + if (get_cache_mode(cache) != CM_WRITE) 2912 + DMERR("%s: unable to resume a failed-loaded cache, please check metadata.", 2913 + cache_device_name(cache)); 2914 + else 2915 + DMERR("%s: unable to resume cache due to missing proper cache table reload", 2916 + cache_device_name(cache)); 2917 + return false; 2918 + } 2919 + 2920 + return true; 2921 + } 2922 + 2923 static bool can_resize(struct cache *cache, dm_cblock_t new_size) 2924 { 2925 if (from_cblock(new_size) > from_cblock(cache->cache_size)) { ··· 2941 return 0; 2942 } 2943 2944 + static int truncate_oblocks(struct cache *cache) 2945 + { 2946 + uint32_t nr_blocks = from_cblock(cache->cache_size); 2947 + uint32_t i; 2948 + int r; 2949 + 2950 + for_each_set_bit(i, cache->invalid_bitset, nr_blocks) { 2951 + r = dm_cache_remove_mapping(cache->cmd, to_cblock(i)); 2952 + if (r) { 2953 + DMERR_LIMIT("%s: invalidation failed; couldn't update on disk metadata", 2954 + cache_device_name(cache)); 2955 + return r; 2956 + } 2957 + } 2958 + 2959 + return 0; 2960 + } 2961 + 2962 static int cache_preresume(struct dm_target *ti) 2963 { 2964 int r = 0; 2965 struct cache *cache = ti->private; 2966 dm_cblock_t csize = get_cache_dev_size(cache); 2967 + 2968 + if (!can_resume(cache)) 2969 + return -EINVAL; 2970 2971 /* 2972 * Check to see if the cache has resized. ··· 2962 } 2963 2964 if (!cache->loaded_mappings) { 2965 + /* 2966 + * The fast device could have been resized since the last 2967 + * failed preresume attempt. To be safe we start by a blank 2968 + * bitset for cache blocks. 2969 + */ 2970 + clear_bitset(cache->invalid_bitset, from_cblock(cache->cache_size)); 2971 + 2972 r = dm_cache_load_mappings(cache->cmd, cache->policy, 2973 + load_filtered_mapping, cache); 2974 if (r) { 2975 DMERR("%s: could not load cache mappings", cache_device_name(cache)); 2976 + if (r != -EFBIG) 2977 + metadata_operation_failed(cache, "dm_cache_load_mappings", r); 2978 + return r; 2979 + } 2980 + 2981 + r = truncate_oblocks(cache); 2982 + if (r) { 2983 + metadata_operation_failed(cache, "dm_cache_remove_mapping", r); 2984 return r; 2985 } 2986 ··· 3426 3427 static struct target_type cache_target = { 3428 .name = "cache", 3429 + .version = {2, 3, 0}, 3430 .module = THIS_MODULE, 3431 .ctr = cache_ctr, 3432 .dtr = cache_dtr,
+10 -31
drivers/md/dm-crypt.c
··· 17 #include <linux/bio.h> 18 #include <linux/blkdev.h> 19 #include <linux/blk-integrity.h> 20 #include <linux/mempool.h> 21 #include <linux/slab.h> 22 #include <linux/crypto.h> ··· 126 127 #define TCW_WHITENING_SIZE 16 128 struct iv_tcw_private { 129 - struct crypto_shash *crc32_tfm; 130 u8 *iv_seed; 131 u8 *whitening; 132 }; ··· 607 tcw->iv_seed = NULL; 608 kfree_sensitive(tcw->whitening); 609 tcw->whitening = NULL; 610 - 611 - if (tcw->crc32_tfm && !IS_ERR(tcw->crc32_tfm)) 612 - crypto_free_shash(tcw->crc32_tfm); 613 - tcw->crc32_tfm = NULL; 614 } 615 616 static int crypt_iv_tcw_ctr(struct crypt_config *cc, struct dm_target *ti, ··· 622 if (cc->key_size <= (cc->iv_size + TCW_WHITENING_SIZE)) { 623 ti->error = "Wrong key size for TCW"; 624 return -EINVAL; 625 - } 626 - 627 - tcw->crc32_tfm = crypto_alloc_shash("crc32", 0, 628 - CRYPTO_ALG_ALLOCATES_MEMORY); 629 - if (IS_ERR(tcw->crc32_tfm)) { 630 - ti->error = "Error initializing CRC32 in TCW"; 631 - return PTR_ERR(tcw->crc32_tfm); 632 } 633 634 tcw->iv_seed = kzalloc(cc->iv_size, GFP_KERNEL); ··· 657 return 0; 658 } 659 660 - static int crypt_iv_tcw_whitening(struct crypt_config *cc, 661 - struct dm_crypt_request *dmreq, 662 - u8 *data) 663 { 664 struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw; 665 __le64 sector = cpu_to_le64(dmreq->iv_sector); 666 u8 buf[TCW_WHITENING_SIZE]; 667 - SHASH_DESC_ON_STACK(desc, tcw->crc32_tfm); 668 - int i, r; 669 670 /* xor whitening with sector number */ 671 crypto_xor_cpy(buf, tcw->whitening, (u8 *)&sector, 8); 672 crypto_xor_cpy(&buf[8], tcw->whitening + 8, (u8 *)&sector, 8); 673 674 /* calculate crc32 for every 32bit part and xor it */ 675 - desc->tfm = tcw->crc32_tfm; 676 - for (i = 0; i < 4; i++) { 677 - r = crypto_shash_digest(desc, &buf[i * 4], 4, &buf[i * 4]); 678 - if (r) 679 - goto out; 680 - } 681 crypto_xor(&buf[0], &buf[12], 4); 682 crypto_xor(&buf[4], &buf[8], 4); 683 684 /* apply whitening (8 bytes) to whole sector */ 685 for (i = 0; i < ((1 << SECTOR_SHIFT) / 8); i++) 686 crypto_xor(data + i * 8, buf, 8); 687 - out: 688 memzero_explicit(buf, sizeof(buf)); 689 - return r; 690 } 691 692 static int crypt_iv_tcw_gen(struct crypt_config *cc, u8 *iv, ··· 688 struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw; 689 __le64 sector = cpu_to_le64(dmreq->iv_sector); 690 u8 *src; 691 - int r = 0; 692 693 /* Remove whitening from ciphertext */ 694 if (bio_data_dir(dmreq->ctx->bio_in) != WRITE) { 695 sg = crypt_get_sg_data(cc, dmreq->sg_in); 696 src = kmap_local_page(sg_page(sg)); 697 - r = crypt_iv_tcw_whitening(cc, dmreq, src + sg->offset); 698 kunmap_local(src); 699 } 700 ··· 703 crypto_xor_cpy(&iv[8], tcw->iv_seed + 8, (u8 *)&sector, 704 cc->iv_size - 8); 705 706 - return r; 707 } 708 709 static int crypt_iv_tcw_post(struct crypt_config *cc, u8 *iv, ··· 711 { 712 struct scatterlist *sg; 713 u8 *dst; 714 - int r; 715 716 if (bio_data_dir(dmreq->ctx->bio_in) != WRITE) 717 return 0; ··· 718 /* Apply whitening on ciphertext */ 719 sg = crypt_get_sg_data(cc, dmreq->sg_out); 720 dst = kmap_local_page(sg_page(sg)); 721 - r = crypt_iv_tcw_whitening(cc, dmreq, dst + sg->offset); 722 kunmap_local(dst); 723 724 - return r; 725 } 726 727 static int crypt_iv_random_gen(struct crypt_config *cc, u8 *iv,
··· 17 #include <linux/bio.h> 18 #include <linux/blkdev.h> 19 #include <linux/blk-integrity.h> 20 + #include <linux/crc32.h> 21 #include <linux/mempool.h> 22 #include <linux/slab.h> 23 #include <linux/crypto.h> ··· 125 126 #define TCW_WHITENING_SIZE 16 127 struct iv_tcw_private { 128 u8 *iv_seed; 129 u8 *whitening; 130 }; ··· 607 tcw->iv_seed = NULL; 608 kfree_sensitive(tcw->whitening); 609 tcw->whitening = NULL; 610 } 611 612 static int crypt_iv_tcw_ctr(struct crypt_config *cc, struct dm_target *ti, ··· 626 if (cc->key_size <= (cc->iv_size + TCW_WHITENING_SIZE)) { 627 ti->error = "Wrong key size for TCW"; 628 return -EINVAL; 629 } 630 631 tcw->iv_seed = kzalloc(cc->iv_size, GFP_KERNEL); ··· 668 return 0; 669 } 670 671 + static void crypt_iv_tcw_whitening(struct crypt_config *cc, 672 + struct dm_crypt_request *dmreq, u8 *data) 673 { 674 struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw; 675 __le64 sector = cpu_to_le64(dmreq->iv_sector); 676 u8 buf[TCW_WHITENING_SIZE]; 677 + int i; 678 679 /* xor whitening with sector number */ 680 crypto_xor_cpy(buf, tcw->whitening, (u8 *)&sector, 8); 681 crypto_xor_cpy(&buf[8], tcw->whitening + 8, (u8 *)&sector, 8); 682 683 /* calculate crc32 for every 32bit part and xor it */ 684 + for (i = 0; i < 4; i++) 685 + put_unaligned_le32(crc32(0, &buf[i * 4], 4), &buf[i * 4]); 686 crypto_xor(&buf[0], &buf[12], 4); 687 crypto_xor(&buf[4], &buf[8], 4); 688 689 /* apply whitening (8 bytes) to whole sector */ 690 for (i = 0; i < ((1 << SECTOR_SHIFT) / 8); i++) 691 crypto_xor(data + i * 8, buf, 8); 692 memzero_explicit(buf, sizeof(buf)); 693 } 694 695 static int crypt_iv_tcw_gen(struct crypt_config *cc, u8 *iv, ··· 707 struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw; 708 __le64 sector = cpu_to_le64(dmreq->iv_sector); 709 u8 *src; 710 711 /* Remove whitening from ciphertext */ 712 if (bio_data_dir(dmreq->ctx->bio_in) != WRITE) { 713 sg = crypt_get_sg_data(cc, dmreq->sg_in); 714 src = kmap_local_page(sg_page(sg)); 715 + crypt_iv_tcw_whitening(cc, dmreq, src + sg->offset); 716 kunmap_local(src); 717 } 718 ··· 723 crypto_xor_cpy(&iv[8], tcw->iv_seed + 8, (u8 *)&sector, 724 cc->iv_size - 8); 725 726 + return 0; 727 } 728 729 static int crypt_iv_tcw_post(struct crypt_config *cc, u8 *iv, ··· 731 { 732 struct scatterlist *sg; 733 u8 *dst; 734 735 if (bio_data_dir(dmreq->ctx->bio_in) != WRITE) 736 return 0; ··· 739 /* Apply whitening on ciphertext */ 740 sg = crypt_get_sg_data(cc, dmreq->sg_out); 741 dst = kmap_local_page(sg_page(sg)); 742 + crypt_iv_tcw_whitening(cc, dmreq, dst + sg->offset); 743 kunmap_local(dst); 744 745 + return 0; 746 } 747 748 static int crypt_iv_random_gen(struct crypt_config *cc, u8 *iv,
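The dm-crypt hunk above drops the crypto_shash "crc32" transform in favour of the crc32() routine from the kernel CRC32 library (hence the new "select CRC32" in Kconfig). Because the library call cannot fail or allocate, the whitening helper becomes void and its callers lose their error plumbing. A minimal illustrative sketch of the per-quarter CRC step, not copied from the tree:

#include <linux/crc32.h>        /* crc32() library helper */
#include <linux/types.h>
#include <linux/unaligned.h>    /* put_unaligned_le32() */

/* Recompute each 32-bit quarter of the 16-byte whitening buffer as its
 * own CRC32, in place, with no crypto transform and no error path. */
static void tcw_crc_quarters(u8 buf[16])
{
        int i;

        for (i = 0; i < 4; i++)
                put_unaligned_le32(crc32(0, &buf[i * 4], 4), &buf[i * 4]);
}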
+17 -1
drivers/md/dm-delay.c
··· 369 return delay_bio(dc, c, bio); 370 } 371 372 #define DMEMIT_DELAY_CLASS(c) \ 373 DMEMIT("%s %llu %u", (c)->dev->name, (unsigned long long)(c)->start, (c)->delay) 374 ··· 439 static struct target_type delay_target = { 440 .name = "delay", 441 .version = {1, 4, 0}, 442 - .features = DM_TARGET_PASSES_INTEGRITY, 443 .module = THIS_MODULE, 444 .ctr = delay_ctr, 445 .dtr = delay_dtr, 446 .map = delay_map, 447 .presuspend = delay_presuspend, 448 .resume = delay_resume, 449 .status = delay_status,
··· 369 return delay_bio(dc, c, bio); 370 } 371 372 + #ifdef CONFIG_BLK_DEV_ZONED 373 + static int delay_report_zones(struct dm_target *ti, 374 + struct dm_report_zones_args *args, unsigned int nr_zones) 375 + { 376 + struct delay_c *dc = ti->private; 377 + struct delay_class *c = &dc->read; 378 + 379 + return dm_report_zones(c->dev->bdev, c->start, 380 + c->start + dm_target_offset(ti, args->next_sector), 381 + args, nr_zones); 382 + } 383 + #else 384 + #define delay_report_zones NULL 385 + #endif 386 + 387 #define DMEMIT_DELAY_CLASS(c) \ 388 DMEMIT("%s %llu %u", (c)->dev->name, (unsigned long long)(c)->start, (c)->delay) 389 ··· 424 static struct target_type delay_target = { 425 .name = "delay", 426 .version = {1, 4, 0}, 427 + .features = DM_TARGET_PASSES_INTEGRITY | DM_TARGET_ZONED_HM, 428 .module = THIS_MODULE, 429 .ctr = delay_ctr, 430 .dtr = delay_dtr, 431 .map = delay_map, 432 + .report_zones = delay_report_zones, 433 .presuspend = delay_presuspend, 434 .resume = delay_resume, 435 .status = delay_status,
+7
drivers/md/dm-ebs-target.c
··· 390 return DM_MAPIO_REMAPPED; 391 } 392 393 static void ebs_status(struct dm_target *ti, status_type_t type, 394 unsigned int status_flags, char *result, unsigned int maxlen) 395 { ··· 453 .ctr = ebs_ctr, 454 .dtr = ebs_dtr, 455 .map = ebs_map, 456 .status = ebs_status, 457 .io_hints = ebs_io_hints, 458 .prepare_ioctl = ebs_prepare_ioctl,
··· 390 return DM_MAPIO_REMAPPED; 391 } 392 393 + static void ebs_postsuspend(struct dm_target *ti) 394 + { 395 + struct ebs_c *ec = ti->private; 396 + dm_bufio_client_reset(ec->bufio); 397 + } 398 + 399 static void ebs_status(struct dm_target *ti, status_type_t type, 400 unsigned int status_flags, char *result, unsigned int maxlen) 401 { ··· 447 .ctr = ebs_ctr, 448 .dtr = ebs_dtr, 449 .map = ebs_map, 450 + .postsuspend = ebs_postsuspend, 451 .status = ebs_status, 452 .io_hints = ebs_io_hints, 453 .prepare_ioctl = ebs_prepare_ioctl,
+25 -23
drivers/md/dm-integrity.c
··· 21 #include <linux/reboot.h> 22 #include <crypto/hash.h> 23 #include <crypto/skcipher.h> 24 #include <linux/async_tx.h> 25 #include <linux/dm-bufio.h> 26 ··· 517 dm_integrity_io_error(ic, "crypto_shash_digest", r); 518 return r; 519 } 520 - if (memcmp(mac, actual_mac, mac_size)) { 521 dm_integrity_io_error(ic, "superblock mac", -EILSEQ); 522 dm_audit_log_target(DM_MSG_PREFIX, "mac-superblock", ic->ti, 0); 523 return -EILSEQ; ··· 860 if (likely(wr)) 861 memcpy(&js->mac, result + (j * JOURNAL_MAC_PER_SECTOR), JOURNAL_MAC_PER_SECTOR); 862 else { 863 - if (memcmp(&js->mac, result + (j * JOURNAL_MAC_PER_SECTOR), JOURNAL_MAC_PER_SECTOR)) { 864 dm_integrity_io_error(ic, "journal mac", -EILSEQ); 865 dm_audit_log_target(DM_MSG_PREFIX, "mac-journal", ic->ti, 0); 866 } ··· 1402 static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, sector_t *metadata_block, 1403 unsigned int *metadata_offset, unsigned int total_size, int op) 1404 { 1405 - #define MAY_BE_FILLER 1 1406 - #define MAY_BE_HASH 2 1407 unsigned int hash_offset = 0; 1408 - unsigned int may_be = MAY_BE_HASH | (ic->discard ? MAY_BE_FILLER : 0); 1409 1410 do { 1411 unsigned char *data, *dp; ··· 1425 if (op == TAG_READ) { 1426 memcpy(tag, dp, to_copy); 1427 } else if (op == TAG_WRITE) { 1428 - if (memcmp(dp, tag, to_copy)) { 1429 memcpy(dp, tag, to_copy); 1430 dm_bufio_mark_partial_buffer_dirty(b, *metadata_offset, *metadata_offset + to_copy); 1431 } ··· 1433 /* e.g.: op == TAG_CMP */ 1434 1435 if (likely(is_power_of_2(ic->tag_size))) { 1436 - if (unlikely(memcmp(dp, tag, to_copy))) 1437 - if (unlikely(!ic->discard) || 1438 - unlikely(memchr_inv(dp, DISCARD_FILLER, to_copy) != NULL)) { 1439 - goto thorough_test; 1440 - } 1441 } else { 1442 unsigned int i, ts; 1443 thorough_test: 1444 ts = total_size; 1445 1446 for (i = 0; i < to_copy; i++, ts--) { 1447 - if (unlikely(dp[i] != tag[i])) 1448 - may_be &= ~MAY_BE_HASH; 1449 - if (likely(dp[i] != DISCARD_FILLER)) 1450 - may_be &= ~MAY_BE_FILLER; 1451 hash_offset++; 1452 if (unlikely(hash_offset == ic->tag_size)) { 1453 - if (unlikely(!may_be)) { 1454 dm_bufio_release(b); 1455 return ts; 1456 } 1457 hash_offset = 0; 1458 - may_be = MAY_BE_HASH | (ic->discard ? 
MAY_BE_FILLER : 0); 1459 } 1460 } 1461 } ··· 1477 } while (unlikely(total_size)); 1478 1479 return 0; 1480 - #undef MAY_BE_FILLER 1481 - #undef MAY_BE_HASH 1482 } 1483 1484 struct flush_request { ··· 2075 char checksums_onstack[MAX_T(size_t, HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)]; 2076 2077 integrity_sector_checksum(ic, logical_sector, mem + bv.bv_offset, checksums_onstack); 2078 - if (unlikely(memcmp(checksums_onstack, journal_entry_tag(ic, je), ic->tag_size))) { 2079 DMERR_LIMIT("Checksum failed when reading from journal, at sector 0x%llx", 2080 logical_sector); 2081 dm_audit_log_bio(DM_MSG_PREFIX, "journal-checksum", ··· 2594 bio_put(outgoing_bio); 2595 2596 integrity_sector_checksum(ic, dio->bio_details.bi_iter.bi_sector, outgoing_data, digest); 2597 - if (unlikely(memcmp(digest, dio->integrity_payload, min(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)))) { 2598 DMERR_LIMIT("%pg: Checksum failed at sector 0x%llx", 2599 ic->dev->bdev, dio->bio_details.bi_iter.bi_sector); 2600 atomic64_inc(&ic->number_of_mismatches); ··· 2633 char *mem = bvec_kmap_local(&bv); 2634 //memset(mem, 0xff, ic->sectors_per_block << SECTOR_SHIFT); 2635 integrity_sector_checksum(ic, dio->bio_details.bi_iter.bi_sector, mem, digest); 2636 - if (unlikely(memcmp(digest, dio->integrity_payload + pos, 2637 min(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)))) { 2638 kunmap_local(mem); 2639 dm_integrity_free_payload(dio); ··· 2910 2911 integrity_sector_checksum(ic, sec + ((l - j) << ic->sb->log2_sectors_per_block), 2912 (char *)access_journal_data(ic, i, l), test_tag); 2913 - if (unlikely(memcmp(test_tag, journal_entry_tag(ic, je2), ic->tag_size))) { 2914 dm_integrity_io_error(ic, "tag mismatch when replaying journal", -EILSEQ); 2915 dm_audit_log_target(DM_MSG_PREFIX, "integrity-replay-journal", ic->ti, 0); 2916 } ··· 5071 5072 ic->recalc_bitmap = dm_integrity_alloc_page_list(n_bitmap_pages); 5073 if (!ic->recalc_bitmap) { 5074 r = -ENOMEM; 5075 goto bad; 5076 } 5077 ic->may_write_bitmap = dm_integrity_alloc_page_list(n_bitmap_pages); 5078 if (!ic->may_write_bitmap) { 5079 r = -ENOMEM; 5080 goto bad; 5081 } 5082 ic->bbs = kvmalloc_array(ic->n_bitmap_blocks, sizeof(struct bitmap_block_status), GFP_KERNEL); 5083 if (!ic->bbs) { 5084 r = -ENOMEM; 5085 goto bad; 5086 }
··· 21 #include <linux/reboot.h> 22 #include <crypto/hash.h> 23 #include <crypto/skcipher.h> 24 + #include <crypto/utils.h> 25 #include <linux/async_tx.h> 26 #include <linux/dm-bufio.h> 27 ··· 516 dm_integrity_io_error(ic, "crypto_shash_digest", r); 517 return r; 518 } 519 + if (crypto_memneq(mac, actual_mac, mac_size)) { 520 dm_integrity_io_error(ic, "superblock mac", -EILSEQ); 521 dm_audit_log_target(DM_MSG_PREFIX, "mac-superblock", ic->ti, 0); 522 return -EILSEQ; ··· 859 if (likely(wr)) 860 memcpy(&js->mac, result + (j * JOURNAL_MAC_PER_SECTOR), JOURNAL_MAC_PER_SECTOR); 861 else { 862 + if (crypto_memneq(&js->mac, result + (j * JOURNAL_MAC_PER_SECTOR), JOURNAL_MAC_PER_SECTOR)) { 863 dm_integrity_io_error(ic, "journal mac", -EILSEQ); 864 dm_audit_log_target(DM_MSG_PREFIX, "mac-journal", ic->ti, 0); 865 } ··· 1401 static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, sector_t *metadata_block, 1402 unsigned int *metadata_offset, unsigned int total_size, int op) 1403 { 1404 unsigned int hash_offset = 0; 1405 + unsigned char mismatch_hash = 0; 1406 + unsigned char mismatch_filler = !ic->discard; 1407 1408 do { 1409 unsigned char *data, *dp; ··· 1425 if (op == TAG_READ) { 1426 memcpy(tag, dp, to_copy); 1427 } else if (op == TAG_WRITE) { 1428 + if (crypto_memneq(dp, tag, to_copy)) { 1429 memcpy(dp, tag, to_copy); 1430 dm_bufio_mark_partial_buffer_dirty(b, *metadata_offset, *metadata_offset + to_copy); 1431 } ··· 1433 /* e.g.: op == TAG_CMP */ 1434 1435 if (likely(is_power_of_2(ic->tag_size))) { 1436 + if (unlikely(crypto_memneq(dp, tag, to_copy))) 1437 + goto thorough_test; 1438 } else { 1439 unsigned int i, ts; 1440 thorough_test: 1441 ts = total_size; 1442 1443 for (i = 0; i < to_copy; i++, ts--) { 1444 + /* 1445 + * Warning: the control flow must not be 1446 + * dependent on match/mismatch of 1447 + * individual bytes. 
1448 + */ 1449 + mismatch_hash |= dp[i] ^ tag[i]; 1450 + mismatch_filler |= dp[i] ^ DISCARD_FILLER; 1451 hash_offset++; 1452 if (unlikely(hash_offset == ic->tag_size)) { 1453 + if (unlikely(mismatch_hash) && unlikely(mismatch_filler)) { 1454 dm_bufio_release(b); 1455 return ts; 1456 } 1457 hash_offset = 0; 1458 + mismatch_hash = 0; 1459 + mismatch_filler = !ic->discard; 1460 } 1461 } 1462 } ··· 1476 } while (unlikely(total_size)); 1477 1478 return 0; 1479 } 1480 1481 struct flush_request { ··· 2076 char checksums_onstack[MAX_T(size_t, HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)]; 2077 2078 integrity_sector_checksum(ic, logical_sector, mem + bv.bv_offset, checksums_onstack); 2079 + if (unlikely(crypto_memneq(checksums_onstack, journal_entry_tag(ic, je), ic->tag_size))) { 2080 DMERR_LIMIT("Checksum failed when reading from journal, at sector 0x%llx", 2081 logical_sector); 2082 dm_audit_log_bio(DM_MSG_PREFIX, "journal-checksum", ··· 2595 bio_put(outgoing_bio); 2596 2597 integrity_sector_checksum(ic, dio->bio_details.bi_iter.bi_sector, outgoing_data, digest); 2598 + if (unlikely(crypto_memneq(digest, dio->integrity_payload, min(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)))) { 2599 DMERR_LIMIT("%pg: Checksum failed at sector 0x%llx", 2600 ic->dev->bdev, dio->bio_details.bi_iter.bi_sector); 2601 atomic64_inc(&ic->number_of_mismatches); ··· 2634 char *mem = bvec_kmap_local(&bv); 2635 //memset(mem, 0xff, ic->sectors_per_block << SECTOR_SHIFT); 2636 integrity_sector_checksum(ic, dio->bio_details.bi_iter.bi_sector, mem, digest); 2637 + if (unlikely(crypto_memneq(digest, dio->integrity_payload + pos, 2638 min(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)))) { 2639 kunmap_local(mem); 2640 dm_integrity_free_payload(dio); ··· 2911 2912 integrity_sector_checksum(ic, sec + ((l - j) << ic->sb->log2_sectors_per_block), 2913 (char *)access_journal_data(ic, i, l), test_tag); 2914 + if (unlikely(crypto_memneq(test_tag, journal_entry_tag(ic, je2), ic->tag_size))) { 2915 dm_integrity_io_error(ic, "tag mismatch when replaying journal", -EILSEQ); 2916 dm_audit_log_target(DM_MSG_PREFIX, "integrity-replay-journal", ic->ti, 0); 2917 } ··· 5072 5073 ic->recalc_bitmap = dm_integrity_alloc_page_list(n_bitmap_pages); 5074 if (!ic->recalc_bitmap) { 5075 + ti->error = "Could not allocate memory for bitmap"; 5076 r = -ENOMEM; 5077 goto bad; 5078 } 5079 ic->may_write_bitmap = dm_integrity_alloc_page_list(n_bitmap_pages); 5080 if (!ic->may_write_bitmap) { 5081 + ti->error = "Could not allocate memory for bitmap"; 5082 r = -ENOMEM; 5083 goto bad; 5084 } 5085 ic->bbs = kvmalloc_array(ic->n_bitmap_blocks, sizeof(struct bitmap_block_status), GFP_KERNEL); 5086 if (!ic->bbs) { 5087 + ti->error = "Could not allocate memory for bitmap"; 5088 r = -ENOMEM; 5089 goto bad; 5090 }
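The dm-integrity hunks above replace memcmp()/memchr_inv() on authentication tags with crypto_memneq() from <crypto/utils.h> plus a byte-wise XOR accumulator, so verification time no longer depends on where the first mismatching byte sits. An illustrative sketch of the two equivalent constant-time idioms (not the driver code itself):

#include <crypto/utils.h>       /* crypto_memneq() */
#include <linux/types.h>

/* Library form: nonzero on mismatch, never exits early. */
static bool tags_differ(const u8 *stored, const u8 *expected, size_t len)
{
        return crypto_memneq(stored, expected, len);
}

/* Hand-rolled form, as in the thorough_test loop above: accumulate the
 * XOR of every byte pair so control flow never depends on the data. */
static bool tags_differ_by_hand(const u8 *stored, const u8 *expected, size_t len)
{
        u8 acc = 0;
        size_t i;

        for (i = 0; i < len; i++)
                acc |= stored[i] ^ expected[i];

        return acc != 0;
}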
+1 -1
drivers/md/dm-stripe.c
··· 467 .name = "striped", 468 .version = {1, 7, 0}, 469 .features = DM_TARGET_PASSES_INTEGRITY | DM_TARGET_NOWAIT | 470 - DM_TARGET_ATOMIC_WRITES, 471 .module = THIS_MODULE, 472 .ctr = stripe_ctr, 473 .dtr = stripe_dtr,
··· 467 .name = "striped", 468 .version = {1, 7, 0}, 469 .features = DM_TARGET_PASSES_INTEGRITY | DM_TARGET_NOWAIT | 470 + DM_TARGET_ATOMIC_WRITES | DM_TARGET_PASSES_CRYPTO, 471 .module = THIS_MODULE, 472 .ctr = stripe_ctr, 473 .dtr = stripe_dtr,
+4
drivers/md/dm-table.c
··· 697 DMERR("%s: zero-length target", dm_device_name(t->md)); 698 return -EINVAL; 699 } 700 701 ti->type = dm_get_target_type(type); 702 if (!ti->type) {
··· 697 DMERR("%s: zero-length target", dm_device_name(t->md)); 698 return -EINVAL; 699 } 700 + if (start + len < start || start + len > LLONG_MAX >> SECTOR_SHIFT) { 701 + DMERR("%s: too large device", dm_device_name(t->md)); 702 + return -EINVAL; 703 + } 704 705 ti->type = dm_get_target_type(type); 706 if (!ti->type) {
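For reference, the arithmetic behind the new dm-table check and the "2^63-512 bytes" figure in the summary: with SECTOR_SHIFT = 9, LLONG_MAX >> SECTOR_SHIFT is (2^63 - 1) / 512 = 2^54 - 1 sectors, and 2^54 - 1 sectors of 512 bytes is 2^63 - 512 bytes. The "start + len < start" clause additionally rejects targets whose sector range would wrap around.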
+6 -7
drivers/md/dm-vdo/block-map.c
··· 451 * select_lru_page() - Determine which page is least recently used. 452 * 453 * Picks the least recently used from among the non-busy entries at the front of each of the lru 454 - * ring. Since whenever we mark a page busy we also put it to the end of the ring it is unlikely 455 * that the entries at the front are busy unless the queue is very short, but not impossible. 456 * 457 * Return: A pointer to the info structure for a relevant page, or NULL if no such page can be ··· 1544 1545 static void return_to_pool(struct block_map_zone *zone, struct pooled_vio *vio) 1546 { 1547 - return_vio_to_pool(zone->vio_pool, vio); 1548 check_for_drain_complete(zone); 1549 } 1550 ··· 1837 1838 if (!vdo_copy_valid_page(vio->data, nonce, pbn, page)) 1839 vdo_format_block_map_page(page, nonce, pbn, false); 1840 - return_vio_to_pool(zone->vio_pool, pooled); 1841 1842 /* Release our claim to the load and wake any waiters */ 1843 release_page_lock(data_vio, "load"); ··· 1851 struct vio *vio = as_vio(completion); 1852 struct pooled_vio *pooled = container_of(vio, struct pooled_vio, vio); 1853 struct data_vio *data_vio = completion->parent; 1854 - struct block_map_zone *zone = pooled->context; 1855 1856 vio_record_metadata_io_error(vio); 1857 - return_vio_to_pool(zone->vio_pool, pooled); 1858 abort_load(data_vio, result); 1859 } 1860 ··· 2498 struct cursors *cursors = cursor->parent; 2499 struct vdo_completion *completion = cursors->completion; 2500 2501 - return_vio_to_pool(cursors->pool, vdo_forget(cursor->vio)); 2502 if (--cursors->active_roots > 0) 2503 return; 2504 ··· 2745 if (result != VDO_SUCCESS) 2746 return result; 2747 2748 - result = make_vio_pool(vdo, BLOCK_MAP_VIO_POOL_SIZE, 2749 zone->thread_id, VIO_TYPE_BLOCK_MAP_INTERIOR, 2750 VIO_PRIORITY_METADATA, zone, &zone->vio_pool); 2751 if (result != VDO_SUCCESS)
··· 451 * select_lru_page() - Determine which page is least recently used. 452 * 453 * Picks the least recently used from among the non-busy entries at the front of each of the lru 454 + * list. Since whenever we mark a page busy we also put it to the end of the list it is unlikely 455 * that the entries at the front are busy unless the queue is very short, but not impossible. 456 * 457 * Return: A pointer to the info structure for a relevant page, or NULL if no such page can be ··· 1544 1545 static void return_to_pool(struct block_map_zone *zone, struct pooled_vio *vio) 1546 { 1547 + return_vio_to_pool(vio); 1548 check_for_drain_complete(zone); 1549 } 1550 ··· 1837 1838 if (!vdo_copy_valid_page(vio->data, nonce, pbn, page)) 1839 vdo_format_block_map_page(page, nonce, pbn, false); 1840 + return_vio_to_pool(pooled); 1841 1842 /* Release our claim to the load and wake any waiters */ 1843 release_page_lock(data_vio, "load"); ··· 1851 struct vio *vio = as_vio(completion); 1852 struct pooled_vio *pooled = container_of(vio, struct pooled_vio, vio); 1853 struct data_vio *data_vio = completion->parent; 1854 1855 vio_record_metadata_io_error(vio); 1856 + return_vio_to_pool(pooled); 1857 abort_load(data_vio, result); 1858 } 1859 ··· 2499 struct cursors *cursors = cursor->parent; 2500 struct vdo_completion *completion = cursors->completion; 2501 2502 + return_vio_to_pool(vdo_forget(cursor->vio)); 2503 if (--cursors->active_roots > 0) 2504 return; 2505 ··· 2746 if (result != VDO_SUCCESS) 2747 return result; 2748 2749 + result = make_vio_pool(vdo, BLOCK_MAP_VIO_POOL_SIZE, 1, 2750 zone->thread_id, VIO_TYPE_BLOCK_MAP_INTERIOR, 2751 VIO_PRIORITY_METADATA, zone, &zone->vio_pool); 2752 if (result != VDO_SUCCESS)
-3
drivers/md/dm-vdo/constants.h
··· 44 /* The default size of each slab journal, in blocks */ 45 DEFAULT_VDO_SLAB_JOURNAL_SIZE = 224, 46 47 - /* Unit test minimum */ 48 - MINIMUM_VDO_SLAB_JOURNAL_BLOCKS = 2, 49 - 50 /* 51 * The initial size of lbn_operations and pbn_operations, which is based upon the expected 52 * maximum number of outstanding VIOs. This value was chosen to make it highly unlikely
··· 44 /* The default size of each slab journal, in blocks */ 45 DEFAULT_VDO_SLAB_JOURNAL_SIZE = 224, 46 47 /* 48 * The initial size of lbn_operations and pbn_operations, which is based upon the expected 49 * maximum number of outstanding VIOs. This value was chosen to make it highly unlikely
+10 -10
drivers/md/dm-vdo/dedupe.c
··· 226 * A list containing the data VIOs sharing this lock, all having the same record name and 227 * data block contents, linked by their hash_lock_node fields. 228 */ 229 - struct list_head duplicate_ring; 230 231 /* The number of data_vios sharing this lock instance */ 232 data_vio_count_t reference_count; ··· 343 { 344 memset(lock, 0, sizeof(*lock)); 345 INIT_LIST_HEAD(&lock->pool_node); 346 - INIT_LIST_HEAD(&lock->duplicate_ring); 347 vdo_waitq_init(&lock->waiters); 348 list_add_tail(&lock->pool_node, &zone->lock_pool); 349 } ··· 441 VDO_ASSERT_LOG_ONLY(data_vio->hash_zone != NULL, 442 "must have a hash zone when holding a hash lock"); 443 VDO_ASSERT_LOG_ONLY(!list_empty(&data_vio->hash_lock_entry), 444 - "must be on a hash lock ring when holding a hash lock"); 445 VDO_ASSERT_LOG_ONLY(old_lock->reference_count > 0, 446 "hash lock reference must be counted"); 447 ··· 464 465 if (new_lock != NULL) { 466 /* 467 - * Keep all data_vios sharing the lock on a ring since they can complete in any 468 * order and we'll always need a pointer to one to compare data. 469 */ 470 - list_move_tail(&data_vio->hash_lock_entry, &new_lock->duplicate_ring); 471 new_lock->reference_count += 1; 472 if (new_lock->max_references < new_lock->reference_count) 473 new_lock->max_references = new_lock->reference_count; ··· 1789 struct hash_zone *zone; 1790 bool collides; 1791 1792 - if (list_empty(&lock->duplicate_ring)) 1793 return false; 1794 1795 - lock_holder = list_first_entry(&lock->duplicate_ring, struct data_vio, 1796 hash_lock_entry); 1797 zone = candidate->hash_zone; 1798 collides = !blocks_equal(lock_holder->vio.data, candidate->vio.data); ··· 1815 return result; 1816 1817 result = VDO_ASSERT(list_empty(&data_vio->hash_lock_entry), 1818 - "must not already be a member of a hash lock ring"); 1819 if (result != VDO_SUCCESS) 1820 return result; 1821 ··· 1942 "returned hash lock must not be in use with state %s", 1943 get_hash_lock_state_name(lock->state)); 1944 VDO_ASSERT_LOG_ONLY(list_empty(&lock->pool_node), 1945 - "hash lock returned to zone must not be in a pool ring"); 1946 - VDO_ASSERT_LOG_ONLY(list_empty(&lock->duplicate_ring), 1947 "hash lock returned to zone must not reference DataVIOs"); 1948 1949 return_hash_lock_to_pool(zone, lock);
··· 226 * A list containing the data VIOs sharing this lock, all having the same record name and 227 * data block contents, linked by their hash_lock_node fields. 228 */ 229 + struct list_head duplicate_vios; 230 231 /* The number of data_vios sharing this lock instance */ 232 data_vio_count_t reference_count; ··· 343 { 344 memset(lock, 0, sizeof(*lock)); 345 INIT_LIST_HEAD(&lock->pool_node); 346 + INIT_LIST_HEAD(&lock->duplicate_vios); 347 vdo_waitq_init(&lock->waiters); 348 list_add_tail(&lock->pool_node, &zone->lock_pool); 349 } ··· 441 VDO_ASSERT_LOG_ONLY(data_vio->hash_zone != NULL, 442 "must have a hash zone when holding a hash lock"); 443 VDO_ASSERT_LOG_ONLY(!list_empty(&data_vio->hash_lock_entry), 444 + "must be on a hash lock list when holding a hash lock"); 445 VDO_ASSERT_LOG_ONLY(old_lock->reference_count > 0, 446 "hash lock reference must be counted"); 447 ··· 464 465 if (new_lock != NULL) { 466 /* 467 + * Keep all data_vios sharing the lock on a list since they can complete in any 468 * order and we'll always need a pointer to one to compare data. 469 */ 470 + list_move_tail(&data_vio->hash_lock_entry, &new_lock->duplicate_vios); 471 new_lock->reference_count += 1; 472 if (new_lock->max_references < new_lock->reference_count) 473 new_lock->max_references = new_lock->reference_count; ··· 1789 struct hash_zone *zone; 1790 bool collides; 1791 1792 + if (list_empty(&lock->duplicate_vios)) 1793 return false; 1794 1795 + lock_holder = list_first_entry(&lock->duplicate_vios, struct data_vio, 1796 hash_lock_entry); 1797 zone = candidate->hash_zone; 1798 collides = !blocks_equal(lock_holder->vio.data, candidate->vio.data); ··· 1815 return result; 1816 1817 result = VDO_ASSERT(list_empty(&data_vio->hash_lock_entry), 1818 + "must not already be a member of a hash lock list"); 1819 if (result != VDO_SUCCESS) 1820 return result; 1821 ··· 1942 "returned hash lock must not be in use with state %s", 1943 get_hash_lock_state_name(lock->state)); 1944 VDO_ASSERT_LOG_ONLY(list_empty(&lock->pool_node), 1945 + "hash lock returned to zone must not be in a pool list"); 1946 + VDO_ASSERT_LOG_ONLY(list_empty(&lock->duplicate_vios), 1947 "hash lock returned to zone must not reference DataVIOs"); 1948 1949 return_hash_lock_to_pool(zone, lock);
+1 -19
drivers/md/dm-vdo/encodings.c
··· 711 ref_blocks = vdo_get_saved_reference_count_size(slab_size - slab_journal_blocks); 712 meta_blocks = (ref_blocks + slab_journal_blocks); 713 714 - /* Make sure test code hasn't configured slabs to be too small. */ 715 if (meta_blocks >= slab_size) 716 return VDO_BAD_CONFIGURATION; 717 718 - /* 719 - * If the slab size is very small, assume this must be a unit test and override the number 720 - * of data blocks to be a power of two (wasting blocks in the slab). Many tests need their 721 - * data_blocks fields to be the exact capacity of the configured volume, and that used to 722 - * fall out since they use a power of two for the number of data blocks, the slab size was 723 - * a power of two, and every block in a slab was a data block. 724 - * 725 - * TODO: Try to figure out some way of structuring testParameters and unit tests so this 726 - * hack isn't needed without having to edit several unit tests every time the metadata size 727 - * changes by one block. 728 - */ 729 data_blocks = slab_size - meta_blocks; 730 - if ((slab_size < 1024) && !is_power_of_2(data_blocks)) 731 - data_blocks = ((block_count_t) 1 << ilog2(data_blocks)); 732 733 /* 734 * Configure the slab journal thresholds. The flush threshold is 168 of 224 blocks in ··· 1205 result = VDO_ASSERT(config->slab_size <= (1 << MAX_VDO_SLAB_BITS), 1206 "slab size must be less than or equal to 2^%d", 1207 MAX_VDO_SLAB_BITS); 1208 - if (result != VDO_SUCCESS) 1209 - return result; 1210 - 1211 - result = VDO_ASSERT(config->slab_journal_blocks >= MINIMUM_VDO_SLAB_JOURNAL_BLOCKS, 1212 - "slab journal size meets minimum size"); 1213 if (result != VDO_SUCCESS) 1214 return result; 1215
··· 711 ref_blocks = vdo_get_saved_reference_count_size(slab_size - slab_journal_blocks); 712 meta_blocks = (ref_blocks + slab_journal_blocks); 713 714 + /* Make sure configured slabs are not too small. */ 715 if (meta_blocks >= slab_size) 716 return VDO_BAD_CONFIGURATION; 717 718 data_blocks = slab_size - meta_blocks; 719 720 /* 721 * Configure the slab journal thresholds. The flush threshold is 168 of 224 blocks in ··· 1218 result = VDO_ASSERT(config->slab_size <= (1 << MAX_VDO_SLAB_BITS), 1219 "slab size must be less than or equal to 2^%d", 1220 MAX_VDO_SLAB_BITS); 1221 if (result != VDO_SUCCESS) 1222 return result; 1223
+3 -2
drivers/md/dm-vdo/indexer/index-layout.c
··· 54 * Each save also has a unique nonce. 55 */ 56 57 - #define MAGIC_SIZE 32 58 #define NONCE_INFO_SIZE 32 59 #define MAX_SAVES 2 60 ··· 97 #define SUPER_VERSION_CURRENT 3 98 #define SUPER_VERSION_MAXIMUM 7 99 100 - static const u8 LAYOUT_MAGIC[MAGIC_SIZE] = "*ALBIREO*SINGLE*FILE*LAYOUT*001*"; 101 static const u64 REGION_MAGIC = 0x416c6252676e3031; /* 'AlbRgn01' */ 102 103 struct region_header { 104 u64 magic;
··· 54 * Each save also has a unique nonce. 55 */ 56 57 #define NONCE_INFO_SIZE 32 58 #define MAX_SAVES 2 59 ··· 98 #define SUPER_VERSION_CURRENT 3 99 #define SUPER_VERSION_MAXIMUM 7 100 101 + static const u8 LAYOUT_MAGIC[] = "*ALBIREO*SINGLE*FILE*LAYOUT*001*"; 102 static const u64 REGION_MAGIC = 0x416c6252676e3031; /* 'AlbRgn01' */ 103 + 104 + #define MAGIC_SIZE (sizeof(LAYOUT_MAGIC) - 1) 105 106 struct region_header { 107 u64 magic;
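A small aside on the index-layout change: the magic string "*ALBIREO*SINGLE*FILE*LAYOUT*001*" is 32 characters, so sizeof(LAYOUT_MAGIC) - 1 (discarding the string literal's terminating NUL) still evaluates to the 32 that the removed #define hard-coded; deriving MAGIC_SIZE from the string simply keeps the constant and the string from drifting apart.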
+1 -5
drivers/md/dm-vdo/indexer/index-session.c
··· 100 101 int uds_launch_request(struct uds_request *request) 102 { 103 - size_t internal_size; 104 int result; 105 106 if (request->callback == NULL) { ··· 120 } 121 122 /* Reset all internal fields before processing. */ 123 - internal_size = 124 - sizeof(struct uds_request) - offsetof(struct uds_request, zone_number); 125 - // FIXME should be using struct_group for this instead 126 - memset((char *) request + sizeof(*request) - internal_size, 0, internal_size); 127 128 result = get_index_session(request->session); 129 if (result != UDS_SUCCESS)
··· 100 101 int uds_launch_request(struct uds_request *request) 102 { 103 int result; 104 105 if (request->callback == NULL) { ··· 121 } 122 123 /* Reset all internal fields before processing. */ 124 + memset(&request->internal, 0, sizeof(request->internal)); 125 126 result = get_index_session(request->session); 127 if (result != UDS_SUCCESS)
+26 -27
drivers/md/dm-vdo/indexer/indexer.h
··· 8 9 #include <linux/mutex.h> 10 #include <linux/sched.h> 11 #include <linux/types.h> 12 #include <linux/wait.h> 13 ··· 74 /* Remove any mapping for a name. */ 75 UDS_DELETE, 76 77 - }; 78 79 enum uds_open_index_type { 80 /* Create a new index. */ ··· 227 enum uds_zone_message_type type; 228 /* The virtual chapter number to which the message applies */ 229 u64 virtual_chapter; 230 - }; 231 232 struct uds_index_session; 233 struct uds_index; ··· 254 255 /* The existing data associated with the request name, if any */ 256 struct uds_record_data old_metadata; 257 - /* Either UDS_SUCCESS or an error code for the request */ 258 - int status; 259 /* True if the record name had an existing entry in the index */ 260 bool found; 261 262 - /* 263 - * The remaining fields are used internally and should not be altered by clients. The index 264 - * relies on zone_number being the first field in this section. 265 - */ 266 - 267 - /* The number of the zone which will process this request*/ 268 - unsigned int zone_number; 269 - /* A link for adding a request to a lock-free queue */ 270 - struct funnel_queue_entry queue_link; 271 - /* A link for adding a request to a standard linked list */ 272 - struct uds_request *next_request; 273 - /* A pointer to the index processing this request */ 274 - struct uds_index *index; 275 - /* Control message for coordinating between zones */ 276 - struct uds_zone_message zone_message; 277 - /* If true, process request immediately by waking the worker thread */ 278 - bool unbatched; 279 - /* If true, continue this request before processing newer requests */ 280 - bool requeued; 281 - /* The virtual chapter containing the record name, if known */ 282 - u64 virtual_chapter; 283 - /* The region of the index containing the record name */ 284 - enum uds_index_region location; 285 }; 286 287 /* A session is required for most index operations. */
··· 8 9 #include <linux/mutex.h> 10 #include <linux/sched.h> 11 + #include <linux/stddef.h> 12 #include <linux/types.h> 13 #include <linux/wait.h> 14 ··· 73 /* Remove any mapping for a name. */ 74 UDS_DELETE, 75 76 + } __packed; 77 78 enum uds_open_index_type { 79 /* Create a new index. */ ··· 226 enum uds_zone_message_type type; 227 /* The virtual chapter number to which the message applies */ 228 u64 virtual_chapter; 229 + } __packed; 230 231 struct uds_index_session; 232 struct uds_index; ··· 253 254 /* The existing data associated with the request name, if any */ 255 struct uds_record_data old_metadata; 256 /* True if the record name had an existing entry in the index */ 257 bool found; 258 + /* Either UDS_SUCCESS or an error code for the request */ 259 + int status; 260 261 + /* The remaining fields are used internally and should not be altered by clients. */ 262 + struct_group(internal, 263 + /* The virtual chapter containing the record name, if known */ 264 + u64 virtual_chapter; 265 + /* The region of the index containing the record name */ 266 + enum uds_index_region location; 267 + /* If true, process request immediately by waking the worker thread */ 268 + bool unbatched; 269 + /* If true, continue this request before processing newer requests */ 270 + bool requeued; 271 + /* Control message for coordinating between zones */ 272 + struct uds_zone_message zone_message; 273 + /* The number of the zone which will process this request*/ 274 + unsigned int zone_number; 275 + /* A link for adding a request to a lock-free queue */ 276 + struct funnel_queue_entry queue_link; 277 + /* A link for adding a request to a standard linked list */ 278 + struct uds_request *next_request; 279 + /* A pointer to the index processing this request */ 280 + struct uds_index *index; 281 + ); 282 }; 283 284 /* A session is required for most index operations. */
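The indexer hunks above replace offsetof()-based arithmetic in uds_launch_request() with a struct_group() (from <linux/stddef.h>, hence the new include), so the request's internal fields can be cleared with one bounded memset() even after being reordered to reduce padding. A minimal sketch of the pattern with made-up names, not taken from the tree:

#include <linux/stddef.h>       /* struct_group() */
#include <linux/string.h>       /* memset() */
#include <linux/types.h>

struct demo_request {
        u64 caller_data;                /* caller-owned, must survive a reset */
        struct_group(internal,          /* everything below is reset per launch */
                u64 virtual_chapter;
                bool requeued;
        );
};

/* Clears exactly the grouped members; caller_data is untouched. */
static void demo_reset_internal(struct demo_request *req)
{
        memset(&req->internal, 0, sizeof(req->internal));
}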
+4 -2
drivers/md/dm-vdo/io-submitter.c
··· 327 * @error_handler: the handler for submission or I/O errors (may be NULL) 328 * @operation: the type of I/O to perform 329 * @data: the buffer to read or write (may be NULL) 330 * 331 * The vio is enqueued on a vdo bio queue so that bio submission (which may block) does not block 332 * other vdo threads. ··· 339 */ 340 void __submit_metadata_vio(struct vio *vio, physical_block_number_t physical, 341 bio_end_io_t callback, vdo_action_fn error_handler, 342 - blk_opf_t operation, char *data) 343 { 344 int result; 345 struct vdo_completion *completion = &vio->completion; ··· 350 351 vdo_reset_completion(completion); 352 completion->error_handler = error_handler; 353 - result = vio_reset_bio(vio, data, callback, operation | REQ_META, physical); 354 if (result != VDO_SUCCESS) { 355 continue_vio(vio, result); 356 return;
··· 327 * @error_handler: the handler for submission or I/O errors (may be NULL) 328 * @operation: the type of I/O to perform 329 * @data: the buffer to read or write (may be NULL) 330 + * @size: the I/O amount in bytes 331 * 332 * The vio is enqueued on a vdo bio queue so that bio submission (which may block) does not block 333 * other vdo threads. ··· 338 */ 339 void __submit_metadata_vio(struct vio *vio, physical_block_number_t physical, 340 bio_end_io_t callback, vdo_action_fn error_handler, 341 + blk_opf_t operation, char *data, int size) 342 { 343 int result; 344 struct vdo_completion *completion = &vio->completion; ··· 349 350 vdo_reset_completion(completion); 351 completion->error_handler = error_handler; 352 + result = vio_reset_bio_with_size(vio, data, size, callback, operation | REQ_META, 353 + physical); 354 if (result != VDO_SUCCESS) { 355 continue_vio(vio, result); 356 return;
+15 -3
drivers/md/dm-vdo/io-submitter.h
··· 8 9 #include <linux/bio.h> 10 11 #include "types.h" 12 13 struct io_submitter; ··· 27 28 void __submit_metadata_vio(struct vio *vio, physical_block_number_t physical, 29 bio_end_io_t callback, vdo_action_fn error_handler, 30 - blk_opf_t operation, char *data); 31 32 static inline void vdo_submit_metadata_vio(struct vio *vio, physical_block_number_t physical, 33 bio_end_io_t callback, vdo_action_fn error_handler, 34 blk_opf_t operation) 35 { 36 __submit_metadata_vio(vio, physical, callback, error_handler, 37 - operation, vio->data); 38 } 39 40 static inline void vdo_submit_flush_vio(struct vio *vio, bio_end_io_t callback, ··· 53 { 54 /* FIXME: Can we just use REQ_OP_FLUSH? */ 55 __submit_metadata_vio(vio, 0, callback, error_handler, 56 - REQ_OP_WRITE | REQ_PREFLUSH, NULL); 57 } 58 59 #endif /* VDO_IO_SUBMITTER_H */
··· 8 9 #include <linux/bio.h> 10 11 + #include "constants.h" 12 #include "types.h" 13 14 struct io_submitter; ··· 26 27 void __submit_metadata_vio(struct vio *vio, physical_block_number_t physical, 28 bio_end_io_t callback, vdo_action_fn error_handler, 29 + blk_opf_t operation, char *data, int size); 30 31 static inline void vdo_submit_metadata_vio(struct vio *vio, physical_block_number_t physical, 32 bio_end_io_t callback, vdo_action_fn error_handler, 33 blk_opf_t operation) 34 { 35 __submit_metadata_vio(vio, physical, callback, error_handler, 36 + operation, vio->data, vio->block_count * VDO_BLOCK_SIZE); 37 + } 38 + 39 + static inline void vdo_submit_metadata_vio_with_size(struct vio *vio, 40 + physical_block_number_t physical, 41 + bio_end_io_t callback, 42 + vdo_action_fn error_handler, 43 + blk_opf_t operation, 44 + int size) 45 + { 46 + __submit_metadata_vio(vio, physical, callback, error_handler, 47 + operation, vio->data, size); 48 } 49 50 static inline void vdo_submit_flush_vio(struct vio *vio, bio_end_io_t callback, ··· 41 { 42 /* FIXME: Can we just use REQ_OP_FLUSH? */ 43 __submit_metadata_vio(vio, 0, callback, error_handler, 44 + REQ_OP_WRITE | REQ_PREFLUSH, NULL, 0); 45 } 46 47 #endif /* VDO_IO_SUBMITTER_H */
+1 -1
drivers/md/dm-vdo/packer.h
··· 46 47 /* 48 * Each packer_bin holds an incomplete batch of data_vios that only partially fill a compressed 49 - * block. The bins are kept in a ring sorted by the amount of unused space so the first bin with 50 * enough space to hold a newly-compressed data_vio can easily be found. When the bin fills up or 51 * is flushed, the first uncanceled data_vio in the bin is selected to be the agent for that bin. 52 * Upon entering the packer, each data_vio already has its compressed data in the first slot of the
··· 46 47 /* 48 * Each packer_bin holds an incomplete batch of data_vios that only partially fill a compressed 49 + * block. The bins are kept in a list sorted by the amount of unused space so the first bin with 50 * enough space to hold a newly-compressed data_vio can easily be found. When the bin fills up or 51 * is flushed, the first uncanceled data_vio in the bin is selected to be the agent for that bin. 52 * Upon entering the packer, each data_vio already has its compressed data in the first slot of the
+1 -1
drivers/md/dm-vdo/priority-table.c
··· 199 200 /* 201 * Remove the entry from the bucket list, remembering a pointer to another entry in the 202 - * ring. 203 */ 204 next_entry = entry->next; 205 list_del_init(entry);
··· 199 200 /* 201 * Remove the entry from the bucket list, remembering a pointer to another entry in the 202 + * list. 203 */ 204 next_entry = entry->next; 205 list_del_init(entry);
+3 -3
drivers/md/dm-vdo/recovery-journal.h
··· 43 * has a vio which is used to commit that block to disk. The vio's data is the on-disk 44 * representation of the journal block. In addition each in-memory block has a buffer which is used 45 * to accumulate entries while a partial commit of the block is in progress. In-memory blocks are 46 - * kept on two rings. Free blocks live on the 'free_tail_blocks' ring. When a block becomes active 47 - * (see below) it is moved to the 'active_tail_blocks' ring. When a block is fully committed, it is 48 - * moved back to the 'free_tail_blocks' ring. 49 * 50 * When entries are added to the journal, they are added to the active in-memory block, as 51 * indicated by the 'active_block' field. If the caller wishes to wait for the entry to be
··· 43 * has a vio which is used to commit that block to disk. The vio's data is the on-disk 44 * representation of the journal block. In addition each in-memory block has a buffer which is used 45 * to accumulate entries while a partial commit of the block is in progress. In-memory blocks are 46 + * kept on two lists. Free blocks live on the 'free_tail_blocks' list. When a block becomes active 47 + * (see below) it is moved to the 'active_tail_blocks' list. When a block is fully committed, it is 48 + * moved back to the 'free_tail_blocks' list. 49 * 50 * When entries are added to the journal, they are added to the active in-memory block, as 51 * indicated by the 'active_block' field. If the caller wishes to wait for the entry to be
+144 -53
drivers/md/dm-vdo/slab-depot.c
··· 139 } 140 141 /** 142 - * mark_slab_journal_dirty() - Put a slab journal on the dirty ring of its allocator in the correct 143 * order. 144 * @journal: The journal to be marked dirty. 145 * @lock: The recovery journal lock held by the slab journal. ··· 414 { 415 struct slab_journal *journal = completion->parent; 416 417 - return_vio_to_pool(journal->slab->allocator->vio_pool, 418 - vio_as_pooled_vio(as_vio(vdo_forget(completion)))); 419 finish_reaping(journal); 420 reap_slab_journal(journal); 421 } ··· 697 sequence_number_t committed = get_committing_sequence_number(pooled); 698 699 list_del_init(&pooled->list_entry); 700 - return_vio_to_pool(journal->slab->allocator->vio_pool, vdo_forget(pooled)); 701 702 if (result != VDO_SUCCESS) { 703 vio_record_metadata_io_error(as_vio(completion)); ··· 821 822 /* 823 * Since we are about to commit the tail block, this journal no longer needs to be on the 824 - * ring of journals which the recovery journal might ask to commit. 825 */ 826 mark_slab_journal_clean(journal); 827 ··· 1075 /* Release the slab journal lock. */ 1076 adjust_slab_journal_block_reference(&slab->journal, 1077 block->slab_journal_lock_to_release, -1); 1078 - return_vio_to_pool(slab->allocator->vio_pool, pooled); 1079 1080 /* 1081 * We can't clear the is_writing flag earlier as releasing the slab journal lock may cause ··· 1169 struct vdo_slab *slab = ((struct reference_block *) completion->parent)->slab; 1170 1171 vio_record_metadata_io_error(vio); 1172 - return_vio_to_pool(slab->allocator->vio_pool, vio_as_pooled_vio(vio)); 1173 - slab->active_count--; 1174 vdo_enter_read_only_mode(slab->allocator->depot->vdo, result); 1175 check_if_slab_drained(slab); 1176 } ··· 1371 static void prioritize_slab(struct vdo_slab *slab) 1372 { 1373 VDO_ASSERT_LOG_ONLY(list_empty(&slab->allocq_entry), 1374 - "a slab must not already be on a ring when prioritizing"); 1375 slab->priority = calculate_slab_priority(slab); 1376 vdo_priority_table_enqueue(slab->allocator->prioritized_slabs, 1377 slab->priority, &slab->allocq_entry); ··· 2164 dirty_block(&slab->reference_blocks[i]); 2165 } 2166 2167 - /** 2168 - * clear_provisional_references() - Clear the provisional reference counts from a reference block. 2169 - * @block: The block to clear. 
2170 - */ 2171 - static void clear_provisional_references(struct reference_block *block) 2172 - { 2173 - vdo_refcount_t *counters = get_reference_counters_for_block(block); 2174 - block_count_t j; 2175 - 2176 - for (j = 0; j < COUNTS_PER_BLOCK; j++) { 2177 - if (counters[j] == PROVISIONAL_REFERENCE_COUNT) { 2178 - counters[j] = EMPTY_REFERENCE_COUNT; 2179 - block->allocated_count--; 2180 - } 2181 - } 2182 - } 2183 - 2184 static inline bool journal_points_equal(struct journal_point first, 2185 struct journal_point second) 2186 { 2187 return ((first.sequence_number == second.sequence_number) && 2188 (first.entry_count == second.entry_count)); 2189 } 2190 2191 /** ··· 2263 static void unpack_reference_block(struct packed_reference_block *packed, 2264 struct reference_block *block) 2265 { 2266 - block_count_t index; 2267 sector_count_t i; 2268 struct vdo_slab *slab = block->slab; 2269 vdo_refcount_t *counters = get_reference_counters_for_block(block); ··· 2288 } 2289 } 2290 2291 - block->allocated_count = 0; 2292 - for (index = 0; index < COUNTS_PER_BLOCK; index++) { 2293 - if (counters[index] != EMPTY_REFERENCE_COUNT) 2294 - block->allocated_count++; 2295 - } 2296 } 2297 2298 /** ··· 2301 struct pooled_vio *pooled = vio_as_pooled_vio(vio); 2302 struct reference_block *block = completion->parent; 2303 struct vdo_slab *slab = block->slab; 2304 2305 - unpack_reference_block((struct packed_reference_block *) vio->data, block); 2306 - return_vio_to_pool(slab->allocator->vio_pool, pooled); 2307 - slab->active_count--; 2308 - clear_provisional_references(block); 2309 2310 - slab->free_blocks -= block->allocated_count; 2311 check_if_slab_drained(slab); 2312 } 2313 ··· 2327 } 2328 2329 /** 2330 - * load_reference_block() - After a block waiter has gotten a VIO from the VIO pool, load the 2331 - * block. 2332 - * @waiter: The waiter of the block to load. 2333 * @context: The VIO returned by the pool. 
2334 */ 2335 - static void load_reference_block(struct vdo_waiter *waiter, void *context) 2336 { 2337 struct pooled_vio *pooled = context; 2338 struct vio *vio = &pooled->vio; 2339 struct reference_block *block = 2340 container_of(waiter, struct reference_block, waiter); 2341 - size_t block_offset = (block - block->slab->reference_blocks); 2342 2343 vio->completion.parent = block; 2344 - vdo_submit_metadata_vio(vio, block->slab->ref_counts_origin + block_offset, 2345 - load_reference_block_endio, handle_io_error, 2346 - REQ_OP_READ); 2347 } 2348 2349 /** ··· 2355 static void load_reference_blocks(struct vdo_slab *slab) 2356 { 2357 block_count_t i; 2358 2359 slab->free_blocks = slab->block_count; 2360 slab->active_count = slab->reference_block_count; 2361 - for (i = 0; i < slab->reference_block_count; i++) { 2362 struct vdo_waiter *waiter = &slab->reference_blocks[i].waiter; 2363 2364 - waiter->callback = load_reference_block; 2365 - acquire_vio_from_pool(slab->allocator->vio_pool, waiter); 2366 } 2367 } 2368 ··· 2505 initialize_journal_state(journal); 2506 } 2507 2508 - return_vio_to_pool(slab->allocator->vio_pool, vio_as_pooled_vio(vio)); 2509 vdo_finish_loading_with_result(&slab->state, allocate_counters_if_clean(slab)); 2510 } 2511 ··· 2525 struct vio *vio = as_vio(completion); 2526 2527 vio_record_metadata_io_error(vio); 2528 - return_vio_to_pool(journal->slab->allocator->vio_pool, vio_as_pooled_vio(vio)); 2529 vdo_finish_loading_with_result(&journal->slab->state, result); 2530 } 2531 ··· 2623 int result; 2624 2625 VDO_ASSERT_LOG_ONLY(list_empty(&slab->allocq_entry), 2626 - "a requeued slab must not already be on a ring"); 2627 2628 if (vdo_is_read_only(allocator->depot->vdo)) 2629 return; ··· 2776 vdo_log_info("VDO commencing normal operation"); 2777 else if (prior_state == VDO_RECOVERING) 2778 vdo_log_info("Exiting recovery mode"); 2779 } 2780 2781 /* ··· 3358 * This is a min_heap callback function orders slab_status structures using the 'is_clean' field as 3359 * the primary key and the 'emptiness' field as the secondary key. 3360 * 3361 - * Slabs need to be pushed onto the rings in the same order they are to be popped off. Popping 3362 * should always get the most empty first, so pushing should be from most empty to least empty. 3363 * Thus, the ordering is reversed from the usual sense since min_heap returns smaller elements 3364 * before larger ones. ··· 4060 struct vdo *vdo = depot->vdo; 4061 block_count_t max_free_blocks = depot->slab_config.data_blocks; 4062 unsigned int max_priority = (2 + ilog2(max_free_blocks)); 4063 4064 *allocator = (struct block_allocator) { 4065 .depot = depot, ··· 4078 return result; 4079 4080 vdo_initialize_completion(&allocator->completion, vdo, VDO_BLOCK_ALLOCATOR_COMPLETION); 4081 - result = make_vio_pool(vdo, BLOCK_ALLOCATOR_VIO_POOL_SIZE, allocator->thread_id, 4082 VIO_TYPE_SLAB_JOURNAL, VIO_PRIORITY_METADATA, 4083 allocator, &allocator->vio_pool); 4084 if (result != VDO_SUCCESS) 4085 return result; 4086 ··· 4313 uninitialize_allocator_summary(allocator); 4314 uninitialize_scrubber_vio(&allocator->scrubber); 4315 free_vio_pool(vdo_forget(allocator->vio_pool)); 4316 vdo_free_priority_table(vdo_forget(allocator->prioritized_slabs)); 4317 } 4318
··· 139 } 140 141 /** 142 + * mark_slab_journal_dirty() - Put a slab journal on the dirty list of its allocator in the correct 143 * order. 144 * @journal: The journal to be marked dirty. 145 * @lock: The recovery journal lock held by the slab journal. ··· 414 { 415 struct slab_journal *journal = completion->parent; 416 417 + return_vio_to_pool(vio_as_pooled_vio(as_vio(completion))); 418 finish_reaping(journal); 419 reap_slab_journal(journal); 420 } ··· 698 sequence_number_t committed = get_committing_sequence_number(pooled); 699 700 list_del_init(&pooled->list_entry); 701 + return_vio_to_pool(pooled); 702 703 if (result != VDO_SUCCESS) { 704 vio_record_metadata_io_error(as_vio(completion)); ··· 822 823 /* 824 * Since we are about to commit the tail block, this journal no longer needs to be on the 825 + * list of journals which the recovery journal might ask to commit. 826 */ 827 mark_slab_journal_clean(journal); 828 ··· 1076 /* Release the slab journal lock. */ 1077 adjust_slab_journal_block_reference(&slab->journal, 1078 block->slab_journal_lock_to_release, -1); 1079 + return_vio_to_pool(pooled); 1080 1081 /* 1082 * We can't clear the is_writing flag earlier as releasing the slab journal lock may cause ··· 1170 struct vdo_slab *slab = ((struct reference_block *) completion->parent)->slab; 1171 1172 vio_record_metadata_io_error(vio); 1173 + return_vio_to_pool(vio_as_pooled_vio(vio)); 1174 + slab->active_count -= vio->io_size / VDO_BLOCK_SIZE; 1175 vdo_enter_read_only_mode(slab->allocator->depot->vdo, result); 1176 check_if_slab_drained(slab); 1177 } ··· 1372 static void prioritize_slab(struct vdo_slab *slab) 1373 { 1374 VDO_ASSERT_LOG_ONLY(list_empty(&slab->allocq_entry), 1375 + "a slab must not already be on a list when prioritizing"); 1376 slab->priority = calculate_slab_priority(slab); 1377 vdo_priority_table_enqueue(slab->allocator->prioritized_slabs, 1378 slab->priority, &slab->allocq_entry); ··· 2165 dirty_block(&slab->reference_blocks[i]); 2166 } 2167 2168 static inline bool journal_points_equal(struct journal_point first, 2169 struct journal_point second) 2170 { 2171 return ((first.sequence_number == second.sequence_number) && 2172 (first.entry_count == second.entry_count)); 2173 + } 2174 + 2175 + /** 2176 + * match_bytes() - Check an 8-byte word for bytes matching the value specified 2177 + * @input: A word to examine the bytes of 2178 + * @match: The byte value sought 2179 + * 2180 + * Return: 1 in each byte when the corresponding input byte matched, 0 otherwise 2181 + */ 2182 + static inline u64 match_bytes(u64 input, u8 match) 2183 + { 2184 + u64 temp = input ^ (match * 0x0101010101010101ULL); 2185 + /* top bit of each byte is set iff top bit of temp byte is clear; rest are 0 */ 2186 + u64 test_top_bits = ~temp & 0x8080808080808080ULL; 2187 + /* top bit of each byte is set iff low 7 bits of temp byte are clear; rest are useless */ 2188 + u64 test_low_bits = 0x8080808080808080ULL - (temp & 0x7f7f7f7f7f7f7f7fULL); 2189 + /* return 1 when both tests indicate temp byte is 0 */ 2190 + return (test_top_bits & test_low_bits) >> 7; 2191 + } 2192 + 2193 + /** 2194 + * count_valid_references() - Process a newly loaded refcount array 2195 + * @counters: the array of counters from a metadata block 2196 + * 2197 + * Scan a 8-byte-aligned array of counters, fixing up any "provisional" values that weren't 2198 + * cleaned up at shutdown, changing them internally to "empty". 
2199 + * 2200 + * Return: the number of blocks that are referenced (counters not "empty") 2201 + */ 2202 + static unsigned int count_valid_references(vdo_refcount_t *counters) 2203 + { 2204 + u64 *words = (u64 *)counters; 2205 + /* It's easier to count occurrences of a specific byte than its absences. */ 2206 + unsigned int empty_count = 0; 2207 + /* For speed, we process 8 bytes at once. */ 2208 + unsigned int words_left = COUNTS_PER_BLOCK / sizeof(u64); 2209 + 2210 + /* 2211 + * Sanity check assumptions used for optimizing this code: Counters are bytes. The counter 2212 + * array is a multiple of the word size. 2213 + */ 2214 + BUILD_BUG_ON(sizeof(vdo_refcount_t) != 1); 2215 + BUILD_BUG_ON((COUNTS_PER_BLOCK % sizeof(u64)) != 0); 2216 + 2217 + while (words_left > 0) { 2218 + /* 2219 + * This is used effectively as 8 byte-size counters. Byte 0 counts how many words 2220 + * had the target value found in byte 0, etc. We just have to avoid overflow. 2221 + */ 2222 + u64 split_count = 0; 2223 + /* 2224 + * The counter "% 255" trick used below to fold split_count into empty_count 2225 + * imposes a limit of 254 bytes examined each iteration of the outer loop. We 2226 + * process a word at a time, so that limit gets rounded down to 31 u64 words. 2227 + */ 2228 + const unsigned int max_words_per_iteration = 254 / sizeof(u64); 2229 + unsigned int iter_words_left = min_t(unsigned int, words_left, 2230 + max_words_per_iteration); 2231 + 2232 + words_left -= iter_words_left; 2233 + 2234 + while (iter_words_left--) { 2235 + u64 word = *words; 2236 + u64 temp; 2237 + 2238 + /* First, if we have any provisional refcount values, clear them. */ 2239 + temp = match_bytes(word, PROVISIONAL_REFERENCE_COUNT); 2240 + if (temp) { 2241 + /* 2242 + * 'temp' has 0x01 bytes where 'word' has PROVISIONAL; this xor 2243 + * will alter just those bytes, changing PROVISIONAL to EMPTY. 2244 + */ 2245 + word ^= temp * (PROVISIONAL_REFERENCE_COUNT ^ EMPTY_REFERENCE_COUNT); 2246 + *words = word; 2247 + } 2248 + 2249 + /* Now count the EMPTY_REFERENCE_COUNT bytes, updating the 8 counters. */ 2250 + split_count += match_bytes(word, EMPTY_REFERENCE_COUNT); 2251 + words++; 2252 + } 2253 + empty_count += split_count % 255; 2254 + } 2255 + 2256 + return COUNTS_PER_BLOCK - empty_count; 2257 } 2258 2259 /** ··· 2197 static void unpack_reference_block(struct packed_reference_block *packed, 2198 struct reference_block *block) 2199 { 2200 sector_count_t i; 2201 struct vdo_slab *slab = block->slab; 2202 vdo_refcount_t *counters = get_reference_counters_for_block(block); ··· 2223 } 2224 } 2225 2226 + block->allocated_count = count_valid_references(counters); 2227 } 2228 2229 /** ··· 2240 struct pooled_vio *pooled = vio_as_pooled_vio(vio); 2241 struct reference_block *block = completion->parent; 2242 struct vdo_slab *slab = block->slab; 2243 + unsigned int block_count = vio->io_size / VDO_BLOCK_SIZE; 2244 + unsigned int i; 2245 + char *data = vio->data; 2246 2247 + for (i = 0; i < block_count; i++, block++, data += VDO_BLOCK_SIZE) { 2248 + struct packed_reference_block *packed = (struct packed_reference_block *) data; 2249 2250 + unpack_reference_block(packed, block); 2251 + slab->free_blocks -= block->allocated_count; 2252 + } 2253 + return_vio_to_pool(pooled); 2254 + slab->active_count -= block_count; 2255 + 2256 check_if_slab_drained(slab); 2257 } 2258 ··· 2260 } 2261 2262 /** 2263 + * load_reference_block_group() - After a block waiter has gotten a VIO from the VIO pool, load 2264 + * a set of blocks. 
2265 + * @waiter: The waiter of the first block to load. 2266 * @context: The VIO returned by the pool. 2267 */ 2268 + static void load_reference_block_group(struct vdo_waiter *waiter, void *context) 2269 { 2270 struct pooled_vio *pooled = context; 2271 struct vio *vio = &pooled->vio; 2272 struct reference_block *block = 2273 container_of(waiter, struct reference_block, waiter); 2274 + u32 block_offset = block - block->slab->reference_blocks; 2275 + u32 max_block_count = block->slab->reference_block_count - block_offset; 2276 + u32 block_count = min_t(int, vio->block_count, max_block_count); 2277 2278 vio->completion.parent = block; 2279 + vdo_submit_metadata_vio_with_size(vio, block->slab->ref_counts_origin + block_offset, 2280 + load_reference_block_endio, handle_io_error, 2281 + REQ_OP_READ, block_count * VDO_BLOCK_SIZE); 2282 } 2283 2284 /** ··· 2286 static void load_reference_blocks(struct vdo_slab *slab) 2287 { 2288 block_count_t i; 2289 + u64 blocks_per_vio = slab->allocator->refcount_blocks_per_big_vio; 2290 + struct vio_pool *pool = slab->allocator->refcount_big_vio_pool; 2291 + 2292 + if (!pool) { 2293 + pool = slab->allocator->vio_pool; 2294 + blocks_per_vio = 1; 2295 + } 2296 2297 slab->free_blocks = slab->block_count; 2298 slab->active_count = slab->reference_block_count; 2299 + for (i = 0; i < slab->reference_block_count; i += blocks_per_vio) { 2300 struct vdo_waiter *waiter = &slab->reference_blocks[i].waiter; 2301 2302 + waiter->callback = load_reference_block_group; 2303 + acquire_vio_from_pool(pool, waiter); 2304 } 2305 } 2306 ··· 2429 initialize_journal_state(journal); 2430 } 2431 2432 + return_vio_to_pool(vio_as_pooled_vio(vio)); 2433 vdo_finish_loading_with_result(&slab->state, allocate_counters_if_clean(slab)); 2434 } 2435 ··· 2449 struct vio *vio = as_vio(completion); 2450 2451 vio_record_metadata_io_error(vio); 2452 + return_vio_to_pool(vio_as_pooled_vio(vio)); 2453 vdo_finish_loading_with_result(&journal->slab->state, result); 2454 } 2455 ··· 2547 int result; 2548 2549 VDO_ASSERT_LOG_ONLY(list_empty(&slab->allocq_entry), 2550 + "a requeued slab must not already be on a list"); 2551 2552 if (vdo_is_read_only(allocator->depot->vdo)) 2553 return; ··· 2700 vdo_log_info("VDO commencing normal operation"); 2701 else if (prior_state == VDO_RECOVERING) 2702 vdo_log_info("Exiting recovery mode"); 2703 + free_vio_pool(vdo_forget(allocator->refcount_big_vio_pool)); 2704 } 2705 2706 /* ··· 3281 * This is a min_heap callback function orders slab_status structures using the 'is_clean' field as 3282 * the primary key and the 'emptiness' field as the secondary key. 3283 * 3284 + * Slabs need to be pushed onto the lists in the same order they are to be popped off. Popping 3285 * should always get the most empty first, so pushing should be from most empty to least empty. 3286 * Thus, the ordering is reversed from the usual sense since min_heap returns smaller elements 3287 * before larger ones. 
··· 3983 struct vdo *vdo = depot->vdo; 3984 block_count_t max_free_blocks = depot->slab_config.data_blocks; 3985 unsigned int max_priority = (2 + ilog2(max_free_blocks)); 3986 + u32 reference_block_count, refcount_reads_needed, refcount_blocks_per_vio; 3987 3988 *allocator = (struct block_allocator) { 3989 .depot = depot, ··· 4000 return result; 4001 4002 vdo_initialize_completion(&allocator->completion, vdo, VDO_BLOCK_ALLOCATOR_COMPLETION); 4003 + result = make_vio_pool(vdo, BLOCK_ALLOCATOR_VIO_POOL_SIZE, 1, allocator->thread_id, 4004 VIO_TYPE_SLAB_JOURNAL, VIO_PRIORITY_METADATA, 4005 allocator, &allocator->vio_pool); 4006 + if (result != VDO_SUCCESS) 4007 + return result; 4008 + 4009 + /* Initialize the refcount-reading vio pool. */ 4010 + reference_block_count = vdo_get_saved_reference_count_size(depot->slab_config.slab_blocks); 4011 + refcount_reads_needed = DIV_ROUND_UP(reference_block_count, MAX_BLOCKS_PER_VIO); 4012 + refcount_blocks_per_vio = DIV_ROUND_UP(reference_block_count, refcount_reads_needed); 4013 + allocator->refcount_blocks_per_big_vio = refcount_blocks_per_vio; 4014 + result = make_vio_pool(vdo, BLOCK_ALLOCATOR_REFCOUNT_VIO_POOL_SIZE, 4015 + allocator->refcount_blocks_per_big_vio, allocator->thread_id, 4016 + VIO_TYPE_SLAB_JOURNAL, VIO_PRIORITY_METADATA, 4017 + NULL, &allocator->refcount_big_vio_pool); 4018 if (result != VDO_SUCCESS) 4019 return result; 4020 ··· 4223 uninitialize_allocator_summary(allocator); 4224 uninitialize_scrubber_vio(&allocator->scrubber); 4225 free_vio_pool(vdo_forget(allocator->vio_pool)); 4226 + free_vio_pool(vdo_forget(allocator->refcount_big_vio_pool)); 4227 vdo_free_priority_table(vdo_forget(allocator->prioritized_slabs)); 4228 } 4229
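The new match_bytes()/count_valid_references() pair replaces the removed byte-at-a-time clear_provisional_references() scan and allocated-count loop with a word-at-a-time (SWAR) pass. The same technique can be exercised in isolation; the sketch below is a userspace illustration with made-up names and an arbitrary match byte, not the kernel code, and it shows why folding the per-lane counters with "% 255" is exact as long as at most 31 words are accumulated per fold:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Same SWAR trick as match_bytes() above: returns 0x01 in each byte of
     * the result whose corresponding byte of 'input' equals 'match'. */
    static inline uint64_t match_bytes_demo(uint64_t input, uint8_t match)
    {
            uint64_t temp = input ^ (match * 0x0101010101010101ULL);
            uint64_t top = ~temp & 0x8080808080808080ULL;
            uint64_t low = 0x8080808080808080ULL - (temp & 0x7f7f7f7f7f7f7f7fULL);

            return (top & low) >> 7;
    }

    /* Count occurrences of 'match' in a word-aligned buffer. Because
     * 256 == 1 (mod 255), a 64-bit sum of per-byte counters taken modulo
     * 255 equals the sum of its eight byte lanes, and that is exact while
     * the total stays below 255; hence at most 31 words (248 possible
     * matches) are accumulated before each fold. */
    static unsigned int count_bytes_demo(const uint64_t *words, size_t n_words,
                                         uint8_t match)
    {
            unsigned int count = 0;

            while (n_words > 0) {
                    size_t chunk = n_words < 31 ? n_words : 31;
                    uint64_t split_count = 0;

                    n_words -= chunk;
                    while (chunk-- > 0)
                            split_count += match_bytes_demo(*words++, match);
                    count += split_count % 255;
            }
            return count;
    }

    int main(void)
    {
            uint64_t buf[64];

            memset(buf, 0xff, sizeof(buf));         /* 512 matching bytes */
            ((uint8_t *)buf)[7] = 0x00;             /* knock one out */
            printf("%u\n", count_bytes_demo(buf, 64, 0xff));  /* prints 511 */
            return 0;
    }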
+12 -1
drivers/md/dm-vdo/slab-depot.h
··· 45 enum { 46 /* The number of vios in the vio pool is proportional to the throughput of the VDO. */ 47 BLOCK_ALLOCATOR_VIO_POOL_SIZE = 128, 48 }; 49 50 /* ··· 255 256 /* A list of the dirty blocks waiting to be written out */ 257 struct vdo_wait_queue dirty_blocks; 258 - /* The number of blocks which are currently writing */ 259 size_t active_count; 260 261 /* A waiter object for updating the slab summary */ ··· 432 433 /* The vio pool for reading and writing block allocator metadata */ 434 struct vio_pool *vio_pool; 435 /* The dm_kcopyd client for erasing slab journals */ 436 struct dm_kcopyd_client *eraser; 437 /* Iterator over the slabs to be erased */
··· 45 enum { 46 /* The number of vios in the vio pool is proportional to the throughput of the VDO. */ 47 BLOCK_ALLOCATOR_VIO_POOL_SIZE = 128, 48 + 49 + /* 50 + * The number of vios in the vio pool used for loading reference count data. A slab's 51 + * refcounts is capped at ~8MB, and we process one at a time in a zone, so 9 should be 52 + * plenty. 53 + */ 54 + BLOCK_ALLOCATOR_REFCOUNT_VIO_POOL_SIZE = 9, 55 }; 56 57 /* ··· 248 249 /* A list of the dirty blocks waiting to be written out */ 250 struct vdo_wait_queue dirty_blocks; 251 + /* The number of blocks which are currently reading or writing */ 252 size_t active_count; 253 254 /* A waiter object for updating the slab summary */ ··· 425 426 /* The vio pool for reading and writing block allocator metadata */ 427 struct vio_pool *vio_pool; 428 + /* The vio pool for large initial reads of ref count areas */ 429 + struct vio_pool *refcount_big_vio_pool; 430 + /* How many ref count blocks are read per vio at initial load */ 431 + u32 refcount_blocks_per_big_vio; 432 /* The dm_kcopyd client for erasing slab journals */ 433 struct dm_kcopyd_client *eraser; 434 /* Iterator over the slabs to be erased */
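The pool-size comment above works together with the DIV_ROUND_UP() sizing in the block allocator initialization shown earlier in slab-depot.c: reads are spread evenly across the minimum number of vios instead of always issuing maximal vios followed by a short tail. A worked example with made-up numbers (both input values below are assumptions for illustration, not taken from the code):

    reference_block_count   = 1000   /* hypothetical slab */
    MAX_BLOCKS_PER_VIO      = 256    /* hypothetical per-vio limit */

    refcount_reads_needed   = DIV_ROUND_UP(1000, 256) = 4
    refcount_blocks_per_vio = DIV_ROUND_UP(1000, 4)   = 250

so the load issues four 250-block reads rather than three 256-block reads plus a 232-block tail.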
+3
drivers/md/dm-vdo/types.h
··· 376 /* The size of this vio in blocks */ 377 unsigned int block_count; 378 379 /* The data being read or written. */ 380 char *data; 381
··· 376 /* The size of this vio in blocks */ 377 unsigned int block_count; 378 379 + /* The amount of data to be read or written, in bytes */ 380 + unsigned int io_size; 381 + 382 /* The data being read or written. */ 383 char *data; 384
+1 -10
drivers/md/dm-vdo/vdo.c
··· 31 32 #include <linux/completion.h> 33 #include <linux/device-mapper.h> 34 - #include <linux/kernel.h> 35 #include <linux/lz4.h> 36 - #include <linux/module.h> 37 #include <linux/mutex.h> 38 #include <linux/spinlock.h> 39 #include <linux/types.h> ··· 139 { 140 vdo_unregister_allocating_thread(); 141 } 142 - 143 - #ifdef MODULE 144 - #define MODULE_NAME THIS_MODULE->name 145 - #else 146 - #define MODULE_NAME "dm-vdo" 147 - #endif /* MODULE */ 148 149 static const struct vdo_work_queue_type default_queue_type = { 150 .start = start_vdo_request_queue, ··· 551 *vdo_ptr = vdo; 552 553 snprintf(vdo->thread_name_prefix, sizeof(vdo->thread_name_prefix), 554 - "%s%u", MODULE_NAME, instance); 555 - BUG_ON(vdo->thread_name_prefix[0] == '\0'); 556 result = vdo_allocate(vdo->thread_config.thread_count, 557 struct vdo_thread, __func__, &vdo->threads); 558 if (result != VDO_SUCCESS) {
··· 31 32 #include <linux/completion.h> 33 #include <linux/device-mapper.h> 34 #include <linux/lz4.h> 35 #include <linux/mutex.h> 36 #include <linux/spinlock.h> 37 #include <linux/types.h> ··· 141 { 142 vdo_unregister_allocating_thread(); 143 } 144 145 static const struct vdo_work_queue_type default_queue_type = { 146 .start = start_vdo_request_queue, ··· 559 *vdo_ptr = vdo; 560 561 snprintf(vdo->thread_name_prefix, sizeof(vdo->thread_name_prefix), 562 + "vdo%u", instance); 563 result = vdo_allocate(vdo->thread_config.thread_count, 564 struct vdo_thread, __func__, &vdo->threads); 565 if (result != VDO_SUCCESS) {
+33 -21
drivers/md/dm-vdo/vio.c
··· 188 189 /* 190 * Prepares the bio to perform IO with the specified buffer. May only be used on a VDO-allocated 191 - * bio, as it assumes the bio wraps a 4k buffer that is 4k aligned, but there does not have to be a 192 - * vio associated with the bio. 193 */ 194 int vio_reset_bio(struct vio *vio, char *data, bio_end_io_t callback, 195 blk_opf_t bi_opf, physical_block_number_t pbn) 196 { 197 - int bvec_count, offset, len, i; 198 struct bio *bio = vio->bio; 199 200 bio_reset(bio, bio->bi_bdev, bi_opf); 201 vdo_set_bio_properties(bio, vio, callback, bi_opf, pbn); ··· 214 bio->bi_ioprio = 0; 215 bio->bi_io_vec = bio->bi_inline_vecs; 216 bio->bi_max_vecs = vio->block_count + 1; 217 - len = VDO_BLOCK_SIZE * vio->block_count; 218 offset = offset_in_page(data); 219 - bvec_count = DIV_ROUND_UP(offset + len, PAGE_SIZE); 220 221 - /* 222 - * If we knew that data was always on one page, or contiguous pages, we wouldn't need the 223 - * loop. But if we're using vmalloc, it's not impossible that the data is in different 224 - * pages that can't be merged in bio_add_page... 225 - */ 226 - for (i = 0; (i < bvec_count) && (len > 0); i++) { 227 struct page *page; 228 int bytes_added; 229 int bytes = PAGE_SIZE - offset; 230 231 - if (bytes > len) 232 - bytes = len; 233 234 page = is_vmalloc_addr(data) ? vmalloc_to_page(data) : virt_to_page(data); 235 bytes_added = bio_add_page(bio, page, bytes, offset); ··· 240 } 241 242 data += bytes; 243 - len -= bytes; 244 offset = 0; 245 } 246 ··· 309 * make_vio_pool() - Create a new vio pool. 310 * @vdo: The vdo. 311 * @pool_size: The number of vios in the pool. 312 * @thread_id: The ID of the thread using this pool. 313 * @vio_type: The type of vios in the pool. 314 * @priority: The priority with which vios from the pool should be enqueued. ··· 318 * 319 * Return: A success or error code. 320 */ 321 - int make_vio_pool(struct vdo *vdo, size_t pool_size, thread_id_t thread_id, 322 enum vio_type vio_type, enum vio_priority priority, void *context, 323 struct vio_pool **pool_ptr) 324 { 325 struct vio_pool *pool; 326 char *ptr; 327 int result; 328 329 result = vdo_allocate_extended(struct vio_pool, pool_size, struct pooled_vio, 330 __func__, &pool); ··· 336 INIT_LIST_HEAD(&pool->available); 337 INIT_LIST_HEAD(&pool->busy); 338 339 - result = vdo_allocate(pool_size * VDO_BLOCK_SIZE, char, 340 "VIO pool buffer", &pool->buffer); 341 if (result != VDO_SUCCESS) { 342 free_vio_pool(pool); ··· 344 } 345 346 ptr = pool->buffer; 347 - for (pool->size = 0; pool->size < pool_size; pool->size++, ptr += VDO_BLOCK_SIZE) { 348 struct pooled_vio *pooled = &pool->vios[pool->size]; 349 350 - result = allocate_vio_components(vdo, vio_type, priority, NULL, 1, ptr, 351 &pooled->vio); 352 if (result != VDO_SUCCESS) { 353 free_vio_pool(pool); ··· 355 } 356 357 pooled->context = context; 358 list_add_tail(&pooled->pool_entry, &pool->available); 359 } 360 ··· 430 } 431 432 /** 433 - * return_vio_to_pool() - Return a vio to the pool 434 - * @pool: The vio pool. 435 * @vio: The pooled vio to return. 436 */ 437 - void return_vio_to_pool(struct vio_pool *pool, struct pooled_vio *vio) 438 { 439 VDO_ASSERT_LOG_ONLY((pool->thread_id == vdo_get_callback_thread_id()), 440 "vio pool entry returned on same thread as it was acquired"); 441
··· 188 189 /* 190 * Prepares the bio to perform IO with the specified buffer. May only be used on a VDO-allocated 191 + * bio, as it assumes the bio wraps a 4k-multiple buffer that is 4k aligned, but there does not 192 + * have to be a vio associated with the bio. 193 */ 194 int vio_reset_bio(struct vio *vio, char *data, bio_end_io_t callback, 195 blk_opf_t bi_opf, physical_block_number_t pbn) 196 { 197 + return vio_reset_bio_with_size(vio, data, vio->block_count * VDO_BLOCK_SIZE, 198 + callback, bi_opf, pbn); 199 + } 200 + 201 + int vio_reset_bio_with_size(struct vio *vio, char *data, int size, bio_end_io_t callback, 202 + blk_opf_t bi_opf, physical_block_number_t pbn) 203 + { 204 + int bvec_count, offset, i; 205 struct bio *bio = vio->bio; 206 + int vio_size = vio->block_count * VDO_BLOCK_SIZE; 207 + int remaining; 208 209 bio_reset(bio, bio->bi_bdev, bi_opf); 210 vdo_set_bio_properties(bio, vio, callback, bi_opf, pbn); ··· 205 bio->bi_ioprio = 0; 206 bio->bi_io_vec = bio->bi_inline_vecs; 207 bio->bi_max_vecs = vio->block_count + 1; 208 + if (VDO_ASSERT(size <= vio_size, "specified size %d is not greater than allocated %d", 209 + size, vio_size) != VDO_SUCCESS) 210 + size = vio_size; 211 + vio->io_size = size; 212 offset = offset_in_page(data); 213 + bvec_count = DIV_ROUND_UP(offset + size, PAGE_SIZE); 214 + remaining = size; 215 216 + for (i = 0; (i < bvec_count) && (remaining > 0); i++) { 217 struct page *page; 218 int bytes_added; 219 int bytes = PAGE_SIZE - offset; 220 221 + if (bytes > remaining) 222 + bytes = remaining; 223 224 page = is_vmalloc_addr(data) ? vmalloc_to_page(data) : virt_to_page(data); 225 bytes_added = bio_add_page(bio, page, bytes, offset); ··· 232 } 233 234 data += bytes; 235 + remaining -= bytes; 236 offset = 0; 237 } 238 ··· 301 * make_vio_pool() - Create a new vio pool. 302 * @vdo: The vdo. 303 * @pool_size: The number of vios in the pool. 304 + * @block_count: The number of 4k blocks per vio. 305 * @thread_id: The ID of the thread using this pool. 306 * @vio_type: The type of vios in the pool. 307 * @priority: The priority with which vios from the pool should be enqueued. ··· 309 * 310 * Return: A success or error code. 311 */ 312 + int make_vio_pool(struct vdo *vdo, size_t pool_size, size_t block_count, thread_id_t thread_id, 313 enum vio_type vio_type, enum vio_priority priority, void *context, 314 struct vio_pool **pool_ptr) 315 { 316 struct vio_pool *pool; 317 char *ptr; 318 int result; 319 + size_t per_vio_size = VDO_BLOCK_SIZE * block_count; 320 321 result = vdo_allocate_extended(struct vio_pool, pool_size, struct pooled_vio, 322 __func__, &pool); ··· 326 INIT_LIST_HEAD(&pool->available); 327 INIT_LIST_HEAD(&pool->busy); 328 329 + result = vdo_allocate(pool_size * per_vio_size, char, 330 "VIO pool buffer", &pool->buffer); 331 if (result != VDO_SUCCESS) { 332 free_vio_pool(pool); ··· 334 } 335 336 ptr = pool->buffer; 337 + for (pool->size = 0; pool->size < pool_size; pool->size++, ptr += per_vio_size) { 338 struct pooled_vio *pooled = &pool->vios[pool->size]; 339 340 + result = allocate_vio_components(vdo, vio_type, priority, NULL, block_count, ptr, 341 &pooled->vio); 342 if (result != VDO_SUCCESS) { 343 free_vio_pool(pool); ··· 345 } 346 347 pooled->context = context; 348 + pooled->pool = pool; 349 list_add_tail(&pooled->pool_entry, &pool->available); 350 } 351 ··· 419 } 420 421 /** 422 + * return_vio_to_pool() - Return a vio to its pool 423 * @vio: The pooled vio to return. 
424 */ 425 + void return_vio_to_pool(struct pooled_vio *vio) 426 { 427 + struct vio_pool *pool = vio->pool; 428 + 429 VDO_ASSERT_LOG_ONLY((pool->thread_id == vdo_get_callback_thread_id()), 430 "vio pool entry returned on same thread as it was acquired"); 431
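With the extra block_count parameter and the pool back-pointer stored in each pooled_vio, callers size the pool's buffers up front and no longer have to name the pool when returning a vio. A sketch of how the new entry points fit together, based on the signatures above; the numeric values and the surrounding variables (vdo, thread_id, endio, pbn) are placeholders, and error handling is elided:

    struct vio_pool *pool;
    int result;

    /* Eight pooled vios, each backed by a 16-block (64 KiB) buffer. */
    result = make_vio_pool(vdo, 8, 16, thread_id, VIO_TYPE_SLAB_JOURNAL,
                           VIO_PRIORITY_METADATA, NULL, &pool);

    /* Later, from the waiter callback that received 'pooled' via
     * acquire_vio_from_pool(): issue a read covering only the first
     * three blocks of the buffer. */
    result = vio_reset_bio_with_size(&pooled->vio, pooled->vio.data,
                                     3 * VDO_BLOCK_SIZE, endio,
                                     REQ_OP_READ, pbn);

    /* When finished, the vio itself knows which pool it came from. */
    return_vio_to_pool(pooled);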
+9 -4
drivers/md/dm-vdo/vio.h
··· 30 void *context; 31 /* The list entry used by the pool */ 32 struct list_head pool_entry; 33 }; 34 35 /** ··· 125 126 int vio_reset_bio(struct vio *vio, char *data, bio_end_io_t callback, 127 blk_opf_t bi_opf, physical_block_number_t pbn); 128 129 void update_vio_error_stats(struct vio *vio, const char *format, ...) 130 __printf(2, 3); ··· 192 193 struct vio_pool; 194 195 - int __must_check make_vio_pool(struct vdo *vdo, size_t pool_size, thread_id_t thread_id, 196 - enum vio_type vio_type, enum vio_priority priority, 197 - void *context, struct vio_pool **pool_ptr); 198 void free_vio_pool(struct vio_pool *pool); 199 bool __must_check is_vio_pool_busy(struct vio_pool *pool); 200 void acquire_vio_from_pool(struct vio_pool *pool, struct vdo_waiter *waiter); 201 - void return_vio_to_pool(struct vio_pool *pool, struct pooled_vio *vio); 202 203 #endif /* VIO_H */
··· 30 void *context; 31 /* The list entry used by the pool */ 32 struct list_head pool_entry; 33 + /* The pool this vio is allocated from */ 34 + struct vio_pool *pool; 35 }; 36 37 /** ··· 123 124 int vio_reset_bio(struct vio *vio, char *data, bio_end_io_t callback, 125 blk_opf_t bi_opf, physical_block_number_t pbn); 126 + int vio_reset_bio_with_size(struct vio *vio, char *data, int size, bio_end_io_t callback, 127 + blk_opf_t bi_opf, physical_block_number_t pbn); 128 129 void update_vio_error_stats(struct vio *vio, const char *format, ...) 130 __printf(2, 3); ··· 188 189 struct vio_pool; 190 191 + int __must_check make_vio_pool(struct vdo *vdo, size_t pool_size, size_t block_count, 192 + thread_id_t thread_id, enum vio_type vio_type, 193 + enum vio_priority priority, void *context, 194 + struct vio_pool **pool_ptr); 195 void free_vio_pool(struct vio_pool *pool); 196 bool __must_check is_vio_pool_busy(struct vio_pool *pool); 197 void acquire_vio_from_pool(struct vio_pool *pool, struct vdo_waiter *waiter); 198 + void return_vio_to_pool(struct pooled_vio *vio); 199 200 #endif /* VIO_H */
+1 -1
drivers/md/dm-vdo/wait-queue.c
··· 34 waitq->last_waiter->next_waiter = waiter; 35 } 36 37 - /* In both cases, the waiter we added to the ring becomes the last waiter. */ 38 waitq->last_waiter = waiter; 39 waitq->length += 1; 40 }
··· 34 waitq->last_waiter->next_waiter = waiter; 35 } 36 37 + /* In both cases, the waiter we added to the list becomes the last waiter. */ 38 waitq->last_waiter = waiter; 39 waitq->length += 1; 40 }
+55 -7
drivers/md/dm-verity-target.c
··· 30 #define DM_VERITY_ENV_VAR_NAME "DM_VERITY_ERR_BLOCK_NR" 31 32 #define DM_VERITY_DEFAULT_PREFETCH_SIZE 262144 33 34 #define DM_VERITY_MAX_CORRUPTED_ERRS 100 35 ··· 49 static unsigned int dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE; 50 51 module_param_named(prefetch_cluster, dm_verity_prefetch_cluster, uint, 0644); 52 53 static DEFINE_STATIC_KEY_FALSE(use_bh_wq_enabled); 54 ··· 321 322 if (static_branch_unlikely(&use_bh_wq_enabled) && io->in_bh) { 323 data = dm_bufio_get(v->bufio, hash_block, &buf); 324 - if (data == NULL) { 325 /* 326 * In tasklet and the hash was not in the bufio cache. 327 * Return early and resume execution from a work-queue ··· 334 &buf, bio->bi_ioprio); 335 } 336 337 - if (IS_ERR(data)) 338 - return PTR_ERR(data); 339 340 aux = dm_bufio_get_aux_data(buf); 341 ··· 392 } 393 } 394 395 data += offset; 396 memcpy(want_digest, data, v->digest_size); 397 r = 0; ··· 679 verity_finish_io(io, errno_to_blk_status(err)); 680 } 681 682 static void verity_end_io(struct bio *bio) 683 { 684 struct dm_verity_io *io = bio->bi_private; 685 686 if (bio->bi_status && 687 (!verity_fec_is_enabled(io->v) || ··· 699 return; 700 } 701 702 - if (static_branch_unlikely(&use_bh_wq_enabled) && io->v->use_bh_wq) { 703 - INIT_WORK(&io->bh_work, verity_bh_work); 704 - queue_work(system_bh_wq, &io->bh_work); 705 } else { 706 INIT_WORK(&io->work, verity_work); 707 queue_work(io->v->verify_wq, &io->work); ··· 834 submit_bio_noacct(bio); 835 836 return DM_MAPIO_SUBMITTED; 837 } 838 839 /* ··· 1808 .name = "verity", 1809 /* Note: the LSMs depend on the singleton and immutable features */ 1810 .features = DM_TARGET_SINGLETON | DM_TARGET_IMMUTABLE, 1811 - .version = {1, 10, 0}, 1812 .module = THIS_MODULE, 1813 .ctr = verity_ctr, 1814 .dtr = verity_dtr, 1815 .map = verity_map, 1816 .status = verity_status, 1817 .prepare_ioctl = verity_prepare_ioctl, 1818 .iterate_devices = verity_iterate_devices,
··· 30 #define DM_VERITY_ENV_VAR_NAME "DM_VERITY_ERR_BLOCK_NR" 31 32 #define DM_VERITY_DEFAULT_PREFETCH_SIZE 262144 33 + #define DM_VERITY_USE_BH_DEFAULT_BYTES 8192 34 35 #define DM_VERITY_MAX_CORRUPTED_ERRS 100 36 ··· 48 static unsigned int dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE; 49 50 module_param_named(prefetch_cluster, dm_verity_prefetch_cluster, uint, 0644); 51 + 52 + static unsigned int dm_verity_use_bh_bytes[4] = { 53 + DM_VERITY_USE_BH_DEFAULT_BYTES, // IOPRIO_CLASS_NONE 54 + DM_VERITY_USE_BH_DEFAULT_BYTES, // IOPRIO_CLASS_RT 55 + DM_VERITY_USE_BH_DEFAULT_BYTES, // IOPRIO_CLASS_BE 56 + 0 // IOPRIO_CLASS_IDLE 57 + }; 58 + 59 + module_param_array_named(use_bh_bytes, dm_verity_use_bh_bytes, uint, NULL, 0644); 60 61 static DEFINE_STATIC_KEY_FALSE(use_bh_wq_enabled); 62 ··· 311 312 if (static_branch_unlikely(&use_bh_wq_enabled) && io->in_bh) { 313 data = dm_bufio_get(v->bufio, hash_block, &buf); 314 + if (IS_ERR_OR_NULL(data)) { 315 /* 316 * In tasklet and the hash was not in the bufio cache. 317 * Return early and resume execution from a work-queue ··· 324 &buf, bio->bi_ioprio); 325 } 326 327 + if (IS_ERR(data)) { 328 + if (skip_unverified) 329 + return 1; 330 + r = PTR_ERR(data); 331 + data = dm_bufio_new(v->bufio, hash_block, &buf); 332 + if (IS_ERR(data)) 333 + return r; 334 + if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_METADATA, 335 + hash_block, data) == 0) { 336 + aux = dm_bufio_get_aux_data(buf); 337 + aux->hash_verified = 1; 338 + goto release_ok; 339 + } else { 340 + dm_bufio_release(buf); 341 + dm_bufio_forget(v->bufio, hash_block); 342 + return r; 343 + } 344 + } 345 346 aux = dm_bufio_get_aux_data(buf); 347 ··· 366 } 367 } 368 369 + release_ok: 370 data += offset; 371 memcpy(want_digest, data, v->digest_size); 372 r = 0; ··· 652 verity_finish_io(io, errno_to_blk_status(err)); 653 } 654 655 + static inline bool verity_use_bh(unsigned int bytes, unsigned short ioprio) 656 + { 657 + return ioprio <= IOPRIO_CLASS_IDLE && 658 + bytes <= READ_ONCE(dm_verity_use_bh_bytes[ioprio]); 659 + } 660 + 661 static void verity_end_io(struct bio *bio) 662 { 663 struct dm_verity_io *io = bio->bi_private; 664 + unsigned short ioprio = IOPRIO_PRIO_CLASS(bio->bi_ioprio); 665 + unsigned int bytes = io->n_blocks << io->v->data_dev_block_bits; 666 667 if (bio->bi_status && 668 (!verity_fec_is_enabled(io->v) || ··· 664 return; 665 } 666 667 + if (static_branch_unlikely(&use_bh_wq_enabled) && io->v->use_bh_wq && 668 + verity_use_bh(bytes, ioprio)) { 669 + if (in_hardirq() || irqs_disabled()) { 670 + INIT_WORK(&io->bh_work, verity_bh_work); 671 + queue_work(system_bh_wq, &io->bh_work); 672 + } else { 673 + verity_bh_work(&io->bh_work); 674 + } 675 } else { 676 INIT_WORK(&io->work, verity_work); 677 queue_work(io->v->verify_wq, &io->work); ··· 794 submit_bio_noacct(bio); 795 796 return DM_MAPIO_SUBMITTED; 797 + } 798 + 799 + static void verity_postsuspend(struct dm_target *ti) 800 + { 801 + struct dm_verity *v = ti->private; 802 + flush_workqueue(v->verify_wq); 803 + dm_bufio_client_reset(v->bufio); 804 } 805 806 /* ··· 1761 .name = "verity", 1762 /* Note: the LSMs depend on the singleton and immutable features */ 1763 .features = DM_TARGET_SINGLETON | DM_TARGET_IMMUTABLE, 1764 + .version = {1, 11, 0}, 1765 .module = THIS_MODULE, 1766 .ctr = verity_ctr, 1767 .dtr = verity_dtr, 1768 .map = verity_map, 1769 + .postsuspend = verity_postsuspend, 1770 .status = verity_status, 1771 .prepare_ioctl = verity_prepare_ioctl, 1772 .iterate_devices = verity_iterate_devices,
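The new use_bh_bytes thresholds are indexed by I/O priority class, and verity_use_bh() compares them against the total size of the request being verified. Assuming the common 4 KiB data block size (an illustration only; the real shift is data_dev_block_bits), the defaults work out as:

    2-block read, IOPRIO_CLASS_BE:    8192 <= 8192 -> may complete in bh context
                                                      (when use_bh_wq is enabled and
                                                       not in hard-irq context)
    3-block read, IOPRIO_CLASS_BE:   12288 >  8192 -> deferred to the workqueue
    any read,     IOPRIO_CLASS_IDLE:  threshold 0  -> always deferred

Because the parameter array is writable (0644), the four per-class limits can be tuned at runtime, typically via /sys/module/dm_verity/parameters/use_bh_bytes.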
+6 -2
drivers/md/dm.c
··· 1540 { 1541 struct dm_table *t = ci->map; 1542 struct bio flush_bio; 1543 1544 /* 1545 * Use an on-stack bio for this, it's safe since we don't 1546 * need to reference it after submit. It's just used as 1547 * the basis for the clone(s). 1548 */ 1549 - bio_init(&flush_bio, ci->io->md->disk->part0, NULL, 0, 1550 - REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC); 1551 1552 ci->bio = &flush_bio; 1553 ci->sector_count = 0;
··· 1540 { 1541 struct dm_table *t = ci->map; 1542 struct bio flush_bio; 1543 + blk_opf_t opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC; 1544 + 1545 + if ((ci->io->orig_bio->bi_opf & (REQ_IDLE | REQ_SYNC)) == 1546 + (REQ_IDLE | REQ_SYNC)) 1547 + opf |= REQ_IDLE; 1548 1549 /* 1550 * Use an on-stack bio for this, it's safe since we don't 1551 * need to reference it after submit. It's just used as 1552 * the basis for the clone(s). 1553 */ 1554 + bio_init(&flush_bio, ci->io->md->disk->part0, NULL, 0, opf); 1555 1556 ci->bio = &flush_bio; 1557 ci->sector_count = 0;
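The flush clone used to be built with REQ_SYNC but never REQ_IDLE; blk-wbt generally exempts writes carrying both REQ_SYNC and REQ_IDLE from writeback throttling, so the hard-coded flags made every preflush a throttle candidate regardless of the bio that triggered it (background context, not a quote of the blk-wbt code). With the propagation above, the resulting flush bio flags are, roughly:

    orig_bio has REQ_SYNC | REQ_IDLE -> opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC | REQ_IDLE
    otherwise                        -> opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC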