Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

crypto: crc32 - Add ARM64 CRC32 hw accelerated module

This module registers a crc32 algorithm and a crc32c algorithm
that use the optional CRC32 and CRC32C instructions in ARMv8.

Tested on AMD Seattle.

Improvement compared to crc32c-generic algorithm:
TCRYPT CRC32C speed test shows ~450% speedup.
Simple dd write tests to btrfs filesystem show ~30% speedup.
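To see the registered algorithms from kernel code, they can be driven through the shash API. The sketch below is illustrative and not part of this patch (the module name and the printed expectation are assumptions): it allocates the "crc32c" transform, which resolves to crc32c-arm64-hw on CRC-capable parts because its priority outranks crc32c-generic, and digests the standard test vector "123456789", whose CRC-32C check value is 0xE3069283 (stored little-endian by the final step).

/* Hypothetical self-test sketch, not part of this patch. */
#include <linux/err.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <crypto/hash.h>

static int __init crc32c_hw_selftest_init(void)
{
	static const u8 vec[] = "123456789";
	u8 digest[4];
	struct crypto_shash *tfm;
	struct shash_desc *desc;
	int err;

	/* "crc32c" picks the highest-priority provider, here crc32c-arm64-hw */
	tfm = crypto_alloc_shash("crc32c", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	desc = kzalloc(sizeof(*desc) + crypto_shash_descsize(tfm), GFP_KERNEL);
	if (!desc) {
		crypto_free_shash(tfm);
		return -ENOMEM;
	}
	desc->tfm = tfm;

	err = crypto_shash_digest(desc, vec, sizeof(vec) - 1, digest);

	/* Expect 839206e3, i.e. 0xE3069283 stored little-endian */
	pr_info("driver %s digest %*phN\n",
		crypto_tfm_alg_driver_name(crypto_shash_tfm(tfm)),
		(int)sizeof(digest), digest);

	kfree(desc);
	crypto_free_shash(tfm);
	return err;
}
module_init(crc32c_hw_selftest_init);
MODULE_LICENSE("GPL");

The same flow with "crc32" instead of "crc32c" exercises the crc32-arm64-hw driver.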

Signed-off-by: Yazen Ghannam <yazen.ghannam@linaro.org>
Acked-by: Steve Capper <steve.capper@linaro.org>
Acked-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Authored by Yazen Ghannam, committed by Herbert Xu
f6f203fa aa408d60

3 files changed, 282 insertions(+)
arch/arm64/crypto/Kconfig (+4)
@@ -50,4 +50,8 @@
 	select CRYPTO_AES
 	select CRYPTO_ABLK_HELPER

+config CRYPTO_CRC32_ARM64
+	tristate "CRC32 and CRC32C using optional ARMv8 instructions"
+	depends on ARM64
+	select CRYPTO_HASH
 endif
arch/arm64/crypto/Makefile (+4)
@@ -34,5 +34,9 @@

 CFLAGS_aes-glue-ce.o := -DUSE_V8_CRYPTO_EXTENSIONS

+obj-$(CONFIG_CRYPTO_CRC32_ARM64) += crc32-arm64.o
+
+CFLAGS_crc32-arm64.o := -mcpu=generic+crc
+
 $(obj)/aes-glue-%.o: $(src)/aes-glue.c FORCE
 	$(call if_changed_rule,cc_o_c)
arch/arm64/crypto/crc32-arm64.c (new file, +274)
/*
 * crc32-arm64.c - CRC32 and CRC32C using optional ARMv8 instructions
 *
 * Module based on crypto/crc32c_generic.c
 *
 * CRC32 loop taken from Ed Nevill's Hadoop CRC patch
 * http://mail-archives.apache.org/mod_mbox/hadoop-common-dev/201406.mbox/%3C1403687030.3355.19.camel%40localhost.localdomain%3E
 *
 * Using inline assembly instead of intrinsics in order to be backwards
 * compatible with older compilers.
 *
 * Copyright (C) 2014 Linaro Ltd <yazen.ghannam@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#include <linux/unaligned/access_ok.h>
#include <linux/cpufeature.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/string.h>

#include <crypto/internal/hash.h>

MODULE_AUTHOR("Yazen Ghannam <yazen.ghannam@linaro.org>");
MODULE_DESCRIPTION("CRC32 and CRC32C using optional ARMv8 instructions");
MODULE_LICENSE("GPL v2");

#define CRC32X(crc, value) __asm__("crc32x %w[c], %w[c], %x[v]":[c]"+r"(crc):[v]"r"(value))
#define CRC32W(crc, value) __asm__("crc32w %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
#define CRC32H(crc, value) __asm__("crc32h %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
#define CRC32B(crc, value) __asm__("crc32b %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
#define CRC32CX(crc, value) __asm__("crc32cx %w[c], %w[c], %x[v]":[c]"+r"(crc):[v]"r"(value))
#define CRC32CW(crc, value) __asm__("crc32cw %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
#define CRC32CH(crc, value) __asm__("crc32ch %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
#define CRC32CB(crc, value) __asm__("crc32cb %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))

static u32 crc32_arm64_le_hw(u32 crc, const u8 *p, unsigned int len)
{
	s64 length = len;

	while ((length -= sizeof(u64)) >= 0) {
		CRC32X(crc, get_unaligned_le64(p));
		p += sizeof(u64);
	}

	/* The following is more efficient than the straight loop */
	if (length & sizeof(u32)) {
		CRC32W(crc, get_unaligned_le32(p));
		p += sizeof(u32);
	}
	if (length & sizeof(u16)) {
		CRC32H(crc, get_unaligned_le16(p));
		p += sizeof(u16);
	}
	if (length & sizeof(u8))
		CRC32B(crc, *p);

	return crc;
}

static u32 crc32c_arm64_le_hw(u32 crc, const u8 *p, unsigned int len)
{
	s64 length = len;

	while ((length -= sizeof(u64)) >= 0) {
		CRC32CX(crc, get_unaligned_le64(p));
		p += sizeof(u64);
	}

	/* The following is more efficient than the straight loop */
	if (length & sizeof(u32)) {
		CRC32CW(crc, get_unaligned_le32(p));
		p += sizeof(u32);
	}
	if (length & sizeof(u16)) {
		CRC32CH(crc, get_unaligned_le16(p));
		p += sizeof(u16);
	}
	if (length & sizeof(u8))
		CRC32CB(crc, *p);

	return crc;
}

#define CHKSUM_BLOCK_SIZE	1
#define CHKSUM_DIGEST_SIZE	4

struct chksum_ctx {
	u32 key;
};

struct chksum_desc_ctx {
	u32 crc;
};

static int chksum_init(struct shash_desc *desc)
{
	struct chksum_ctx *mctx = crypto_shash_ctx(desc->tfm);
	struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);

	ctx->crc = mctx->key;

	return 0;
}

/*
 * Setting the seed allows arbitrary accumulators and flexible XOR policy
 * If your algorithm starts with ~0, then XOR with ~0 before you set
 * the seed.
 */
static int chksum_setkey(struct crypto_shash *tfm, const u8 *key,
			 unsigned int keylen)
{
	struct chksum_ctx *mctx = crypto_shash_ctx(tfm);

	if (keylen != sizeof(mctx->key)) {
		crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
		return -EINVAL;
	}
	mctx->key = get_unaligned_le32(key);
	return 0;
}

static int chksum_update(struct shash_desc *desc, const u8 *data,
			 unsigned int length)
{
	struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);

	ctx->crc = crc32_arm64_le_hw(ctx->crc, data, length);
	return 0;
}

static int chksumc_update(struct shash_desc *desc, const u8 *data,
			  unsigned int length)
{
	struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);

	ctx->crc = crc32c_arm64_le_hw(ctx->crc, data, length);
	return 0;
}

static int chksum_final(struct shash_desc *desc, u8 *out)
{
	struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);

	put_unaligned_le32(~ctx->crc, out);
	return 0;
}

static int __chksum_finup(u32 crc, const u8 *data, unsigned int len, u8 *out)
{
	put_unaligned_le32(~crc32_arm64_le_hw(crc, data, len), out);
	return 0;
}

static int __chksumc_finup(u32 crc, const u8 *data, unsigned int len, u8 *out)
{
	put_unaligned_le32(~crc32c_arm64_le_hw(crc, data, len), out);
	return 0;
}

static int chksum_finup(struct shash_desc *desc, const u8 *data,
			unsigned int len, u8 *out)
{
	struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);

	return __chksum_finup(ctx->crc, data, len, out);
}

static int chksumc_finup(struct shash_desc *desc, const u8 *data,
			 unsigned int len, u8 *out)
{
	struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);

	return __chksumc_finup(ctx->crc, data, len, out);
}

static int chksum_digest(struct shash_desc *desc, const u8 *data,
			 unsigned int length, u8 *out)
{
	struct chksum_ctx *mctx = crypto_shash_ctx(desc->tfm);

	return __chksum_finup(mctx->key, data, length, out);
}

static int chksumc_digest(struct shash_desc *desc, const u8 *data,
			  unsigned int length, u8 *out)
{
	struct chksum_ctx *mctx = crypto_shash_ctx(desc->tfm);

	return __chksumc_finup(mctx->key, data, length, out);
}

static int crc32_cra_init(struct crypto_tfm *tfm)
{
	struct chksum_ctx *mctx = crypto_tfm_ctx(tfm);

	mctx->key = ~0;
	return 0;
}

static struct shash_alg crc32_alg = {
	.digestsize	= CHKSUM_DIGEST_SIZE,
	.setkey		= chksum_setkey,
	.init		= chksum_init,
	.update		= chksum_update,
	.final		= chksum_final,
	.finup		= chksum_finup,
	.digest		= chksum_digest,
	.descsize	= sizeof(struct chksum_desc_ctx),
	.base		= {
		.cra_name		= "crc32",
		.cra_driver_name	= "crc32-arm64-hw",
		.cra_priority		= 300,
		.cra_blocksize		= CHKSUM_BLOCK_SIZE,
		.cra_alignmask		= 0,
		.cra_ctxsize		= sizeof(struct chksum_ctx),
		.cra_module		= THIS_MODULE,
		.cra_init		= crc32_cra_init,
	}
};

static struct shash_alg crc32c_alg = {
	.digestsize	= CHKSUM_DIGEST_SIZE,
	.setkey		= chksum_setkey,
	.init		= chksum_init,
	.update		= chksumc_update,
	.final		= chksum_final,
	.finup		= chksumc_finup,
	.digest		= chksumc_digest,
	.descsize	= sizeof(struct chksum_desc_ctx),
	.base		= {
		.cra_name		= "crc32c",
		.cra_driver_name	= "crc32c-arm64-hw",
		.cra_priority		= 300,
		.cra_blocksize		= CHKSUM_BLOCK_SIZE,
		.cra_alignmask		= 0,
		.cra_ctxsize		= sizeof(struct chksum_ctx),
		.cra_module		= THIS_MODULE,
		.cra_init		= crc32_cra_init,
	}
};

static int __init crc32_mod_init(void)
{
	int err;

	err = crypto_register_shash(&crc32_alg);

	if (err)
		return err;

	err = crypto_register_shash(&crc32c_alg);

	if (err) {
		crypto_unregister_shash(&crc32_alg);
		return err;
	}

	return 0;
}

static void __exit crc32_mod_exit(void)
{
	crypto_unregister_shash(&crc32_alg);
	crypto_unregister_shash(&crc32c_alg);
}

module_cpu_feature_match(CRC32, crc32_mod_init);
module_exit(crc32_mod_exit);
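A note on the byte-tail handling in crc32_arm64_le_hw() and crc32c_arm64_le_hw(): once the 8-byte loop drives the signed length below zero, the low bits of the (two's-complement) negative value are exactly the number of leftover bytes, so the three tests against sizeof(u32), sizeof(u16) and sizeof(u8) consume precisely the remaining 0-7 bytes. The standalone userspace sketch below is an illustration of that accounting, not code from the patch.

/* Userspace sketch: verify the tail-handling arithmetic for every remainder. */
#include <stdio.h>

int main(void)
{
	for (long len = 0; len < 16; len++) {
		long length = len;
		long consumed = 0;

		while ((length -= 8) >= 0)
			consumed += 8;	/* corresponds to one CRC32X step */
		if (length & 4)
			consumed += 4;	/* CRC32W step */
		if (length & 2)
			consumed += 2;	/* CRC32H step */
		if (length & 1)
			consumed += 1;	/* CRC32B step */

		printf("len=%2ld consumed=%2ld %s\n", len, consumed,
		       consumed == len ? "ok" : "MISMATCH");
	}
	return 0;
}

Every length prints "ok", which is why the bit tests are a safe replacement for a straight byte-at-a-time tail loop.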