crypto: arm/crc32 - accelerated support based on x86 SSE implementation

+5

arch/arm/crypto/Kconfig

··· 125 125 depends on KERNEL_MODE_NEON && CRC_T10DIF 126 126 select CRYPTO_HASH 127 127 128 + config CRYPTO_CRC32_ARM_CE 129 + tristate "CRC32(C) digest algorithm using CRC and/or PMULL instructions" 130 + depends on KERNEL_MODE_NEON && CRC32 131 + select CRYPTO_HASH 132 + 128 133 endif

+2

arch/arm/crypto/Makefile

··· 14 14 ce-obj-$(CONFIG_CRYPTO_SHA2_ARM_CE) += sha2-arm-ce.o 15 15 ce-obj-$(CONFIG_CRYPTO_GHASH_ARM_CE) += ghash-arm-ce.o 16 16 ce-obj-$(CONFIG_CRYPTO_CRCT10DIF_ARM_CE) += crct10dif-arm-ce.o 17 + ce-obj-$(CONFIG_CRYPTO_CRC32_ARM_CE) += crc32-arm-ce.o 17 18 18 19 ifneq ($(ce-obj-y)$(ce-obj-m),) 19 20 ifeq ($(call as-instr,.fpu crypto-neon-fp-armv8,y,n),y) ··· 39 38 aes-arm-ce-y := aes-ce-core.o aes-ce-glue.o 40 39 ghash-arm-ce-y := ghash-ce-core.o ghash-ce-glue.o 41 40 crct10dif-arm-ce-y := crct10dif-ce-core.o crct10dif-ce-glue.o 41 + crc32-arm-ce-y:= crc32-ce-core.o crc32-ce-glue.o 42 42 43 43 quiet_cmd_perl = PERL $@ 44 44 cmd_perl = $(PERL) $(<) > $(@)

+306

arch/arm/crypto/crc32-ce-core.S

/*
 * Accelerated CRC32(C) using ARM CRC, NEON and Crypto Extensions instructions
 *
 * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

/* GPL HEADER START
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see http://www.gnu.org/licenses
 *
 * Please  visit http://www.xyratex.com/contact if you need additional
 * information or have any questions.
 *
 * GPL HEADER END
 */

/*
 * Copyright 2012 Xyratex Technology Limited
 *
 * Using hardware provided PCLMULQDQ instruction to accelerate the CRC32
 * calculation.
 * CRC32 polynomial:0x04c11db7(BE)/0xEDB88320(LE)
 * PCLMULQDQ is a new instruction in Intel SSE4.2, the reference can be found
 * at:
 * http://www.intel.com/products/processor/manuals/
 * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
 * Volume 2B: Instruction Set Reference, N-Z
 *
 * Authors:   Gregory Prestas <Gregory_Prestas@us.xyratex.com>
 *	      Alexander Boyko <Alexander_Boyko@xyratex.com>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	.text
	.align		6
	.arch		armv8-a
	.arch_extension	crc
	.fpu		crypto-neon-fp-armv8

	/*
	 * Folding constants for CRC32. The code below indexes this table by
	 * byte offset from r3: +0/+8 (64-byte fold), +16/+24 (16-byte fold),
	 * +32 (final 32-bit fold), +40 (32-bit mask), +48/+56 (Barrett
	 * reduction).
	 */
.Lcrc32_constants:
	/*
	 * [(x4*128+32 mod P(x) << 32)]'  << 1   = 0x154442bd4
	 * #define CONSTANT_R1  0x154442bd4LL
	 *
	 * [(x4*128-32 mod P(x) << 32)]' << 1   = 0x1c6e41596
	 * #define CONSTANT_R2  0x1c6e41596LL
	 */
	.quad		0x0000000154442bd4
	.quad		0x00000001c6e41596

	/*
	 * [(x128+32 mod P(x) << 32)]'   << 1   = 0x1751997d0
	 * #define CONSTANT_R3  0x1751997d0LL
	 *
	 * [(x128-32 mod P(x) << 32)]'   << 1   = 0x0ccaa009e
	 * #define CONSTANT_R4  0x0ccaa009eLL
	 */
	.quad		0x00000001751997d0
	.quad		0x00000000ccaa009e

	/*
	 * [(x64 mod P(x) << 32)]'       << 1   = 0x163cd6124
	 * #define CONSTANT_R5  0x163cd6124LL
	 */
	.quad		0x0000000163cd6124
	.quad		0x00000000FFFFFFFF

	/*
	 * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL
	 *
	 * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))`
	 *                                                      = 0x1F7011641LL
	 * #define CONSTANT_RU  0x1F7011641LL
	 */
	.quad		0x00000001DB710641
	.quad		0x00000001F7011641

	/* Same table layout as .Lcrc32_constants, used by crc32c_pmull_le */
.Lcrc32c_constants:
	.quad		0x00000000740eef02
	.quad		0x000000009e4addf8
	.quad		0x00000000f20c0dfe
	.quad		0x000000014cd00bd6
	.quad		0x00000000dd45aab8
	.quad		0x00000000FFFFFFFF
	.quad		0x0000000105ec76f0
	.quad		0x00000000dea713f1

	dCONSTANTl	.req	d0
	dCONSTANTh	.req	d1
	qCONSTANT	.req	q0

	BUF		.req	r0
	LEN		.req	r1
	CRC		.req	r2

	qzr		.req	q9

	/**
	 * Calculate crc32
	 * BUF - buffer (loaded with :128 alignment hints, i.e. it is
	 *       expected to be 16-byte aligned)
	 * LEN - sizeof buffer (multiple of 16 bytes), LEN should be > 63
	 * CRC - initial crc32
	 * returns the crc32 in r0
	 * uint crc32_pmull_le(unsigned char const *buffer,
	 *                     size_t len, uint crc32)
	 *
	 * crc32c_pmull_le shares the code below and only differs in the
	 * constant table that r3 points to.
	 */
ENTRY(crc32_pmull_le)
	adr		r3, .Lcrc32_constants
	b		0f

ENTRY(crc32c_pmull_le)
	adr		r3, .Lcrc32c_constants

0:	bic		LEN, LEN, #15
	vld1.8		{q1-q2}, [BUF, :128]!
	vld1.8		{q3-q4}, [BUF, :128]!
	vmov.i8		qzr, #0
	vmov.i8		qCONSTANT, #0
	/* explicit .32 element size: insert CRC into the low word of d0 */
	vmov.32		dCONSTANTl[0], CRC
	veor.8		d2, d2, dCONSTANTl
	sub		LEN, LEN, #0x40
	cmp		LEN, #0x40
	blt		less_64

	vld1.64		{qCONSTANT}, [r3]

loop_64:		/* 64 bytes Full cache line folding */
	sub		LEN, LEN, #0x40

	vmull.p64	q5, d3, dCONSTANTh
	vmull.p64	q6, d5, dCONSTANTh
	vmull.p64	q7, d7, dCONSTANTh
	vmull.p64	q8, d9, dCONSTANTh

	vmull.p64	q1, d2, dCONSTANTl
	vmull.p64	q2, d4, dCONSTANTl
	vmull.p64	q3, d6, dCONSTANTl
	vmull.p64	q4, d8, dCONSTANTl

	veor.8		q1, q1, q5
	vld1.8		{q5}, [BUF, :128]!
	veor.8		q2, q2, q6
	vld1.8		{q6}, [BUF, :128]!
	veor.8		q3, q3, q7
	vld1.8		{q7}, [BUF, :128]!
	veor.8		q4, q4, q8
	vld1.8		{q8}, [BUF, :128]!

	veor.8		q1, q1, q5
	veor.8		q2, q2, q6
	veor.8		q3, q3, q7
	veor.8		q4, q4, q8

	cmp		LEN, #0x40
	bge		loop_64

less_64:		/* Folding cache line into 128bit */
	vldr		dCONSTANTl, [r3, #16]
	vldr		dCONSTANTh, [r3, #24]

	vmull.p64	q5, d3, dCONSTANTh
	vmull.p64	q1, d2, dCONSTANTl
	veor.8		q1, q1, q5
	veor.8		q1, q1, q2

	vmull.p64	q5, d3, dCONSTANTh
	vmull.p64	q1, d2, dCONSTANTl
	veor.8		q1, q1, q5
	veor.8		q1, q1, q3

	vmull.p64	q5, d3, dCONSTANTh
	vmull.p64	q1, d2, dCONSTANTl
	veor.8		q1, q1, q5
	veor.8		q1, q1, q4

	teq		LEN, #0
	beq		fold_64

loop_16:		/* Folding rest buffer into 128bit */
	subs		LEN, LEN, #0x10

	vld1.8		{q2}, [BUF, :128]!
	vmull.p64	q5, d3, dCONSTANTh
	vmull.p64	q1, d2, dCONSTANTl
	veor.8		q1, q1, q5
	veor.8		q1, q1, q2

	bne		loop_16

fold_64:
	/* perform the last 64 bit fold, also adds 32 zeroes
	 * to the input stream */
	vmull.p64	q2, d2, dCONSTANTh
	vext.8		q1, q1, qzr, #8
	veor.8		q1, q1, q2

	/* final 32-bit fold */
	vldr		dCONSTANTl, [r3, #32]
	vldr		d6, [r3, #40]
	vmov.i8		d7, #0

	vext.8		q2, q1, qzr, #4
	vand.8		d2, d2, d6
	vmull.p64	q1, d2, dCONSTANTl
	veor.8		q1, q1, q2

	/* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */
	vldr		dCONSTANTl, [r3, #48]
	vldr		dCONSTANTh, [r3, #56]

	vand.8		q2, q1, q3
	vext.8		q2, qzr, q2, #8
	vmull.p64	q2, d5, dCONSTANTh
	vand.8		q2, q2, q3
	vmull.p64	q2, d4, dCONSTANTl
	veor.8		q1, q1, q2
	vmov		r0, s5

	bx		lr
ENDPROC(crc32_pmull_le)
ENDPROC(crc32c_pmull_le)

	/*
	 * __crc32 - body of the scalar CRC32 / CRC32C routines
	 * \c - empty for the crc32{b,h,w} instructions, 'c' for the
	 *      crc32c{b,h,w} instructions
	 *
	 * Register usage as seen in the code below:
	 *   r0 - running crc (input and return value)
	 *   r1 - data pointer, advanced as bytes are consumed
	 *   r2 - byte count on entry
	 *   ip - bytes remaining minus 8; its low three bits equal the low
	 *        three bits of the remaining count, which is what .Ltail tests
	 *
	 * Word-aligned input is consumed 8 bytes per iteration; a pointer that
	 * is not 4-byte aligned is first advanced by a byte and/or halfword
	 * step in .Lunaligned. The Z flag set by teq/subs is deliberately kept
	 * live across the loads and crc instructions so that 'bxeq lr' returns
	 * once exactly 8 bytes were left. ARM_BE8() wraps the byte reversals
	 * of loaded data (presumably only emitted for BE8 builds - see
	 * asm/assembler.h).
	 */
	.macro		__crc32, c
	subs		ip, r2, #8
	bmi		.Ltail\c

	tst		r1, #3
	bne		.Lunaligned\c

	teq		ip, #0
.Laligned8\c:
	ldrd		r2, r3, [r1], #8
ARM_BE8(rev		r2, r2		)
ARM_BE8(rev		r3, r3		)
	crc32\c\()w	r0, r0, r2
	crc32\c\()w	r0, r0, r3
	bxeq		lr
	subs		ip, ip, #8
	bpl		.Laligned8\c

.Ltail\c:
	tst		ip, #4
	beq		2f
	ldr		r3, [r1], #4
ARM_BE8(rev		r3, r3		)
	crc32\c\()w	r0, r0, r3

2:	tst		ip, #2
	beq		1f
	ldrh		r3, [r1], #2
ARM_BE8(rev16		r3, r3		)
	crc32\c\()h	r0, r0, r3

1:	tst		ip, #1
	bxeq		lr
	ldrb		r3, [r1]
	crc32\c\()b	r0, r0, r3
	bx		lr

.Lunaligned\c:
	tst		r1, #1
	beq		2f
	ldrb		r3, [r1], #1
	subs		r2, r2, #1
	crc32\c\()b	r0, r0, r3

	tst		r1, #2
	beq		0f
2:	ldrh		r3, [r1], #2
	subs		r2, r2, #2
ARM_BE8(rev16		r3, r3		)
	crc32\c\()h	r0, r0, r3

0:	subs		ip, r2, #8
	bpl		.Laligned8\c
	b		.Ltail\c
	.endm

	.align		5
ENTRY(crc32_armv8_le)
	__crc32
ENDPROC(crc32_armv8_le)

	.align		5
ENTRY(crc32c_armv8_le)
	__crc32		c
ENDPROC(crc32c_armv8_le)

+242

arch/arm/crypto/crc32-ce-glue.c

··· 1 + /* 2 + * Accelerated CRC32(C) using ARM CRC, NEON and Crypto Extensions instructions 3 + * 4 + * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org> 5 + * 6 + * This program is free software; you can redistribute it and/or modify 7 + * it under the terms of the GNU General Public License version 2 as 8 + * published by the Free Software Foundation. 9 + */ 10 + 11 + #include <linux/crc32.h> 12 + #include <linux/init.h> 13 + #include <linux/kernel.h> 14 + #include <linux/module.h> 15 + #include <linux/string.h> 16 + 17 + #include <crypto/internal/hash.h> 18 + 19 + #include <asm/hwcap.h> 20 + #include <asm/neon.h> 21 + #include <asm/simd.h> 22 + #include <asm/unaligned.h> 23 + 24 + #define PMULL_MIN_LEN 64L /* minimum size of buffer 25 + * for crc32_pmull_le_16 */ 26 + #define SCALE_F 16L /* size of NEON register */ 27 + 28 + asmlinkage u32 crc32_pmull_le(const u8 buf[], u32 len, u32 init_crc); 29 + asmlinkage u32 crc32_armv8_le(u32 init_crc, const u8 buf[], u32 len); 30 + 31 + asmlinkage u32 crc32c_pmull_le(const u8 buf[], u32 len, u32 init_crc); 32 + asmlinkage u32 crc32c_armv8_le(u32 init_crc, const u8 buf[], u32 len); 33 + 34 + static u32 (*fallback_crc32)(u32 init_crc, const u8 buf[], u32 len); 35 + static u32 (*fallback_crc32c)(u32 init_crc, const u8 buf[], u32 len); 36 + 37 + static int crc32_cra_init(struct crypto_tfm *tfm) 38 + { 39 + u32 *key = crypto_tfm_ctx(tfm); 40 + 41 + *key = 0; 42 + return 0; 43 + } 44 + 45 + static int crc32c_cra_init(struct crypto_tfm *tfm) 46 + { 47 + u32 *key = crypto_tfm_ctx(tfm); 48 + 49 + *key = ~0; 50 + return 0; 51 + } 52 + 53 + static int crc32_setkey(struct crypto_shash *hash, const u8 *key, 54 + unsigned int keylen) 55 + { 56 + u32 *mctx = crypto_shash_ctx(hash); 57 + 58 + if (keylen != sizeof(u32)) { 59 + crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN); 60 + return -EINVAL; 61 + } 62 + *mctx = le32_to_cpup((__le32 *)key); 63 + return 0; 64 + } 65 + 66 + static int crc32_init(struct shash_desc *desc) 
67 + { 68 + u32 *mctx = crypto_shash_ctx(desc->tfm); 69 + u32 *crc = shash_desc_ctx(desc); 70 + 71 + *crc = *mctx; 72 + return 0; 73 + } 74 + 75 + static int crc32_update(struct shash_desc *desc, const u8 *data, 76 + unsigned int length) 77 + { 78 + u32 *crc = shash_desc_ctx(desc); 79 + 80 + *crc = crc32_armv8_le(*crc, data, length); 81 + return 0; 82 + } 83 + 84 + static int crc32c_update(struct shash_desc *desc, const u8 *data, 85 + unsigned int length) 86 + { 87 + u32 *crc = shash_desc_ctx(desc); 88 + 89 + *crc = crc32c_armv8_le(*crc, data, length); 90 + return 0; 91 + } 92 + 93 + static int crc32_final(struct shash_desc *desc, u8 *out) 94 + { 95 + u32 *crc = shash_desc_ctx(desc); 96 + 97 + put_unaligned_le32(*crc, out); 98 + return 0; 99 + } 100 + 101 + static int crc32c_final(struct shash_desc *desc, u8 *out) 102 + { 103 + u32 *crc = shash_desc_ctx(desc); 104 + 105 + put_unaligned_le32(~*crc, out); 106 + return 0; 107 + } 108 + 109 + static int crc32_pmull_update(struct shash_desc *desc, const u8 *data, 110 + unsigned int length) 111 + { 112 + u32 *crc = shash_desc_ctx(desc); 113 + unsigned int l; 114 + 115 + if (may_use_simd()) { 116 + if ((u32)data % SCALE_F) { 117 + l = min_t(u32, length, SCALE_F - ((u32)data % SCALE_F)); 118 + 119 + *crc = fallback_crc32(*crc, data, l); 120 + 121 + data += l; 122 + length -= l; 123 + } 124 + 125 + if (length >= PMULL_MIN_LEN) { 126 + l = round_down(length, SCALE_F); 127 + 128 + kernel_neon_begin(); 129 + *crc = crc32_pmull_le(data, l, *crc); 130 + kernel_neon_end(); 131 + 132 + data += l; 133 + length -= l; 134 + } 135 + } 136 + 137 + if (length > 0) 138 + *crc = fallback_crc32(*crc, data, length); 139 + 140 + return 0; 141 + } 142 + 143 + static int crc32c_pmull_update(struct shash_desc *desc, const u8 *data, 144 + unsigned int length) 145 + { 146 + u32 *crc = shash_desc_ctx(desc); 147 + unsigned int l; 148 + 149 + if (may_use_simd()) { 150 + if ((u32)data % SCALE_F) { 151 + l = min_t(u32, length, SCALE_F - ((u32)data % 
SCALE_F)); 152 + 153 + *crc = fallback_crc32c(*crc, data, l); 154 + 155 + data += l; 156 + length -= l; 157 + } 158 + 159 + if (length >= PMULL_MIN_LEN) { 160 + l = round_down(length, SCALE_F); 161 + 162 + kernel_neon_begin(); 163 + *crc = crc32c_pmull_le(data, l, *crc); 164 + kernel_neon_end(); 165 + 166 + data += l; 167 + length -= l; 168 + } 169 + } 170 + 171 + if (length > 0) 172 + *crc = fallback_crc32c(*crc, data, length); 173 + 174 + return 0; 175 + } 176 + 177 + static struct shash_alg crc32_pmull_algs[] = { { 178 + .setkey = crc32_setkey, 179 + .init = crc32_init, 180 + .update = crc32_update, 181 + .final = crc32_final, 182 + .descsize = sizeof(u32), 183 + .digestsize = sizeof(u32), 184 + 185 + .base.cra_ctxsize = sizeof(u32), 186 + .base.cra_init = crc32_cra_init, 187 + .base.cra_name = "crc32", 188 + .base.cra_driver_name = "crc32-arm-ce", 189 + .base.cra_priority = 200, 190 + .base.cra_blocksize = 1, 191 + .base.cra_module = THIS_MODULE, 192 + }, { 193 + .setkey = crc32_setkey, 194 + .init = crc32_init, 195 + .update = crc32c_update, 196 + .final = crc32c_final, 197 + .descsize = sizeof(u32), 198 + .digestsize = sizeof(u32), 199 + 200 + .base.cra_ctxsize = sizeof(u32), 201 + .base.cra_init = crc32c_cra_init, 202 + .base.cra_name = "crc32c", 203 + .base.cra_driver_name = "crc32c-arm-ce", 204 + .base.cra_priority = 200, 205 + .base.cra_blocksize = 1, 206 + .base.cra_module = THIS_MODULE, 207 + } }; 208 + 209 + static int __init crc32_pmull_mod_init(void) 210 + { 211 + if (elf_hwcap2 & HWCAP2_PMULL) { 212 + crc32_pmull_algs[0].update = crc32_pmull_update; 213 + crc32_pmull_algs[1].update = crc32c_pmull_update; 214 + 215 + if (elf_hwcap2 & HWCAP2_CRC32) { 216 + fallback_crc32 = crc32_armv8_le; 217 + fallback_crc32c = crc32c_armv8_le; 218 + } else { 219 + fallback_crc32 = crc32_le; 220 + fallback_crc32c = __crc32c_le; 221 + } 222 + } else if (!(elf_hwcap2 & HWCAP2_CRC32)) { 223 + return -ENODEV; 224 + } 225 + 226 + return 
crypto_register_shashes(crc32_pmull_algs, 227 + ARRAY_SIZE(crc32_pmull_algs)); 228 + } 229 + 230 + static void __exit crc32_pmull_mod_exit(void) 231 + { 232 + crypto_unregister_shashes(crc32_pmull_algs, 233 + ARRAY_SIZE(crc32_pmull_algs)); 234 + } 235 + 236 + module_init(crc32_pmull_mod_init); 237 + module_exit(crc32_pmull_mod_exit); 238 + 239 + MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); 240 + MODULE_LICENSE("GPL v2"); 241 + MODULE_ALIAS_CRYPTO("crc32"); 242 + MODULE_ALIAS_CRYPTO("crc32c");

Configure Feed

Configure Feed