crypto: arm/chacha20 - refactor to allow varying number of rounds

Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

kernel os linux

In preparation for adding XChaCha12 support, rename/refactor the NEON
implementation of ChaCha20 to support different numbers of rounds.

Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Authored by Eric Biggers and committed by Herbert Xu 7 years ago 3cc21519 d97a9430

+55 -47

3 changed files

expand all

arch

arm

crypto

Makefile

chacha-neon-core.S

chacha-neon-glue.c

+2 -2

arch/arm/crypto/Makefile

··· 9 9 obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o 10 10 obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o 11 11 obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha512-arm.o 12 - obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha20-neon.o 12 + obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o 13 13 14 14 ce-obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o 15 15 ce-obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o ··· 52 52 ghash-arm-ce-y := ghash-ce-core.o ghash-ce-glue.o 53 53 crct10dif-arm-ce-y := crct10dif-ce-core.o crct10dif-ce-glue.o 54 54 crc32-arm-ce-y:= crc32-ce-core.o crc32-ce-glue.o 55 - chacha20-neon-y := chacha20-neon-core.o chacha20-neon-glue.o 55 + chacha-neon-y := chacha-neon-core.o chacha-neon-glue.o 56 56 57 57 ifdef REGENERATE_ARM_CRYPTO 58 58 quiet_cmd_perl = PERL $@

+24 -20

arch/arm/crypto/chacha20-neon-core.S arch/arm/crypto/chacha-neon-core.S

··· 1 1 /* 2 - * ChaCha20 256-bit cipher algorithm, RFC7539, ARM NEON functions 2 + * ChaCha/XChaCha NEON helper functions 3 3 * 4 4 * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org> 5 5 * ··· 27 27 * (d) vtbl.8 + vtbl.8 (multiple of 8 bits rotations only, 28 28 * needs index vector) 29 29 * 30 - * ChaCha20 has 16, 12, 8, and 7-bit rotations. For the 12 and 7-bit 31 - * rotations, the only choices are (a) and (b). We use (a) since it takes 32 - * two-thirds the cycles of (b) on both Cortex-A7 and Cortex-A53. 30 + * ChaCha has 16, 12, 8, and 7-bit rotations. For the 12 and 7-bit rotations, 31 + * the only choices are (a) and (b). We use (a) since it takes two-thirds the 32 + * cycles of (b) on both Cortex-A7 and Cortex-A53. 33 33 * 34 34 * For the 16-bit rotation, we use vrev32.16 since it's consistently fastest 35 35 * and doesn't need a temporary register. ··· 53 53 .align 5 54 54 55 55 /* 56 - * chacha20_permute - permute one block 56 + * chacha_permute - permute one block 57 57 * 58 58 * Permute one 64-byte block where the state matrix is stored in the four NEON 59 59 * registers q0-q3. It performs matrix operations on four words in parallel, 60 60 * but requires shuffling to rearrange the words after each round. 61 61 * 62 + * The round count is given in r3. 
63 + * 62 64 * Clobbers: r3, ip, q4-q5 63 65 */ 64 - chacha20_permute: 66 + chacha_permute: 65 67 66 68 adr ip, .Lrol8_table 67 - mov r3, #10 68 69 vld1.8 {d10}, [ip, :64] 69 70 70 71 .Ldoubleround: ··· 129 128 // x3 = shuffle32(x3, MASK(0, 3, 2, 1)) 130 129 vext.8 q3, q3, q3, #4 131 130 132 - subs r3, r3, #1 131 + subs r3, r3, #2 133 132 bne .Ldoubleround 134 133 135 134 bx lr 136 - ENDPROC(chacha20_permute) 135 + ENDPROC(chacha_permute) 137 136 138 - ENTRY(chacha20_block_xor_neon) 137 + ENTRY(chacha_block_xor_neon) 139 138 // r0: Input state matrix, s 140 139 // r1: 1 data block output, o 141 140 // r2: 1 data block input, i 141 + // r3: nrounds 142 142 push {lr} 143 143 144 144 // x0..3 = s0..3 ··· 152 150 vmov q10, q2 153 151 vmov q11, q3 154 152 155 - bl chacha20_permute 153 + bl chacha_permute 156 154 157 155 add ip, r2, #0x20 158 156 vld1.8 {q4-q5}, [r2] ··· 179 177 vst1.8 {q2-q3}, [ip] 180 178 181 179 pop {pc} 182 - ENDPROC(chacha20_block_xor_neon) 180 + ENDPROC(chacha_block_xor_neon) 183 181 184 - ENTRY(hchacha20_block_neon) 182 + ENTRY(hchacha_block_neon) 185 183 // r0: Input state matrix, s 186 184 // r1: output (8 32-bit words) 185 + // r2: nrounds 187 186 push {lr} 188 187 189 188 vld1.32 {q0-q1}, [r0]! 190 189 vld1.32 {q2-q3}, [r0] 191 190 192 - bl chacha20_permute 191 + mov r3, r2 192 + bl chacha_permute 193 193 194 194 vst1.32 {q0}, [r1]! 
195 195 vst1.32 {q3}, [r1] 196 196 197 197 pop {pc} 198 - ENDPROC(hchacha20_block_neon) 198 + ENDPROC(hchacha_block_neon) 199 199 200 200 .align 4 201 201 .Lctrinc: .word 0, 1, 2, 3 202 202 .Lrol8_table: .byte 3, 0, 1, 2, 7, 4, 5, 6 203 203 204 204 .align 5 205 - ENTRY(chacha20_4block_xor_neon) 205 + ENTRY(chacha_4block_xor_neon) 206 206 push {r4-r5} 207 207 mov r4, sp // preserve the stack pointer 208 208 sub ip, sp, #0x20 // allocate a 32 byte buffer ··· 214 210 // r0: Input state matrix, s 215 211 // r1: 4 data blocks output, o 216 212 // r2: 4 data blocks input, i 213 + // r3: nrounds 217 214 218 215 // 219 - // This function encrypts four consecutive ChaCha20 blocks by loading 216 + // This function encrypts four consecutive ChaCha blocks by loading 220 217 // the state matrix in NEON registers four times. The algorithm performs 221 218 // each operation on the corresponding word of each state matrix, hence 222 219 // requires no word shuffling. The words are re-interleaved before the ··· 250 245 vdup.32 q0, d0[0] 251 246 252 247 adr ip, .Lrol8_table 253 - mov r3, #10 254 248 b 1f 255 249 256 250 .Ldoubleround4: ··· 447 443 vsri.u32 q5, q8, #25 448 444 vsri.u32 q6, q9, #25 449 445 450 - subs r3, r3, #1 446 + subs r3, r3, #2 451 447 bne .Ldoubleround4 452 448 453 449 // x0..7[0-3] are in q0-q7, x10..15[0-3] are in q10-q15. ··· 557 553 558 554 pop {r4-r5} 559 555 bx lr 560 - ENDPROC(chacha20_4block_xor_neon) 556 + ENDPROC(chacha_4block_xor_neon)

+29 -25

arch/arm/crypto/chacha20-neon-glue.c arch/arm/crypto/chacha-neon-glue.c

··· 28 28 #include <asm/neon.h> 29 29 #include <asm/simd.h> 30 30 31 - asmlinkage void chacha20_block_xor_neon(u32 *state, u8 *dst, const u8 *src); 32 - asmlinkage void chacha20_4block_xor_neon(u32 *state, u8 *dst, const u8 *src); 33 - asmlinkage void hchacha20_block_neon(const u32 *state, u32 *out); 31 + asmlinkage void chacha_block_xor_neon(const u32 *state, u8 *dst, const u8 *src, 32 + int nrounds); 33 + asmlinkage void chacha_4block_xor_neon(const u32 *state, u8 *dst, const u8 *src, 34 + int nrounds); 35 + asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds); 34 36 35 - static void chacha20_doneon(u32 *state, u8 *dst, const u8 *src, 36 - unsigned int bytes) 37 + static void chacha_doneon(u32 *state, u8 *dst, const u8 *src, 38 + unsigned int bytes, int nrounds) 37 39 { 38 40 u8 buf[CHACHA_BLOCK_SIZE]; 39 41 40 42 while (bytes >= CHACHA_BLOCK_SIZE * 4) { 41 - chacha20_4block_xor_neon(state, dst, src); 43 + chacha_4block_xor_neon(state, dst, src, nrounds); 42 44 bytes -= CHACHA_BLOCK_SIZE * 4; 43 45 src += CHACHA_BLOCK_SIZE * 4; 44 46 dst += CHACHA_BLOCK_SIZE * 4; 45 47 state[12] += 4; 46 48 } 47 49 while (bytes >= CHACHA_BLOCK_SIZE) { 48 - chacha20_block_xor_neon(state, dst, src); 50 + chacha_block_xor_neon(state, dst, src, nrounds); 49 51 bytes -= CHACHA_BLOCK_SIZE; 50 52 src += CHACHA_BLOCK_SIZE; 51 53 dst += CHACHA_BLOCK_SIZE; ··· 55 53 } 56 54 if (bytes) { 57 55 memcpy(buf, src, bytes); 58 - chacha20_block_xor_neon(state, buf, buf); 56 + chacha_block_xor_neon(state, buf, buf, nrounds); 59 57 memcpy(dst, buf, bytes); 60 58 } 61 59 } 62 60 63 - static int chacha20_neon_stream_xor(struct skcipher_request *req, 64 - struct chacha_ctx *ctx, u8 *iv) 61 + static int chacha_neon_stream_xor(struct skcipher_request *req, 62 + struct chacha_ctx *ctx, u8 *iv) 65 63 { 66 64 struct skcipher_walk walk; 67 65 u32 state[16]; ··· 78 76 nbytes = round_down(nbytes, walk.stride); 79 77 80 78 kernel_neon_begin(); 81 - chacha20_doneon(state, 
walk.dst.virt.addr, walk.src.virt.addr, 82 - nbytes); 79 + chacha_doneon(state, walk.dst.virt.addr, walk.src.virt.addr, 80 + nbytes, ctx->nrounds); 83 81 kernel_neon_end(); 84 82 err = skcipher_walk_done(&walk, walk.nbytes - nbytes); 85 83 } ··· 87 85 return err; 88 86 } 89 87 90 - static int chacha20_neon(struct skcipher_request *req) 88 + static int chacha_neon(struct skcipher_request *req) 91 89 { 92 90 struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); 93 91 struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); ··· 95 93 if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd()) 96 94 return crypto_chacha_crypt(req); 97 95 98 - return chacha20_neon_stream_xor(req, ctx, req->iv); 96 + return chacha_neon_stream_xor(req, ctx, req->iv); 99 97 } 100 98 101 - static int xchacha20_neon(struct skcipher_request *req) 99 + static int xchacha_neon(struct skcipher_request *req) 102 100 { 103 101 struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); 104 102 struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); ··· 112 110 crypto_chacha_init(state, ctx, req->iv); 113 111 114 112 kernel_neon_begin(); 115 - hchacha20_block_neon(state, subctx.key); 113 + hchacha_block_neon(state, subctx.key, ctx->nrounds); 116 114 kernel_neon_end(); 115 + subctx.nrounds = ctx->nrounds; 117 116 118 117 memcpy(&real_iv[0], req->iv + 24, 8); 119 118 memcpy(&real_iv[8], req->iv + 16, 8); 120 - return chacha20_neon_stream_xor(req, &subctx, real_iv); 119 + return chacha_neon_stream_xor(req, &subctx, real_iv); 121 120 } 122 121 123 122 static struct skcipher_alg algs[] = { ··· 136 133 .chunksize = CHACHA_BLOCK_SIZE, 137 134 .walksize = 4 * CHACHA_BLOCK_SIZE, 138 135 .setkey = crypto_chacha20_setkey, 139 - .encrypt = chacha20_neon, 140 - .decrypt = chacha20_neon, 136 + .encrypt = chacha_neon, 137 + .decrypt = chacha_neon, 141 138 }, { 142 139 .base.cra_name = "xchacha20", 143 140 .base.cra_driver_name = "xchacha20-neon", ··· 152 149 .chunksize = CHACHA_BLOCK_SIZE, 153 150 .walksize = 4 * 
CHACHA_BLOCK_SIZE, 154 151 .setkey = crypto_chacha20_setkey, 155 - .encrypt = xchacha20_neon, 156 - .decrypt = xchacha20_neon, 152 + .encrypt = xchacha_neon, 153 + .decrypt = xchacha_neon, 157 154 } 158 155 }; 159 156 160 - static int __init chacha20_simd_mod_init(void) 157 + static int __init chacha_simd_mod_init(void) 161 158 { 162 159 if (!(elf_hwcap & HWCAP_NEON)) 163 160 return -ENODEV; ··· 165 162 return crypto_register_skciphers(algs, ARRAY_SIZE(algs)); 166 163 } 167 164 168 - static void __exit chacha20_simd_mod_fini(void) 165 + static void __exit chacha_simd_mod_fini(void) 169 166 { 170 167 crypto_unregister_skciphers(algs, ARRAY_SIZE(algs)); 171 168 } 172 169 173 - module_init(chacha20_simd_mod_init); 174 - module_exit(chacha20_simd_mod_fini); 170 + module_init(chacha_simd_mod_init); 171 + module_exit(chacha_simd_mod_fini); 175 172 173 + MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (NEON accelerated)"); 176 174 MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); 177 175 MODULE_LICENSE("GPL v2"); 178 176 MODULE_ALIAS_CRYPTO("chacha20");