Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge tag 'for-3.16' of git://git.linaro.org/people/ard.biesheuvel/linux-arm into upstream

FPSIMD register bank context switching and crypto algorithm
optimisations for arm64 from Ard Biesheuvel.

* tag 'for-3.16' of git://git.linaro.org/people/ard.biesheuvel/linux-arm:
arm64/crypto: AES-ECB/CBC/CTR/XTS using ARMv8 NEON and Crypto Extensions
arm64: pull in <asm/simd.h> from asm-generic
arm64/crypto: AES in CCM mode using ARMv8 Crypto Extensions
arm64/crypto: AES using ARMv8 Crypto Extensions
arm64/crypto: GHASH secure hash using ARMv8 Crypto Extensions
arm64/crypto: SHA-224/SHA-256 using ARMv8 Crypto Extensions
arm64/crypto: SHA-1 using ARMv8 Crypto Extensions
arm64: add support for kernel mode NEON in interrupt context
arm64: defer reloading a task's FPSIMD state to userland resume
arm64: add abstractions for FPSIMD state manipulation
asm-generic: allow generic unaligned access if the arch supports it

Conflicts:
arch/arm64/include/asm/thread_info.h

+3535 -43
+3
arch/arm64/Kconfig
···
343 343   source "security/Kconfig"
344 344
345 345   source "crypto/Kconfig"
    346 + if CRYPTO
    347 + source "arch/arm64/crypto/Kconfig"
    348 + endif
346 349
347 350   source "lib/Kconfig"
+1
arch/arm64/Makefile
···
45 45   core-y += arch/arm64/kernel/ arch/arm64/mm/
46 46   core-$(CONFIG_KVM) += arch/arm64/kvm/
47 47   core-$(CONFIG_XEN) += arch/arm64/xen/
   48 + core-$(CONFIG_CRYPTO) += arch/arm64/crypto/
48 49   libs-y := arch/arm64/lib/ $(libs-y)
49 50   libs-y += $(LIBGCC)
50 51
+53
arch/arm64/crypto/Kconfig
···
 1 +
 2 + menuconfig ARM64_CRYPTO
 3 + 	bool "ARM64 Accelerated Cryptographic Algorithms"
 4 + 	depends on ARM64
 5 + 	help
 6 + 	  Say Y here to choose from a selection of cryptographic algorithms
 7 + 	  implemented using ARM64 specific CPU features or instructions.
 8 +
 9 + if ARM64_CRYPTO
10 +
11 + config CRYPTO_SHA1_ARM64_CE
12 + 	tristate "SHA-1 digest algorithm (ARMv8 Crypto Extensions)"
13 + 	depends on ARM64 && KERNEL_MODE_NEON
14 + 	select CRYPTO_HASH
15 +
16 + config CRYPTO_SHA2_ARM64_CE
17 + 	tristate "SHA-224/SHA-256 digest algorithm (ARMv8 Crypto Extensions)"
18 + 	depends on ARM64 && KERNEL_MODE_NEON
19 + 	select CRYPTO_HASH
20 +
21 + config CRYPTO_GHASH_ARM64_CE
22 + 	tristate "GHASH (for GCM chaining mode) using ARMv8 Crypto Extensions"
23 + 	depends on ARM64 && KERNEL_MODE_NEON
24 + 	select CRYPTO_HASH
25 +
26 + config CRYPTO_AES_ARM64_CE
27 + 	tristate "AES core cipher using ARMv8 Crypto Extensions"
28 + 	depends on ARM64 && KERNEL_MODE_NEON
29 + 	select CRYPTO_ALGAPI
30 + 	select CRYPTO_AES
31 +
32 + config CRYPTO_AES_ARM64_CE_CCM
33 + 	tristate "AES in CCM mode using ARMv8 Crypto Extensions"
34 + 	depends on ARM64 && KERNEL_MODE_NEON
35 + 	select CRYPTO_ALGAPI
36 + 	select CRYPTO_AES
37 + 	select CRYPTO_AEAD
38 +
39 + config CRYPTO_AES_ARM64_CE_BLK
40 + 	tristate "AES in ECB/CBC/CTR/XTS modes using ARMv8 Crypto Extensions"
41 + 	depends on ARM64 && KERNEL_MODE_NEON
42 + 	select CRYPTO_BLKCIPHER
43 + 	select CRYPTO_AES
44 + 	select CRYPTO_ABLK_HELPER
45 +
46 + config CRYPTO_AES_ARM64_NEON_BLK
47 + 	tristate "AES in ECB/CBC/CTR/XTS modes using NEON instructions"
48 + 	depends on ARM64 && KERNEL_MODE_NEON
49 + 	select CRYPTO_BLKCIPHER
50 + 	select CRYPTO_AES
51 + 	select CRYPTO_ABLK_HELPER
52 +
53 + endif
+38
arch/arm64/crypto/Makefile
···
 1 + #
 2 + # linux/arch/arm64/crypto/Makefile
 3 + #
 4 + # Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
 5 + #
 6 + # This program is free software; you can redistribute it and/or modify
 7 + # it under the terms of the GNU General Public License version 2 as
 8 + # published by the Free Software Foundation.
 9 + #
10 +
11 + obj-$(CONFIG_CRYPTO_SHA1_ARM64_CE) += sha1-ce.o
12 + sha1-ce-y := sha1-ce-glue.o sha1-ce-core.o
13 +
14 + obj-$(CONFIG_CRYPTO_SHA2_ARM64_CE) += sha2-ce.o
15 + sha2-ce-y := sha2-ce-glue.o sha2-ce-core.o
16 +
17 + obj-$(CONFIG_CRYPTO_GHASH_ARM64_CE) += ghash-ce.o
18 + ghash-ce-y := ghash-ce-glue.o ghash-ce-core.o
19 +
20 + obj-$(CONFIG_CRYPTO_AES_ARM64_CE) += aes-ce-cipher.o
21 + CFLAGS_aes-ce-cipher.o += -march=armv8-a+crypto
22 +
23 + obj-$(CONFIG_CRYPTO_AES_ARM64_CE_CCM) += aes-ce-ccm.o
24 + aes-ce-ccm-y := aes-ce-ccm-glue.o aes-ce-ccm-core.o
25 +
26 + obj-$(CONFIG_CRYPTO_AES_ARM64_CE_BLK) += aes-ce-blk.o
27 + aes-ce-blk-y := aes-glue-ce.o aes-ce.o
28 +
29 + obj-$(CONFIG_CRYPTO_AES_ARM64_NEON_BLK) += aes-neon-blk.o
30 + aes-neon-blk-y := aes-glue-neon.o aes-neon.o
31 +
32 + AFLAGS_aes-ce.o := -DINTERLEAVE=2 -DINTERLEAVE_INLINE
33 + AFLAGS_aes-neon.o := -DINTERLEAVE=4
34 +
35 + CFLAGS_aes-glue-ce.o := -DUSE_V8_CRYPTO_EXTENSIONS
36 +
37 + $(obj)/aes-glue-%.o: $(src)/aes-glue.c FORCE
38 + 	$(call if_changed_dep,cc_o_c)
+222
arch/arm64/crypto/aes-ce-ccm-core.S
··· 1 + /* 2 + * aesce-ccm-core.S - AES-CCM transform for ARMv8 with Crypto Extensions 3 + * 4 + * Copyright (C) 2013 - 2014 Linaro Ltd <ard.biesheuvel@linaro.org> 5 + * 6 + * This program is free software; you can redistribute it and/or modify 7 + * it under the terms of the GNU General Public License version 2 as 8 + * published by the Free Software Foundation. 9 + */ 10 + 11 + #include <linux/linkage.h> 12 + 13 + .text 14 + .arch armv8-a+crypto 15 + 16 + /* 17 + * void ce_aes_ccm_auth_data(u8 mac[], u8 const in[], u32 abytes, 18 + * u32 *macp, u8 const rk[], u32 rounds); 19 + */ 20 + ENTRY(ce_aes_ccm_auth_data) 21 + ldr w8, [x3] /* leftover from prev round? */ 22 + ld1 {v0.2d}, [x0] /* load mac */ 23 + cbz w8, 1f 24 + sub w8, w8, #16 25 + eor v1.16b, v1.16b, v1.16b 26 + 0: ldrb w7, [x1], #1 /* get 1 byte of input */ 27 + subs w2, w2, #1 28 + add w8, w8, #1 29 + ins v1.b[0], w7 30 + ext v1.16b, v1.16b, v1.16b, #1 /* rotate in the input bytes */ 31 + beq 8f /* out of input? */ 32 + cbnz w8, 0b 33 + eor v0.16b, v0.16b, v1.16b 34 + 1: ld1 {v3.2d}, [x4] /* load first round key */ 35 + prfm pldl1strm, [x1] 36 + cmp w5, #12 /* which key size? */ 37 + add x6, x4, #16 38 + sub w7, w5, #2 /* modified # of rounds */ 39 + bmi 2f 40 + bne 5f 41 + mov v5.16b, v3.16b 42 + b 4f 43 + 2: mov v4.16b, v3.16b 44 + ld1 {v5.2d}, [x6], #16 /* load 2nd round key */ 45 + 3: aese v0.16b, v4.16b 46 + aesmc v0.16b, v0.16b 47 + 4: ld1 {v3.2d}, [x6], #16 /* load next round key */ 48 + aese v0.16b, v5.16b 49 + aesmc v0.16b, v0.16b 50 + 5: ld1 {v4.2d}, [x6], #16 /* load next round key */ 51 + subs w7, w7, #3 52 + aese v0.16b, v3.16b 53 + aesmc v0.16b, v0.16b 54 + ld1 {v5.2d}, [x6], #16 /* load next round key */ 55 + bpl 3b 56 + aese v0.16b, v4.16b 57 + subs w2, w2, #16 /* last data? */ 58 + eor v0.16b, v0.16b, v5.16b /* final round */ 59 + bmi 6f 60 + ld1 {v1.16b}, [x1], #16 /* load next input block */ 61 + eor v0.16b, v0.16b, v1.16b /* xor with mac */ 62 + bne 1b 63 + 6: st1 {v0.2d}, [x0] /* store mac */ 64 + beq 10f 65 + adds w2, w2, #16 66 + beq 10f 67 + mov w8, w2 68 + 7: ldrb w7, [x1], #1 69 + umov w6, v0.b[0] 70 + eor w6, w6, w7 71 + strb w6, [x0], #1 72 + subs w2, w2, #1 73 + beq 10f 74 + ext v0.16b, v0.16b, v0.16b, #1 /* rotate out the mac bytes */ 75 + b 7b 76 + 8: mov w7, w8 77 + add w8, w8, #16 78 + 9: ext v1.16b, v1.16b, v1.16b, #1 79 + adds w7, w7, #1 80 + bne 9b 81 + eor v0.16b, v0.16b, v1.16b 82 + st1 {v0.2d}, [x0] 83 + 10: str w8, [x3] 84 + ret 85 + ENDPROC(ce_aes_ccm_auth_data) 86 + 87 + /* 88 + * void ce_aes_ccm_final(u8 mac[], u8 const ctr[], u8 const rk[], 89 + * u32 rounds); 90 + */ 91 + ENTRY(ce_aes_ccm_final) 92 + ld1 {v3.2d}, [x2], #16 /* load first round key */ 93 + ld1 {v0.2d}, [x0] /* load mac */ 94 + cmp w3, #12 /* which key size? 
*/ 95 + sub w3, w3, #2 /* modified # of rounds */ 96 + ld1 {v1.2d}, [x1] /* load 1st ctriv */ 97 + bmi 0f 98 + bne 3f 99 + mov v5.16b, v3.16b 100 + b 2f 101 + 0: mov v4.16b, v3.16b 102 + 1: ld1 {v5.2d}, [x2], #16 /* load next round key */ 103 + aese v0.16b, v4.16b 104 + aese v1.16b, v4.16b 105 + aesmc v0.16b, v0.16b 106 + aesmc v1.16b, v1.16b 107 + 2: ld1 {v3.2d}, [x2], #16 /* load next round key */ 108 + aese v0.16b, v5.16b 109 + aese v1.16b, v5.16b 110 + aesmc v0.16b, v0.16b 111 + aesmc v1.16b, v1.16b 112 + 3: ld1 {v4.2d}, [x2], #16 /* load next round key */ 113 + subs w3, w3, #3 114 + aese v0.16b, v3.16b 115 + aese v1.16b, v3.16b 116 + aesmc v0.16b, v0.16b 117 + aesmc v1.16b, v1.16b 118 + bpl 1b 119 + aese v0.16b, v4.16b 120 + aese v1.16b, v4.16b 121 + /* final round key cancels out */ 122 + eor v0.16b, v0.16b, v1.16b /* en-/decrypt the mac */ 123 + st1 {v0.2d}, [x0] /* store result */ 124 + ret 125 + ENDPROC(ce_aes_ccm_final) 126 + 127 + .macro aes_ccm_do_crypt,enc 128 + ldr x8, [x6, #8] /* load lower ctr */ 129 + ld1 {v0.2d}, [x5] /* load mac */ 130 + rev x8, x8 /* keep swabbed ctr in reg */ 131 + 0: /* outer loop */ 132 + ld1 {v1.1d}, [x6] /* load upper ctr */ 133 + prfm pldl1strm, [x1] 134 + add x8, x8, #1 135 + rev x9, x8 136 + cmp w4, #12 /* which key size? */ 137 + sub w7, w4, #2 /* get modified # of rounds */ 138 + ins v1.d[1], x9 /* no carry in lower ctr */ 139 + ld1 {v3.2d}, [x3] /* load first round key */ 140 + add x10, x3, #16 141 + bmi 1f 142 + bne 4f 143 + mov v5.16b, v3.16b 144 + b 3f 145 + 1: mov v4.16b, v3.16b 146 + ld1 {v5.2d}, [x10], #16 /* load 2nd round key */ 147 + 2: /* inner loop: 3 rounds, 2x interleaved */ 148 + aese v0.16b, v4.16b 149 + aese v1.16b, v4.16b 150 + aesmc v0.16b, v0.16b 151 + aesmc v1.16b, v1.16b 152 + 3: ld1 {v3.2d}, [x10], #16 /* load next round key */ 153 + aese v0.16b, v5.16b 154 + aese v1.16b, v5.16b 155 + aesmc v0.16b, v0.16b 156 + aesmc v1.16b, v1.16b 157 + 4: ld1 {v4.2d}, [x10], #16 /* load next round key */ 158 + subs w7, w7, #3 159 + aese v0.16b, v3.16b 160 + aese v1.16b, v3.16b 161 + aesmc v0.16b, v0.16b 162 + aesmc v1.16b, v1.16b 163 + ld1 {v5.2d}, [x10], #16 /* load next round key */ 164 + bpl 2b 165 + aese v0.16b, v4.16b 166 + aese v1.16b, v4.16b 167 + subs w2, w2, #16 168 + bmi 6f /* partial block? 
*/ 169 + ld1 {v2.16b}, [x1], #16 /* load next input block */ 170 + .if \enc == 1 171 + eor v2.16b, v2.16b, v5.16b /* final round enc+mac */ 172 + eor v1.16b, v1.16b, v2.16b /* xor with crypted ctr */ 173 + .else 174 + eor v2.16b, v2.16b, v1.16b /* xor with crypted ctr */ 175 + eor v1.16b, v2.16b, v5.16b /* final round enc */ 176 + .endif 177 + eor v0.16b, v0.16b, v2.16b /* xor mac with pt ^ rk[last] */ 178 + st1 {v1.16b}, [x0], #16 /* write output block */ 179 + bne 0b 180 + rev x8, x8 181 + st1 {v0.2d}, [x5] /* store mac */ 182 + str x8, [x6, #8] /* store lsb end of ctr (BE) */ 183 + 5: ret 184 + 185 + 6: eor v0.16b, v0.16b, v5.16b /* final round mac */ 186 + eor v1.16b, v1.16b, v5.16b /* final round enc */ 187 + st1 {v0.2d}, [x5] /* store mac */ 188 + add w2, w2, #16 /* process partial tail block */ 189 + 7: ldrb w9, [x1], #1 /* get 1 byte of input */ 190 + umov w6, v1.b[0] /* get top crypted ctr byte */ 191 + umov w7, v0.b[0] /* get top mac byte */ 192 + .if \enc == 1 193 + eor w7, w7, w9 194 + eor w9, w9, w6 195 + .else 196 + eor w9, w9, w6 197 + eor w7, w7, w9 198 + .endif 199 + strb w9, [x0], #1 /* store out byte */ 200 + strb w7, [x5], #1 /* store mac byte */ 201 + subs w2, w2, #1 202 + beq 5b 203 + ext v0.16b, v0.16b, v0.16b, #1 /* shift out mac byte */ 204 + ext v1.16b, v1.16b, v1.16b, #1 /* shift out ctr byte */ 205 + b 7b 206 + .endm 207 + 208 + /* 209 + * void ce_aes_ccm_encrypt(u8 out[], u8 const in[], u32 cbytes, 210 + * u8 const rk[], u32 rounds, u8 mac[], 211 + * u8 ctr[]); 212 + * void ce_aes_ccm_decrypt(u8 out[], u8 const in[], u32 cbytes, 213 + * u8 const rk[], u32 rounds, u8 mac[], 214 + * u8 ctr[]); 215 + */ 216 + ENTRY(ce_aes_ccm_encrypt) 217 + aes_ccm_do_crypt 1 218 + ENDPROC(ce_aes_ccm_encrypt) 219 + 220 + ENTRY(ce_aes_ccm_decrypt) 221 + aes_ccm_do_crypt 0 222 + ENDPROC(ce_aes_ccm_decrypt)
+297
arch/arm64/crypto/aes-ce-ccm-glue.c
··· 1 + /* 2 + * aes-ccm-glue.c - AES-CCM transform for ARMv8 with Crypto Extensions 3 + * 4 + * Copyright (C) 2013 - 2014 Linaro Ltd <ard.biesheuvel@linaro.org> 5 + * 6 + * This program is free software; you can redistribute it and/or modify 7 + * it under the terms of the GNU General Public License version 2 as 8 + * published by the Free Software Foundation. 9 + */ 10 + 11 + #include <asm/neon.h> 12 + #include <asm/unaligned.h> 13 + #include <crypto/aes.h> 14 + #include <crypto/algapi.h> 15 + #include <crypto/scatterwalk.h> 16 + #include <linux/crypto.h> 17 + #include <linux/module.h> 18 + 19 + static int num_rounds(struct crypto_aes_ctx *ctx) 20 + { 21 + /* 22 + * # of rounds specified by AES: 23 + * 128 bit key 10 rounds 24 + * 192 bit key 12 rounds 25 + * 256 bit key 14 rounds 26 + * => n byte key => 6 + (n/4) rounds 27 + */ 28 + return 6 + ctx->key_length / 4; 29 + } 30 + 31 + asmlinkage void ce_aes_ccm_auth_data(u8 mac[], u8 const in[], u32 abytes, 32 + u32 *macp, u32 const rk[], u32 rounds); 33 + 34 + asmlinkage void ce_aes_ccm_encrypt(u8 out[], u8 const in[], u32 cbytes, 35 + u32 const rk[], u32 rounds, u8 mac[], 36 + u8 ctr[]); 37 + 38 + asmlinkage void ce_aes_ccm_decrypt(u8 out[], u8 const in[], u32 cbytes, 39 + u32 const rk[], u32 rounds, u8 mac[], 40 + u8 ctr[]); 41 + 42 + asmlinkage void ce_aes_ccm_final(u8 mac[], u8 const ctr[], u32 const rk[], 43 + u32 rounds); 44 + 45 + static int ccm_setkey(struct crypto_aead *tfm, const u8 *in_key, 46 + unsigned int key_len) 47 + { 48 + struct crypto_aes_ctx *ctx = crypto_aead_ctx(tfm); 49 + int ret; 50 + 51 + ret = crypto_aes_expand_key(ctx, in_key, key_len); 52 + if (!ret) 53 + return 0; 54 + 55 + tfm->base.crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; 56 + return -EINVAL; 57 + } 58 + 59 + static int ccm_setauthsize(struct crypto_aead *tfm, unsigned int authsize) 60 + { 61 + if ((authsize & 1) || authsize < 4) 62 + return -EINVAL; 63 + return 0; 64 + } 65 + 66 + static int ccm_init_mac(struct aead_request *req, u8 maciv[], u32 msglen) 67 + { 68 + struct crypto_aead *aead = crypto_aead_reqtfm(req); 69 + __be32 *n = (__be32 *)&maciv[AES_BLOCK_SIZE - 8]; 70 + u32 l = req->iv[0] + 1; 71 + 72 + /* verify that CCM dimension 'L' is set correctly in the IV */ 73 + if (l < 2 || l > 8) 74 + return -EINVAL; 75 + 76 + /* verify that msglen can in fact be represented in L bytes */ 77 + if (l < 4 && msglen >> (8 * l)) 78 + return -EOVERFLOW; 79 + 80 + /* 81 + * Even if the CCM spec allows L values of up to 8, the Linux cryptoapi 82 + * uses a u32 type to represent msglen so the top 4 bytes are always 0. 
83 + */ 84 + n[0] = 0; 85 + n[1] = cpu_to_be32(msglen); 86 + 87 + memcpy(maciv, req->iv, AES_BLOCK_SIZE - l); 88 + 89 + /* 90 + * Meaning of byte 0 according to CCM spec (RFC 3610/NIST 800-38C) 91 + * - bits 0..2 : max # of bytes required to represent msglen, minus 1 92 + * (already set by caller) 93 + * - bits 3..5 : size of auth tag (1 => 4 bytes, 2 => 6 bytes, etc) 94 + * - bit 6 : indicates presence of authenticate-only data 95 + */ 96 + maciv[0] |= (crypto_aead_authsize(aead) - 2) << 2; 97 + if (req->assoclen) 98 + maciv[0] |= 0x40; 99 + 100 + memset(&req->iv[AES_BLOCK_SIZE - l], 0, l); 101 + return 0; 102 + } 103 + 104 + static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[]) 105 + { 106 + struct crypto_aead *aead = crypto_aead_reqtfm(req); 107 + struct crypto_aes_ctx *ctx = crypto_aead_ctx(aead); 108 + struct __packed { __be16 l; __be32 h; u16 len; } ltag; 109 + struct scatter_walk walk; 110 + u32 len = req->assoclen; 111 + u32 macp = 0; 112 + 113 + /* prepend the AAD with a length tag */ 114 + if (len < 0xff00) { 115 + ltag.l = cpu_to_be16(len); 116 + ltag.len = 2; 117 + } else { 118 + ltag.l = cpu_to_be16(0xfffe); 119 + put_unaligned_be32(len, &ltag.h); 120 + ltag.len = 6; 121 + } 122 + 123 + ce_aes_ccm_auth_data(mac, (u8 *)&ltag, ltag.len, &macp, ctx->key_enc, 124 + num_rounds(ctx)); 125 + scatterwalk_start(&walk, req->assoc); 126 + 127 + do { 128 + u32 n = scatterwalk_clamp(&walk, len); 129 + u8 *p; 130 + 131 + if (!n) { 132 + scatterwalk_start(&walk, sg_next(walk.sg)); 133 + n = scatterwalk_clamp(&walk, len); 134 + } 135 + p = scatterwalk_map(&walk); 136 + ce_aes_ccm_auth_data(mac, p, n, &macp, ctx->key_enc, 137 + num_rounds(ctx)); 138 + len -= n; 139 + 140 + scatterwalk_unmap(p); 141 + scatterwalk_advance(&walk, n); 142 + scatterwalk_done(&walk, 0, len); 143 + } while (len); 144 + } 145 + 146 + static int ccm_encrypt(struct aead_request *req) 147 + { 148 + struct crypto_aead *aead = crypto_aead_reqtfm(req); 149 + struct crypto_aes_ctx *ctx = crypto_aead_ctx(aead); 150 + struct blkcipher_desc desc = { .info = req->iv }; 151 + struct blkcipher_walk walk; 152 + u8 __aligned(8) mac[AES_BLOCK_SIZE]; 153 + u8 buf[AES_BLOCK_SIZE]; 154 + u32 len = req->cryptlen; 155 + int err; 156 + 157 + err = ccm_init_mac(req, mac, len); 158 + if (err) 159 + return err; 160 + 161 + kernel_neon_begin_partial(6); 162 + 163 + if (req->assoclen) 164 + ccm_calculate_auth_mac(req, mac); 165 + 166 + /* preserve the original iv for the final round */ 167 + memcpy(buf, req->iv, AES_BLOCK_SIZE); 168 + 169 + blkcipher_walk_init(&walk, req->dst, req->src, len); 170 + err = blkcipher_aead_walk_virt_block(&desc, &walk, aead, 171 + AES_BLOCK_SIZE); 172 + 173 + while (walk.nbytes) { 174 + u32 tail = walk.nbytes % AES_BLOCK_SIZE; 175 + 176 + if (walk.nbytes == len) 177 + tail = 0; 178 + 179 + ce_aes_ccm_encrypt(walk.dst.virt.addr, walk.src.virt.addr, 180 + walk.nbytes - tail, ctx->key_enc, 181 + num_rounds(ctx), mac, walk.iv); 182 + 183 + len -= walk.nbytes - tail; 184 + err = blkcipher_walk_done(&desc, &walk, tail); 185 + } 186 + if (!err) 187 + ce_aes_ccm_final(mac, buf, ctx->key_enc, num_rounds(ctx)); 188 + 189 + kernel_neon_end(); 190 + 191 + if (err) 192 + return err; 193 + 194 + /* copy authtag to end of dst */ 195 + scatterwalk_map_and_copy(mac, req->dst, req->cryptlen, 196 + crypto_aead_authsize(aead), 1); 197 + 198 + return 0; 199 + } 200 + 201 + static int ccm_decrypt(struct aead_request *req) 202 + { 203 + struct crypto_aead *aead = crypto_aead_reqtfm(req); 204 + struct crypto_aes_ctx *ctx 
= crypto_aead_ctx(aead); 205 + unsigned int authsize = crypto_aead_authsize(aead); 206 + struct blkcipher_desc desc = { .info = req->iv }; 207 + struct blkcipher_walk walk; 208 + u8 __aligned(8) mac[AES_BLOCK_SIZE]; 209 + u8 buf[AES_BLOCK_SIZE]; 210 + u32 len = req->cryptlen - authsize; 211 + int err; 212 + 213 + err = ccm_init_mac(req, mac, len); 214 + if (err) 215 + return err; 216 + 217 + kernel_neon_begin_partial(6); 218 + 219 + if (req->assoclen) 220 + ccm_calculate_auth_mac(req, mac); 221 + 222 + /* preserve the original iv for the final round */ 223 + memcpy(buf, req->iv, AES_BLOCK_SIZE); 224 + 225 + blkcipher_walk_init(&walk, req->dst, req->src, len); 226 + err = blkcipher_aead_walk_virt_block(&desc, &walk, aead, 227 + AES_BLOCK_SIZE); 228 + 229 + while (walk.nbytes) { 230 + u32 tail = walk.nbytes % AES_BLOCK_SIZE; 231 + 232 + if (walk.nbytes == len) 233 + tail = 0; 234 + 235 + ce_aes_ccm_decrypt(walk.dst.virt.addr, walk.src.virt.addr, 236 + walk.nbytes - tail, ctx->key_enc, 237 + num_rounds(ctx), mac, walk.iv); 238 + 239 + len -= walk.nbytes - tail; 240 + err = blkcipher_walk_done(&desc, &walk, tail); 241 + } 242 + if (!err) 243 + ce_aes_ccm_final(mac, buf, ctx->key_enc, num_rounds(ctx)); 244 + 245 + kernel_neon_end(); 246 + 247 + if (err) 248 + return err; 249 + 250 + /* compare calculated auth tag with the stored one */ 251 + scatterwalk_map_and_copy(buf, req->src, req->cryptlen - authsize, 252 + authsize, 0); 253 + 254 + if (memcmp(mac, buf, authsize)) 255 + return -EBADMSG; 256 + return 0; 257 + } 258 + 259 + static struct crypto_alg ccm_aes_alg = { 260 + .cra_name = "ccm(aes)", 261 + .cra_driver_name = "ccm-aes-ce", 262 + .cra_priority = 300, 263 + .cra_flags = CRYPTO_ALG_TYPE_AEAD, 264 + .cra_blocksize = 1, 265 + .cra_ctxsize = sizeof(struct crypto_aes_ctx), 266 + .cra_alignmask = 7, 267 + .cra_type = &crypto_aead_type, 268 + .cra_module = THIS_MODULE, 269 + .cra_aead = { 270 + .ivsize = AES_BLOCK_SIZE, 271 + .maxauthsize = AES_BLOCK_SIZE, 272 + .setkey = ccm_setkey, 273 + .setauthsize = ccm_setauthsize, 274 + .encrypt = ccm_encrypt, 275 + .decrypt = ccm_decrypt, 276 + } 277 + }; 278 + 279 + static int __init aes_mod_init(void) 280 + { 281 + if (!(elf_hwcap & HWCAP_AES)) 282 + return -ENODEV; 283 + return crypto_register_alg(&ccm_aes_alg); 284 + } 285 + 286 + static void __exit aes_mod_exit(void) 287 + { 288 + crypto_unregister_alg(&ccm_aes_alg); 289 + } 290 + 291 + module_init(aes_mod_init); 292 + module_exit(aes_mod_exit); 293 + 294 + MODULE_DESCRIPTION("Synchronous AES in CCM mode using ARMv8 Crypto Extensions"); 295 + MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); 296 + MODULE_LICENSE("GPL v2"); 297 + MODULE_ALIAS("ccm(aes)");
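
Note: the byte-0 layout documented in ccm_init_mac() above can be worked through concretely. The following standalone sketch is not part of the patch (ccm_b0_flags() is a hypothetical helper); for a 12-byte nonce the length field L is 3 octets (so iv[0] = L - 1 = 2), and with an 8-byte tag plus associated data present the flags byte comes out to 0x5a:

#include <assert.h>

/* bits 0..2: L - 1, bits 3..5: (tag length - 2) / 2, bit 6: AAD present */
static unsigned char ccm_b0_flags(unsigned int l, unsigned int authsize,
                                  int has_assoc)
{
        unsigned char flags = l - 1;

        flags |= ((authsize - 2) / 2) << 3;
        if (has_assoc)
                flags |= 0x40;
        return flags;
}

int main(void)
{
        assert(ccm_b0_flags(3, 8, 1) == 0x5a);  /* 0x40 | (3 << 3) | 2 */
        return 0;
}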
+155
arch/arm64/crypto/aes-ce-cipher.c
··· 1 + /* 2 + * aes-ce-cipher.c - core AES cipher using ARMv8 Crypto Extensions 3 + * 4 + * Copyright (C) 2013 - 2014 Linaro Ltd <ard.biesheuvel@linaro.org> 5 + * 6 + * This program is free software; you can redistribute it and/or modify 7 + * it under the terms of the GNU General Public License version 2 as 8 + * published by the Free Software Foundation. 9 + */ 10 + 11 + #include <asm/neon.h> 12 + #include <crypto/aes.h> 13 + #include <linux/cpufeature.h> 14 + #include <linux/crypto.h> 15 + #include <linux/module.h> 16 + 17 + MODULE_DESCRIPTION("Synchronous AES cipher using ARMv8 Crypto Extensions"); 18 + MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); 19 + MODULE_LICENSE("GPL v2"); 20 + 21 + struct aes_block { 22 + u8 b[AES_BLOCK_SIZE]; 23 + }; 24 + 25 + static int num_rounds(struct crypto_aes_ctx *ctx) 26 + { 27 + /* 28 + * # of rounds specified by AES: 29 + * 128 bit key 10 rounds 30 + * 192 bit key 12 rounds 31 + * 256 bit key 14 rounds 32 + * => n byte key => 6 + (n/4) rounds 33 + */ 34 + return 6 + ctx->key_length / 4; 35 + } 36 + 37 + static void aes_cipher_encrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[]) 38 + { 39 + struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm); 40 + struct aes_block *out = (struct aes_block *)dst; 41 + struct aes_block const *in = (struct aes_block *)src; 42 + void *dummy0; 43 + int dummy1; 44 + 45 + kernel_neon_begin_partial(4); 46 + 47 + __asm__(" ld1 {v0.16b}, %[in] ;" 48 + " ld1 {v1.2d}, [%[key]], #16 ;" 49 + " cmp %w[rounds], #10 ;" 50 + " bmi 0f ;" 51 + " bne 3f ;" 52 + " mov v3.16b, v1.16b ;" 53 + " b 2f ;" 54 + "0: mov v2.16b, v1.16b ;" 55 + " ld1 {v3.2d}, [%[key]], #16 ;" 56 + "1: aese v0.16b, v2.16b ;" 57 + " aesmc v0.16b, v0.16b ;" 58 + "2: ld1 {v1.2d}, [%[key]], #16 ;" 59 + " aese v0.16b, v3.16b ;" 60 + " aesmc v0.16b, v0.16b ;" 61 + "3: ld1 {v2.2d}, [%[key]], #16 ;" 62 + " subs %w[rounds], %w[rounds], #3 ;" 63 + " aese v0.16b, v1.16b ;" 64 + " aesmc v0.16b, v0.16b ;" 65 + " ld1 {v3.2d}, [%[key]], #16 ;" 66 + " bpl 1b ;" 67 + " aese v0.16b, v2.16b ;" 68 + " eor v0.16b, v0.16b, v3.16b ;" 69 + " st1 {v0.16b}, %[out] ;" 70 + 71 + : [out] "=Q"(*out), 72 + [key] "=r"(dummy0), 73 + [rounds] "=r"(dummy1) 74 + : [in] "Q"(*in), 75 + "1"(ctx->key_enc), 76 + "2"(num_rounds(ctx) - 2) 77 + : "cc"); 78 + 79 + kernel_neon_end(); 80 + } 81 + 82 + static void aes_cipher_decrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[]) 83 + { 84 + struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm); 85 + struct aes_block *out = (struct aes_block *)dst; 86 + struct aes_block const *in = (struct aes_block *)src; 87 + void *dummy0; 88 + int dummy1; 89 + 90 + kernel_neon_begin_partial(4); 91 + 92 + __asm__(" ld1 {v0.16b}, %[in] ;" 93 + " ld1 {v1.2d}, [%[key]], #16 ;" 94 + " cmp %w[rounds], #10 ;" 95 + " bmi 0f ;" 96 + " bne 3f ;" 97 + " mov v3.16b, v1.16b ;" 98 + " b 2f ;" 99 + "0: mov v2.16b, v1.16b ;" 100 + " ld1 {v3.2d}, [%[key]], #16 ;" 101 + "1: aesd v0.16b, v2.16b ;" 102 + " aesimc v0.16b, v0.16b ;" 103 + "2: ld1 {v1.2d}, [%[key]], #16 ;" 104 + " aesd v0.16b, v3.16b ;" 105 + " aesimc v0.16b, v0.16b ;" 106 + "3: ld1 {v2.2d}, [%[key]], #16 ;" 107 + " subs %w[rounds], %w[rounds], #3 ;" 108 + " aesd v0.16b, v1.16b ;" 109 + " aesimc v0.16b, v0.16b ;" 110 + " ld1 {v3.2d}, [%[key]], #16 ;" 111 + " bpl 1b ;" 112 + " aesd v0.16b, v2.16b ;" 113 + " eor v0.16b, v0.16b, v3.16b ;" 114 + " st1 {v0.16b}, %[out] ;" 115 + 116 + : [out] "=Q"(*out), 117 + [key] "=r"(dummy0), 118 + [rounds] "=r"(dummy1) 119 + : [in] "Q"(*in), 120 + "1"(ctx->key_dec), 121 + 
"2"(num_rounds(ctx) - 2) 122 + : "cc"); 123 + 124 + kernel_neon_end(); 125 + } 126 + 127 + static struct crypto_alg aes_alg = { 128 + .cra_name = "aes", 129 + .cra_driver_name = "aes-ce", 130 + .cra_priority = 300, 131 + .cra_flags = CRYPTO_ALG_TYPE_CIPHER, 132 + .cra_blocksize = AES_BLOCK_SIZE, 133 + .cra_ctxsize = sizeof(struct crypto_aes_ctx), 134 + .cra_module = THIS_MODULE, 135 + .cra_cipher = { 136 + .cia_min_keysize = AES_MIN_KEY_SIZE, 137 + .cia_max_keysize = AES_MAX_KEY_SIZE, 138 + .cia_setkey = crypto_aes_set_key, 139 + .cia_encrypt = aes_cipher_encrypt, 140 + .cia_decrypt = aes_cipher_decrypt 141 + } 142 + }; 143 + 144 + static int __init aes_mod_init(void) 145 + { 146 + return crypto_register_alg(&aes_alg); 147 + } 148 + 149 + static void __exit aes_mod_exit(void) 150 + { 151 + crypto_unregister_alg(&aes_alg); 152 + } 153 + 154 + module_cpu_feature_match(AES, aes_mod_init); 155 + module_exit(aes_mod_exit);
+133
arch/arm64/crypto/aes-ce.S
··· 1 + /* 2 + * linux/arch/arm64/crypto/aes-ce.S - AES cipher for ARMv8 with 3 + * Crypto Extensions 4 + * 5 + * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org> 6 + * 7 + * This program is free software; you can redistribute it and/or modify 8 + * it under the terms of the GNU General Public License version 2 as 9 + * published by the Free Software Foundation. 10 + */ 11 + 12 + #include <linux/linkage.h> 13 + 14 + #define AES_ENTRY(func) ENTRY(ce_ ## func) 15 + #define AES_ENDPROC(func) ENDPROC(ce_ ## func) 16 + 17 + .arch armv8-a+crypto 18 + 19 + /* preload all round keys */ 20 + .macro load_round_keys, rounds, rk 21 + cmp \rounds, #12 22 + blo 2222f /* 128 bits */ 23 + beq 1111f /* 192 bits */ 24 + ld1 {v17.16b-v18.16b}, [\rk], #32 25 + 1111: ld1 {v19.16b-v20.16b}, [\rk], #32 26 + 2222: ld1 {v21.16b-v24.16b}, [\rk], #64 27 + ld1 {v25.16b-v28.16b}, [\rk], #64 28 + ld1 {v29.16b-v31.16b}, [\rk] 29 + .endm 30 + 31 + /* prepare for encryption with key in rk[] */ 32 + .macro enc_prepare, rounds, rk, ignore 33 + load_round_keys \rounds, \rk 34 + .endm 35 + 36 + /* prepare for encryption (again) but with new key in rk[] */ 37 + .macro enc_switch_key, rounds, rk, ignore 38 + load_round_keys \rounds, \rk 39 + .endm 40 + 41 + /* prepare for decryption with key in rk[] */ 42 + .macro dec_prepare, rounds, rk, ignore 43 + load_round_keys \rounds, \rk 44 + .endm 45 + 46 + .macro do_enc_Nx, de, mc, k, i0, i1, i2, i3 47 + aes\de \i0\().16b, \k\().16b 48 + .ifnb \i1 49 + aes\de \i1\().16b, \k\().16b 50 + .ifnb \i3 51 + aes\de \i2\().16b, \k\().16b 52 + aes\de \i3\().16b, \k\().16b 53 + .endif 54 + .endif 55 + aes\mc \i0\().16b, \i0\().16b 56 + .ifnb \i1 57 + aes\mc \i1\().16b, \i1\().16b 58 + .ifnb \i3 59 + aes\mc \i2\().16b, \i2\().16b 60 + aes\mc \i3\().16b, \i3\().16b 61 + .endif 62 + .endif 63 + .endm 64 + 65 + /* up to 4 interleaved encryption rounds with the same round key */ 66 + .macro round_Nx, enc, k, i0, i1, i2, i3 67 + .ifc \enc, e 68 + do_enc_Nx e, mc, \k, \i0, \i1, \i2, \i3 69 + .else 70 + do_enc_Nx d, imc, \k, \i0, \i1, \i2, \i3 71 + .endif 72 + .endm 73 + 74 + /* up to 4 interleaved final rounds */ 75 + .macro fin_round_Nx, de, k, k2, i0, i1, i2, i3 76 + aes\de \i0\().16b, \k\().16b 77 + .ifnb \i1 78 + aes\de \i1\().16b, \k\().16b 79 + .ifnb \i3 80 + aes\de \i2\().16b, \k\().16b 81 + aes\de \i3\().16b, \k\().16b 82 + .endif 83 + .endif 84 + eor \i0\().16b, \i0\().16b, \k2\().16b 85 + .ifnb \i1 86 + eor \i1\().16b, \i1\().16b, \k2\().16b 87 + .ifnb \i3 88 + eor \i2\().16b, \i2\().16b, \k2\().16b 89 + eor \i3\().16b, \i3\().16b, \k2\().16b 90 + .endif 91 + .endif 92 + .endm 93 + 94 + /* up to 4 interleaved blocks */ 95 + .macro do_block_Nx, enc, rounds, i0, i1, i2, i3 96 + cmp \rounds, #12 97 + blo 2222f /* 128 bits */ 98 + beq 1111f /* 192 bits */ 99 + round_Nx \enc, v17, \i0, \i1, \i2, \i3 100 + round_Nx \enc, v18, \i0, \i1, \i2, \i3 101 + 1111: round_Nx \enc, v19, \i0, \i1, \i2, \i3 102 + round_Nx \enc, v20, \i0, \i1, \i2, \i3 103 + 2222: .irp key, v21, v22, v23, v24, v25, v26, v27, v28, v29 104 + round_Nx \enc, \key, \i0, \i1, \i2, \i3 105 + .endr 106 + fin_round_Nx \enc, v30, v31, \i0, \i1, \i2, \i3 107 + .endm 108 + 109 + .macro encrypt_block, in, rounds, t0, t1, t2 110 + do_block_Nx e, \rounds, \in 111 + .endm 112 + 113 + .macro encrypt_block2x, i0, i1, rounds, t0, t1, t2 114 + do_block_Nx e, \rounds, \i0, \i1 115 + .endm 116 + 117 + .macro encrypt_block4x, i0, i1, i2, i3, rounds, t0, t1, t2 118 + do_block_Nx e, \rounds, \i0, \i1, \i2, \i3 119 + .endm 120 + 121 + .macro 
decrypt_block, in, rounds, t0, t1, t2 122 + do_block_Nx d, \rounds, \in 123 + .endm 124 + 125 + .macro decrypt_block2x, i0, i1, rounds, t0, t1, t2 126 + do_block_Nx d, \rounds, \i0, \i1 127 + .endm 128 + 129 + .macro decrypt_block4x, i0, i1, i2, i3, rounds, t0, t1, t2 130 + do_block_Nx d, \rounds, \i0, \i1, \i2, \i3 131 + .endm 132 + 133 + #include "aes-modes.S"
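
Note: load_round_keys above keeps the entire expanded key resident in NEON registers; the register ranges follow from the usual rounds + 1 round keys of 16 bytes each. A standalone sanity check (illustration only, not part of the patch):

#include <assert.h>

static int round_keys(int rounds)
{
        return rounds + 1;      /* initial whitening key plus one per round */
}

int main(void)
{
        assert(round_keys(10) == 11);   /* AES-128: v21..v31 */
        assert(round_keys(12) == 13);   /* AES-192: v19..v31 */
        assert(round_keys(14) == 15);   /* AES-256: v17..v31 */
        return 0;
}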
+446
arch/arm64/crypto/aes-glue.c
··· 1 + /* 2 + * linux/arch/arm64/crypto/aes-glue.c - wrapper code for ARMv8 AES 3 + * 4 + * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org> 5 + * 6 + * This program is free software; you can redistribute it and/or modify 7 + * it under the terms of the GNU General Public License version 2 as 8 + * published by the Free Software Foundation. 9 + */ 10 + 11 + #include <asm/neon.h> 12 + #include <asm/hwcap.h> 13 + #include <crypto/aes.h> 14 + #include <crypto/ablk_helper.h> 15 + #include <crypto/algapi.h> 16 + #include <linux/module.h> 17 + #include <linux/cpufeature.h> 18 + 19 + #ifdef USE_V8_CRYPTO_EXTENSIONS 20 + #define MODE "ce" 21 + #define PRIO 300 22 + #define aes_ecb_encrypt ce_aes_ecb_encrypt 23 + #define aes_ecb_decrypt ce_aes_ecb_decrypt 24 + #define aes_cbc_encrypt ce_aes_cbc_encrypt 25 + #define aes_cbc_decrypt ce_aes_cbc_decrypt 26 + #define aes_ctr_encrypt ce_aes_ctr_encrypt 27 + #define aes_xts_encrypt ce_aes_xts_encrypt 28 + #define aes_xts_decrypt ce_aes_xts_decrypt 29 + MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 Crypto Extensions"); 30 + #else 31 + #define MODE "neon" 32 + #define PRIO 200 33 + #define aes_ecb_encrypt neon_aes_ecb_encrypt 34 + #define aes_ecb_decrypt neon_aes_ecb_decrypt 35 + #define aes_cbc_encrypt neon_aes_cbc_encrypt 36 + #define aes_cbc_decrypt neon_aes_cbc_decrypt 37 + #define aes_ctr_encrypt neon_aes_ctr_encrypt 38 + #define aes_xts_encrypt neon_aes_xts_encrypt 39 + #define aes_xts_decrypt neon_aes_xts_decrypt 40 + MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 NEON"); 41 + MODULE_ALIAS("ecb(aes)"); 42 + MODULE_ALIAS("cbc(aes)"); 43 + MODULE_ALIAS("ctr(aes)"); 44 + MODULE_ALIAS("xts(aes)"); 45 + #endif 46 + 47 + MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); 48 + MODULE_LICENSE("GPL v2"); 49 + 50 + /* defined in aes-modes.S */ 51 + asmlinkage void aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], 52 + int rounds, int blocks, int first); 53 + asmlinkage void aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], 54 + int rounds, int blocks, int first); 55 + 56 + asmlinkage void aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], 57 + int rounds, int blocks, u8 iv[], int first); 58 + asmlinkage void aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], 59 + int rounds, int blocks, u8 iv[], int first); 60 + 61 + asmlinkage void aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], 62 + int rounds, int blocks, u8 ctr[], int first); 63 + 64 + asmlinkage void aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], 65 + int rounds, int blocks, u8 const rk2[], u8 iv[], 66 + int first); 67 + asmlinkage void aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], 68 + int rounds, int blocks, u8 const rk2[], u8 iv[], 69 + int first); 70 + 71 + struct crypto_aes_xts_ctx { 72 + struct crypto_aes_ctx key1; 73 + struct crypto_aes_ctx __aligned(8) key2; 74 + }; 75 + 76 + static int xts_set_key(struct crypto_tfm *tfm, const u8 *in_key, 77 + unsigned int key_len) 78 + { 79 + struct crypto_aes_xts_ctx *ctx = crypto_tfm_ctx(tfm); 80 + int ret; 81 + 82 + ret = crypto_aes_expand_key(&ctx->key1, in_key, key_len / 2); 83 + if (!ret) 84 + ret = crypto_aes_expand_key(&ctx->key2, &in_key[key_len / 2], 85 + key_len / 2); 86 + if (!ret) 87 + return 0; 88 + 89 + tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; 90 + return -EINVAL; 91 + } 92 + 93 + static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 94 + struct scatterlist *src, unsigned int nbytes) 95 + { 96 + struct crypto_aes_ctx *ctx = 
crypto_blkcipher_ctx(desc->tfm); 97 + int err, first, rounds = 6 + ctx->key_length / 4; 98 + struct blkcipher_walk walk; 99 + unsigned int blocks; 100 + 101 + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; 102 + blkcipher_walk_init(&walk, dst, src, nbytes); 103 + err = blkcipher_walk_virt(desc, &walk); 104 + 105 + kernel_neon_begin(); 106 + for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) { 107 + aes_ecb_encrypt(walk.dst.virt.addr, walk.src.virt.addr, 108 + (u8 *)ctx->key_enc, rounds, blocks, first); 109 + err = blkcipher_walk_done(desc, &walk, 0); 110 + } 111 + kernel_neon_end(); 112 + return err; 113 + } 114 + 115 + static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 116 + struct scatterlist *src, unsigned int nbytes) 117 + { 118 + struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); 119 + int err, first, rounds = 6 + ctx->key_length / 4; 120 + struct blkcipher_walk walk; 121 + unsigned int blocks; 122 + 123 + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; 124 + blkcipher_walk_init(&walk, dst, src, nbytes); 125 + err = blkcipher_walk_virt(desc, &walk); 126 + 127 + kernel_neon_begin(); 128 + for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) { 129 + aes_ecb_decrypt(walk.dst.virt.addr, walk.src.virt.addr, 130 + (u8 *)ctx->key_dec, rounds, blocks, first); 131 + err = blkcipher_walk_done(desc, &walk, 0); 132 + } 133 + kernel_neon_end(); 134 + return err; 135 + } 136 + 137 + static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 138 + struct scatterlist *src, unsigned int nbytes) 139 + { 140 + struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); 141 + int err, first, rounds = 6 + ctx->key_length / 4; 142 + struct blkcipher_walk walk; 143 + unsigned int blocks; 144 + 145 + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; 146 + blkcipher_walk_init(&walk, dst, src, nbytes); 147 + err = blkcipher_walk_virt(desc, &walk); 148 + 149 + kernel_neon_begin(); 150 + for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) { 151 + aes_cbc_encrypt(walk.dst.virt.addr, walk.src.virt.addr, 152 + (u8 *)ctx->key_enc, rounds, blocks, walk.iv, 153 + first); 154 + err = blkcipher_walk_done(desc, &walk, 0); 155 + } 156 + kernel_neon_end(); 157 + return err; 158 + } 159 + 160 + static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 161 + struct scatterlist *src, unsigned int nbytes) 162 + { 163 + struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); 164 + int err, first, rounds = 6 + ctx->key_length / 4; 165 + struct blkcipher_walk walk; 166 + unsigned int blocks; 167 + 168 + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; 169 + blkcipher_walk_init(&walk, dst, src, nbytes); 170 + err = blkcipher_walk_virt(desc, &walk); 171 + 172 + kernel_neon_begin(); 173 + for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) { 174 + aes_cbc_decrypt(walk.dst.virt.addr, walk.src.virt.addr, 175 + (u8 *)ctx->key_dec, rounds, blocks, walk.iv, 176 + first); 177 + err = blkcipher_walk_done(desc, &walk, 0); 178 + } 179 + kernel_neon_end(); 180 + return err; 181 + } 182 + 183 + static int ctr_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 184 + struct scatterlist *src, unsigned int nbytes) 185 + { 186 + struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); 187 + int err, first, rounds = 6 + ctx->key_length / 4; 188 + struct blkcipher_walk walk; 189 + int blocks; 190 + 191 + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; 192 + blkcipher_walk_init(&walk, dst, src, nbytes); 193 + err = 
blkcipher_walk_virt_block(desc, &walk, AES_BLOCK_SIZE); 194 + 195 + first = 1; 196 + kernel_neon_begin(); 197 + while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) { 198 + aes_ctr_encrypt(walk.dst.virt.addr, walk.src.virt.addr, 199 + (u8 *)ctx->key_enc, rounds, blocks, walk.iv, 200 + first); 201 + first = 0; 202 + nbytes -= blocks * AES_BLOCK_SIZE; 203 + if (nbytes && nbytes == walk.nbytes % AES_BLOCK_SIZE) 204 + break; 205 + err = blkcipher_walk_done(desc, &walk, 206 + walk.nbytes % AES_BLOCK_SIZE); 207 + } 208 + if (nbytes) { 209 + u8 *tdst = walk.dst.virt.addr + blocks * AES_BLOCK_SIZE; 210 + u8 *tsrc = walk.src.virt.addr + blocks * AES_BLOCK_SIZE; 211 + u8 __aligned(8) tail[AES_BLOCK_SIZE]; 212 + 213 + /* 214 + * Minimum alignment is 8 bytes, so if nbytes is <= 8, we need 215 + * to tell aes_ctr_encrypt() to only read half a block. 216 + */ 217 + blocks = (nbytes <= 8) ? -1 : 1; 218 + 219 + aes_ctr_encrypt(tail, tsrc, (u8 *)ctx->key_enc, rounds, 220 + blocks, walk.iv, first); 221 + memcpy(tdst, tail, nbytes); 222 + err = blkcipher_walk_done(desc, &walk, 0); 223 + } 224 + kernel_neon_end(); 225 + 226 + return err; 227 + } 228 + 229 + static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 230 + struct scatterlist *src, unsigned int nbytes) 231 + { 232 + struct crypto_aes_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); 233 + int err, first, rounds = 6 + ctx->key1.key_length / 4; 234 + struct blkcipher_walk walk; 235 + unsigned int blocks; 236 + 237 + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; 238 + blkcipher_walk_init(&walk, dst, src, nbytes); 239 + err = blkcipher_walk_virt(desc, &walk); 240 + 241 + kernel_neon_begin(); 242 + for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) { 243 + aes_xts_encrypt(walk.dst.virt.addr, walk.src.virt.addr, 244 + (u8 *)ctx->key1.key_enc, rounds, blocks, 245 + (u8 *)ctx->key2.key_enc, walk.iv, first); 246 + err = blkcipher_walk_done(desc, &walk, 0); 247 + } 248 + kernel_neon_end(); 249 + 250 + return err; 251 + } 252 + 253 + static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 254 + struct scatterlist *src, unsigned int nbytes) 255 + { 256 + struct crypto_aes_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); 257 + int err, first, rounds = 6 + ctx->key1.key_length / 4; 258 + struct blkcipher_walk walk; 259 + unsigned int blocks; 260 + 261 + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; 262 + blkcipher_walk_init(&walk, dst, src, nbytes); 263 + err = blkcipher_walk_virt(desc, &walk); 264 + 265 + kernel_neon_begin(); 266 + for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) { 267 + aes_xts_decrypt(walk.dst.virt.addr, walk.src.virt.addr, 268 + (u8 *)ctx->key1.key_dec, rounds, blocks, 269 + (u8 *)ctx->key2.key_enc, walk.iv, first); 270 + err = blkcipher_walk_done(desc, &walk, 0); 271 + } 272 + kernel_neon_end(); 273 + 274 + return err; 275 + } 276 + 277 + static struct crypto_alg aes_algs[] = { { 278 + .cra_name = "__ecb-aes-" MODE, 279 + .cra_driver_name = "__driver-ecb-aes-" MODE, 280 + .cra_priority = 0, 281 + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 282 + .cra_blocksize = AES_BLOCK_SIZE, 283 + .cra_ctxsize = sizeof(struct crypto_aes_ctx), 284 + .cra_alignmask = 7, 285 + .cra_type = &crypto_blkcipher_type, 286 + .cra_module = THIS_MODULE, 287 + .cra_blkcipher = { 288 + .min_keysize = AES_MIN_KEY_SIZE, 289 + .max_keysize = AES_MAX_KEY_SIZE, 290 + .ivsize = AES_BLOCK_SIZE, 291 + .setkey = crypto_aes_set_key, 292 + .encrypt = ecb_encrypt, 293 + .decrypt = ecb_decrypt, 294 + }, 295 + }, { 296 
+ .cra_name = "__cbc-aes-" MODE, 297 + .cra_driver_name = "__driver-cbc-aes-" MODE, 298 + .cra_priority = 0, 299 + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 300 + .cra_blocksize = AES_BLOCK_SIZE, 301 + .cra_ctxsize = sizeof(struct crypto_aes_ctx), 302 + .cra_alignmask = 7, 303 + .cra_type = &crypto_blkcipher_type, 304 + .cra_module = THIS_MODULE, 305 + .cra_blkcipher = { 306 + .min_keysize = AES_MIN_KEY_SIZE, 307 + .max_keysize = AES_MAX_KEY_SIZE, 308 + .ivsize = AES_BLOCK_SIZE, 309 + .setkey = crypto_aes_set_key, 310 + .encrypt = cbc_encrypt, 311 + .decrypt = cbc_decrypt, 312 + }, 313 + }, { 314 + .cra_name = "__ctr-aes-" MODE, 315 + .cra_driver_name = "__driver-ctr-aes-" MODE, 316 + .cra_priority = 0, 317 + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 318 + .cra_blocksize = 1, 319 + .cra_ctxsize = sizeof(struct crypto_aes_ctx), 320 + .cra_alignmask = 7, 321 + .cra_type = &crypto_blkcipher_type, 322 + .cra_module = THIS_MODULE, 323 + .cra_blkcipher = { 324 + .min_keysize = AES_MIN_KEY_SIZE, 325 + .max_keysize = AES_MAX_KEY_SIZE, 326 + .ivsize = AES_BLOCK_SIZE, 327 + .setkey = crypto_aes_set_key, 328 + .encrypt = ctr_encrypt, 329 + .decrypt = ctr_encrypt, 330 + }, 331 + }, { 332 + .cra_name = "__xts-aes-" MODE, 333 + .cra_driver_name = "__driver-xts-aes-" MODE, 334 + .cra_priority = 0, 335 + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 336 + .cra_blocksize = AES_BLOCK_SIZE, 337 + .cra_ctxsize = sizeof(struct crypto_aes_xts_ctx), 338 + .cra_alignmask = 7, 339 + .cra_type = &crypto_blkcipher_type, 340 + .cra_module = THIS_MODULE, 341 + .cra_blkcipher = { 342 + .min_keysize = 2 * AES_MIN_KEY_SIZE, 343 + .max_keysize = 2 * AES_MAX_KEY_SIZE, 344 + .ivsize = AES_BLOCK_SIZE, 345 + .setkey = xts_set_key, 346 + .encrypt = xts_encrypt, 347 + .decrypt = xts_decrypt, 348 + }, 349 + }, { 350 + .cra_name = "ecb(aes)", 351 + .cra_driver_name = "ecb-aes-" MODE, 352 + .cra_priority = PRIO, 353 + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, 354 + .cra_blocksize = AES_BLOCK_SIZE, 355 + .cra_ctxsize = sizeof(struct async_helper_ctx), 356 + .cra_alignmask = 7, 357 + .cra_type = &crypto_ablkcipher_type, 358 + .cra_module = THIS_MODULE, 359 + .cra_init = ablk_init, 360 + .cra_exit = ablk_exit, 361 + .cra_ablkcipher = { 362 + .min_keysize = AES_MIN_KEY_SIZE, 363 + .max_keysize = AES_MAX_KEY_SIZE, 364 + .ivsize = AES_BLOCK_SIZE, 365 + .setkey = ablk_set_key, 366 + .encrypt = ablk_encrypt, 367 + .decrypt = ablk_decrypt, 368 + } 369 + }, { 370 + .cra_name = "cbc(aes)", 371 + .cra_driver_name = "cbc-aes-" MODE, 372 + .cra_priority = PRIO, 373 + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, 374 + .cra_blocksize = AES_BLOCK_SIZE, 375 + .cra_ctxsize = sizeof(struct async_helper_ctx), 376 + .cra_alignmask = 7, 377 + .cra_type = &crypto_ablkcipher_type, 378 + .cra_module = THIS_MODULE, 379 + .cra_init = ablk_init, 380 + .cra_exit = ablk_exit, 381 + .cra_ablkcipher = { 382 + .min_keysize = AES_MIN_KEY_SIZE, 383 + .max_keysize = AES_MAX_KEY_SIZE, 384 + .ivsize = AES_BLOCK_SIZE, 385 + .setkey = ablk_set_key, 386 + .encrypt = ablk_encrypt, 387 + .decrypt = ablk_decrypt, 388 + } 389 + }, { 390 + .cra_name = "ctr(aes)", 391 + .cra_driver_name = "ctr-aes-" MODE, 392 + .cra_priority = PRIO, 393 + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, 394 + .cra_blocksize = 1, 395 + .cra_ctxsize = sizeof(struct async_helper_ctx), 396 + .cra_alignmask = 7, 397 + .cra_type = &crypto_ablkcipher_type, 398 + .cra_module = THIS_MODULE, 399 + .cra_init = ablk_init, 400 + .cra_exit = ablk_exit, 401 + .cra_ablkcipher = { 
402 + .min_keysize = AES_MIN_KEY_SIZE, 403 + .max_keysize = AES_MAX_KEY_SIZE, 404 + .ivsize = AES_BLOCK_SIZE, 405 + .setkey = ablk_set_key, 406 + .encrypt = ablk_encrypt, 407 + .decrypt = ablk_decrypt, 408 + } 409 + }, { 410 + .cra_name = "xts(aes)", 411 + .cra_driver_name = "xts-aes-" MODE, 412 + .cra_priority = PRIO, 413 + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, 414 + .cra_blocksize = AES_BLOCK_SIZE, 415 + .cra_ctxsize = sizeof(struct async_helper_ctx), 416 + .cra_alignmask = 7, 417 + .cra_type = &crypto_ablkcipher_type, 418 + .cra_module = THIS_MODULE, 419 + .cra_init = ablk_init, 420 + .cra_exit = ablk_exit, 421 + .cra_ablkcipher = { 422 + .min_keysize = 2 * AES_MIN_KEY_SIZE, 423 + .max_keysize = 2 * AES_MAX_KEY_SIZE, 424 + .ivsize = AES_BLOCK_SIZE, 425 + .setkey = ablk_set_key, 426 + .encrypt = ablk_encrypt, 427 + .decrypt = ablk_decrypt, 428 + } 429 + } }; 430 + 431 + static int __init aes_init(void) 432 + { 433 + return crypto_register_algs(aes_algs, ARRAY_SIZE(aes_algs)); 434 + } 435 + 436 + static void __exit aes_exit(void) 437 + { 438 + crypto_unregister_algs(aes_algs, ARRAY_SIZE(aes_algs)); 439 + } 440 + 441 + #ifdef USE_V8_CRYPTO_EXTENSIONS 442 + module_cpu_feature_match(AES, aes_init); 443 + #else 444 + module_init(aes_init); 445 + #endif 446 + module_exit(aes_exit);
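
Note: every mode in aes-glue.c derives its round count as 6 + key_length / 4, the same formula as the num_rounds() helpers elsewhere in this series. A quick standalone check (illustration only, not kernel code):

#include <assert.h>

static int aes_rounds(unsigned int key_length)
{
        return 6 + key_length / 4;
}

int main(void)
{
        assert(aes_rounds(16) == 10);   /* AES-128 */
        assert(aes_rounds(24) == 12);   /* AES-192 */
        assert(aes_rounds(32) == 14);   /* AES-256 */
        return 0;
}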
+532
arch/arm64/crypto/aes-modes.S
··· 1 + /* 2 + * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES 3 + * 4 + * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org> 5 + * 6 + * This program is free software; you can redistribute it and/or modify 7 + * it under the terms of the GNU General Public License version 2 as 8 + * published by the Free Software Foundation. 9 + */ 10 + 11 + /* included by aes-ce.S and aes-neon.S */ 12 + 13 + .text 14 + .align 4 15 + 16 + /* 17 + * There are several ways to instantiate this code: 18 + * - no interleave, all inline 19 + * - 2-way interleave, 2x calls out of line (-DINTERLEAVE=2) 20 + * - 2-way interleave, all inline (-DINTERLEAVE=2 -DINTERLEAVE_INLINE) 21 + * - 4-way interleave, 4x calls out of line (-DINTERLEAVE=4) 22 + * - 4-way interleave, all inline (-DINTERLEAVE=4 -DINTERLEAVE_INLINE) 23 + * 24 + * Macros imported by this code: 25 + * - enc_prepare - setup NEON registers for encryption 26 + * - dec_prepare - setup NEON registers for decryption 27 + * - enc_switch_key - change to new key after having prepared for encryption 28 + * - encrypt_block - encrypt a single block 29 + * - decrypt block - decrypt a single block 30 + * - encrypt_block2x - encrypt 2 blocks in parallel (if INTERLEAVE == 2) 31 + * - decrypt_block2x - decrypt 2 blocks in parallel (if INTERLEAVE == 2) 32 + * - encrypt_block4x - encrypt 4 blocks in parallel (if INTERLEAVE == 4) 33 + * - decrypt_block4x - decrypt 4 blocks in parallel (if INTERLEAVE == 4) 34 + */ 35 + 36 + #if defined(INTERLEAVE) && !defined(INTERLEAVE_INLINE) 37 + #define FRAME_PUSH stp x29, x30, [sp,#-16]! ; mov x29, sp 38 + #define FRAME_POP ldp x29, x30, [sp],#16 39 + 40 + #if INTERLEAVE == 2 41 + 42 + aes_encrypt_block2x: 43 + encrypt_block2x v0, v1, w3, x2, x6, w7 44 + ret 45 + ENDPROC(aes_encrypt_block2x) 46 + 47 + aes_decrypt_block2x: 48 + decrypt_block2x v0, v1, w3, x2, x6, w7 49 + ret 50 + ENDPROC(aes_decrypt_block2x) 51 + 52 + #elif INTERLEAVE == 4 53 + 54 + aes_encrypt_block4x: 55 + encrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7 56 + ret 57 + ENDPROC(aes_encrypt_block4x) 58 + 59 + aes_decrypt_block4x: 60 + decrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7 61 + ret 62 + ENDPROC(aes_decrypt_block4x) 63 + 64 + #else 65 + #error INTERLEAVE should equal 2 or 4 66 + #endif 67 + 68 + .macro do_encrypt_block2x 69 + bl aes_encrypt_block2x 70 + .endm 71 + 72 + .macro do_decrypt_block2x 73 + bl aes_decrypt_block2x 74 + .endm 75 + 76 + .macro do_encrypt_block4x 77 + bl aes_encrypt_block4x 78 + .endm 79 + 80 + .macro do_decrypt_block4x 81 + bl aes_decrypt_block4x 82 + .endm 83 + 84 + #else 85 + #define FRAME_PUSH 86 + #define FRAME_POP 87 + 88 + .macro do_encrypt_block2x 89 + encrypt_block2x v0, v1, w3, x2, x6, w7 90 + .endm 91 + 92 + .macro do_decrypt_block2x 93 + decrypt_block2x v0, v1, w3, x2, x6, w7 94 + .endm 95 + 96 + .macro do_encrypt_block4x 97 + encrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7 98 + .endm 99 + 100 + .macro do_decrypt_block4x 101 + decrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7 102 + .endm 103 + 104 + #endif 105 + 106 + /* 107 + * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, 108 + * int blocks, int first) 109 + * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, 110 + * int blocks, int first) 111 + */ 112 + 113 + AES_ENTRY(aes_ecb_encrypt) 114 + FRAME_PUSH 115 + cbz w5, .LecbencloopNx 116 + 117 + enc_prepare w3, x2, x5 118 + 119 + .LecbencloopNx: 120 + #if INTERLEAVE >= 2 121 + subs w4, w4, #INTERLEAVE 122 + bmi .Lecbenc1x 123 + #if INTERLEAVE == 2 124 + ld1 
{v0.16b-v1.16b}, [x1], #32 /* get 2 pt blocks */ 125 + do_encrypt_block2x 126 + st1 {v0.16b-v1.16b}, [x0], #32 127 + #else 128 + ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */ 129 + do_encrypt_block4x 130 + st1 {v0.16b-v3.16b}, [x0], #64 131 + #endif 132 + b .LecbencloopNx 133 + .Lecbenc1x: 134 + adds w4, w4, #INTERLEAVE 135 + beq .Lecbencout 136 + #endif 137 + .Lecbencloop: 138 + ld1 {v0.16b}, [x1], #16 /* get next pt block */ 139 + encrypt_block v0, w3, x2, x5, w6 140 + st1 {v0.16b}, [x0], #16 141 + subs w4, w4, #1 142 + bne .Lecbencloop 143 + .Lecbencout: 144 + FRAME_POP 145 + ret 146 + AES_ENDPROC(aes_ecb_encrypt) 147 + 148 + 149 + AES_ENTRY(aes_ecb_decrypt) 150 + FRAME_PUSH 151 + cbz w5, .LecbdecloopNx 152 + 153 + dec_prepare w3, x2, x5 154 + 155 + .LecbdecloopNx: 156 + #if INTERLEAVE >= 2 157 + subs w4, w4, #INTERLEAVE 158 + bmi .Lecbdec1x 159 + #if INTERLEAVE == 2 160 + ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 ct blocks */ 161 + do_decrypt_block2x 162 + st1 {v0.16b-v1.16b}, [x0], #32 163 + #else 164 + ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */ 165 + do_decrypt_block4x 166 + st1 {v0.16b-v3.16b}, [x0], #64 167 + #endif 168 + b .LecbdecloopNx 169 + .Lecbdec1x: 170 + adds w4, w4, #INTERLEAVE 171 + beq .Lecbdecout 172 + #endif 173 + .Lecbdecloop: 174 + ld1 {v0.16b}, [x1], #16 /* get next ct block */ 175 + decrypt_block v0, w3, x2, x5, w6 176 + st1 {v0.16b}, [x0], #16 177 + subs w4, w4, #1 178 + bne .Lecbdecloop 179 + .Lecbdecout: 180 + FRAME_POP 181 + ret 182 + AES_ENDPROC(aes_ecb_decrypt) 183 + 184 + 185 + /* 186 + * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, 187 + * int blocks, u8 iv[], int first) 188 + * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, 189 + * int blocks, u8 iv[], int first) 190 + */ 191 + 192 + AES_ENTRY(aes_cbc_encrypt) 193 + cbz w6, .Lcbcencloop 194 + 195 + ld1 {v0.16b}, [x5] /* get iv */ 196 + enc_prepare w3, x2, x5 197 + 198 + .Lcbcencloop: 199 + ld1 {v1.16b}, [x1], #16 /* get next pt block */ 200 + eor v0.16b, v0.16b, v1.16b /* ..and xor with iv */ 201 + encrypt_block v0, w3, x2, x5, w6 202 + st1 {v0.16b}, [x0], #16 203 + subs w4, w4, #1 204 + bne .Lcbcencloop 205 + ret 206 + AES_ENDPROC(aes_cbc_encrypt) 207 + 208 + 209 + AES_ENTRY(aes_cbc_decrypt) 210 + FRAME_PUSH 211 + cbz w6, .LcbcdecloopNx 212 + 213 + ld1 {v7.16b}, [x5] /* get iv */ 214 + dec_prepare w3, x2, x5 215 + 216 + .LcbcdecloopNx: 217 + #if INTERLEAVE >= 2 218 + subs w4, w4, #INTERLEAVE 219 + bmi .Lcbcdec1x 220 + #if INTERLEAVE == 2 221 + ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 ct blocks */ 222 + mov v2.16b, v0.16b 223 + mov v3.16b, v1.16b 224 + do_decrypt_block2x 225 + eor v0.16b, v0.16b, v7.16b 226 + eor v1.16b, v1.16b, v2.16b 227 + mov v7.16b, v3.16b 228 + st1 {v0.16b-v1.16b}, [x0], #32 229 + #else 230 + ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */ 231 + mov v4.16b, v0.16b 232 + mov v5.16b, v1.16b 233 + mov v6.16b, v2.16b 234 + do_decrypt_block4x 235 + sub x1, x1, #16 236 + eor v0.16b, v0.16b, v7.16b 237 + eor v1.16b, v1.16b, v4.16b 238 + ld1 {v7.16b}, [x1], #16 /* reload 1 ct block */ 239 + eor v2.16b, v2.16b, v5.16b 240 + eor v3.16b, v3.16b, v6.16b 241 + st1 {v0.16b-v3.16b}, [x0], #64 242 + #endif 243 + b .LcbcdecloopNx 244 + .Lcbcdec1x: 245 + adds w4, w4, #INTERLEAVE 246 + beq .Lcbcdecout 247 + #endif 248 + .Lcbcdecloop: 249 + ld1 {v1.16b}, [x1], #16 /* get next ct block */ 250 + mov v0.16b, v1.16b /* ...and copy to v0 */ 251 + decrypt_block v0, w3, x2, x5, w6 252 + eor v0.16b, v0.16b, v7.16b /* xor with iv => pt */ 253 + mov 
v7.16b, v1.16b /* ct is next iv */ 254 + st1 {v0.16b}, [x0], #16 255 + subs w4, w4, #1 256 + bne .Lcbcdecloop 257 + .Lcbcdecout: 258 + FRAME_POP 259 + ret 260 + AES_ENDPROC(aes_cbc_decrypt) 261 + 262 + 263 + /* 264 + * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, 265 + * int blocks, u8 ctr[], int first) 266 + */ 267 + 268 + AES_ENTRY(aes_ctr_encrypt) 269 + FRAME_PUSH 270 + cbnz w6, .Lctrfirst /* 1st time around? */ 271 + umov x5, v4.d[1] /* keep swabbed ctr in reg */ 272 + rev x5, x5 273 + #if INTERLEAVE >= 2 274 + cmn w5, w4 /* 32 bit overflow? */ 275 + bcs .Lctrinc 276 + add x5, x5, #1 /* increment BE ctr */ 277 + b .LctrincNx 278 + #else 279 + b .Lctrinc 280 + #endif 281 + .Lctrfirst: 282 + enc_prepare w3, x2, x6 283 + ld1 {v4.16b}, [x5] 284 + umov x5, v4.d[1] /* keep swabbed ctr in reg */ 285 + rev x5, x5 286 + #if INTERLEAVE >= 2 287 + cmn w5, w4 /* 32 bit overflow? */ 288 + bcs .Lctrloop 289 + .LctrloopNx: 290 + subs w4, w4, #INTERLEAVE 291 + bmi .Lctr1x 292 + #if INTERLEAVE == 2 293 + mov v0.8b, v4.8b 294 + mov v1.8b, v4.8b 295 + rev x7, x5 296 + add x5, x5, #1 297 + ins v0.d[1], x7 298 + rev x7, x5 299 + add x5, x5, #1 300 + ins v1.d[1], x7 301 + ld1 {v2.16b-v3.16b}, [x1], #32 /* get 2 input blocks */ 302 + do_encrypt_block2x 303 + eor v0.16b, v0.16b, v2.16b 304 + eor v1.16b, v1.16b, v3.16b 305 + st1 {v0.16b-v1.16b}, [x0], #32 306 + #else 307 + ldr q8, =0x30000000200000001 /* addends 1,2,3[,0] */ 308 + dup v7.4s, w5 309 + mov v0.16b, v4.16b 310 + add v7.4s, v7.4s, v8.4s 311 + mov v1.16b, v4.16b 312 + rev32 v8.16b, v7.16b 313 + mov v2.16b, v4.16b 314 + mov v3.16b, v4.16b 315 + mov v1.s[3], v8.s[0] 316 + mov v2.s[3], v8.s[1] 317 + mov v3.s[3], v8.s[2] 318 + ld1 {v5.16b-v7.16b}, [x1], #48 /* get 3 input blocks */ 319 + do_encrypt_block4x 320 + eor v0.16b, v5.16b, v0.16b 321 + ld1 {v5.16b}, [x1], #16 /* get 1 input block */ 322 + eor v1.16b, v6.16b, v1.16b 323 + eor v2.16b, v7.16b, v2.16b 324 + eor v3.16b, v5.16b, v3.16b 325 + st1 {v0.16b-v3.16b}, [x0], #64 326 + add x5, x5, #INTERLEAVE 327 + #endif 328 + cbz w4, .LctroutNx 329 + .LctrincNx: 330 + rev x7, x5 331 + ins v4.d[1], x7 332 + b .LctrloopNx 333 + .LctroutNx: 334 + sub x5, x5, #1 335 + rev x7, x5 336 + ins v4.d[1], x7 337 + b .Lctrout 338 + .Lctr1x: 339 + adds w4, w4, #INTERLEAVE 340 + beq .Lctrout 341 + #endif 342 + .Lctrloop: 343 + mov v0.16b, v4.16b 344 + encrypt_block v0, w3, x2, x6, w7 345 + subs w4, w4, #1 346 + bmi .Lctrhalfblock /* blocks < 0 means 1/2 block */ 347 + ld1 {v3.16b}, [x1], #16 348 + eor v3.16b, v0.16b, v3.16b 349 + st1 {v3.16b}, [x0], #16 350 + beq .Lctrout 351 + .Lctrinc: 352 + adds x5, x5, #1 /* increment BE ctr */ 353 + rev x7, x5 354 + ins v4.d[1], x7 355 + bcc .Lctrloop /* no overflow? */ 356 + umov x7, v4.d[0] /* load upper word of ctr */ 357 + rev x7, x7 /* ... 
to handle the carry */ 358 + add x7, x7, #1 359 + rev x7, x7 360 + ins v4.d[0], x7 361 + b .Lctrloop 362 + .Lctrhalfblock: 363 + ld1 {v3.8b}, [x1] 364 + eor v3.8b, v0.8b, v3.8b 365 + st1 {v3.8b}, [x0] 366 + .Lctrout: 367 + FRAME_POP 368 + ret 369 + AES_ENDPROC(aes_ctr_encrypt) 370 + .ltorg 371 + 372 + 373 + /* 374 + * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds, 375 + * int blocks, u8 const rk2[], u8 iv[], int first) 376 + * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds, 377 + * int blocks, u8 const rk2[], u8 iv[], int first) 378 + */ 379 + 380 + .macro next_tweak, out, in, const, tmp 381 + sshr \tmp\().2d, \in\().2d, #63 382 + and \tmp\().16b, \tmp\().16b, \const\().16b 383 + add \out\().2d, \in\().2d, \in\().2d 384 + ext \tmp\().16b, \tmp\().16b, \tmp\().16b, #8 385 + eor \out\().16b, \out\().16b, \tmp\().16b 386 + .endm 387 + 388 + .Lxts_mul_x: 389 + .word 1, 0, 0x87, 0 390 + 391 + AES_ENTRY(aes_xts_encrypt) 392 + FRAME_PUSH 393 + cbz w7, .LxtsencloopNx 394 + 395 + ld1 {v4.16b}, [x6] 396 + enc_prepare w3, x5, x6 397 + encrypt_block v4, w3, x5, x6, w7 /* first tweak */ 398 + enc_switch_key w3, x2, x6 399 + ldr q7, .Lxts_mul_x 400 + b .LxtsencNx 401 + 402 + .LxtsencloopNx: 403 + ldr q7, .Lxts_mul_x 404 + next_tweak v4, v4, v7, v8 405 + .LxtsencNx: 406 + #if INTERLEAVE >= 2 407 + subs w4, w4, #INTERLEAVE 408 + bmi .Lxtsenc1x 409 + #if INTERLEAVE == 2 410 + ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 pt blocks */ 411 + next_tweak v5, v4, v7, v8 412 + eor v0.16b, v0.16b, v4.16b 413 + eor v1.16b, v1.16b, v5.16b 414 + do_encrypt_block2x 415 + eor v0.16b, v0.16b, v4.16b 416 + eor v1.16b, v1.16b, v5.16b 417 + st1 {v0.16b-v1.16b}, [x0], #32 418 + cbz w4, .LxtsencoutNx 419 + next_tweak v4, v5, v7, v8 420 + b .LxtsencNx 421 + .LxtsencoutNx: 422 + mov v4.16b, v5.16b 423 + b .Lxtsencout 424 + #else 425 + ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */ 426 + next_tweak v5, v4, v7, v8 427 + eor v0.16b, v0.16b, v4.16b 428 + next_tweak v6, v5, v7, v8 429 + eor v1.16b, v1.16b, v5.16b 430 + eor v2.16b, v2.16b, v6.16b 431 + next_tweak v7, v6, v7, v8 432 + eor v3.16b, v3.16b, v7.16b 433 + do_encrypt_block4x 434 + eor v3.16b, v3.16b, v7.16b 435 + eor v0.16b, v0.16b, v4.16b 436 + eor v1.16b, v1.16b, v5.16b 437 + eor v2.16b, v2.16b, v6.16b 438 + st1 {v0.16b-v3.16b}, [x0], #64 439 + mov v4.16b, v7.16b 440 + cbz w4, .Lxtsencout 441 + b .LxtsencloopNx 442 + #endif 443 + .Lxtsenc1x: 444 + adds w4, w4, #INTERLEAVE 445 + beq .Lxtsencout 446 + #endif 447 + .Lxtsencloop: 448 + ld1 {v1.16b}, [x1], #16 449 + eor v0.16b, v1.16b, v4.16b 450 + encrypt_block v0, w3, x2, x6, w7 451 + eor v0.16b, v0.16b, v4.16b 452 + st1 {v0.16b}, [x0], #16 453 + subs w4, w4, #1 454 + beq .Lxtsencout 455 + next_tweak v4, v4, v7, v8 456 + b .Lxtsencloop 457 + .Lxtsencout: 458 + FRAME_POP 459 + ret 460 + AES_ENDPROC(aes_xts_encrypt) 461 + 462 + 463 + AES_ENTRY(aes_xts_decrypt) 464 + FRAME_PUSH 465 + cbz w7, .LxtsdecloopNx 466 + 467 + ld1 {v4.16b}, [x6] 468 + enc_prepare w3, x5, x6 469 + encrypt_block v4, w3, x5, x6, w7 /* first tweak */ 470 + dec_prepare w3, x2, x6 471 + ldr q7, .Lxts_mul_x 472 + b .LxtsdecNx 473 + 474 + .LxtsdecloopNx: 475 + ldr q7, .Lxts_mul_x 476 + next_tweak v4, v4, v7, v8 477 + .LxtsdecNx: 478 + #if INTERLEAVE >= 2 479 + subs w4, w4, #INTERLEAVE 480 + bmi .Lxtsdec1x 481 + #if INTERLEAVE == 2 482 + ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 ct blocks */ 483 + next_tweak v5, v4, v7, v8 484 + eor v0.16b, v0.16b, v4.16b 485 + eor v1.16b, v1.16b, v5.16b 486 + do_decrypt_block2x 487 + 
eor v0.16b, v0.16b, v4.16b 488 + eor v1.16b, v1.16b, v5.16b 489 + st1 {v0.16b-v1.16b}, [x0], #32 490 + cbz w4, .LxtsdecoutNx 491 + next_tweak v4, v5, v7, v8 492 + b .LxtsdecNx 493 + .LxtsdecoutNx: 494 + mov v4.16b, v5.16b 495 + b .Lxtsdecout 496 + #else 497 + ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */ 498 + next_tweak v5, v4, v7, v8 499 + eor v0.16b, v0.16b, v4.16b 500 + next_tweak v6, v5, v7, v8 501 + eor v1.16b, v1.16b, v5.16b 502 + eor v2.16b, v2.16b, v6.16b 503 + next_tweak v7, v6, v7, v8 504 + eor v3.16b, v3.16b, v7.16b 505 + do_decrypt_block4x 506 + eor v3.16b, v3.16b, v7.16b 507 + eor v0.16b, v0.16b, v4.16b 508 + eor v1.16b, v1.16b, v5.16b 509 + eor v2.16b, v2.16b, v6.16b 510 + st1 {v0.16b-v3.16b}, [x0], #64 511 + mov v4.16b, v7.16b 512 + cbz w4, .Lxtsdecout 513 + b .LxtsdecloopNx 514 + #endif 515 + .Lxtsdec1x: 516 + adds w4, w4, #INTERLEAVE 517 + beq .Lxtsdecout 518 + #endif 519 + .Lxtsdecloop: 520 + ld1 {v1.16b}, [x1], #16 521 + eor v0.16b, v1.16b, v4.16b 522 + decrypt_block v0, w3, x2, x6, w7 523 + eor v0.16b, v0.16b, v4.16b 524 + st1 {v0.16b}, [x0], #16 525 + subs w4, w4, #1 526 + beq .Lxtsdecout 527 + next_tweak v4, v4, v7, v8 528 + b .Lxtsdecloop 529 + .Lxtsdecout: 530 + FRAME_POP 531 + ret 532 + AES_ENDPROC(aes_xts_decrypt)
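The next_tweak macro above advances the XTS tweak by multiplying it by x in GF(2^128), with the .Lxts_mul_x constant {1, 0x87} supplying the carry terms. A standalone C model of that single step (illustrative sketch only, not part of the patch; the tweak is treated as a 128-bit little-endian value split into two 64-bit words):

#include <stdint.h>

/* multiply the XTS tweak by 'x' modulo x^128 + x^7 + x^2 + x + 1 */
static void xts_next_tweak(uint64_t t[2])       /* t[0] = low 64 bits, t[1] = high */
{
        uint64_t carry = t[1] >> 63;            /* bit shifted out at the top */

        t[1] = (t[1] << 1) | (t[0] >> 63);      /* cross-word carry, like the ext/eor */
        t[0] = (t[0] << 1) ^ (carry ? 0x87 : 0);/* reduction term from .Lxts_mul_x */
}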
+382
arch/arm64/crypto/aes-neon.S
··· 1 + /* 2 + * linux/arch/arm64/crypto/aes-neon.S - AES cipher for ARMv8 NEON 3 + * 4 + * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org> 5 + * 6 + * This program is free software; you can redistribute it and/or modify 7 + * it under the terms of the GNU General Public License version 2 as 8 + * published by the Free Software Foundation. 9 + */ 10 + 11 + #include <linux/linkage.h> 12 + 13 + #define AES_ENTRY(func) ENTRY(neon_ ## func) 14 + #define AES_ENDPROC(func) ENDPROC(neon_ ## func) 15 + 16 + /* multiply by polynomial 'x' in GF(2^8) */ 17 + .macro mul_by_x, out, in, temp, const 18 + sshr \temp, \in, #7 19 + add \out, \in, \in 20 + and \temp, \temp, \const 21 + eor \out, \out, \temp 22 + .endm 23 + 24 + /* preload the entire Sbox */ 25 + .macro prepare, sbox, shiftrows, temp 26 + adr \temp, \sbox 27 + movi v12.16b, #0x40 28 + ldr q13, \shiftrows 29 + movi v14.16b, #0x1b 30 + ld1 {v16.16b-v19.16b}, [\temp], #64 31 + ld1 {v20.16b-v23.16b}, [\temp], #64 32 + ld1 {v24.16b-v27.16b}, [\temp], #64 33 + ld1 {v28.16b-v31.16b}, [\temp] 34 + .endm 35 + 36 + /* do preload for encryption */ 37 + .macro enc_prepare, ignore0, ignore1, temp 38 + prepare .LForward_Sbox, .LForward_ShiftRows, \temp 39 + .endm 40 + 41 + .macro enc_switch_key, ignore0, ignore1, temp 42 + /* do nothing */ 43 + .endm 44 + 45 + /* do preload for decryption */ 46 + .macro dec_prepare, ignore0, ignore1, temp 47 + prepare .LReverse_Sbox, .LReverse_ShiftRows, \temp 48 + .endm 49 + 50 + /* apply SubBytes transformation using the the preloaded Sbox */ 51 + .macro sub_bytes, in 52 + sub v9.16b, \in\().16b, v12.16b 53 + tbl \in\().16b, {v16.16b-v19.16b}, \in\().16b 54 + sub v10.16b, v9.16b, v12.16b 55 + tbx \in\().16b, {v20.16b-v23.16b}, v9.16b 56 + sub v11.16b, v10.16b, v12.16b 57 + tbx \in\().16b, {v24.16b-v27.16b}, v10.16b 58 + tbx \in\().16b, {v28.16b-v31.16b}, v11.16b 59 + .endm 60 + 61 + /* apply MixColumns transformation */ 62 + .macro mix_columns, in 63 + mul_by_x v10.16b, \in\().16b, v9.16b, v14.16b 64 + rev32 v8.8h, \in\().8h 65 + eor \in\().16b, v10.16b, \in\().16b 66 + shl v9.4s, v8.4s, #24 67 + shl v11.4s, \in\().4s, #24 68 + sri v9.4s, v8.4s, #8 69 + sri v11.4s, \in\().4s, #8 70 + eor v9.16b, v9.16b, v8.16b 71 + eor v10.16b, v10.16b, v9.16b 72 + eor \in\().16b, v10.16b, v11.16b 73 + .endm 74 + 75 + /* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */ 76 + .macro inv_mix_columns, in 77 + mul_by_x v11.16b, \in\().16b, v10.16b, v14.16b 78 + mul_by_x v11.16b, v11.16b, v10.16b, v14.16b 79 + eor \in\().16b, \in\().16b, v11.16b 80 + rev32 v11.8h, v11.8h 81 + eor \in\().16b, \in\().16b, v11.16b 82 + mix_columns \in 83 + .endm 84 + 85 + .macro do_block, enc, in, rounds, rk, rkp, i 86 + ld1 {v15.16b}, [\rk] 87 + add \rkp, \rk, #16 88 + mov \i, \rounds 89 + 1111: eor \in\().16b, \in\().16b, v15.16b /* ^round key */ 90 + tbl \in\().16b, {\in\().16b}, v13.16b /* ShiftRows */ 91 + sub_bytes \in 92 + ld1 {v15.16b}, [\rkp], #16 93 + subs \i, \i, #1 94 + beq 2222f 95 + .if \enc == 1 96 + mix_columns \in 97 + .else 98 + inv_mix_columns \in 99 + .endif 100 + b 1111b 101 + 2222: eor \in\().16b, \in\().16b, v15.16b /* ^round key */ 102 + .endm 103 + 104 + .macro encrypt_block, in, rounds, rk, rkp, i 105 + do_block 1, \in, \rounds, \rk, \rkp, \i 106 + .endm 107 + 108 + .macro decrypt_block, in, rounds, rk, rkp, i 109 + do_block 0, \in, \rounds, \rk, \rkp, \i 110 + .endm 111 + 112 + /* 113 + * Interleaved versions: functionally equivalent to the 114 + * ones above, but applied to 2 or 4 AES states in parallel. 
115 + */ 116 + 117 + .macro sub_bytes_2x, in0, in1 118 + sub v8.16b, \in0\().16b, v12.16b 119 + sub v9.16b, \in1\().16b, v12.16b 120 + tbl \in0\().16b, {v16.16b-v19.16b}, \in0\().16b 121 + tbl \in1\().16b, {v16.16b-v19.16b}, \in1\().16b 122 + sub v10.16b, v8.16b, v12.16b 123 + sub v11.16b, v9.16b, v12.16b 124 + tbx \in0\().16b, {v20.16b-v23.16b}, v8.16b 125 + tbx \in1\().16b, {v20.16b-v23.16b}, v9.16b 126 + sub v8.16b, v10.16b, v12.16b 127 + sub v9.16b, v11.16b, v12.16b 128 + tbx \in0\().16b, {v24.16b-v27.16b}, v10.16b 129 + tbx \in1\().16b, {v24.16b-v27.16b}, v11.16b 130 + tbx \in0\().16b, {v28.16b-v31.16b}, v8.16b 131 + tbx \in1\().16b, {v28.16b-v31.16b}, v9.16b 132 + .endm 133 + 134 + .macro sub_bytes_4x, in0, in1, in2, in3 135 + sub v8.16b, \in0\().16b, v12.16b 136 + tbl \in0\().16b, {v16.16b-v19.16b}, \in0\().16b 137 + sub v9.16b, \in1\().16b, v12.16b 138 + tbl \in1\().16b, {v16.16b-v19.16b}, \in1\().16b 139 + sub v10.16b, \in2\().16b, v12.16b 140 + tbl \in2\().16b, {v16.16b-v19.16b}, \in2\().16b 141 + sub v11.16b, \in3\().16b, v12.16b 142 + tbl \in3\().16b, {v16.16b-v19.16b}, \in3\().16b 143 + tbx \in0\().16b, {v20.16b-v23.16b}, v8.16b 144 + tbx \in1\().16b, {v20.16b-v23.16b}, v9.16b 145 + sub v8.16b, v8.16b, v12.16b 146 + tbx \in2\().16b, {v20.16b-v23.16b}, v10.16b 147 + sub v9.16b, v9.16b, v12.16b 148 + tbx \in3\().16b, {v20.16b-v23.16b}, v11.16b 149 + sub v10.16b, v10.16b, v12.16b 150 + tbx \in0\().16b, {v24.16b-v27.16b}, v8.16b 151 + sub v11.16b, v11.16b, v12.16b 152 + tbx \in1\().16b, {v24.16b-v27.16b}, v9.16b 153 + sub v8.16b, v8.16b, v12.16b 154 + tbx \in2\().16b, {v24.16b-v27.16b}, v10.16b 155 + sub v9.16b, v9.16b, v12.16b 156 + tbx \in3\().16b, {v24.16b-v27.16b}, v11.16b 157 + sub v10.16b, v10.16b, v12.16b 158 + tbx \in0\().16b, {v28.16b-v31.16b}, v8.16b 159 + sub v11.16b, v11.16b, v12.16b 160 + tbx \in1\().16b, {v28.16b-v31.16b}, v9.16b 161 + tbx \in2\().16b, {v28.16b-v31.16b}, v10.16b 162 + tbx \in3\().16b, {v28.16b-v31.16b}, v11.16b 163 + .endm 164 + 165 + .macro mul_by_x_2x, out0, out1, in0, in1, tmp0, tmp1, const 166 + sshr \tmp0\().16b, \in0\().16b, #7 167 + add \out0\().16b, \in0\().16b, \in0\().16b 168 + sshr \tmp1\().16b, \in1\().16b, #7 169 + and \tmp0\().16b, \tmp0\().16b, \const\().16b 170 + add \out1\().16b, \in1\().16b, \in1\().16b 171 + and \tmp1\().16b, \tmp1\().16b, \const\().16b 172 + eor \out0\().16b, \out0\().16b, \tmp0\().16b 173 + eor \out1\().16b, \out1\().16b, \tmp1\().16b 174 + .endm 175 + 176 + .macro mix_columns_2x, in0, in1 177 + mul_by_x_2x v8, v9, \in0, \in1, v10, v11, v14 178 + rev32 v10.8h, \in0\().8h 179 + rev32 v11.8h, \in1\().8h 180 + eor \in0\().16b, v8.16b, \in0\().16b 181 + eor \in1\().16b, v9.16b, \in1\().16b 182 + shl v12.4s, v10.4s, #24 183 + shl v13.4s, v11.4s, #24 184 + eor v8.16b, v8.16b, v10.16b 185 + sri v12.4s, v10.4s, #8 186 + shl v10.4s, \in0\().4s, #24 187 + eor v9.16b, v9.16b, v11.16b 188 + sri v13.4s, v11.4s, #8 189 + shl v11.4s, \in1\().4s, #24 190 + sri v10.4s, \in0\().4s, #8 191 + eor \in0\().16b, v8.16b, v12.16b 192 + sri v11.4s, \in1\().4s, #8 193 + eor \in1\().16b, v9.16b, v13.16b 194 + eor \in0\().16b, v10.16b, \in0\().16b 195 + eor \in1\().16b, v11.16b, \in1\().16b 196 + .endm 197 + 198 + .macro inv_mix_cols_2x, in0, in1 199 + mul_by_x_2x v8, v9, \in0, \in1, v10, v11, v14 200 + mul_by_x_2x v8, v9, v8, v9, v10, v11, v14 201 + eor \in0\().16b, \in0\().16b, v8.16b 202 + eor \in1\().16b, \in1\().16b, v9.16b 203 + rev32 v8.8h, v8.8h 204 + rev32 v9.8h, v9.8h 205 + eor \in0\().16b, \in0\().16b, v8.16b 206 + eor 
\in1\().16b, \in1\().16b, v9.16b 207 + mix_columns_2x \in0, \in1 208 + .endm 209 + 210 + .macro inv_mix_cols_4x, in0, in1, in2, in3 211 + mul_by_x_2x v8, v9, \in0, \in1, v10, v11, v14 212 + mul_by_x_2x v10, v11, \in2, \in3, v12, v13, v14 213 + mul_by_x_2x v8, v9, v8, v9, v12, v13, v14 214 + mul_by_x_2x v10, v11, v10, v11, v12, v13, v14 215 + eor \in0\().16b, \in0\().16b, v8.16b 216 + eor \in1\().16b, \in1\().16b, v9.16b 217 + eor \in2\().16b, \in2\().16b, v10.16b 218 + eor \in3\().16b, \in3\().16b, v11.16b 219 + rev32 v8.8h, v8.8h 220 + rev32 v9.8h, v9.8h 221 + rev32 v10.8h, v10.8h 222 + rev32 v11.8h, v11.8h 223 + eor \in0\().16b, \in0\().16b, v8.16b 224 + eor \in1\().16b, \in1\().16b, v9.16b 225 + eor \in2\().16b, \in2\().16b, v10.16b 226 + eor \in3\().16b, \in3\().16b, v11.16b 227 + mix_columns_2x \in0, \in1 228 + mix_columns_2x \in2, \in3 229 + .endm 230 + 231 + .macro do_block_2x, enc, in0, in1 rounds, rk, rkp, i 232 + ld1 {v15.16b}, [\rk] 233 + add \rkp, \rk, #16 234 + mov \i, \rounds 235 + 1111: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */ 236 + eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */ 237 + sub_bytes_2x \in0, \in1 238 + tbl \in0\().16b, {\in0\().16b}, v13.16b /* ShiftRows */ 239 + tbl \in1\().16b, {\in1\().16b}, v13.16b /* ShiftRows */ 240 + ld1 {v15.16b}, [\rkp], #16 241 + subs \i, \i, #1 242 + beq 2222f 243 + .if \enc == 1 244 + mix_columns_2x \in0, \in1 245 + ldr q13, .LForward_ShiftRows 246 + .else 247 + inv_mix_cols_2x \in0, \in1 248 + ldr q13, .LReverse_ShiftRows 249 + .endif 250 + movi v12.16b, #0x40 251 + b 1111b 252 + 2222: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */ 253 + eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */ 254 + .endm 255 + 256 + .macro do_block_4x, enc, in0, in1, in2, in3, rounds, rk, rkp, i 257 + ld1 {v15.16b}, [\rk] 258 + add \rkp, \rk, #16 259 + mov \i, \rounds 260 + 1111: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */ 261 + eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */ 262 + eor \in2\().16b, \in2\().16b, v15.16b /* ^round key */ 263 + eor \in3\().16b, \in3\().16b, v15.16b /* ^round key */ 264 + sub_bytes_4x \in0, \in1, \in2, \in3 265 + tbl \in0\().16b, {\in0\().16b}, v13.16b /* ShiftRows */ 266 + tbl \in1\().16b, {\in1\().16b}, v13.16b /* ShiftRows */ 267 + tbl \in2\().16b, {\in2\().16b}, v13.16b /* ShiftRows */ 268 + tbl \in3\().16b, {\in3\().16b}, v13.16b /* ShiftRows */ 269 + ld1 {v15.16b}, [\rkp], #16 270 + subs \i, \i, #1 271 + beq 2222f 272 + .if \enc == 1 273 + mix_columns_2x \in0, \in1 274 + mix_columns_2x \in2, \in3 275 + ldr q13, .LForward_ShiftRows 276 + .else 277 + inv_mix_cols_4x \in0, \in1, \in2, \in3 278 + ldr q13, .LReverse_ShiftRows 279 + .endif 280 + movi v12.16b, #0x40 281 + b 1111b 282 + 2222: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */ 283 + eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */ 284 + eor \in2\().16b, \in2\().16b, v15.16b /* ^round key */ 285 + eor \in3\().16b, \in3\().16b, v15.16b /* ^round key */ 286 + .endm 287 + 288 + .macro encrypt_block2x, in0, in1, rounds, rk, rkp, i 289 + do_block_2x 1, \in0, \in1, \rounds, \rk, \rkp, \i 290 + .endm 291 + 292 + .macro decrypt_block2x, in0, in1, rounds, rk, rkp, i 293 + do_block_2x 0, \in0, \in1, \rounds, \rk, \rkp, \i 294 + .endm 295 + 296 + .macro encrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i 297 + do_block_4x 1, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i 298 + .endm 299 + 300 + .macro decrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i 301 + do_block_4x 0, \in0, \in1, \in2, \in3, \rounds, 
\rk, \rkp, \i 302 + .endm 303 + 304 + #include "aes-modes.S" 305 + 306 + .text 307 + .align 4 308 + .LForward_ShiftRows: 309 + .byte 0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3 310 + .byte 0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb 311 + 312 + .LReverse_ShiftRows: 313 + .byte 0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb 314 + .byte 0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3 315 + 316 + .LForward_Sbox: 317 + .byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5 318 + .byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76 319 + .byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0 320 + .byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0 321 + .byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc 322 + .byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15 323 + .byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a 324 + .byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75 325 + .byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0 326 + .byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84 327 + .byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b 328 + .byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf 329 + .byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85 330 + .byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8 331 + .byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5 332 + .byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2 333 + .byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17 334 + .byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73 335 + .byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88 336 + .byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb 337 + .byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c 338 + .byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79 339 + .byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9 340 + .byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08 341 + .byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6 342 + .byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a 343 + .byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e 344 + .byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e 345 + .byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94 346 + .byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf 347 + .byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68 348 + .byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 349 + 350 + .LReverse_Sbox: 351 + .byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38 352 + .byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb 353 + .byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87 354 + .byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb 355 + .byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d 356 + .byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e 357 + .byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2 358 + .byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25 359 + .byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16 360 + .byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92 361 + .byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda 362 + .byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84 363 + .byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a 364 + .byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06 365 + .byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02 366 + .byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b 367 + .byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea 368 + .byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73 369 + .byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85 370 + .byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e 371 + .byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89 372 + .byte 
0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b 373 + .byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20 374 + .byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4 375 + .byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31 376 + .byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f 377 + .byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d 378 + .byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef 379 + .byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0 380 + .byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61 381 + .byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26 382 + .byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
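The mul_by_x macro near the top of this file is the byte-wise GF(2^8) doubling ("xtime") that MixColumns is built from; the sshr/add/and/eor sequence applies the 0x1b reduction to all sixteen lanes at once. A one-byte C model for reference (sketch, not part of the patch):

#include <stdint.h>

/* multiply one byte by 'x' modulo the AES polynomial x^8 + x^4 + x^3 + x + 1 */
static uint8_t mul_by_x(uint8_t b)
{
        uint8_t mask = (b & 0x80) ? 0xff : 0x00;        /* what sshr #7 produces per lane */

        return (uint8_t)((b << 1) ^ (mask & 0x1b));     /* the add/and/eor of the macro */
}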
+95
arch/arm64/crypto/ghash-ce-core.S
··· 1 + /* 2 + * Accelerated GHASH implementation with ARMv8 PMULL instructions. 3 + * 4 + * Copyright (C) 2014 Linaro Ltd. <ard.biesheuvel@linaro.org> 5 + * 6 + * Based on arch/x86/crypto/ghash-pmullni-intel_asm.S 7 + * 8 + * Copyright (c) 2009 Intel Corp. 9 + * Author: Huang Ying <ying.huang@intel.com> 10 + * Vinodh Gopal 11 + * Erdinc Ozturk 12 + * Deniz Karakoyunlu 13 + * 14 + * This program is free software; you can redistribute it and/or modify it 15 + * under the terms of the GNU General Public License version 2 as published 16 + * by the Free Software Foundation. 17 + */ 18 + 19 + #include <linux/linkage.h> 20 + #include <asm/assembler.h> 21 + 22 + DATA .req v0 23 + SHASH .req v1 24 + IN1 .req v2 25 + T1 .req v2 26 + T2 .req v3 27 + T3 .req v4 28 + VZR .req v5 29 + 30 + .text 31 + .arch armv8-a+crypto 32 + 33 + /* 34 + * void pmull_ghash_update(int blocks, u64 dg[], const char *src, 35 + * struct ghash_key const *k, const char *head) 36 + */ 37 + ENTRY(pmull_ghash_update) 38 + ld1 {DATA.16b}, [x1] 39 + ld1 {SHASH.16b}, [x3] 40 + eor VZR.16b, VZR.16b, VZR.16b 41 + 42 + /* do the head block first, if supplied */ 43 + cbz x4, 0f 44 + ld1 {IN1.2d}, [x4] 45 + b 1f 46 + 47 + 0: ld1 {IN1.2d}, [x2], #16 48 + sub w0, w0, #1 49 + 1: ext IN1.16b, IN1.16b, IN1.16b, #8 50 + CPU_LE( rev64 IN1.16b, IN1.16b ) 51 + eor DATA.16b, DATA.16b, IN1.16b 52 + 53 + /* multiply DATA by SHASH in GF(2^128) */ 54 + ext T2.16b, DATA.16b, DATA.16b, #8 55 + ext T3.16b, SHASH.16b, SHASH.16b, #8 56 + eor T2.16b, T2.16b, DATA.16b 57 + eor T3.16b, T3.16b, SHASH.16b 58 + 59 + pmull2 T1.1q, SHASH.2d, DATA.2d // a1 * b1 60 + pmull DATA.1q, SHASH.1d, DATA.1d // a0 * b0 61 + pmull T2.1q, T2.1d, T3.1d // (a1 + a0)(b1 + b0) 62 + eor T2.16b, T2.16b, T1.16b // (a0 * b1) + (a1 * b0) 63 + eor T2.16b, T2.16b, DATA.16b 64 + 65 + ext T3.16b, VZR.16b, T2.16b, #8 66 + ext T2.16b, T2.16b, VZR.16b, #8 67 + eor DATA.16b, DATA.16b, T3.16b 68 + eor T1.16b, T1.16b, T2.16b // <T1:DATA> is result of 69 + // carry-less multiplication 70 + 71 + /* first phase of the reduction */ 72 + shl T3.2d, DATA.2d, #1 73 + eor T3.16b, T3.16b, DATA.16b 74 + shl T3.2d, T3.2d, #5 75 + eor T3.16b, T3.16b, DATA.16b 76 + shl T3.2d, T3.2d, #57 77 + ext T2.16b, VZR.16b, T3.16b, #8 78 + ext T3.16b, T3.16b, VZR.16b, #8 79 + eor DATA.16b, DATA.16b, T2.16b 80 + eor T1.16b, T1.16b, T3.16b 81 + 82 + /* second phase of the reduction */ 83 + ushr T2.2d, DATA.2d, #5 84 + eor T2.16b, T2.16b, DATA.16b 85 + ushr T2.2d, T2.2d, #1 86 + eor T2.16b, T2.16b, DATA.16b 87 + ushr T2.2d, T2.2d, #1 88 + eor T1.16b, T1.16b, T2.16b 89 + eor DATA.16b, DATA.16b, T1.16b 90 + 91 + cbnz w0, 0b 92 + 93 + st1 {DATA.16b}, [x1] 94 + ret 95 + ENDPROC(pmull_ghash_update)
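The three pmull instructions above are one level of Karatsuba over GF(2): with the operands split into 64-bit halves a = a1·x^64 + a0 and b = b1·x^64 + b0, the middle term follows from the identity

        (a1 + a0)·(b1 + b0) = a1·b1 + a0·b0 + a1·b0 + a0·b1        (addition is XOR)

so a1·b0 + a0·b1 = (a1 + a0)·(b1 + b0) + a1·b1 + a0·b0, which is exactly what the two eor instructions after the multiplies compute. The shl/ushr sequences that follow reduce the 256-bit product modulo the GHASH polynomial x^128 + x^7 + x^2 + x + 1 in the two phases credited to the x86 PCLMUL implementation in the header comment.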
+155
arch/arm64/crypto/ghash-ce-glue.c
··· 1 + /* 2 + * Accelerated GHASH implementation with ARMv8 PMULL instructions. 3 + * 4 + * Copyright (C) 2014 Linaro Ltd. <ard.biesheuvel@linaro.org> 5 + * 6 + * This program is free software; you can redistribute it and/or modify it 7 + * under the terms of the GNU General Public License version 2 as published 8 + * by the Free Software Foundation. 9 + */ 10 + 11 + #include <asm/neon.h> 12 + #include <asm/unaligned.h> 13 + #include <crypto/internal/hash.h> 14 + #include <linux/cpufeature.h> 15 + #include <linux/crypto.h> 16 + #include <linux/module.h> 17 + 18 + MODULE_DESCRIPTION("GHASH secure hash using ARMv8 Crypto Extensions"); 19 + MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); 20 + MODULE_LICENSE("GPL v2"); 21 + 22 + #define GHASH_BLOCK_SIZE 16 23 + #define GHASH_DIGEST_SIZE 16 24 + 25 + struct ghash_key { 26 + u64 a; 27 + u64 b; 28 + }; 29 + 30 + struct ghash_desc_ctx { 31 + u64 digest[GHASH_DIGEST_SIZE/sizeof(u64)]; 32 + u8 buf[GHASH_BLOCK_SIZE]; 33 + u32 count; 34 + }; 35 + 36 + asmlinkage void pmull_ghash_update(int blocks, u64 dg[], const char *src, 37 + struct ghash_key const *k, const char *head); 38 + 39 + static int ghash_init(struct shash_desc *desc) 40 + { 41 + struct ghash_desc_ctx *ctx = shash_desc_ctx(desc); 42 + 43 + *ctx = (struct ghash_desc_ctx){}; 44 + return 0; 45 + } 46 + 47 + static int ghash_update(struct shash_desc *desc, const u8 *src, 48 + unsigned int len) 49 + { 50 + struct ghash_desc_ctx *ctx = shash_desc_ctx(desc); 51 + unsigned int partial = ctx->count % GHASH_BLOCK_SIZE; 52 + 53 + ctx->count += len; 54 + 55 + if ((partial + len) >= GHASH_BLOCK_SIZE) { 56 + struct ghash_key *key = crypto_shash_ctx(desc->tfm); 57 + int blocks; 58 + 59 + if (partial) { 60 + int p = GHASH_BLOCK_SIZE - partial; 61 + 62 + memcpy(ctx->buf + partial, src, p); 63 + src += p; 64 + len -= p; 65 + } 66 + 67 + blocks = len / GHASH_BLOCK_SIZE; 68 + len %= GHASH_BLOCK_SIZE; 69 + 70 + kernel_neon_begin_partial(6); 71 + pmull_ghash_update(blocks, ctx->digest, src, key, 72 + partial ? 
ctx->buf : NULL); 73 + kernel_neon_end(); 74 + src += blocks * GHASH_BLOCK_SIZE; 75 + } 76 + if (len) 77 + memcpy(ctx->buf + partial, src, len); 78 + return 0; 79 + } 80 + 81 + static int ghash_final(struct shash_desc *desc, u8 *dst) 82 + { 83 + struct ghash_desc_ctx *ctx = shash_desc_ctx(desc); 84 + unsigned int partial = ctx->count % GHASH_BLOCK_SIZE; 85 + 86 + if (partial) { 87 + struct ghash_key *key = crypto_shash_ctx(desc->tfm); 88 + 89 + memset(ctx->buf + partial, 0, GHASH_BLOCK_SIZE - partial); 90 + 91 + kernel_neon_begin_partial(6); 92 + pmull_ghash_update(1, ctx->digest, ctx->buf, key, NULL); 93 + kernel_neon_end(); 94 + } 95 + put_unaligned_be64(ctx->digest[1], dst); 96 + put_unaligned_be64(ctx->digest[0], dst + 8); 97 + 98 + *ctx = (struct ghash_desc_ctx){}; 99 + return 0; 100 + } 101 + 102 + static int ghash_setkey(struct crypto_shash *tfm, 103 + const u8 *inkey, unsigned int keylen) 104 + { 105 + struct ghash_key *key = crypto_shash_ctx(tfm); 106 + u64 a, b; 107 + 108 + if (keylen != GHASH_BLOCK_SIZE) { 109 + crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); 110 + return -EINVAL; 111 + } 112 + 113 + /* perform multiplication by 'x' in GF(2^128) */ 114 + b = get_unaligned_be64(inkey); 115 + a = get_unaligned_be64(inkey + 8); 116 + 117 + key->a = (a << 1) | (b >> 63); 118 + key->b = (b << 1) | (a >> 63); 119 + 120 + if (b >> 63) 121 + key->b ^= 0xc200000000000000UL; 122 + 123 + return 0; 124 + } 125 + 126 + static struct shash_alg ghash_alg = { 127 + .digestsize = GHASH_DIGEST_SIZE, 128 + .init = ghash_init, 129 + .update = ghash_update, 130 + .final = ghash_final, 131 + .setkey = ghash_setkey, 132 + .descsize = sizeof(struct ghash_desc_ctx), 133 + .base = { 134 + .cra_name = "ghash", 135 + .cra_driver_name = "ghash-ce", 136 + .cra_priority = 200, 137 + .cra_flags = CRYPTO_ALG_TYPE_SHASH, 138 + .cra_blocksize = GHASH_BLOCK_SIZE, 139 + .cra_ctxsize = sizeof(struct ghash_key), 140 + .cra_module = THIS_MODULE, 141 + }, 142 + }; 143 + 144 + static int __init ghash_ce_mod_init(void) 145 + { 146 + return crypto_register_shash(&ghash_alg); 147 + } 148 + 149 + static void __exit ghash_ce_mod_exit(void) 150 + { 151 + crypto_unregister_shash(&ghash_alg); 152 + } 153 + 154 + module_cpu_feature_match(PMULL, ghash_ce_mod_init); 155 + module_exit(ghash_ce_mod_exit);
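ghash_update() above follows the usual shash streaming pattern: top up a partially filled 16-byte block, hand all complete blocks to the NEON routine in one kernel-NEON session (the buffered block goes in as the 'head' argument), and keep any tail bytes for the next call. The SHA-1 and SHA-2 glue further down has the same shape. A self-contained C model of the bookkeeping (sketch only; process() is a hypothetical stand-in for the real transform):

#include <stddef.h>
#include <string.h>

#define BLK 16

struct stream {
        unsigned char buf[BLK];
        size_t count;                   /* total bytes fed so far */
};

static void stream_update(struct stream *s, const unsigned char *src, size_t len,
                          void (*process)(const unsigned char *blocks, size_t n))
{
        size_t partial = s->count % BLK;

        s->count += len;
        if (partial + len >= BLK) {
                if (partial) {                  /* finish the buffered block first */
                        size_t p = BLK - partial;

                        memcpy(s->buf + partial, src, p);
                        process(s->buf, 1);
                        src += p;
                        len -= p;
                }
                process(src, len / BLK);        /* whole blocks straight from src */
                src += (len / BLK) * BLK;
                len %= BLK;
                partial = 0;
        }
        if (len)
                memcpy(s->buf + partial, src, len);     /* stash the tail */
}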
+153
arch/arm64/crypto/sha1-ce-core.S
··· 1 + /* 2 + * sha1-ce-core.S - SHA-1 secure hash using ARMv8 Crypto Extensions 3 + * 4 + * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org> 5 + * 6 + * This program is free software; you can redistribute it and/or modify 7 + * it under the terms of the GNU General Public License version 2 as 8 + * published by the Free Software Foundation. 9 + */ 10 + 11 + #include <linux/linkage.h> 12 + #include <asm/assembler.h> 13 + 14 + .text 15 + .arch armv8-a+crypto 16 + 17 + k0 .req v0 18 + k1 .req v1 19 + k2 .req v2 20 + k3 .req v3 21 + 22 + t0 .req v4 23 + t1 .req v5 24 + 25 + dga .req q6 26 + dgav .req v6 27 + dgb .req s7 28 + dgbv .req v7 29 + 30 + dg0q .req q12 31 + dg0s .req s12 32 + dg0v .req v12 33 + dg1s .req s13 34 + dg1v .req v13 35 + dg2s .req s14 36 + 37 + .macro add_only, op, ev, rc, s0, dg1 38 + .ifc \ev, ev 39 + add t1.4s, v\s0\().4s, \rc\().4s 40 + sha1h dg2s, dg0s 41 + .ifnb \dg1 42 + sha1\op dg0q, \dg1, t0.4s 43 + .else 44 + sha1\op dg0q, dg1s, t0.4s 45 + .endif 46 + .else 47 + .ifnb \s0 48 + add t0.4s, v\s0\().4s, \rc\().4s 49 + .endif 50 + sha1h dg1s, dg0s 51 + sha1\op dg0q, dg2s, t1.4s 52 + .endif 53 + .endm 54 + 55 + .macro add_update, op, ev, rc, s0, s1, s2, s3, dg1 56 + sha1su0 v\s0\().4s, v\s1\().4s, v\s2\().4s 57 + add_only \op, \ev, \rc, \s1, \dg1 58 + sha1su1 v\s0\().4s, v\s3\().4s 59 + .endm 60 + 61 + /* 62 + * The SHA1 round constants 63 + */ 64 + .align 4 65 + .Lsha1_rcon: 66 + .word 0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6 67 + 68 + /* 69 + * void sha1_ce_transform(int blocks, u8 const *src, u32 *state, 70 + * u8 *head, long bytes) 71 + */ 72 + ENTRY(sha1_ce_transform) 73 + /* load round constants */ 74 + adr x6, .Lsha1_rcon 75 + ld1r {k0.4s}, [x6], #4 76 + ld1r {k1.4s}, [x6], #4 77 + ld1r {k2.4s}, [x6], #4 78 + ld1r {k3.4s}, [x6] 79 + 80 + /* load state */ 81 + ldr dga, [x2] 82 + ldr dgb, [x2, #16] 83 + 84 + /* load partial state (if supplied) */ 85 + cbz x3, 0f 86 + ld1 {v8.4s-v11.4s}, [x3] 87 + b 1f 88 + 89 + /* load input */ 90 + 0: ld1 {v8.4s-v11.4s}, [x1], #64 91 + sub w0, w0, #1 92 + 93 + 1: 94 + CPU_LE( rev32 v8.16b, v8.16b ) 95 + CPU_LE( rev32 v9.16b, v9.16b ) 96 + CPU_LE( rev32 v10.16b, v10.16b ) 97 + CPU_LE( rev32 v11.16b, v11.16b ) 98 + 99 + 2: add t0.4s, v8.4s, k0.4s 100 + mov dg0v.16b, dgav.16b 101 + 102 + add_update c, ev, k0, 8, 9, 10, 11, dgb 103 + add_update c, od, k0, 9, 10, 11, 8 104 + add_update c, ev, k0, 10, 11, 8, 9 105 + add_update c, od, k0, 11, 8, 9, 10 106 + add_update c, ev, k1, 8, 9, 10, 11 107 + 108 + add_update p, od, k1, 9, 10, 11, 8 109 + add_update p, ev, k1, 10, 11, 8, 9 110 + add_update p, od, k1, 11, 8, 9, 10 111 + add_update p, ev, k1, 8, 9, 10, 11 112 + add_update p, od, k2, 9, 10, 11, 8 113 + 114 + add_update m, ev, k2, 10, 11, 8, 9 115 + add_update m, od, k2, 11, 8, 9, 10 116 + add_update m, ev, k2, 8, 9, 10, 11 117 + add_update m, od, k2, 9, 10, 11, 8 118 + add_update m, ev, k3, 10, 11, 8, 9 119 + 120 + add_update p, od, k3, 11, 8, 9, 10 121 + add_only p, ev, k3, 9 122 + add_only p, od, k3, 10 123 + add_only p, ev, k3, 11 124 + add_only p, od 125 + 126 + /* update state */ 127 + add dgbv.2s, dgbv.2s, dg1v.2s 128 + add dgav.4s, dgav.4s, dg0v.4s 129 + 130 + cbnz w0, 0b 131 + 132 + /* 133 + * Final block: add padding and total bit count. 134 + * Skip if we have no total byte count in x4. In that case, the input 135 + * size was not a round multiple of the block size, and the padding is 136 + * handled by the C code. 
137 + */ 138 + cbz x4, 3f 139 + movi v9.2d, #0 140 + mov x8, #0x80000000 141 + movi v10.2d, #0 142 + ror x7, x4, #29 // ror(lsl(x4, 3), 32) 143 + fmov d8, x8 144 + mov x4, #0 145 + mov v11.d[0], xzr 146 + mov v11.d[1], x7 147 + b 2b 148 + 149 + /* store new state */ 150 + 3: str dga, [x2] 151 + str dgb, [x2, #16] 152 + ret 153 + ENDPROC(sha1_ce_transform)
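When a total byte count is supplied in x4 (the single-pass path used by the glue code's finup), the block built by the movi/fmov/mov sequence above is the standard SHA-1 padding block for a message that is an exact multiple of 64 bytes: a 0x80 marker, zeros, and the big-endian 64-bit bit count in the last eight bytes. The equivalent byte layout in C (sketch, not part of the patch; the SHA-256 core below constructs the same block):

#include <stdint.h>
#include <string.h>

static void build_final_block(uint8_t block[64], uint64_t total_bytes)
{
        uint64_t bits = total_bytes << 3;       /* message length in bits */
        int i;

        memset(block, 0, 64);
        block[0] = 0x80;                        /* mandatory padding marker */
        for (i = 0; i < 8; i++)                 /* big-endian bit count in bytes 56..63 */
                block[56 + i] = (uint8_t)(bits >> (56 - 8 * i));
}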
+174
arch/arm64/crypto/sha1-ce-glue.c
··· 1 + /* 2 + * sha1-ce-glue.c - SHA-1 secure hash using ARMv8 Crypto Extensions 3 + * 4 + * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org> 5 + * 6 + * This program is free software; you can redistribute it and/or modify 7 + * it under the terms of the GNU General Public License version 2 as 8 + * published by the Free Software Foundation. 9 + */ 10 + 11 + #include <asm/neon.h> 12 + #include <asm/unaligned.h> 13 + #include <crypto/internal/hash.h> 14 + #include <crypto/sha.h> 15 + #include <linux/cpufeature.h> 16 + #include <linux/crypto.h> 17 + #include <linux/module.h> 18 + 19 + MODULE_DESCRIPTION("SHA1 secure hash using ARMv8 Crypto Extensions"); 20 + MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); 21 + MODULE_LICENSE("GPL v2"); 22 + 23 + asmlinkage void sha1_ce_transform(int blocks, u8 const *src, u32 *state, 24 + u8 *head, long bytes); 25 + 26 + static int sha1_init(struct shash_desc *desc) 27 + { 28 + struct sha1_state *sctx = shash_desc_ctx(desc); 29 + 30 + *sctx = (struct sha1_state){ 31 + .state = { SHA1_H0, SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 }, 32 + }; 33 + return 0; 34 + } 35 + 36 + static int sha1_update(struct shash_desc *desc, const u8 *data, 37 + unsigned int len) 38 + { 39 + struct sha1_state *sctx = shash_desc_ctx(desc); 40 + unsigned int partial = sctx->count % SHA1_BLOCK_SIZE; 41 + 42 + sctx->count += len; 43 + 44 + if ((partial + len) >= SHA1_BLOCK_SIZE) { 45 + int blocks; 46 + 47 + if (partial) { 48 + int p = SHA1_BLOCK_SIZE - partial; 49 + 50 + memcpy(sctx->buffer + partial, data, p); 51 + data += p; 52 + len -= p; 53 + } 54 + 55 + blocks = len / SHA1_BLOCK_SIZE; 56 + len %= SHA1_BLOCK_SIZE; 57 + 58 + kernel_neon_begin_partial(16); 59 + sha1_ce_transform(blocks, data, sctx->state, 60 + partial ? sctx->buffer : NULL, 0); 61 + kernel_neon_end(); 62 + 63 + data += blocks * SHA1_BLOCK_SIZE; 64 + partial = 0; 65 + } 66 + if (len) 67 + memcpy(sctx->buffer + partial, data, len); 68 + return 0; 69 + } 70 + 71 + static int sha1_final(struct shash_desc *desc, u8 *out) 72 + { 73 + static const u8 padding[SHA1_BLOCK_SIZE] = { 0x80, }; 74 + 75 + struct sha1_state *sctx = shash_desc_ctx(desc); 76 + __be64 bits = cpu_to_be64(sctx->count << 3); 77 + __be32 *dst = (__be32 *)out; 78 + int i; 79 + 80 + u32 padlen = SHA1_BLOCK_SIZE 81 + - ((sctx->count + sizeof(bits)) % SHA1_BLOCK_SIZE); 82 + 83 + sha1_update(desc, padding, padlen); 84 + sha1_update(desc, (const u8 *)&bits, sizeof(bits)); 85 + 86 + for (i = 0; i < SHA1_DIGEST_SIZE / sizeof(__be32); i++) 87 + put_unaligned_be32(sctx->state[i], dst++); 88 + 89 + *sctx = (struct sha1_state){}; 90 + return 0; 91 + } 92 + 93 + static int sha1_finup(struct shash_desc *desc, const u8 *data, 94 + unsigned int len, u8 *out) 95 + { 96 + struct sha1_state *sctx = shash_desc_ctx(desc); 97 + __be32 *dst = (__be32 *)out; 98 + int blocks; 99 + int i; 100 + 101 + if (sctx->count || !len || (len % SHA1_BLOCK_SIZE)) { 102 + sha1_update(desc, data, len); 103 + return sha1_final(desc, out); 104 + } 105 + 106 + /* 107 + * Use a fast path if the input is a multiple of 64 bytes. 
In 108 + * this case, there is no need to copy data around, and we can 109 + * perform the entire digest calculation in a single invocation 110 + * of sha1_ce_transform() 111 + */ 112 + blocks = len / SHA1_BLOCK_SIZE; 113 + 114 + kernel_neon_begin_partial(16); 115 + sha1_ce_transform(blocks, data, sctx->state, NULL, len); 116 + kernel_neon_end(); 117 + 118 + for (i = 0; i < SHA1_DIGEST_SIZE / sizeof(__be32); i++) 119 + put_unaligned_be32(sctx->state[i], dst++); 120 + 121 + *sctx = (struct sha1_state){}; 122 + return 0; 123 + } 124 + 125 + static int sha1_export(struct shash_desc *desc, void *out) 126 + { 127 + struct sha1_state *sctx = shash_desc_ctx(desc); 128 + struct sha1_state *dst = out; 129 + 130 + *dst = *sctx; 131 + return 0; 132 + } 133 + 134 + static int sha1_import(struct shash_desc *desc, const void *in) 135 + { 136 + struct sha1_state *sctx = shash_desc_ctx(desc); 137 + struct sha1_state const *src = in; 138 + 139 + *sctx = *src; 140 + return 0; 141 + } 142 + 143 + static struct shash_alg alg = { 144 + .init = sha1_init, 145 + .update = sha1_update, 146 + .final = sha1_final, 147 + .finup = sha1_finup, 148 + .export = sha1_export, 149 + .import = sha1_import, 150 + .descsize = sizeof(struct sha1_state), 151 + .digestsize = SHA1_DIGEST_SIZE, 152 + .statesize = sizeof(struct sha1_state), 153 + .base = { 154 + .cra_name = "sha1", 155 + .cra_driver_name = "sha1-ce", 156 + .cra_priority = 200, 157 + .cra_flags = CRYPTO_ALG_TYPE_SHASH, 158 + .cra_blocksize = SHA1_BLOCK_SIZE, 159 + .cra_module = THIS_MODULE, 160 + } 161 + }; 162 + 163 + static int __init sha1_ce_mod_init(void) 164 + { 165 + return crypto_register_shash(&alg); 166 + } 167 + 168 + static void __exit sha1_ce_mod_fini(void) 169 + { 170 + crypto_unregister_shash(&alg); 171 + } 172 + 173 + module_cpu_feature_match(SHA1, sha1_ce_mod_init); 174 + module_exit(sha1_ce_mod_fini);
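Once registered, this driver is reached through the normal crypto API: a request for "sha1" selects the highest-priority implementation, which is "sha1-ce" (priority 200) on CPUs whose ID registers advertise the SHA1 feature. An illustrative caller using the synchronous shash interface of this kernel generation (sketch only, not part of the patch):

#include <crypto/hash.h>
#include <linux/err.h>
#include <linux/slab.h>

static int sha1_digest_example(const u8 *data, unsigned int len, u8 out[20])
{
        struct crypto_shash *tfm;
        struct shash_desc *desc;
        int ret;

        tfm = crypto_alloc_shash("sha1", 0, 0);
        if (IS_ERR(tfm))
                return PTR_ERR(tfm);

        /* descriptor must leave room for the implementation's state */
        desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(tfm), GFP_KERNEL);
        if (!desc) {
                crypto_free_shash(tfm);
                return -ENOMEM;
        }
        desc->tfm = tfm;
        desc->flags = 0;

        ret = crypto_shash_digest(desc, data, len, out);

        kfree(desc);
        crypto_free_shash(tfm);
        return ret;
}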
+156
arch/arm64/crypto/sha2-ce-core.S
··· 1 + /* 2 + * sha2-ce-core.S - core SHA-224/SHA-256 transform using v8 Crypto Extensions 3 + * 4 + * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org> 5 + * 6 + * This program is free software; you can redistribute it and/or modify 7 + * it under the terms of the GNU General Public License version 2 as 8 + * published by the Free Software Foundation. 9 + */ 10 + 11 + #include <linux/linkage.h> 12 + #include <asm/assembler.h> 13 + 14 + .text 15 + .arch armv8-a+crypto 16 + 17 + dga .req q20 18 + dgav .req v20 19 + dgb .req q21 20 + dgbv .req v21 21 + 22 + t0 .req v22 23 + t1 .req v23 24 + 25 + dg0q .req q24 26 + dg0v .req v24 27 + dg1q .req q25 28 + dg1v .req v25 29 + dg2q .req q26 30 + dg2v .req v26 31 + 32 + .macro add_only, ev, rc, s0 33 + mov dg2v.16b, dg0v.16b 34 + .ifeq \ev 35 + add t1.4s, v\s0\().4s, \rc\().4s 36 + sha256h dg0q, dg1q, t0.4s 37 + sha256h2 dg1q, dg2q, t0.4s 38 + .else 39 + .ifnb \s0 40 + add t0.4s, v\s0\().4s, \rc\().4s 41 + .endif 42 + sha256h dg0q, dg1q, t1.4s 43 + sha256h2 dg1q, dg2q, t1.4s 44 + .endif 45 + .endm 46 + 47 + .macro add_update, ev, rc, s0, s1, s2, s3 48 + sha256su0 v\s0\().4s, v\s1\().4s 49 + add_only \ev, \rc, \s1 50 + sha256su1 v\s0\().4s, v\s2\().4s, v\s3\().4s 51 + .endm 52 + 53 + /* 54 + * The SHA-256 round constants 55 + */ 56 + .align 4 57 + .Lsha2_rcon: 58 + .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 59 + .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 60 + .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 61 + .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 62 + .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc 63 + .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da 64 + .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 65 + .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 66 + .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 67 + .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 68 + .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 69 + .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 70 + .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 71 + .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 72 + .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 73 + .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 74 + 75 + /* 76 + * void sha2_ce_transform(int blocks, u8 const *src, u32 *state, 77 + * u8 *head, long bytes) 78 + */ 79 + ENTRY(sha2_ce_transform) 80 + /* load round constants */ 81 + adr x8, .Lsha2_rcon 82 + ld1 { v0.4s- v3.4s}, [x8], #64 83 + ld1 { v4.4s- v7.4s}, [x8], #64 84 + ld1 { v8.4s-v11.4s}, [x8], #64 85 + ld1 {v12.4s-v15.4s}, [x8] 86 + 87 + /* load state */ 88 + ldp dga, dgb, [x2] 89 + 90 + /* load partial input (if supplied) */ 91 + cbz x3, 0f 92 + ld1 {v16.4s-v19.4s}, [x3] 93 + b 1f 94 + 95 + /* load input */ 96 + 0: ld1 {v16.4s-v19.4s}, [x1], #64 97 + sub w0, w0, #1 98 + 99 + 1: 100 + CPU_LE( rev32 v16.16b, v16.16b ) 101 + CPU_LE( rev32 v17.16b, v17.16b ) 102 + CPU_LE( rev32 v18.16b, v18.16b ) 103 + CPU_LE( rev32 v19.16b, v19.16b ) 104 + 105 + 2: add t0.4s, v16.4s, v0.4s 106 + mov dg0v.16b, dgav.16b 107 + mov dg1v.16b, dgbv.16b 108 + 109 + add_update 0, v1, 16, 17, 18, 19 110 + add_update 1, v2, 17, 18, 19, 16 111 + add_update 0, v3, 18, 19, 16, 17 112 + add_update 1, v4, 19, 16, 17, 18 113 + 114 + add_update 0, v5, 16, 17, 18, 19 115 + add_update 1, v6, 17, 18, 19, 16 116 + add_update 0, v7, 18, 19, 16, 17 117 + add_update 1, v8, 19, 16, 17, 18 118 + 119 + add_update 0, v9, 16, 17, 18, 19 120 + add_update 1, v10, 17, 18, 19, 16 121 + add_update 0, v11, 18, 19, 
16, 17 122 + add_update 1, v12, 19, 16, 17, 18 123 + 124 + add_only 0, v13, 17 125 + add_only 1, v14, 18 126 + add_only 0, v15, 19 127 + add_only 1 128 + 129 + /* update state */ 130 + add dgav.4s, dgav.4s, dg0v.4s 131 + add dgbv.4s, dgbv.4s, dg1v.4s 132 + 133 + /* handled all input blocks? */ 134 + cbnz w0, 0b 135 + 136 + /* 137 + * Final block: add padding and total bit count. 138 + * Skip if we have no total byte count in x4. In that case, the input 139 + * size was not a round multiple of the block size, and the padding is 140 + * handled by the C code. 141 + */ 142 + cbz x4, 3f 143 + movi v17.2d, #0 144 + mov x8, #0x80000000 145 + movi v18.2d, #0 146 + ror x7, x4, #29 // ror(lsl(x4, 3), 32) 147 + fmov d16, x8 148 + mov x4, #0 149 + mov v19.d[0], xzr 150 + mov v19.d[1], x7 151 + b 2b 152 + 153 + /* store new state */ 154 + 3: stp dga, dgb, [x2] 155 + ret 156 + ENDPROC(sha2_ce_transform)
+255
arch/arm64/crypto/sha2-ce-glue.c
··· 1 + /* 2 + * sha2-ce-glue.c - SHA-224/SHA-256 using ARMv8 Crypto Extensions 3 + * 4 + * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org> 5 + * 6 + * This program is free software; you can redistribute it and/or modify 7 + * it under the terms of the GNU General Public License version 2 as 8 + * published by the Free Software Foundation. 9 + */ 10 + 11 + #include <asm/neon.h> 12 + #include <asm/unaligned.h> 13 + #include <crypto/internal/hash.h> 14 + #include <crypto/sha.h> 15 + #include <linux/cpufeature.h> 16 + #include <linux/crypto.h> 17 + #include <linux/module.h> 18 + 19 + MODULE_DESCRIPTION("SHA-224/SHA-256 secure hash using ARMv8 Crypto Extensions"); 20 + MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); 21 + MODULE_LICENSE("GPL v2"); 22 + 23 + asmlinkage int sha2_ce_transform(int blocks, u8 const *src, u32 *state, 24 + u8 *head, long bytes); 25 + 26 + static int sha224_init(struct shash_desc *desc) 27 + { 28 + struct sha256_state *sctx = shash_desc_ctx(desc); 29 + 30 + *sctx = (struct sha256_state){ 31 + .state = { 32 + SHA224_H0, SHA224_H1, SHA224_H2, SHA224_H3, 33 + SHA224_H4, SHA224_H5, SHA224_H6, SHA224_H7, 34 + } 35 + }; 36 + return 0; 37 + } 38 + 39 + static int sha256_init(struct shash_desc *desc) 40 + { 41 + struct sha256_state *sctx = shash_desc_ctx(desc); 42 + 43 + *sctx = (struct sha256_state){ 44 + .state = { 45 + SHA256_H0, SHA256_H1, SHA256_H2, SHA256_H3, 46 + SHA256_H4, SHA256_H5, SHA256_H6, SHA256_H7, 47 + } 48 + }; 49 + return 0; 50 + } 51 + 52 + static int sha2_update(struct shash_desc *desc, const u8 *data, 53 + unsigned int len) 54 + { 55 + struct sha256_state *sctx = shash_desc_ctx(desc); 56 + unsigned int partial = sctx->count % SHA256_BLOCK_SIZE; 57 + 58 + sctx->count += len; 59 + 60 + if ((partial + len) >= SHA256_BLOCK_SIZE) { 61 + int blocks; 62 + 63 + if (partial) { 64 + int p = SHA256_BLOCK_SIZE - partial; 65 + 66 + memcpy(sctx->buf + partial, data, p); 67 + data += p; 68 + len -= p; 69 + } 70 + 71 + blocks = len / SHA256_BLOCK_SIZE; 72 + len %= SHA256_BLOCK_SIZE; 73 + 74 + kernel_neon_begin_partial(28); 75 + sha2_ce_transform(blocks, data, sctx->state, 76 + partial ? 
sctx->buf : NULL, 0); 77 + kernel_neon_end(); 78 + 79 + data += blocks * SHA256_BLOCK_SIZE; 80 + partial = 0; 81 + } 82 + if (len) 83 + memcpy(sctx->buf + partial, data, len); 84 + return 0; 85 + } 86 + 87 + static void sha2_final(struct shash_desc *desc) 88 + { 89 + static const u8 padding[SHA256_BLOCK_SIZE] = { 0x80, }; 90 + 91 + struct sha256_state *sctx = shash_desc_ctx(desc); 92 + __be64 bits = cpu_to_be64(sctx->count << 3); 93 + u32 padlen = SHA256_BLOCK_SIZE 94 + - ((sctx->count + sizeof(bits)) % SHA256_BLOCK_SIZE); 95 + 96 + sha2_update(desc, padding, padlen); 97 + sha2_update(desc, (const u8 *)&bits, sizeof(bits)); 98 + } 99 + 100 + static int sha224_final(struct shash_desc *desc, u8 *out) 101 + { 102 + struct sha256_state *sctx = shash_desc_ctx(desc); 103 + __be32 *dst = (__be32 *)out; 104 + int i; 105 + 106 + sha2_final(desc); 107 + 108 + for (i = 0; i < SHA224_DIGEST_SIZE / sizeof(__be32); i++) 109 + put_unaligned_be32(sctx->state[i], dst++); 110 + 111 + *sctx = (struct sha256_state){}; 112 + return 0; 113 + } 114 + 115 + static int sha256_final(struct shash_desc *desc, u8 *out) 116 + { 117 + struct sha256_state *sctx = shash_desc_ctx(desc); 118 + __be32 *dst = (__be32 *)out; 119 + int i; 120 + 121 + sha2_final(desc); 122 + 123 + for (i = 0; i < SHA256_DIGEST_SIZE / sizeof(__be32); i++) 124 + put_unaligned_be32(sctx->state[i], dst++); 125 + 126 + *sctx = (struct sha256_state){}; 127 + return 0; 128 + } 129 + 130 + static void sha2_finup(struct shash_desc *desc, const u8 *data, 131 + unsigned int len) 132 + { 133 + struct sha256_state *sctx = shash_desc_ctx(desc); 134 + int blocks; 135 + 136 + if (sctx->count || !len || (len % SHA256_BLOCK_SIZE)) { 137 + sha2_update(desc, data, len); 138 + sha2_final(desc); 139 + return; 140 + } 141 + 142 + /* 143 + * Use a fast path if the input is a multiple of 64 bytes. 
In 144 + * this case, there is no need to copy data around, and we can 145 + * perform the entire digest calculation in a single invocation 146 + * of sha2_ce_transform() 147 + */ 148 + blocks = len / SHA256_BLOCK_SIZE; 149 + 150 + kernel_neon_begin_partial(28); 151 + sha2_ce_transform(blocks, data, sctx->state, NULL, len); 152 + kernel_neon_end(); 153 + data += blocks * SHA256_BLOCK_SIZE; 154 + } 155 + 156 + static int sha224_finup(struct shash_desc *desc, const u8 *data, 157 + unsigned int len, u8 *out) 158 + { 159 + struct sha256_state *sctx = shash_desc_ctx(desc); 160 + __be32 *dst = (__be32 *)out; 161 + int i; 162 + 163 + sha2_finup(desc, data, len); 164 + 165 + for (i = 0; i < SHA224_DIGEST_SIZE / sizeof(__be32); i++) 166 + put_unaligned_be32(sctx->state[i], dst++); 167 + 168 + *sctx = (struct sha256_state){}; 169 + return 0; 170 + } 171 + 172 + static int sha256_finup(struct shash_desc *desc, const u8 *data, 173 + unsigned int len, u8 *out) 174 + { 175 + struct sha256_state *sctx = shash_desc_ctx(desc); 176 + __be32 *dst = (__be32 *)out; 177 + int i; 178 + 179 + sha2_finup(desc, data, len); 180 + 181 + for (i = 0; i < SHA256_DIGEST_SIZE / sizeof(__be32); i++) 182 + put_unaligned_be32(sctx->state[i], dst++); 183 + 184 + *sctx = (struct sha256_state){}; 185 + return 0; 186 + } 187 + 188 + static int sha2_export(struct shash_desc *desc, void *out) 189 + { 190 + struct sha256_state *sctx = shash_desc_ctx(desc); 191 + struct sha256_state *dst = out; 192 + 193 + *dst = *sctx; 194 + return 0; 195 + } 196 + 197 + static int sha2_import(struct shash_desc *desc, const void *in) 198 + { 199 + struct sha256_state *sctx = shash_desc_ctx(desc); 200 + struct sha256_state const *src = in; 201 + 202 + *sctx = *src; 203 + return 0; 204 + } 205 + 206 + static struct shash_alg algs[] = { { 207 + .init = sha224_init, 208 + .update = sha2_update, 209 + .final = sha224_final, 210 + .finup = sha224_finup, 211 + .export = sha2_export, 212 + .import = sha2_import, 213 + .descsize = sizeof(struct sha256_state), 214 + .digestsize = SHA224_DIGEST_SIZE, 215 + .statesize = sizeof(struct sha256_state), 216 + .base = { 217 + .cra_name = "sha224", 218 + .cra_driver_name = "sha224-ce", 219 + .cra_priority = 200, 220 + .cra_flags = CRYPTO_ALG_TYPE_SHASH, 221 + .cra_blocksize = SHA256_BLOCK_SIZE, 222 + .cra_module = THIS_MODULE, 223 + } 224 + }, { 225 + .init = sha256_init, 226 + .update = sha2_update, 227 + .final = sha256_final, 228 + .finup = sha256_finup, 229 + .export = sha2_export, 230 + .import = sha2_import, 231 + .descsize = sizeof(struct sha256_state), 232 + .digestsize = SHA256_DIGEST_SIZE, 233 + .statesize = sizeof(struct sha256_state), 234 + .base = { 235 + .cra_name = "sha256", 236 + .cra_driver_name = "sha256-ce", 237 + .cra_priority = 200, 238 + .cra_flags = CRYPTO_ALG_TYPE_SHASH, 239 + .cra_blocksize = SHA256_BLOCK_SIZE, 240 + .cra_module = THIS_MODULE, 241 + } 242 + } }; 243 + 244 + static int __init sha2_ce_mod_init(void) 245 + { 246 + return crypto_register_shashes(algs, ARRAY_SIZE(algs)); 247 + } 248 + 249 + static void __exit sha2_ce_mod_fini(void) 250 + { 251 + crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); 252 + } 253 + 254 + module_cpu_feature_match(SHA2, sha2_ce_mod_init); 255 + module_exit(sha2_ce_mod_fini);
+1
arch/arm64/include/asm/Kbuild
··· 40 40 generic-y += sembuf.h 41 41 generic-y += serial.h 42 42 generic-y += shmbuf.h 43 + generic-y += simd.h 43 44 generic-y += sizes.h 44 45 generic-y += socket.h 45 46 generic-y += sockios.h
+23
arch/arm64/include/asm/fpsimd.h
··· 37 37 u32 fpcr; 38 38 }; 39 39 }; 40 + /* the id of the last cpu to have restored this state */ 41 + unsigned int cpu; 40 42 }; 43 + 44 + /* 45 + * Struct for stacking the bottom 'n' FP/SIMD registers. 46 + */ 47 + struct fpsimd_partial_state { 48 + u32 fpsr; 49 + u32 fpcr; 50 + u32 num_regs; 51 + __uint128_t vregs[32]; 52 + }; 53 + 41 54 42 55 #if defined(__KERNEL__) && defined(CONFIG_COMPAT) 43 56 /* Masks for extracting the FPSR and FPCR from the FPSCR */ ··· 70 57 71 58 extern void fpsimd_thread_switch(struct task_struct *next); 72 59 extern void fpsimd_flush_thread(void); 60 + 61 + extern void fpsimd_preserve_current_state(void); 62 + extern void fpsimd_restore_current_state(void); 63 + extern void fpsimd_update_current_state(struct fpsimd_state *state); 64 + 65 + extern void fpsimd_flush_task_state(struct task_struct *target); 66 + 67 + extern void fpsimd_save_partial_state(struct fpsimd_partial_state *state, 68 + u32 num_regs); 69 + extern void fpsimd_load_partial_state(struct fpsimd_partial_state *state); 73 70 74 71 #endif 75 72
+35
arch/arm64/include/asm/fpsimdmacros.h
··· 62 62 ldr w\tmpnr, [\state, #16 * 2 + 4] 63 63 msr fpcr, x\tmpnr 64 64 .endm 65 + 66 + .altmacro 67 + .macro fpsimd_save_partial state, numnr, tmpnr1, tmpnr2 68 + mrs x\tmpnr1, fpsr 69 + str w\numnr, [\state, #8] 70 + mrs x\tmpnr2, fpcr 71 + stp w\tmpnr1, w\tmpnr2, [\state] 72 + adr x\tmpnr1, 0f 73 + add \state, \state, x\numnr, lsl #4 74 + sub x\tmpnr1, x\tmpnr1, x\numnr, lsl #1 75 + br x\tmpnr1 76 + .irp qa, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0 77 + .irp qb, %(qa + 1) 78 + stp q\qa, q\qb, [\state, # -16 * \qa - 16] 79 + .endr 80 + .endr 81 + 0: 82 + .endm 83 + 84 + .macro fpsimd_restore_partial state, tmpnr1, tmpnr2 85 + ldp w\tmpnr1, w\tmpnr2, [\state] 86 + msr fpsr, x\tmpnr1 87 + msr fpcr, x\tmpnr2 88 + adr x\tmpnr1, 0f 89 + ldr w\tmpnr2, [\state, #8] 90 + add \state, \state, x\tmpnr2, lsl #4 91 + sub x\tmpnr1, x\tmpnr1, x\tmpnr2, lsl #1 92 + br x\tmpnr1 93 + .irp qa, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0 94 + .irp qb, %(qa + 1) 95 + ldp q\qa, q\qb, [\state, # -16 * \qa - 16] 96 + .endr 97 + .endr 98 + 0: 99 + .endm
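The fpsimd_save_partial/fpsimd_restore_partial macros above avoid touching all 32 Q registers by computing a branch target inside the .irp-generated run of stp/ldp instructions: each register pair costs one 4-byte instruction, so branching num_regs/2 instructions before the 0: label executes only the stores for q0..q(num_regs-1). The same idea in C is a Duff's-device style fall-through switch (toy sketch covering 8 registers; store_pair() merely prints what one stp would do):

#include <stdio.h>

static void store_pair(int q)
{
        printf("stp q%d, q%d\n", q, q + 1);     /* stands in for one stp */
}

static void save_bottom_regs(unsigned int num_regs)    /* even, 2..8 in this toy */
{
        switch (num_regs) {     /* deliberate fall-through, like br into the unrolled run */
        case 8: store_pair(6);
        case 6: store_pair(4);
        case 4: store_pair(2);
        case 2: store_pair(0);
        }
}

int main(void)
{
        save_bottom_regs(4);    /* prints: stp q2, q3 / stp q0, q1 */
        return 0;
}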
+5 -1
arch/arm64/include/asm/neon.h
··· 8 8 * published by the Free Software Foundation. 9 9 */ 10 10 11 + #include <linux/types.h> 12 + 11 13 #define cpu_has_neon() (1) 12 14 13 - void kernel_neon_begin(void); 15 + #define kernel_neon_begin() kernel_neon_begin_partial(32) 16 + 17 + void kernel_neon_begin_partial(u32 num_regs); 14 18 void kernel_neon_end(void);
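kernel_neon_begin() is now a wrapper that asks for all 32 registers; callers that clobber only a few can say so and pay for just those, which is what makes the interrupt-context case affordable (the fpsimd.c changes below spill such partial state to a per-CPU buffer). Illustrative use (sketch, not part of the patch):

#include <asm/neon.h>

static void neon_fast_path(void)
{
        kernel_neon_begin_partial(4);   /* only v0-v3 will be clobbered */
        /* ... NEON code, e.g. a .S helper restricted to v0-v3 ... */
        kernel_neon_end();
}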
+3 -1
arch/arm64/include/asm/thread_info.h
··· 103 103 #define TIF_SIGPENDING 0 104 104 #define TIF_NEED_RESCHED 1 105 105 #define TIF_NOTIFY_RESUME 2 /* callback before returning to user */ 106 + #define TIF_FOREIGN_FPSTATE 3 /* CPU's FP state is not current's */ 106 107 #define TIF_SYSCALL_TRACE 8 107 108 #define TIF_SYSCALL_AUDIT 9 108 109 #define TIF_SYSCALL_TRACEPOINT 10 ··· 119 118 #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) 120 119 #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) 121 120 #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) 121 + #define _TIF_FOREIGN_FPSTATE (1 << TIF_FOREIGN_FPSTATE) 122 122 #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) 123 123 #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) 124 124 #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) ··· 127 125 #define _TIF_32BIT (1 << TIF_32BIT) 128 126 129 127 #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ 130 - _TIF_NOTIFY_RESUME) 128 + _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE) 131 129 132 130 #define _TIF_SYSCALL_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ 133 131 _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP)
+24
arch/arm64/kernel/entry-fpsimd.S
··· 41 41 fpsimd_restore x0, 8 42 42 ret 43 43 ENDPROC(fpsimd_load_state) 44 + 45 + #ifdef CONFIG_KERNEL_MODE_NEON 46 + 47 + /* 48 + * Save the bottom n FP registers. 49 + * 50 + * x0 - pointer to struct fpsimd_partial_state 51 + */ 52 + ENTRY(fpsimd_save_partial_state) 53 + fpsimd_save_partial x0, 1, 8, 9 54 + ret 55 + ENDPROC(fpsimd_save_partial_state) 56 + 57 + /* 58 + * Load the bottom n FP registers. 59 + * 60 + * x0 - pointer to struct fpsimd_partial_state 61 + */ 62 + ENTRY(fpsimd_load_partial_state) 63 + fpsimd_restore_partial x0, 8, 9 64 + ret 65 + ENDPROC(fpsimd_load_partial_state) 66 + 67 + #endif
+1 -1
arch/arm64/kernel/entry.S
··· 562 562 str x0, [sp, #S_X0] // returned x0 563 563 work_pending: 564 564 tbnz x1, #TIF_NEED_RESCHED, work_resched 565 - /* TIF_SIGPENDING or TIF_NOTIFY_RESUME case */ 565 + /* TIF_SIGPENDING, TIF_NOTIFY_RESUME or TIF_FOREIGN_FPSTATE case */ 566 566 ldr x2, [sp, #S_PSTATE] 567 567 mov x0, sp // 'regs' 568 568 tst x2, #PSR_MODE_MASK // user mode regs?
+167 -19
arch/arm64/kernel/fpsimd.c
··· 35 35 #define FPEXC_IDF (1 << 7) 36 36 37 37 /* 38 + * In order to reduce the number of times the FPSIMD state is needlessly saved 39 + * and restored, we need to keep track of two things: 40 + * (a) for each task, we need to remember which CPU was the last one to have 41 + * the task's FPSIMD state loaded into its FPSIMD registers; 42 + * (b) for each CPU, we need to remember which task's userland FPSIMD state has 43 + * been loaded into its FPSIMD registers most recently, or whether it has 44 + * been used to perform kernel mode NEON in the meantime. 45 + * 46 + * For (a), we add a 'cpu' field to struct fpsimd_state, which gets updated to 47 + * the id of the current CPU everytime the state is loaded onto a CPU. For (b), 48 + * we add the per-cpu variable 'fpsimd_last_state' (below), which contains the 49 + * address of the userland FPSIMD state of the task that was loaded onto the CPU 50 + * the most recently, or NULL if kernel mode NEON has been performed after that. 51 + * 52 + * With this in place, we no longer have to restore the next FPSIMD state right 53 + * when switching between tasks. Instead, we can defer this check to userland 54 + * resume, at which time we verify whether the CPU's fpsimd_last_state and the 55 + * task's fpsimd_state.cpu are still mutually in sync. If this is the case, we 56 + * can omit the FPSIMD restore. 57 + * 58 + * As an optimization, we use the thread_info flag TIF_FOREIGN_FPSTATE to 59 + * indicate whether or not the userland FPSIMD state of the current task is 60 + * present in the registers. The flag is set unless the FPSIMD registers of this 61 + * CPU currently contain the most recent userland FPSIMD state of the current 62 + * task. 63 + * 64 + * For a certain task, the sequence may look something like this: 65 + * - the task gets scheduled in; if both the task's fpsimd_state.cpu field 66 + * contains the id of the current CPU, and the CPU's fpsimd_last_state per-cpu 67 + * variable points to the task's fpsimd_state, the TIF_FOREIGN_FPSTATE flag is 68 + * cleared, otherwise it is set; 69 + * 70 + * - the task returns to userland; if TIF_FOREIGN_FPSTATE is set, the task's 71 + * userland FPSIMD state is copied from memory to the registers, the task's 72 + * fpsimd_state.cpu field is set to the id of the current CPU, the current 73 + * CPU's fpsimd_last_state pointer is set to this task's fpsimd_state and the 74 + * TIF_FOREIGN_FPSTATE flag is cleared; 75 + * 76 + * - the task executes an ordinary syscall; upon return to userland, the 77 + * TIF_FOREIGN_FPSTATE flag will still be cleared, so no FPSIMD state is 78 + * restored; 79 + * 80 + * - the task executes a syscall which executes some NEON instructions; this is 81 + * preceded by a call to kernel_neon_begin(), which copies the task's FPSIMD 82 + * register contents to memory, clears the fpsimd_last_state per-cpu variable 83 + * and sets the TIF_FOREIGN_FPSTATE flag; 84 + * 85 + * - the task gets preempted after kernel_neon_end() is called; as we have not 86 + * returned from the 2nd syscall yet, TIF_FOREIGN_FPSTATE is still set so 87 + * whatever is in the FPSIMD registers is not saved to memory, but discarded. 88 + */ 89 + static DEFINE_PER_CPU(struct fpsimd_state *, fpsimd_last_state); 90 + 91 + /* 38 92 * Trapped FP/ASIMD access. 
39 93 */ 40 94 void do_fpsimd_acc(unsigned int esr, struct pt_regs *regs) ··· 126 72 127 73 void fpsimd_thread_switch(struct task_struct *next) 128 74 { 129 - /* check if not kernel threads */ 130 - if (current->mm) 75 + /* 76 + * Save the current FPSIMD state to memory, but only if whatever is in 77 + * the registers is in fact the most recent userland FPSIMD state of 78 + * 'current'. 79 + */ 80 + if (current->mm && !test_thread_flag(TIF_FOREIGN_FPSTATE)) 131 81 fpsimd_save_state(&current->thread.fpsimd_state); 132 - if (next->mm) 133 - fpsimd_load_state(&next->thread.fpsimd_state); 82 + 83 + if (next->mm) { 84 + /* 85 + * If we are switching to a task whose most recent userland 86 + * FPSIMD state is already in the registers of *this* cpu, 87 + * we can skip loading the state from memory. Otherwise, set 88 + * the TIF_FOREIGN_FPSTATE flag so the state will be loaded 89 + * upon the next return to userland. 90 + */ 91 + struct fpsimd_state *st = &next->thread.fpsimd_state; 92 + 93 + if (__this_cpu_read(fpsimd_last_state) == st 94 + && st->cpu == smp_processor_id()) 95 + clear_ti_thread_flag(task_thread_info(next), 96 + TIF_FOREIGN_FPSTATE); 97 + else 98 + set_ti_thread_flag(task_thread_info(next), 99 + TIF_FOREIGN_FPSTATE); 100 + } 134 101 } 135 102 136 103 void fpsimd_flush_thread(void) 137 104 { 138 - preempt_disable(); 139 105 memset(&current->thread.fpsimd_state, 0, sizeof(struct fpsimd_state)); 140 - fpsimd_load_state(&current->thread.fpsimd_state); 106 + set_thread_flag(TIF_FOREIGN_FPSTATE); 107 + } 108 + 109 + /* 110 + * Save the userland FPSIMD state of 'current' to memory, but only if the state 111 + * currently held in the registers does in fact belong to 'current' 112 + */ 113 + void fpsimd_preserve_current_state(void) 114 + { 115 + preempt_disable(); 116 + if (!test_thread_flag(TIF_FOREIGN_FPSTATE)) 117 + fpsimd_save_state(&current->thread.fpsimd_state); 141 118 preempt_enable(); 119 + } 120 + 121 + /* 122 + * Load the userland FPSIMD state of 'current' from memory, but only if the 123 + * FPSIMD state already held in the registers is /not/ the most recent FPSIMD 124 + * state of 'current' 125 + */ 126 + void fpsimd_restore_current_state(void) 127 + { 128 + preempt_disable(); 129 + if (test_and_clear_thread_flag(TIF_FOREIGN_FPSTATE)) { 130 + struct fpsimd_state *st = &current->thread.fpsimd_state; 131 + 132 + fpsimd_load_state(st); 133 + this_cpu_write(fpsimd_last_state, st); 134 + st->cpu = smp_processor_id(); 135 + } 136 + preempt_enable(); 137 + } 138 + 139 + /* 140 + * Load an updated userland FPSIMD state for 'current' from memory and set the 141 + * flag that indicates that the FPSIMD register contents are the most recent 142 + * FPSIMD state of 'current' 143 + */ 144 + void fpsimd_update_current_state(struct fpsimd_state *state) 145 + { 146 + preempt_disable(); 147 + fpsimd_load_state(state); 148 + if (test_and_clear_thread_flag(TIF_FOREIGN_FPSTATE)) { 149 + struct fpsimd_state *st = &current->thread.fpsimd_state; 150 + 151 + this_cpu_write(fpsimd_last_state, st); 152 + st->cpu = smp_processor_id(); 153 + } 154 + preempt_enable(); 155 + } 156 + 157 + /* 158 + * Invalidate live CPU copies of task t's FPSIMD state 159 + */ 160 + void fpsimd_flush_task_state(struct task_struct *t) 161 + { 162 + t->thread.fpsimd_state.cpu = NR_CPUS; 142 163 } 143 164 144 165 #ifdef CONFIG_KERNEL_MODE_NEON 145 166 167 + static DEFINE_PER_CPU(struct fpsimd_partial_state, hardirq_fpsimdstate); 168 + static DEFINE_PER_CPU(struct fpsimd_partial_state, softirq_fpsimdstate); 169 + 146 170 /* 147 
171 * Kernel-side NEON support functions 148 172 */ 149 - void kernel_neon_begin(void) 173 + void kernel_neon_begin_partial(u32 num_regs) 150 174 { 151 - /* Avoid using the NEON in interrupt context */ 152 - BUG_ON(in_interrupt()); 153 - preempt_disable(); 175 + if (in_interrupt()) { 176 + struct fpsimd_partial_state *s = this_cpu_ptr( 177 + in_irq() ? &hardirq_fpsimdstate : &softirq_fpsimdstate); 154 178 155 - if (current->mm) 156 - fpsimd_save_state(&current->thread.fpsimd_state); 179 + BUG_ON(num_regs > 32); 180 + fpsimd_save_partial_state(s, roundup(num_regs, 2)); 181 + } else { 182 + /* 183 + * Save the userland FPSIMD state if we have one and if we 184 + * haven't done so already. Clear fpsimd_last_state to indicate 185 + * that there is no longer userland FPSIMD state in the 186 + * registers. 187 + */ 188 + preempt_disable(); 189 + if (current->mm && 190 + !test_and_set_thread_flag(TIF_FOREIGN_FPSTATE)) 191 + fpsimd_save_state(&current->thread.fpsimd_state); 192 + this_cpu_write(fpsimd_last_state, NULL); 193 + } 157 194 } 158 - EXPORT_SYMBOL(kernel_neon_begin); 195 + EXPORT_SYMBOL(kernel_neon_begin_partial); 159 196 160 197 void kernel_neon_end(void) 161 198 { 162 - if (current->mm) 163 - fpsimd_load_state(&current->thread.fpsimd_state); 164 - 165 - preempt_enable(); 199 + if (in_interrupt()) { 200 + struct fpsimd_partial_state *s = this_cpu_ptr( 201 + in_irq() ? &hardirq_fpsimdstate : &softirq_fpsimdstate); 202 + fpsimd_load_partial_state(s); 203 + } else { 204 + preempt_enable(); 205 + } 166 206 } 167 207 EXPORT_SYMBOL(kernel_neon_end); 168 208 ··· 268 120 { 269 121 switch (cmd) { 270 122 case CPU_PM_ENTER: 271 - if (current->mm) 123 + if (current->mm && !test_thread_flag(TIF_FOREIGN_FPSTATE)) 272 124 fpsimd_save_state(&current->thread.fpsimd_state); 273 125 break; 274 126 case CPU_PM_EXIT: 275 127 if (current->mm) 276 - fpsimd_load_state(&current->thread.fpsimd_state); 128 + set_thread_flag(TIF_FOREIGN_FPSTATE); 277 129 break; 278 130 case CPU_PM_ENTER_FAILED: 279 131 default:
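The rule all of the above implements can be stated as one predicate: this CPU's registers hold the current task's latest userland FPSIMD state exactly when the per-CPU pointer and the per-task cpu field still point at each other, and TIF_FOREIGN_FPSTATE caches the negation of that so the return-to-user path only needs a flag test. As a sketch in the code's own terms (kernel context assumed):

static bool fpsimd_regs_hold_current_state(void)
{
        struct fpsimd_state *st = &current->thread.fpsimd_state;

        return __this_cpu_read(fpsimd_last_state) == st &&
               st->cpu == smp_processor_id();
}

fpsimd_thread_switch() evaluates exactly this test to decide whether the incoming task needs TIF_FOREIGN_FPSTATE set, and fpsimd_flush_task_state() breaks the invariant deliberately by writing NR_CPUS into the cpu field.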
+1 -1
arch/arm64/kernel/process.c
···
206 206
207 207 int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
208 208 {
209 -          fpsimd_save_state(&current->thread.fpsimd_state);
209 +          fpsimd_preserve_current_state();
210 210        *dst = *src;
211 211        return 0;
212 212 }
+2
arch/arm64/kernel/ptrace.c
···
518 518        return ret;
519 519
520 520        target->thread.fpsimd_state.user_fpsimd = newstate;
521 +          fpsimd_flush_task_state(target);
521 522        return ret;
522 523 }
523 524
···
766 765                uregs->fpcr = fpscr & VFP_FPSCR_CTRL_MASK;
767 766        }
768 767
768 +          fpsimd_flush_task_state(target);
769 769        return ret;
770 770 }
771 771
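Both ptrace write paths now finish with fpsimd_flush_task_state(), so that a copy of the traced task's old FPSIMD state still sitting in some CPU's registers can no longer be mistaken for the most recent one. The condition it defeats is the reuse check in fpsimd_thread_switch() above; spelled out as a predicate (illustrative helper only, assumed to sit next to the fpsimd_last_state per-cpu variable in fpsimd.c, not present in the tree):

#include <linux/types.h>
#include <linux/percpu.h>
#include <linux/smp.h>
#include <asm/fpsimd.h>

/*
 * Illustration only: a task's state may be reused straight from the
 * registers of this CPU only if both links below still agree. Writing
 * NR_CPUS into st->cpu, as fpsimd_flush_task_state() does, makes the
 * second comparison fail on every CPU.
 */
static bool example_fpsimd_state_is_live(struct fpsimd_state *st)
{
        return __this_cpu_read(fpsimd_last_state) == st &&
               st->cpu == smp_processor_id();
}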
+7 -6
arch/arm64/kernel/signal.c
···
51 51          int err;
52 52
53 53          /* dump the hardware registers to the fpsimd_state structure */
54 -           fpsimd_save_state(fpsimd);
54 +           fpsimd_preserve_current_state();
55 55
56 56          /* copy the FP and status/control registers */
57 57          err = __copy_to_user(ctx->vregs, fpsimd->vregs, sizeof(fpsimd->vregs));
···
86 86          __get_user_error(fpsimd.fpcr, &ctx->fpcr, err);
87 87
88 88          /* load the hardware registers from the fpsimd_state structure */
89 -           if (!err) {
90 -                   preempt_disable();
91 -                   fpsimd_load_state(&fpsimd);
92 -                   preempt_enable();
93 -           }
89 +           if (!err)
90 +                   fpsimd_update_current_state(&fpsimd);
94 91
95 92          return err ? -EFAULT : 0;
96 93  }
···
430 433                clear_thread_flag(TIF_NOTIFY_RESUME);
431 434                tracehook_notify_resume(regs);
432 435        }
436 +
437 +          if (thread_flags & _TIF_FOREIGN_FPSTATE)
438 +                  fpsimd_restore_current_state();
439 +
433 440 }
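Routing the signal-frame code through the preserve/update helpers, together with the _TIF_FOREIGN_FPSTATE check in do_notify_resume(), keeps one userland-visible guarantee intact: vector register contents survive signal delivery and sigreturn, and the actual reload is simply deferred to the return-to-userland path. A small userspace sketch of that guarantee (illustration only, assuming a GCC/glibc toolchain with NEON intrinsics; not part of this merge):

#include <arm_neon.h>
#include <assert.h>
#include <signal.h>

static void handler(int sig)
{
        (void)sig;      /* the handler itself may freely use FPSIMD */
}

int main(void)
{
        uint32x4_t v = vdupq_n_u32(0x12345678);

        signal(SIGUSR1, handler);
        raise(SIGUSR1);         /* state is saved to the signal frame and restored */
        assert(vgetq_lane_u32(v, 0) == 0x12345678);
        return 0;
}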
+3 -6
arch/arm64/kernel/signal32.c
···
222 222         * Note that this also saves V16-31, which aren't visible
223 223         * in AArch32.
224 224         */
225 -           fpsimd_save_state(fpsimd);
225 +           fpsimd_preserve_current_state();
226 226
227 227        /* Place structure header on the stack */
228 228        __put_user_error(magic, &frame->magic, err);
···
285 285         * We don't need to touch the exception register, so
286 286         * reload the hardware state.
287 287         */
288 -           if (!err) {
289 -                   preempt_disable();
290 -                   fpsimd_load_state(&fpsimd);
291 -                   preempt_enable();
292 -           }
288 +           if (!err)
289 +                   fpsimd_update_current_state(&fpsimd);
293 290
294 291        return err ? -EFAULT : 0;
295 292 }
+13 -8
include/asm-generic/unaligned.h
···
4 4   /*
5 5    * This is the most generic implementation of unaligned accesses
6 6    * and should work almost anywhere.
7 -    *
8 -    * If an architecture can handle unaligned accesses in hardware,
9 -    * it may want to use the linux/unaligned/access_ok.h implementation
10 -    * instead.
11 7    */
12 8   #include <asm/byteorder.h>
13 9
10 +   /* Set by the arch if it can handle unaligned accesses in hardware. */
11 +   #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
12 +   # include <linux/unaligned/access_ok.h>
13 +   #endif
14 +
14 15  #if defined(__LITTLE_ENDIAN)
15 -   # include <linux/unaligned/le_struct.h>
16 -   # include <linux/unaligned/be_byteshift.h>
16 +   # ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
17 +   #  include <linux/unaligned/le_struct.h>
18 +   #  include <linux/unaligned/be_byteshift.h>
19 +   # endif
17 20  # include <linux/unaligned/generic.h>
18 21  # define get_unaligned __get_unaligned_le
19 22  # define put_unaligned __put_unaligned_le
20 23  #elif defined(__BIG_ENDIAN)
21 -   # include <linux/unaligned/be_struct.h>
22 -   # include <linux/unaligned/le_byteshift.h>
24 +   # ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
25 +   #  include <linux/unaligned/be_struct.h>
26 +   #  include <linux/unaligned/le_byteshift.h>
27 +   # endif
23 28  # include <linux/unaligned/generic.h>
24 29  # define get_unaligned __get_unaligned_be
25 30  # define put_unaligned __put_unaligned_be
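On an architecture that sets CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS, the generic get_unaligned()/put_unaligned() helpers now go through linux/unaligned/access_ok.h and compile down to ordinary loads and stores, instead of the packed-struct or byte-shifting fallbacks. Typical call sites look the same either way; a minimal sketch (example_read_be32 and example_read_native are invented names):

#include <linux/types.h>
#include <asm/unaligned.h>

/* Illustration only: read a 32-bit big-endian field from a buffer
 * offset that is not necessarily 4-byte aligned. */
static u32 example_read_be32(const u8 *p)
{
        return get_unaligned_be32(p);
}

/* The type-generic form uses native endianness and works equally well. */
static u32 example_read_native(const u8 *p)
{
        return get_unaligned((const u32 *)p);
}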