Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

crypto: arm64/aes-ccm - Cache round keys and unroll AES loops

The CCM code as originally written attempted to use as few NEON
registers as possible, to avoid having to eagerly preserve/restore the
entire NEON register file at every call to kernel_neon_begin/end. At
that time, this API took a number of NEON registers as a parameter, and
only preserved that many registers.

Today, the NEON register file is restored lazily, and the old API is
long gone. This means we can use as many NEON registers as we can make
meaningful use of, which means in the AES case that we can keep all
round keys in registers rather than reloading each of them for each AES
block processed.

On Cortex-A53, this results in a speedup of more than 50% (from 4
cycles per byte to 2.6 cycles per byte).

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Authored by Ard Biesheuvel; committed by Herbert Xu.
565def15 948ffc66

+38 -57
+38 -57
arch/arm64/crypto/aes-ce-ccm-core.S
···
 14  14	.text
 15  15	.arch	armv8-a+crypto
 16  16
     17	+	.macro	load_round_keys, rk, nr, tmp
     18	+	sub	w\tmp, \nr, #10
     19	+	add	\tmp, \rk, w\tmp, sxtw #4
     20	+	ld1	{v10.4s-v13.4s}, [\rk]
     21	+	ld1	{v14.4s-v17.4s}, [\tmp], #64
     22	+	ld1	{v18.4s-v21.4s}, [\tmp], #64
     23	+	ld1	{v3.4s-v5.4s}, [\tmp]
     24	+	.endm
     25	+
     26	+	.macro	dround, va, vb, vk
     27	+	aese	\va\().16b, \vk\().16b
     28	+	aesmc	\va\().16b, \va\().16b
     29	+	aese	\vb\().16b, \vk\().16b
     30	+	aesmc	\vb\().16b, \vb\().16b
     31	+	.endm
     32	+
     33	+	.macro	aes_encrypt, va, vb, nr
     34	+	tbz	\nr, #2, .L\@
     35	+	dround	\va, \vb, v10
     36	+	dround	\va, \vb, v11
     37	+	tbz	\nr, #1, .L\@
     38	+	dround	\va, \vb, v12
     39	+	dround	\va, \vb, v13
     40	+.L\@:	.irp	v, v14, v15, v16, v17, v18, v19, v20, v21, v3
     41	+	dround	\va, \vb, \v
     42	+	.endr
     43	+	aese	\va\().16b, v4.16b
     44	+	aese	\vb\().16b, v4.16b
     45	+	.endm
     46	+
 17  47	/*
 18  48	 * void ce_aes_ccm_final(u8 mac[], u8 const ctr[], u8 const rk[],
 19  49	 *                       u32 rounds);
 20  50	 */
 21  51	SYM_FUNC_START(ce_aes_ccm_final)
 22	-	ld1	{v3.4s}, [x2], #16	/* load first round key */
 23  52	ld1	{v0.16b}, [x0]		/* load mac */
 24	-	cmp	w3, #12			/* which key size? */
 25	-	sub	w3, w3, #2		/* modified # of rounds */
 26  53	ld1	{v1.16b}, [x1]		/* load 1st ctriv */
 27	-	bmi	0f
 28	-	bne	3f
 29	-	mov	v5.16b, v3.16b
 30	-	b	2f
 31	-0:	mov	v4.16b, v3.16b
 32	-1:	ld1	{v5.4s}, [x2], #16	/* load next round key */
 33	-	aese	v0.16b, v4.16b
 34	-	aesmc	v0.16b, v0.16b
 35	-	aese	v1.16b, v4.16b
 36	-	aesmc	v1.16b, v1.16b
 37	-2:	ld1	{v3.4s}, [x2], #16	/* load next round key */
 38	-	aese	v0.16b, v5.16b
 39	-	aesmc	v0.16b, v0.16b
 40	-	aese	v1.16b, v5.16b
 41	-	aesmc	v1.16b, v1.16b
 42	-3:	ld1	{v4.4s}, [x2], #16	/* load next round key */
 43	-	subs	w3, w3, #3
 44	-	aese	v0.16b, v3.16b
 45	-	aesmc	v0.16b, v0.16b
 46	-	aese	v1.16b, v3.16b
 47	-	aesmc	v1.16b, v1.16b
 48	-	bpl	1b
 49	-	aese	v0.16b, v4.16b
 50	-	aese	v1.16b, v4.16b
     54	+
     55	+	aes_encrypt v0, v1, w3
     56	+
 51  57	/* final round key cancels out */
 52  58	eor	v0.16b, v0.16b, v1.16b	/* en-/decrypt the mac */
 53  59	st1	{v0.16b}, [x0]		/* store result */
···
 61  55	SYM_FUNC_END(ce_aes_ccm_final)
 62  56
 63  57	.macro	aes_ccm_do_crypt,enc
     58	+	load_round_keys	x3, w4, x10
     59	+
 64  60	cbz	x2, 5f
 65  61	ldr	x8, [x6, #8]		/* load lower ctr */
 66  62	ld1	{v0.16b}, [x5]		/* load mac */
···
 72  64	prfm	pldl1strm, [x1]
 73  65	add	x8, x8, #1
 74  66	rev	x9, x8
 75	-	cmp	w4, #12			/* which key size? */
 76	-	sub	w7, w4, #2		/* get modified # of rounds */
 77  67	ins	v1.d[1], x9		/* no carry in lower ctr */
 78	-	ld1	{v3.4s}, [x3]		/* load first round key */
 79	-	add	x10, x3, #16
 80	-	bmi	1f
 81	-	bne	4f
 82	-	mov	v5.16b, v3.16b
 83	-	b	3f
 84	-1:	mov	v4.16b, v3.16b
 85	-	ld1	{v5.4s}, [x10], #16	/* load 2nd round key */
 86	-2:	/* inner loop: 3 rounds, 2x interleaved */
 87	-	aese	v0.16b, v4.16b
 88	-	aesmc	v0.16b, v0.16b
 89	-	aese	v1.16b, v4.16b
 90	-	aesmc	v1.16b, v1.16b
 91	-3:	ld1	{v3.4s}, [x10], #16	/* load next round key */
 92	-	aese	v0.16b, v5.16b
 93	-	aesmc	v0.16b, v0.16b
 94	-	aese	v1.16b, v5.16b
 95	-	aesmc	v1.16b, v1.16b
 96	-4:	ld1	{v4.4s}, [x10], #16	/* load next round key */
 97	-	subs	w7, w7, #3
 98	-	aese	v0.16b, v3.16b
 99	-	aesmc	v0.16b, v0.16b
100	-	aese	v1.16b, v3.16b
101	-	aesmc	v1.16b, v1.16b
102	-	ld1	{v5.4s}, [x10], #16	/* load next round key */
103	-	bpl	2b
104	-	aese	v0.16b, v4.16b
105	-	aese	v1.16b, v4.16b
     68	+
     69	+	aes_encrypt v0, v1, w4
     70	+
106  71	subs	w2, w2, #16
107  72	bmi	6f			/* partial block? */
108  73	ld1	{v2.16b}, [x1], #16	/* load next input block */