Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

crypto: arm64/aes-ccm - Merge encrypt and decrypt tail handling

The encryption and decryption code paths are mostly identical, except
for a small difference where the plaintext input into the MAC is taken
from either the input or the output block.

We can factor this in quite easily using a vector bit select, and a few
additional XORs, without the need for branches. This way, we can use the
same tail handling logic on the encrypt and decrypt code paths, allowing
further consolidation of the asm helpers in a subsequent patch.

(In the main loop, adding just a handful of ALU instructions results in
a noticeable performance hit [around 5% on Apple M2], so those routines
are kept separate)

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Authored by Ard Biesheuvel and committed by Herbert Xu
(commit 71505288, parent 565def15).

Diffstat: +13 -13 (1 file changed)
arch/arm64/crypto/aes-ce-ccm-core.S
@@ -77,7 +77,7 @@
 	aes_encrypt	v0, v1, w4

 	subs	w2, w2, #16
-	bmi	6f			/* partial block? */
+	bmi	ce_aes_ccm_crypt_tail
 	ld1	{v2.16b}, [x1], #16	/* load next input block */
 	.if	\enc == 1
 	eor	v2.16b, v2.16b, v5.16b	/* final round enc+mac */
@@ -93,8 +93,10 @@
 	st1	{v0.16b}, [x5]		/* store mac */
 	str	x8, [x6, #8]		/* store lsb end of ctr (BE) */
 5:	ret
+	.endm

-6:	eor	v0.16b, v0.16b, v5.16b	/* final round mac */
+SYM_FUNC_START_LOCAL(ce_aes_ccm_crypt_tail)
+	eor	v0.16b, v0.16b, v5.16b	/* final round mac */
 	eor	v1.16b, v1.16b, v5.16b	/* final round enc */

 	add	x1, x1, w2, sxtw	/* rewind the input pointer (w2 < 0) */
@@ -110,17 +108,14 @@
 	ld1	{v2.16b}, [x1]		/* load a full block of input */
 	tbl	v1.16b, {v1.16b}, v7.16b /* move keystream to end of register */
-	.if	\enc == 1
-	tbl	v7.16b, {v2.16b}, v9.16b /* copy plaintext to start of v7 */
-	eor	v2.16b, v2.16b, v1.16b	/* encrypt partial input block */
-	.else
-	eor	v2.16b, v2.16b, v1.16b	/* decrypt partial input block */
-	tbl	v7.16b, {v2.16b}, v9.16b /* copy plaintext to start of v7 */
-	.endif
-	eor	v0.16b, v0.16b, v7.16b	/* fold plaintext into mac */
-	tbx	v2.16b, {v6.16b}, v8.16b /* insert output from previous iteration */
+	eor	v7.16b, v2.16b, v1.16b	/* encrypt partial input block */
+	bif	v2.16b, v7.16b, v22.16b	/* select plaintext */
+	tbx	v7.16b, {v6.16b}, v8.16b /* insert output from previous iteration */
+	tbl	v2.16b, {v2.16b}, v9.16b /* copy plaintext to start of v2 */
+	eor	v0.16b, v0.16b, v2.16b	/* fold plaintext into mac */

 	st1	{v0.16b}, [x5]		/* store mac */
-	st1	{v2.16b}, [x0]		/* store output block */
+	st1	{v7.16b}, [x0]		/* store output block */
 	ret
-	.endm
+SYM_FUNC_END(ce_aes_ccm_crypt_tail)

 /*
  * void ce_aes_ccm_encrypt(u8 out[], u8 const in[], u32 cbytes,
  *			   u8 ctr[]);
  */
@@ -132,10 +135,12 @@
 SYM_FUNC_START(ce_aes_ccm_encrypt)
+	movi	v22.16b, #255		/* select input for MAC (encrypt) */
 	aes_ccm_do_crypt	1
 SYM_FUNC_END(ce_aes_ccm_encrypt)

 SYM_FUNC_START(ce_aes_ccm_decrypt)
+	movi	v22.16b, #0		/* select output for MAC (decrypt) */
 	aes_ccm_do_crypt	0
 SYM_FUNC_END(ce_aes_ccm_decrypt)