Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

crypto: arm64/aes-ce - Simplify round key load sequence

Tweak the round key logic so that they can be loaded using a single
branchless sequence using overlapping loads. This is shorter and
simpler, and puts the conditional branches based on the key size further
apart, which might benefit microarchitectures that cannot record taken
branches at every instruction. For these branches, use test-bit-branch
instructions that don't clobber the condition flags.

Note that none of this has any impact on performance, positive or
otherwise (and the branch prediction benefit would only benefit AES-192
which nobody uses). It does make for nicer code, though.

While at it, use \@ to generate the labels inside the macros, which is
more robust than using fixed numbers, which could clash inadvertently.
Also, bring aes-neon.S in line with these changes, including the switch
to test-and-branch instructions, to avoid surprises in the future when
we might start relying on the condition flags being preserved in the
chaining mode wrappers in aes-modes.S

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Reviewed-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

authored by

Ard Biesheuvel and committed by
Herbert Xu
571e557c 3f4d1482

+24 -30
+14 -20
arch/arm64/crypto/aes-ce.S
··· 25 25 .endm 26 26 27 27 /* preload all round keys */ 28 - .macro load_round_keys, rounds, rk 29 - cmp \rounds, #12 30 - blo 2222f /* 128 bits */ 31 - beq 1111f /* 192 bits */ 32 - ld1 {v17.4s-v18.4s}, [\rk], #32 33 - 1111: ld1 {v19.4s-v20.4s}, [\rk], #32 34 - 2222: ld1 {v21.4s-v24.4s}, [\rk], #64 35 - ld1 {v25.4s-v28.4s}, [\rk], #64 36 - ld1 {v29.4s-v31.4s}, [\rk] 28 + .macro load_round_keys, rk, nr, tmp 29 + add \tmp, \rk, \nr, sxtw #4 30 + sub \tmp, \tmp, #160 31 + ld1 {v17.4s-v20.4s}, [\rk] 32 + ld1 {v21.4s-v24.4s}, [\tmp], #64 33 + ld1 {v25.4s-v28.4s}, [\tmp], #64 34 + ld1 {v29.4s-v31.4s}, [\tmp] 37 35 .endm 38 36 39 37 /* prepare for encryption with key in rk[] */ 40 38 .macro enc_prepare, rounds, rk, temp 41 - mov \temp, \rk 42 - load_round_keys \rounds, \temp 39 + load_round_keys \rk, \rounds, \temp 43 40 .endm 44 41 45 42 /* prepare for encryption (again) but with new key in rk[] */ 46 43 .macro enc_switch_key, rounds, rk, temp 47 - mov \temp, \rk 48 - load_round_keys \rounds, \temp 44 + load_round_keys \rk, \rounds, \temp 49 45 .endm 50 46 51 47 /* prepare for decryption with key in rk[] */ 52 48 .macro dec_prepare, rounds, rk, temp 53 - mov \temp, \rk 54 - load_round_keys \rounds, \temp 49 + load_round_keys \rk, \rounds, \temp 55 50 .endm 56 51 57 52 .macro do_enc_Nx, de, mc, k, i0, i1, i2, i3, i4 ··· 105 110 106 111 /* up to 5 interleaved blocks */ 107 112 .macro do_block_Nx, enc, rounds, i0, i1, i2, i3, i4 108 - cmp \rounds, #12 109 - blo 2222f /* 128 bits */ 110 - beq 1111f /* 192 bits */ 113 + tbz \rounds, #2, .L\@ /* 128 bits */ 111 114 round_Nx \enc, v17, \i0, \i1, \i2, \i3, \i4 112 115 round_Nx \enc, v18, \i0, \i1, \i2, \i3, \i4 113 - 1111: round_Nx \enc, v19, \i0, \i1, \i2, \i3, \i4 116 + tbz \rounds, #1, .L\@ /* 192 bits */ 117 + round_Nx \enc, v19, \i0, \i1, \i2, \i3, \i4 114 118 round_Nx \enc, v20, \i0, \i1, \i2, \i3, \i4 115 - 2222: .irp key, v21, v22, v23, v24, v25, v26, v27, v28, v29 119 + .L\@: .irp key, v21, v22, v23, v24, v25, v26, v27, v28, v29 116 120 round_Nx \enc, \key, \i0, \i1, \i2, \i3, \i4 117 121 .endr 118 122 fin_round_Nx \enc, v30, v31, \i0, \i1, \i2, \i3, \i4
+10 -10
arch/arm64/crypto/aes-neon.S
··· 99 99 ld1 {v15.4s}, [\rk] 100 100 add \rkp, \rk, #16 101 101 mov \i, \rounds 102 - 1111: eor \in\().16b, \in\().16b, v15.16b /* ^round key */ 102 + .La\@: eor \in\().16b, \in\().16b, v15.16b /* ^round key */ 103 103 movi v15.16b, #0x40 104 104 tbl \in\().16b, {\in\().16b}, v13.16b /* ShiftRows */ 105 105 sub_bytes \in 106 - subs \i, \i, #1 106 + sub \i, \i, #1 107 107 ld1 {v15.4s}, [\rkp], #16 108 - beq 2222f 108 + cbz \i, .Lb\@ 109 109 mix_columns \in, \enc 110 - b 1111b 111 - 2222: eor \in\().16b, \in\().16b, v15.16b /* ^round key */ 110 + b .La\@ 111 + .Lb\@: eor \in\().16b, \in\().16b, v15.16b /* ^round key */ 112 112 .endm 113 113 114 114 .macro encrypt_block, in, rounds, rk, rkp, i ··· 206 206 ld1 {v15.4s}, [\rk] 207 207 add \rkp, \rk, #16 208 208 mov \i, \rounds 209 - 1111: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */ 209 + .La\@: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */ 210 210 eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */ 211 211 eor \in2\().16b, \in2\().16b, v15.16b /* ^round key */ 212 212 eor \in3\().16b, \in3\().16b, v15.16b /* ^round key */ ··· 216 216 tbl \in2\().16b, {\in2\().16b}, v13.16b /* ShiftRows */ 217 217 tbl \in3\().16b, {\in3\().16b}, v13.16b /* ShiftRows */ 218 218 sub_bytes_4x \in0, \in1, \in2, \in3 219 - subs \i, \i, #1 219 + sub \i, \i, #1 220 220 ld1 {v15.4s}, [\rkp], #16 221 - beq 2222f 221 + cbz \i, .Lb\@ 222 222 mix_columns_2x \in0, \in1, \enc 223 223 mix_columns_2x \in2, \in3, \enc 224 - b 1111b 225 - 2222: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */ 224 + b .La\@ 225 + .Lb\@: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */ 226 226 eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */ 227 227 eor \in2\().16b, \in2\().16b, v15.16b /* ^round key */ 228 228 eor \in3\().16b, \in3\().16b, v15.16b /* ^round key */