Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

crypto: arm64/aes-ce - implement 5 way interleave for ECB, CBC and CTR

This implements 5-way interleaving for ECB, CBC decryption and CTR,
resulting in a speedup of ~11% on Marvell ThunderX2, which has a
very deep pipeline and therefore a high issue latency for NEON
instructions operating on the same registers.

Note that XTS is left alone: implementing 5-way interleave there
would either involve spilling of the calculated tweaks to the
stack, or recalculating them after the encryption operation, and
doing either of those would most likely penalize low end cores.

For ECB, this is not a concern at all, given that we have plenty
of spare registers. For CTR and CBC decryption, we take advantage
of the fact that v16 is not used by the CE version of the code
(which is the only one targeted by the optimization), and so we
can reshuffle the code a bit and avoid having to spill to memory
(with the exception of one extra reload in the CBC routine)

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Authored by Ard Biesheuvel; committed by Herbert Xu.
7367bfeb e2174139

+75 -31
+2
arch/arm64/crypto/aes-ce.S
··· 18 18 .arch armv8-a+crypto 19 19 20 20 xtsmask .req v16 21 + cbciv .req v16 22 + vctr .req v16 21 23 22 24 .macro xts_reload_mask, tmp 23 25 .endm
+71 -31
arch/arm64/crypto/aes-modes.S
··· 17 17 #define MAX_STRIDE 4 18 18 #endif 19 19 20 + #if MAX_STRIDE == 4 21 + #define ST4(x...) x 22 + #define ST5(x...) 23 + #else 24 + #define ST4(x...) 25 + #define ST5(x...) x 26 + #endif 27 + 20 28 aes_encrypt_block4x: 21 29 encrypt_block4x v0, v1, v2, v3, w3, x2, x8, w7 22 30 ret ··· 61 53 enc_prepare w3, x2, x5 62 54 63 55 .LecbencloopNx: 64 - subs w4, w4, #4 56 + subs w4, w4, #MAX_STRIDE 65 57 bmi .Lecbenc1x 66 58 ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */ 67 - bl aes_encrypt_block4x 59 + ST4( bl aes_encrypt_block4x ) 60 + ST5( ld1 {v4.16b}, [x1], #16 ) 61 + ST5( bl aes_encrypt_block5x ) 68 62 st1 {v0.16b-v3.16b}, [x0], #64 63 + ST5( st1 {v4.16b}, [x0], #16 ) 69 64 b .LecbencloopNx 70 65 .Lecbenc1x: 71 - adds w4, w4, #4 66 + adds w4, w4, #MAX_STRIDE 72 67 beq .Lecbencout 73 68 .Lecbencloop: 74 69 ld1 {v0.16b}, [x1], #16 /* get next pt block */ ··· 92 81 dec_prepare w3, x2, x5 93 82 94 83 .LecbdecloopNx: 95 - subs w4, w4, #4 84 + subs w4, w4, #MAX_STRIDE 96 85 bmi .Lecbdec1x 97 86 ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */ 98 - bl aes_decrypt_block4x 87 + ST4( bl aes_decrypt_block4x ) 88 + ST5( ld1 {v4.16b}, [x1], #16 ) 89 + ST5( bl aes_decrypt_block5x ) 99 90 st1 {v0.16b-v3.16b}, [x0], #64 91 + ST5( st1 {v4.16b}, [x0], #16 ) 100 92 b .LecbdecloopNx 101 93 .Lecbdec1x: 102 - adds w4, w4, #4 94 + adds w4, w4, #MAX_STRIDE 103 95 beq .Lecbdecout 104 96 .Lecbdecloop: 105 97 ld1 {v0.16b}, [x1], #16 /* get next ct block */ ··· 162 148 stp x29, x30, [sp, #-16]! 
163 149 mov x29, sp 164 150 165 - ld1 {v7.16b}, [x5] /* get iv */ 151 + ld1 {cbciv.16b}, [x5] /* get iv */ 166 152 dec_prepare w3, x2, x6 167 153 168 154 .LcbcdecloopNx: 169 - subs w4, w4, #4 155 + subs w4, w4, #MAX_STRIDE 170 156 bmi .Lcbcdec1x 171 157 ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */ 158 + #if MAX_STRIDE == 5 159 + ld1 {v4.16b}, [x1], #16 /* get 1 ct block */ 160 + mov v5.16b, v0.16b 161 + mov v6.16b, v1.16b 162 + mov v7.16b, v2.16b 163 + bl aes_decrypt_block5x 164 + sub x1, x1, #32 165 + eor v0.16b, v0.16b, cbciv.16b 166 + eor v1.16b, v1.16b, v5.16b 167 + ld1 {v5.16b}, [x1], #16 /* reload 1 ct block */ 168 + ld1 {cbciv.16b}, [x1], #16 /* reload 1 ct block */ 169 + eor v2.16b, v2.16b, v6.16b 170 + eor v3.16b, v3.16b, v7.16b 171 + eor v4.16b, v4.16b, v5.16b 172 + #else 172 173 mov v4.16b, v0.16b 173 174 mov v5.16b, v1.16b 174 175 mov v6.16b, v2.16b 175 176 bl aes_decrypt_block4x 176 177 sub x1, x1, #16 177 - eor v0.16b, v0.16b, v7.16b 178 + eor v0.16b, v0.16b, cbciv.16b 178 179 eor v1.16b, v1.16b, v4.16b 179 - ld1 {v7.16b}, [x1], #16 /* reload 1 ct block */ 180 + ld1 {cbciv.16b}, [x1], #16 /* reload 1 ct block */ 180 181 eor v2.16b, v2.16b, v5.16b 181 182 eor v3.16b, v3.16b, v6.16b 183 + #endif 182 184 st1 {v0.16b-v3.16b}, [x0], #64 185 + ST5( st1 {v4.16b}, [x0], #16 ) 183 186 b .LcbcdecloopNx 184 187 .Lcbcdec1x: 185 - adds w4, w4, #4 188 + adds w4, w4, #MAX_STRIDE 186 189 beq .Lcbcdecout 187 190 .Lcbcdecloop: 188 191 ld1 {v1.16b}, [x1], #16 /* get next ct block */ 189 192 mov v0.16b, v1.16b /* ...and copy to v0 */ 190 193 decrypt_block v0, w3, x2, x6, w7 191 - eor v0.16b, v0.16b, v7.16b /* xor with iv => pt */ 192 - mov v7.16b, v1.16b /* ct is next iv */ 194 + eor v0.16b, v0.16b, cbciv.16b /* xor with iv => pt */ 195 + mov cbciv.16b, v1.16b /* ct is next iv */ 193 196 st1 {v0.16b}, [x0], #16 194 197 subs w4, w4, #1 195 198 bne .Lcbcdecloop 196 199 .Lcbcdecout: 197 - st1 {v7.16b}, [x5] /* return iv */ 200 + st1 {cbciv.16b}, [x5] /* return iv */ 
198 201 ldp x29, x30, [sp], #16 199 202 ret 200 203 AES_ENDPROC(aes_cbc_decrypt) ··· 305 274 mov x29, sp 306 275 307 276 enc_prepare w3, x2, x6 308 - ld1 {v4.16b}, [x5] 277 + ld1 {vctr.16b}, [x5] 309 278 310 - umov x6, v4.d[1] /* keep swabbed ctr in reg */ 279 + umov x6, vctr.d[1] /* keep swabbed ctr in reg */ 311 280 rev x6, x6 312 281 cmn w6, w4 /* 32 bit overflow? */ 313 282 bcs .Lctrloop 314 283 .LctrloopNx: 315 - subs w4, w4, #4 284 + subs w4, w4, #MAX_STRIDE 316 285 bmi .Lctr1x 317 286 add w7, w6, #1 318 - mov v0.16b, v4.16b 287 + mov v0.16b, vctr.16b 319 288 add w8, w6, #2 320 - mov v1.16b, v4.16b 289 + mov v1.16b, vctr.16b 321 290 add w9, w6, #3 322 - mov v2.16b, v4.16b 291 + mov v2.16b, vctr.16b 292 + add w9, w6, #3 323 293 rev w7, w7 324 - mov v3.16b, v4.16b 294 + mov v3.16b, vctr.16b 325 295 rev w8, w8 296 + ST5( mov v4.16b, vctr.16b ) 326 297 mov v1.s[3], w7 327 298 rev w9, w9 299 + ST5( add w10, w6, #4 ) 328 300 mov v2.s[3], w8 301 + ST5( rev w10, w10 ) 329 302 mov v3.s[3], w9 303 + ST5( mov v4.s[3], w10 ) 330 304 ld1 {v5.16b-v7.16b}, [x1], #48 /* get 3 input blocks */ 331 - bl aes_encrypt_block4x 305 + ST4( bl aes_encrypt_block4x ) 306 + ST5( bl aes_encrypt_block5x ) 332 307 eor v0.16b, v5.16b, v0.16b 333 - ld1 {v5.16b}, [x1], #16 /* get 1 input block */ 308 + ST4( ld1 {v5.16b}, [x1], #16 ) 334 309 eor v1.16b, v6.16b, v1.16b 310 + ST5( ld1 {v5.16b-v6.16b}, [x1], #32 ) 335 311 eor v2.16b, v7.16b, v2.16b 336 312 eor v3.16b, v5.16b, v3.16b 313 + ST5( eor v4.16b, v6.16b, v4.16b ) 337 314 st1 {v0.16b-v3.16b}, [x0], #64 338 - add x6, x6, #4 315 + ST5( st1 {v4.16b}, [x0], #16 ) 316 + add x6, x6, #MAX_STRIDE 339 317 rev x7, x6 340 - ins v4.d[1], x7 318 + ins vctr.d[1], x7 341 319 cbz w4, .Lctrout 342 320 b .LctrloopNx 343 321 .Lctr1x: 344 - adds w4, w4, #4 322 + adds w4, w4, #MAX_STRIDE 345 323 beq .Lctrout 346 324 .Lctrloop: 347 - mov v0.16b, v4.16b 325 + mov v0.16b, vctr.16b 348 326 encrypt_block v0, w3, x2, x8, w7 349 327 350 328 adds x6, x6, #1 /* 
increment BE ctr */ 351 329 rev x7, x6 352 - ins v4.d[1], x7 330 + ins vctr.d[1], x7 353 331 bcs .Lctrcarry /* overflow? */ 354 332 355 333 .Lctrcarrydone: ··· 370 330 bne .Lctrloop 371 331 372 332 .Lctrout: 373 - st1 {v4.16b}, [x5] /* return next CTR value */ 333 + st1 {vctr.16b}, [x5] /* return next CTR value */ 374 334 ldp x29, x30, [sp], #16 375 335 ret 376 336 ··· 379 339 b .Lctrout 380 340 381 341 .Lctrcarry: 382 - umov x7, v4.d[0] /* load upper word of ctr */ 342 + umov x7, vctr.d[0] /* load upper word of ctr */ 383 343 rev x7, x7 /* ... to handle the carry */ 384 344 add x7, x7, #1 385 345 rev x7, x7 386 - ins v4.d[0], x7 346 + ins vctr.d[0], x7 387 347 b .Lctrcarrydone 388 348 AES_ENDPROC(aes_ctr_encrypt) 389 349
+2
arch/arm64/crypto/aes-neon.S
··· 15 15 #define AES_ENDPROC(func) ENDPROC(neon_ ## func) 16 16 17 17 xtsmask .req v7 18 + cbciv .req v7 19 + vctr .req v4 18 20 19 21 .macro xts_reload_mask, tmp 20 22 xts_load_mask \tmp