Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

crypto: arm64/aes-neon-ctr - improve handling of single tail block

Instead of falling back to C code to do a memcpy of the output of the
last block, handle this in the asm code directly if possible, which is
the case if the entire input is longer than 16 bytes.

Cc: Nathan Huckleberry <nhuck@google.com>
Cc: Eric Biggers <ebiggers@google.com>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Authored by Ard Biesheuvel; committed by Herbert Xu.
8daa399e e236ab0d

+20 -19
+7 -14
arch/arm64/crypto/aes-glue.c
 ···
 #ifdef USE_V8_CRYPTO_EXTENSIONS
 #define MODE			"ce"
 #define PRIO			300
-#define STRIDE			5
 #define aes_expandkey		ce_aes_expandkey
 #define aes_ecb_encrypt		ce_aes_ecb_encrypt
 #define aes_ecb_decrypt		ce_aes_ecb_decrypt
 ···
 #else
 #define MODE			"neon"
 #define PRIO			200
-#define STRIDE			4
 #define aes_ecb_encrypt		neon_aes_ecb_encrypt
 #define aes_ecb_decrypt		neon_aes_ecb_decrypt
 #define aes_cbc_encrypt		neon_aes_cbc_encrypt
 ···
 				int rounds, int bytes, u8 const iv[]);

 asmlinkage void aes_ctr_encrypt(u8 out[], u8 const in[], u32 const rk[],
-				int rounds, int bytes, u8 ctr[], u8 finalbuf[]);
+				int rounds, int bytes, u8 ctr[]);

 asmlinkage void aes_xts_encrypt(u8 out[], u8 const in[], u32 const rk1[],
 				int rounds, int bytes, u32 const rk2[], u8 iv[],
 ···
 		unsigned int nbytes = walk.nbytes;
 		u8 *dst = walk.dst.virt.addr;
 		u8 buf[AES_BLOCK_SIZE];
-		unsigned int tail;

 		if (unlikely(nbytes < AES_BLOCK_SIZE))
-			src = memcpy(buf, src, nbytes);
+			src = dst = memcpy(buf + sizeof(buf) - nbytes,
+					   src, nbytes);
 		else if (nbytes < walk.total)
 			nbytes &= ~(AES_BLOCK_SIZE - 1);

 		kernel_neon_begin();
 		aes_ctr_encrypt(dst, src, ctx->key_enc, rounds, nbytes,
-				walk.iv, buf);
+				walk.iv);
 		kernel_neon_end();

-		tail = nbytes % (STRIDE * AES_BLOCK_SIZE);
-		if (tail > 0 && tail < AES_BLOCK_SIZE)
-			/*
-			 * The final partial block could not be returned using
-			 * an overlapping store, so it was passed via buf[]
-			 * instead.
-			 */
-			memcpy(dst + nbytes - tail, buf, tail);
+		if (unlikely(nbytes < AES_BLOCK_SIZE))
+			memcpy(walk.dst.virt.addr,
+			       buf + sizeof(buf) - nbytes, nbytes);

 		err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
 	}
+13 -5
arch/arm64/crypto/aes-modes.S
 ···

 /*
  * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
- *		   int bytes, u8 ctr[], u8 finalbuf[])
+ *		   int bytes, u8 ctr[])
  */

 AES_FUNC_START(aes_ctr_encrypt)
 ···
 .Lctrtail:
 	/* XOR up to MAX_STRIDE * 16 - 1 bytes of in/output with v0 ... v3/v4 */
 	mov		x16, #16
-	ands		x13, x4, #0xf
-	csel		x13, x13, x16, ne
+	ands		x6, x4, #0xf
+	csel		x13, x6, x16, ne

 ST5(	cmp		w4, #64 - (MAX_STRIDE << 4)	)
 ST5(	csel		x14, x16, xzr, gt		)
 ···
 	cmp		w4, #32 - (MAX_STRIDE << 4)
 	csel		x16, x16, xzr, gt
 	cmp		w4, #16 - (MAX_STRIDE << 4)
-	ble		.Lctrtail1x

 	adr_l		x12, .Lcts_permute_table
 	add		x12, x12, x13
+	ble		.Lctrtail1x

 ST5(	ld1		{v5.16b}, [x1], x14	)
 	ld1		{v6.16b}, [x1], x15
 ···
 	b		.Lctrout

 .Lctrtail1x:
-	csel		x0, x0, x6, eq		// use finalbuf if less than a full block
+	sub		x7, x6, #16
+	csel		x6, x6, x7, eq
+	add		x1, x1, x6
+	add		x0, x0, x6
 	ld1		{v5.16b}, [x1]
+	ld1		{v6.16b}, [x0]
 ST5(	mov		v3.16b, v4.16b		)
 	encrypt_block	v3, w3, x2, x8, w7
+	ld1		{v10.16b-v11.16b}, [x12]
+	tbl		v3.16b, {v3.16b}, v10.16b
+	sshr		v11.16b, v11.16b, #7
 	eor		v5.16b, v5.16b, v3.16b
+	bif		v5.16b, v6.16b, v11.16b
 	st1		{v5.16b}, [x0]
 	b		.Lctrout
 AES_FUNC_END(aes_ctr_encrypt)