Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

crypto: arm64/aes-neonbs-xts - use plain NEON for non-power-of-2 input sizes

Even though the kernel's implementations of AES-XTS were updated to
implement ciphertext stealing and can operate on inputs of any size
larger than or equal to the AES block size, this feature is rarely used
in practice.

In fact, in the kernel, AES-XTS is only used to operate on 4096 or 512
byte blocks, which means that not only is the ciphertext stealing
effectively dead code, but the logic in the bit sliced NEON
implementation to deal with fewer than 8 blocks at a time is also never
used.

Since the bit-sliced NEON driver already depends on the plain NEON
version, which is slower but can operate on smaller data quantities more
straightforwardly, let's fall back to the plain NEON implementation of
XTS for any residual inputs that are not multiples of 128 bytes. This
allows us to remove a lot of complicated logic that rarely gets
exercised in practice.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

authored by

Ard Biesheuvel and committed by
Herbert Xu
dfc6031e fc074e13

+57 -108
+40 -92
arch/arm64/crypto/aes-neonbs-core.S
··· 735 735 * int blocks, u8 iv[]) 736 736 */ 737 737 SYM_FUNC_START_LOCAL(__xts_crypt8) 738 - mov x6, #1 739 - lsl x6, x6, x23 740 - subs w23, w23, #8 741 - csel x23, x23, xzr, pl 742 - csel x6, x6, xzr, mi 738 + movi v18.2s, #0x1 739 + movi v19.2s, #0x87 740 + uzp1 v18.4s, v18.4s, v19.4s 743 741 744 - ld1 {v0.16b}, [x20], #16 745 - next_tweak v26, v25, v30, v31 742 + ld1 {v0.16b-v3.16b}, [x1], #64 743 + ld1 {v4.16b-v7.16b}, [x1], #64 744 + 745 + next_tweak v26, v25, v18, v19 746 + next_tweak v27, v26, v18, v19 747 + next_tweak v28, v27, v18, v19 748 + next_tweak v29, v28, v18, v19 749 + next_tweak v30, v29, v18, v19 750 + next_tweak v31, v30, v18, v19 751 + next_tweak v16, v31, v18, v19 752 + next_tweak v17, v16, v18, v19 753 + 746 754 eor v0.16b, v0.16b, v25.16b 747 - tbnz x6, #1, 0f 748 - 749 - ld1 {v1.16b}, [x20], #16 750 - next_tweak v27, v26, v30, v31 751 755 eor v1.16b, v1.16b, v26.16b 752 - tbnz x6, #2, 0f 753 - 754 - ld1 {v2.16b}, [x20], #16 755 - next_tweak v28, v27, v30, v31 756 756 eor v2.16b, v2.16b, v27.16b 757 - tbnz x6, #3, 0f 758 - 759 - ld1 {v3.16b}, [x20], #16 760 - next_tweak v29, v28, v30, v31 761 757 eor v3.16b, v3.16b, v28.16b 762 - tbnz x6, #4, 0f 763 - 764 - ld1 {v4.16b}, [x20], #16 765 - str q29, [sp, #.Lframe_local_offset] 766 758 eor v4.16b, v4.16b, v29.16b 767 - next_tweak v29, v29, v30, v31 768 - tbnz x6, #5, 0f 759 + eor v5.16b, v5.16b, v30.16b 760 + eor v6.16b, v6.16b, v31.16b 761 + eor v7.16b, v7.16b, v16.16b 769 762 770 - ld1 {v5.16b}, [x20], #16 771 - str q29, [sp, #.Lframe_local_offset + 16] 772 - eor v5.16b, v5.16b, v29.16b 773 - next_tweak v29, v29, v30, v31 774 - tbnz x6, #6, 0f 763 + stp q16, q17, [sp, #16] 775 764 776 - ld1 {v6.16b}, [x20], #16 777 - str q29, [sp, #.Lframe_local_offset + 32] 778 - eor v6.16b, v6.16b, v29.16b 779 - next_tweak v29, v29, v30, v31 780 - tbnz x6, #7, 0f 781 - 782 - ld1 {v7.16b}, [x20], #16 783 - str q29, [sp, #.Lframe_local_offset + 48] 784 - eor v7.16b, v7.16b, v29.16b 785 - next_tweak v29, 
v29, v30, v31 786 - 787 - 0: mov bskey, x21 788 - mov rounds, x22 765 + mov bskey, x2 766 + mov rounds, x3 789 767 br x16 790 768 SYM_FUNC_END(__xts_crypt8) 791 769 792 770 .macro __xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7 793 - frame_push 6, 64 771 + stp x29, x30, [sp, #-48]! 772 + mov x29, sp 794 773 795 - mov x19, x0 796 - mov x20, x1 797 - mov x21, x2 798 - mov x22, x3 799 - mov x23, x4 800 - mov x24, x5 774 + ld1 {v25.16b}, [x5] 801 775 802 - movi v30.2s, #0x1 803 - movi v25.2s, #0x87 804 - uzp1 v30.4s, v30.4s, v25.4s 805 - ld1 {v25.16b}, [x24] 806 - 807 - 99: adr x16, \do8 776 + 0: adr x16, \do8 808 777 bl __xts_crypt8 809 778 810 - ldp q16, q17, [sp, #.Lframe_local_offset] 811 - ldp q18, q19, [sp, #.Lframe_local_offset + 32] 779 + eor v16.16b, \o0\().16b, v25.16b 780 + eor v17.16b, \o1\().16b, v26.16b 781 + eor v18.16b, \o2\().16b, v27.16b 782 + eor v19.16b, \o3\().16b, v28.16b 812 783 813 - eor \o0\().16b, \o0\().16b, v25.16b 814 - eor \o1\().16b, \o1\().16b, v26.16b 815 - eor \o2\().16b, \o2\().16b, v27.16b 816 - eor \o3\().16b, \o3\().16b, v28.16b 784 + ldp q24, q25, [sp, #16] 817 785 818 - st1 {\o0\().16b}, [x19], #16 819 - mov v25.16b, v26.16b 820 - tbnz x6, #1, 1f 821 - st1 {\o1\().16b}, [x19], #16 822 - mov v25.16b, v27.16b 823 - tbnz x6, #2, 1f 824 - st1 {\o2\().16b}, [x19], #16 825 - mov v25.16b, v28.16b 826 - tbnz x6, #3, 1f 827 - st1 {\o3\().16b}, [x19], #16 828 - mov v25.16b, v29.16b 829 - tbnz x6, #4, 1f 786 + eor v20.16b, \o4\().16b, v29.16b 787 + eor v21.16b, \o5\().16b, v30.16b 788 + eor v22.16b, \o6\().16b, v31.16b 789 + eor v23.16b, \o7\().16b, v24.16b 830 790 831 - eor \o4\().16b, \o4\().16b, v16.16b 832 - eor \o5\().16b, \o5\().16b, v17.16b 833 - eor \o6\().16b, \o6\().16b, v18.16b 834 - eor \o7\().16b, \o7\().16b, v19.16b 791 + st1 {v16.16b-v19.16b}, [x0], #64 792 + st1 {v20.16b-v23.16b}, [x0], #64 835 793 836 - st1 {\o4\().16b}, [x19], #16 837 - tbnz x6, #5, 1f 838 - st1 {\o5\().16b}, [x19], #16 839 - tbnz x6, #6, 1f 840 - st1 
{\o6\().16b}, [x19], #16 841 - tbnz x6, #7, 1f 842 - st1 {\o7\().16b}, [x19], #16 794 + subs x4, x4, #8 795 + b.gt 0b 843 796 844 - cbz x23, 1f 845 - st1 {v25.16b}, [x24] 846 - 847 - b 99b 848 - 849 - 1: st1 {v25.16b}, [x24] 850 - frame_pop 797 + st1 {v25.16b}, [x5] 798 + ldp x29, x30, [sp], #48 851 799 ret 852 800 .endm 853 801
+17 -16
arch/arm64/crypto/aes-neonbs-glue.c
··· 302 302 return err; 303 303 304 304 while (walk.nbytes >= AES_BLOCK_SIZE) { 305 - unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE; 306 - 307 - if (walk.nbytes < walk.total || walk.nbytes % AES_BLOCK_SIZE) 308 - blocks = round_down(blocks, 309 - walk.stride / AES_BLOCK_SIZE); 310 - 305 + int blocks = (walk.nbytes / AES_BLOCK_SIZE) & ~7; 311 306 out = walk.dst.virt.addr; 312 307 in = walk.src.virt.addr; 313 308 nbytes = walk.nbytes; 314 309 315 310 kernel_neon_begin(); 316 - if (likely(blocks > 6)) { /* plain NEON is faster otherwise */ 317 - if (first) 311 + if (blocks >= 8) { 312 + if (first == 1) 318 313 neon_aes_ecb_encrypt(walk.iv, walk.iv, 319 314 ctx->twkey, 320 315 ctx->key.rounds, 1); 321 - first = 0; 316 + first = 2; 322 317 323 318 fn(out, in, ctx->key.rk, ctx->key.rounds, blocks, 324 319 walk.iv); ··· 322 327 in += blocks * AES_BLOCK_SIZE; 323 328 nbytes -= blocks * AES_BLOCK_SIZE; 324 329 } 325 - 326 - if (walk.nbytes == walk.total && nbytes > 0) 327 - goto xts_tail; 328 - 330 + if (walk.nbytes == walk.total && nbytes > 0) { 331 + if (encrypt) 332 + neon_aes_xts_encrypt(out, in, ctx->cts.key_enc, 333 + ctx->key.rounds, nbytes, 334 + ctx->twkey, walk.iv, first); 335 + else 336 + neon_aes_xts_decrypt(out, in, ctx->cts.key_dec, 337 + ctx->key.rounds, nbytes, 338 + ctx->twkey, walk.iv, first); 339 + nbytes = first = 0; 340 + } 329 341 kernel_neon_end(); 330 342 err = skcipher_walk_done(&walk, nbytes); 331 343 } ··· 357 355 nbytes = walk.nbytes; 358 356 359 357 kernel_neon_begin(); 360 - xts_tail: 361 358 if (encrypt) 362 359 neon_aes_xts_encrypt(out, in, ctx->cts.key_enc, ctx->key.rounds, 363 - nbytes, ctx->twkey, walk.iv, first ?: 2); 360 + nbytes, ctx->twkey, walk.iv, first); 364 361 else 365 362 neon_aes_xts_decrypt(out, in, ctx->cts.key_dec, ctx->key.rounds, 366 - nbytes, ctx->twkey, walk.iv, first ?: 2); 363 + nbytes, ctx->twkey, walk.iv, first); 367 364 kernel_neon_end(); 368 365 369 366 return skcipher_walk_done(&walk, 0);