crypto: arm/aes-neonbs-ctr - deal with non-multiples of AES block size

Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

kernel os linux

Instead of falling back to C code to deal with the final part of the input
that is not a whole multiple of the block size, handle this in the asm
code, permitting us to use overlapping loads and stores for performance,
and implement the 16-byte wide XOR using a single NEON instruction.

Since NEON loads and stores have a natural width of 16 bytes, we need to
handle inputs of less than 16 bytes in a special way, but this rarely
occurs in practice so it does not impact performance. All other input
sizes can be consumed directly by the NEON asm code, although it should
be noted that the core AES transform can still only process 128 bytes (8
AES blocks) at a time.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

authored by

Ard Biesheuvel and committed by

Herbert Xu 4 years ago c8bf850e 8daa399e

+82 -68

2 changed files

expand all

arch

arm

crypto

aes-neonbs-core.S

aes-neonbs-glue.c

+68 -47

arch/arm/crypto/aes-neonbs-core.S

··· 758 758 ENDPROC(aesbs_cbc_decrypt) 759 759 760 760 .macro next_ctr, q 761 - vmov.32 \q\()h[1], r10 761 + vmov \q\()h, r9, r10 762 762 adds r10, r10, #1 763 - vmov.32 \q\()h[0], r9 764 763 adcs r9, r9, #0 765 - vmov.32 \q\()l[1], r8 764 + vmov \q\()l, r7, r8 766 765 adcs r8, r8, #0 767 - vmov.32 \q\()l[0], r7 768 766 adc r7, r7, #0 769 767 vrev32.8 \q, \q 770 768 .endm 771 769 772 770 /* 773 771 * aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], 774 - * int rounds, int blocks, u8 ctr[], u8 final[]) 772 + * int rounds, int bytes, u8 ctr[]) 775 773 */ 776 774 ENTRY(aesbs_ctr_encrypt) 777 775 mov ip, sp 778 776 push {r4-r10, lr} 779 777 780 - ldm ip, {r5-r7} // load args 4-6 781 - teq r7, #0 782 - addne r5, r5, #1 // one extra block if final != 0 783 - 778 + ldm ip, {r5, r6} // load args 4-5 784 779 vld1.8 {q0}, [r6] // load counter 785 780 vrev32.8 q1, q0 786 781 vmov r9, r10, d3 ··· 787 792 adc r7, r7, #0 788 793 789 794 99: vmov q1, q0 790 - vmov q2, q0 791 - vmov q3, q0 792 - vmov q4, q0 793 - vmov q5, q0 794 - vmov q6, q0 795 - vmov q7, q0 796 - 797 - adr ip, 0f 798 795 sub lr, r5, #1 799 - and lr, lr, #7 800 - cmp r5, #8 801 - sub ip, ip, lr, lsl #5 802 - sub ip, ip, lr, lsl #2 803 - movlt pc, ip // computed goto if blocks < 8 796 + vmov q2, q0 797 + adr ip, 0f 798 + vmov q3, q0 799 + and lr, lr, #112 800 + vmov q4, q0 801 + cmp r5, #112 802 + vmov q5, q0 803 + sub ip, ip, lr, lsl #1 804 + vmov q6, q0 805 + add ip, ip, lr, lsr #2 806 + vmov q7, q0 807 + movle pc, ip // computed goto if bytes < 112 804 808 805 809 next_ctr q1 806 810 next_ctr q2 ··· 814 820 bl aesbs_encrypt8 815 821 816 822 adr ip, 1f 817 - and lr, r5, #7 818 - cmp r5, #8 819 - movgt r4, #0 820 - ldrle r4, [sp, #40] // load final in the last round 821 - sub ip, ip, lr, lsl #2 822 - movlt pc, ip // computed goto if blocks < 8 823 + sub lr, r5, #1 824 + cmp r5, #128 825 + bic lr, lr, #15 826 + ands r4, r5, #15 // preserves C flag 827 + teqcs r5, r5 // set Z flag if not last iteration 
828 + sub ip, ip, lr, lsr #2 829 + rsb r4, r4, #16 830 + movcc pc, ip // computed goto if bytes < 128 823 831 824 832 vld1.8 {q8}, [r1]! 825 833 vld1.8 {q9}, [r1]! ··· 830 834 vld1.8 {q12}, [r1]! 831 835 vld1.8 {q13}, [r1]! 832 836 vld1.8 {q14}, [r1]! 833 - teq r4, #0 // skip last block if 'final' 834 - 1: bne 2f 837 + 1: subne r1, r1, r4 835 838 vld1.8 {q15}, [r1]! 836 839 837 - 2: adr ip, 3f 838 - cmp r5, #8 839 - sub ip, ip, lr, lsl #3 840 - movlt pc, ip // computed goto if blocks < 8 840 + add ip, ip, #2f - 1b 841 841 842 842 veor q0, q0, q8 843 - vst1.8 {q0}, [r0]! 844 843 veor q1, q1, q9 845 - vst1.8 {q1}, [r0]! 846 844 veor q4, q4, q10 847 - vst1.8 {q4}, [r0]! 848 845 veor q6, q6, q11 849 - vst1.8 {q6}, [r0]! 850 846 veor q3, q3, q12 851 - vst1.8 {q3}, [r0]! 852 847 veor q7, q7, q13 853 - vst1.8 {q7}, [r0]! 854 848 veor q2, q2, q14 849 + bne 3f 850 + veor q5, q5, q15 851 + 852 + movcc pc, ip // computed goto if bytes < 128 853 + 854 + vst1.8 {q0}, [r0]! 855 + vst1.8 {q1}, [r0]! 856 + vst1.8 {q4}, [r0]! 857 + vst1.8 {q6}, [r0]! 858 + vst1.8 {q3}, [r0]! 859 + vst1.8 {q7}, [r0]! 855 860 vst1.8 {q2}, [r0]! 856 - teq r4, #0 // skip last block if 'final' 857 - W(bne) 5f 858 - 3: veor q5, q5, q15 861 + 2: subne r0, r0, r4 859 862 vst1.8 {q5}, [r0]! 
860 863 861 - 4: next_ctr q0 864 + next_ctr q0 862 865 863 - subs r5, r5, #8 866 + subs r5, r5, #128 864 867 bgt 99b 865 868 866 869 vst1.8 {q0}, [r6] 867 870 pop {r4-r10, pc} 868 871 869 - 5: vst1.8 {q5}, [r4] 870 - b 4b 872 + 3: adr lr, .Lpermute_table + 16 873 + cmp r5, #16 // Z flag remains cleared 874 + sub lr, lr, r4 875 + vld1.8 {q8-q9}, [lr] 876 + vtbl.8 d16, {q5}, d16 877 + vtbl.8 d17, {q5}, d17 878 + veor q5, q8, q15 879 + bcc 4f // have to reload prev if R5 < 16 880 + vtbx.8 d10, {q2}, d18 881 + vtbx.8 d11, {q2}, d19 882 + mov pc, ip // branch back to VST sequence 883 + 884 + 4: sub r0, r0, r4 885 + vshr.s8 q9, q9, #7 // create mask for VBIF 886 + vld1.8 {q8}, [r0] // reload 887 + vbif q5, q8, q9 888 + vst1.8 {q5}, [r0] 889 + pop {r4-r10, pc} 871 890 ENDPROC(aesbs_ctr_encrypt) 891 + 892 + .align 6 893 + .Lpermute_table: 894 + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff 895 + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff 896 + .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 897 + .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f 898 + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff 899 + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff 872 900 873 901 .macro next_tweak, out, in, const, tmp 874 902 vshr.s64 \tmp, \in, #63 ··· 908 888 * aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, 909 889 * int blocks, u8 iv[], int reorder_last_tweak) 910 890 */ 891 + .align 6 911 892 __xts_prepare8: 912 893 vld1.8 {q14}, [r7] // load iv 913 894 vmov.i32 d30, #0x87 // compose tweak mask vector

+14 -21

arch/arm/crypto/aes-neonbs-glue.c

··· 37 37 int rounds, int blocks, u8 iv[]); 38 38 39 39 asmlinkage void aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], 40 - int rounds, int blocks, u8 ctr[], u8 final[]); 40 + int rounds, int blocks, u8 ctr[]); 41 41 42 42 asmlinkage void aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], 43 43 int rounds, int blocks, u8 iv[], int); ··· 243 243 err = skcipher_walk_virt(&walk, req, false); 244 244 245 245 while (walk.nbytes > 0) { 246 - unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE; 247 - u8 *final = (walk.total % AES_BLOCK_SIZE) ? buf : NULL; 246 + const u8 *src = walk.src.virt.addr; 247 + u8 *dst = walk.dst.virt.addr; 248 + int bytes = walk.nbytes; 248 249 249 - if (walk.nbytes < walk.total) { 250 - blocks = round_down(blocks, 251 - walk.stride / AES_BLOCK_SIZE); 252 - final = NULL; 253 - } 250 + if (unlikely(bytes < AES_BLOCK_SIZE)) 251 + src = dst = memcpy(buf + sizeof(buf) - bytes, 252 + src, bytes); 253 + else if (walk.nbytes < walk.total) 254 + bytes &= ~(8 * AES_BLOCK_SIZE - 1); 254 255 255 256 kernel_neon_begin(); 256 - aesbs_ctr_encrypt(walk.dst.virt.addr, walk.src.virt.addr, 257 - ctx->rk, ctx->rounds, blocks, walk.iv, final); 257 + aesbs_ctr_encrypt(dst, src, ctx->rk, ctx->rounds, bytes, walk.iv); 258 258 kernel_neon_end(); 259 259 260 - if (final) { 261 - u8 *dst = walk.dst.virt.addr + blocks * AES_BLOCK_SIZE; 262 - u8 *src = walk.src.virt.addr + blocks * AES_BLOCK_SIZE; 260 + if (unlikely(bytes < AES_BLOCK_SIZE)) 261 + memcpy(walk.dst.virt.addr, 262 + buf + sizeof(buf) - bytes, bytes); 263 263 264 - crypto_xor_cpy(dst, src, final, 265 - walk.total % AES_BLOCK_SIZE); 266 - 267 - err = skcipher_walk_done(&walk, 0); 268 - break; 269 - } 270 - err = skcipher_walk_done(&walk, 271 - walk.nbytes - blocks * AES_BLOCK_SIZE); 264 + err = skcipher_walk_done(&walk, walk.nbytes - bytes); 272 265 } 273 266 274 267 return err;