Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

crypto: arm64/aes-neonbs-ctr - fallback to plain NEON for final chunk

Instead of processing the entire input with the 8-way bit sliced
algorithm, which is sub-optimal for inputs that are not a multiple of
128 bytes in size, invoke the plain NEON version of CTR for the
remainder of the input after processing the bulk using 128 byte strides.

This allows us to greatly simplify the asm code that implements CTR, and
get rid of all the branches and special code paths. It also gains us a
couple of percent of performance.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Authored by Ard Biesheuvel; committed by Herbert Xu.
fc074e13 c8bf850e

+55 -142
+1
arch/arm64/crypto/aes-glue.c
··· 976 976 module_init(aes_init); 977 977 EXPORT_SYMBOL(neon_aes_ecb_encrypt); 978 978 EXPORT_SYMBOL(neon_aes_cbc_encrypt); 979 + EXPORT_SYMBOL(neon_aes_ctr_encrypt); 979 980 EXPORT_SYMBOL(neon_aes_xts_encrypt); 980 981 EXPORT_SYMBOL(neon_aes_xts_decrypt); 981 982 #endif
+25 -107
arch/arm64/crypto/aes-neonbs-core.S
··· 869 869 870 870 /* 871 871 * aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], 872 - * int rounds, int blocks, u8 iv[], u8 final[]) 872 + * int rounds, int blocks, u8 iv[]) 873 873 */ 874 874 SYM_FUNC_START(aesbs_ctr_encrypt) 875 - frame_push 8 875 + stp x29, x30, [sp, #-16]! 876 + mov x29, sp 876 877 877 - mov x19, x0 878 - mov x20, x1 879 - mov x21, x2 880 - mov x22, x3 881 - mov x23, x4 882 - mov x24, x5 883 - mov x25, x6 884 - 885 - cmp x25, #0 886 - cset x26, ne 887 - add x23, x23, x26 // do one extra block if final 888 - 889 - ldp x7, x8, [x24] 890 - ld1 {v0.16b}, [x24] 878 + ldp x7, x8, [x5] 879 + ld1 {v0.16b}, [x5] 891 880 CPU_LE( rev x7, x7 ) 892 881 CPU_LE( rev x8, x8 ) 893 882 adds x8, x8, #1 894 883 adc x7, x7, xzr 895 884 896 - 99: mov x9, #1 897 - lsl x9, x9, x23 898 - subs w23, w23, #8 899 - csel x23, x23, xzr, pl 900 - csel x9, x9, xzr, le 901 - 902 - tbnz x9, #1, 0f 903 - next_ctr v1 904 - tbnz x9, #2, 0f 885 + 0: next_ctr v1 905 886 next_ctr v2 906 - tbnz x9, #3, 0f 907 887 next_ctr v3 908 - tbnz x9, #4, 0f 909 888 next_ctr v4 910 - tbnz x9, #5, 0f 911 889 next_ctr v5 912 - tbnz x9, #6, 0f 913 890 next_ctr v6 914 - tbnz x9, #7, 0f 915 891 next_ctr v7 916 892 917 - 0: mov bskey, x21 918 - mov rounds, x22 893 + mov bskey, x2 894 + mov rounds, x3 919 895 bl aesbs_encrypt8 920 896 921 - lsr x9, x9, x26 // disregard the extra block 922 - tbnz x9, #0, 0f 897 + ld1 { v8.16b-v11.16b}, [x1], #64 898 + ld1 {v12.16b-v15.16b}, [x1], #64 923 899 924 - ld1 {v8.16b}, [x20], #16 925 - eor v0.16b, v0.16b, v8.16b 926 - st1 {v0.16b}, [x19], #16 927 - tbnz x9, #1, 1f 900 + eor v8.16b, v0.16b, v8.16b 901 + eor v9.16b, v1.16b, v9.16b 902 + eor v10.16b, v4.16b, v10.16b 903 + eor v11.16b, v6.16b, v11.16b 904 + eor v12.16b, v3.16b, v12.16b 905 + eor v13.16b, v7.16b, v13.16b 906 + eor v14.16b, v2.16b, v14.16b 907 + eor v15.16b, v5.16b, v15.16b 928 908 929 - ld1 {v9.16b}, [x20], #16 930 - eor v1.16b, v1.16b, v9.16b 931 - st1 {v1.16b}, [x19], #16 932 - tbnz x9, 
#2, 2f 909 + st1 { v8.16b-v11.16b}, [x0], #64 910 + st1 {v12.16b-v15.16b}, [x0], #64 933 911 934 - ld1 {v10.16b}, [x20], #16 935 - eor v4.16b, v4.16b, v10.16b 936 - st1 {v4.16b}, [x19], #16 937 - tbnz x9, #3, 3f 912 + next_ctr v0 913 + subs x4, x4, #8 914 + b.gt 0b 938 915 939 - ld1 {v11.16b}, [x20], #16 940 - eor v6.16b, v6.16b, v11.16b 941 - st1 {v6.16b}, [x19], #16 942 - tbnz x9, #4, 4f 943 - 944 - ld1 {v12.16b}, [x20], #16 945 - eor v3.16b, v3.16b, v12.16b 946 - st1 {v3.16b}, [x19], #16 947 - tbnz x9, #5, 5f 948 - 949 - ld1 {v13.16b}, [x20], #16 950 - eor v7.16b, v7.16b, v13.16b 951 - st1 {v7.16b}, [x19], #16 952 - tbnz x9, #6, 6f 953 - 954 - ld1 {v14.16b}, [x20], #16 955 - eor v2.16b, v2.16b, v14.16b 956 - st1 {v2.16b}, [x19], #16 957 - tbnz x9, #7, 7f 958 - 959 - ld1 {v15.16b}, [x20], #16 960 - eor v5.16b, v5.16b, v15.16b 961 - st1 {v5.16b}, [x19], #16 962 - 963 - 8: next_ctr v0 964 - st1 {v0.16b}, [x24] 965 - cbz x23, .Lctr_done 966 - 967 - b 99b 968 - 969 - .Lctr_done: 970 - frame_pop 916 + st1 {v0.16b}, [x5] 917 + ldp x29, x30, [sp], #16 971 918 ret 972 - 973 - /* 974 - * If we are handling the tail of the input (x6 != NULL), return the 975 - * final keystream block back to the caller. 976 - */ 977 - 0: cbz x25, 8b 978 - st1 {v0.16b}, [x25] 979 - b 8b 980 - 1: cbz x25, 8b 981 - st1 {v1.16b}, [x25] 982 - b 8b 983 - 2: cbz x25, 8b 984 - st1 {v4.16b}, [x25] 985 - b 8b 986 - 3: cbz x25, 8b 987 - st1 {v6.16b}, [x25] 988 - b 8b 989 - 4: cbz x25, 8b 990 - st1 {v3.16b}, [x25] 991 - b 8b 992 - 5: cbz x25, 8b 993 - st1 {v7.16b}, [x25] 994 - b 8b 995 - 6: cbz x25, 8b 996 - st1 {v2.16b}, [x25] 997 - b 8b 998 - 7: cbz x25, 8b 999 - st1 {v5.16b}, [x25] 1000 - b 8b 1001 919 SYM_FUNC_END(aesbs_ctr_encrypt)
+29 -35
arch/arm64/crypto/aes-neonbs-glue.c
··· 34 34 int rounds, int blocks, u8 iv[]); 35 35 36 36 asmlinkage void aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], 37 - int rounds, int blocks, u8 iv[], u8 final[]); 37 + int rounds, int blocks, u8 iv[]); 38 38 39 39 asmlinkage void aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], 40 40 int rounds, int blocks, u8 iv[]); ··· 46 46 int rounds, int blocks); 47 47 asmlinkage void neon_aes_cbc_encrypt(u8 out[], u8 const in[], u32 const rk[], 48 48 int rounds, int blocks, u8 iv[]); 49 + asmlinkage void neon_aes_ctr_encrypt(u8 out[], u8 const in[], u32 const rk[], 50 + int rounds, int bytes, u8 ctr[]); 49 51 asmlinkage void neon_aes_xts_encrypt(u8 out[], u8 const in[], 50 52 u32 const rk1[], int rounds, int bytes, 51 53 u32 const rk2[], u8 iv[], int first); ··· 60 58 int rounds; 61 59 } __aligned(AES_BLOCK_SIZE); 62 60 63 - struct aesbs_cbc_ctx { 61 + struct aesbs_cbc_ctr_ctx { 64 62 struct aesbs_ctx key; 65 63 u32 enc[AES_MAX_KEYLENGTH_U32]; 66 64 }; ··· 130 128 return __ecb_crypt(req, aesbs_ecb_decrypt); 131 129 } 132 130 133 - static int aesbs_cbc_setkey(struct crypto_skcipher *tfm, const u8 *in_key, 131 + static int aesbs_cbc_ctr_setkey(struct crypto_skcipher *tfm, const u8 *in_key, 134 132 unsigned int key_len) 135 133 { 136 - struct aesbs_cbc_ctx *ctx = crypto_skcipher_ctx(tfm); 134 + struct aesbs_cbc_ctr_ctx *ctx = crypto_skcipher_ctx(tfm); 137 135 struct crypto_aes_ctx rk; 138 136 int err; 139 137 ··· 156 154 static int cbc_encrypt(struct skcipher_request *req) 157 155 { 158 156 struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); 159 - struct aesbs_cbc_ctx *ctx = crypto_skcipher_ctx(tfm); 157 + struct aesbs_cbc_ctr_ctx *ctx = crypto_skcipher_ctx(tfm); 160 158 struct skcipher_walk walk; 161 159 int err; 162 160 ··· 179 177 static int cbc_decrypt(struct skcipher_request *req) 180 178 { 181 179 struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); 182 - struct aesbs_cbc_ctx *ctx = crypto_skcipher_ctx(tfm); 180 + struct 
aesbs_cbc_ctr_ctx *ctx = crypto_skcipher_ctx(tfm); 183 181 struct skcipher_walk walk; 184 182 int err; 185 183 ··· 207 205 static int ctr_encrypt(struct skcipher_request *req) 208 206 { 209 207 struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); 210 - struct aesbs_ctx *ctx = crypto_skcipher_ctx(tfm); 208 + struct aesbs_cbc_ctr_ctx *ctx = crypto_skcipher_ctx(tfm); 211 209 struct skcipher_walk walk; 212 - u8 buf[AES_BLOCK_SIZE]; 213 210 int err; 214 211 215 212 err = skcipher_walk_virt(&walk, req, false); 216 213 217 214 while (walk.nbytes > 0) { 218 - unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE; 219 - u8 *final = (walk.total % AES_BLOCK_SIZE) ? buf : NULL; 220 - 221 - if (walk.nbytes < walk.total) { 222 - blocks = round_down(blocks, 223 - walk.stride / AES_BLOCK_SIZE); 224 - final = NULL; 225 - } 215 + int blocks = (walk.nbytes / AES_BLOCK_SIZE) & ~7; 216 + int nbytes = walk.nbytes % (8 * AES_BLOCK_SIZE); 217 + const u8 *src = walk.src.virt.addr; 218 + u8 *dst = walk.dst.virt.addr; 226 219 227 220 kernel_neon_begin(); 228 - aesbs_ctr_encrypt(walk.dst.virt.addr, walk.src.virt.addr, 229 - ctx->rk, ctx->rounds, blocks, walk.iv, final); 230 - kernel_neon_end(); 231 - 232 - if (final) { 233 - u8 *dst = walk.dst.virt.addr + blocks * AES_BLOCK_SIZE; 234 - u8 *src = walk.src.virt.addr + blocks * AES_BLOCK_SIZE; 235 - 236 - crypto_xor_cpy(dst, src, final, 237 - walk.total % AES_BLOCK_SIZE); 238 - 239 - err = skcipher_walk_done(&walk, 0); 240 - break; 221 + if (blocks >= 8) { 222 + aesbs_ctr_encrypt(dst, src, ctx->key.rk, ctx->key.rounds, 223 + blocks, walk.iv); 224 + dst += blocks * AES_BLOCK_SIZE; 225 + src += blocks * AES_BLOCK_SIZE; 241 226 } 242 - err = skcipher_walk_done(&walk, 243 - walk.nbytes - blocks * AES_BLOCK_SIZE); 227 + if (nbytes && walk.nbytes == walk.total) { 228 + neon_aes_ctr_encrypt(dst, src, ctx->enc, ctx->key.rounds, 229 + nbytes, walk.iv); 230 + nbytes = 0; 231 + } 232 + kernel_neon_end(); 233 + err = skcipher_walk_done(&walk, nbytes); 
244 234 } 245 235 return err; 246 236 } ··· 396 402 .base.cra_driver_name = "cbc-aes-neonbs", 397 403 .base.cra_priority = 250, 398 404 .base.cra_blocksize = AES_BLOCK_SIZE, 399 - .base.cra_ctxsize = sizeof(struct aesbs_cbc_ctx), 405 + .base.cra_ctxsize = sizeof(struct aesbs_cbc_ctr_ctx), 400 406 .base.cra_module = THIS_MODULE, 401 407 402 408 .min_keysize = AES_MIN_KEY_SIZE, 403 409 .max_keysize = AES_MAX_KEY_SIZE, 404 410 .walksize = 8 * AES_BLOCK_SIZE, 405 411 .ivsize = AES_BLOCK_SIZE, 406 - .setkey = aesbs_cbc_setkey, 412 + .setkey = aesbs_cbc_ctr_setkey, 407 413 .encrypt = cbc_encrypt, 408 414 .decrypt = cbc_decrypt, 409 415 }, { ··· 411 417 .base.cra_driver_name = "ctr-aes-neonbs", 412 418 .base.cra_priority = 250, 413 419 .base.cra_blocksize = 1, 414 - .base.cra_ctxsize = sizeof(struct aesbs_ctx), 420 + .base.cra_ctxsize = sizeof(struct aesbs_cbc_ctr_ctx), 415 421 .base.cra_module = THIS_MODULE, 416 422 417 423 .min_keysize = AES_MIN_KEY_SIZE, ··· 419 425 .chunksize = AES_BLOCK_SIZE, 420 426 .walksize = 8 * AES_BLOCK_SIZE, 421 427 .ivsize = AES_BLOCK_SIZE, 422 - .setkey = aesbs_setkey, 428 + .setkey = aesbs_cbc_ctr_setkey, 423 429 .encrypt = ctr_encrypt, 424 430 .decrypt = ctr_encrypt, 425 431 }, {