Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

crypto: x86/aesni-xctr - Add accelerated implementation of XCTR

Add a hardware-accelerated implementation of XCTR for x86-64 CPUs with
AESNI support.

More information on XCTR can be found in the HCTR2 paper:
"Length-preserving encryption with HCTR2":
https://eprint.iacr.org/2021/1441.pdf

Signed-off-by: Nathan Huckleberry <nhuck@google.com>
Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Reviewed-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Authored by Nathan Huckleberry, committed by Herbert Xu · fd94fcf0 7ff554ce
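For reference, XCTR generates its keystream by XORing a little-endian block counter into the IV, rather than incrementing a big-endian counter the way CTR does; that is what lets the assembly below drop the byte-swap and carry handling. The following is a minimal C sketch of the construction, written against the kernel's generic AES library (aes_encrypt()); xctr_keystream_block() is a hypothetical helper for illustration, not code from this patch.

#include <crypto/aes.h>
#include <linux/string.h>
#include <asm/byteorder.h>

/*
 * Hypothetical sketch: XCTR keystream block i (counting from 1) is
 * AES_K(IV ^ le128(i)).  Only the low 32 bits of the counter are touched
 * here, which matches the tail-block handling in xctr_crypt() below.
 */
static void xctr_keystream_block(const struct crypto_aes_ctx *ctx,
                                 const u8 iv[AES_BLOCK_SIZE],
                                 u32 block_index,
                                 u8 out[AES_BLOCK_SIZE])
{
        __le32 block[AES_BLOCK_SIZE / sizeof(__le32)];

        memcpy(block, iv, AES_BLOCK_SIZE);
        block[0] ^= cpu_to_le32(block_index);   /* no byte-swapping needed */
        aes_encrypt(ctx, out, (const u8 *)block);
}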

+266 -82 (3 files changed)
+152 -80
arch/x86/crypto/aes_ctrby8_avx-x86_64.S
···
  
  #define VMOVDQ vmovdqu
  
+ /*
+  * Note: the "x" prefix in these aliases means "this is an xmm register". The
+  * alias prefixes have no relation to XCTR where the "X" prefix means "XOR
+  * counter".
+  */
  #define xdata0 %xmm0
  #define xdata1 %xmm1
  #define xdata2 %xmm2
···
  #define xdata5 %xmm5
  #define xdata6 %xmm6
  #define xdata7 %xmm7
- #define xcounter %xmm8
- #define xbyteswap %xmm9
+ #define xcounter %xmm8 // CTR mode only
+ #define xiv %xmm8 // XCTR mode only
+ #define xbyteswap %xmm9 // CTR mode only
+ #define xtmp %xmm9 // XCTR mode only
  #define xkey0 %xmm10
  #define xkey4 %xmm11
  #define xkey8 %xmm12
···
  #define p_keys %rdx
  #define p_out %rcx
  #define num_bytes %r8
- 
+ #define counter %r9 // XCTR mode only
  #define tmp %r10
  #define DDQ_DATA 0
  #define XDATA 1
···
  * do_aes num_in_par load_keys key_len
  * This increments p_in, but not p_out
  */
- .macro do_aes b, k, key_len
+ .macro do_aes b, k, key_len, xctr
  .set by, \b
  .set load_keys, \k
  .set klen, \key_len
···
  vmovdqa 0*16(p_keys), xkey0
  .endif
  
- vpshufb xbyteswap, xcounter, xdata0
- 
- .set i, 1
- .rept (by - 1)
- club XDATA, i
- vpaddq (ddq_add_1 + 16 * (i - 1))(%rip), xcounter, var_xdata
- vptest ddq_low_msk(%rip), var_xdata
- jnz 1f
- vpaddq ddq_high_add_1(%rip), var_xdata, var_xdata
- vpaddq ddq_high_add_1(%rip), xcounter, xcounter
- 1:
- vpshufb xbyteswap, var_xdata, var_xdata
- .set i, (i +1)
- .endr
+ .if \xctr
+ movq counter, xtmp
+ .set i, 0
+ .rept (by)
+ club XDATA, i
+ vpaddq (ddq_add_1 + 16 * i)(%rip), xtmp, var_xdata
+ .set i, (i +1)
+ .endr
+ .set i, 0
+ .rept (by)
+ club XDATA, i
+ vpxor xiv, var_xdata, var_xdata
+ .set i, (i +1)
+ .endr
+ .else
+ vpshufb xbyteswap, xcounter, xdata0
+ .set i, 1
+ .rept (by - 1)
+ club XDATA, i
+ vpaddq (ddq_add_1 + 16 * (i - 1))(%rip), xcounter, var_xdata
+ vptest ddq_low_msk(%rip), var_xdata
+ jnz 1f
+ vpaddq ddq_high_add_1(%rip), var_xdata, var_xdata
+ vpaddq ddq_high_add_1(%rip), xcounter, xcounter
+ 1:
+ vpshufb xbyteswap, var_xdata, var_xdata
+ .set i, (i +1)
+ .endr
+ .endif
  
  vmovdqa 1*16(p_keys), xkeyA
  
  vpxor xkey0, xdata0, xdata0
- vpaddq (ddq_add_1 + 16 * (by - 1))(%rip), xcounter, xcounter
- vptest ddq_low_msk(%rip), xcounter
- jnz 1f
- vpaddq ddq_high_add_1(%rip), xcounter, xcounter
- 1:
+ .if \xctr
+ add $by, counter
+ .else
+ vpaddq (ddq_add_1 + 16 * (by - 1))(%rip), xcounter, xcounter
+ vptest ddq_low_msk(%rip), xcounter
+ jnz 1f
+ vpaddq ddq_high_add_1(%rip), xcounter, xcounter
+ 1:
+ .endif
  
  .set i, 1
  .rept (by - 1)
···
  .endr
  .endm
  
- .macro do_aes_load val, key_len
- do_aes \val, 1, \key_len
+ .macro do_aes_load val, key_len, xctr
+ do_aes \val, 1, \key_len, \xctr
  .endm
  
- .macro do_aes_noload val, key_len
- do_aes \val, 0, \key_len
+ .macro do_aes_noload val, key_len, xctr
+ do_aes \val, 0, \key_len, \xctr
  .endm
  
  /* main body of aes ctr load */
  
- .macro do_aes_ctrmain key_len
+ .macro do_aes_ctrmain key_len, xctr
  cmp $16, num_bytes
- jb .Ldo_return2\key_len
+ jb .Ldo_return2\xctr\key_len
  
- vmovdqa byteswap_const(%rip), xbyteswap
- vmovdqu (p_iv), xcounter
- vpshufb xbyteswap, xcounter, xcounter
+ .if \xctr
+ shr $4, counter
+ vmovdqu (p_iv), xiv
+ .else
+ vmovdqa byteswap_const(%rip), xbyteswap
+ vmovdqu (p_iv), xcounter
+ vpshufb xbyteswap, xcounter, xcounter
+ .endif
  
  mov num_bytes, tmp
  and $(7*16), tmp
- jz .Lmult_of_8_blks\key_len
+ jz .Lmult_of_8_blks\xctr\key_len
  
  /* 1 <= tmp <= 7 */
  cmp $(4*16), tmp
- jg .Lgt4\key_len
- je .Leq4\key_len
+ jg .Lgt4\xctr\key_len
+ je .Leq4\xctr\key_len
  
- .Llt4\key_len:
+ .Llt4\xctr\key_len:
  cmp $(2*16), tmp
- jg .Leq3\key_len
- je .Leq2\key_len
+ jg .Leq3\xctr\key_len
+ je .Leq2\xctr\key_len
  
- .Leq1\key_len:
- do_aes_load 1, \key_len
+ .Leq1\xctr\key_len:
+ do_aes_load 1, \key_len, \xctr
  add $(1*16), p_out
  and $(~7*16), num_bytes
- jz .Ldo_return2\key_len
- jmp .Lmain_loop2\key_len
+ jz .Ldo_return2\xctr\key_len
+ jmp .Lmain_loop2\xctr\key_len
  
- .Leq2\key_len:
- do_aes_load 2, \key_len
+ .Leq2\xctr\key_len:
+ do_aes_load 2, \key_len, \xctr
  add $(2*16), p_out
  and $(~7*16), num_bytes
- jz .Ldo_return2\key_len
- jmp .Lmain_loop2\key_len
+ jz .Ldo_return2\xctr\key_len
+ jmp .Lmain_loop2\xctr\key_len
  
  
- .Leq3\key_len:
- do_aes_load 3, \key_len
+ .Leq3\xctr\key_len:
+ do_aes_load 3, \key_len, \xctr
  add $(3*16), p_out
  and $(~7*16), num_bytes
- jz .Ldo_return2\key_len
- jmp .Lmain_loop2\key_len
+ jz .Ldo_return2\xctr\key_len
+ jmp .Lmain_loop2\xctr\key_len
  
- .Leq4\key_len:
- do_aes_load 4, \key_len
+ .Leq4\xctr\key_len:
+ do_aes_load 4, \key_len, \xctr
  add $(4*16), p_out
  and $(~7*16), num_bytes
- jz .Ldo_return2\key_len
- jmp .Lmain_loop2\key_len
+ jz .Ldo_return2\xctr\key_len
+ jmp .Lmain_loop2\xctr\key_len
  
- .Lgt4\key_len:
+ .Lgt4\xctr\key_len:
  cmp $(6*16), tmp
- jg .Leq7\key_len
- je .Leq6\key_len
+ jg .Leq7\xctr\key_len
+ je .Leq6\xctr\key_len
  
- .Leq5\key_len:
- do_aes_load 5, \key_len
+ .Leq5\xctr\key_len:
+ do_aes_load 5, \key_len, \xctr
  add $(5*16), p_out
  and $(~7*16), num_bytes
- jz .Ldo_return2\key_len
- jmp .Lmain_loop2\key_len
+ jz .Ldo_return2\xctr\key_len
+ jmp .Lmain_loop2\xctr\key_len
  
- .Leq6\key_len:
- do_aes_load 6, \key_len
+ .Leq6\xctr\key_len:
+ do_aes_load 6, \key_len, \xctr
  add $(6*16), p_out
  and $(~7*16), num_bytes
- jz .Ldo_return2\key_len
- jmp .Lmain_loop2\key_len
+ jz .Ldo_return2\xctr\key_len
+ jmp .Lmain_loop2\xctr\key_len
  
- .Leq7\key_len:
- do_aes_load 7, \key_len
+ .Leq7\xctr\key_len:
+ do_aes_load 7, \key_len, \xctr
  add $(7*16), p_out
  and $(~7*16), num_bytes
- jz .Ldo_return2\key_len
- jmp .Lmain_loop2\key_len
+ jz .Ldo_return2\xctr\key_len
+ jmp .Lmain_loop2\xctr\key_len
  
- .Lmult_of_8_blks\key_len:
+ .Lmult_of_8_blks\xctr\key_len:
  .if (\key_len != KEY_128)
  vmovdqa 0*16(p_keys), xkey0
  vmovdqa 4*16(p_keys), xkey4
···
  vmovdqa 9*16(p_keys), xkey12
  .endif
  .align 16
- .Lmain_loop2\key_len:
+ .Lmain_loop2\xctr\key_len:
  /* num_bytes is a multiple of 8 and >0 */
- do_aes_noload 8, \key_len
+ do_aes_noload 8, \key_len, \xctr
  add $(8*16), p_out
  sub $(8*16), num_bytes
- jne .Lmain_loop2\key_len
+ jne .Lmain_loop2\xctr\key_len
  
- .Ldo_return2\key_len:
- /* return updated IV */
- vpshufb xbyteswap, xcounter, xcounter
- vmovdqu xcounter, (p_iv)
+ .Ldo_return2\xctr\key_len:
+ .if !\xctr
+ /* return updated IV */
+ vpshufb xbyteswap, xcounter, xcounter
+ vmovdqu xcounter, (p_iv)
+ .endif
  RET
  .endm
  
···
  */
  SYM_FUNC_START(aes_ctr_enc_128_avx_by8)
  /* call the aes main loop */
- do_aes_ctrmain KEY_128
+ do_aes_ctrmain KEY_128 0
  
  SYM_FUNC_END(aes_ctr_enc_128_avx_by8)
  
···
  */
  SYM_FUNC_START(aes_ctr_enc_192_avx_by8)
  /* call the aes main loop */
- do_aes_ctrmain KEY_192
+ do_aes_ctrmain KEY_192 0
  
  SYM_FUNC_END(aes_ctr_enc_192_avx_by8)
  
···
  */
  SYM_FUNC_START(aes_ctr_enc_256_avx_by8)
  /* call the aes main loop */
- do_aes_ctrmain KEY_256
+ do_aes_ctrmain KEY_256 0
  
  SYM_FUNC_END(aes_ctr_enc_256_avx_by8)
+ 
+ /*
+  * routine to do AES128 XCTR enc/decrypt "by8"
+  * XMM registers are clobbered.
+  * Saving/restoring must be done at a higher level
+  * aes_xctr_enc_128_avx_by8(const u8 *in, const u8 *iv, const void *keys,
+  * u8* out, unsigned int num_bytes, unsigned int byte_ctr)
+  */
+ SYM_FUNC_START(aes_xctr_enc_128_avx_by8)
+ /* call the aes main loop */
+ do_aes_ctrmain KEY_128 1
+ 
+ SYM_FUNC_END(aes_xctr_enc_128_avx_by8)
+ 
+ /*
+  * routine to do AES192 XCTR enc/decrypt "by8"
+  * XMM registers are clobbered.
+  * Saving/restoring must be done at a higher level
+  * aes_xctr_enc_192_avx_by8(const u8 *in, const u8 *iv, const void *keys,
+  * u8* out, unsigned int num_bytes, unsigned int byte_ctr)
+  */
+ SYM_FUNC_START(aes_xctr_enc_192_avx_by8)
+ /* call the aes main loop */
+ do_aes_ctrmain KEY_192 1
+ 
+ SYM_FUNC_END(aes_xctr_enc_192_avx_by8)
+ 
+ /*
+  * routine to do AES256 XCTR enc/decrypt "by8"
+  * XMM registers are clobbered.
+  * Saving/restoring must be done at a higher level
+  * aes_xctr_enc_256_avx_by8(const u8 *in, const u8 *iv, const void *keys,
+  * u8* out, unsigned int num_bytes, unsigned int byte_ctr)
+  */
+ SYM_FUNC_START(aes_xctr_enc_256_avx_by8)
+ /* call the aes main loop */
+ do_aes_ctrmain KEY_256 1
+ 
+ SYM_FUNC_END(aes_xctr_enc_256_avx_by8)
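As a C-level picture of what the new \xctr branch of do_aes computes: the caller's byte offset is pre-shifted to a block count (the shr $4, counter in do_aes_ctrmain), and each of the up-to-eight counter blocks in a group is the little-endian value counter+i+1 XORed into the low 64 bits of the IV, so no vpshufb byte-swap and no vptest/carry fixup are needed. The helper below is a hypothetical sketch of that setup, not part of the patch.

#include <crypto/algapi.h>
#include <linux/string.h>
#include <asm/byteorder.h>

/*
 * Hypothetical C equivalent of the XCTR counter-block setup done with
 * vpaddq/vpxor in the do_aes macro: block i of this group is
 * IV ^ le64(counter + i + 1), where counter is the number of 16-byte
 * blocks already processed (byte_ctr >> 4).
 */
static void xctr_setup_counter_blocks(const u8 iv[16], u64 counter,
                                      unsigned int by, u8 blocks[][16])
{
        unsigned int i;

        for (i = 0; i < by; i++) {
                __le64 ctr = cpu_to_le64(counter + i + 1);

                memcpy(blocks[i], iv, 16);
                /* XOR the counter into the low 64 bits of the block */
                crypto_xor(blocks[i], (const u8 *)&ctr, sizeof(ctr));
        }
}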
+113 -1
arch/x86/crypto/aesni-intel_glue.c
···
          void *keys, u8 *out, unsigned int num_bytes);
  asmlinkage void aes_ctr_enc_256_avx_by8(const u8 *in, u8 *iv,
          void *keys, u8 *out, unsigned int num_bytes);
+ 
+ 
+ asmlinkage void aes_xctr_enc_128_avx_by8(const u8 *in, const u8 *iv,
+         const void *keys, u8 *out, unsigned int num_bytes,
+         unsigned int byte_ctr);
+ 
+ asmlinkage void aes_xctr_enc_192_avx_by8(const u8 *in, const u8 *iv,
+         const void *keys, u8 *out, unsigned int num_bytes,
+         unsigned int byte_ctr);
+ 
+ asmlinkage void aes_xctr_enc_256_avx_by8(const u8 *in, const u8 *iv,
+         const void *keys, u8 *out, unsigned int num_bytes,
+         unsigned int byte_ctr);
+ 
  /*
   * asmlinkage void aesni_gcm_init_avx_gen2()
   * gcm_data *my_ctx_data, context data
···
                         walk.src.virt.addr + walk.nbytes - nbytes,
                         keystream, nbytes);
          crypto_inc(walk.iv, AES_BLOCK_SIZE);
+         nbytes = 0;
+       }
+       kernel_fpu_end();
+       err = skcipher_walk_done(&walk, nbytes);
+     }
+     return err;
+ }
+ 
+ static void aesni_xctr_enc_avx_tfm(struct crypto_aes_ctx *ctx, u8 *out,
+                    const u8 *in, unsigned int len, u8 *iv,
+                    unsigned int byte_ctr)
+ {
+     if (ctx->key_length == AES_KEYSIZE_128)
+         aes_xctr_enc_128_avx_by8(in, iv, (void *)ctx, out, len,
+                      byte_ctr);
+     else if (ctx->key_length == AES_KEYSIZE_192)
+         aes_xctr_enc_192_avx_by8(in, iv, (void *)ctx, out, len,
+                      byte_ctr);
+     else
+         aes_xctr_enc_256_avx_by8(in, iv, (void *)ctx, out, len,
+                      byte_ctr);
+ }
+ 
+ static int xctr_crypt(struct skcipher_request *req)
+ {
+     struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+     struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm));
+     u8 keystream[AES_BLOCK_SIZE];
+     struct skcipher_walk walk;
+     unsigned int nbytes;
+     unsigned int byte_ctr = 0;
+     int err;
+     __le32 block[AES_BLOCK_SIZE / sizeof(__le32)];
+ 
+     err = skcipher_walk_virt(&walk, req, false);
+ 
+     while ((nbytes = walk.nbytes) > 0) {
+         kernel_fpu_begin();
+         if (nbytes & AES_BLOCK_MASK)
+             aesni_xctr_enc_avx_tfm(ctx, walk.dst.virt.addr,
+                 walk.src.virt.addr, nbytes & AES_BLOCK_MASK,
+                 walk.iv, byte_ctr);
+         nbytes &= ~AES_BLOCK_MASK;
+         byte_ctr += walk.nbytes - nbytes;
+ 
+         if (walk.nbytes == walk.total && nbytes > 0) {
+             memcpy(block, walk.iv, AES_BLOCK_SIZE);
+             block[0] ^= cpu_to_le32(1 + byte_ctr / AES_BLOCK_SIZE);
+             aesni_enc(ctx, keystream, (u8 *)block);
+             crypto_xor_cpy(walk.dst.virt.addr + walk.nbytes -
+                        nbytes, walk.src.virt.addr + walk.nbytes
+                        - nbytes, keystream, nbytes);
+             byte_ctr += nbytes;
              nbytes = 0;
          }
          kernel_fpu_end();
···
  struct simd_skcipher_alg *aesni_simd_skciphers[ARRAY_SIZE(aesni_skciphers)];
  
  #ifdef CONFIG_X86_64
+ /*
+  * XCTR does not have a non-AVX implementation, so it must be enabled
+  * conditionally.
+  */
+ static struct skcipher_alg aesni_xctr = {
+     .base = {
+         .cra_name        = "__xctr(aes)",
+         .cra_driver_name = "__xctr-aes-aesni",
+         .cra_priority    = 400,
+         .cra_flags       = CRYPTO_ALG_INTERNAL,
+         .cra_blocksize   = 1,
+         .cra_ctxsize     = CRYPTO_AES_CTX_SIZE,
+         .cra_module      = THIS_MODULE,
+     },
+     .min_keysize = AES_MIN_KEY_SIZE,
+     .max_keysize = AES_MAX_KEY_SIZE,
+     .ivsize      = AES_BLOCK_SIZE,
+     .chunksize   = AES_BLOCK_SIZE,
+     .setkey      = aesni_skcipher_setkey,
+     .encrypt     = xctr_crypt,
+     .decrypt     = xctr_crypt,
+ };
+ 
+ static struct simd_skcipher_alg *aesni_simd_xctr;
+ #endif /* CONFIG_X86_64 */
+ 
+ #ifdef CONFIG_X86_64
  static int generic_gcmaes_set_key(struct crypto_aead *aead, const u8 *key,
                    unsigned int key_len)
  {
···
          static_call_update(aesni_ctr_enc_tfm, aesni_ctr_enc_avx_tfm);
          pr_info("AES CTR mode by8 optimization enabled\n");
      }
- #endif
+ #endif /* CONFIG_X86_64 */
  
      err = crypto_register_alg(&aesni_cipher_alg);
      if (err)
···
      if (err)
          goto unregister_skciphers;
  
+ #ifdef CONFIG_X86_64
+     if (boot_cpu_has(X86_FEATURE_AVX))
+         err = simd_register_skciphers_compat(&aesni_xctr, 1,
+                              &aesni_simd_xctr);
+     if (err)
+         goto unregister_aeads;
+ #endif /* CONFIG_X86_64 */
+ 
      return 0;
+ 
+ #ifdef CONFIG_X86_64
+ unregister_aeads:
+     simd_unregister_aeads(aesni_aeads, ARRAY_SIZE(aesni_aeads),
+                   aesni_simd_aeads);
+ #endif /* CONFIG_X86_64 */
  
  unregister_skciphers:
      simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers),
···
      simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers),
                    aesni_simd_skciphers);
      crypto_unregister_alg(&aesni_cipher_alg);
+ #ifdef CONFIG_X86_64
+     if (boot_cpu_has(X86_FEATURE_AVX))
+         simd_unregister_skciphers(&aesni_xctr, 1, &aesni_simd_xctr);
+ #endif /* CONFIG_X86_64 */
  }
  
  late_initcall(aesni_init);
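Once the simd wrapper is registered, the accelerated code is reachable through the normal skcipher API under the generic name "xctr(aes)" (driver "xctr-aes-aesni"); the xctr template itself is added by a separate patch in this series, so this sketch assumes it is available. The function below is a hedged, illustrative caller, not part of this patch.

#include <crypto/skcipher.h>
#include <crypto/aes.h>
#include <linux/scatterlist.h>
#include <linux/err.h>
#include <linux/slab.h>

/* Hypothetical one-shot, in-place XCTR encryption of a linear buffer. */
static int xctr_encrypt_buf_sketch(const u8 *key, unsigned int keylen,
                                   u8 iv[AES_BLOCK_SIZE], u8 *buf,
                                   unsigned int len)
{
        struct crypto_skcipher *tfm;
        struct skcipher_request *req;
        struct scatterlist sg;
        DECLARE_CRYPTO_WAIT(wait);
        int err;

        tfm = crypto_alloc_skcipher("xctr(aes)", 0, 0);
        if (IS_ERR(tfm))
                return PTR_ERR(tfm);

        err = crypto_skcipher_setkey(tfm, key, keylen);
        if (err)
                goto out_free_tfm;

        req = skcipher_request_alloc(tfm, GFP_KERNEL);
        if (!req) {
                err = -ENOMEM;
                goto out_free_tfm;
        }

        sg_init_one(&sg, buf, len);
        skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP,
                                      crypto_req_done, &wait);
        skcipher_request_set_crypt(req, &sg, &sg, len, iv);
        /* Wait for completion in case the implementation runs async. */
        err = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);

        skcipher_request_free(req);
out_free_tfm:
        crypto_free_skcipher(tfm);
        return err;
}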
+1 -1
crypto/Kconfig
···
    In addition to AES cipher algorithm support, the acceleration
    for some popular block cipher mode is supported too, including
    ECB, CBC, LRW, XTS. The 64 bit version has additional
-   acceleration for CTR.
+   acceleration for CTR and XCTR.
  
  config CRYPTO_AES_SPARC64
      tristate "AES cipher algorithms (SPARC64)"