Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

crypto: x86/aes-xts - optimize size of instructions operating on lengths

x86_64 has the "interesting" property that the instruction size is
generally a bit shorter for instructions that operate on the 32-bit (or
less) part of registers, or registers that are in the original set of 8.

This patch adjusts the AES-XTS code to take advantage of that property
by changing the LEN parameter from size_t to unsigned int (which is all
that's needed and is what the non-AVX implementation uses) and using the
%eax register for KEYLEN.

This decreases the size of aes-xts-avx-x86_64.o by 1.2%.

Note that changing the kmovq to kmovd was going to be needed anyway to
make the AVX10/256 code really work on CPUs that don't support 512-bit
vectors (since the AVX10 spec says that 64-bit opmask instructions will
only be supported on processors that support 512-bit vectors).

Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Authored by Eric Biggers; committed by Herbert Xu.
543ea178 e619723a

Total: +30 −28
arch/x86/crypto/aes-xts-avx-x86_64.S: +21 −19
··· 85 85 // advanced to point to 7th-from-last round key 86 86 .set SRC, %rsi // Pointer to next source data 87 87 .set DST, %rdx // Pointer to next destination data 88 - .set LEN, %rcx // Remaining length in bytes 88 + .set LEN, %ecx // Remaining length in bytes 89 + .set LEN8, %cl 90 + .set LEN64, %rcx 89 91 .set TWEAK, %r8 // Pointer to next tweak 90 92 91 - // %r9 holds the AES key length in bytes. 92 - .set KEYLEN, %r9d 93 - .set KEYLEN64, %r9 93 + // %rax holds the AES key length in bytes. 94 + .set KEYLEN, %eax 95 + .set KEYLEN64, %rax 94 96 95 - // %rax and %r10-r11 are available as temporaries. 97 + // %r9-r11 are available as temporaries. 96 98 97 99 .macro _define_Vi i 98 100 .if VL == 16 ··· 567 565 // subtracting 16 from LEN. This is needed because ciphertext stealing 568 566 // decryption uses the last two tweaks in reverse order. We'll handle 569 567 // the last full block and the partial block specially at the end. 570 - lea -16(LEN), %rax 571 - test $15, LEN 572 - cmovnz %rax, LEN 568 + lea -16(LEN), %eax 569 + test $15, LEN8 570 + cmovnz %eax, LEN 573 571 .endif 574 572 575 573 // Load the AES key length: 16 (AES-128), 24 (AES-192), or 32 (AES-256). ··· 652 650 // Check for the uncommon case where the data length isn't a multiple of 653 651 // 4*VL. Handle it out-of-line in order to optimize for the common 654 652 // case. In the common case, just fall through to the ret. 655 - test $4*VL-1, LEN 653 + test $4*VL-1, LEN8 656 654 jnz .Lhandle_remainder\@ 657 655 .Ldone\@: 658 656 // Store the next tweak back to *TWEAK to support continuation calls. ··· 720 718 721 719 .if USE_AVX10 722 720 // Create a mask that has the first LEN bits set. 723 - mov $-1, %rax 724 - bzhi LEN, %rax, %rax 725 - kmovq %rax, %k1 721 + mov $-1, %r9d 722 + bzhi LEN, %r9d, %r9d 723 + kmovd %r9d, %k1 726 724 727 725 // Swap the first LEN bytes of the en/decryption of the last full block 728 726 // with the partial block. 
Note that to support in-place en/decryption, ··· 732 730 vmovdqu8 16(SRC), %xmm0{%k1} 733 731 vmovdqu8 %xmm1, 16(DST){%k1} 734 732 .else 735 - lea .Lcts_permute_table(%rip), %rax 733 + lea .Lcts_permute_table(%rip), %r9 736 734 737 735 // Load the src partial block, left-aligned. Note that to support 738 736 // in-place en/decryption, this must happen before the store to the dst 739 737 // partial block. 740 - vmovdqu (SRC, LEN, 1), %xmm1 738 + vmovdqu (SRC, LEN64, 1), %xmm1 741 739 742 740 // Shift the first LEN bytes of the en/decryption of the last full block 743 741 // to the end of a register, then store it to DST+LEN. This stores the 744 742 // dst partial block. It also writes to the second part of the dst last 745 743 // full block, but that part is overwritten later. 746 - vpshufb (%rax, LEN, 1), %xmm0, %xmm2 747 - vmovdqu %xmm2, (DST, LEN, 1) 744 + vpshufb (%r9, LEN64, 1), %xmm0, %xmm2 745 + vmovdqu %xmm2, (DST, LEN64, 1) 748 746 749 747 // Make xmm3 contain [16-LEN,16-LEN+1,...,14,15,0x80,0x80,...]. 750 - sub LEN, %rax 751 - vmovdqu 32(%rax), %xmm3 748 + sub LEN64, %r9 749 + vmovdqu 32(%r9), %xmm3 752 750 753 751 // Shift the src partial block to the beginning of its register. 754 752 vpshufb %xmm3, %xmm1, %xmm1 ··· 797 795 // instantiated from the above macro. They all have the following prototype: 798 796 // 799 797 // void (*xts_asm_func)(const struct crypto_aes_ctx *key, 800 - // const u8 *src, u8 *dst, size_t len, 798 + // const u8 *src, u8 *dst, unsigned int len, 801 799 // u8 tweak[AES_BLOCK_SIZE]); 802 800 // 803 801 // |key| is the data key. |tweak| contains the next tweak; the encryption of
arch/x86/crypto/aesni-intel_glue.c: +9 −9
··· 899 899 typedef void (*xts_encrypt_iv_func)(const struct crypto_aes_ctx *tweak_key, 900 900 u8 iv[AES_BLOCK_SIZE]); 901 901 typedef void (*xts_crypt_func)(const struct crypto_aes_ctx *key, 902 - const u8 *src, u8 *dst, size_t len, 902 + const u8 *src, u8 *dst, unsigned int len, 903 903 u8 tweak[AES_BLOCK_SIZE]); 904 904 905 905 /* This handles cases where the source and/or destination span pages. */ ··· 1021 1021 } 1022 1022 1023 1023 static void aesni_xts_encrypt(const struct crypto_aes_ctx *key, 1024 - const u8 *src, u8 *dst, size_t len, 1024 + const u8 *src, u8 *dst, unsigned int len, 1025 1025 u8 tweak[AES_BLOCK_SIZE]) 1026 1026 { 1027 1027 aesni_xts_enc(key, dst, src, len, tweak); 1028 1028 } 1029 1029 1030 1030 static void aesni_xts_decrypt(const struct crypto_aes_ctx *key, 1031 - const u8 *src, u8 *dst, size_t len, 1031 + const u8 *src, u8 *dst, unsigned int len, 1032 1032 u8 tweak[AES_BLOCK_SIZE]) 1033 1033 { 1034 1034 aesni_xts_dec(key, dst, src, len, tweak); ··· 1185 1185 1186 1186 #define DEFINE_XTS_ALG(suffix, driver_name, priority) \ 1187 1187 \ 1188 - asmlinkage void aes_xts_encrypt_##suffix(const struct crypto_aes_ctx *key, \ 1189 - const u8 *src, u8 *dst, size_t len, \ 1190 - u8 tweak[AES_BLOCK_SIZE]); \ 1191 - asmlinkage void aes_xts_decrypt_##suffix(const struct crypto_aes_ctx *key, \ 1192 - const u8 *src, u8 *dst, size_t len, \ 1193 - u8 tweak[AES_BLOCK_SIZE]); \ 1188 + asmlinkage void \ 1189 + aes_xts_encrypt_##suffix(const struct crypto_aes_ctx *key, const u8 *src, \ 1190 + u8 *dst, unsigned int len, u8 tweak[AES_BLOCK_SIZE]); \ 1191 + asmlinkage void \ 1192 + aes_xts_decrypt_##suffix(const struct crypto_aes_ctx *key, const u8 *src, \ 1193 + u8 *dst, unsigned int len, u8 tweak[AES_BLOCK_SIZE]); \ 1194 1194 \ 1195 1195 static int xts_encrypt_##suffix(struct skcipher_request *req) \ 1196 1196 { \