crypto: x86/aes-xts - handle AES-128 and AES-192 more efficiently

Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

kernel os linux

Decrease the amount of code specific to the different AES variants by
"right-aligning" the sequence of round keys, and for AES-128 and AES-192
just skipping irrelevant rounds at the beginning.

This shrinks the size of aes-xts-avx-x86_64.o by 13.3%, and it improves
the efficiency of AES-128 and AES-192. The tradeoff is that for AES-256
some additional not-taken conditional jumps are now executed. But these
are predicted well and are cheap on x86.

Note that the ARMv8 CE based AES-XTS implementation uses a similar
strategy to handle the different AES variants.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Authored by Eric Biggers and committed by Herbert Xu 2 years ago 2717e01f ea9459ef

+92 -86

1 changed file

expand all

arch

x86

crypto

aes-xts-avx-x86_64.S

+92 -86

arch/x86/crypto/aes-xts-avx-x86_64.S

··· 82 82 83 83 // Function parameters 84 84 .set KEY, %rdi // Initially points to crypto_aes_ctx, then is 85 - // advanced to point directly to 7th round key 85 + // advanced to point to 7th-from-last round key 86 86 .set SRC, %rsi // Pointer to next source data 87 87 .set DST, %rdx // Pointer to next destination data 88 88 .set LEN, %rcx // Remaining length in bytes 89 89 .set TWEAK, %r8 // Pointer to next tweak 90 90 91 - // %r9d holds the AES key length in bytes. 91 + // %r9 holds the AES key length in bytes. 92 92 .set KEYLEN, %r9d 93 + .set KEYLEN64, %r9 93 94 94 95 // %rax and %r10-r11 are available as temporaries. 95 96 ··· 166 165 .set GF_POLY_XMM, %xmm14 167 166 .set GF_POLY, V14 168 167 169 - // V15 holds the first AES round key, copied to all 128-bit lanes. 168 + // V15 holds the key for AES "round 0", copied to all 128-bit lanes. 170 169 .set KEY0_XMM, %xmm15 171 170 .set KEY0, V15 172 171 173 172 // If 32 SIMD registers are available, then V16-V29 hold the remaining 174 173 // AES round keys, copied to all 128-bit lanes. 174 + // 175 + // AES-128, AES-192, and AES-256 use different numbers of round keys. 176 + // To allow handling all three variants efficiently, we align the round 177 + // keys to the *end* of this register range. I.e., AES-128 uses 178 + // KEY5-KEY14, AES-192 uses KEY3-KEY14, and AES-256 uses KEY1-KEY14. 179 + // (All also use KEY0 for the XOR-only "round" at the beginning.) 
175 180 .if USE_AVX10 176 181 .set KEY1_XMM, %xmm16 177 182 .set KEY1, V16 ··· 347 340 .set PREV_TWEAK, NEXT_TWEAK2 348 341 .set NEXT_TWEAK, NEXT_TWEAK3 349 342 .endif 350 - .if \i < 20 && \i % 5 == 0 343 + .if \i >= 0 && \i < 20 && \i % 5 == 0 351 344 vpshufd $0x13, PREV_TWEAK, V5 352 - .elseif \i < 20 && \i % 5 == 1 345 + .elseif \i >= 0 && \i < 20 && \i % 5 == 1 353 346 vpaddq PREV_TWEAK, PREV_TWEAK, NEXT_TWEAK 354 - .elseif \i < 20 && \i % 5 == 2 347 + .elseif \i >= 0 && \i < 20 && \i % 5 == 2 355 348 vpsrad $31, V5, V5 356 - .elseif \i < 20 && \i % 5 == 3 349 + .elseif \i >= 0 && \i < 20 && \i % 5 == 3 357 350 vpand GF_POLY, V5, V5 358 - .elseif \i < 20 && \i % 5 == 4 351 + .elseif \i >= 0 && \i < 20 && \i % 5 == 4 359 352 vpxor V5, NEXT_TWEAK, NEXT_TWEAK 360 353 .elseif \i == 1000 361 354 vmovdqa NEXT_TWEAK0, TWEAK0 ··· 371 364 // when VL > 16 (which it is here), the needed shift amounts are byte-aligned, 372 365 // which allows the use of vpsrldq and vpslldq to do 128-bit wide shifts. 
373 366 .macro _tweak_step_pclmul i 374 - .if \i == 2 367 + .if \i == 0 375 368 vpsrldq $(128 - 4*VL/16) / 8, TWEAK0, NEXT_TWEAK0 376 - .elseif \i == 4 369 + .elseif \i == 2 377 370 vpsrldq $(128 - 4*VL/16) / 8, TWEAK1, NEXT_TWEAK1 378 - .elseif \i == 6 371 + .elseif \i == 4 379 372 vpsrldq $(128 - 4*VL/16) / 8, TWEAK2, NEXT_TWEAK2 380 - .elseif \i == 8 373 + .elseif \i == 6 381 374 vpsrldq $(128 - 4*VL/16) / 8, TWEAK3, NEXT_TWEAK3 382 - .elseif \i == 10 375 + .elseif \i == 8 383 376 vpclmulqdq $0x00, GF_POLY, NEXT_TWEAK0, NEXT_TWEAK0 384 - .elseif \i == 12 377 + .elseif \i == 10 385 378 vpclmulqdq $0x00, GF_POLY, NEXT_TWEAK1, NEXT_TWEAK1 386 - .elseif \i == 14 379 + .elseif \i == 12 387 380 vpclmulqdq $0x00, GF_POLY, NEXT_TWEAK2, NEXT_TWEAK2 388 - .elseif \i == 16 381 + .elseif \i == 14 389 382 vpclmulqdq $0x00, GF_POLY, NEXT_TWEAK3, NEXT_TWEAK3 390 383 .elseif \i == 1000 391 384 vpslldq $(4*VL/16) / 8, TWEAK0, TWEAK0 ··· 400 393 .endm 401 394 402 395 // _tweak_step does one step of the computation of the next set of tweaks from 403 - // TWEAK[0-3]. To complete all steps, this must be invoked with \i values 0 404 - // through at least 19, then 1000 which signals the last step. 396 + // TWEAK[0-3]. To complete all steps, this is invoked with increasing values of 397 + // \i that include at least 0 through 19, then 1000 which signals the last step. 405 398 // 406 399 // This is used to interleave the computation of the next set of tweaks with the 407 400 // AES en/decryptions, which increases performance in some cases. ··· 413 406 .endif 414 407 .endm 415 408 416 - // Load the round keys: just the first one if !USE_AVX10, otherwise all of them. 417 - .macro _load_round_keys 418 - _vbroadcast128 -7*16(KEY), KEY0 409 + .macro _setup_round_keys enc 410 + 411 + // Select either the encryption round keys or the decryption round keys. 412 + .if \enc 413 + .set OFFS, 0 414 + .else 415 + .set OFFS, 240 416 + .endif 417 + 418 + // Load the round key for "round 0". 
419 + _vbroadcast128 OFFS(KEY), KEY0 420 + 421 + // Increment KEY to make it so that 7*16(KEY) is the last round key. 422 + // For AES-128, increment by 3*16, resulting in the 10 round keys (not 423 + // counting the zero-th round key which was just loaded into KEY0) being 424 + // -2*16(KEY) through 7*16(KEY). For AES-192, increment by 5*16 and use 425 + // 12 round keys -4*16(KEY) through 7*16(KEY). For AES-256, increment 426 + // by 7*16 and use 14 round keys -6*16(KEY) through 7*16(KEY). 427 + // 428 + // This rebasing provides two benefits. First, it makes the offset to 429 + // any round key be in the range [-96, 112], fitting in a signed byte. 430 + // This shortens VEX-encoded instructions that access the later round 431 + // keys which otherwise would need 4-byte offsets. Second, it makes it 432 + // easy to do AES-128 and AES-192 by skipping irrelevant rounds at the 433 + // beginning. Skipping rounds at the end doesn't work as well because 434 + // the last round needs different instructions. 435 + // 436 + // An alternative approach would be to roll up all the round loops. We 437 + // don't do that because it isn't compatible with caching the round keys 438 + // in registers which we do when possible (see below), and also because 439 + // it seems unwise to rely *too* heavily on the CPU's branch predictor. 440 + lea OFFS-16(KEY, KEYLEN64, 4), KEY 441 + 442 + // If all 32 SIMD registers are available, cache all the round keys. 
419 443 .if USE_AVX10 444 + cmp $24, KEYLEN 445 + jl .Laes128\@ 446 + je .Laes192\@ 420 447 _vbroadcast128 -6*16(KEY), KEY1 421 448 _vbroadcast128 -5*16(KEY), KEY2 449 + .Laes192\@: 422 450 _vbroadcast128 -4*16(KEY), KEY3 423 451 _vbroadcast128 -3*16(KEY), KEY4 452 + .Laes128\@: 424 453 _vbroadcast128 -2*16(KEY), KEY5 425 454 _vbroadcast128 -1*16(KEY), KEY6 426 455 _vbroadcast128 0*16(KEY), KEY7 427 456 _vbroadcast128 1*16(KEY), KEY8 428 457 _vbroadcast128 2*16(KEY), KEY9 429 458 _vbroadcast128 3*16(KEY), KEY10 430 - // Note: if it's AES-128 or AES-192, the last several round keys won't 431 - // be used. We do the loads anyway to save a conditional jump. 432 459 _vbroadcast128 4*16(KEY), KEY11 433 460 _vbroadcast128 5*16(KEY), KEY12 434 461 _vbroadcast128 6*16(KEY), KEY13 ··· 507 466 508 467 // Do a single round of AES en/decryption on the blocks in registers V0-V3, 509 468 // using the same key for all blocks. The round key is loaded from the 510 - // appropriate register or memory location for round \i. In addition, does step 511 - // \i of the computation of the next set of tweaks. May clobber V4. 469 + // appropriate register or memory location for round \i. In addition, does two 470 + // steps of the computation of the next set of tweaks. May clobber V4. 
512 471 .macro _vaes_4x enc, last, i 513 472 .if USE_AVX10 514 - _tweak_step (2*(\i-1)) 473 + _tweak_step (2*(\i-5)) 515 474 _vaes \enc, \last, KEY\i, V0 516 475 _vaes \enc, \last, KEY\i, V1 517 - _tweak_step (2*(\i-1) + 1) 476 + _tweak_step (2*(\i-5) + 1) 518 477 _vaes \enc, \last, KEY\i, V2 519 478 _vaes \enc, \last, KEY\i, V3 520 479 .else 521 480 _vbroadcast128 (\i-7)*16(KEY), V4 522 - _tweak_step (2*(\i-1)) 481 + _tweak_step (2*(\i-5)) 523 482 _vaes \enc, \last, V4, V0 524 483 _vaes \enc, \last, V4, V1 525 - _tweak_step (2*(\i-1) + 1) 484 + _tweak_step (2*(\i-5) + 1) 526 485 _vaes \enc, \last, V4, V2 527 486 _vaes \enc, \last, V4, V3 528 487 .endif ··· 534 493 // length VL, use V* registers and leave \xmm_suffix empty. May clobber V4. 535 494 .macro _aes_crypt enc, xmm_suffix, tweak, data 536 495 _xor3 KEY0\xmm_suffix, \tweak, \data 496 + cmp $24, KEYLEN 497 + jl .Laes128\@ 498 + je .Laes192\@ 537 499 _vaes_1x \enc, 0, 1, \xmm_suffix, \data 538 500 _vaes_1x \enc, 0, 2, \xmm_suffix, \data 501 + .Laes192\@: 539 502 _vaes_1x \enc, 0, 3, \xmm_suffix, \data 540 503 _vaes_1x \enc, 0, 4, \xmm_suffix, \data 504 + .Laes128\@: 541 505 _vaes_1x \enc, 0, 5, \xmm_suffix, \data 542 506 _vaes_1x \enc, 0, 6, \xmm_suffix, \data 543 507 _vaes_1x \enc, 0, 7, \xmm_suffix, \data 544 508 _vaes_1x \enc, 0, 8, \xmm_suffix, \data 545 509 _vaes_1x \enc, 0, 9, \xmm_suffix, \data 546 - cmp $24, KEYLEN 547 - jle .Laes_128_or_192\@ 548 510 _vaes_1x \enc, 0, 10, \xmm_suffix, \data 549 511 _vaes_1x \enc, 0, 11, \xmm_suffix, \data 550 512 _vaes_1x \enc, 0, 12, \xmm_suffix, \data 551 513 _vaes_1x \enc, 0, 13, \xmm_suffix, \data 552 514 _vaes_1x \enc, 1, 14, \xmm_suffix, \data 553 - jmp .Laes_done\@ 554 - .Laes_128_or_192\@: 555 - je .Laes_192\@ 556 - _vaes_1x \enc, 1, 10, \xmm_suffix, \data 557 - jmp .Laes_done\@ 558 - .Laes_192\@: 559 - _vaes_1x \enc, 0, 10, \xmm_suffix, \data 560 - _vaes_1x \enc, 0, 11, \xmm_suffix, \data 561 - _vaes_1x \enc, 1, 12, \xmm_suffix, \data 562 - .Laes_done\@: 563 
515 _vpxor \tweak, \data, \data 564 516 .endm 565 517 ··· 562 528 // Load the AES key length: 16 (AES-128), 24 (AES-192), or 32 (AES-256). 563 529 movl 480(KEY), KEYLEN 564 530 565 - // Advance KEY to point to the 7th encryption round key (if encrypting) 566 - // or the 7th decryption round key (if decrypting). This makes the 567 - // offset to any round key be in the range [-112, 112], fitting in a 568 - // signed byte. This shortens VEX-encoded instructions that access the 569 - // 8th and later round keys which otherwise would need 4-byte offsets. 570 - .if \enc 571 - add $7*16, KEY 572 - .else 573 - add $(15+7)*16, KEY 574 - 531 + .if !\enc 575 532 // When decrypting a message whose length isn't a multiple of the AES 576 533 // block length, exclude the last full block from the main loop by 577 534 // subtracting 16 from LEN. This is needed because ciphertext stealing ··· 573 548 .Lxts_init\@: 574 549 .endif 575 550 576 - // Cache as many round keys as possible. 577 - _load_round_keys 551 + // Setup the pointer to the round keys and cache as many as possible. 552 + _setup_round_keys \enc 578 553 579 554 // Compute the first set of tweaks TWEAK[0-3]. 580 555 _compute_first_set_of_tweaks ··· 585 560 .Lmain_loop\@: 586 561 // This is the main loop, en/decrypting 4*VL bytes per iteration. 587 562 588 - // XOR each source block with its tweak and the first round key. 563 + // XOR each source block with its tweak and the zero-th round key. 589 564 .if USE_AVX10 590 565 vmovdqu8 0*VL(SRC), V0 591 566 vmovdqu8 1*VL(SRC), V1 ··· 605 580 vpxor TWEAK2, V2, V2 606 581 vpxor TWEAK3, V3, V3 607 582 .endif 583 + cmp $24, KEYLEN 584 + jl .Laes128\@ 585 + je .Laes192\@ 608 586 // Do all the AES rounds on the data blocks, interleaved with 609 587 // the computation of the next set of tweaks. 
610 588 _vaes_4x \enc, 0, 1 611 589 _vaes_4x \enc, 0, 2 590 + .Laes192\@: 612 591 _vaes_4x \enc, 0, 3 613 592 _vaes_4x \enc, 0, 4 593 + .Laes128\@: 614 594 _vaes_4x \enc, 0, 5 615 595 _vaes_4x \enc, 0, 6 616 596 _vaes_4x \enc, 0, 7 617 597 _vaes_4x \enc, 0, 8 618 598 _vaes_4x \enc, 0, 9 619 - // Try to optimize for AES-256 by keeping the code for AES-128 and 620 - // AES-192 out-of-line. 621 - cmp $24, KEYLEN 622 - jle .Lencrypt_4x_aes_128_or_192\@ 623 599 _vaes_4x \enc, 0, 10 624 600 _vaes_4x \enc, 0, 11 625 601 _vaes_4x \enc, 0, 12 626 602 _vaes_4x \enc, 0, 13 627 603 _vaes_4x \enc, 1, 14 628 - .Lencrypt_4x_done\@: 629 604 630 605 // XOR in the tweaks again. 631 606 _vpxor TWEAK0, V0, V0 ··· 703 678 jnz .Lcts\@ 704 679 jmp .Ldone\@ 705 680 706 - // Out-of-line handling of AES-128 and AES-192 707 - .Lencrypt_4x_aes_128_or_192\@: 708 - jz .Lencrypt_4x_aes_192\@ 709 - _vaes_4x \enc, 1, 10 710 - jmp .Lencrypt_4x_done\@ 711 - .Lencrypt_4x_aes_192\@: 712 - _vaes_4x \enc, 0, 10 713 - _vaes_4x \enc, 0, 11 714 - _vaes_4x \enc, 1, 12 715 - jmp .Lencrypt_4x_done\@ 716 - 717 681 .if !\enc 718 682 .Lneed_cts_dec\@: 719 683 sub $16, LEN ··· 778 764 // u8 iv[AES_BLOCK_SIZE]); 779 765 SYM_TYPED_FUNC_START(aes_xts_encrypt_iv) 780 766 vmovdqu (%rsi), %xmm0 781 - add $7*16, %rdi 782 - vpxor -7*16(%rdi), %xmm0, %xmm0 767 + vpxor (%rdi), %xmm0, %xmm0 768 + movl 480(%rdi), %eax // AES key length 769 + lea -16(%rdi, %rax, 4), %rdi 770 + cmp $24, %eax 771 + jl .Lencrypt_iv_aes128 772 + je .Lencrypt_iv_aes192 783 773 vaesenc -6*16(%rdi), %xmm0, %xmm0 784 774 vaesenc -5*16(%rdi), %xmm0, %xmm0 775 + .Lencrypt_iv_aes192: 785 776 vaesenc -4*16(%rdi), %xmm0, %xmm0 786 777 vaesenc -3*16(%rdi), %xmm0, %xmm0 778 + .Lencrypt_iv_aes128: 787 779 vaesenc -2*16(%rdi), %xmm0, %xmm0 788 780 vaesenc -1*16(%rdi), %xmm0, %xmm0 789 781 vaesenc 0*16(%rdi), %xmm0, %xmm0 790 782 vaesenc 1*16(%rdi), %xmm0, %xmm0 791 783 vaesenc 2*16(%rdi), %xmm0, %xmm0 792 - cmpl $24, 480-(7*16)(%rdi) 793 - jle 
.Lencrypt_iv_aes_128_or_192 794 784 vaesenc 3*16(%rdi), %xmm0, %xmm0 795 785 vaesenc 4*16(%rdi), %xmm0, %xmm0 796 786 vaesenc 5*16(%rdi), %xmm0, %xmm0 797 787 vaesenc 6*16(%rdi), %xmm0, %xmm0 798 788 vaesenclast 7*16(%rdi), %xmm0, %xmm0 799 - .Lencrypt_iv_done: 800 789 vmovdqu %xmm0, (%rsi) 801 790 RET 802 - 803 - // Out-of-line handling of AES-128 and AES-192 804 - .Lencrypt_iv_aes_128_or_192: 805 - jz .Lencrypt_iv_aes_192 806 - vaesenclast 3*16(%rdi), %xmm0, %xmm0 807 - jmp .Lencrypt_iv_done 808 - .Lencrypt_iv_aes_192: 809 - vaesenc 3*16(%rdi), %xmm0, %xmm0 810 - vaesenc 4*16(%rdi), %xmm0, %xmm0 811 - vaesenclast 5*16(%rdi), %xmm0, %xmm0 812 - jmp .Lencrypt_iv_done 813 791 SYM_FUNC_END(aes_xts_encrypt_iv) 814 792 815 793 // Below are the actual AES-XTS encryption and decryption functions,