Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

crypto: serpent-sse2 - change transpose_4x4 to only use integer instructions

The matrix transpose macro in serpent-sse2 uses a mix of SSE2 integer and SSE floating
point instructions, which might cause a performance penalty on some CPUs.

This patch replaces the transpose_4x4 macro with a version that uses only SSE2
integer instructions.

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Authored by Jussi Kivilinna; committed by Herbert Xu.
847cb7ef 4c58464b

+26 -32
+13 -16
arch/x86/crypto/serpent-sse2-i586-asm_32.S
@@ -463,22 +463,19 @@
 	pand x0, x4; \
 	pxor x2, x4;
 
-#define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \
-	movdqa x2, t3; \
-	movdqa x0, t1; \
-	unpcklps x3, t3; \
+#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
 	movdqa x0, t2; \
-	unpcklps x1, t1; \
-	unpckhps x1, t2; \
-	movdqa t3, x1; \
-	unpckhps x3, x2; \
-	movdqa t1, x0; \
-	movhlps t1, x1; \
-	movdqa t2, t1; \
-	movlhps t3, x0; \
-	movlhps x2, t1; \
-	movhlps t2, x2; \
-	movdqa x2, x3; \
-	movdqa t1, x2;
+	punpckldq x1, x0; \
+	punpckhdq x1, t2; \
+	movdqa x2, t1; \
+	punpckhdq x3, x2; \
+	punpckldq x3, t1; \
+	movdqa x0, x1; \
+	punpcklqdq t1, x0; \
+	punpckhqdq t1, x1; \
+	movdqa t2, x3; \
+	punpcklqdq x2, t2; \
+	punpckhqdq x2, x3; \
+	movdqa t2, x2;
 
 #define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
 	movdqu (0*4*4)(in), x0; \
+13 -16
arch/x86/crypto/serpent-sse2-x86_64-asm_64.S
@@ -585,22 +585,19 @@
 	get_key(i, 1, RK1); \
 	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
 
-#define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \
-	movdqa x2, t3; \
-	movdqa x0, t1; \
-	unpcklps x3, t3; \
+#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
 	movdqa x0, t2; \
-	unpcklps x1, t1; \
-	unpckhps x1, t2; \
-	movdqa t3, x1; \
-	unpckhps x3, x2; \
-	movdqa t1, x0; \
-	movhlps t1, x1; \
-	movdqa t2, t1; \
-	movlhps t3, x0; \
-	movlhps x2, t1; \
-	movhlps t2, x2; \
-	movdqa x2, x3; \
-	movdqa t1, x2;
+	punpckldq x1, x0; \
+	punpckhdq x1, t2; \
+	movdqa x2, t1; \
+	punpckhdq x3, x2; \
+	punpckldq x3, t1; \
+	movdqa x0, x1; \
+	punpcklqdq t1, x0; \
+	punpckhqdq t1, x1; \
+	movdqa t2, x3; \
+	punpcklqdq x2, t2; \
+	punpckhqdq x2, x3; \
+	movdqa t2, x2;
 
 #define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
 	movdqu (0*4*4)(in), x0; \