Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

crypto: arm/chacha-neon - optimize for non-block size multiples

The current NEON based ChaCha implementation for ARM is optimized for
multiples of 4x the ChaCha block size (64 bytes). This makes sense for
block encryption, but given that ChaCha is also often used in the
context of networking, it makes sense to consider arbitrary length
inputs as well.

For example, WireGuard typically uses 1420 byte packets, and performing
ChaCha encryption involves 5 invocations of chacha_4block_xor_neon()
and 3 invocations of chacha_block_xor_neon(), where the last one also
involves a memcpy() using a buffer on the stack to process the final
chunk of 1420 % 64 == 12 bytes.

Let's optimize for this case as well, by letting chacha_4block_xor_neon()
deal with any input size between 64 and 256 bytes, using NEON permutation
instructions and overlapping loads and stores. This way, the 140 byte
tail of a 1420 byte input buffer can simply be processed in one go.

This results in the following performance improvements for 1420 byte
blocks, without significant impact on power-of-2 input sizes. (Note
that the Raspberry Pi is widely used in combination with a 32-bit
kernel, even though the core is 64-bit capable.)

Cortex-A8 (BeagleBone) : 7%
Cortex-A15 (Calxeda Midway) : 21%
Cortex-A53 (Raspberry Pi 3) : 3%
Cortex-A72 (Raspberry Pi 4) : 19%

Cc: Eric Biggers <ebiggers@google.com>
Cc: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

authored by

Ard Biesheuvel and committed by
Herbert Xu
86cd97ec ec3c5b32

+107 -24
+17 -17
arch/arm/crypto/chacha-glue.c
··· 23 23 asmlinkage void chacha_block_xor_neon(const u32 *state, u8 *dst, const u8 *src, 24 24 int nrounds); 25 25 asmlinkage void chacha_4block_xor_neon(const u32 *state, u8 *dst, const u8 *src, 26 - int nrounds); 26 + int nrounds, unsigned int nbytes); 27 27 asmlinkage void hchacha_block_arm(const u32 *state, u32 *out, int nrounds); 28 28 asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds); 29 29 ··· 42 42 { 43 43 u8 buf[CHACHA_BLOCK_SIZE]; 44 44 45 - while (bytes >= CHACHA_BLOCK_SIZE * 4) { 46 - chacha_4block_xor_neon(state, dst, src, nrounds); 47 - bytes -= CHACHA_BLOCK_SIZE * 4; 48 - src += CHACHA_BLOCK_SIZE * 4; 49 - dst += CHACHA_BLOCK_SIZE * 4; 50 - state[12] += 4; 51 - } 52 - while (bytes >= CHACHA_BLOCK_SIZE) { 53 - chacha_block_xor_neon(state, dst, src, nrounds); 54 - bytes -= CHACHA_BLOCK_SIZE; 55 - src += CHACHA_BLOCK_SIZE; 56 - dst += CHACHA_BLOCK_SIZE; 57 - state[12]++; 45 + while (bytes > CHACHA_BLOCK_SIZE) { 46 + unsigned int l = min(bytes, CHACHA_BLOCK_SIZE * 4U); 47 + 48 + chacha_4block_xor_neon(state, dst, src, nrounds, l); 49 + bytes -= l; 50 + src += l; 51 + dst += l; 52 + state[12] += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE); 58 53 } 59 54 if (bytes) { 60 - memcpy(buf, src, bytes); 61 - chacha_block_xor_neon(state, buf, buf, nrounds); 62 - memcpy(dst, buf, bytes); 55 + const u8 *s = src; 56 + u8 *d = dst; 57 + 58 + if (bytes != CHACHA_BLOCK_SIZE) 59 + s = d = memcpy(buf, src, bytes); 60 + chacha_block_xor_neon(state, d, s, nrounds); 61 + if (d != dst) 62 + memcpy(dst, buf, bytes); 63 63 } 64 64 } 65 65
+90 -7
arch/arm/crypto/chacha-neon-core.S
··· 47 47 */ 48 48 49 49 #include <linux/linkage.h> 50 + #include <asm/cache.h> 50 51 51 52 .text 52 53 .fpu neon ··· 206 205 207 206 .align 5 208 207 ENTRY(chacha_4block_xor_neon) 209 - push {r4-r5} 208 + push {r4, lr} 210 209 mov r4, sp // preserve the stack pointer 211 210 sub ip, sp, #0x20 // allocate a 32 byte buffer 212 211 bic ip, ip, #0x1f // aligned to 32 bytes ··· 230 229 vld1.32 {q0-q1}, [r0] 231 230 vld1.32 {q2-q3}, [ip] 232 231 233 - adr r5, .Lctrinc 232 + adr lr, .Lctrinc 234 233 vdup.32 q15, d7[1] 235 234 vdup.32 q14, d7[0] 236 - vld1.32 {q4}, [r5, :128] 235 + vld1.32 {q4}, [lr, :128] 237 236 vdup.32 q13, d6[1] 238 237 vdup.32 q12, d6[0] 239 238 vdup.32 q11, d5[1] ··· 456 455 457 456 // Re-interleave the words in the first two rows of each block (x0..7). 458 457 // Also add the counter values 0-3 to x12[0-3]. 459 - vld1.32 {q8}, [r5, :128] // load counter values 0-3 458 + vld1.32 {q8}, [lr, :128] // load counter values 0-3 460 459 vzip.32 q0, q1 // => (0 1 0 1) (0 1 0 1) 461 460 vzip.32 q2, q3 // => (2 3 2 3) (2 3 2 3) 462 461 vzip.32 q4, q5 // => (4 5 4 5) (4 5 4 5) ··· 494 493 495 494 // Re-interleave the words in the last two rows of each block (x8..15). 496 495 vld1.32 {q8-q9}, [sp, :256] 496 + mov sp, r4 // restore original stack pointer 497 + ldr r4, [r4, #8] // load number of bytes 497 498 vzip.32 q12, q13 // => (12 13 12 13) (12 13 12 13) 498 499 vzip.32 q14, q15 // => (14 15 14 15) (14 15 14 15) 499 500 vzip.32 q8, q9 // => (8 9 8 9) (8 9 8 9) ··· 523 520 // XOR the rest of the data with the keystream 524 521 525 522 vld1.8 {q0-q1}, [r2]! 523 + subs r4, r4, #96 526 524 veor q0, q0, q8 527 525 veor q1, q1, q12 526 + ble .Lle96 528 527 vst1.8 {q0-q1}, [r1]! 529 528 530 529 vld1.8 {q0-q1}, [r2]! 530 + subs r4, r4, #32 531 531 veor q0, q0, q2 532 532 veor q1, q1, q6 533 + ble .Lle128 533 534 vst1.8 {q0-q1}, [r1]! 534 535 535 536 vld1.8 {q0-q1}, [r2]! 
537 + subs r4, r4, #32 536 538 veor q0, q0, q10 537 539 veor q1, q1, q14 540 + ble .Lle160 538 541 vst1.8 {q0-q1}, [r1]! 539 542 540 543 vld1.8 {q0-q1}, [r2]! 544 + subs r4, r4, #32 541 545 veor q0, q0, q4 542 546 veor q1, q1, q5 547 + ble .Lle192 543 548 vst1.8 {q0-q1}, [r1]! 544 549 545 550 vld1.8 {q0-q1}, [r2]! 551 + subs r4, r4, #32 546 552 veor q0, q0, q9 547 553 veor q1, q1, q13 554 + ble .Lle224 548 555 vst1.8 {q0-q1}, [r1]! 549 556 550 557 vld1.8 {q0-q1}, [r2]! 558 + subs r4, r4, #32 551 559 veor q0, q0, q3 552 560 veor q1, q1, q7 561 + blt .Llt256 562 + .Lout: 553 563 vst1.8 {q0-q1}, [r1]! 554 564 555 565 vld1.8 {q0-q1}, [r2] 556 - mov sp, r4 // restore original stack pointer 557 566 veor q0, q0, q11 558 567 veor q1, q1, q15 559 568 vst1.8 {q0-q1}, [r1] 560 569 561 - pop {r4-r5} 562 - bx lr 570 + pop {r4, pc} 571 + 572 + .Lle192: 573 + vmov q4, q9 574 + vmov q5, q13 575 + 576 + .Lle160: 577 + // nothing to do 578 + 579 + .Lfinalblock: 580 + // Process the final block if processing less than 4 full blocks. 581 + // Entered with 32 bytes of ChaCha cipher stream in q4-q5, and the 582 + // previous 32 byte output block that still needs to be written at 583 + // [r1] in q0-q1. 
584 + beq .Lfullblock 585 + 586 + .Lpartialblock: 587 + adr lr, .Lpermute + 32 588 + add r2, r2, r4 589 + add lr, lr, r4 590 + add r4, r4, r1 591 + 592 + vld1.8 {q2-q3}, [lr] 593 + vld1.8 {q6-q7}, [r2] 594 + 595 + add r4, r4, #32 596 + 597 + vtbl.8 d4, {q4-q5}, d4 598 + vtbl.8 d5, {q4-q5}, d5 599 + vtbl.8 d6, {q4-q5}, d6 600 + vtbl.8 d7, {q4-q5}, d7 601 + 602 + veor q6, q6, q2 603 + veor q7, q7, q3 604 + 605 + vst1.8 {q6-q7}, [r4] // overlapping stores 606 + vst1.8 {q0-q1}, [r1] 607 + pop {r4, pc} 608 + 609 + .Lfullblock: 610 + vmov q11, q4 611 + vmov q15, q5 612 + b .Lout 613 + .Lle96: 614 + vmov q4, q2 615 + vmov q5, q6 616 + b .Lfinalblock 617 + .Lle128: 618 + vmov q4, q10 619 + vmov q5, q14 620 + b .Lfinalblock 621 + .Lle224: 622 + vmov q4, q3 623 + vmov q5, q7 624 + b .Lfinalblock 625 + .Llt256: 626 + vmov q4, q11 627 + vmov q5, q15 628 + b .Lpartialblock 563 629 ENDPROC(chacha_4block_xor_neon) 630 + 631 + .align L1_CACHE_SHIFT 632 + .Lpermute: 633 + .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 634 + .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f 635 + .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 636 + .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f 637 + .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 638 + .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f 639 + .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 640 + .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f