Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

crypto: x86/chacha20 - Add a 4-block AVX2 variant

This variant builds upon the idea of the 2-block AVX2 variant that
shuffles words after each round. The shuffling has a rather high latency,
so the arithmetic units are not optimally used.

Given that we have plenty of registers in AVX, this version parallelizes
the 2-block variant to do four blocks. While the first two blocks are
shuffling, the CPU can do the XORing on the second two blocks and
vice-versa, which makes this version much faster than the SSSE3 variant
for four blocks. The latter is now mostly for systems that do not have
AVX2, but there it is the work-horse, so we keep it in place.

The partial XORing function trailer is very similar to the AVX2 2-block
variant. While it could be shared, that code segment is rather short;
profiling is also easier with the trailer integrated, so we keep it per
function.

Signed-off-by: Martin Willi <martin@strongswan.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

authored by

Martin Willi and committed by
Herbert Xu
8a5a79d5 a5dd97f8

+317
+310
arch/x86/crypto/chacha20-avx2-x86_64.S
··· 31 31 CTR2BL: .octa 0x00000000000000000000000000000000 32 32 .octa 0x00000000000000000000000000000001 33 33 34 + .section .rodata.cst32.CTR4BL, "aM", @progbits, 32 35 + .align 32 36 + CTR4BL: .octa 0x00000000000000000000000000000002 37 + .octa 0x00000000000000000000000000000003 38 + 34 39 .text 35 40 36 41 ENTRY(chacha20_2block_xor_avx2) ··· 229 224 jmp .Ldone2 230 225 231 226 ENDPROC(chacha20_2block_xor_avx2) 227 + 228 + ENTRY(chacha20_4block_xor_avx2) 229 + # %rdi: Input state matrix, s 230 + # %rsi: up to 4 data blocks output, o 231 + # %rdx: up to 4 data blocks input, i 232 + # %rcx: input/output length in bytes 233 + 234 + # This function encrypts four ChaCha20 blocks by loading the state 235 + # matrix four times across eight AVX registers. It performs matrix 236 + # operations on four words in two matrices in parallel, sequentially 237 + # to the operations on the four words of the other two matrices. The 238 + # required word shuffling has a rather high latency, but we can do the 239 + # arithmetic on two matrix-pairs without much slowdown. 
240 + 241 + vzeroupper 242 + 243 + # x0..3[0-4] = s0..3 244 + vbroadcasti128 0x00(%rdi),%ymm0 245 + vbroadcasti128 0x10(%rdi),%ymm1 246 + vbroadcasti128 0x20(%rdi),%ymm2 247 + vbroadcasti128 0x30(%rdi),%ymm3 248 + 249 + vmovdqa %ymm0,%ymm4 250 + vmovdqa %ymm1,%ymm5 251 + vmovdqa %ymm2,%ymm6 252 + vmovdqa %ymm3,%ymm7 253 + 254 + vpaddd CTR2BL(%rip),%ymm3,%ymm3 255 + vpaddd CTR4BL(%rip),%ymm7,%ymm7 256 + 257 + vmovdqa %ymm0,%ymm11 258 + vmovdqa %ymm1,%ymm12 259 + vmovdqa %ymm2,%ymm13 260 + vmovdqa %ymm3,%ymm14 261 + vmovdqa %ymm7,%ymm15 262 + 263 + vmovdqa ROT8(%rip),%ymm8 264 + vmovdqa ROT16(%rip),%ymm9 265 + 266 + mov %rcx,%rax 267 + mov $10,%ecx 268 + 269 + .Ldoubleround4: 270 + 271 + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) 272 + vpaddd %ymm1,%ymm0,%ymm0 273 + vpxor %ymm0,%ymm3,%ymm3 274 + vpshufb %ymm9,%ymm3,%ymm3 275 + 276 + vpaddd %ymm5,%ymm4,%ymm4 277 + vpxor %ymm4,%ymm7,%ymm7 278 + vpshufb %ymm9,%ymm7,%ymm7 279 + 280 + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) 281 + vpaddd %ymm3,%ymm2,%ymm2 282 + vpxor %ymm2,%ymm1,%ymm1 283 + vmovdqa %ymm1,%ymm10 284 + vpslld $12,%ymm10,%ymm10 285 + vpsrld $20,%ymm1,%ymm1 286 + vpor %ymm10,%ymm1,%ymm1 287 + 288 + vpaddd %ymm7,%ymm6,%ymm6 289 + vpxor %ymm6,%ymm5,%ymm5 290 + vmovdqa %ymm5,%ymm10 291 + vpslld $12,%ymm10,%ymm10 292 + vpsrld $20,%ymm5,%ymm5 293 + vpor %ymm10,%ymm5,%ymm5 294 + 295 + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) 296 + vpaddd %ymm1,%ymm0,%ymm0 297 + vpxor %ymm0,%ymm3,%ymm3 298 + vpshufb %ymm8,%ymm3,%ymm3 299 + 300 + vpaddd %ymm5,%ymm4,%ymm4 301 + vpxor %ymm4,%ymm7,%ymm7 302 + vpshufb %ymm8,%ymm7,%ymm7 303 + 304 + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) 305 + vpaddd %ymm3,%ymm2,%ymm2 306 + vpxor %ymm2,%ymm1,%ymm1 307 + vmovdqa %ymm1,%ymm10 308 + vpslld $7,%ymm10,%ymm10 309 + vpsrld $25,%ymm1,%ymm1 310 + vpor %ymm10,%ymm1,%ymm1 311 + 312 + vpaddd %ymm7,%ymm6,%ymm6 313 + vpxor %ymm6,%ymm5,%ymm5 314 + vmovdqa %ymm5,%ymm10 315 + vpslld $7,%ymm10,%ymm10 316 + vpsrld $25,%ymm5,%ymm5 317 + vpor %ymm10,%ymm5,%ymm5 318 + 319 + # x1 
= shuffle32(x1, MASK(0, 3, 2, 1)) 320 + vpshufd $0x39,%ymm1,%ymm1 321 + vpshufd $0x39,%ymm5,%ymm5 322 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) 323 + vpshufd $0x4e,%ymm2,%ymm2 324 + vpshufd $0x4e,%ymm6,%ymm6 325 + # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) 326 + vpshufd $0x93,%ymm3,%ymm3 327 + vpshufd $0x93,%ymm7,%ymm7 328 + 329 + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) 330 + vpaddd %ymm1,%ymm0,%ymm0 331 + vpxor %ymm0,%ymm3,%ymm3 332 + vpshufb %ymm9,%ymm3,%ymm3 333 + 334 + vpaddd %ymm5,%ymm4,%ymm4 335 + vpxor %ymm4,%ymm7,%ymm7 336 + vpshufb %ymm9,%ymm7,%ymm7 337 + 338 + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) 339 + vpaddd %ymm3,%ymm2,%ymm2 340 + vpxor %ymm2,%ymm1,%ymm1 341 + vmovdqa %ymm1,%ymm10 342 + vpslld $12,%ymm10,%ymm10 343 + vpsrld $20,%ymm1,%ymm1 344 + vpor %ymm10,%ymm1,%ymm1 345 + 346 + vpaddd %ymm7,%ymm6,%ymm6 347 + vpxor %ymm6,%ymm5,%ymm5 348 + vmovdqa %ymm5,%ymm10 349 + vpslld $12,%ymm10,%ymm10 350 + vpsrld $20,%ymm5,%ymm5 351 + vpor %ymm10,%ymm5,%ymm5 352 + 353 + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) 354 + vpaddd %ymm1,%ymm0,%ymm0 355 + vpxor %ymm0,%ymm3,%ymm3 356 + vpshufb %ymm8,%ymm3,%ymm3 357 + 358 + vpaddd %ymm5,%ymm4,%ymm4 359 + vpxor %ymm4,%ymm7,%ymm7 360 + vpshufb %ymm8,%ymm7,%ymm7 361 + 362 + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) 363 + vpaddd %ymm3,%ymm2,%ymm2 364 + vpxor %ymm2,%ymm1,%ymm1 365 + vmovdqa %ymm1,%ymm10 366 + vpslld $7,%ymm10,%ymm10 367 + vpsrld $25,%ymm1,%ymm1 368 + vpor %ymm10,%ymm1,%ymm1 369 + 370 + vpaddd %ymm7,%ymm6,%ymm6 371 + vpxor %ymm6,%ymm5,%ymm5 372 + vmovdqa %ymm5,%ymm10 373 + vpslld $7,%ymm10,%ymm10 374 + vpsrld $25,%ymm5,%ymm5 375 + vpor %ymm10,%ymm5,%ymm5 376 + 377 + # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) 378 + vpshufd $0x93,%ymm1,%ymm1 379 + vpshufd $0x93,%ymm5,%ymm5 380 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) 381 + vpshufd $0x4e,%ymm2,%ymm2 382 + vpshufd $0x4e,%ymm6,%ymm6 383 + # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) 384 + vpshufd $0x39,%ymm3,%ymm3 385 + vpshufd $0x39,%ymm7,%ymm7 386 + 387 + dec %ecx 388 + jnz .Ldoubleround4 389 
+ 390 + # o0 = i0 ^ (x0 + s0), first block 391 + vpaddd %ymm11,%ymm0,%ymm10 392 + cmp $0x10,%rax 393 + jl .Lxorpart4 394 + vpxor 0x00(%rdx),%xmm10,%xmm9 395 + vmovdqu %xmm9,0x00(%rsi) 396 + vextracti128 $1,%ymm10,%xmm0 397 + # o1 = i1 ^ (x1 + s1), first block 398 + vpaddd %ymm12,%ymm1,%ymm10 399 + cmp $0x20,%rax 400 + jl .Lxorpart4 401 + vpxor 0x10(%rdx),%xmm10,%xmm9 402 + vmovdqu %xmm9,0x10(%rsi) 403 + vextracti128 $1,%ymm10,%xmm1 404 + # o2 = i2 ^ (x2 + s2), first block 405 + vpaddd %ymm13,%ymm2,%ymm10 406 + cmp $0x30,%rax 407 + jl .Lxorpart4 408 + vpxor 0x20(%rdx),%xmm10,%xmm9 409 + vmovdqu %xmm9,0x20(%rsi) 410 + vextracti128 $1,%ymm10,%xmm2 411 + # o3 = i3 ^ (x3 + s3), first block 412 + vpaddd %ymm14,%ymm3,%ymm10 413 + cmp $0x40,%rax 414 + jl .Lxorpart4 415 + vpxor 0x30(%rdx),%xmm10,%xmm9 416 + vmovdqu %xmm9,0x30(%rsi) 417 + vextracti128 $1,%ymm10,%xmm3 418 + 419 + # xor and write second block 420 + vmovdqa %xmm0,%xmm10 421 + cmp $0x50,%rax 422 + jl .Lxorpart4 423 + vpxor 0x40(%rdx),%xmm10,%xmm9 424 + vmovdqu %xmm9,0x40(%rsi) 425 + 426 + vmovdqa %xmm1,%xmm10 427 + cmp $0x60,%rax 428 + jl .Lxorpart4 429 + vpxor 0x50(%rdx),%xmm10,%xmm9 430 + vmovdqu %xmm9,0x50(%rsi) 431 + 432 + vmovdqa %xmm2,%xmm10 433 + cmp $0x70,%rax 434 + jl .Lxorpart4 435 + vpxor 0x60(%rdx),%xmm10,%xmm9 436 + vmovdqu %xmm9,0x60(%rsi) 437 + 438 + vmovdqa %xmm3,%xmm10 439 + cmp $0x80,%rax 440 + jl .Lxorpart4 441 + vpxor 0x70(%rdx),%xmm10,%xmm9 442 + vmovdqu %xmm9,0x70(%rsi) 443 + 444 + # o0 = i0 ^ (x0 + s0), third block 445 + vpaddd %ymm11,%ymm4,%ymm10 446 + cmp $0x90,%rax 447 + jl .Lxorpart4 448 + vpxor 0x80(%rdx),%xmm10,%xmm9 449 + vmovdqu %xmm9,0x80(%rsi) 450 + vextracti128 $1,%ymm10,%xmm4 451 + # o1 = i1 ^ (x1 + s1), third block 452 + vpaddd %ymm12,%ymm5,%ymm10 453 + cmp $0xa0,%rax 454 + jl .Lxorpart4 455 + vpxor 0x90(%rdx),%xmm10,%xmm9 456 + vmovdqu %xmm9,0x90(%rsi) 457 + vextracti128 $1,%ymm10,%xmm5 458 + # o2 = i2 ^ (x2 + s2), third block 459 + vpaddd %ymm13,%ymm6,%ymm10 460 + cmp 
$0xb0,%rax 461 + jl .Lxorpart4 462 + vpxor 0xa0(%rdx),%xmm10,%xmm9 463 + vmovdqu %xmm9,0xa0(%rsi) 464 + vextracti128 $1,%ymm10,%xmm6 465 + # o3 = i3 ^ (x3 + s3), third block 466 + vpaddd %ymm15,%ymm7,%ymm10 467 + cmp $0xc0,%rax 468 + jl .Lxorpart4 469 + vpxor 0xb0(%rdx),%xmm10,%xmm9 470 + vmovdqu %xmm9,0xb0(%rsi) 471 + vextracti128 $1,%ymm10,%xmm7 472 + 473 + # xor and write fourth block 474 + vmovdqa %xmm4,%xmm10 475 + cmp $0xd0,%rax 476 + jl .Lxorpart4 477 + vpxor 0xc0(%rdx),%xmm10,%xmm9 478 + vmovdqu %xmm9,0xc0(%rsi) 479 + 480 + vmovdqa %xmm5,%xmm10 481 + cmp $0xe0,%rax 482 + jl .Lxorpart4 483 + vpxor 0xd0(%rdx),%xmm10,%xmm9 484 + vmovdqu %xmm9,0xd0(%rsi) 485 + 486 + vmovdqa %xmm6,%xmm10 487 + cmp $0xf0,%rax 488 + jl .Lxorpart4 489 + vpxor 0xe0(%rdx),%xmm10,%xmm9 490 + vmovdqu %xmm9,0xe0(%rsi) 491 + 492 + vmovdqa %xmm7,%xmm10 493 + cmp $0x100,%rax 494 + jl .Lxorpart4 495 + vpxor 0xf0(%rdx),%xmm10,%xmm9 496 + vmovdqu %xmm9,0xf0(%rsi) 497 + 498 + .Ldone4: 499 + vzeroupper 500 + ret 501 + 502 + .Lxorpart4: 503 + # xor remaining bytes from partial register into output 504 + mov %rax,%r9 505 + and $0x0f,%r9 506 + jz .Ldone4 507 + and $~0x0f,%rax 508 + 509 + mov %rsi,%r11 510 + 511 + lea 8(%rsp),%r10 512 + sub $0x10,%rsp 513 + and $~31,%rsp 514 + 515 + lea (%rdx,%rax),%rsi 516 + mov %rsp,%rdi 517 + mov %r9,%rcx 518 + rep movsb 519 + 520 + vpxor 0x00(%rsp),%xmm10,%xmm10 521 + vmovdqa %xmm10,0x00(%rsp) 522 + 523 + mov %rsp,%rsi 524 + lea (%r11,%rax),%rdi 525 + mov %r9,%rcx 526 + rep movsb 527 + 528 + lea -8(%r10),%rsp 529 + jmp .Ldone4 530 + 531 + ENDPROC(chacha20_4block_xor_avx2) 232 532 233 533 ENTRY(chacha20_8block_xor_avx2) 234 534 # %rdi: Input state matrix, s
+7
arch/x86/crypto/chacha20_glue.c
··· 26 26 #ifdef CONFIG_AS_AVX2 27 27 asmlinkage void chacha20_2block_xor_avx2(u32 *state, u8 *dst, const u8 *src, 28 28 unsigned int len); 29 + asmlinkage void chacha20_4block_xor_avx2(u32 *state, u8 *dst, const u8 *src, 30 + unsigned int len); 29 31 asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src, 30 32 unsigned int len); 31 33 static bool chacha20_use_avx2; ··· 54 52 if (bytes > CHACHA20_BLOCK_SIZE * 4) { 55 53 chacha20_8block_xor_avx2(state, dst, src, bytes); 56 54 state[12] += chacha20_advance(bytes, 8); 55 + return; 56 + } 57 + if (bytes > CHACHA20_BLOCK_SIZE * 2) { 58 + chacha20_4block_xor_avx2(state, dst, src, bytes); 59 + state[12] += chacha20_advance(bytes, 4); 57 60 return; 58 61 } 59 62 if (bytes > CHACHA20_BLOCK_SIZE) {