Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

crypto: arm/poly1305 - incorporate OpenSSL/CRYPTOGAMS NEON implementation

This is a straight import of the OpenSSL/CRYPTOGAMS Poly1305 implementation
for NEON authored by Andy Polyakov, and contributed by him to the OpenSSL
project. The file 'poly1305-armv4.pl' is taken straight from this upstream
GitHub repository [0] at commit ec55a08dc0244ce570c4fc7cade330c60798952f,
and already contains all the changes required to build it as part of a
Linux kernel module.

[0] https://github.com/dot-asm/cryptogams

Co-developed-by: Andy Polyakov <appro@cryptogams.org>
Signed-off-by: Andy Polyakov <appro@cryptogams.org>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Authored by Ard Biesheuvel and committed by Herbert Xu (commit f569ca16, parent a6b803b3)
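For orientation, here is a minimal sketch (not part of the patch) of the C-level prototypes for the routines the imported perlasm exports. The C glue file (poly1305-glue.c, referenced by the Makefile change below) is not included in this excerpt, so the argument layout is inferred from the register usage in poly1305-armv4.pl (r0=ctx, r1=input, r2=len, r3=padbit for the blocks routines; r0=ctx, r1=mac, r2=nonce for emit) and the names from the #defines emitted for __KERNEL__ builds.

#include <linux/types.h>

/* sketch only: prototypes inferred from the register usage in the perlasm */
void poly1305_init_arm(void *state, const u8 key[16]);
void poly1305_blocks_arm(void *state, const u8 *src, u32 len, u32 padbit);
void poly1305_emit_arm(void *state, u8 mac[16], const u32 nonce[4]);
/* assembled only when __LINUX_ARM_ARCH__ >= 7; it branches back to the
 * scalar blocks code for inputs shorter than 64 bytes */
void poly1305_blocks_neon(void *state, const u8 *src, u32 len, u32 padbit);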

2687 insertions(+), 2 deletions(-)

arch/arm/crypto/Kconfig (+5)
···
 	select CRYPTO_SKCIPHER
 	select CRYPTO_ARCH_HAVE_LIB_CHACHA
 
+config CRYPTO_POLY1305_ARM
+	tristate "Accelerated scalar and SIMD Poly1305 hash implementations"
+	select CRYPTO_HASH
+	select CRYPTO_ARCH_HAVE_LIB_POLY1305
+
 config CRYPTO_NHPOLY1305_NEON
 	tristate "NEON accelerated NHPoly1305 hash function (for Adiantum)"
 	depends on KERNEL_MODE_NEON
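The new tristate covers both the scalar and the SIMD code paths, and CRYPTO_ARCH_HAVE_LIB_POLY1305 advertises an arch-provided Poly1305 library backend. As a rough, hypothetical sketch of what the dispatch in the (not shown) glue code amounts to: NEON is only used in contexts where the kernel permits it, and poly1305_blocks_neon() itself branches back to the scalar code for anything shorter than 64 bytes. The helper name below is made up for illustration.

#include <crypto/internal/simd.h>	/* crypto_simd_usable() */
#include <linux/kernel.h>
#include <linux/types.h>

void poly1305_blocks_arm(void *state, const u8 *src, u32 len, u32 padbit);
void poly1305_blocks_neon(void *state, const u8 *src, u32 len, u32 padbit);

/* hypothetical helper, not taken from the patch */
static void poly1305_do_blocks(void *state, const u8 *src, u32 len, u32 padbit)
{
	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && crypto_simd_usable())
		poly1305_blocks_neon(state, src, len, padbit);
	else
		poly1305_blocks_arm(state, src, len, padbit);
}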
arch/arm/crypto/Makefile (+11, -1)
···
 obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
 obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha512-arm.o
 obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
+obj-$(CONFIG_CRYPTO_POLY1305_ARM) += poly1305-arm.o
 obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o
 
 obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
···
 crc32-arm-ce-y:= crc32-ce-core.o crc32-ce-glue.o
 chacha-neon-y := chacha-scalar-core.o chacha-glue.o
 chacha-neon-$(CONFIG_KERNEL_MODE_NEON) += chacha-neon-core.o
+poly1305-arm-y := poly1305-core.o poly1305-glue.o
 nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o
 
 ifdef REGENERATE_ARM_CRYPTO
 quiet_cmd_perl = PERL $@
       cmd_perl = $(PERL) $(<) > $(@)
+
+$(src)/poly1305-core.S_shipped: $(src)/poly1305-armv4.pl
+	$(call cmd,perl)
 
 $(src)/sha256-core.S_shipped: $(src)/sha256-armv4.pl
 	$(call cmd,perl)
···
 	$(call cmd,perl)
 endif
 
-clean-files += sha256-core.S sha512-core.S
+clean-files += poly1305-core.S sha256-core.S sha512-core.S
+
+# massage the perlasm code a bit so we only get the NEON routine if we need it
+poly1305-aflags-$(CONFIG_CPU_V7) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=5
+poly1305-aflags-$(CONFIG_KERNEL_MODE_NEON) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=7
+AFLAGS_poly1305-core.o += $(poly1305-aflags-y)
arch/arm/crypto/poly1305-armv4.pl (new file, +1236)
··· 1 + #!/usr/bin/env perl 2 + # SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause 3 + # 4 + # ==================================================================== 5 + # Written by Andy Polyakov, @dot-asm, initially for the OpenSSL 6 + # project. 7 + # ==================================================================== 8 + # 9 + # IALU(*)/gcc-4.4 NEON 10 + # 11 + # ARM11xx(ARMv6) 7.78/+100% - 12 + # Cortex-A5 6.35/+130% 3.00 13 + # Cortex-A8 6.25/+115% 2.36 14 + # Cortex-A9 5.10/+95% 2.55 15 + # Cortex-A15 3.85/+85% 1.25(**) 16 + # Snapdragon S4 5.70/+100% 1.48(**) 17 + # 18 + # (*) this is for -march=armv6, i.e. with bunch of ldrb loading data; 19 + # (**) these are trade-off results, they can be improved by ~8% but at 20 + # the cost of 15/12% regression on Cortex-A5/A7, it's even possible 21 + # to improve Cortex-A9 result, but then A5/A7 loose more than 20%; 22 + 23 + $flavour = shift; 24 + if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } 25 + else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } 26 + 27 + if ($flavour && $flavour ne "void") { 28 + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 29 + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or 30 + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or 31 + die "can't locate arm-xlate.pl"; 32 + 33 + open STDOUT,"| \"$^X\" $xlate $flavour $output"; 34 + } else { 35 + open STDOUT,">$output"; 36 + } 37 + 38 + ($ctx,$inp,$len,$padbit)=map("r$_",(0..3)); 39 + 40 + $code.=<<___; 41 + #ifndef __KERNEL__ 42 + # include "arm_arch.h" 43 + #else 44 + # define __ARM_ARCH__ __LINUX_ARM_ARCH__ 45 + # define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__ 46 + # define poly1305_init poly1305_init_arm 47 + # define poly1305_blocks poly1305_blocks_arm 48 + # define poly1305_emit poly1305_emit_arm 49 + .globl poly1305_blocks_neon 50 + #endif 51 + 52 + #if defined(__thumb2__) 53 + .syntax unified 54 + .thumb 55 + #else 56 + .code 32 57 + #endif 58 + 59 + .text 60 + 61 + .globl poly1305_emit 62 + .globl poly1305_blocks 63 + .globl poly1305_init 64 + .type poly1305_init,%function 65 + .align 5 66 + poly1305_init: 67 + .Lpoly1305_init: 68 + stmdb sp!,{r4-r11} 69 + 70 + eor r3,r3,r3 71 + cmp $inp,#0 72 + str r3,[$ctx,#0] @ zero hash value 73 + str r3,[$ctx,#4] 74 + str r3,[$ctx,#8] 75 + str r3,[$ctx,#12] 76 + str r3,[$ctx,#16] 77 + str r3,[$ctx,#36] @ clear is_base2_26 78 + add $ctx,$ctx,#20 79 + 80 + #ifdef __thumb2__ 81 + it eq 82 + #endif 83 + moveq r0,#0 84 + beq .Lno_key 85 + 86 + #if __ARM_MAX_ARCH__>=7 87 + mov r3,#-1 88 + str r3,[$ctx,#28] @ impossible key power value 89 + # ifndef __KERNEL__ 90 + adr r11,.Lpoly1305_init 91 + ldr r12,.LOPENSSL_armcap 92 + # endif 93 + #endif 94 + ldrb r4,[$inp,#0] 95 + mov r10,#0x0fffffff 96 + ldrb r5,[$inp,#1] 97 + and r3,r10,#-4 @ 0x0ffffffc 98 + ldrb r6,[$inp,#2] 99 + ldrb r7,[$inp,#3] 100 + orr r4,r4,r5,lsl#8 101 + ldrb r5,[$inp,#4] 102 + orr r4,r4,r6,lsl#16 103 + ldrb r6,[$inp,#5] 104 + orr r4,r4,r7,lsl#24 105 + ldrb r7,[$inp,#6] 106 + and r4,r4,r10 107 + 108 + #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) 109 + # if !defined(_WIN32) 110 + ldr r12,[r11,r12] @ OPENSSL_armcap_P 111 + # endif 112 + # if defined(__APPLE__) || defined(_WIN32) 113 + ldr r12,[r12] 114 + # endif 115 + #endif 116 + ldrb r8,[$inp,#7] 117 + orr r5,r5,r6,lsl#8 118 + ldrb r6,[$inp,#8] 119 + orr r5,r5,r7,lsl#16 120 + ldrb r7,[$inp,#9] 121 + orr r5,r5,r8,lsl#24 122 + ldrb r8,[$inp,#10] 123 + and r5,r5,r3 124 + 125 + #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) 126 + tst r12,#ARMV7_NEON @ check for NEON 127 + # 
ifdef __thumb2__ 128 + adr r9,.Lpoly1305_blocks_neon 129 + adr r11,.Lpoly1305_blocks 130 + it ne 131 + movne r11,r9 132 + adr r12,.Lpoly1305_emit 133 + orr r11,r11,#1 @ thumb-ify addresses 134 + orr r12,r12,#1 135 + # else 136 + add r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init) 137 + ite eq 138 + addeq r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init) 139 + addne r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init) 140 + # endif 141 + #endif 142 + ldrb r9,[$inp,#11] 143 + orr r6,r6,r7,lsl#8 144 + ldrb r7,[$inp,#12] 145 + orr r6,r6,r8,lsl#16 146 + ldrb r8,[$inp,#13] 147 + orr r6,r6,r9,lsl#24 148 + ldrb r9,[$inp,#14] 149 + and r6,r6,r3 150 + 151 + ldrb r10,[$inp,#15] 152 + orr r7,r7,r8,lsl#8 153 + str r4,[$ctx,#0] 154 + orr r7,r7,r9,lsl#16 155 + str r5,[$ctx,#4] 156 + orr r7,r7,r10,lsl#24 157 + str r6,[$ctx,#8] 158 + and r7,r7,r3 159 + str r7,[$ctx,#12] 160 + #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) 161 + stmia r2,{r11,r12} @ fill functions table 162 + mov r0,#1 163 + #else 164 + mov r0,#0 165 + #endif 166 + .Lno_key: 167 + ldmia sp!,{r4-r11} 168 + #if __ARM_ARCH__>=5 169 + ret @ bx lr 170 + #else 171 + tst lr,#1 172 + moveq pc,lr @ be binary compatible with V4, yet 173 + bx lr @ interoperable with Thumb ISA:-) 174 + #endif 175 + .size poly1305_init,.-poly1305_init 176 + ___ 177 + { 178 + my ($h0,$h1,$h2,$h3,$h4,$r0,$r1,$r2,$r3)=map("r$_",(4..12)); 179 + my ($s1,$s2,$s3)=($r1,$r2,$r3); 180 + 181 + $code.=<<___; 182 + .type poly1305_blocks,%function 183 + .align 5 184 + poly1305_blocks: 185 + .Lpoly1305_blocks: 186 + stmdb sp!,{r3-r11,lr} 187 + 188 + ands $len,$len,#-16 189 + beq .Lno_data 190 + 191 + add $len,$len,$inp @ end pointer 192 + sub sp,sp,#32 193 + 194 + #if __ARM_ARCH__<7 195 + ldmia $ctx,{$h0-$r3} @ load context 196 + add $ctx,$ctx,#20 197 + str $len,[sp,#16] @ offload stuff 198 + str $ctx,[sp,#12] 199 + #else 200 + ldr lr,[$ctx,#36] @ is_base2_26 201 + ldmia $ctx!,{$h0-$h4} @ load hash value 202 + str $len,[sp,#16] @ offload stuff 203 + str $ctx,[sp,#12] 204 + 205 + adds $r0,$h0,$h1,lsl#26 @ base 2^26 -> base 2^32 206 + mov $r1,$h1,lsr#6 207 + adcs $r1,$r1,$h2,lsl#20 208 + mov $r2,$h2,lsr#12 209 + adcs $r2,$r2,$h3,lsl#14 210 + mov $r3,$h3,lsr#18 211 + adcs $r3,$r3,$h4,lsl#8 212 + mov $len,#0 213 + teq lr,#0 214 + str $len,[$ctx,#16] @ clear is_base2_26 215 + adc $len,$len,$h4,lsr#24 216 + 217 + itttt ne 218 + movne $h0,$r0 @ choose between radixes 219 + movne $h1,$r1 220 + movne $h2,$r2 221 + movne $h3,$r3 222 + ldmia $ctx,{$r0-$r3} @ load key 223 + it ne 224 + movne $h4,$len 225 + #endif 226 + 227 + mov lr,$inp 228 + cmp $padbit,#0 229 + str $r1,[sp,#20] 230 + str $r2,[sp,#24] 231 + str $r3,[sp,#28] 232 + b .Loop 233 + 234 + .align 4 235 + .Loop: 236 + #if __ARM_ARCH__<7 237 + ldrb r0,[lr],#16 @ load input 238 + # ifdef __thumb2__ 239 + it hi 240 + # endif 241 + addhi $h4,$h4,#1 @ 1<<128 242 + ldrb r1,[lr,#-15] 243 + ldrb r2,[lr,#-14] 244 + ldrb r3,[lr,#-13] 245 + orr r1,r0,r1,lsl#8 246 + ldrb r0,[lr,#-12] 247 + orr r2,r1,r2,lsl#16 248 + ldrb r1,[lr,#-11] 249 + orr r3,r2,r3,lsl#24 250 + ldrb r2,[lr,#-10] 251 + adds $h0,$h0,r3 @ accumulate input 252 + 253 + ldrb r3,[lr,#-9] 254 + orr r1,r0,r1,lsl#8 255 + ldrb r0,[lr,#-8] 256 + orr r2,r1,r2,lsl#16 257 + ldrb r1,[lr,#-7] 258 + orr r3,r2,r3,lsl#24 259 + ldrb r2,[lr,#-6] 260 + adcs $h1,$h1,r3 261 + 262 + ldrb r3,[lr,#-5] 263 + orr r1,r0,r1,lsl#8 264 + ldrb r0,[lr,#-4] 265 + orr r2,r1,r2,lsl#16 266 + ldrb r1,[lr,#-3] 267 + orr r3,r2,r3,lsl#24 268 + ldrb r2,[lr,#-2] 269 + adcs $h2,$h2,r3 270 + 271 + ldrb r3,[lr,#-1] 272 + orr 
r1,r0,r1,lsl#8 273 + str lr,[sp,#8] @ offload input pointer 274 + orr r2,r1,r2,lsl#16 275 + add $s1,$r1,$r1,lsr#2 276 + orr r3,r2,r3,lsl#24 277 + #else 278 + ldr r0,[lr],#16 @ load input 279 + it hi 280 + addhi $h4,$h4,#1 @ padbit 281 + ldr r1,[lr,#-12] 282 + ldr r2,[lr,#-8] 283 + ldr r3,[lr,#-4] 284 + # ifdef __ARMEB__ 285 + rev r0,r0 286 + rev r1,r1 287 + rev r2,r2 288 + rev r3,r3 289 + # endif 290 + adds $h0,$h0,r0 @ accumulate input 291 + str lr,[sp,#8] @ offload input pointer 292 + adcs $h1,$h1,r1 293 + add $s1,$r1,$r1,lsr#2 294 + adcs $h2,$h2,r2 295 + #endif 296 + add $s2,$r2,$r2,lsr#2 297 + adcs $h3,$h3,r3 298 + add $s3,$r3,$r3,lsr#2 299 + 300 + umull r2,r3,$h1,$r0 301 + adc $h4,$h4,#0 302 + umull r0,r1,$h0,$r0 303 + umlal r2,r3,$h4,$s1 304 + umlal r0,r1,$h3,$s1 305 + ldr $r1,[sp,#20] @ reload $r1 306 + umlal r2,r3,$h2,$s3 307 + umlal r0,r1,$h1,$s3 308 + umlal r2,r3,$h3,$s2 309 + umlal r0,r1,$h2,$s2 310 + umlal r2,r3,$h0,$r1 311 + str r0,[sp,#0] @ future $h0 312 + mul r0,$s2,$h4 313 + ldr $r2,[sp,#24] @ reload $r2 314 + adds r2,r2,r1 @ d1+=d0>>32 315 + eor r1,r1,r1 316 + adc lr,r3,#0 @ future $h2 317 + str r2,[sp,#4] @ future $h1 318 + 319 + mul r2,$s3,$h4 320 + eor r3,r3,r3 321 + umlal r0,r1,$h3,$s3 322 + ldr $r3,[sp,#28] @ reload $r3 323 + umlal r2,r3,$h3,$r0 324 + umlal r0,r1,$h2,$r0 325 + umlal r2,r3,$h2,$r1 326 + umlal r0,r1,$h1,$r1 327 + umlal r2,r3,$h1,$r2 328 + umlal r0,r1,$h0,$r2 329 + umlal r2,r3,$h0,$r3 330 + ldr $h0,[sp,#0] 331 + mul $h4,$r0,$h4 332 + ldr $h1,[sp,#4] 333 + 334 + adds $h2,lr,r0 @ d2+=d1>>32 335 + ldr lr,[sp,#8] @ reload input pointer 336 + adc r1,r1,#0 337 + adds $h3,r2,r1 @ d3+=d2>>32 338 + ldr r0,[sp,#16] @ reload end pointer 339 + adc r3,r3,#0 340 + add $h4,$h4,r3 @ h4+=d3>>32 341 + 342 + and r1,$h4,#-4 343 + and $h4,$h4,#3 344 + add r1,r1,r1,lsr#2 @ *=5 345 + adds $h0,$h0,r1 346 + adcs $h1,$h1,#0 347 + adcs $h2,$h2,#0 348 + adcs $h3,$h3,#0 349 + adc $h4,$h4,#0 350 + 351 + cmp r0,lr @ done yet? 352 + bhi .Loop 353 + 354 + ldr $ctx,[sp,#12] 355 + add sp,sp,#32 356 + stmdb $ctx,{$h0-$h4} @ store the result 357 + 358 + .Lno_data: 359 + #if __ARM_ARCH__>=5 360 + ldmia sp!,{r3-r11,pc} 361 + #else 362 + ldmia sp!,{r3-r11,lr} 363 + tst lr,#1 364 + moveq pc,lr @ be binary compatible with V4, yet 365 + bx lr @ interoperable with Thumb ISA:-) 366 + #endif 367 + .size poly1305_blocks,.-poly1305_blocks 368 + ___ 369 + } 370 + { 371 + my ($ctx,$mac,$nonce)=map("r$_",(0..2)); 372 + my ($h0,$h1,$h2,$h3,$h4,$g0,$g1,$g2,$g3)=map("r$_",(3..11)); 373 + my $g4=$ctx; 374 + 375 + $code.=<<___; 376 + .type poly1305_emit,%function 377 + .align 5 378 + poly1305_emit: 379 + .Lpoly1305_emit: 380 + stmdb sp!,{r4-r11} 381 + 382 + ldmia $ctx,{$h0-$h4} 383 + 384 + #if __ARM_ARCH__>=7 385 + ldr ip,[$ctx,#36] @ is_base2_26 386 + 387 + adds $g0,$h0,$h1,lsl#26 @ base 2^26 -> base 2^32 388 + mov $g1,$h1,lsr#6 389 + adcs $g1,$g1,$h2,lsl#20 390 + mov $g2,$h2,lsr#12 391 + adcs $g2,$g2,$h3,lsl#14 392 + mov $g3,$h3,lsr#18 393 + adcs $g3,$g3,$h4,lsl#8 394 + mov $g4,#0 395 + adc $g4,$g4,$h4,lsr#24 396 + 397 + tst ip,ip 398 + itttt ne 399 + movne $h0,$g0 400 + movne $h1,$g1 401 + movne $h2,$g2 402 + movne $h3,$g3 403 + it ne 404 + movne $h4,$g4 405 + #endif 406 + 407 + adds $g0,$h0,#5 @ compare to modulus 408 + adcs $g1,$h1,#0 409 + adcs $g2,$h2,#0 410 + adcs $g3,$h3,#0 411 + adc $g4,$h4,#0 412 + tst $g4,#4 @ did it carry/borrow? 
413 + 414 + #ifdef __thumb2__ 415 + it ne 416 + #endif 417 + movne $h0,$g0 418 + ldr $g0,[$nonce,#0] 419 + #ifdef __thumb2__ 420 + it ne 421 + #endif 422 + movne $h1,$g1 423 + ldr $g1,[$nonce,#4] 424 + #ifdef __thumb2__ 425 + it ne 426 + #endif 427 + movne $h2,$g2 428 + ldr $g2,[$nonce,#8] 429 + #ifdef __thumb2__ 430 + it ne 431 + #endif 432 + movne $h3,$g3 433 + ldr $g3,[$nonce,#12] 434 + 435 + adds $h0,$h0,$g0 436 + adcs $h1,$h1,$g1 437 + adcs $h2,$h2,$g2 438 + adc $h3,$h3,$g3 439 + 440 + #if __ARM_ARCH__>=7 441 + # ifdef __ARMEB__ 442 + rev $h0,$h0 443 + rev $h1,$h1 444 + rev $h2,$h2 445 + rev $h3,$h3 446 + # endif 447 + str $h0,[$mac,#0] 448 + str $h1,[$mac,#4] 449 + str $h2,[$mac,#8] 450 + str $h3,[$mac,#12] 451 + #else 452 + strb $h0,[$mac,#0] 453 + mov $h0,$h0,lsr#8 454 + strb $h1,[$mac,#4] 455 + mov $h1,$h1,lsr#8 456 + strb $h2,[$mac,#8] 457 + mov $h2,$h2,lsr#8 458 + strb $h3,[$mac,#12] 459 + mov $h3,$h3,lsr#8 460 + 461 + strb $h0,[$mac,#1] 462 + mov $h0,$h0,lsr#8 463 + strb $h1,[$mac,#5] 464 + mov $h1,$h1,lsr#8 465 + strb $h2,[$mac,#9] 466 + mov $h2,$h2,lsr#8 467 + strb $h3,[$mac,#13] 468 + mov $h3,$h3,lsr#8 469 + 470 + strb $h0,[$mac,#2] 471 + mov $h0,$h0,lsr#8 472 + strb $h1,[$mac,#6] 473 + mov $h1,$h1,lsr#8 474 + strb $h2,[$mac,#10] 475 + mov $h2,$h2,lsr#8 476 + strb $h3,[$mac,#14] 477 + mov $h3,$h3,lsr#8 478 + 479 + strb $h0,[$mac,#3] 480 + strb $h1,[$mac,#7] 481 + strb $h2,[$mac,#11] 482 + strb $h3,[$mac,#15] 483 + #endif 484 + ldmia sp!,{r4-r11} 485 + #if __ARM_ARCH__>=5 486 + ret @ bx lr 487 + #else 488 + tst lr,#1 489 + moveq pc,lr @ be binary compatible with V4, yet 490 + bx lr @ interoperable with Thumb ISA:-) 491 + #endif 492 + .size poly1305_emit,.-poly1305_emit 493 + ___ 494 + { 495 + my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("d$_",(0..9)); 496 + my ($D0,$D1,$D2,$D3,$D4, $H0,$H1,$H2,$H3,$H4) = map("q$_",(5..14)); 497 + my ($T0,$T1,$MASK) = map("q$_",(15,4,0)); 498 + 499 + my ($in2,$zeros,$tbl0,$tbl1) = map("r$_",(4..7)); 500 + 501 + $code.=<<___; 502 + #if __ARM_MAX_ARCH__>=7 503 + .fpu neon 504 + 505 + .type poly1305_init_neon,%function 506 + .align 5 507 + poly1305_init_neon: 508 + .Lpoly1305_init_neon: 509 + ldr r3,[$ctx,#48] @ first table element 510 + cmp r3,#-1 @ is value impossible? 
511 + bne .Lno_init_neon 512 + 513 + ldr r4,[$ctx,#20] @ load key base 2^32 514 + ldr r5,[$ctx,#24] 515 + ldr r6,[$ctx,#28] 516 + ldr r7,[$ctx,#32] 517 + 518 + and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26 519 + mov r3,r4,lsr#26 520 + mov r4,r5,lsr#20 521 + orr r3,r3,r5,lsl#6 522 + mov r5,r6,lsr#14 523 + orr r4,r4,r6,lsl#12 524 + mov r6,r7,lsr#8 525 + orr r5,r5,r7,lsl#18 526 + and r3,r3,#0x03ffffff 527 + and r4,r4,#0x03ffffff 528 + and r5,r5,#0x03ffffff 529 + 530 + vdup.32 $R0,r2 @ r^1 in both lanes 531 + add r2,r3,r3,lsl#2 @ *5 532 + vdup.32 $R1,r3 533 + add r3,r4,r4,lsl#2 534 + vdup.32 $S1,r2 535 + vdup.32 $R2,r4 536 + add r4,r5,r5,lsl#2 537 + vdup.32 $S2,r3 538 + vdup.32 $R3,r5 539 + add r5,r6,r6,lsl#2 540 + vdup.32 $S3,r4 541 + vdup.32 $R4,r6 542 + vdup.32 $S4,r5 543 + 544 + mov $zeros,#2 @ counter 545 + 546 + .Lsquare_neon: 547 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 548 + @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 549 + @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 550 + @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 551 + @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 552 + @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 553 + 554 + vmull.u32 $D0,$R0,${R0}[1] 555 + vmull.u32 $D1,$R1,${R0}[1] 556 + vmull.u32 $D2,$R2,${R0}[1] 557 + vmull.u32 $D3,$R3,${R0}[1] 558 + vmull.u32 $D4,$R4,${R0}[1] 559 + 560 + vmlal.u32 $D0,$R4,${S1}[1] 561 + vmlal.u32 $D1,$R0,${R1}[1] 562 + vmlal.u32 $D2,$R1,${R1}[1] 563 + vmlal.u32 $D3,$R2,${R1}[1] 564 + vmlal.u32 $D4,$R3,${R1}[1] 565 + 566 + vmlal.u32 $D0,$R3,${S2}[1] 567 + vmlal.u32 $D1,$R4,${S2}[1] 568 + vmlal.u32 $D3,$R1,${R2}[1] 569 + vmlal.u32 $D2,$R0,${R2}[1] 570 + vmlal.u32 $D4,$R2,${R2}[1] 571 + 572 + vmlal.u32 $D0,$R2,${S3}[1] 573 + vmlal.u32 $D3,$R0,${R3}[1] 574 + vmlal.u32 $D1,$R3,${S3}[1] 575 + vmlal.u32 $D2,$R4,${S3}[1] 576 + vmlal.u32 $D4,$R1,${R3}[1] 577 + 578 + vmlal.u32 $D3,$R4,${S4}[1] 579 + vmlal.u32 $D0,$R1,${S4}[1] 580 + vmlal.u32 $D1,$R2,${S4}[1] 581 + vmlal.u32 $D2,$R3,${S4}[1] 582 + vmlal.u32 $D4,$R0,${R4}[1] 583 + 584 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 585 + @ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein 586 + @ and P. Schwabe 587 + @ 588 + @ H0>>+H1>>+H2>>+H3>>+H4 589 + @ H3>>+H4>>*5+H0>>+H1 590 + @ 591 + @ Trivia. 592 + @ 593 + @ Result of multiplication of n-bit number by m-bit number is 594 + @ n+m bits wide. However! Even though 2^n is a n+1-bit number, 595 + @ m-bit number multiplied by 2^n is still n+m bits wide. 596 + @ 597 + @ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2, 598 + @ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit 599 + @ one is n+1 bits wide. 600 + @ 601 + @ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that 602 + @ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4 603 + @ can be 27. However! In cases when their width exceeds 26 bits 604 + @ they are limited by 2^26+2^6. This in turn means that *sum* 605 + @ of the products with these values can still be viewed as sum 606 + @ of 52-bit numbers as long as the amount of addends is not a 607 + @ power of 2. For example, 608 + @ 609 + @ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4, 610 + @ 611 + @ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or 612 + @ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than 613 + @ 8 * (2^52) or 2^55. However, the value is then multiplied by 614 + @ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12), 615 + @ which is less than 32 * (2^52) or 2^57. 
And when processing 616 + @ data we are looking at triple as many addends... 617 + @ 618 + @ In key setup procedure pre-reduced H0 is limited by 5*4+1 and 619 + @ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the 620 + @ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while 621 + @ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32 622 + @ instruction accepts 2x32-bit input and writes 2x64-bit result. 623 + @ This means that result of reduction have to be compressed upon 624 + @ loop wrap-around. This can be done in the process of reduction 625 + @ to minimize amount of instructions [as well as amount of 626 + @ 128-bit instructions, which benefits low-end processors], but 627 + @ one has to watch for H2 (which is narrower than H0) and 5*H4 628 + @ not being wider than 58 bits, so that result of right shift 629 + @ by 26 bits fits in 32 bits. This is also useful on x86, 630 + @ because it allows to use paddd in place for paddq, which 631 + @ benefits Atom, where paddq is ridiculously slow. 632 + 633 + vshr.u64 $T0,$D3,#26 634 + vmovn.i64 $D3#lo,$D3 635 + vshr.u64 $T1,$D0,#26 636 + vmovn.i64 $D0#lo,$D0 637 + vadd.i64 $D4,$D4,$T0 @ h3 -> h4 638 + vbic.i32 $D3#lo,#0xfc000000 @ &=0x03ffffff 639 + vadd.i64 $D1,$D1,$T1 @ h0 -> h1 640 + vbic.i32 $D0#lo,#0xfc000000 641 + 642 + vshrn.u64 $T0#lo,$D4,#26 643 + vmovn.i64 $D4#lo,$D4 644 + vshr.u64 $T1,$D1,#26 645 + vmovn.i64 $D1#lo,$D1 646 + vadd.i64 $D2,$D2,$T1 @ h1 -> h2 647 + vbic.i32 $D4#lo,#0xfc000000 648 + vbic.i32 $D1#lo,#0xfc000000 649 + 650 + vadd.i32 $D0#lo,$D0#lo,$T0#lo 651 + vshl.u32 $T0#lo,$T0#lo,#2 652 + vshrn.u64 $T1#lo,$D2,#26 653 + vmovn.i64 $D2#lo,$D2 654 + vadd.i32 $D0#lo,$D0#lo,$T0#lo @ h4 -> h0 655 + vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3 656 + vbic.i32 $D2#lo,#0xfc000000 657 + 658 + vshr.u32 $T0#lo,$D0#lo,#26 659 + vbic.i32 $D0#lo,#0xfc000000 660 + vshr.u32 $T1#lo,$D3#lo,#26 661 + vbic.i32 $D3#lo,#0xfc000000 662 + vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1 663 + vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4 664 + 665 + subs $zeros,$zeros,#1 666 + beq .Lsquare_break_neon 667 + 668 + add $tbl0,$ctx,#(48+0*9*4) 669 + add $tbl1,$ctx,#(48+1*9*4) 670 + 671 + vtrn.32 $R0,$D0#lo @ r^2:r^1 672 + vtrn.32 $R2,$D2#lo 673 + vtrn.32 $R3,$D3#lo 674 + vtrn.32 $R1,$D1#lo 675 + vtrn.32 $R4,$D4#lo 676 + 677 + vshl.u32 $S2,$R2,#2 @ *5 678 + vshl.u32 $S3,$R3,#2 679 + vshl.u32 $S1,$R1,#2 680 + vshl.u32 $S4,$R4,#2 681 + vadd.i32 $S2,$S2,$R2 682 + vadd.i32 $S1,$S1,$R1 683 + vadd.i32 $S3,$S3,$R3 684 + vadd.i32 $S4,$S4,$R4 685 + 686 + vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! 687 + vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! 688 + vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! 689 + vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! 690 + vst1.32 {${S4}[0]},[$tbl0,:32] 691 + vst1.32 {${S4}[1]},[$tbl1,:32] 692 + 693 + b .Lsquare_neon 694 + 695 + .align 4 696 + .Lsquare_break_neon: 697 + add $tbl0,$ctx,#(48+2*4*9) 698 + add $tbl1,$ctx,#(48+3*4*9) 699 + 700 + vmov $R0,$D0#lo @ r^4:r^3 701 + vshl.u32 $S1,$D1#lo,#2 @ *5 702 + vmov $R1,$D1#lo 703 + vshl.u32 $S2,$D2#lo,#2 704 + vmov $R2,$D2#lo 705 + vshl.u32 $S3,$D3#lo,#2 706 + vmov $R3,$D3#lo 707 + vshl.u32 $S4,$D4#lo,#2 708 + vmov $R4,$D4#lo 709 + vadd.i32 $S1,$S1,$D1#lo 710 + vadd.i32 $S2,$S2,$D2#lo 711 + vadd.i32 $S3,$S3,$D3#lo 712 + vadd.i32 $S4,$S4,$D4#lo 713 + 714 + vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! 715 + vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! 716 + vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! 
717 + vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! 718 + vst1.32 {${S4}[0]},[$tbl0] 719 + vst1.32 {${S4}[1]},[$tbl1] 720 + 721 + .Lno_init_neon: 722 + ret @ bx lr 723 + .size poly1305_init_neon,.-poly1305_init_neon 724 + 725 + .type poly1305_blocks_neon,%function 726 + .align 5 727 + poly1305_blocks_neon: 728 + .Lpoly1305_blocks_neon: 729 + ldr ip,[$ctx,#36] @ is_base2_26 730 + 731 + cmp $len,#64 732 + blo .Lpoly1305_blocks 733 + 734 + stmdb sp!,{r4-r7} 735 + vstmdb sp!,{d8-d15} @ ABI specification says so 736 + 737 + tst ip,ip @ is_base2_26? 738 + bne .Lbase2_26_neon 739 + 740 + stmdb sp!,{r1-r3,lr} 741 + bl .Lpoly1305_init_neon 742 + 743 + ldr r4,[$ctx,#0] @ load hash value base 2^32 744 + ldr r5,[$ctx,#4] 745 + ldr r6,[$ctx,#8] 746 + ldr r7,[$ctx,#12] 747 + ldr ip,[$ctx,#16] 748 + 749 + and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26 750 + mov r3,r4,lsr#26 751 + veor $D0#lo,$D0#lo,$D0#lo 752 + mov r4,r5,lsr#20 753 + orr r3,r3,r5,lsl#6 754 + veor $D1#lo,$D1#lo,$D1#lo 755 + mov r5,r6,lsr#14 756 + orr r4,r4,r6,lsl#12 757 + veor $D2#lo,$D2#lo,$D2#lo 758 + mov r6,r7,lsr#8 759 + orr r5,r5,r7,lsl#18 760 + veor $D3#lo,$D3#lo,$D3#lo 761 + and r3,r3,#0x03ffffff 762 + orr r6,r6,ip,lsl#24 763 + veor $D4#lo,$D4#lo,$D4#lo 764 + and r4,r4,#0x03ffffff 765 + mov r1,#1 766 + and r5,r5,#0x03ffffff 767 + str r1,[$ctx,#36] @ set is_base2_26 768 + 769 + vmov.32 $D0#lo[0],r2 770 + vmov.32 $D1#lo[0],r3 771 + vmov.32 $D2#lo[0],r4 772 + vmov.32 $D3#lo[0],r5 773 + vmov.32 $D4#lo[0],r6 774 + adr $zeros,.Lzeros 775 + 776 + ldmia sp!,{r1-r3,lr} 777 + b .Lhash_loaded 778 + 779 + .align 4 780 + .Lbase2_26_neon: 781 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 782 + @ load hash value 783 + 784 + veor $D0#lo,$D0#lo,$D0#lo 785 + veor $D1#lo,$D1#lo,$D1#lo 786 + veor $D2#lo,$D2#lo,$D2#lo 787 + veor $D3#lo,$D3#lo,$D3#lo 788 + veor $D4#lo,$D4#lo,$D4#lo 789 + vld4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]! 790 + adr $zeros,.Lzeros 791 + vld1.32 {$D4#lo[0]},[$ctx] 792 + sub $ctx,$ctx,#16 @ rewind 793 + 794 + .Lhash_loaded: 795 + add $in2,$inp,#32 796 + mov $padbit,$padbit,lsl#24 797 + tst $len,#31 798 + beq .Leven 799 + 800 + vld4.32 {$H0#lo[0],$H1#lo[0],$H2#lo[0],$H3#lo[0]},[$inp]! 
801 + vmov.32 $H4#lo[0],$padbit 802 + sub $len,$len,#16 803 + add $in2,$inp,#32 804 + 805 + # ifdef __ARMEB__ 806 + vrev32.8 $H0,$H0 807 + vrev32.8 $H3,$H3 808 + vrev32.8 $H1,$H1 809 + vrev32.8 $H2,$H2 810 + # endif 811 + vsri.u32 $H4#lo,$H3#lo,#8 @ base 2^32 -> base 2^26 812 + vshl.u32 $H3#lo,$H3#lo,#18 813 + 814 + vsri.u32 $H3#lo,$H2#lo,#14 815 + vshl.u32 $H2#lo,$H2#lo,#12 816 + vadd.i32 $H4#hi,$H4#lo,$D4#lo @ add hash value and move to #hi 817 + 818 + vbic.i32 $H3#lo,#0xfc000000 819 + vsri.u32 $H2#lo,$H1#lo,#20 820 + vshl.u32 $H1#lo,$H1#lo,#6 821 + 822 + vbic.i32 $H2#lo,#0xfc000000 823 + vsri.u32 $H1#lo,$H0#lo,#26 824 + vadd.i32 $H3#hi,$H3#lo,$D3#lo 825 + 826 + vbic.i32 $H0#lo,#0xfc000000 827 + vbic.i32 $H1#lo,#0xfc000000 828 + vadd.i32 $H2#hi,$H2#lo,$D2#lo 829 + 830 + vadd.i32 $H0#hi,$H0#lo,$D0#lo 831 + vadd.i32 $H1#hi,$H1#lo,$D1#lo 832 + 833 + mov $tbl1,$zeros 834 + add $tbl0,$ctx,#48 835 + 836 + cmp $len,$len 837 + b .Long_tail 838 + 839 + .align 4 840 + .Leven: 841 + subs $len,$len,#64 842 + it lo 843 + movlo $in2,$zeros 844 + 845 + vmov.i32 $H4,#1<<24 @ padbit, yes, always 846 + vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1] 847 + add $inp,$inp,#64 848 + vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0) 849 + add $in2,$in2,#64 850 + itt hi 851 + addhi $tbl1,$ctx,#(48+1*9*4) 852 + addhi $tbl0,$ctx,#(48+3*9*4) 853 + 854 + # ifdef __ARMEB__ 855 + vrev32.8 $H0,$H0 856 + vrev32.8 $H3,$H3 857 + vrev32.8 $H1,$H1 858 + vrev32.8 $H2,$H2 859 + # endif 860 + vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26 861 + vshl.u32 $H3,$H3,#18 862 + 863 + vsri.u32 $H3,$H2,#14 864 + vshl.u32 $H2,$H2,#12 865 + 866 + vbic.i32 $H3,#0xfc000000 867 + vsri.u32 $H2,$H1,#20 868 + vshl.u32 $H1,$H1,#6 869 + 870 + vbic.i32 $H2,#0xfc000000 871 + vsri.u32 $H1,$H0,#26 872 + 873 + vbic.i32 $H0,#0xfc000000 874 + vbic.i32 $H1,#0xfc000000 875 + 876 + bls .Lskip_loop 877 + 878 + vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^2 879 + vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4 880 + vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! 881 + vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! 882 + b .Loop_neon 883 + 884 + .align 5 885 + .Loop_neon: 886 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 887 + @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 888 + @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r 889 + @ \___________________/ 890 + @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 891 + @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r 892 + @ \___________________/ \____________________/ 893 + @ 894 + @ Note that we start with inp[2:3]*r^2. This is because it 895 + @ doesn't depend on reduction in previous iteration. 
896 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 897 + @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 898 + @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 899 + @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 900 + @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 901 + @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 902 + 903 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 904 + @ inp[2:3]*r^2 905 + 906 + vadd.i32 $H2#lo,$H2#lo,$D2#lo @ accumulate inp[0:1] 907 + vmull.u32 $D2,$H2#hi,${R0}[1] 908 + vadd.i32 $H0#lo,$H0#lo,$D0#lo 909 + vmull.u32 $D0,$H0#hi,${R0}[1] 910 + vadd.i32 $H3#lo,$H3#lo,$D3#lo 911 + vmull.u32 $D3,$H3#hi,${R0}[1] 912 + vmlal.u32 $D2,$H1#hi,${R1}[1] 913 + vadd.i32 $H1#lo,$H1#lo,$D1#lo 914 + vmull.u32 $D1,$H1#hi,${R0}[1] 915 + 916 + vadd.i32 $H4#lo,$H4#lo,$D4#lo 917 + vmull.u32 $D4,$H4#hi,${R0}[1] 918 + subs $len,$len,#64 919 + vmlal.u32 $D0,$H4#hi,${S1}[1] 920 + it lo 921 + movlo $in2,$zeros 922 + vmlal.u32 $D3,$H2#hi,${R1}[1] 923 + vld1.32 ${S4}[1],[$tbl1,:32] 924 + vmlal.u32 $D1,$H0#hi,${R1}[1] 925 + vmlal.u32 $D4,$H3#hi,${R1}[1] 926 + 927 + vmlal.u32 $D0,$H3#hi,${S2}[1] 928 + vmlal.u32 $D3,$H1#hi,${R2}[1] 929 + vmlal.u32 $D4,$H2#hi,${R2}[1] 930 + vmlal.u32 $D1,$H4#hi,${S2}[1] 931 + vmlal.u32 $D2,$H0#hi,${R2}[1] 932 + 933 + vmlal.u32 $D3,$H0#hi,${R3}[1] 934 + vmlal.u32 $D0,$H2#hi,${S3}[1] 935 + vmlal.u32 $D4,$H1#hi,${R3}[1] 936 + vmlal.u32 $D1,$H3#hi,${S3}[1] 937 + vmlal.u32 $D2,$H4#hi,${S3}[1] 938 + 939 + vmlal.u32 $D3,$H4#hi,${S4}[1] 940 + vmlal.u32 $D0,$H1#hi,${S4}[1] 941 + vmlal.u32 $D4,$H0#hi,${R4}[1] 942 + vmlal.u32 $D1,$H2#hi,${S4}[1] 943 + vmlal.u32 $D2,$H3#hi,${S4}[1] 944 + 945 + vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0) 946 + add $in2,$in2,#64 947 + 948 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 949 + @ (hash+inp[0:1])*r^4 and accumulate 950 + 951 + vmlal.u32 $D3,$H3#lo,${R0}[0] 952 + vmlal.u32 $D0,$H0#lo,${R0}[0] 953 + vmlal.u32 $D4,$H4#lo,${R0}[0] 954 + vmlal.u32 $D1,$H1#lo,${R0}[0] 955 + vmlal.u32 $D2,$H2#lo,${R0}[0] 956 + vld1.32 ${S4}[0],[$tbl0,:32] 957 + 958 + vmlal.u32 $D3,$H2#lo,${R1}[0] 959 + vmlal.u32 $D0,$H4#lo,${S1}[0] 960 + vmlal.u32 $D4,$H3#lo,${R1}[0] 961 + vmlal.u32 $D1,$H0#lo,${R1}[0] 962 + vmlal.u32 $D2,$H1#lo,${R1}[0] 963 + 964 + vmlal.u32 $D3,$H1#lo,${R2}[0] 965 + vmlal.u32 $D0,$H3#lo,${S2}[0] 966 + vmlal.u32 $D4,$H2#lo,${R2}[0] 967 + vmlal.u32 $D1,$H4#lo,${S2}[0] 968 + vmlal.u32 $D2,$H0#lo,${R2}[0] 969 + 970 + vmlal.u32 $D3,$H0#lo,${R3}[0] 971 + vmlal.u32 $D0,$H2#lo,${S3}[0] 972 + vmlal.u32 $D4,$H1#lo,${R3}[0] 973 + vmlal.u32 $D1,$H3#lo,${S3}[0] 974 + vmlal.u32 $D3,$H4#lo,${S4}[0] 975 + 976 + vmlal.u32 $D2,$H4#lo,${S3}[0] 977 + vmlal.u32 $D0,$H1#lo,${S4}[0] 978 + vmlal.u32 $D4,$H0#lo,${R4}[0] 979 + vmov.i32 $H4,#1<<24 @ padbit, yes, always 980 + vmlal.u32 $D1,$H2#lo,${S4}[0] 981 + vmlal.u32 $D2,$H3#lo,${S4}[0] 982 + 983 + vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1] 984 + add $inp,$inp,#64 985 + # ifdef __ARMEB__ 986 + vrev32.8 $H0,$H0 987 + vrev32.8 $H1,$H1 988 + vrev32.8 $H2,$H2 989 + vrev32.8 $H3,$H3 990 + # endif 991 + 992 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 993 + @ lazy reduction interleaved with base 2^32 -> base 2^26 of 994 + @ inp[0:3] previously loaded to $H0-$H3 and smashed to $H0-$H4. 
995 + 996 + vshr.u64 $T0,$D3,#26 997 + vmovn.i64 $D3#lo,$D3 998 + vshr.u64 $T1,$D0,#26 999 + vmovn.i64 $D0#lo,$D0 1000 + vadd.i64 $D4,$D4,$T0 @ h3 -> h4 1001 + vbic.i32 $D3#lo,#0xfc000000 1002 + vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26 1003 + vadd.i64 $D1,$D1,$T1 @ h0 -> h1 1004 + vshl.u32 $H3,$H3,#18 1005 + vbic.i32 $D0#lo,#0xfc000000 1006 + 1007 + vshrn.u64 $T0#lo,$D4,#26 1008 + vmovn.i64 $D4#lo,$D4 1009 + vshr.u64 $T1,$D1,#26 1010 + vmovn.i64 $D1#lo,$D1 1011 + vadd.i64 $D2,$D2,$T1 @ h1 -> h2 1012 + vsri.u32 $H3,$H2,#14 1013 + vbic.i32 $D4#lo,#0xfc000000 1014 + vshl.u32 $H2,$H2,#12 1015 + vbic.i32 $D1#lo,#0xfc000000 1016 + 1017 + vadd.i32 $D0#lo,$D0#lo,$T0#lo 1018 + vshl.u32 $T0#lo,$T0#lo,#2 1019 + vbic.i32 $H3,#0xfc000000 1020 + vshrn.u64 $T1#lo,$D2,#26 1021 + vmovn.i64 $D2#lo,$D2 1022 + vaddl.u32 $D0,$D0#lo,$T0#lo @ h4 -> h0 [widen for a sec] 1023 + vsri.u32 $H2,$H1,#20 1024 + vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3 1025 + vshl.u32 $H1,$H1,#6 1026 + vbic.i32 $D2#lo,#0xfc000000 1027 + vbic.i32 $H2,#0xfc000000 1028 + 1029 + vshrn.u64 $T0#lo,$D0,#26 @ re-narrow 1030 + vmovn.i64 $D0#lo,$D0 1031 + vsri.u32 $H1,$H0,#26 1032 + vbic.i32 $H0,#0xfc000000 1033 + vshr.u32 $T1#lo,$D3#lo,#26 1034 + vbic.i32 $D3#lo,#0xfc000000 1035 + vbic.i32 $D0#lo,#0xfc000000 1036 + vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1 1037 + vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4 1038 + vbic.i32 $H1,#0xfc000000 1039 + 1040 + bhi .Loop_neon 1041 + 1042 + .Lskip_loop: 1043 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 1044 + @ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 1045 + 1046 + add $tbl1,$ctx,#(48+0*9*4) 1047 + add $tbl0,$ctx,#(48+1*9*4) 1048 + adds $len,$len,#32 1049 + it ne 1050 + movne $len,#0 1051 + bne .Long_tail 1052 + 1053 + vadd.i32 $H2#hi,$H2#lo,$D2#lo @ add hash value and move to #hi 1054 + vadd.i32 $H0#hi,$H0#lo,$D0#lo 1055 + vadd.i32 $H3#hi,$H3#lo,$D3#lo 1056 + vadd.i32 $H1#hi,$H1#lo,$D1#lo 1057 + vadd.i32 $H4#hi,$H4#lo,$D4#lo 1058 + 1059 + .Long_tail: 1060 + vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^1 1061 + vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^2 1062 + 1063 + vadd.i32 $H2#lo,$H2#lo,$D2#lo @ can be redundant 1064 + vmull.u32 $D2,$H2#hi,$R0 1065 + vadd.i32 $H0#lo,$H0#lo,$D0#lo 1066 + vmull.u32 $D0,$H0#hi,$R0 1067 + vadd.i32 $H3#lo,$H3#lo,$D3#lo 1068 + vmull.u32 $D3,$H3#hi,$R0 1069 + vadd.i32 $H1#lo,$H1#lo,$D1#lo 1070 + vmull.u32 $D1,$H1#hi,$R0 1071 + vadd.i32 $H4#lo,$H4#lo,$D4#lo 1072 + vmull.u32 $D4,$H4#hi,$R0 1073 + 1074 + vmlal.u32 $D0,$H4#hi,$S1 1075 + vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! 1076 + vmlal.u32 $D3,$H2#hi,$R1 1077 + vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! 
1078 + vmlal.u32 $D1,$H0#hi,$R1 1079 + vmlal.u32 $D4,$H3#hi,$R1 1080 + vmlal.u32 $D2,$H1#hi,$R1 1081 + 1082 + vmlal.u32 $D3,$H1#hi,$R2 1083 + vld1.32 ${S4}[1],[$tbl1,:32] 1084 + vmlal.u32 $D0,$H3#hi,$S2 1085 + vld1.32 ${S4}[0],[$tbl0,:32] 1086 + vmlal.u32 $D4,$H2#hi,$R2 1087 + vmlal.u32 $D1,$H4#hi,$S2 1088 + vmlal.u32 $D2,$H0#hi,$R2 1089 + 1090 + vmlal.u32 $D3,$H0#hi,$R3 1091 + it ne 1092 + addne $tbl1,$ctx,#(48+2*9*4) 1093 + vmlal.u32 $D0,$H2#hi,$S3 1094 + it ne 1095 + addne $tbl0,$ctx,#(48+3*9*4) 1096 + vmlal.u32 $D4,$H1#hi,$R3 1097 + vmlal.u32 $D1,$H3#hi,$S3 1098 + vmlal.u32 $D2,$H4#hi,$S3 1099 + 1100 + vmlal.u32 $D3,$H4#hi,$S4 1101 + vorn $MASK,$MASK,$MASK @ all-ones, can be redundant 1102 + vmlal.u32 $D0,$H1#hi,$S4 1103 + vshr.u64 $MASK,$MASK,#38 1104 + vmlal.u32 $D4,$H0#hi,$R4 1105 + vmlal.u32 $D1,$H2#hi,$S4 1106 + vmlal.u32 $D2,$H3#hi,$S4 1107 + 1108 + beq .Lshort_tail 1109 + 1110 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 1111 + @ (hash+inp[0:1])*r^4:r^3 and accumulate 1112 + 1113 + vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^3 1114 + vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4 1115 + 1116 + vmlal.u32 $D2,$H2#lo,$R0 1117 + vmlal.u32 $D0,$H0#lo,$R0 1118 + vmlal.u32 $D3,$H3#lo,$R0 1119 + vmlal.u32 $D1,$H1#lo,$R0 1120 + vmlal.u32 $D4,$H4#lo,$R0 1121 + 1122 + vmlal.u32 $D0,$H4#lo,$S1 1123 + vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! 1124 + vmlal.u32 $D3,$H2#lo,$R1 1125 + vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! 1126 + vmlal.u32 $D1,$H0#lo,$R1 1127 + vmlal.u32 $D4,$H3#lo,$R1 1128 + vmlal.u32 $D2,$H1#lo,$R1 1129 + 1130 + vmlal.u32 $D3,$H1#lo,$R2 1131 + vld1.32 ${S4}[1],[$tbl1,:32] 1132 + vmlal.u32 $D0,$H3#lo,$S2 1133 + vld1.32 ${S4}[0],[$tbl0,:32] 1134 + vmlal.u32 $D4,$H2#lo,$R2 1135 + vmlal.u32 $D1,$H4#lo,$S2 1136 + vmlal.u32 $D2,$H0#lo,$R2 1137 + 1138 + vmlal.u32 $D3,$H0#lo,$R3 1139 + vmlal.u32 $D0,$H2#lo,$S3 1140 + vmlal.u32 $D4,$H1#lo,$R3 1141 + vmlal.u32 $D1,$H3#lo,$S3 1142 + vmlal.u32 $D2,$H4#lo,$S3 1143 + 1144 + vmlal.u32 $D3,$H4#lo,$S4 1145 + vorn $MASK,$MASK,$MASK @ all-ones 1146 + vmlal.u32 $D0,$H1#lo,$S4 1147 + vshr.u64 $MASK,$MASK,#38 1148 + vmlal.u32 $D4,$H0#lo,$R4 1149 + vmlal.u32 $D1,$H2#lo,$S4 1150 + vmlal.u32 $D2,$H3#lo,$S4 1151 + 1152 + .Lshort_tail: 1153 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 1154 + @ horizontal addition 1155 + 1156 + vadd.i64 $D3#lo,$D3#lo,$D3#hi 1157 + vadd.i64 $D0#lo,$D0#lo,$D0#hi 1158 + vadd.i64 $D4#lo,$D4#lo,$D4#hi 1159 + vadd.i64 $D1#lo,$D1#lo,$D1#hi 1160 + vadd.i64 $D2#lo,$D2#lo,$D2#hi 1161 + 1162 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 1163 + @ lazy reduction, but without narrowing 1164 + 1165 + vshr.u64 $T0,$D3,#26 1166 + vand.i64 $D3,$D3,$MASK 1167 + vshr.u64 $T1,$D0,#26 1168 + vand.i64 $D0,$D0,$MASK 1169 + vadd.i64 $D4,$D4,$T0 @ h3 -> h4 1170 + vadd.i64 $D1,$D1,$T1 @ h0 -> h1 1171 + 1172 + vshr.u64 $T0,$D4,#26 1173 + vand.i64 $D4,$D4,$MASK 1174 + vshr.u64 $T1,$D1,#26 1175 + vand.i64 $D1,$D1,$MASK 1176 + vadd.i64 $D2,$D2,$T1 @ h1 -> h2 1177 + 1178 + vadd.i64 $D0,$D0,$T0 1179 + vshl.u64 $T0,$T0,#2 1180 + vshr.u64 $T1,$D2,#26 1181 + vand.i64 $D2,$D2,$MASK 1182 + vadd.i64 $D0,$D0,$T0 @ h4 -> h0 1183 + vadd.i64 $D3,$D3,$T1 @ h2 -> h3 1184 + 1185 + vshr.u64 $T0,$D0,#26 1186 + vand.i64 $D0,$D0,$MASK 1187 + vshr.u64 $T1,$D3,#26 1188 + vand.i64 $D3,$D3,$MASK 1189 + vadd.i64 $D1,$D1,$T0 @ h0 -> h1 1190 + vadd.i64 $D4,$D4,$T1 @ h3 -> h4 1191 + 1192 + cmp $len,#0 1193 + bne .Leven 1194 + 1195 + 
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 1196 + @ store hash value 1197 + 1198 + vst4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]! 1199 + vst1.32 {$D4#lo[0]},[$ctx] 1200 + 1201 + vldmia sp!,{d8-d15} @ epilogue 1202 + ldmia sp!,{r4-r7} 1203 + ret @ bx lr 1204 + .size poly1305_blocks_neon,.-poly1305_blocks_neon 1205 + 1206 + .align 5 1207 + .Lzeros: 1208 + .long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 1209 + #ifndef __KERNEL__ 1210 + .LOPENSSL_armcap: 1211 + # ifdef _WIN32 1212 + .word OPENSSL_armcap_P 1213 + # else 1214 + .word OPENSSL_armcap_P-.Lpoly1305_init 1215 + # endif 1216 + .comm OPENSSL_armcap_P,4,4 1217 + .hidden OPENSSL_armcap_P 1218 + #endif 1219 + #endif 1220 + ___ 1221 + } } 1222 + $code.=<<___; 1223 + .asciz "Poly1305 for ARMv4/NEON, CRYPTOGAMS by \@dot-asm" 1224 + .align 2 1225 + ___ 1226 + 1227 + foreach (split("\n",$code)) { 1228 + s/\`([^\`]*)\`/eval $1/geo; 1229 + 1230 + s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or 1231 + s/\bret\b/bx lr/go or 1232 + s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4 1233 + 1234 + print $_,"\n"; 1235 + } 1236 + close STDOUT; # enforce flush
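To make the perlasm above easier to follow, here is a plain C illustration (not part of the patch) of the two radix conversions it performs -- the scalar code keeps the 130-bit accumulator as four 32-bit words plus a few spare bits, the NEON code as five 26-bit limbs -- and of the lazy carry chain described in the long comment block, with the bits above 2^130 folded back multiplied by 5, since 2^130 ≡ 5 (mod 2^130 - 5). The exact interleaving of the carries differs from the vectorised code; this is only the scalar equivalent.

#include <stdint.h>

/* base 2^32 -> base 2^26, mirroring the lsr/orr/and sequences in the code */
static void b32_to_b26(uint32_t l[5], const uint32_t h[4], uint32_t top)
{
	l[0] =   h[0]                        & 0x03ffffff;
	l[1] = ((h[0] >> 26) | (h[1] <<  6)) & 0x03ffffff;
	l[2] = ((h[1] >> 20) | (h[2] << 12)) & 0x03ffffff;
	l[3] = ((h[2] >> 14) | (h[3] << 18)) & 0x03ffffff;
	l[4] =  (h[3] >>  8) | (top << 24);
}

/* base 2^26 -> base 2^32, the adds/adcs ladder at the top of poly1305_blocks;
 * the return value holds the bits above 2^128 */
static uint32_t b26_to_b32(uint32_t h[4], const uint32_t l[5])
{
	uint64_t t;

	t = (uint64_t)l[0] + ((uint64_t)l[1] << 26); h[0] = (uint32_t)t;
	t = (t >> 32)      + ((uint64_t)l[2] << 20); h[1] = (uint32_t)t;
	t = (t >> 32)      + ((uint64_t)l[3] << 14); h[2] = (uint32_t)t;
	t = (t >> 32)      + ((uint64_t)l[4] <<  8); h[3] = (uint32_t)t;
	return (uint32_t)(t >> 32);
}

/* lazy reduction of the 64-bit limb products d[0..4] back to ~26-bit limbs;
 * afterwards h1 and h4 may still be 27 bits wide, as the comments explain */
static void lazy_reduce(uint64_t d[5])
{
	uint64_t c;

	c = d[3] >> 26; d[3] &= 0x03ffffff; d[4] += c;		/* h3 -> h4 */
	c = d[0] >> 26; d[0] &= 0x03ffffff; d[1] += c;		/* h0 -> h1 */
	c = d[4] >> 26; d[4] &= 0x03ffffff; d[0] += c * 5;	/* h4 -> h0, x5 */
	c = d[1] >> 26; d[1] &= 0x03ffffff; d[2] += c;		/* h1 -> h2 */
	c = d[2] >> 26; d[2] &= 0x03ffffff; d[3] += c;		/* h2 -> h3 */
	c = d[0] >> 26; d[0] &= 0x03ffffff; d[1] += c;		/* h0 -> h1 */
	c = d[3] >> 26; d[3] &= 0x03ffffff; d[4] += c;		/* h3 -> h4 */
}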
arch/arm/crypto/poly1305-core.S_shipped (new file, +1158)
··· 1 + #ifndef __KERNEL__ 2 + # include "arm_arch.h" 3 + #else 4 + # define __ARM_ARCH__ __LINUX_ARM_ARCH__ 5 + # define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__ 6 + # define poly1305_init poly1305_init_arm 7 + # define poly1305_blocks poly1305_blocks_arm 8 + # define poly1305_emit poly1305_emit_arm 9 + .globl poly1305_blocks_neon 10 + #endif 11 + 12 + #if defined(__thumb2__) 13 + .syntax unified 14 + .thumb 15 + #else 16 + .code 32 17 + #endif 18 + 19 + .text 20 + 21 + .globl poly1305_emit 22 + .globl poly1305_blocks 23 + .globl poly1305_init 24 + .type poly1305_init,%function 25 + .align 5 26 + poly1305_init: 27 + .Lpoly1305_init: 28 + stmdb sp!,{r4-r11} 29 + 30 + eor r3,r3,r3 31 + cmp r1,#0 32 + str r3,[r0,#0] @ zero hash value 33 + str r3,[r0,#4] 34 + str r3,[r0,#8] 35 + str r3,[r0,#12] 36 + str r3,[r0,#16] 37 + str r3,[r0,#36] @ clear is_base2_26 38 + add r0,r0,#20 39 + 40 + #ifdef __thumb2__ 41 + it eq 42 + #endif 43 + moveq r0,#0 44 + beq .Lno_key 45 + 46 + #if __ARM_MAX_ARCH__>=7 47 + mov r3,#-1 48 + str r3,[r0,#28] @ impossible key power value 49 + # ifndef __KERNEL__ 50 + adr r11,.Lpoly1305_init 51 + ldr r12,.LOPENSSL_armcap 52 + # endif 53 + #endif 54 + ldrb r4,[r1,#0] 55 + mov r10,#0x0fffffff 56 + ldrb r5,[r1,#1] 57 + and r3,r10,#-4 @ 0x0ffffffc 58 + ldrb r6,[r1,#2] 59 + ldrb r7,[r1,#3] 60 + orr r4,r4,r5,lsl#8 61 + ldrb r5,[r1,#4] 62 + orr r4,r4,r6,lsl#16 63 + ldrb r6,[r1,#5] 64 + orr r4,r4,r7,lsl#24 65 + ldrb r7,[r1,#6] 66 + and r4,r4,r10 67 + 68 + #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) 69 + # if !defined(_WIN32) 70 + ldr r12,[r11,r12] @ OPENSSL_armcap_P 71 + # endif 72 + # if defined(__APPLE__) || defined(_WIN32) 73 + ldr r12,[r12] 74 + # endif 75 + #endif 76 + ldrb r8,[r1,#7] 77 + orr r5,r5,r6,lsl#8 78 + ldrb r6,[r1,#8] 79 + orr r5,r5,r7,lsl#16 80 + ldrb r7,[r1,#9] 81 + orr r5,r5,r8,lsl#24 82 + ldrb r8,[r1,#10] 83 + and r5,r5,r3 84 + 85 + #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) 86 + tst r12,#ARMV7_NEON @ check for NEON 87 + # ifdef __thumb2__ 88 + adr r9,.Lpoly1305_blocks_neon 89 + adr r11,.Lpoly1305_blocks 90 + it ne 91 + movne r11,r9 92 + adr r12,.Lpoly1305_emit 93 + orr r11,r11,#1 @ thumb-ify addresses 94 + orr r12,r12,#1 95 + # else 96 + add r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init) 97 + ite eq 98 + addeq r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init) 99 + addne r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init) 100 + # endif 101 + #endif 102 + ldrb r9,[r1,#11] 103 + orr r6,r6,r7,lsl#8 104 + ldrb r7,[r1,#12] 105 + orr r6,r6,r8,lsl#16 106 + ldrb r8,[r1,#13] 107 + orr r6,r6,r9,lsl#24 108 + ldrb r9,[r1,#14] 109 + and r6,r6,r3 110 + 111 + ldrb r10,[r1,#15] 112 + orr r7,r7,r8,lsl#8 113 + str r4,[r0,#0] 114 + orr r7,r7,r9,lsl#16 115 + str r5,[r0,#4] 116 + orr r7,r7,r10,lsl#24 117 + str r6,[r0,#8] 118 + and r7,r7,r3 119 + str r7,[r0,#12] 120 + #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) 121 + stmia r2,{r11,r12} @ fill functions table 122 + mov r0,#1 123 + #else 124 + mov r0,#0 125 + #endif 126 + .Lno_key: 127 + ldmia sp!,{r4-r11} 128 + #if __ARM_ARCH__>=5 129 + bx lr @ bx lr 130 + #else 131 + tst lr,#1 132 + moveq pc,lr @ be binary compatible with V4, yet 133 + .word 0xe12fff1e @ interoperable with Thumb ISA:-) 134 + #endif 135 + .size poly1305_init,.-poly1305_init 136 + .type poly1305_blocks,%function 137 + .align 5 138 + poly1305_blocks: 139 + .Lpoly1305_blocks: 140 + stmdb sp!,{r3-r11,lr} 141 + 142 + ands r2,r2,#-16 143 + beq .Lno_data 144 + 145 + add r2,r2,r1 @ end pointer 146 + sub sp,sp,#32 147 + 148 + #if __ARM_ARCH__<7 149 + ldmia r0,{r4-r12} @ load 
context 150 + add r0,r0,#20 151 + str r2,[sp,#16] @ offload stuff 152 + str r0,[sp,#12] 153 + #else 154 + ldr lr,[r0,#36] @ is_base2_26 155 + ldmia r0!,{r4-r8} @ load hash value 156 + str r2,[sp,#16] @ offload stuff 157 + str r0,[sp,#12] 158 + 159 + adds r9,r4,r5,lsl#26 @ base 2^26 -> base 2^32 160 + mov r10,r5,lsr#6 161 + adcs r10,r10,r6,lsl#20 162 + mov r11,r6,lsr#12 163 + adcs r11,r11,r7,lsl#14 164 + mov r12,r7,lsr#18 165 + adcs r12,r12,r8,lsl#8 166 + mov r2,#0 167 + teq lr,#0 168 + str r2,[r0,#16] @ clear is_base2_26 169 + adc r2,r2,r8,lsr#24 170 + 171 + itttt ne 172 + movne r4,r9 @ choose between radixes 173 + movne r5,r10 174 + movne r6,r11 175 + movne r7,r12 176 + ldmia r0,{r9-r12} @ load key 177 + it ne 178 + movne r8,r2 179 + #endif 180 + 181 + mov lr,r1 182 + cmp r3,#0 183 + str r10,[sp,#20] 184 + str r11,[sp,#24] 185 + str r12,[sp,#28] 186 + b .Loop 187 + 188 + .align 4 189 + .Loop: 190 + #if __ARM_ARCH__<7 191 + ldrb r0,[lr],#16 @ load input 192 + # ifdef __thumb2__ 193 + it hi 194 + # endif 195 + addhi r8,r8,#1 @ 1<<128 196 + ldrb r1,[lr,#-15] 197 + ldrb r2,[lr,#-14] 198 + ldrb r3,[lr,#-13] 199 + orr r1,r0,r1,lsl#8 200 + ldrb r0,[lr,#-12] 201 + orr r2,r1,r2,lsl#16 202 + ldrb r1,[lr,#-11] 203 + orr r3,r2,r3,lsl#24 204 + ldrb r2,[lr,#-10] 205 + adds r4,r4,r3 @ accumulate input 206 + 207 + ldrb r3,[lr,#-9] 208 + orr r1,r0,r1,lsl#8 209 + ldrb r0,[lr,#-8] 210 + orr r2,r1,r2,lsl#16 211 + ldrb r1,[lr,#-7] 212 + orr r3,r2,r3,lsl#24 213 + ldrb r2,[lr,#-6] 214 + adcs r5,r5,r3 215 + 216 + ldrb r3,[lr,#-5] 217 + orr r1,r0,r1,lsl#8 218 + ldrb r0,[lr,#-4] 219 + orr r2,r1,r2,lsl#16 220 + ldrb r1,[lr,#-3] 221 + orr r3,r2,r3,lsl#24 222 + ldrb r2,[lr,#-2] 223 + adcs r6,r6,r3 224 + 225 + ldrb r3,[lr,#-1] 226 + orr r1,r0,r1,lsl#8 227 + str lr,[sp,#8] @ offload input pointer 228 + orr r2,r1,r2,lsl#16 229 + add r10,r10,r10,lsr#2 230 + orr r3,r2,r3,lsl#24 231 + #else 232 + ldr r0,[lr],#16 @ load input 233 + it hi 234 + addhi r8,r8,#1 @ padbit 235 + ldr r1,[lr,#-12] 236 + ldr r2,[lr,#-8] 237 + ldr r3,[lr,#-4] 238 + # ifdef __ARMEB__ 239 + rev r0,r0 240 + rev r1,r1 241 + rev r2,r2 242 + rev r3,r3 243 + # endif 244 + adds r4,r4,r0 @ accumulate input 245 + str lr,[sp,#8] @ offload input pointer 246 + adcs r5,r5,r1 247 + add r10,r10,r10,lsr#2 248 + adcs r6,r6,r2 249 + #endif 250 + add r11,r11,r11,lsr#2 251 + adcs r7,r7,r3 252 + add r12,r12,r12,lsr#2 253 + 254 + umull r2,r3,r5,r9 255 + adc r8,r8,#0 256 + umull r0,r1,r4,r9 257 + umlal r2,r3,r8,r10 258 + umlal r0,r1,r7,r10 259 + ldr r10,[sp,#20] @ reload r10 260 + umlal r2,r3,r6,r12 261 + umlal r0,r1,r5,r12 262 + umlal r2,r3,r7,r11 263 + umlal r0,r1,r6,r11 264 + umlal r2,r3,r4,r10 265 + str r0,[sp,#0] @ future r4 266 + mul r0,r11,r8 267 + ldr r11,[sp,#24] @ reload r11 268 + adds r2,r2,r1 @ d1+=d0>>32 269 + eor r1,r1,r1 270 + adc lr,r3,#0 @ future r6 271 + str r2,[sp,#4] @ future r5 272 + 273 + mul r2,r12,r8 274 + eor r3,r3,r3 275 + umlal r0,r1,r7,r12 276 + ldr r12,[sp,#28] @ reload r12 277 + umlal r2,r3,r7,r9 278 + umlal r0,r1,r6,r9 279 + umlal r2,r3,r6,r10 280 + umlal r0,r1,r5,r10 281 + umlal r2,r3,r5,r11 282 + umlal r0,r1,r4,r11 283 + umlal r2,r3,r4,r12 284 + ldr r4,[sp,#0] 285 + mul r8,r9,r8 286 + ldr r5,[sp,#4] 287 + 288 + adds r6,lr,r0 @ d2+=d1>>32 289 + ldr lr,[sp,#8] @ reload input pointer 290 + adc r1,r1,#0 291 + adds r7,r2,r1 @ d3+=d2>>32 292 + ldr r0,[sp,#16] @ reload end pointer 293 + adc r3,r3,#0 294 + add r8,r8,r3 @ h4+=d3>>32 295 + 296 + and r1,r8,#-4 297 + and r8,r8,#3 298 + add r1,r1,r1,lsr#2 @ *=5 299 + adds r4,r4,r1 300 + adcs r5,r5,#0 301 
+ adcs r6,r6,#0 302 + adcs r7,r7,#0 303 + adc r8,r8,#0 304 + 305 + cmp r0,lr @ done yet? 306 + bhi .Loop 307 + 308 + ldr r0,[sp,#12] 309 + add sp,sp,#32 310 + stmdb r0,{r4-r8} @ store the result 311 + 312 + .Lno_data: 313 + #if __ARM_ARCH__>=5 314 + ldmia sp!,{r3-r11,pc} 315 + #else 316 + ldmia sp!,{r3-r11,lr} 317 + tst lr,#1 318 + moveq pc,lr @ be binary compatible with V4, yet 319 + .word 0xe12fff1e @ interoperable with Thumb ISA:-) 320 + #endif 321 + .size poly1305_blocks,.-poly1305_blocks 322 + .type poly1305_emit,%function 323 + .align 5 324 + poly1305_emit: 325 + .Lpoly1305_emit: 326 + stmdb sp!,{r4-r11} 327 + 328 + ldmia r0,{r3-r7} 329 + 330 + #if __ARM_ARCH__>=7 331 + ldr ip,[r0,#36] @ is_base2_26 332 + 333 + adds r8,r3,r4,lsl#26 @ base 2^26 -> base 2^32 334 + mov r9,r4,lsr#6 335 + adcs r9,r9,r5,lsl#20 336 + mov r10,r5,lsr#12 337 + adcs r10,r10,r6,lsl#14 338 + mov r11,r6,lsr#18 339 + adcs r11,r11,r7,lsl#8 340 + mov r0,#0 341 + adc r0,r0,r7,lsr#24 342 + 343 + tst ip,ip 344 + itttt ne 345 + movne r3,r8 346 + movne r4,r9 347 + movne r5,r10 348 + movne r6,r11 349 + it ne 350 + movne r7,r0 351 + #endif 352 + 353 + adds r8,r3,#5 @ compare to modulus 354 + adcs r9,r4,#0 355 + adcs r10,r5,#0 356 + adcs r11,r6,#0 357 + adc r0,r7,#0 358 + tst r0,#4 @ did it carry/borrow? 359 + 360 + #ifdef __thumb2__ 361 + it ne 362 + #endif 363 + movne r3,r8 364 + ldr r8,[r2,#0] 365 + #ifdef __thumb2__ 366 + it ne 367 + #endif 368 + movne r4,r9 369 + ldr r9,[r2,#4] 370 + #ifdef __thumb2__ 371 + it ne 372 + #endif 373 + movne r5,r10 374 + ldr r10,[r2,#8] 375 + #ifdef __thumb2__ 376 + it ne 377 + #endif 378 + movne r6,r11 379 + ldr r11,[r2,#12] 380 + 381 + adds r3,r3,r8 382 + adcs r4,r4,r9 383 + adcs r5,r5,r10 384 + adc r6,r6,r11 385 + 386 + #if __ARM_ARCH__>=7 387 + # ifdef __ARMEB__ 388 + rev r3,r3 389 + rev r4,r4 390 + rev r5,r5 391 + rev r6,r6 392 + # endif 393 + str r3,[r1,#0] 394 + str r4,[r1,#4] 395 + str r5,[r1,#8] 396 + str r6,[r1,#12] 397 + #else 398 + strb r3,[r1,#0] 399 + mov r3,r3,lsr#8 400 + strb r4,[r1,#4] 401 + mov r4,r4,lsr#8 402 + strb r5,[r1,#8] 403 + mov r5,r5,lsr#8 404 + strb r6,[r1,#12] 405 + mov r6,r6,lsr#8 406 + 407 + strb r3,[r1,#1] 408 + mov r3,r3,lsr#8 409 + strb r4,[r1,#5] 410 + mov r4,r4,lsr#8 411 + strb r5,[r1,#9] 412 + mov r5,r5,lsr#8 413 + strb r6,[r1,#13] 414 + mov r6,r6,lsr#8 415 + 416 + strb r3,[r1,#2] 417 + mov r3,r3,lsr#8 418 + strb r4,[r1,#6] 419 + mov r4,r4,lsr#8 420 + strb r5,[r1,#10] 421 + mov r5,r5,lsr#8 422 + strb r6,[r1,#14] 423 + mov r6,r6,lsr#8 424 + 425 + strb r3,[r1,#3] 426 + strb r4,[r1,#7] 427 + strb r5,[r1,#11] 428 + strb r6,[r1,#15] 429 + #endif 430 + ldmia sp!,{r4-r11} 431 + #if __ARM_ARCH__>=5 432 + bx lr @ bx lr 433 + #else 434 + tst lr,#1 435 + moveq pc,lr @ be binary compatible with V4, yet 436 + .word 0xe12fff1e @ interoperable with Thumb ISA:-) 437 + #endif 438 + .size poly1305_emit,.-poly1305_emit 439 + #if __ARM_MAX_ARCH__>=7 440 + .fpu neon 441 + 442 + .type poly1305_init_neon,%function 443 + .align 5 444 + poly1305_init_neon: 445 + .Lpoly1305_init_neon: 446 + ldr r3,[r0,#48] @ first table element 447 + cmp r3,#-1 @ is value impossible? 
448 + bne .Lno_init_neon 449 + 450 + ldr r4,[r0,#20] @ load key base 2^32 451 + ldr r5,[r0,#24] 452 + ldr r6,[r0,#28] 453 + ldr r7,[r0,#32] 454 + 455 + and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26 456 + mov r3,r4,lsr#26 457 + mov r4,r5,lsr#20 458 + orr r3,r3,r5,lsl#6 459 + mov r5,r6,lsr#14 460 + orr r4,r4,r6,lsl#12 461 + mov r6,r7,lsr#8 462 + orr r5,r5,r7,lsl#18 463 + and r3,r3,#0x03ffffff 464 + and r4,r4,#0x03ffffff 465 + and r5,r5,#0x03ffffff 466 + 467 + vdup.32 d0,r2 @ r^1 in both lanes 468 + add r2,r3,r3,lsl#2 @ *5 469 + vdup.32 d1,r3 470 + add r3,r4,r4,lsl#2 471 + vdup.32 d2,r2 472 + vdup.32 d3,r4 473 + add r4,r5,r5,lsl#2 474 + vdup.32 d4,r3 475 + vdup.32 d5,r5 476 + add r5,r6,r6,lsl#2 477 + vdup.32 d6,r4 478 + vdup.32 d7,r6 479 + vdup.32 d8,r5 480 + 481 + mov r5,#2 @ counter 482 + 483 + .Lsquare_neon: 484 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 485 + @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 486 + @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 487 + @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 488 + @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 489 + @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 490 + 491 + vmull.u32 q5,d0,d0[1] 492 + vmull.u32 q6,d1,d0[1] 493 + vmull.u32 q7,d3,d0[1] 494 + vmull.u32 q8,d5,d0[1] 495 + vmull.u32 q9,d7,d0[1] 496 + 497 + vmlal.u32 q5,d7,d2[1] 498 + vmlal.u32 q6,d0,d1[1] 499 + vmlal.u32 q7,d1,d1[1] 500 + vmlal.u32 q8,d3,d1[1] 501 + vmlal.u32 q9,d5,d1[1] 502 + 503 + vmlal.u32 q5,d5,d4[1] 504 + vmlal.u32 q6,d7,d4[1] 505 + vmlal.u32 q8,d1,d3[1] 506 + vmlal.u32 q7,d0,d3[1] 507 + vmlal.u32 q9,d3,d3[1] 508 + 509 + vmlal.u32 q5,d3,d6[1] 510 + vmlal.u32 q8,d0,d5[1] 511 + vmlal.u32 q6,d5,d6[1] 512 + vmlal.u32 q7,d7,d6[1] 513 + vmlal.u32 q9,d1,d5[1] 514 + 515 + vmlal.u32 q8,d7,d8[1] 516 + vmlal.u32 q5,d1,d8[1] 517 + vmlal.u32 q6,d3,d8[1] 518 + vmlal.u32 q7,d5,d8[1] 519 + vmlal.u32 q9,d0,d7[1] 520 + 521 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 522 + @ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein 523 + @ and P. Schwabe 524 + @ 525 + @ H0>>+H1>>+H2>>+H3>>+H4 526 + @ H3>>+H4>>*5+H0>>+H1 527 + @ 528 + @ Trivia. 529 + @ 530 + @ Result of multiplication of n-bit number by m-bit number is 531 + @ n+m bits wide. However! Even though 2^n is a n+1-bit number, 532 + @ m-bit number multiplied by 2^n is still n+m bits wide. 533 + @ 534 + @ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2, 535 + @ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit 536 + @ one is n+1 bits wide. 537 + @ 538 + @ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that 539 + @ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4 540 + @ can be 27. However! In cases when their width exceeds 26 bits 541 + @ they are limited by 2^26+2^6. This in turn means that *sum* 542 + @ of the products with these values can still be viewed as sum 543 + @ of 52-bit numbers as long as the amount of addends is not a 544 + @ power of 2. For example, 545 + @ 546 + @ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4, 547 + @ 548 + @ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or 549 + @ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than 550 + @ 8 * (2^52) or 2^55. However, the value is then multiplied by 551 + @ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12), 552 + @ which is less than 32 * (2^52) or 2^57. And when processing 553 + @ data we are looking at triple as many addends... 
554 + @ 555 + @ In key setup procedure pre-reduced H0 is limited by 5*4+1 and 556 + @ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the 557 + @ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while 558 + @ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32 559 + @ instruction accepts 2x32-bit input and writes 2x64-bit result. 560 + @ This means that result of reduction have to be compressed upon 561 + @ loop wrap-around. This can be done in the process of reduction 562 + @ to minimize amount of instructions [as well as amount of 563 + @ 128-bit instructions, which benefits low-end processors], but 564 + @ one has to watch for H2 (which is narrower than H0) and 5*H4 565 + @ not being wider than 58 bits, so that result of right shift 566 + @ by 26 bits fits in 32 bits. This is also useful on x86, 567 + @ because it allows to use paddd in place for paddq, which 568 + @ benefits Atom, where paddq is ridiculously slow. 569 + 570 + vshr.u64 q15,q8,#26 571 + vmovn.i64 d16,q8 572 + vshr.u64 q4,q5,#26 573 + vmovn.i64 d10,q5 574 + vadd.i64 q9,q9,q15 @ h3 -> h4 575 + vbic.i32 d16,#0xfc000000 @ &=0x03ffffff 576 + vadd.i64 q6,q6,q4 @ h0 -> h1 577 + vbic.i32 d10,#0xfc000000 578 + 579 + vshrn.u64 d30,q9,#26 580 + vmovn.i64 d18,q9 581 + vshr.u64 q4,q6,#26 582 + vmovn.i64 d12,q6 583 + vadd.i64 q7,q7,q4 @ h1 -> h2 584 + vbic.i32 d18,#0xfc000000 585 + vbic.i32 d12,#0xfc000000 586 + 587 + vadd.i32 d10,d10,d30 588 + vshl.u32 d30,d30,#2 589 + vshrn.u64 d8,q7,#26 590 + vmovn.i64 d14,q7 591 + vadd.i32 d10,d10,d30 @ h4 -> h0 592 + vadd.i32 d16,d16,d8 @ h2 -> h3 593 + vbic.i32 d14,#0xfc000000 594 + 595 + vshr.u32 d30,d10,#26 596 + vbic.i32 d10,#0xfc000000 597 + vshr.u32 d8,d16,#26 598 + vbic.i32 d16,#0xfc000000 599 + vadd.i32 d12,d12,d30 @ h0 -> h1 600 + vadd.i32 d18,d18,d8 @ h3 -> h4 601 + 602 + subs r5,r5,#1 603 + beq .Lsquare_break_neon 604 + 605 + add r6,r0,#(48+0*9*4) 606 + add r7,r0,#(48+1*9*4) 607 + 608 + vtrn.32 d0,d10 @ r^2:r^1 609 + vtrn.32 d3,d14 610 + vtrn.32 d5,d16 611 + vtrn.32 d1,d12 612 + vtrn.32 d7,d18 613 + 614 + vshl.u32 d4,d3,#2 @ *5 615 + vshl.u32 d6,d5,#2 616 + vshl.u32 d2,d1,#2 617 + vshl.u32 d8,d7,#2 618 + vadd.i32 d4,d4,d3 619 + vadd.i32 d2,d2,d1 620 + vadd.i32 d6,d6,d5 621 + vadd.i32 d8,d8,d7 622 + 623 + vst4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! 624 + vst4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! 625 + vst4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]! 626 + vst4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]! 627 + vst1.32 {d8[0]},[r6,:32] 628 + vst1.32 {d8[1]},[r7,:32] 629 + 630 + b .Lsquare_neon 631 + 632 + .align 4 633 + .Lsquare_break_neon: 634 + add r6,r0,#(48+2*4*9) 635 + add r7,r0,#(48+3*4*9) 636 + 637 + vmov d0,d10 @ r^4:r^3 638 + vshl.u32 d2,d12,#2 @ *5 639 + vmov d1,d12 640 + vshl.u32 d4,d14,#2 641 + vmov d3,d14 642 + vshl.u32 d6,d16,#2 643 + vmov d5,d16 644 + vshl.u32 d8,d18,#2 645 + vmov d7,d18 646 + vadd.i32 d2,d2,d12 647 + vadd.i32 d4,d4,d14 648 + vadd.i32 d6,d6,d16 649 + vadd.i32 d8,d8,d18 650 + 651 + vst4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! 652 + vst4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! 653 + vst4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]! 654 + vst4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]! 
655 + vst1.32 {d8[0]},[r6] 656 + vst1.32 {d8[1]},[r7] 657 + 658 + .Lno_init_neon: 659 + bx lr @ bx lr 660 + .size poly1305_init_neon,.-poly1305_init_neon 661 + 662 + .type poly1305_blocks_neon,%function 663 + .align 5 664 + poly1305_blocks_neon: 665 + .Lpoly1305_blocks_neon: 666 + ldr ip,[r0,#36] @ is_base2_26 667 + 668 + cmp r2,#64 669 + blo .Lpoly1305_blocks 670 + 671 + stmdb sp!,{r4-r7} 672 + vstmdb sp!,{d8-d15} @ ABI specification says so 673 + 674 + tst ip,ip @ is_base2_26? 675 + bne .Lbase2_26_neon 676 + 677 + stmdb sp!,{r1-r3,lr} 678 + bl .Lpoly1305_init_neon 679 + 680 + ldr r4,[r0,#0] @ load hash value base 2^32 681 + ldr r5,[r0,#4] 682 + ldr r6,[r0,#8] 683 + ldr r7,[r0,#12] 684 + ldr ip,[r0,#16] 685 + 686 + and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26 687 + mov r3,r4,lsr#26 688 + veor d10,d10,d10 689 + mov r4,r5,lsr#20 690 + orr r3,r3,r5,lsl#6 691 + veor d12,d12,d12 692 + mov r5,r6,lsr#14 693 + orr r4,r4,r6,lsl#12 694 + veor d14,d14,d14 695 + mov r6,r7,lsr#8 696 + orr r5,r5,r7,lsl#18 697 + veor d16,d16,d16 698 + and r3,r3,#0x03ffffff 699 + orr r6,r6,ip,lsl#24 700 + veor d18,d18,d18 701 + and r4,r4,#0x03ffffff 702 + mov r1,#1 703 + and r5,r5,#0x03ffffff 704 + str r1,[r0,#36] @ set is_base2_26 705 + 706 + vmov.32 d10[0],r2 707 + vmov.32 d12[0],r3 708 + vmov.32 d14[0],r4 709 + vmov.32 d16[0],r5 710 + vmov.32 d18[0],r6 711 + adr r5,.Lzeros 712 + 713 + ldmia sp!,{r1-r3,lr} 714 + b .Lhash_loaded 715 + 716 + .align 4 717 + .Lbase2_26_neon: 718 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 719 + @ load hash value 720 + 721 + veor d10,d10,d10 722 + veor d12,d12,d12 723 + veor d14,d14,d14 724 + veor d16,d16,d16 725 + veor d18,d18,d18 726 + vld4.32 {d10[0],d12[0],d14[0],d16[0]},[r0]! 727 + adr r5,.Lzeros 728 + vld1.32 {d18[0]},[r0] 729 + sub r0,r0,#16 @ rewind 730 + 731 + .Lhash_loaded: 732 + add r4,r1,#32 733 + mov r3,r3,lsl#24 734 + tst r2,#31 735 + beq .Leven 736 + 737 + vld4.32 {d20[0],d22[0],d24[0],d26[0]},[r1]! 
738 + vmov.32 d28[0],r3 739 + sub r2,r2,#16 740 + add r4,r1,#32 741 + 742 + # ifdef __ARMEB__ 743 + vrev32.8 q10,q10 744 + vrev32.8 q13,q13 745 + vrev32.8 q11,q11 746 + vrev32.8 q12,q12 747 + # endif 748 + vsri.u32 d28,d26,#8 @ base 2^32 -> base 2^26 749 + vshl.u32 d26,d26,#18 750 + 751 + vsri.u32 d26,d24,#14 752 + vshl.u32 d24,d24,#12 753 + vadd.i32 d29,d28,d18 @ add hash value and move to #hi 754 + 755 + vbic.i32 d26,#0xfc000000 756 + vsri.u32 d24,d22,#20 757 + vshl.u32 d22,d22,#6 758 + 759 + vbic.i32 d24,#0xfc000000 760 + vsri.u32 d22,d20,#26 761 + vadd.i32 d27,d26,d16 762 + 763 + vbic.i32 d20,#0xfc000000 764 + vbic.i32 d22,#0xfc000000 765 + vadd.i32 d25,d24,d14 766 + 767 + vadd.i32 d21,d20,d10 768 + vadd.i32 d23,d22,d12 769 + 770 + mov r7,r5 771 + add r6,r0,#48 772 + 773 + cmp r2,r2 774 + b .Long_tail 775 + 776 + .align 4 777 + .Leven: 778 + subs r2,r2,#64 779 + it lo 780 + movlo r4,r5 781 + 782 + vmov.i32 q14,#1<<24 @ padbit, yes, always 783 + vld4.32 {d20,d22,d24,d26},[r1] @ inp[0:1] 784 + add r1,r1,#64 785 + vld4.32 {d21,d23,d25,d27},[r4] @ inp[2:3] (or 0) 786 + add r4,r4,#64 787 + itt hi 788 + addhi r7,r0,#(48+1*9*4) 789 + addhi r6,r0,#(48+3*9*4) 790 + 791 + # ifdef __ARMEB__ 792 + vrev32.8 q10,q10 793 + vrev32.8 q13,q13 794 + vrev32.8 q11,q11 795 + vrev32.8 q12,q12 796 + # endif 797 + vsri.u32 q14,q13,#8 @ base 2^32 -> base 2^26 798 + vshl.u32 q13,q13,#18 799 + 800 + vsri.u32 q13,q12,#14 801 + vshl.u32 q12,q12,#12 802 + 803 + vbic.i32 q13,#0xfc000000 804 + vsri.u32 q12,q11,#20 805 + vshl.u32 q11,q11,#6 806 + 807 + vbic.i32 q12,#0xfc000000 808 + vsri.u32 q11,q10,#26 809 + 810 + vbic.i32 q10,#0xfc000000 811 + vbic.i32 q11,#0xfc000000 812 + 813 + bls .Lskip_loop 814 + 815 + vld4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^2 816 + vld4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^4 817 + vld4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]! 818 + vld4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]! 819 + b .Loop_neon 820 + 821 + .align 5 822 + .Loop_neon: 823 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 824 + @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 825 + @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r 826 + @ ___________________/ 827 + @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 828 + @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r 829 + @ ___________________/ ____________________/ 830 + @ 831 + @ Note that we start with inp[2:3]*r^2. This is because it 832 + @ doesn't depend on reduction in previous iteration. 
833 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 834 + @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 835 + @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 836 + @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 837 + @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 838 + @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 839 + 840 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 841 + @ inp[2:3]*r^2 842 + 843 + vadd.i32 d24,d24,d14 @ accumulate inp[0:1] 844 + vmull.u32 q7,d25,d0[1] 845 + vadd.i32 d20,d20,d10 846 + vmull.u32 q5,d21,d0[1] 847 + vadd.i32 d26,d26,d16 848 + vmull.u32 q8,d27,d0[1] 849 + vmlal.u32 q7,d23,d1[1] 850 + vadd.i32 d22,d22,d12 851 + vmull.u32 q6,d23,d0[1] 852 + 853 + vadd.i32 d28,d28,d18 854 + vmull.u32 q9,d29,d0[1] 855 + subs r2,r2,#64 856 + vmlal.u32 q5,d29,d2[1] 857 + it lo 858 + movlo r4,r5 859 + vmlal.u32 q8,d25,d1[1] 860 + vld1.32 d8[1],[r7,:32] 861 + vmlal.u32 q6,d21,d1[1] 862 + vmlal.u32 q9,d27,d1[1] 863 + 864 + vmlal.u32 q5,d27,d4[1] 865 + vmlal.u32 q8,d23,d3[1] 866 + vmlal.u32 q9,d25,d3[1] 867 + vmlal.u32 q6,d29,d4[1] 868 + vmlal.u32 q7,d21,d3[1] 869 + 870 + vmlal.u32 q8,d21,d5[1] 871 + vmlal.u32 q5,d25,d6[1] 872 + vmlal.u32 q9,d23,d5[1] 873 + vmlal.u32 q6,d27,d6[1] 874 + vmlal.u32 q7,d29,d6[1] 875 + 876 + vmlal.u32 q8,d29,d8[1] 877 + vmlal.u32 q5,d23,d8[1] 878 + vmlal.u32 q9,d21,d7[1] 879 + vmlal.u32 q6,d25,d8[1] 880 + vmlal.u32 q7,d27,d8[1] 881 + 882 + vld4.32 {d21,d23,d25,d27},[r4] @ inp[2:3] (or 0) 883 + add r4,r4,#64 884 + 885 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 886 + @ (hash+inp[0:1])*r^4 and accumulate 887 + 888 + vmlal.u32 q8,d26,d0[0] 889 + vmlal.u32 q5,d20,d0[0] 890 + vmlal.u32 q9,d28,d0[0] 891 + vmlal.u32 q6,d22,d0[0] 892 + vmlal.u32 q7,d24,d0[0] 893 + vld1.32 d8[0],[r6,:32] 894 + 895 + vmlal.u32 q8,d24,d1[0] 896 + vmlal.u32 q5,d28,d2[0] 897 + vmlal.u32 q9,d26,d1[0] 898 + vmlal.u32 q6,d20,d1[0] 899 + vmlal.u32 q7,d22,d1[0] 900 + 901 + vmlal.u32 q8,d22,d3[0] 902 + vmlal.u32 q5,d26,d4[0] 903 + vmlal.u32 q9,d24,d3[0] 904 + vmlal.u32 q6,d28,d4[0] 905 + vmlal.u32 q7,d20,d3[0] 906 + 907 + vmlal.u32 q8,d20,d5[0] 908 + vmlal.u32 q5,d24,d6[0] 909 + vmlal.u32 q9,d22,d5[0] 910 + vmlal.u32 q6,d26,d6[0] 911 + vmlal.u32 q8,d28,d8[0] 912 + 913 + vmlal.u32 q7,d28,d6[0] 914 + vmlal.u32 q5,d22,d8[0] 915 + vmlal.u32 q9,d20,d7[0] 916 + vmov.i32 q14,#1<<24 @ padbit, yes, always 917 + vmlal.u32 q6,d24,d8[0] 918 + vmlal.u32 q7,d26,d8[0] 919 + 920 + vld4.32 {d20,d22,d24,d26},[r1] @ inp[0:1] 921 + add r1,r1,#64 922 + # ifdef __ARMEB__ 923 + vrev32.8 q10,q10 924 + vrev32.8 q11,q11 925 + vrev32.8 q12,q12 926 + vrev32.8 q13,q13 927 + # endif 928 + 929 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 930 + @ lazy reduction interleaved with base 2^32 -> base 2^26 of 931 + @ inp[0:3] previously loaded to q10-q13 and smashed to q10-q14. 
932 + 933 + vshr.u64 q15,q8,#26 934 + vmovn.i64 d16,q8 935 + vshr.u64 q4,q5,#26 936 + vmovn.i64 d10,q5 937 + vadd.i64 q9,q9,q15 @ h3 -> h4 938 + vbic.i32 d16,#0xfc000000 939 + vsri.u32 q14,q13,#8 @ base 2^32 -> base 2^26 940 + vadd.i64 q6,q6,q4 @ h0 -> h1 941 + vshl.u32 q13,q13,#18 942 + vbic.i32 d10,#0xfc000000 943 + 944 + vshrn.u64 d30,q9,#26 945 + vmovn.i64 d18,q9 946 + vshr.u64 q4,q6,#26 947 + vmovn.i64 d12,q6 948 + vadd.i64 q7,q7,q4 @ h1 -> h2 949 + vsri.u32 q13,q12,#14 950 + vbic.i32 d18,#0xfc000000 951 + vshl.u32 q12,q12,#12 952 + vbic.i32 d12,#0xfc000000 953 + 954 + vadd.i32 d10,d10,d30 955 + vshl.u32 d30,d30,#2 956 + vbic.i32 q13,#0xfc000000 957 + vshrn.u64 d8,q7,#26 958 + vmovn.i64 d14,q7 959 + vaddl.u32 q5,d10,d30 @ h4 -> h0 [widen for a sec] 960 + vsri.u32 q12,q11,#20 961 + vadd.i32 d16,d16,d8 @ h2 -> h3 962 + vshl.u32 q11,q11,#6 963 + vbic.i32 d14,#0xfc000000 964 + vbic.i32 q12,#0xfc000000 965 + 966 + vshrn.u64 d30,q5,#26 @ re-narrow 967 + vmovn.i64 d10,q5 968 + vsri.u32 q11,q10,#26 969 + vbic.i32 q10,#0xfc000000 970 + vshr.u32 d8,d16,#26 971 + vbic.i32 d16,#0xfc000000 972 + vbic.i32 d10,#0xfc000000 973 + vadd.i32 d12,d12,d30 @ h0 -> h1 974 + vadd.i32 d18,d18,d8 @ h3 -> h4 975 + vbic.i32 q11,#0xfc000000 976 + 977 + bhi .Loop_neon 978 + 979 + .Lskip_loop: 980 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 981 + @ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 982 + 983 + add r7,r0,#(48+0*9*4) 984 + add r6,r0,#(48+1*9*4) 985 + adds r2,r2,#32 986 + it ne 987 + movne r2,#0 988 + bne .Long_tail 989 + 990 + vadd.i32 d25,d24,d14 @ add hash value and move to #hi 991 + vadd.i32 d21,d20,d10 992 + vadd.i32 d27,d26,d16 993 + vadd.i32 d23,d22,d12 994 + vadd.i32 d29,d28,d18 995 + 996 + .Long_tail: 997 + vld4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^1 998 + vld4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^2 999 + 1000 + vadd.i32 d24,d24,d14 @ can be redundant 1001 + vmull.u32 q7,d25,d0 1002 + vadd.i32 d20,d20,d10 1003 + vmull.u32 q5,d21,d0 1004 + vadd.i32 d26,d26,d16 1005 + vmull.u32 q8,d27,d0 1006 + vadd.i32 d22,d22,d12 1007 + vmull.u32 q6,d23,d0 1008 + vadd.i32 d28,d28,d18 1009 + vmull.u32 q9,d29,d0 1010 + 1011 + vmlal.u32 q5,d29,d2 1012 + vld4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]! 1013 + vmlal.u32 q8,d25,d1 1014 + vld4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]! 1015 + vmlal.u32 q6,d21,d1 1016 + vmlal.u32 q9,d27,d1 1017 + vmlal.u32 q7,d23,d1 1018 + 1019 + vmlal.u32 q8,d23,d3 1020 + vld1.32 d8[1],[r7,:32] 1021 + vmlal.u32 q5,d27,d4 1022 + vld1.32 d8[0],[r6,:32] 1023 + vmlal.u32 q9,d25,d3 1024 + vmlal.u32 q6,d29,d4 1025 + vmlal.u32 q7,d21,d3 1026 + 1027 + vmlal.u32 q8,d21,d5 1028 + it ne 1029 + addne r7,r0,#(48+2*9*4) 1030 + vmlal.u32 q5,d25,d6 1031 + it ne 1032 + addne r6,r0,#(48+3*9*4) 1033 + vmlal.u32 q9,d23,d5 1034 + vmlal.u32 q6,d27,d6 1035 + vmlal.u32 q7,d29,d6 1036 + 1037 + vmlal.u32 q8,d29,d8 1038 + vorn q0,q0,q0 @ all-ones, can be redundant 1039 + vmlal.u32 q5,d23,d8 1040 + vshr.u64 q0,q0,#38 1041 + vmlal.u32 q9,d21,d7 1042 + vmlal.u32 q6,d25,d8 1043 + vmlal.u32 q7,d27,d8 1044 + 1045 + beq .Lshort_tail 1046 + 1047 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 1048 + @ (hash+inp[0:1])*r^4:r^3 and accumulate 1049 + 1050 + vld4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^3 1051 + vld4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^4 1052 + 1053 + vmlal.u32 q7,d24,d0 1054 + vmlal.u32 q5,d20,d0 1055 + vmlal.u32 q8,d26,d0 1056 + vmlal.u32 q6,d22,d0 1057 + vmlal.u32 q9,d28,d0 1058 + 1059 + vmlal.u32 q5,d28,d2 1060 + vld4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]! 
1061 + vmlal.u32 q8,d24,d1 1062 + vld4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]! 1063 + vmlal.u32 q6,d20,d1 1064 + vmlal.u32 q9,d26,d1 1065 + vmlal.u32 q7,d22,d1 1066 + 1067 + vmlal.u32 q8,d22,d3 1068 + vld1.32 d8[1],[r7,:32] 1069 + vmlal.u32 q5,d26,d4 1070 + vld1.32 d8[0],[r6,:32] 1071 + vmlal.u32 q9,d24,d3 1072 + vmlal.u32 q6,d28,d4 1073 + vmlal.u32 q7,d20,d3 1074 + 1075 + vmlal.u32 q8,d20,d5 1076 + vmlal.u32 q5,d24,d6 1077 + vmlal.u32 q9,d22,d5 1078 + vmlal.u32 q6,d26,d6 1079 + vmlal.u32 q7,d28,d6 1080 + 1081 + vmlal.u32 q8,d28,d8 1082 + vorn q0,q0,q0 @ all-ones 1083 + vmlal.u32 q5,d22,d8 1084 + vshr.u64 q0,q0,#38 1085 + vmlal.u32 q9,d20,d7 1086 + vmlal.u32 q6,d24,d8 1087 + vmlal.u32 q7,d26,d8 1088 + 1089 + .Lshort_tail: 1090 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 1091 + @ horizontal addition 1092 + 1093 + vadd.i64 d16,d16,d17 1094 + vadd.i64 d10,d10,d11 1095 + vadd.i64 d18,d18,d19 1096 + vadd.i64 d12,d12,d13 1097 + vadd.i64 d14,d14,d15 1098 + 1099 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 1100 + @ lazy reduction, but without narrowing 1101 + 1102 + vshr.u64 q15,q8,#26 1103 + vand.i64 q8,q8,q0 1104 + vshr.u64 q4,q5,#26 1105 + vand.i64 q5,q5,q0 1106 + vadd.i64 q9,q9,q15 @ h3 -> h4 1107 + vadd.i64 q6,q6,q4 @ h0 -> h1 1108 + 1109 + vshr.u64 q15,q9,#26 1110 + vand.i64 q9,q9,q0 1111 + vshr.u64 q4,q6,#26 1112 + vand.i64 q6,q6,q0 1113 + vadd.i64 q7,q7,q4 @ h1 -> h2 1114 + 1115 + vadd.i64 q5,q5,q15 1116 + vshl.u64 q15,q15,#2 1117 + vshr.u64 q4,q7,#26 1118 + vand.i64 q7,q7,q0 1119 + vadd.i64 q5,q5,q15 @ h4 -> h0 1120 + vadd.i64 q8,q8,q4 @ h2 -> h3 1121 + 1122 + vshr.u64 q15,q5,#26 1123 + vand.i64 q5,q5,q0 1124 + vshr.u64 q4,q8,#26 1125 + vand.i64 q8,q8,q0 1126 + vadd.i64 q6,q6,q15 @ h0 -> h1 1127 + vadd.i64 q9,q9,q4 @ h3 -> h4 1128 + 1129 + cmp r2,#0 1130 + bne .Leven 1131 + 1132 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 1133 + @ store hash value 1134 + 1135 + vst4.32 {d10[0],d12[0],d14[0],d16[0]},[r0]! 1136 + vst1.32 {d18[0]},[r0] 1137 + 1138 + vldmia sp!,{d8-d15} @ epilogue 1139 + ldmia sp!,{r4-r7} 1140 + bx lr @ bx lr 1141 + .size poly1305_blocks_neon,.-poly1305_blocks_neon 1142 + 1143 + .align 5 1144 + .Lzeros: 1145 + .long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 1146 + #ifndef __KERNEL__ 1147 + .LOPENSSL_armcap: 1148 + # ifdef _WIN32 1149 + .word OPENSSL_armcap_P 1150 + # else 1151 + .word OPENSSL_armcap_P-.Lpoly1305_init 1152 + # endif 1153 + .comm OPENSSL_armcap_P,4,4 1154 + .hidden OPENSSL_armcap_P 1155 + #endif 1156 + #endif 1157 + .asciz "Poly1305 for ARMv4/NEON, CRYPTOGAMS by @dot-asm" 1158 + .align 2
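The "base 2^32 -> base 2^26" conversion and the lazy reduction that the comments in the NEON code describe are easier to follow in scalar form. The C sketch below is illustrative only (not part of the patch; the names are made up): it models what each 64-bit NEON lane computes, splitting a block into five 26-bit limbs and then propagating carries in the h3->h4, h0->h1, h1->h2, h4->h0 (times 5, since 2^130 == 5 mod 2^130-5), h2->h3, h0->h1, h3->h4 order used above.

    #include <stdint.h>

    #define P1305_MASK26 0x3ffffffu  /* low 26 bits */

    /* Split a 16-byte block, given as four 32-bit words plus the high
     * "pad" bit, into five base-2^26 limbs -- the shift/insert sequence
     * the NEON code performs with vsri/vshl/vbic. */
    static void p1305_to_base26(uint32_t h[5], const uint32_t w[4], uint32_t hibit)
    {
    	h[0] =  w[0]                      & P1305_MASK26;
    	h[1] = (w[0] >> 26 | w[1] <<  6)  & P1305_MASK26;
    	h[2] = (w[1] >> 20 | w[2] << 12)  & P1305_MASK26;
    	h[3] = (w[2] >> 14 | w[3] << 18)  & P1305_MASK26;
    	h[4] = (w[3] >>  8 | hibit << 24);
    }

    /* Scalar model of the lazy reduction: d[0..4] are the 64-bit sums the
     * vmlal.u32 instructions accumulate.  The result is only "almost"
     * reduced (h4 may keep a couple of extra bits), which is all the next
     * loop iteration needs. */
    static void p1305_lazy_reduce(uint32_t h[5], const uint64_t d[5])
    {
    	uint64_t d0 = d[0], d1 = d[1], d2 = d[2], d3 = d[3], d4 = d[4];

    	d4 += d3 >> 26;  d3 &= P1305_MASK26;            /* h3 -> h4 */
    	d1 += d0 >> 26;  d0 &= P1305_MASK26;            /* h0 -> h1 */

    	d2 += d1 >> 26;  d1 &= P1305_MASK26;            /* h1 -> h2 */
    	d0 += (d4 >> 26) * 5;  d4 &= P1305_MASK26;      /* h4 -> h0: 2^130 == 5 mod p */
    	d3 += d2 >> 26;  d2 &= P1305_MASK26;            /* h2 -> h3 */

    	d1 += d0 >> 26;  d0 &= P1305_MASK26;            /* h0 -> h1 */
    	d4 += d3 >> 26;  d3 &= P1305_MASK26;            /* h3 -> h4 */

    	h[0] = d0; h[1] = d1; h[2] = d2; h[3] = d3; h[4] = d4;
    }

This is the carry chain performed on q5..q9/d10..d18 above; in the main loop it is interleaved with the radix conversion of the next input block to hide latency, as the "lazy reduction interleaved with base 2^32 -> base 2^26" comment notes.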
+276
arch/arm/crypto/poly1305-glue.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * OpenSSL/Cryptogams accelerated Poly1305 transform for ARM 4 + * 5 + * Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org> 6 + */ 7 + 8 + #include <asm/hwcap.h> 9 + #include <asm/neon.h> 10 + #include <asm/simd.h> 11 + #include <asm/unaligned.h> 12 + #include <crypto/algapi.h> 13 + #include <crypto/internal/hash.h> 14 + #include <crypto/internal/poly1305.h> 15 + #include <crypto/internal/simd.h> 16 + #include <linux/cpufeature.h> 17 + #include <linux/crypto.h> 18 + #include <linux/jump_label.h> 19 + #include <linux/module.h> 20 + 21 + void poly1305_init_arm(void *state, const u8 *key); 22 + void poly1305_blocks_arm(void *state, const u8 *src, u32 len, u32 hibit); 23 + void poly1305_emit_arm(void *state, __le32 *digest, const u32 *nonce); 24 + 25 + void __weak poly1305_blocks_neon(void *state, const u8 *src, u32 len, u32 hibit) 26 + { 27 + } 28 + 29 + static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); 30 + 31 + void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key) 32 + { 33 + poly1305_init_arm(&dctx->h, key); 34 + dctx->s[0] = get_unaligned_le32(key + 16); 35 + dctx->s[1] = get_unaligned_le32(key + 20); 36 + dctx->s[2] = get_unaligned_le32(key + 24); 37 + dctx->s[3] = get_unaligned_le32(key + 28); 38 + dctx->buflen = 0; 39 + } 40 + EXPORT_SYMBOL(poly1305_init_arch); 41 + 42 + static int arm_poly1305_init(struct shash_desc *desc) 43 + { 44 + struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); 45 + 46 + dctx->buflen = 0; 47 + dctx->rset = 0; 48 + dctx->sset = false; 49 + 50 + return 0; 51 + } 52 + 53 + static void arm_poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src, 54 + u32 len, u32 hibit, bool do_neon) 55 + { 56 + if (unlikely(!dctx->sset)) { 57 + if (!dctx->rset) { 58 + poly1305_init_arm(&dctx->h, src); 59 + src += POLY1305_BLOCK_SIZE; 60 + len -= POLY1305_BLOCK_SIZE; 61 + dctx->rset = 1; 62 + } 63 + if (len >= POLY1305_BLOCK_SIZE) { 64 + dctx->s[0] = get_unaligned_le32(src + 0); 65 + dctx->s[1] = get_unaligned_le32(src + 4); 66 + dctx->s[2] = get_unaligned_le32(src + 8); 67 + dctx->s[3] = get_unaligned_le32(src + 12); 68 + src += POLY1305_BLOCK_SIZE; 69 + len -= POLY1305_BLOCK_SIZE; 70 + dctx->sset = true; 71 + } 72 + if (len < POLY1305_BLOCK_SIZE) 73 + return; 74 + } 75 + 76 + len &= ~(POLY1305_BLOCK_SIZE - 1); 77 + 78 + if (static_branch_likely(&have_neon) && likely(do_neon)) 79 + poly1305_blocks_neon(&dctx->h, src, len, hibit); 80 + else 81 + poly1305_blocks_arm(&dctx->h, src, len, hibit); 82 + } 83 + 84 + static void arm_poly1305_do_update(struct poly1305_desc_ctx *dctx, 85 + const u8 *src, u32 len, bool do_neon) 86 + { 87 + if (unlikely(dctx->buflen)) { 88 + u32 bytes = min(len, POLY1305_BLOCK_SIZE - dctx->buflen); 89 + 90 + memcpy(dctx->buf + dctx->buflen, src, bytes); 91 + src += bytes; 92 + len -= bytes; 93 + dctx->buflen += bytes; 94 + 95 + if (dctx->buflen == POLY1305_BLOCK_SIZE) { 96 + arm_poly1305_blocks(dctx, dctx->buf, 97 + POLY1305_BLOCK_SIZE, 1, false); 98 + dctx->buflen = 0; 99 + } 100 + } 101 + 102 + if (likely(len >= POLY1305_BLOCK_SIZE)) { 103 + arm_poly1305_blocks(dctx, src, len, 1, do_neon); 104 + src += round_down(len, POLY1305_BLOCK_SIZE); 105 + len %= POLY1305_BLOCK_SIZE; 106 + } 107 + 108 + if (unlikely(len)) { 109 + dctx->buflen = len; 110 + memcpy(dctx->buf, src, len); 111 + } 112 + } 113 + 114 + static int arm_poly1305_update(struct shash_desc *desc, 115 + const u8 *src, unsigned int srclen) 116 + { 117 + struct poly1305_desc_ctx *dctx = 
shash_desc_ctx(desc); 118 + 119 + arm_poly1305_do_update(dctx, src, srclen, false); 120 + return 0; 121 + } 122 + 123 + static int __maybe_unused arm_poly1305_update_neon(struct shash_desc *desc, 124 + const u8 *src, 125 + unsigned int srclen) 126 + { 127 + struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); 128 + bool do_neon = crypto_simd_usable() && srclen > 128; 129 + 130 + if (static_branch_likely(&have_neon) && do_neon) 131 + kernel_neon_begin(); 132 + arm_poly1305_do_update(dctx, src, srclen, do_neon); 133 + if (static_branch_likely(&have_neon) && do_neon) 134 + kernel_neon_end(); 135 + return 0; 136 + } 137 + 138 + void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src, 139 + unsigned int nbytes) 140 + { 141 + bool do_neon = IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && 142 + crypto_simd_usable(); 143 + 144 + if (unlikely(dctx->buflen)) { 145 + u32 bytes = min(nbytes, POLY1305_BLOCK_SIZE - dctx->buflen); 146 + 147 + memcpy(dctx->buf + dctx->buflen, src, bytes); 148 + src += bytes; 149 + nbytes -= bytes; 150 + dctx->buflen += bytes; 151 + 152 + if (dctx->buflen == POLY1305_BLOCK_SIZE) { 153 + poly1305_blocks_arm(&dctx->h, dctx->buf, 154 + POLY1305_BLOCK_SIZE, 1); 155 + dctx->buflen = 0; 156 + } 157 + } 158 + 159 + if (likely(nbytes >= POLY1305_BLOCK_SIZE)) { 160 + unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE); 161 + 162 + if (static_branch_likely(&have_neon) && do_neon) { 163 + kernel_neon_begin(); 164 + poly1305_blocks_neon(&dctx->h, src, len, 1); 165 + kernel_neon_end(); 166 + } else { 167 + poly1305_blocks_arm(&dctx->h, src, len, 1); 168 + } 169 + src += len; 170 + nbytes %= POLY1305_BLOCK_SIZE; 171 + } 172 + 173 + if (unlikely(nbytes)) { 174 + dctx->buflen = nbytes; 175 + memcpy(dctx->buf, src, nbytes); 176 + } 177 + } 178 + EXPORT_SYMBOL(poly1305_update_arch); 179 + 180 + void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst) 181 + { 182 + __le32 digest[4]; 183 + u64 f = 0; 184 + 185 + if (unlikely(dctx->buflen)) { 186 + dctx->buf[dctx->buflen++] = 1; 187 + memset(dctx->buf + dctx->buflen, 0, 188 + POLY1305_BLOCK_SIZE - dctx->buflen); 189 + poly1305_blocks_arm(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0); 190 + } 191 + 192 + poly1305_emit_arm(&dctx->h, digest, dctx->s); 193 + 194 + /* mac = (h + s) % (2^128) */ 195 + f = (f >> 32) + le32_to_cpu(digest[0]); 196 + put_unaligned_le32(f, dst); 197 + f = (f >> 32) + le32_to_cpu(digest[1]); 198 + put_unaligned_le32(f, dst + 4); 199 + f = (f >> 32) + le32_to_cpu(digest[2]); 200 + put_unaligned_le32(f, dst + 8); 201 + f = (f >> 32) + le32_to_cpu(digest[3]); 202 + put_unaligned_le32(f, dst + 12); 203 + 204 + *dctx = (struct poly1305_desc_ctx){}; 205 + } 206 + EXPORT_SYMBOL(poly1305_final_arch); 207 + 208 + static int arm_poly1305_final(struct shash_desc *desc, u8 *dst) 209 + { 210 + struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); 211 + 212 + if (unlikely(!dctx->sset)) 213 + return -ENOKEY; 214 + 215 + poly1305_final_arch(dctx, dst); 216 + return 0; 217 + } 218 + 219 + static struct shash_alg arm_poly1305_algs[] = {{ 220 + .init = arm_poly1305_init, 221 + .update = arm_poly1305_update, 222 + .final = arm_poly1305_final, 223 + .digestsize = POLY1305_DIGEST_SIZE, 224 + .descsize = sizeof(struct poly1305_desc_ctx), 225 + 226 + .base.cra_name = "poly1305", 227 + .base.cra_driver_name = "poly1305-arm", 228 + .base.cra_priority = 150, 229 + .base.cra_blocksize = POLY1305_BLOCK_SIZE, 230 + .base.cra_module = THIS_MODULE, 231 + #ifdef CONFIG_KERNEL_MODE_NEON 232 + }, { 233 + .init = arm_poly1305_init, 
234 + .update = arm_poly1305_update_neon, 235 + .final = arm_poly1305_final, 236 + .digestsize = POLY1305_DIGEST_SIZE, 237 + .descsize = sizeof(struct poly1305_desc_ctx), 238 + 239 + .base.cra_name = "poly1305", 240 + .base.cra_driver_name = "poly1305-neon", 241 + .base.cra_priority = 200, 242 + .base.cra_blocksize = POLY1305_BLOCK_SIZE, 243 + .base.cra_module = THIS_MODULE, 244 + #endif 245 + }}; 246 + 247 + static int __init arm_poly1305_mod_init(void) 248 + { 249 + if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && 250 + (elf_hwcap & HWCAP_NEON)) 251 + static_branch_enable(&have_neon); 252 + else 253 + /* register only the first entry */ 254 + return crypto_register_shash(&arm_poly1305_algs[0]); 255 + 256 + return crypto_register_shashes(arm_poly1305_algs, 257 + ARRAY_SIZE(arm_poly1305_algs)); 258 + } 259 + 260 + static void __exit arm_poly1305_mod_exit(void) 261 + { 262 + if (!static_branch_likely(&have_neon)) { 263 + crypto_unregister_shash(&arm_poly1305_algs[0]); 264 + return; 265 + } 266 + crypto_unregister_shashes(arm_poly1305_algs, 267 + ARRAY_SIZE(arm_poly1305_algs)); 268 + } 269 + 270 + module_init(arm_poly1305_mod_init); 271 + module_exit(arm_poly1305_mod_exit); 272 + 273 + MODULE_LICENSE("GPL v2"); 274 + MODULE_ALIAS_CRYPTO("poly1305"); 275 + MODULE_ALIAS_CRYPTO("poly1305-arm"); 276 + MODULE_ALIAS_CRYPTO("poly1305-neon");
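For callers, the exported poly1305_init_arch/poly1305_update_arch/poly1305_final_arch routines are normally reached through the generic Poly1305 library wrappers rather than called directly. The usage sketch below is illustrative only (not part of the patch); it assumes the poly1305_init/poly1305_update/poly1305_final wrappers in <crypto/poly1305.h> dispatch to these *_arch routines when CRYPTO_ARCH_HAVE_LIB_POLY1305 is selected, as on other architectures.

    #include <crypto/poly1305.h>

    /* One-shot MAC over a buffer via the Poly1305 library interface. */
    static void example_poly1305_mac(u8 mac[POLY1305_DIGEST_SIZE],
    				 const u8 key[POLY1305_KEY_SIZE],
    				 const u8 *data, unsigned int len)
    {
    	struct poly1305_desc_ctx desc;

    	poly1305_init(&desc, key);		/* clamps r, records s */
    	poly1305_update(&desc, data, len);	/* may use NEON when usable */
    	poly1305_final(&desc, mac);		/* writes the 16-byte tag, wipes desc */
    }

Note that the library path (poly1305_update_arch) switches to NEON whenever crypto_simd_usable() allows, while the shash "poly1305-neon" driver additionally requires more than 128 bytes per update before it pays for kernel_neon_begin()/kernel_neon_end().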
+1 -1
lib/crypto/Kconfig
··· 40 40 config CRYPTO_LIB_POLY1305_RSIZE 41 41 int 42 42 default 4 if X86_64 43 - default 9 if ARM64 43 + default 9 if ARM || ARM64 44 44 default 1 45 45 46 46 config CRYPTO_ARCH_HAVE_LIB_POLY1305
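The CRYPTO_LIB_POLY1305_RSIZE bump covers the extra key material the ARM code keeps: the NEON routine stores four key powers (r^1..r^4) of nine 32-bit words each at offsets 48 + n*9*4 from the state pointer it is handed (&dctx->h), so the opaque key area following the accumulator in the library's descriptor needs 9 poly1305_key slots, the same value arm64 already uses. A back-of-the-envelope check (plain C, illustrative only; the 5-word sizes mirror struct poly1305_state and struct poly1305_key, and it assumes the key array directly follows the accumulator):

    #include <assert.h>

    #define P1305_STATE_WORDS   5                 /* struct poly1305_state */
    #define P1305_KEY_WORDS     5                 /* struct poly1305_key */
    #define P1305_RSIZE         9                 /* CRYPTO_LIB_POLY1305_RSIZE for ARM */
    #define P1305_ARM_ASM_SPAN  (48 + 4 * 9 * 4)  /* highest state offset the asm touches */

    /* 4 * (5 + 5*9) = 200 bytes available >= 192 bytes addressed by the asm */
    static_assert(4 * (P1305_STATE_WORDS + P1305_KEY_WORDS * P1305_RSIZE) >=
    	      P1305_ARM_ASM_SPAN, "no room for the base 2^26 key powers");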