Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

lib/crypto: riscv/poly1305: Import OpenSSL/CRYPTOGAMS implementation

This is a straight import of the OpenSSL/CRYPTOGAMS Poly1305
implementation for RISC-V, authored by Andy Polyakov. The file
'poly1305-riscv.pl' is taken straight from
https://github.com/dot-asm/cryptogams commit
5e3fba73576244708a752fa61a8e93e587f271bb. This patch was tested on a
SpacemiT X60, showing a 2–2.5x improvement over the generic implementation.

Signed-off-by: Chunyan Zhang <zhangchunyan@iscas.ac.cn>
Signed-off-by: Zhihang Shao <zhihang.shao.iscas@gmail.com>
[EB: ported to lib/crypto/riscv/]
Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20250829152513.92459-4-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>

authored by

Zhihang Shao and committed by
Eric Biggers
bef9c755 b646b782

+877 -1
+2 -1
lib/crypto/Kconfig
··· 128 128 default y if MIPS 129 129 # The PPC64 code needs to be fixed to work in softirq context. 130 130 default y if PPC64 && CPU_LITTLE_ENDIAN && VSX && BROKEN 131 + default y if RISCV 131 132 default y if X86_64 132 133 133 134 # This symbol controls the inclusion of the Poly1305 generic code. This differs ··· 144 143 145 144 config CRYPTO_LIB_POLY1305_RSIZE 146 145 int 147 - default 2 if MIPS 146 + default 2 if MIPS || RISCV 148 147 default 11 if X86_64 149 148 default 9 if ARM || ARM64 150 149 default 1
+14
lib/crypto/Makefile
··· 112 112 113 113 libpoly1305-$(CONFIG_PPC) += powerpc/poly1305-p10le_64.o 114 114 115 + ifeq ($(CONFIG_RISCV),y) 116 + libpoly1305-y += riscv/poly1305-core.o 117 + poly1305-perlasm-flavour-$(CONFIG_32BIT) := 32 118 + poly1305-perlasm-flavour-$(CONFIG_64BIT) := 64 119 + quiet_cmd_perlasm_poly1305 = PERLASM $@ 120 + cmd_perlasm_poly1305 = $(PERL) $< $(poly1305-perlasm-flavour-y) $@ 121 + # Use if_changed instead of cmd, in case the flavour changed. 122 + $(obj)/riscv/poly1305-core.S: $(src)/riscv/poly1305-riscv.pl FORCE 123 + $(call if_changed,perlasm_poly1305) 124 + targets += riscv/poly1305-core.S 125 + AFLAGS_riscv/poly1305-core.o += -Dpoly1305_init=poly1305_block_init 126 + endif 127 + 115 128 ifeq ($(CONFIG_X86),y) 116 129 libpoly1305-y += x86/poly1305-x86_64-cryptogams.o 117 130 $(obj)/x86/poly1305-x86_64-cryptogams.S: $(src)/x86/poly1305-x86_64-cryptogams.pl ··· 137 124 clean-files += arm/poly1305-core.S \ 138 125 arm64/poly1305-core.S \ 139 126 mips/poly1305-core.S \ 127 + riscv/poly1305-core.S \ 140 128 x86/poly1305-x86_64-cryptogams.S 141 129 142 130 ################################################################################
+847
lib/crypto/riscv/poly1305-riscv.pl
··· 1 + #!/usr/bin/env perl 2 + # SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause 3 + # 4 + # ==================================================================== 5 + # Written by Andy Polyakov, @dot-asm, initially for use with OpenSSL. 6 + # ==================================================================== 7 + # 8 + # Poly1305 hash for RISC-V. 9 + # 10 + # February 2019 11 + # 12 + # In the essence it's pretty straightforward transliteration of MIPS 13 + # module [without big-endian option]. 14 + # 15 + # 1.8 cycles per byte on U74, >100% faster than compiler-generated 16 + # code. 1.9 cpb on C910, ~75% improvement. 3.3 on Spacemit X60, ~69% 17 + # improvement. 18 + # 19 + # June 2024. 20 + # 21 + # Add CHERI support. 22 + # 23 + ###################################################################### 24 + # 25 + ($zero,$ra,$sp,$gp,$tp)=map("x$_",(0..4)); 26 + ($t0,$t1,$t2,$t3,$t4,$t5,$t6)=map("x$_",(5..7,28..31)); 27 + ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("x$_",(10..17)); 28 + ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("x$_",(8,9,18..27)); 29 + # 30 + ###################################################################### 31 + 32 + $flavour = shift || "64"; 33 + 34 + for (@ARGV) { $output=$_ if (/\w[\w\-]*\.\w+$/); } 35 + open STDOUT,">$output"; 36 + 37 + $code.=<<___; 38 + #ifdef __KERNEL__ 39 + # ifdef __riscv_zicfilp 40 + # undef __riscv_zicfilp // calls are expected to be direct 41 + # endif 42 + #endif 43 + 44 + #if defined(__CHERI_PURE_CAPABILITY__) && !defined(__riscv_misaligned_fast) 45 + # define __riscv_misaligned_fast 1 46 + #endif 47 + ___ 48 + 49 + if ($flavour =~ /64/) {{{ 50 + ###################################################################### 51 + # 64-bit code path... 
52 + # 53 + my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3); 54 + my ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$t0,$t1,$t2); 55 + 56 + $code.=<<___; 57 + #if __riscv_xlen == 64 58 + # if __SIZEOF_POINTER__ == 16 59 + # define PUSH csc 60 + # define POP clc 61 + # else 62 + # define PUSH sd 63 + # define POP ld 64 + # endif 65 + #else 66 + # error "unsupported __riscv_xlen" 67 + #endif 68 + 69 + .option pic 70 + .text 71 + 72 + .globl poly1305_init 73 + .type poly1305_init,\@function 74 + poly1305_init: 75 + #ifdef __riscv_zicfilp 76 + lpad 0 77 + #endif 78 + sd $zero,0($ctx) 79 + sd $zero,8($ctx) 80 + sd $zero,16($ctx) 81 + 82 + beqz $inp,.Lno_key 83 + 84 + #ifndef __riscv_misaligned_fast 85 + andi $tmp0,$inp,7 # $inp % 8 86 + andi $inp,$inp,-8 # align $inp 87 + slli $tmp0,$tmp0,3 # byte to bit offset 88 + #endif 89 + ld $in0,0($inp) 90 + ld $in1,8($inp) 91 + #ifndef __riscv_misaligned_fast 92 + beqz $tmp0,.Laligned_key 93 + 94 + ld $tmp2,16($inp) 95 + neg $tmp1,$tmp0 # implicit &63 in sll 96 + srl $in0,$in0,$tmp0 97 + sll $tmp3,$in1,$tmp1 98 + srl $in1,$in1,$tmp0 99 + sll $tmp2,$tmp2,$tmp1 100 + or $in0,$in0,$tmp3 101 + or $in1,$in1,$tmp2 102 + 103 + .Laligned_key: 104 + #endif 105 + li $tmp0,1 106 + slli $tmp0,$tmp0,32 # 0x0000000100000000 107 + addi $tmp0,$tmp0,-63 # 0x00000000ffffffc1 108 + slli $tmp0,$tmp0,28 # 0x0ffffffc10000000 109 + addi $tmp0,$tmp0,-1 # 0x0ffffffc0fffffff 110 + 111 + and $in0,$in0,$tmp0 112 + addi $tmp0,$tmp0,-3 # 0x0ffffffc0ffffffc 113 + and $in1,$in1,$tmp0 114 + 115 + sd $in0,24($ctx) 116 + srli $tmp0,$in1,2 117 + sd $in1,32($ctx) 118 + add $tmp0,$tmp0,$in1 # s1 = r1 + (r1 >> 2) 119 + sd $tmp0,40($ctx) 120 + 121 + .Lno_key: 122 + li $a0,0 # return 0 123 + ret 124 + .size poly1305_init,.-poly1305_init 125 + ___ 126 + { 127 + my ($h0,$h1,$h2,$r0,$r1,$rs1,$d0,$d1,$d2) = 128 + ($s0,$s1,$s2,$s3,$t3,$t4,$in0,$in1,$t2); 129 + my ($shr,$shl) = ($t5,$t6); # used on R6 130 + 131 + $code.=<<___; 132 + .globl poly1305_blocks 133 + 
.type poly1305_blocks,\@function 134 + poly1305_blocks: 135 + #ifdef __riscv_zicfilp 136 + lpad 0 137 + #endif 138 + andi $len,$len,-16 # complete blocks only 139 + beqz $len,.Lno_data 140 + 141 + caddi $sp,$sp,-4*__SIZEOF_POINTER__ 142 + PUSH $s0,3*__SIZEOF_POINTER__($sp) 143 + PUSH $s1,2*__SIZEOF_POINTER__($sp) 144 + PUSH $s2,1*__SIZEOF_POINTER__($sp) 145 + PUSH $s3,0*__SIZEOF_POINTER__($sp) 146 + 147 + #ifndef __riscv_misaligned_fast 148 + andi $shr,$inp,7 149 + andi $inp,$inp,-8 # align $inp 150 + slli $shr,$shr,3 # byte to bit offset 151 + neg $shl,$shr # implicit &63 in sll 152 + #endif 153 + 154 + ld $h0,0($ctx) # load hash value 155 + ld $h1,8($ctx) 156 + ld $h2,16($ctx) 157 + 158 + ld $r0,24($ctx) # load key 159 + ld $r1,32($ctx) 160 + ld $rs1,40($ctx) 161 + 162 + add $len,$len,$inp # end of buffer 163 + 164 + .Loop: 165 + ld $in0,0($inp) # load input 166 + ld $in1,8($inp) 167 + #ifndef __riscv_misaligned_fast 168 + beqz $shr,.Laligned_inp 169 + 170 + ld $tmp2,16($inp) 171 + srl $in0,$in0,$shr 172 + sll $tmp3,$in1,$shl 173 + srl $in1,$in1,$shr 174 + sll $tmp2,$tmp2,$shl 175 + or $in0,$in0,$tmp3 176 + or $in1,$in1,$tmp2 177 + 178 + .Laligned_inp: 179 + #endif 180 + caddi $inp,$inp,16 181 + 182 + andi $tmp0,$h2,-4 # modulo-scheduled reduction 183 + srli $tmp1,$h2,2 184 + andi $h2,$h2,3 185 + 186 + add $d0,$h0,$in0 # accumulate input 187 + add $tmp1,$tmp1,$tmp0 188 + sltu $tmp0,$d0,$h0 189 + add $d0,$d0,$tmp1 # ... 
and residue 190 + sltu $tmp1,$d0,$tmp1 191 + add $d1,$h1,$in1 192 + add $tmp0,$tmp0,$tmp1 193 + sltu $tmp1,$d1,$h1 194 + add $d1,$d1,$tmp0 195 + 196 + add $d2,$h2,$padbit 197 + sltu $tmp0,$d1,$tmp0 198 + mulhu $h1,$r0,$d0 # h0*r0 199 + mul $h0,$r0,$d0 200 + 201 + add $d2,$d2,$tmp1 202 + add $d2,$d2,$tmp0 203 + mulhu $tmp1,$rs1,$d1 # h1*5*r1 204 + mul $tmp0,$rs1,$d1 205 + 206 + mulhu $h2,$r1,$d0 # h0*r1 207 + mul $tmp2,$r1,$d0 208 + add $h0,$h0,$tmp0 209 + add $h1,$h1,$tmp1 210 + sltu $tmp0,$h0,$tmp0 211 + 212 + add $h1,$h1,$tmp0 213 + add $h1,$h1,$tmp2 214 + mulhu $tmp1,$r0,$d1 # h1*r0 215 + mul $tmp0,$r0,$d1 216 + 217 + sltu $tmp2,$h1,$tmp2 218 + add $h2,$h2,$tmp2 219 + mul $tmp2,$rs1,$d2 # h2*5*r1 220 + 221 + add $h1,$h1,$tmp0 222 + add $h2,$h2,$tmp1 223 + mul $tmp3,$r0,$d2 # h2*r0 224 + sltu $tmp0,$h1,$tmp0 225 + add $h2,$h2,$tmp0 226 + 227 + add $h1,$h1,$tmp2 228 + sltu $tmp2,$h1,$tmp2 229 + add $h2,$h2,$tmp2 230 + add $h2,$h2,$tmp3 231 + 232 + bne $inp,$len,.Loop 233 + 234 + sd $h0,0($ctx) # store hash value 235 + sd $h1,8($ctx) 236 + sd $h2,16($ctx) 237 + 238 + POP $s0,3*__SIZEOF_POINTER__($sp) # epilogue 239 + POP $s1,2*__SIZEOF_POINTER__($sp) 240 + POP $s2,1*__SIZEOF_POINTER__($sp) 241 + POP $s3,0*__SIZEOF_POINTER__($sp) 242 + caddi $sp,$sp,4*__SIZEOF_POINTER__ 243 + 244 + .Lno_data: 245 + ret 246 + .size poly1305_blocks,.-poly1305_blocks 247 + ___ 248 + } 249 + { 250 + my ($ctx,$mac,$nonce) = ($a0,$a1,$a2); 251 + 252 + $code.=<<___; 253 + .globl poly1305_emit 254 + .type poly1305_emit,\@function 255 + poly1305_emit: 256 + #ifdef __riscv_zicfilp 257 + lpad 0 258 + #endif 259 + ld $tmp2,16($ctx) 260 + ld $tmp0,0($ctx) 261 + ld $tmp1,8($ctx) 262 + 263 + andi $in0,$tmp2,-4 # final reduction 264 + srl $in1,$tmp2,2 265 + andi $tmp2,$tmp2,3 266 + add $in0,$in0,$in1 267 + 268 + add $tmp0,$tmp0,$in0 269 + sltu $in1,$tmp0,$in0 270 + addi $in0,$tmp0,5 # compare to modulus 271 + add $tmp1,$tmp1,$in1 272 + sltiu $tmp3,$in0,5 273 + sltu $tmp4,$tmp1,$in1 274 + add 
$in1,$tmp1,$tmp3 275 + add $tmp2,$tmp2,$tmp4 276 + sltu $tmp3,$in1,$tmp3 277 + add $tmp2,$tmp2,$tmp3 278 + 279 + srli $tmp2,$tmp2,2 # see if it carried/borrowed 280 + neg $tmp2,$tmp2 281 + 282 + xor $in0,$in0,$tmp0 283 + xor $in1,$in1,$tmp1 284 + and $in0,$in0,$tmp2 285 + and $in1,$in1,$tmp2 286 + xor $in0,$in0,$tmp0 287 + xor $in1,$in1,$tmp1 288 + 289 + lwu $tmp0,0($nonce) # load nonce 290 + lwu $tmp1,4($nonce) 291 + lwu $tmp2,8($nonce) 292 + lwu $tmp3,12($nonce) 293 + slli $tmp1,$tmp1,32 294 + slli $tmp3,$tmp3,32 295 + or $tmp0,$tmp0,$tmp1 296 + or $tmp2,$tmp2,$tmp3 297 + 298 + add $in0,$in0,$tmp0 # accumulate nonce 299 + add $in1,$in1,$tmp2 300 + sltu $tmp0,$in0,$tmp0 301 + add $in1,$in1,$tmp0 302 + 303 + #ifdef __riscv_misaligned_fast 304 + sd $in0,0($mac) # write mac value 305 + sd $in1,8($mac) 306 + #else 307 + srli $tmp0,$in0,8 # write mac value 308 + srli $tmp1,$in0,16 309 + srli $tmp2,$in0,24 310 + sb $in0,0($mac) 311 + srli $tmp3,$in0,32 312 + sb $tmp0,1($mac) 313 + srli $tmp0,$in0,40 314 + sb $tmp1,2($mac) 315 + srli $tmp1,$in0,48 316 + sb $tmp2,3($mac) 317 + srli $tmp2,$in0,56 318 + sb $tmp3,4($mac) 319 + srli $tmp3,$in1,8 320 + sb $tmp0,5($mac) 321 + srli $tmp0,$in1,16 322 + sb $tmp1,6($mac) 323 + srli $tmp1,$in1,24 324 + sb $tmp2,7($mac) 325 + 326 + sb $in1,8($mac) 327 + srli $tmp2,$in1,32 328 + sb $tmp3,9($mac) 329 + srli $tmp3,$in1,40 330 + sb $tmp0,10($mac) 331 + srli $tmp0,$in1,48 332 + sb $tmp1,11($mac) 333 + srli $tmp1,$in1,56 334 + sb $tmp2,12($mac) 335 + sb $tmp3,13($mac) 336 + sb $tmp0,14($mac) 337 + sb $tmp1,15($mac) 338 + #endif 339 + 340 + ret 341 + .size poly1305_emit,.-poly1305_emit 342 + .string "Poly1305 for RISC-V, CRYPTOGAMS by \@dot-asm" 343 + ___ 344 + } 345 + }}} else {{{ 346 + ###################################################################### 347 + # 32-bit code path 348 + # 349 + 350 + my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3); 351 + my ($in0,$in1,$in2,$in3,$tmp0,$tmp1,$tmp2,$tmp3) = 352 + 
($a4,$a5,$a6,$a7,$t0,$t1,$t2,$t3); 353 + 354 + $code.=<<___; 355 + #if __riscv_xlen == 32 356 + # if __SIZEOF_POINTER__ == 8 357 + # define PUSH csc 358 + # define POP clc 359 + # else 360 + # define PUSH sw 361 + # define POP lw 362 + # endif 363 + # define MULX(hi,lo,a,b) mulhu hi,a,b; mul lo,a,b 364 + # define srliw srli 365 + # define srlw srl 366 + # define sllw sll 367 + # define addw add 368 + # define addiw addi 369 + # define mulw mul 370 + #elif __riscv_xlen == 64 371 + # if __SIZEOF_POINTER__ == 16 372 + # define PUSH csc 373 + # define POP clc 374 + # else 375 + # define PUSH sd 376 + # define POP ld 377 + # endif 378 + # define MULX(hi,lo,a,b) slli b,b,32; srli b,b,32; mul hi,a,b; addiw lo,hi,0; srai hi,hi,32 379 + #else 380 + # error "unsupported __riscv_xlen" 381 + #endif 382 + 383 + .option pic 384 + .text 385 + 386 + .globl poly1305_init 387 + .type poly1305_init,\@function 388 + poly1305_init: 389 + #ifdef __riscv_zicfilp 390 + lpad 0 391 + #endif 392 + sw $zero,0($ctx) 393 + sw $zero,4($ctx) 394 + sw $zero,8($ctx) 395 + sw $zero,12($ctx) 396 + sw $zero,16($ctx) 397 + 398 + beqz $inp,.Lno_key 399 + 400 + #ifndef __riscv_misaligned_fast 401 + andi $tmp0,$inp,3 # $inp % 4 402 + sub $inp,$inp,$tmp0 # align $inp 403 + sll $tmp0,$tmp0,3 # byte to bit offset 404 + #endif 405 + lw $in0,0($inp) 406 + lw $in1,4($inp) 407 + lw $in2,8($inp) 408 + lw $in3,12($inp) 409 + #ifndef __riscv_misaligned_fast 410 + beqz $tmp0,.Laligned_key 411 + 412 + lw $tmp2,16($inp) 413 + sub $tmp1,$zero,$tmp0 414 + srlw $in0,$in0,$tmp0 415 + sllw $tmp3,$in1,$tmp1 416 + srlw $in1,$in1,$tmp0 417 + or $in0,$in0,$tmp3 418 + sllw $tmp3,$in2,$tmp1 419 + srlw $in2,$in2,$tmp0 420 + or $in1,$in1,$tmp3 421 + sllw $tmp3,$in3,$tmp1 422 + srlw $in3,$in3,$tmp0 423 + or $in2,$in2,$tmp3 424 + sllw $tmp2,$tmp2,$tmp1 425 + or $in3,$in3,$tmp2 426 + .Laligned_key: 427 + #endif 428 + 429 + lui $tmp0,0x10000 430 + addi $tmp0,$tmp0,-1 # 0x0fffffff 431 + and $in0,$in0,$tmp0 432 + addi $tmp0,$tmp0,-3 # 
0x0ffffffc 433 + and $in1,$in1,$tmp0 434 + and $in2,$in2,$tmp0 435 + and $in3,$in3,$tmp0 436 + 437 + sw $in0,20($ctx) 438 + sw $in1,24($ctx) 439 + sw $in2,28($ctx) 440 + sw $in3,32($ctx) 441 + 442 + srlw $tmp1,$in1,2 443 + srlw $tmp2,$in2,2 444 + srlw $tmp3,$in3,2 445 + addw $in1,$in1,$tmp1 # s1 = r1 + (r1 >> 2) 446 + addw $in2,$in2,$tmp2 447 + addw $in3,$in3,$tmp3 448 + sw $in1,36($ctx) 449 + sw $in2,40($ctx) 450 + sw $in3,44($ctx) 451 + .Lno_key: 452 + li $a0,0 453 + ret 454 + .size poly1305_init,.-poly1305_init 455 + ___ 456 + { 457 + my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $rs1,$rs2,$rs3) = 458 + ($s0,$s1,$s2,$s3,$s4, $s5,$s6,$s7,$s8, $t0,$t1,$t2); 459 + my ($d0,$d1,$d2,$d3) = 460 + ($a4,$a5,$a6,$a7); 461 + my $shr = $ra; # used on R6 462 + 463 + $code.=<<___; 464 + .globl poly1305_blocks 465 + .type poly1305_blocks,\@function 466 + poly1305_blocks: 467 + #ifdef __riscv_zicfilp 468 + lpad 0 469 + #endif 470 + andi $len,$len,-16 # complete blocks only 471 + beqz $len,.Labort 472 + 473 + #ifdef __riscv_zcmp 474 + cm.push {ra,s0-s8}, -48 475 + #else 476 + caddi $sp,$sp,-__SIZEOF_POINTER__*12 477 + PUSH $ra, __SIZEOF_POINTER__*11($sp) 478 + PUSH $s0, __SIZEOF_POINTER__*10($sp) 479 + PUSH $s1, __SIZEOF_POINTER__*9($sp) 480 + PUSH $s2, __SIZEOF_POINTER__*8($sp) 481 + PUSH $s3, __SIZEOF_POINTER__*7($sp) 482 + PUSH $s4, __SIZEOF_POINTER__*6($sp) 483 + PUSH $s5, __SIZEOF_POINTER__*5($sp) 484 + PUSH $s6, __SIZEOF_POINTER__*4($sp) 485 + PUSH $s7, __SIZEOF_POINTER__*3($sp) 486 + PUSH $s8, __SIZEOF_POINTER__*2($sp) 487 + #endif 488 + 489 + #ifndef __riscv_misaligned_fast 490 + andi $shr,$inp,3 491 + andi $inp,$inp,-4 # align $inp 492 + slli $shr,$shr,3 # byte to bit offset 493 + #endif 494 + 495 + lw $h0,0($ctx) # load hash value 496 + lw $h1,4($ctx) 497 + lw $h2,8($ctx) 498 + lw $h3,12($ctx) 499 + lw $h4,16($ctx) 500 + 501 + lw $r0,20($ctx) # load key 502 + lw $r1,24($ctx) 503 + lw $r2,28($ctx) 504 + lw $r3,32($ctx) 505 + lw $rs1,36($ctx) 506 + lw $rs2,40($ctx) 507 + lw 
$rs3,44($ctx) 508 + 509 + add $len,$len,$inp # end of buffer 510 + 511 + .Loop: 512 + lw $d0,0($inp) # load input 513 + lw $d1,4($inp) 514 + lw $d2,8($inp) 515 + lw $d3,12($inp) 516 + #ifndef __riscv_misaligned_fast 517 + beqz $shr,.Laligned_inp 518 + 519 + lw $t4,16($inp) 520 + sub $t5,$zero,$shr 521 + srlw $d0,$d0,$shr 522 + sllw $t3,$d1,$t5 523 + srlw $d1,$d1,$shr 524 + or $d0,$d0,$t3 525 + sllw $t3,$d2,$t5 526 + srlw $d2,$d2,$shr 527 + or $d1,$d1,$t3 528 + sllw $t3,$d3,$t5 529 + srlw $d3,$d3,$shr 530 + or $d2,$d2,$t3 531 + sllw $t4,$t4,$t5 532 + or $d3,$d3,$t4 533 + 534 + .Laligned_inp: 535 + #endif 536 + srliw $t3,$h4,2 # modulo-scheduled reduction 537 + andi $t4,$h4,-4 538 + andi $h4,$h4,3 539 + 540 + addw $d0,$d0,$h0 # accumulate input 541 + addw $t4,$t4,$t3 542 + sltu $h0,$d0,$h0 543 + addw $d0,$d0,$t4 # ... and residue 544 + sltu $t4,$d0,$t4 545 + 546 + addw $d1,$d1,$h1 547 + addw $h0,$h0,$t4 # carry 548 + sltu $h1,$d1,$h1 549 + addw $d1,$d1,$h0 550 + sltu $h0,$d1,$h0 551 + 552 + addw $d2,$d2,$h2 553 + addw $h1,$h1,$h0 # carry 554 + sltu $h2,$d2,$h2 555 + addw $d2,$d2,$h1 556 + sltu $h1,$d2,$h1 557 + 558 + addw $d3,$d3,$h3 559 + addw $h2,$h2,$h1 # carry 560 + sltu $h3,$d3,$h3 561 + addw $d3,$d3,$h2 562 + 563 + MULX ($h1,$h0,$r0,$d0) # d0*r0 564 + 565 + sltu $h2,$d3,$h2 566 + addw $h3,$h3,$h2 # carry 567 + 568 + MULX ($t4,$t3,$rs3,$d1) # d1*s3 569 + 570 + addw $h4,$h4,$padbit 571 + caddi $inp,$inp,16 572 + addw $h4,$h4,$h3 573 + 574 + MULX ($t6,$a3,$rs2,$d2) # d2*s2 575 + addw $h0,$h0,$t3 576 + addw $h1,$h1,$t4 577 + sltu $t3,$h0,$t3 578 + addw $h1,$h1,$t3 579 + 580 + MULX ($t4,$t3,$rs1,$d3) # d3*s1 581 + addw $h0,$h0,$a3 582 + addw $h1,$h1,$t6 583 + sltu $a3,$h0,$a3 584 + addw $h1,$h1,$a3 585 + 586 + 587 + MULX ($h2,$a3,$r1,$d0) # d0*r1 588 + addw $h0,$h0,$t3 589 + addw $h1,$h1,$t4 590 + sltu $t3,$h0,$t3 591 + addw $h1,$h1,$t3 592 + 593 + MULX ($t4,$t3,$r0,$d1) # d1*r0 594 + addw $h1,$h1,$a3 595 + sltu $a3,$h1,$a3 596 + addw $h2,$h2,$a3 597 + 598 + MULX 
($t6,$a3,$rs3,$d2) # d2*s3 599 + addw $h1,$h1,$t3 600 + addw $h2,$h2,$t4 601 + sltu $t3,$h1,$t3 602 + addw $h2,$h2,$t3 603 + 604 + MULX ($t4,$t3,$rs2,$d3) # d3*s2 605 + addw $h1,$h1,$a3 606 + addw $h2,$h2,$t6 607 + sltu $a3,$h1,$a3 608 + addw $h2,$h2,$a3 609 + 610 + mulw $a3,$rs1,$h4 # h4*s1 611 + addw $h1,$h1,$t3 612 + addw $h2,$h2,$t4 613 + sltu $t3,$h1,$t3 614 + addw $h2,$h2,$t3 615 + 616 + 617 + MULX ($h3,$t3,$r2,$d0) # d0*r2 618 + addw $h1,$h1,$a3 619 + sltu $a3,$h1,$a3 620 + addw $h2,$h2,$a3 621 + 622 + MULX ($t6,$a3,$r1,$d1) # d1*r1 623 + addw $h2,$h2,$t3 624 + sltu $t3,$h2,$t3 625 + addw $h3,$h3,$t3 626 + 627 + MULX ($t4,$t3,$r0,$d2) # d2*r0 628 + addw $h2,$h2,$a3 629 + addw $h3,$h3,$t6 630 + sltu $a3,$h2,$a3 631 + addw $h3,$h3,$a3 632 + 633 + MULX ($t6,$a3,$rs3,$d3) # d3*s3 634 + addw $h2,$h2,$t3 635 + addw $h3,$h3,$t4 636 + sltu $t3,$h2,$t3 637 + addw $h3,$h3,$t3 638 + 639 + mulw $t3,$rs2,$h4 # h4*s2 640 + addw $h2,$h2,$a3 641 + addw $h3,$h3,$t6 642 + sltu $a3,$h2,$a3 643 + addw $h3,$h3,$a3 644 + 645 + 646 + MULX ($t6,$a3,$r3,$d0) # d0*r3 647 + addw $h2,$h2,$t3 648 + sltu $t3,$h2,$t3 649 + addw $h3,$h3,$t3 650 + 651 + MULX ($t4,$t3,$r2,$d1) # d1*r2 652 + addw $h3,$h3,$a3 653 + sltu $a3,$h3,$a3 654 + addw $t6,$t6,$a3 655 + 656 + MULX ($a3,$d3,$r0,$d3) # d3*r0 657 + addw $h3,$h3,$t3 658 + addw $t6,$t6,$t4 659 + sltu $t3,$h3,$t3 660 + addw $t6,$t6,$t3 661 + 662 + MULX ($t4,$t3,$r1,$d2) # d2*r1 663 + addw $h3,$h3,$d3 664 + addw $t6,$t6,$a3 665 + sltu $d3,$h3,$d3 666 + addw $t6,$t6,$d3 667 + 668 + mulw $a3,$rs3,$h4 # h4*s3 669 + addw $h3,$h3,$t3 670 + addw $t6,$t6,$t4 671 + sltu $t3,$h3,$t3 672 + addw $t6,$t6,$t3 673 + 674 + 675 + mulw $h4,$r0,$h4 # h4*r0 676 + addw $h3,$h3,$a3 677 + sltu $a3,$h3,$a3 678 + addw $t6,$t6,$a3 679 + addw $h4,$t6,$h4 680 + 681 + li $padbit,1 # if we loop, padbit is 1 682 + 683 + bne $inp,$len,.Loop 684 + 685 + sw $h0,0($ctx) # store hash value 686 + sw $h1,4($ctx) 687 + sw $h2,8($ctx) 688 + sw $h3,12($ctx) 689 + sw $h4,16($ctx) 690 
+ 691 + #ifdef __riscv_zcmp 692 + cm.popret {ra,s0-s8}, 48 693 + #else 694 + POP $ra, __SIZEOF_POINTER__*11($sp) 695 + POP $s0, __SIZEOF_POINTER__*10($sp) 696 + POP $s1, __SIZEOF_POINTER__*9($sp) 697 + POP $s2, __SIZEOF_POINTER__*8($sp) 698 + POP $s3, __SIZEOF_POINTER__*7($sp) 699 + POP $s4, __SIZEOF_POINTER__*6($sp) 700 + POP $s5, __SIZEOF_POINTER__*5($sp) 701 + POP $s6, __SIZEOF_POINTER__*4($sp) 702 + POP $s7, __SIZEOF_POINTER__*3($sp) 703 + POP $s8, __SIZEOF_POINTER__*2($sp) 704 + caddi $sp,$sp,__SIZEOF_POINTER__*12 705 + #endif 706 + .Labort: 707 + ret 708 + .size poly1305_blocks,.-poly1305_blocks 709 + ___ 710 + } 711 + { 712 + my ($ctx,$mac,$nonce,$tmp4) = ($a0,$a1,$a2,$a3); 713 + 714 + $code.=<<___; 715 + .globl poly1305_emit 716 + .type poly1305_emit,\@function 717 + poly1305_emit: 718 + #ifdef __riscv_zicfilp 719 + lpad 0 720 + #endif 721 + lw $tmp4,16($ctx) 722 + lw $tmp0,0($ctx) 723 + lw $tmp1,4($ctx) 724 + lw $tmp2,8($ctx) 725 + lw $tmp3,12($ctx) 726 + 727 + srliw $ctx,$tmp4,2 # final reduction 728 + andi $in0,$tmp4,-4 729 + andi $tmp4,$tmp4,3 730 + addw $ctx,$ctx,$in0 731 + 732 + addw $tmp0,$tmp0,$ctx 733 + sltu $ctx,$tmp0,$ctx 734 + addiw $in0,$tmp0,5 # compare to modulus 735 + addw $tmp1,$tmp1,$ctx 736 + sltiu $in1,$in0,5 737 + sltu $ctx,$tmp1,$ctx 738 + addw $in1,$in1,$tmp1 739 + addw $tmp2,$tmp2,$ctx 740 + sltu $in2,$in1,$tmp1 741 + sltu $ctx,$tmp2,$ctx 742 + addw $in2,$in2,$tmp2 743 + addw $tmp3,$tmp3,$ctx 744 + sltu $in3,$in2,$tmp2 745 + sltu $ctx,$tmp3,$ctx 746 + addw $in3,$in3,$tmp3 747 + addw $tmp4,$tmp4,$ctx 748 + sltu $ctx,$in3,$tmp3 749 + addw $ctx,$ctx,$tmp4 750 + 751 + srl $ctx,$ctx,2 # see if it carried/borrowed 752 + sub $ctx,$zero,$ctx 753 + 754 + xor $in0,$in0,$tmp0 755 + xor $in1,$in1,$tmp1 756 + xor $in2,$in2,$tmp2 757 + xor $in3,$in3,$tmp3 758 + and $in0,$in0,$ctx 759 + and $in1,$in1,$ctx 760 + and $in2,$in2,$ctx 761 + and $in3,$in3,$ctx 762 + xor $in0,$in0,$tmp0 763 + xor $in1,$in1,$tmp1 764 + xor $in2,$in2,$tmp2 765 + xor 
$in3,$in3,$tmp3 766 + 767 + lw $tmp0,0($nonce) # load nonce 768 + lw $tmp1,4($nonce) 769 + lw $tmp2,8($nonce) 770 + lw $tmp3,12($nonce) 771 + 772 + addw $in0,$in0,$tmp0 # accumulate nonce 773 + sltu $ctx,$in0,$tmp0 774 + 775 + addw $in1,$in1,$tmp1 776 + sltu $tmp1,$in1,$tmp1 777 + addw $in1,$in1,$ctx 778 + sltu $ctx,$in1,$ctx 779 + addw $ctx,$ctx,$tmp1 780 + 781 + addw $in2,$in2,$tmp2 782 + sltu $tmp2,$in2,$tmp2 783 + addw $in2,$in2,$ctx 784 + sltu $ctx,$in2,$ctx 785 + addw $ctx,$ctx,$tmp2 786 + 787 + addw $in3,$in3,$tmp3 788 + addw $in3,$in3,$ctx 789 + 790 + #ifdef __riscv_misaligned_fast 791 + sw $in0,0($mac) # write mac value 792 + sw $in1,4($mac) 793 + sw $in2,8($mac) 794 + sw $in3,12($mac) 795 + #else 796 + srl $tmp0,$in0,8 # write mac value 797 + srl $tmp1,$in0,16 798 + srl $tmp2,$in0,24 799 + sb $in0, 0($mac) 800 + sb $tmp0,1($mac) 801 + srl $tmp0,$in1,8 802 + sb $tmp1,2($mac) 803 + srl $tmp1,$in1,16 804 + sb $tmp2,3($mac) 805 + srl $tmp2,$in1,24 806 + sb $in1, 4($mac) 807 + sb $tmp0,5($mac) 808 + srl $tmp0,$in2,8 809 + sb $tmp1,6($mac) 810 + srl $tmp1,$in2,16 811 + sb $tmp2,7($mac) 812 + srl $tmp2,$in2,24 813 + sb $in2, 8($mac) 814 + sb $tmp0,9($mac) 815 + srl $tmp0,$in3,8 816 + sb $tmp1,10($mac) 817 + srl $tmp1,$in3,16 818 + sb $tmp2,11($mac) 819 + srl $tmp2,$in3,24 820 + sb $in3, 12($mac) 821 + sb $tmp0,13($mac) 822 + sb $tmp1,14($mac) 823 + sb $tmp2,15($mac) 824 + #endif 825 + 826 + ret 827 + .size poly1305_emit,.-poly1305_emit 828 + .string "Poly1305 for RISC-V, CRYPTOGAMS by \@dot-asm" 829 + ___ 830 + } 831 + }}} 832 + 833 + foreach (split("\n", $code)) { 834 + if ($flavour =~ /^cheri/) { 835 + s/\(x([0-9]+)\)/(c$1)/ and s/\b([ls][bhwd]u?)\b/c$1/; 836 + s/\b(PUSH|POP)(\s+)x([0-9]+)/$1$2c$3/ or 837 + s/\b(ret|jal)\b/c$1/; 838 + s/\bcaddi?\b/cincoffset/ and s/\bx([0-9]+,)/c$1/g or 839 + m/\bcmove\b/ and s/\bx([0-9]+)/c$1/g; 840 + } else { 841 + s/\bcaddi?\b/add/ or 842 + s/\bcmove\b/mv/; 843 + } 844 + print $_, "\n"; 845 + } 846 + 847 + close STDOUT;
+14
lib/crypto/riscv/poly1305.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0-only */ 2 + /* 3 + * OpenSSL/Cryptogams accelerated Poly1305 transform for riscv 4 + * 5 + * Copyright (C) 2025 Institute of Software, CAS. 6 + */ 7 + 8 + asmlinkage void poly1305_block_init(struct poly1305_block_state *state, 9 + const u8 raw_key[POLY1305_BLOCK_SIZE]); 10 + asmlinkage void poly1305_blocks(struct poly1305_block_state *state, 11 + const u8 *src, u32 len, u32 hibit); 12 + asmlinkage void poly1305_emit(const struct poly1305_state *state, 13 + u8 digest[POLY1305_DIGEST_SIZE], 14 + const u32 nonce[4]);