Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

crypto: p10-aes-gcm - Supporting functions for ghash

This Perl code is taken from the OpenSSL project, with the gcm_init_htable
function added. gcm_init_htable is used by the p10-aes-gcm-glue.c code to
initialize the hash table, and gcm_ghash_p8 is used to hash encrypted data blocks.

Signed-off-by: Danny Tsen <dtsen@linux.ibm.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

authored by

Danny Tsen and committed by
Herbert Xu
41a6437a 3b47ecca

+370
+370
arch/powerpc/crypto/ghashp8-ppc.pl
#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-2.0

# This code is taken from the OpenSSL project but the author (Andy Polyakov)
# has relicensed it under the GPLv2. Therefore this program is free software;
# you can redistribute it and/or modify it under the terms of the GNU General
# Public License version 2 as published by the Free Software Foundation.
#
# The original headers, including the original license headers, are
# included below for completeness.

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see https://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# GHASH for PowerISA v2.07.
#
# July 2014
#
# Accurate performance measurements are problematic, because it's
# always virtualized setup with possibly throttled processor.
# Relative comparison is therefore more informative. This initial
# version is ~2.1x slower than hardware-assisted AES-128-CTR, ~12x
# faster than "4-bit" integer-only compiler-generated 64-bit code.
# "Initial version" means that there is room for further improvement.

# How this generator works:
#
#   perl ghashp8-ppc.pl <flavour> [<output>]
#
# <flavour> must contain "64" or "32" (anything else is rejected), and a
# flavour ending in "le" selects the little-endian variant of the code.
# Both arguments are forwarded unchanged to ppc-xlate.pl, which is started
# as a child process with this script's STDOUT piped into it.
#
# The assembly template below defines four global routines:
#
#   .gcm_init_p8      loads H from the address in r4, derives the "twisted"
#                     H and stores the pre-computed table (0xc2 constant
#                     and the three H parts) at offsets 0x00-0x30 from r3.
#   .gcm_init_htable  same table layout at 0x00-0x30, followed by H^2
#                     (0x40-0x60), H^3 (0x70-0x90) and H^4 (0xa0-0xc0).
#   .gcm_gmult_p8     loads Xi from r3 and the table from r4, multiplies
#                     and reduces, and writes Xi back to r3.
#   .gcm_ghash_p8     loads Xi from r3 and the table from r4, then folds in
#                     the input at r5 in 16-byte steps, with r6 as the byte
#                     count, and writes Xi back to r3.
#
# Template lines prefixed with "le?" / "be?" are endian-specific; the loop
# at the bottom of this file enables the matching ones and comments out
# the others before the text is handed to ppc-xlate.pl.

$flavour=shift;
$output =shift;

# The size/mnemonic variables below are not referenced by the template in
# this file; the chain still serves to reject an unsupported flavour.
if ($flavour =~ /64/) {
	$SIZE_T=8;
	$LRSAVE=2*$SIZE_T;
	$STU="stdu";
	$POP="ld";
	$PUSH="std";
} elsif ($flavour =~ /32/) {
	$SIZE_T=4;
	$LRSAVE=$SIZE_T;
	$STU="stwu";
	$POP="lwz";
	$PUSH="stw";
} else { die "nonsense $flavour"; }

# Look for ppc-xlate.pl next to this script first, then in ../../perlasm/.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Low-precedence "or" is required here: with "||" the test would apply to
# the (always true) command string and a failed open would go unnoticed.
open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";

my ($Xip,$Htbl,$inp,$len)=map("r$_",(3..6));	# argument block

my ($Xl,$Xm,$Xh,$IN)=map("v$_",(0..3));
my ($zero,$t0,$t1,$t2,$xC2,$H,$Hh,$Hl,$lemask)=map("v$_",(4..12));
my ($Xl1,$Xm1,$Xh1,$IN1,$H2,$H2h,$H2l)=map("v$_",(13..19));
my $vrsave="r12";
# Extra temporaries reuse the H registers; gcm_init_htable only writes them
# after the corresponding H values have been stored to the table.
my ($t4,$t5,$t6) = ($Hl,$H,$Hh);

$code=<<___;
.machine	"any"

.text

.globl	.gcm_init_p8
	lis		r0,0xfff0
	li		r8,0x10
	mfspr		$vrsave,256
	li		r9,0x20
	mtspr		256,r0
	li		r10,0x30
	lvx_u		$H,0,r4			# load H
	le?xor		r7,r7,r7
	le?addi		r7,r7,0x8		# need a vperm start with 08
	le?lvsr		5,0,r7
	le?vspltisb	6,0x0f
	le?vxor		5,5,6			# set a b-endian mask
	le?vperm	$H,$H,$H,5

	vspltisb	$xC2,-16		# 0xf0
	vspltisb	$t0,1			# one
	vaddubm		$xC2,$xC2,$xC2		# 0xe0
	vxor		$zero,$zero,$zero
	vor		$xC2,$xC2,$t0		# 0xe1
	vsldoi		$xC2,$xC2,$zero,15	# 0xe1...
	vsldoi		$t1,$zero,$t0,1		# ...1
	vaddubm		$xC2,$xC2,$xC2		# 0xc2...
	vspltisb	$t2,7
	vor		$xC2,$xC2,$t1		# 0xc2....01
	vspltb		$t1,$H,0		# most significant byte
	vsl		$H,$H,$t0		# H<<=1
	vsrab		$t1,$t1,$t2		# broadcast carry bit
	vand		$t1,$t1,$xC2
	vxor		$H,$H,$t1		# twisted H

	vsldoi		$H,$H,$H,8		# twist even more ...
	vsldoi		$xC2,$zero,$xC2,8	# 0xc2.0
	vsldoi		$Hl,$zero,$H,8		# ... and split
	vsldoi		$Hh,$H,$zero,8

	stvx_u		$xC2,0,r3		# save pre-computed table
	stvx_u		$Hl,r8,r3
	stvx_u		$H, r9,r3
	stvx_u		$Hh,r10,r3

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,2,0
	.long		0
.size	.gcm_init_p8,.-.gcm_init_p8

.globl	.gcm_init_htable
	lis		r0,0xfff0
	li		r8,0x10
	mfspr		$vrsave,256
	li		r9,0x20
	mtspr		256,r0
	li		r10,0x30
	lvx_u		$H,0,r4			# load H

	vspltisb	$xC2,-16		# 0xf0
	vspltisb	$t0,1			# one
	vaddubm		$xC2,$xC2,$xC2		# 0xe0
	vxor		$zero,$zero,$zero
	vor		$xC2,$xC2,$t0		# 0xe1
	vsldoi		$xC2,$xC2,$zero,15	# 0xe1...
	vsldoi		$t1,$zero,$t0,1		# ...1
	vaddubm		$xC2,$xC2,$xC2		# 0xc2...
	vspltisb	$t2,7
	vor		$xC2,$xC2,$t1		# 0xc2....01
	vspltb		$t1,$H,0		# most significant byte
	vsl		$H,$H,$t0		# H<<=1
	vsrab		$t1,$t1,$t2		# broadcast carry bit
	vand		$t1,$t1,$xC2
	vxor		$IN,$H,$t1		# twisted H

	vsldoi		$H,$IN,$IN,8		# twist even more ...
	vsldoi		$xC2,$zero,$xC2,8	# 0xc2.0
	vsldoi		$Hl,$zero,$H,8		# ... and split
	vsldoi		$Hh,$H,$zero,8

	stvx_u		$xC2,0,r3		# save pre-computed table
	stvx_u		$Hl,r8,r3
	li		r8,0x40
	stvx_u		$H, r9,r3
	li		r9,0x50
	stvx_u		$Hh,r10,r3
	li		r10,0x60

	vpmsumd		$Xl,$IN,$Hl		# H.lo·H.lo
	vpmsumd		$Xm,$IN,$H		# H.hi·H.lo+H.lo·H.hi
	vpmsumd		$Xh,$IN,$Hh		# H.hi·H.hi

	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase

	vsldoi		$t0,$Xm,$zero,8
	vsldoi		$t1,$zero,$Xm,8
	vxor		$Xl,$Xl,$t0
	vxor		$Xh,$Xh,$t1

	vsldoi		$Xl,$Xl,$Xl,8
	vxor		$Xl,$Xl,$t2

	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
	vpmsumd		$Xl,$Xl,$xC2
	vxor		$t1,$t1,$Xh
	vxor		$IN1,$Xl,$t1

	vsldoi		$H2,$IN1,$IN1,8
	vsldoi		$H2l,$zero,$H2,8
	vsldoi		$H2h,$H2,$zero,8

	stvx_u		$H2l,r8,r3		# save H^2
	li		r8,0x70
	stvx_u		$H2,r9,r3
	li		r9,0x80
	stvx_u		$H2h,r10,r3
	li		r10,0x90

	vpmsumd		$Xl,$IN,$H2l		# H.lo·H^2.lo
	vpmsumd		$Xl1,$IN1,$H2l		# H^2.lo·H^2.lo
	vpmsumd		$Xm,$IN,$H2		# H.hi·H^2.lo+H.lo·H^2.hi
	vpmsumd		$Xm1,$IN1,$H2		# H^2.hi·H^2.lo+H^2.lo·H^2.hi
	vpmsumd		$Xh,$IN,$H2h		# H.hi·H^2.hi
	vpmsumd		$Xh1,$IN1,$H2h		# H^2.hi·H^2.hi

	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase
	vpmsumd		$t6,$Xl1,$xC2		# 1st reduction phase

	vsldoi		$t0,$Xm,$zero,8
	vsldoi		$t1,$zero,$Xm,8
	vsldoi		$t4,$Xm1,$zero,8
	vsldoi		$t5,$zero,$Xm1,8
	vxor		$Xl,$Xl,$t0
	vxor		$Xh,$Xh,$t1
	vxor		$Xl1,$Xl1,$t4
	vxor		$Xh1,$Xh1,$t5

	vsldoi		$Xl,$Xl,$Xl,8
	vsldoi		$Xl1,$Xl1,$Xl1,8
	vxor		$Xl,$Xl,$t2
	vxor		$Xl1,$Xl1,$t6

	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
	vsldoi		$t5,$Xl1,$Xl1,8		# 2nd reduction phase
	vpmsumd		$Xl,$Xl,$xC2
	vpmsumd		$Xl1,$Xl1,$xC2
	vxor		$t1,$t1,$Xh
	vxor		$t5,$t5,$Xh1
	vxor		$Xl,$Xl,$t1
	vxor		$Xl1,$Xl1,$t5

	vsldoi		$H,$Xl,$Xl,8
	vsldoi		$H2,$Xl1,$Xl1,8
	vsldoi		$Hl,$zero,$H,8
	vsldoi		$Hh,$H,$zero,8
	vsldoi		$H2l,$zero,$H2,8
	vsldoi		$H2h,$H2,$zero,8

	stvx_u		$Hl,r8,r3		# save H^3
	li		r8,0xa0
	stvx_u		$H,r9,r3
	li		r9,0xb0
	stvx_u		$Hh,r10,r3
	li		r10,0xc0
	stvx_u		$H2l,r8,r3		# save H^4
	stvx_u		$H2,r9,r3
	stvx_u		$H2h,r10,r3

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,2,0
	.long		0
.size	.gcm_init_htable,.-.gcm_init_htable

.globl	.gcm_gmult_p8
	lis		r0,0xfff8
	li		r8,0x10
	mfspr		$vrsave,256
	li		r9,0x20
	mtspr		256,r0
	li		r10,0x30
	lvx_u		$IN,0,$Xip		# load Xi

	lvx_u		$Hl,r8,$Htbl		# load pre-computed table
	 le?lvsl	$lemask,r0,r0
	lvx_u		$H, r9,$Htbl
	 le?vspltisb	$t0,0x07
	lvx_u		$Hh,r10,$Htbl
	 le?vxor	$lemask,$lemask,$t0
	lvx_u		$xC2,0,$Htbl
	 le?vperm	$IN,$IN,$IN,$lemask
	vxor		$zero,$zero,$zero

	vpmsumd		$Xl,$IN,$Hl		# H.lo·Xi.lo
	vpmsumd		$Xm,$IN,$H		# H.hi·Xi.lo+H.lo·Xi.hi
	vpmsumd		$Xh,$IN,$Hh		# H.hi·Xi.hi

	vpmsumd		$t2,$Xl,$xC2		# 1st phase

	vsldoi		$t0,$Xm,$zero,8
	vsldoi		$t1,$zero,$Xm,8
	vxor		$Xl,$Xl,$t0
	vxor		$Xh,$Xh,$t1

	vsldoi		$Xl,$Xl,$Xl,8
	vxor		$Xl,$Xl,$t2

	vsldoi		$t1,$Xl,$Xl,8		# 2nd phase
	vpmsumd		$Xl,$Xl,$xC2
	vxor		$t1,$t1,$Xh
	vxor		$Xl,$Xl,$t1

	le?vperm	$Xl,$Xl,$Xl,$lemask
	stvx_u		$Xl,0,$Xip		# write out Xi

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,2,0
	.long		0
.size	.gcm_gmult_p8,.-.gcm_gmult_p8

.globl	.gcm_ghash_p8
	lis		r0,0xfff8
	li		r8,0x10
	mfspr		$vrsave,256
	li		r9,0x20
	mtspr		256,r0
	li		r10,0x30
	lvx_u		$Xl,0,$Xip		# load Xi

	lvx_u		$Hl,r8,$Htbl		# load pre-computed table
	 le?lvsl	$lemask,r0,r0
	lvx_u		$H, r9,$Htbl
	 le?vspltisb	$t0,0x07
	lvx_u		$Hh,r10,$Htbl
	 le?vxor	$lemask,$lemask,$t0
	lvx_u		$xC2,0,$Htbl
	 le?vperm	$Xl,$Xl,$Xl,$lemask
	vxor		$zero,$zero,$zero

	lvx_u		$IN,0,$inp
	addi		$inp,$inp,16
	subi		$len,$len,16
	 le?vperm	$IN,$IN,$IN,$lemask
	vxor		$IN,$IN,$Xl
	b		Loop

.align	5
Loop:
	 subic		$len,$len,16
	vpmsumd		$Xl,$IN,$Hl		# H.lo·Xi.lo
	 subfe.		r0,r0,r0		# borrow?-1:0
	vpmsumd		$Xm,$IN,$H		# H.hi·Xi.lo+H.lo·Xi.hi
	 and		r0,r0,$len
	vpmsumd		$Xh,$IN,$Hh		# H.hi·Xi.hi
	 add		$inp,$inp,r0

	vpmsumd		$t2,$Xl,$xC2		# 1st phase

	vsldoi		$t0,$Xm,$zero,8
	vsldoi		$t1,$zero,$Xm,8
	vxor		$Xl,$Xl,$t0
	vxor		$Xh,$Xh,$t1

	vsldoi		$Xl,$Xl,$Xl,8
	vxor		$Xl,$Xl,$t2
	 lvx_u		$IN,0,$inp
	 addi		$inp,$inp,16

	vsldoi		$t1,$Xl,$Xl,8		# 2nd phase
	vpmsumd		$Xl,$Xl,$xC2
	 le?vperm	$IN,$IN,$IN,$lemask
	vxor		$t1,$t1,$Xh
	vxor		$IN,$IN,$t1
	vxor		$IN,$IN,$Xl
	beq		Loop			# did $len-=16 borrow?

	vxor		$Xl,$Xl,$t1
	le?vperm	$Xl,$Xl,$Xl,$lemask
	stvx_u		$Xl,0,$Xip		# write out Xi

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,4,0
	.long		0
.size	.gcm_ghash_p8,.-.gcm_ghash_p8

.asciz  "GHASH for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
.align  2
___

# Resolve the endian-specific prefixes: the first "le?" (or, failing that,
# the first "be?") on each line is either removed, enabling the line, or
# turned into a "#le#" / "#be#" comment marker, disabling it.
my $little_endian = ($flavour =~ /le$/);

foreach (split("\n",$code)) {
	if ($little_endian) {
	    s/le\?//		or
	    s/be\?/#be#/;
	} else {
	    s/le\?/#le#/	or
	    s/be\?//;
	}
	print $_,"\n";
}

# Closing the pipe flushes the output and waits for ppc-xlate.pl; a failure
# here means the generated assembly may be incomplete, so do not ignore it.
close STDOUT or die "error closing STDOUT: $!";