Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

crypto: powerpc/p10-aes-gcm - Re-write AES/GCM stitched implementation

Re-write the AES/GCM assembly code with a smaller footprint and a
small performance gain. Partial blocks are now handled differently:
a partial block is folded into the AES state, re-assembled into a
complete block, and then a full-block hash is computed.

Added gcm_update() to update the last partial block hash value and
generate the final digest.

Fixes: fd0e9b3e2ee6 ("crypto: p10-aes-gcm - An accelerated AES/GCM stitched implementation")

Signed-off-by: Danny Tsen <dtsen@linux.ibm.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

authored by

Danny Tsen and committed by
Herbert Xu
7aa747ed fb10c7a8

+1098 -1383
+1098 -1383
arch/powerpc/crypto/aes-gcm-p10.S
··· 1 1 /* SPDX-License-Identifier: GPL-2.0-or-later */ 2 - # 3 - # Accelerated AES-GCM stitched implementation for ppc64le. 4 - # 5 - # Copyright 2022- IBM Inc. All rights reserved 6 - # 7 - #=================================================================================== 8 - # Written by Danny Tsen <dtsen@linux.ibm.com> 9 - # 10 - # GHASH is based on the Karatsuba multiplication method. 11 - # 12 - # Xi xor X1 13 - # 14 - # X1 * H^4 + X2 * H^3 + x3 * H^2 + X4 * H = 15 - # (X1.h * H4.h + xX.l * H4.l + X1 * H4) + 16 - # (X2.h * H3.h + X2.l * H3.l + X2 * H3) + 17 - # (X3.h * H2.h + X3.l * H2.l + X3 * H2) + 18 - # (X4.h * H.h + X4.l * H.l + X4 * H) 19 - # 20 - # Xi = v0 21 - # H Poly = v2 22 - # Hash keys = v3 - v14 23 - # ( H.l, H, H.h) 24 - # ( H^2.l, H^2, H^2.h) 25 - # ( H^3.l, H^3, H^3.h) 26 - # ( H^4.l, H^4, H^4.h) 27 - # 28 - # v30 is IV 29 - # v31 - counter 1 30 - # 31 - # AES used, 32 - # vs0 - vs14 for round keys 33 - # v15, v16, v17, v18, v19, v20, v21, v22 for 8 blocks (encrypted) 34 - # 35 - # This implementation uses stitched AES-GCM approach to improve overall performance. 36 - # AES is implemented with 8x blocks and GHASH is using 2 4x blocks. 37 - # 38 - # =================================================================================== 39 - # 2 + # 3 + # Accelerated AES-GCM stitched implementation for ppc64le. 4 + # 5 + # Copyright 2024- IBM Inc. 6 + # 7 + #=================================================================================== 8 + # Written by Danny Tsen <dtsen@us.ibm.com> 9 + # 10 + # GHASH is based on the Karatsuba multiplication method. 
11 + # 12 + # Xi xor X1 13 + # 14 + # X1 * H^4 + X2 * H^3 + x3 * H^2 + X4 * H = 15 + # (X1.h * H4.h + xX.l * H4.l + X1 * H4) + 16 + # (X2.h * H3.h + X2.l * H3.l + X2 * H3) + 17 + # (X3.h * H2.h + X3.l * H2.l + X3 * H2) + 18 + # (X4.h * H.h + X4.l * H.l + X4 * H) 19 + # 20 + # Xi = v0 21 + # H Poly = v2 22 + # Hash keys = v3 - v14 23 + # ( H.l, H, H.h) 24 + # ( H^2.l, H^2, H^2.h) 25 + # ( H^3.l, H^3, H^3.h) 26 + # ( H^4.l, H^4, H^4.h) 27 + # 28 + # v30 is IV 29 + # v31 - counter 1 30 + # 31 + # AES used, 32 + # vs0 - round key 0 33 + # v15, v16, v17, v18, v19, v20, v21, v22 for 8 blocks (encrypted) 34 + # 35 + # This implementation uses stitched AES-GCM approach to improve overall performance. 36 + # AES is implemented with 8x blocks and GHASH is using 2 4x blocks. 37 + # 38 + # =================================================================================== 39 + # 40 40 41 41 #include <asm/ppc_asm.h> 42 42 #include <linux/linkage.h> ··· 44 44 .machine "any" 45 45 .text 46 46 47 - # 4x loops 48 - # v15 - v18 - input states 49 - # vs1 - vs9 - round keys 50 - # 51 - .macro Loop_aes_middle4x 52 - xxlor 19+32, 1, 1 53 - xxlor 20+32, 2, 2 54 - xxlor 21+32, 3, 3 55 - xxlor 22+32, 4, 4 56 - 57 - vcipher 15, 15, 19 58 - vcipher 16, 16, 19 59 - vcipher 17, 17, 19 60 - vcipher 18, 18, 19 61 - 62 - vcipher 15, 15, 20 63 - vcipher 16, 16, 20 64 - vcipher 17, 17, 20 65 - vcipher 18, 18, 20 66 - 67 - vcipher 15, 15, 21 68 - vcipher 16, 16, 21 69 - vcipher 17, 17, 21 70 - vcipher 18, 18, 21 71 - 72 - vcipher 15, 15, 22 73 - vcipher 16, 16, 22 74 - vcipher 17, 17, 22 75 - vcipher 18, 18, 22 76 - 77 - xxlor 19+32, 5, 5 78 - xxlor 20+32, 6, 6 79 - xxlor 21+32, 7, 7 80 - xxlor 22+32, 8, 8 81 - 82 - vcipher 15, 15, 19 83 - vcipher 16, 16, 19 84 - vcipher 17, 17, 19 85 - vcipher 18, 18, 19 86 - 87 - vcipher 15, 15, 20 88 - vcipher 16, 16, 20 89 - vcipher 17, 17, 20 90 - vcipher 18, 18, 20 91 - 92 - vcipher 15, 15, 21 93 - vcipher 16, 16, 21 94 - vcipher 17, 17, 21 95 - vcipher 18, 
18, 21 96 - 97 - vcipher 15, 15, 22 98 - vcipher 16, 16, 22 99 - vcipher 17, 17, 22 100 - vcipher 18, 18, 22 101 - 102 - xxlor 23+32, 9, 9 103 - vcipher 15, 15, 23 104 - vcipher 16, 16, 23 105 - vcipher 17, 17, 23 106 - vcipher 18, 18, 23 47 + .macro SAVE_GPR GPR OFFSET FRAME 48 + std \GPR,\OFFSET(\FRAME) 107 49 .endm 108 50 109 - # 8x loops 110 - # v15 - v22 - input states 111 - # vs1 - vs9 - round keys 112 - # 113 - .macro Loop_aes_middle8x 114 - xxlor 23+32, 1, 1 115 - xxlor 24+32, 2, 2 116 - xxlor 25+32, 3, 3 117 - xxlor 26+32, 4, 4 118 - 119 - vcipher 15, 15, 23 120 - vcipher 16, 16, 23 121 - vcipher 17, 17, 23 122 - vcipher 18, 18, 23 123 - vcipher 19, 19, 23 124 - vcipher 20, 20, 23 125 - vcipher 21, 21, 23 126 - vcipher 22, 22, 23 127 - 128 - vcipher 15, 15, 24 129 - vcipher 16, 16, 24 130 - vcipher 17, 17, 24 131 - vcipher 18, 18, 24 132 - vcipher 19, 19, 24 133 - vcipher 20, 20, 24 134 - vcipher 21, 21, 24 135 - vcipher 22, 22, 24 136 - 137 - vcipher 15, 15, 25 138 - vcipher 16, 16, 25 139 - vcipher 17, 17, 25 140 - vcipher 18, 18, 25 141 - vcipher 19, 19, 25 142 - vcipher 20, 20, 25 143 - vcipher 21, 21, 25 144 - vcipher 22, 22, 25 145 - 146 - vcipher 15, 15, 26 147 - vcipher 16, 16, 26 148 - vcipher 17, 17, 26 149 - vcipher 18, 18, 26 150 - vcipher 19, 19, 26 151 - vcipher 20, 20, 26 152 - vcipher 21, 21, 26 153 - vcipher 22, 22, 26 154 - 155 - xxlor 23+32, 5, 5 156 - xxlor 24+32, 6, 6 157 - xxlor 25+32, 7, 7 158 - xxlor 26+32, 8, 8 159 - 160 - vcipher 15, 15, 23 161 - vcipher 16, 16, 23 162 - vcipher 17, 17, 23 163 - vcipher 18, 18, 23 164 - vcipher 19, 19, 23 165 - vcipher 20, 20, 23 166 - vcipher 21, 21, 23 167 - vcipher 22, 22, 23 168 - 169 - vcipher 15, 15, 24 170 - vcipher 16, 16, 24 171 - vcipher 17, 17, 24 172 - vcipher 18, 18, 24 173 - vcipher 19, 19, 24 174 - vcipher 20, 20, 24 175 - vcipher 21, 21, 24 176 - vcipher 22, 22, 24 177 - 178 - vcipher 15, 15, 25 179 - vcipher 16, 16, 25 180 - vcipher 17, 17, 25 181 - vcipher 18, 18, 25 182 - 
vcipher 19, 19, 25 183 - vcipher 20, 20, 25 184 - vcipher 21, 21, 25 185 - vcipher 22, 22, 25 186 - 187 - vcipher 15, 15, 26 188 - vcipher 16, 16, 26 189 - vcipher 17, 17, 26 190 - vcipher 18, 18, 26 191 - vcipher 19, 19, 26 192 - vcipher 20, 20, 26 193 - vcipher 21, 21, 26 194 - vcipher 22, 22, 26 195 - 196 - xxlor 23+32, 9, 9 197 - vcipher 15, 15, 23 198 - vcipher 16, 16, 23 199 - vcipher 17, 17, 23 200 - vcipher 18, 18, 23 201 - vcipher 19, 19, 23 202 - vcipher 20, 20, 23 203 - vcipher 21, 21, 23 204 - vcipher 22, 22, 23 51 + .macro SAVE_VRS VRS OFFSET FRAME 52 + stxv \VRS+32, \OFFSET(\FRAME) 205 53 .endm 206 54 207 - .macro Loop_aes_middle_1x 208 - xxlor 19+32, 1, 1 209 - xxlor 20+32, 2, 2 210 - xxlor 21+32, 3, 3 211 - xxlor 22+32, 4, 4 212 - 213 - vcipher 15, 15, 19 214 - vcipher 15, 15, 20 215 - vcipher 15, 15, 21 216 - vcipher 15, 15, 22 217 - 218 - xxlor 19+32, 5, 5 219 - xxlor 20+32, 6, 6 220 - xxlor 21+32, 7, 7 221 - xxlor 22+32, 8, 8 222 - 223 - vcipher 15, 15, 19 224 - vcipher 15, 15, 20 225 - vcipher 15, 15, 21 226 - vcipher 15, 15, 22 227 - 228 - xxlor 19+32, 9, 9 229 - vcipher 15, 15, 19 55 + .macro RESTORE_GPR GPR OFFSET FRAME 56 + ld \GPR,\OFFSET(\FRAME) 230 57 .endm 231 58 232 - # 233 - # Compute 4x hash values based on Karatsuba method. 
234 - # 235 - .macro ppc_aes_gcm_ghash 236 - vxor 15, 15, 0 237 - 238 - vpmsumd 23, 12, 15 # H4.L * X.L 239 - vpmsumd 24, 9, 16 240 - vpmsumd 25, 6, 17 241 - vpmsumd 26, 3, 18 242 - 243 - vxor 23, 23, 24 244 - vxor 23, 23, 25 245 - vxor 23, 23, 26 # L 246 - 247 - vpmsumd 24, 13, 15 # H4.L * X.H + H4.H * X.L 248 - vpmsumd 25, 10, 16 # H3.L * X1.H + H3.H * X1.L 249 - vpmsumd 26, 7, 17 250 - vpmsumd 27, 4, 18 251 - 252 - vxor 24, 24, 25 253 - vxor 24, 24, 26 254 - vxor 24, 24, 27 # M 255 - 256 - # sum hash and reduction with H Poly 257 - vpmsumd 28, 23, 2 # reduction 258 - 259 - vxor 29, 29, 29 260 - vsldoi 26, 24, 29, 8 # mL 261 - vsldoi 29, 29, 24, 8 # mH 262 - vxor 23, 23, 26 # mL + L 263 - 264 - vsldoi 23, 23, 23, 8 # swap 265 - vxor 23, 23, 28 266 - 267 - vpmsumd 24, 14, 15 # H4.H * X.H 268 - vpmsumd 25, 11, 16 269 - vpmsumd 26, 8, 17 270 - vpmsumd 27, 5, 18 271 - 272 - vxor 24, 24, 25 273 - vxor 24, 24, 26 274 - vxor 24, 24, 27 275 - 276 - vxor 24, 24, 29 277 - 278 - # sum hash and reduction with H Poly 279 - vsldoi 27, 23, 23, 8 # swap 280 - vpmsumd 23, 23, 2 281 - vxor 27, 27, 24 282 - vxor 23, 23, 27 283 - 284 - xxlor 32, 23+32, 23+32 # update hash 285 - 286 - .endm 287 - 288 - # 289 - # Combine two 4x ghash 290 - # v15 - v22 - input blocks 291 - # 292 - .macro ppc_aes_gcm_ghash2_4x 293 - # first 4x hash 294 - vxor 15, 15, 0 # Xi + X 295 - 296 - vpmsumd 23, 12, 15 # H4.L * X.L 297 - vpmsumd 24, 9, 16 298 - vpmsumd 25, 6, 17 299 - vpmsumd 26, 3, 18 300 - 301 - vxor 23, 23, 24 302 - vxor 23, 23, 25 303 - vxor 23, 23, 26 # L 304 - 305 - vpmsumd 24, 13, 15 # H4.L * X.H + H4.H * X.L 306 - vpmsumd 25, 10, 16 # H3.L * X1.H + H3.H * X1.L 307 - vpmsumd 26, 7, 17 308 - vpmsumd 27, 4, 18 309 - 310 - vxor 24, 24, 25 311 - vxor 24, 24, 26 312 - 313 - # sum hash and reduction with H Poly 314 - vpmsumd 28, 23, 2 # reduction 315 - 316 - vxor 29, 29, 29 317 - 318 - vxor 24, 24, 27 # M 319 - vsldoi 26, 24, 29, 8 # mL 320 - vsldoi 29, 29, 24, 8 # mH 321 - vxor 23, 23, 26 # mL + 
L 322 - 323 - vsldoi 23, 23, 23, 8 # swap 324 - vxor 23, 23, 28 325 - 326 - vpmsumd 24, 14, 15 # H4.H * X.H 327 - vpmsumd 25, 11, 16 328 - vpmsumd 26, 8, 17 329 - vpmsumd 27, 5, 18 330 - 331 - vxor 24, 24, 25 332 - vxor 24, 24, 26 333 - vxor 24, 24, 27 # H 334 - 335 - vxor 24, 24, 29 # H + mH 336 - 337 - # sum hash and reduction with H Poly 338 - vsldoi 27, 23, 23, 8 # swap 339 - vpmsumd 23, 23, 2 340 - vxor 27, 27, 24 341 - vxor 27, 23, 27 # 1st Xi 342 - 343 - # 2nd 4x hash 344 - vpmsumd 24, 9, 20 345 - vpmsumd 25, 6, 21 346 - vpmsumd 26, 3, 22 347 - vxor 19, 19, 27 # Xi + X 348 - vpmsumd 23, 12, 19 # H4.L * X.L 349 - 350 - vxor 23, 23, 24 351 - vxor 23, 23, 25 352 - vxor 23, 23, 26 # L 353 - 354 - vpmsumd 24, 13, 19 # H4.L * X.H + H4.H * X.L 355 - vpmsumd 25, 10, 20 # H3.L * X1.H + H3.H * X1.L 356 - vpmsumd 26, 7, 21 357 - vpmsumd 27, 4, 22 358 - 359 - vxor 24, 24, 25 360 - vxor 24, 24, 26 361 - 362 - # sum hash and reduction with H Poly 363 - vpmsumd 28, 23, 2 # reduction 364 - 365 - vxor 29, 29, 29 366 - 367 - vxor 24, 24, 27 # M 368 - vsldoi 26, 24, 29, 8 # mL 369 - vsldoi 29, 29, 24, 8 # mH 370 - vxor 23, 23, 26 # mL + L 371 - 372 - vsldoi 23, 23, 23, 8 # swap 373 - vxor 23, 23, 28 374 - 375 - vpmsumd 24, 14, 19 # H4.H * X.H 376 - vpmsumd 25, 11, 20 377 - vpmsumd 26, 8, 21 378 - vpmsumd 27, 5, 22 379 - 380 - vxor 24, 24, 25 381 - vxor 24, 24, 26 382 - vxor 24, 24, 27 # H 383 - 384 - vxor 24, 24, 29 # H + mH 385 - 386 - # sum hash and reduction with H Poly 387 - vsldoi 27, 23, 23, 8 # swap 388 - vpmsumd 23, 23, 2 389 - vxor 27, 27, 24 390 - vxor 23, 23, 27 391 - 392 - xxlor 32, 23+32, 23+32 # update hash 393 - 394 - .endm 395 - 396 - # 397 - # Compute update single hash 398 - # 399 - .macro ppc_update_hash_1x 400 - vxor 28, 28, 0 401 - 402 - vxor 19, 19, 19 403 - 404 - vpmsumd 22, 3, 28 # L 405 - vpmsumd 23, 4, 28 # M 406 - vpmsumd 24, 5, 28 # H 407 - 408 - vpmsumd 27, 22, 2 # reduction 409 - 410 - vsldoi 25, 23, 19, 8 # mL 411 - vsldoi 26, 19, 23, 8 # mH 412 
- vxor 22, 22, 25 # LL + LL 413 - vxor 24, 24, 26 # HH + HH 414 - 415 - vsldoi 22, 22, 22, 8 # swap 416 - vxor 22, 22, 27 417 - 418 - vsldoi 20, 22, 22, 8 # swap 419 - vpmsumd 22, 22, 2 # reduction 420 - vxor 20, 20, 24 421 - vxor 22, 22, 20 422 - 423 - vmr 0, 22 # update hash 424 - 59 + .macro RESTORE_VRS VRS OFFSET FRAME 60 + lxv \VRS+32, \OFFSET(\FRAME) 425 61 .endm 426 62 427 63 .macro SAVE_REGS 428 - stdu 1,-640(1) 429 64 mflr 0 65 + std 0, 16(1) 66 + stdu 1,-512(1) 430 67 431 - std 14,112(1) 432 - std 15,120(1) 433 - std 16,128(1) 434 - std 17,136(1) 435 - std 18,144(1) 436 - std 19,152(1) 437 - std 20,160(1) 438 - std 21,168(1) 439 - li 9, 256 440 - stvx 20, 9, 1 441 - addi 9, 9, 16 442 - stvx 21, 9, 1 443 - addi 9, 9, 16 444 - stvx 22, 9, 1 445 - addi 9, 9, 16 446 - stvx 23, 9, 1 447 - addi 9, 9, 16 448 - stvx 24, 9, 1 449 - addi 9, 9, 16 450 - stvx 25, 9, 1 451 - addi 9, 9, 16 452 - stvx 26, 9, 1 453 - addi 9, 9, 16 454 - stvx 27, 9, 1 455 - addi 9, 9, 16 456 - stvx 28, 9, 1 457 - addi 9, 9, 16 458 - stvx 29, 9, 1 459 - addi 9, 9, 16 460 - stvx 30, 9, 1 461 - addi 9, 9, 16 462 - stvx 31, 9, 1 463 - stxv 14, 464(1) 464 - stxv 15, 480(1) 465 - stxv 16, 496(1) 466 - stxv 17, 512(1) 467 - stxv 18, 528(1) 468 - stxv 19, 544(1) 469 - stxv 20, 560(1) 470 - stxv 21, 576(1) 471 - stxv 22, 592(1) 472 - std 0, 656(1) 473 - .endm 68 + SAVE_GPR 14, 112, 1 69 + SAVE_GPR 15, 120, 1 70 + SAVE_GPR 16, 128, 1 71 + SAVE_GPR 17, 136, 1 72 + SAVE_GPR 18, 144, 1 73 + SAVE_GPR 19, 152, 1 74 + SAVE_GPR 20, 160, 1 75 + SAVE_GPR 21, 168, 1 76 + SAVE_GPR 22, 176, 1 77 + SAVE_GPR 23, 184, 1 78 + SAVE_GPR 24, 192, 1 79 + 80 + addi 9, 1, 256 81 + SAVE_VRS 20, 0, 9 82 + SAVE_VRS 21, 16, 9 83 + SAVE_VRS 22, 32, 9 84 + SAVE_VRS 23, 48, 9 85 + SAVE_VRS 24, 64, 9 86 + SAVE_VRS 25, 80, 9 87 + SAVE_VRS 26, 96, 9 88 + SAVE_VRS 27, 112, 9 89 + SAVE_VRS 28, 128, 9 90 + SAVE_VRS 29, 144, 9 91 + SAVE_VRS 30, 160, 9 92 + SAVE_VRS 31, 176, 9 93 + .endm # SAVE_REGS 474 94 475 95 .macro RESTORE_REGS 
476 - lxv 14, 464(1) 477 - lxv 15, 480(1) 478 - lxv 16, 496(1) 479 - lxv 17, 512(1) 480 - lxv 18, 528(1) 481 - lxv 19, 544(1) 482 - lxv 20, 560(1) 483 - lxv 21, 576(1) 484 - lxv 22, 592(1) 485 - li 9, 256 486 - lvx 20, 9, 1 487 - addi 9, 9, 16 488 - lvx 21, 9, 1 489 - addi 9, 9, 16 490 - lvx 22, 9, 1 491 - addi 9, 9, 16 492 - lvx 23, 9, 1 493 - addi 9, 9, 16 494 - lvx 24, 9, 1 495 - addi 9, 9, 16 496 - lvx 25, 9, 1 497 - addi 9, 9, 16 498 - lvx 26, 9, 1 499 - addi 9, 9, 16 500 - lvx 27, 9, 1 501 - addi 9, 9, 16 502 - lvx 28, 9, 1 503 - addi 9, 9, 16 504 - lvx 29, 9, 1 505 - addi 9, 9, 16 506 - lvx 30, 9, 1 507 - addi 9, 9, 16 508 - lvx 31, 9, 1 96 + addi 9, 1, 256 97 + RESTORE_VRS 20, 0, 9 98 + RESTORE_VRS 21, 16, 9 99 + RESTORE_VRS 22, 32, 9 100 + RESTORE_VRS 23, 48, 9 101 + RESTORE_VRS 24, 64, 9 102 + RESTORE_VRS 25, 80, 9 103 + RESTORE_VRS 26, 96, 9 104 + RESTORE_VRS 27, 112, 9 105 + RESTORE_VRS 28, 128, 9 106 + RESTORE_VRS 29, 144, 9 107 + RESTORE_VRS 30, 160, 9 108 + RESTORE_VRS 31, 176, 9 509 109 510 - ld 0, 656(1) 511 - ld 14,112(1) 512 - ld 15,120(1) 513 - ld 16,128(1) 514 - ld 17,136(1) 515 - ld 18,144(1) 516 - ld 19,152(1) 517 - ld 20,160(1) 518 - ld 21,168(1) 110 + RESTORE_GPR 14, 112, 1 111 + RESTORE_GPR 15, 120, 1 112 + RESTORE_GPR 16, 128, 1 113 + RESTORE_GPR 17, 136, 1 114 + RESTORE_GPR 18, 144, 1 115 + RESTORE_GPR 19, 152, 1 116 + RESTORE_GPR 20, 160, 1 117 + RESTORE_GPR 21, 168, 1 118 + RESTORE_GPR 22, 176, 1 119 + RESTORE_GPR 23, 184, 1 120 + RESTORE_GPR 24, 192, 1 519 121 520 - mtlr 0 521 - addi 1, 1, 640 122 + addi 1, 1, 512 123 + ld 0, 16(1) 124 + mtlr 0 125 + .endm # RESTORE_REGS 126 + 127 + # 4x loops 128 + .macro AES_CIPHER_4x _VCIPHER ST r 129 + \_VCIPHER \ST, \ST, \r 130 + \_VCIPHER \ST+1, \ST+1, \r 131 + \_VCIPHER \ST+2, \ST+2, \r 132 + \_VCIPHER \ST+3, \ST+3, \r 522 133 .endm 523 134 135 + # 8x loops 136 + .macro AES_CIPHER_8x _VCIPHER ST r 137 + \_VCIPHER \ST, \ST, \r 138 + \_VCIPHER \ST+1, \ST+1, \r 139 + \_VCIPHER \ST+2, \ST+2, \r 140 
+ \_VCIPHER \ST+3, \ST+3, \r 141 + \_VCIPHER \ST+4, \ST+4, \r 142 + \_VCIPHER \ST+5, \ST+5, \r 143 + \_VCIPHER \ST+6, \ST+6, \r 144 + \_VCIPHER \ST+7, \ST+7, \r 145 + .endm 146 + 147 + .macro LOOP_8AES_STATE 148 + xxlor 32+23, 1, 1 149 + xxlor 32+24, 2, 2 150 + xxlor 32+25, 3, 3 151 + xxlor 32+26, 4, 4 152 + AES_CIPHER_8x vcipher, 15, 23 153 + AES_CIPHER_8x vcipher, 15, 24 154 + AES_CIPHER_8x vcipher, 15, 25 155 + AES_CIPHER_8x vcipher, 15, 26 156 + xxlor 32+23, 5, 5 157 + xxlor 32+24, 6, 6 158 + xxlor 32+25, 7, 7 159 + xxlor 32+26, 8, 8 160 + AES_CIPHER_8x vcipher, 15, 23 161 + AES_CIPHER_8x vcipher, 15, 24 162 + AES_CIPHER_8x vcipher, 15, 25 163 + AES_CIPHER_8x vcipher, 15, 26 164 + .endm 165 + 166 + # 167 + # PPC_GHASH4x(H, S1, S2, S3, S4): Compute 4x hash values based on Karatsuba method. 168 + # H: returning digest 169 + # S#: states 170 + # 171 + # S1 should xor with the previous digest 172 + # 173 + # Xi = v0 174 + # H Poly = v2 175 + # Hash keys = v3 - v14 176 + # Scratch: v23 - v29 177 + # 178 + .macro PPC_GHASH4x H S1 S2 S3 S4 179 + 180 + vpmsumd 23, 12, \S1 # H4.L * X.L 181 + vpmsumd 24, 9, \S2 182 + vpmsumd 25, 6, \S3 183 + vpmsumd 26, 3, \S4 184 + 185 + vpmsumd 27, 13, \S1 # H4.L * X.H + H4.H * X.L 186 + vpmsumd 28, 10, \S2 # H3.L * X1.H + H3.H * X1.L 187 + 188 + vxor 23, 23, 24 189 + vxor 23, 23, 25 190 + vxor 23, 23, 26 # L 191 + 192 + vxor 24, 27, 28 193 + vpmsumd 25, 7, \S3 194 + vpmsumd 26, 4, \S4 195 + 196 + vxor 24, 24, 25 197 + vxor 24, 24, 26 # M 198 + 199 + # sum hash and reduction with H Poly 200 + vpmsumd 28, 23, 2 # reduction 201 + 202 + vxor 1, 1, 1 203 + vsldoi 25, 24, 1, 8 # mL 204 + vsldoi 1, 1, 24, 8 # mH 205 + vxor 23, 23, 25 # mL + L 206 + 207 + # This performs swap and xor like, 208 + # vsldoi 23, 23, 23, 8 # swap 209 + # vxor 23, 23, 28 210 + xxlor 32+25, 10, 10 211 + vpermxor 23, 23, 28, 25 212 + 213 + vpmsumd 26, 14, \S1 # H4.H * X.H 214 + vpmsumd 27, 11, \S2 215 + vpmsumd 28, 8, \S3 216 + vpmsumd 29, 5, \S4 217 + 218 + vxor 24, 
26, 27 219 + vxor 24, 24, 28 220 + vxor 24, 24, 29 221 + 222 + vxor 24, 24, 1 223 + 224 + # sum hash and reduction with H Poly 225 + vsldoi 25, 23, 23, 8 # swap 226 + vpmsumd 23, 23, 2 227 + vxor 27, 25, 24 228 + vxor \H, 23, 27 229 + .endm 230 + 231 + # 232 + # Compute update single ghash 233 + # scratch: v1, v22..v27 234 + # 235 + .macro PPC_GHASH1x H S1 236 + 237 + vxor 1, 1, 1 238 + 239 + vpmsumd 22, 3, \S1 # L 240 + vpmsumd 23, 4, \S1 # M 241 + vpmsumd 24, 5, \S1 # H 242 + 243 + vpmsumd 27, 22, 2 # reduction 244 + 245 + vsldoi 25, 23, 1, 8 # mL 246 + vsldoi 26, 1, 23, 8 # mH 247 + vxor 22, 22, 25 # LL + LL 248 + vxor 24, 24, 26 # HH + HH 249 + 250 + xxlor 32+25, 10, 10 251 + vpermxor 22, 22, 27, 25 252 + 253 + vsldoi 23, 22, 22, 8 # swap 254 + vpmsumd 22, 22, 2 # reduction 255 + vxor 23, 23, 24 256 + vxor \H, 22, 23 257 + .endm 258 + 259 + # 260 + # LOAD_HASH_TABLE 261 + # Xi = v0 262 + # H Poly = v2 263 + # Hash keys = v3 - v14 264 + # 524 265 .macro LOAD_HASH_TABLE 525 266 # Load Xi 526 267 lxvb16x 32, 0, 8 # load Xi ··· 298 557 lxvd2x 14+32, 10, 8 # H^4h 299 558 .endm 300 559 301 - # 302 - # aes_p10_gcm_encrypt (const void *inp, void *out, size_t len, 303 - # const char *rk, unsigned char iv[16], void *Xip); 304 - # 305 - # r3 - inp 306 - # r4 - out 307 - # r5 - len 308 - # r6 - AES round keys 309 - # r7 - iv and other data 310 - # r8 - Xi, HPoli, hash keys 311 - # 312 - # rounds is at offset 240 in rk 313 - # Xi is at 0 in gcm_table (Xip). 314 - # 315 - _GLOBAL(aes_p10_gcm_encrypt) 316 - .align 5 560 + ################################################################################ 561 + # Compute AES and ghash one block at a time. 
562 + # r23: AES rounds 563 + # v30: current IV 564 + # vs0: roundkey 0 565 + # 566 + ################################################################################ 567 + SYM_FUNC_START_LOCAL(aes_gcm_crypt_1x) 568 + 569 + cmpdi 5, 16 570 + bge __More_1x 571 + blr 572 + __More_1x: 573 + li 10, 16 574 + divdu 12, 5, 10 575 + 576 + xxlxor 32+15, 32+30, 0 577 + 578 + # Pre-load 8 AES rounds to scratch vectors. 579 + xxlor 32+16, 1, 1 580 + xxlor 32+17, 2, 2 581 + xxlor 32+18, 3, 3 582 + xxlor 32+19, 4, 4 583 + xxlor 32+20, 5, 5 584 + xxlor 32+21, 6, 6 585 + xxlor 32+28, 7, 7 586 + xxlor 32+29, 8, 8 587 + lwz 23, 240(6) # n rounds 588 + addi 22, 23, -9 # remaing AES rounds 589 + 590 + cmpdi 12, 0 591 + bgt __Loop_1x 592 + blr 593 + 594 + __Loop_1x: 595 + mtctr 22 596 + addi 10, 6, 144 597 + vcipher 15, 15, 16 598 + vcipher 15, 15, 17 599 + vcipher 15, 15, 18 600 + vcipher 15, 15, 19 601 + vcipher 15, 15, 20 602 + vcipher 15, 15, 21 603 + vcipher 15, 15, 28 604 + vcipher 15, 15, 29 605 + 606 + __Loop_aes_1state: 607 + lxv 32+1, 0(10) 608 + vcipher 15, 15, 1 609 + addi 10, 10, 16 610 + bdnz __Loop_aes_1state 611 + lxv 32+1, 0(10) # last round key 612 + lxvb16x 11, 0, 14 # load input block 613 + vcipherlast 15, 15, 1 614 + 615 + xxlxor 32+15, 32+15, 11 616 + stxvb16x 32+15, 0, 9 # store output 617 + addi 14, 14, 16 618 + addi 9, 9, 16 619 + 620 + cmpdi 24, 0 # decrypt? 621 + bne __Encrypt_1x 622 + xxlor 15+32, 11, 11 623 + __Encrypt_1x: 624 + vxor 15, 15, 0 625 + PPC_GHASH1x 0, 15 626 + 627 + addi 5, 5, -16 628 + addi 11, 11, 16 629 + 630 + vadduwm 30, 30, 31 # IV + counter 631 + xxlxor 32+15, 32+30, 0 632 + addi 12, 12, -1 633 + cmpdi 12, 0 634 + bgt __Loop_1x 635 + 636 + stxvb16x 32+30, 0, 7 # update IV 637 + stxvb16x 32+0, 0, 8 # update Xi 638 + blr 639 + SYM_FUNC_END(aes_gcm_crypt_1x) 640 + 641 + ################################################################################ 642 + # Process a normal partial block when we come here. 
643 + # Compute partial mask, Load and store partial block to stack. 644 + # Update partial_len and pblock. 645 + # pblock is (encrypted ^ AES state) for encrypt 646 + # and (input ^ AES state) for decrypt. 647 + # 648 + ################################################################################ 649 + SYM_FUNC_START_LOCAL(__Process_partial) 650 + 651 + # create partial mask 652 + vspltisb 16, -1 653 + li 12, 16 654 + sub 12, 12, 5 655 + sldi 12, 12, 3 656 + mtvsrdd 32+17, 0, 12 657 + vslo 16, 16, 17 # partial block mask 658 + 659 + lxvb16x 11, 0, 14 # load partial block 660 + xxland 11, 11, 32+16 661 + 662 + # AES crypt partial 663 + xxlxor 32+15, 32+30, 0 664 + lwz 23, 240(6) # n rounds 665 + addi 22, 23, -1 # loop - 1 666 + mtctr 22 667 + addi 10, 6, 16 668 + 669 + __Loop_aes_pstate: 670 + lxv 32+1, 0(10) 671 + vcipher 15, 15, 1 672 + addi 10, 10, 16 673 + bdnz __Loop_aes_pstate 674 + lxv 32+1, 0(10) # last round key 675 + vcipherlast 15, 15, 1 676 + 677 + xxlxor 32+15, 32+15, 11 678 + vand 15, 15, 16 679 + 680 + # AES crypt output v15 681 + # Write partial 682 + li 10, 224 683 + stxvb16x 15+32, 10, 1 # write v15 to stack 684 + addi 10, 1, 223 685 + addi 12, 9, -1 686 + mtctr 5 # partial block len 687 + __Write_partial: 688 + lbzu 22, 1(10) 689 + stbu 22, 1(12) 690 + bdnz __Write_partial 691 + 692 + cmpdi 24, 0 # decrypt? 693 + bne __Encrypt_partial 694 + xxlor 32+15, 11, 11 # decrypt using the input block 695 + __Encrypt_partial: 696 + #vxor 15, 15, 0 # ^ previous hash 697 + #PPC_GHASH1x 0, 15 698 + 699 + add 14, 14, 5 700 + add 9, 9, 5 701 + std 5, 56(7) # update partial 702 + sub 11, 11, 5 703 + li 5, 0 # done last byte 704 + 705 + # 706 + # Don't increase IV since this is the last partial. 707 + # It should get updated in gcm_update if no more data blocks. 
708 + #vadduwm 30, 30, 31 # increase IV 709 + stxvb16x 32+30, 0, 7 # update IV 710 + li 10, 64 711 + stxvb16x 32+0, 0, 8 # Update X1 712 + stxvb16x 32+15, 10, 7 # Update pblock 713 + blr 714 + SYM_FUNC_END(__Process_partial) 715 + 716 + ################################################################################ 717 + # Combine partial blocks and ghash when we come here. 718 + # 719 + # The partial block has to be shifted to the right location to encrypt/decrypt 720 + # and compute ghash if combing the previous partial block is needed. 721 + # - Compute ghash for a full block. Clear Partial_len and pblock. Update IV. 722 + # Write Xi. 723 + # - Don't compute ghash if not full block. gcm_update will take care of it 724 + # is the last block. Update Partial_len and pblock. 725 + # 726 + ################################################################################ 727 + SYM_FUNC_START_LOCAL(__Combine_partial) 728 + 729 + ld 12, 56(7) 730 + mr 21, 5 # these bytes to be processed 731 + 732 + li 17, 0 733 + li 16, 16 734 + sub 22, 16, 12 # bytes to complete a block 735 + sub 17, 22, 5 # remaining bytes in a block 736 + cmpdi 5, 16 737 + ble __Inp_msg_less16 738 + li 17, 0 739 + mr 21, 22 740 + b __Combine_continue 741 + __Inp_msg_less16: 742 + cmpd 22, 5 743 + bgt __Combine_continue 744 + li 17, 0 745 + mr 21, 22 # these bytes to be processed 746 + 747 + __Combine_continue: 748 + # load msg and shift to the proper location and mask 749 + vspltisb 16, -1 750 + sldi 15, 12, 3 751 + mtvsrdd 32+17, 0, 15 752 + vslo 16, 16, 17 753 + vsro 16, 16, 17 754 + sldi 15, 17, 3 755 + mtvsrdd 32+17, 0, 15 756 + vsro 16, 16, 17 757 + vslo 16, 16, 17 # mask 758 + 759 + lxvb16x 32+19, 0, 14 # load partial block 760 + sldi 15, 12, 3 761 + mtvsrdd 32+17, 0, 15 762 + vsro 19, 19, 17 # 0x00..xxxx??..?? 
763 + sldi 15, 17, 3 764 + mtvsrdd 32+17, 0, 15 765 + vsro 19, 19, 17 # 0x00..xxxx 766 + vslo 19, 19, 17 # shift back to form 0x00..xxxx00..00 767 + 768 + # AES crypt partial 769 + xxlxor 32+15, 32+30, 0 770 + lwz 23, 240(6) # n rounds 771 + addi 22, 23, -1 # loop - 1 772 + mtctr 22 773 + addi 10, 6, 16 774 + 775 + __Loop_aes_cpstate: 776 + lxv 32+1, 0(10) 777 + vcipher 15, 15, 1 778 + addi 10, 10, 16 779 + bdnz __Loop_aes_cpstate 780 + lxv 32+1, 0(10) # last round key 781 + vcipherlast 15, 15, 1 782 + 783 + vxor 15, 15, 19 784 + vand 15, 15, 16 785 + 786 + # AES crypt output v15 787 + # Write partial 788 + li 10, 224 789 + stxvb16x 15+32, 10, 1 # write v15 to stack 790 + addi 10, 1, 223 791 + add 10, 10, 12 # add offset 792 + addi 15, 9, -1 793 + mtctr 21 # partial block len 794 + __Write_combine_partial: 795 + lbzu 22, 1(10) 796 + stbu 22, 1(15) 797 + bdnz __Write_combine_partial 798 + 799 + add 14, 14, 21 800 + add 11, 11, 21 801 + add 9, 9, 21 802 + sub 5, 5, 21 803 + 804 + # Encrypt/Decrypt? 805 + cmpdi 24, 0 # decrypt? 806 + bne __Encrypt_combine_partial 807 + vmr 15, 19 # decrypt using the input block 808 + 809 + __Encrypt_combine_partial: 810 + # 811 + # Update partial flag and combine ghash. 812 + __Update_partial_ghash: 813 + li 10, 64 814 + lxvb16x 32+17, 10, 7 # load previous pblock 815 + add 12, 12, 21 # combined pprocessed 816 + vxor 15, 15, 17 # combined pblock 817 + 818 + cmpdi 12, 16 819 + beq __Clear_partial_flag 820 + std 12, 56(7) # update partial len 821 + stxvb16x 32+15, 10, 7 # Update current pblock 822 + blr 823 + 824 + __Clear_partial_flag: 825 + li 12, 0 826 + std 12, 56(7) 827 + # Update IV and ghash here 828 + vadduwm 30, 30, 31 # increase IV 829 + stxvb16x 32+30, 0, 7 # update IV 830 + 831 + # v15 either is either (input blockor encrypted)^(AES state) 832 + vxor 15, 15, 0 833 + PPC_GHASH1x 0, 15 834 + stxvb16x 32+0, 10, 7 # update pblock for debug? 
835 + stxvb16x 32+0, 0, 8 # update Xi 836 + blr 837 + SYM_FUNC_END(__Combine_partial) 838 + 839 + ################################################################################ 840 + # gcm_update(iv, Xi) - compute last hash 841 + # 842 + ################################################################################ 843 + SYM_FUNC_START(gcm_update) 844 + 845 + ld 10, 56(3) 846 + cmpdi 10, 0 847 + beq __no_update 848 + 849 + lxvb16x 32, 0, 4 # load Xi 850 + # load Hash - h^4, h^3, h^2, h 851 + li 10, 32 852 + lxvd2x 2+32, 10, 4 # H Poli 853 + li 10, 48 854 + lxvd2x 3+32, 10, 4 # Hl 855 + li 10, 64 856 + lxvd2x 4+32, 10, 4 # H 857 + li 10, 80 858 + lxvd2x 5+32, 10, 4 # Hh 859 + 860 + addis 11, 2, permx@toc@ha 861 + addi 11, 11, permx@toc@l 862 + lxv 10, 0(11) # vs10: vpermxor vector 863 + 864 + li 9, 64 865 + lxvb16x 32+6, 9, 3 # load pblock 866 + vxor 6, 6, 0 867 + 868 + vxor 1, 1, 1 869 + vpmsumd 12, 3, 6 # L 870 + vpmsumd 13, 4, 6 # M 871 + vpmsumd 14, 5, 6 # H 872 + vpmsumd 17, 12, 2 # reduction 873 + vsldoi 15, 13, 1, 8 # mL 874 + vsldoi 16, 1, 13, 8 # mH 875 + vxor 12, 12, 15 # LL + LL 876 + vxor 14, 14, 16 # HH + HH 877 + xxlor 32+15, 10, 10 878 + vpermxor 12, 12, 17, 15 879 + vsldoi 13, 12, 12, 8 # swap 880 + vpmsumd 12, 12, 2 # reduction 881 + vxor 13, 13, 14 882 + vxor 7, 12, 13 883 + 884 + #vxor 0, 0, 0 885 + #stxvb16x 32+0, 9, 3 886 + li 10, 0 887 + std 10, 56(3) 888 + stxvb16x 32+7, 0, 4 889 + 890 + __no_update: 891 + blr 892 + SYM_FUNC_END(gcm_update) 893 + 894 + ################################################################################ 895 + # aes_p10_gcm_encrypt (const void *inp, void *out, size_t len, 896 + # const char *rk, unsigned char iv[16], void *Xip); 897 + # 898 + # r3 - inp 899 + # r4 - out 900 + # r5 - len 901 + # r6 - AES round keys 902 + # r7 - iv and other data 903 + # r8 - Xi, HPoli, hash keys 904 + # 905 + # rounds is at offset 240 in rk 906 + # Xi is at 0 in gcm_table (Xip). 
907 + # 908 + ################################################################################ 909 + SYM_FUNC_START(aes_p10_gcm_encrypt) 910 + 911 + cmpdi 5, 0 912 + ble __Invalid_msg_len 317 913 318 914 SAVE_REGS 319 - 320 915 LOAD_HASH_TABLE 321 916 322 917 # initialize ICB: GHASH( IV ), IV - r7 323 918 lxvb16x 30+32, 0, 7 # load IV - v30 324 919 325 - mr 12, 5 # length 326 - li 11, 0 # block index 920 + mr 14, 3 921 + mr 9, 4 327 922 328 923 # counter 1 329 924 vxor 31, 31, 31 330 925 vspltisb 22, 1 331 926 vsldoi 31, 31, 22,1 # counter 1 332 927 333 - # load round key to VSR 334 - lxv 0, 0(6) 335 - lxv 1, 0x10(6) 336 - lxv 2, 0x20(6) 337 - lxv 3, 0x30(6) 338 - lxv 4, 0x40(6) 339 - lxv 5, 0x50(6) 340 - lxv 6, 0x60(6) 341 - lxv 7, 0x70(6) 342 - lxv 8, 0x80(6) 343 - lxv 9, 0x90(6) 344 - lxv 10, 0xa0(6) 928 + addis 11, 2, permx@toc@ha 929 + addi 11, 11, permx@toc@l 930 + lxv 10, 0(11) # vs10: vpermxor vector 931 + li 11, 0 932 + 933 + # load 9 round keys to VSR 934 + lxv 0, 0(6) # round key 0 935 + lxv 1, 16(6) # round key 1 936 + lxv 2, 32(6) # round key 2 937 + lxv 3, 48(6) # round key 3 938 + lxv 4, 64(6) # round key 4 939 + lxv 5, 80(6) # round key 5 940 + lxv 6, 96(6) # round key 6 941 + lxv 7, 112(6) # round key 7 942 + lxv 8, 128(6) # round key 8 345 943 346 944 # load rounds - 10 (128), 12 (192), 14 (256) 347 - lwz 9,240(6) 945 + lwz 23, 240(6) # n rounds 946 + li 24, 1 # encrypt 348 947 948 + __Process_encrypt: 349 949 # 350 - # vxor state, state, w # addroundkey 351 - xxlor 32+29, 0, 0 352 - vxor 15, 30, 29 # IV + round key - add round key 0 353 - 354 - cmpdi 9, 10 355 - beq Loop_aes_gcm_8x 356 - 357 - # load 2 more round keys (v11, v12) 358 - lxv 11, 0xb0(6) 359 - lxv 12, 0xc0(6) 360 - 361 - cmpdi 9, 12 362 - beq Loop_aes_gcm_8x 363 - 364 - # load 2 more round keys (v11, v12, v13, v14) 365 - lxv 13, 0xd0(6) 366 - lxv 14, 0xe0(6) 367 - cmpdi 9, 14 368 - beq Loop_aes_gcm_8x 369 - 370 - b aes_gcm_out 371 - 372 - .align 5 373 - Loop_aes_gcm_8x: 374 - mr 14, 
3 375 - mr 9, 4 376 - 950 + # Process different blocks 377 951 # 378 - # check partial block 379 - # 380 - Continue_partial_check: 381 - ld 15, 56(7) 382 - cmpdi 15, 0 383 - beq Continue 384 - bgt Final_block 385 - cmpdi 15, 16 386 - blt Final_block 952 + ld 12, 56(7) 953 + cmpdi 12, 0 954 + bgt __Do_combine_enc 955 + cmpdi 5, 128 956 + blt __Process_more_enc 387 957 388 - Continue: 389 - # n blcoks 958 + # 959 + # Process 8x AES/GCM blocks 960 + # 961 + __Process_8x_enc: 962 + # 8x blcoks 390 963 li 10, 128 391 - divdu 10, 12, 10 # n 128 bytes-blocks 392 - cmpdi 10, 0 393 - beq Loop_last_block 964 + divdu 12, 5, 10 # n 128 bytes-blocks 394 965 395 - vaddudm 30, 30, 31 # IV + counter 396 - vxor 16, 30, 29 397 - vaddudm 30, 30, 31 398 - vxor 17, 30, 29 399 - vaddudm 30, 30, 31 400 - vxor 18, 30, 29 401 - vaddudm 30, 30, 31 402 - vxor 19, 30, 29 403 - vaddudm 30, 30, 31 404 - vxor 20, 30, 29 405 - vaddudm 30, 30, 31 406 - vxor 21, 30, 29 407 - vaddudm 30, 30, 31 408 - vxor 22, 30, 29 966 + addi 12, 12, -1 # loop - 1 409 967 410 - mtctr 10 968 + vmr 15, 30 # first state: IV 969 + vadduwm 16, 15, 31 # state + counter 970 + vadduwm 17, 16, 31 971 + vadduwm 18, 17, 31 972 + vadduwm 19, 18, 31 973 + vadduwm 20, 19, 31 974 + vadduwm 21, 20, 31 975 + vadduwm 22, 21, 31 976 + xxlor 9, 32+22, 32+22 # save last state 977 + 978 + # vxor state, state, w # addroundkey 979 + xxlor 32+29, 0, 0 980 + vxor 15, 15, 29 # IV + round key - add round key 0 981 + vxor 16, 16, 29 982 + vxor 17, 17, 29 983 + vxor 18, 18, 29 984 + vxor 19, 19, 29 985 + vxor 20, 20, 29 986 + vxor 21, 21, 29 987 + vxor 22, 22, 29 411 988 412 989 li 15, 16 413 990 li 16, 32 ··· 735 676 li 20, 96 736 677 li 21, 112 737 678 738 - lwz 10, 240(6) 739 - 740 - Loop_8x_block: 741 - 742 - lxvb16x 15, 0, 14 # load block 743 - lxvb16x 16, 15, 14 # load block 744 - lxvb16x 17, 16, 14 # load block 745 - lxvb16x 18, 17, 14 # load block 746 - lxvb16x 19, 18, 14 # load block 747 - lxvb16x 20, 19, 14 # load block 748 - lxvb16x 
21, 20, 14 # load block 749 - lxvb16x 22, 21, 14 # load block 750 - addi 14, 14, 128 751 - 752 - Loop_aes_middle8x 753 - 754 - xxlor 23+32, 10, 10 755 - 756 - cmpdi 10, 10 757 - beq Do_next_ghash 758 - 759 - # 192 bits 760 - xxlor 24+32, 11, 11 761 - 762 - vcipher 15, 15, 23 763 - vcipher 16, 16, 23 764 - vcipher 17, 17, 23 765 - vcipher 18, 18, 23 766 - vcipher 19, 19, 23 767 - vcipher 20, 20, 23 768 - vcipher 21, 21, 23 769 - vcipher 22, 22, 23 770 - 771 - vcipher 15, 15, 24 772 - vcipher 16, 16, 24 773 - vcipher 17, 17, 24 774 - vcipher 18, 18, 24 775 - vcipher 19, 19, 24 776 - vcipher 20, 20, 24 777 - vcipher 21, 21, 24 778 - vcipher 22, 22, 24 779 - 780 - xxlor 23+32, 12, 12 781 - 782 - cmpdi 10, 12 783 - beq Do_next_ghash 784 - 785 - # 256 bits 786 - xxlor 24+32, 13, 13 787 - 788 - vcipher 15, 15, 23 789 - vcipher 16, 16, 23 790 - vcipher 17, 17, 23 791 - vcipher 18, 18, 23 792 - vcipher 19, 19, 23 793 - vcipher 20, 20, 23 794 - vcipher 21, 21, 23 795 - vcipher 22, 22, 23 796 - 797 - vcipher 15, 15, 24 798 - vcipher 16, 16, 24 799 - vcipher 17, 17, 24 800 - vcipher 18, 18, 24 801 - vcipher 19, 19, 24 802 - vcipher 20, 20, 24 803 - vcipher 21, 21, 24 804 - vcipher 22, 22, 24 805 - 806 - xxlor 23+32, 14, 14 807 - 808 - cmpdi 10, 14 809 - beq Do_next_ghash 810 - b aes_gcm_out 811 - 812 - Do_next_ghash: 813 - 814 679 # 815 - # last round 816 - vcipherlast 15, 15, 23 817 - vcipherlast 16, 16, 23 680 + # Pre-compute first 8 AES state and leave 1/3/5 more rounds 681 + # for the loop. 
682 + # 683 + addi 22, 23, -9 # process 8 keys 684 + mtctr 22 # AES key loop 685 + addi 10, 6, 144 818 686 819 - xxlxor 47, 47, 15 820 - stxvb16x 47, 0, 9 # store output 821 - xxlxor 48, 48, 16 822 - stxvb16x 48, 15, 9 # store output 687 + LOOP_8AES_STATE # process 8 AES keys 823 688 824 - vcipherlast 17, 17, 23 825 - vcipherlast 18, 18, 23 689 + __PreLoop_aes_state: 690 + lxv 32+1, 0(10) # round key 691 + AES_CIPHER_8x vcipher 15 1 692 + addi 10, 10, 16 693 + bdnz __PreLoop_aes_state 694 + lxv 32+1, 0(10) # last round key (v1) 826 695 827 - xxlxor 49, 49, 17 828 - stxvb16x 49, 16, 9 # store output 829 - xxlxor 50, 50, 18 830 - stxvb16x 50, 17, 9 # store output 696 + cmpdi 12, 0 # Only one loop (8 block) 697 + beq __Finish_ghash 831 698 832 - vcipherlast 19, 19, 23 833 - vcipherlast 20, 20, 23 699 + # 700 + # Loop 8x blocks and compute ghash 701 + # 702 + __Loop_8x_block_enc: 703 + vcipherlast 15, 15, 1 704 + vcipherlast 16, 16, 1 705 + vcipherlast 17, 17, 1 706 + vcipherlast 18, 18, 1 707 + vcipherlast 19, 19, 1 708 + vcipherlast 20, 20, 1 709 + vcipherlast 21, 21, 1 710 + vcipherlast 22, 22, 1 834 711 835 - xxlxor 51, 51, 19 836 - stxvb16x 51, 18, 9 # store output 837 - xxlxor 52, 52, 20 838 - stxvb16x 52, 19, 9 # store output 712 + lxvb16x 32+23, 0, 14 # load block 713 + lxvb16x 32+24, 15, 14 # load block 714 + lxvb16x 32+25, 16, 14 # load block 715 + lxvb16x 32+26, 17, 14 # load block 716 + lxvb16x 32+27, 18, 14 # load block 717 + lxvb16x 32+28, 19, 14 # load block 718 + lxvb16x 32+29, 20, 14 # load block 719 + lxvb16x 32+30, 21, 14 # load block 720 + addi 14, 14, 128 839 721 840 - vcipherlast 21, 21, 23 841 - vcipherlast 22, 22, 23 722 + vxor 15, 15, 23 723 + vxor 16, 16, 24 724 + vxor 17, 17, 25 725 + vxor 18, 18, 26 726 + vxor 19, 19, 27 727 + vxor 20, 20, 28 728 + vxor 21, 21, 29 729 + vxor 22, 22, 30 842 730 843 - xxlxor 53, 53, 21 844 - stxvb16x 53, 20, 9 # store output 845 - xxlxor 54, 54, 22 846 - stxvb16x 54, 21, 9 # store output 847 - 848 - addi 9, 9, 
128 731 + stxvb16x 47, 0, 9 # store output 732 + stxvb16x 48, 15, 9 # store output 733 + stxvb16x 49, 16, 9 # store output 734 + stxvb16x 50, 17, 9 # store output 735 + stxvb16x 51, 18, 9 # store output 736 + stxvb16x 52, 19, 9 # store output 737 + stxvb16x 53, 20, 9 # store output 738 + stxvb16x 54, 21, 9 # store output 739 + addi 9, 9, 128 849 740 850 741 # ghash here 851 - ppc_aes_gcm_ghash2_4x 742 + vxor 15, 15, 0 743 + PPC_GHASH4x 0, 15, 16, 17, 18 852 744 853 - xxlor 27+32, 0, 0 854 - vaddudm 30, 30, 31 # IV + counter 855 - vmr 29, 30 856 - vxor 15, 30, 27 # add round key 857 - vaddudm 30, 30, 31 858 - vxor 16, 30, 27 859 - vaddudm 30, 30, 31 860 - vxor 17, 30, 27 861 - vaddudm 30, 30, 31 862 - vxor 18, 30, 27 863 - vaddudm 30, 30, 31 864 - vxor 19, 30, 27 865 - vaddudm 30, 30, 31 866 - vxor 20, 30, 27 867 - vaddudm 30, 30, 31 868 - vxor 21, 30, 27 869 - vaddudm 30, 30, 31 870 - vxor 22, 30, 27 745 + vxor 19, 19, 0 746 + PPC_GHASH4x 0, 19, 20, 21, 22 871 747 872 - addi 12, 12, -128 748 + xxlor 32+15, 9, 9 # last state 749 + vadduwm 15, 15, 31 # state + counter 750 + vadduwm 16, 15, 31 751 + vadduwm 17, 16, 31 752 + vadduwm 18, 17, 31 753 + vadduwm 19, 18, 31 754 + vadduwm 20, 19, 31 755 + vadduwm 21, 20, 31 756 + vadduwm 22, 21, 31 757 + xxlor 9, 32+22, 32+22 # save last state 758 + 759 + xxlor 32+27, 0, 0 # restore roundkey 0 760 + vxor 15, 15, 27 # IV + round key - add round key 0 761 + vxor 16, 16, 27 762 + vxor 17, 17, 27 763 + vxor 18, 18, 27 764 + vxor 19, 19, 27 765 + vxor 20, 20, 27 766 + vxor 21, 21, 27 767 + vxor 22, 22, 27 768 + 769 + addi 5, 5, -128 873 770 addi 11, 11, 128 874 771 875 - bdnz Loop_8x_block 772 + LOOP_8AES_STATE # process 8 AES keys 773 + mtctr 22 # AES key loop 774 + addi 10, 6, 144 775 + __LastLoop_aes_state: 776 + lxv 32+1, 0(10) # round key 777 + AES_CIPHER_8x vcipher 15 1 778 + addi 10, 10, 16 779 + bdnz __LastLoop_aes_state 780 + lxv 32+1, 0(10) # last round key (v1) 876 781 877 - vmr 30, 29 878 - stxvb16x 30+32, 0, 7 # 
update IV 782 + addi 12, 12, -1 783 + cmpdi 12, 0 784 + bne __Loop_8x_block_enc 879 785 880 - Loop_last_block: 881 - cmpdi 12, 0 786 + __Finish_ghash: 787 + vcipherlast 15, 15, 1 788 + vcipherlast 16, 16, 1 789 + vcipherlast 17, 17, 1 790 + vcipherlast 18, 18, 1 791 + vcipherlast 19, 19, 1 792 + vcipherlast 20, 20, 1 793 + vcipherlast 21, 21, 1 794 + vcipherlast 22, 22, 1 795 + 796 + lxvb16x 32+23, 0, 14 # load block 797 + lxvb16x 32+24, 15, 14 # load block 798 + lxvb16x 32+25, 16, 14 # load block 799 + lxvb16x 32+26, 17, 14 # load block 800 + lxvb16x 32+27, 18, 14 # load block 801 + lxvb16x 32+28, 19, 14 # load block 802 + lxvb16x 32+29, 20, 14 # load block 803 + lxvb16x 32+30, 21, 14 # load block 804 + addi 14, 14, 128 805 + 806 + vxor 15, 15, 23 807 + vxor 16, 16, 24 808 + vxor 17, 17, 25 809 + vxor 18, 18, 26 810 + vxor 19, 19, 27 811 + vxor 20, 20, 28 812 + vxor 21, 21, 29 813 + vxor 22, 22, 30 814 + 815 + stxvb16x 47, 0, 9 # store output 816 + stxvb16x 48, 15, 9 # store output 817 + stxvb16x 49, 16, 9 # store output 818 + stxvb16x 50, 17, 9 # store output 819 + stxvb16x 51, 18, 9 # store output 820 + stxvb16x 52, 19, 9 # store output 821 + stxvb16x 53, 20, 9 # store output 822 + stxvb16x 54, 21, 9 # store output 823 + addi 9, 9, 128 824 + 825 + vxor 15, 15, 0 826 + PPC_GHASH4x 0, 15, 16, 17, 18 827 + 828 + vxor 19, 19, 0 829 + PPC_GHASH4x 0, 19, 20, 21, 22 830 + 831 + xxlor 30+32, 9, 9 # last ctr 832 + vadduwm 30, 30, 31 # increase ctr 833 + stxvb16x 32+30, 0, 7 # update IV 834 + stxvb16x 32+0, 0, 8 # update Xi 835 + 836 + addi 5, 5, -128 837 + addi 11, 11, 128 838 + 839 + # 840 + # Done 8x blocks 841 + # 842 + 843 + cmpdi 5, 0 882 844 beq aes_gcm_out 883 845 884 - # loop last few blocks 885 - li 10, 16 886 - divdu 10, 12, 10 846 + __Process_more_enc: 847 + li 24, 1 # encrypt 848 + bl aes_gcm_crypt_1x 849 + cmpdi 5, 0 850 + beq aes_gcm_out 887 851 888 - mtctr 10 852 + bl __Process_partial 853 + cmpdi 5, 0 854 + beq aes_gcm_out 855 + __Do_combine_enc: 856 + bl 
__Combine_partial 857 + cmpdi 5, 0 858 + bgt __Process_encrypt 859 + b aes_gcm_out 889 860 890 - lwz 10, 240(6) 861 + SYM_FUNC_END(aes_p10_gcm_encrypt) 891 862 892 - cmpdi 12, 16 893 - blt Final_block 863 + ################################################################################ 864 + # aes_p10_gcm_decrypt (const void *inp, void *out, size_t len, 865 + # const char *rk, unsigned char iv[16], void *Xip); 866 + # 8x Decrypt 867 + # 868 + ################################################################################ 869 + SYM_FUNC_START(aes_p10_gcm_decrypt) 894 870 895 - Next_rem_block: 896 - lxvb16x 15, 0, 14 # load block 871 + cmpdi 5, 0 872 + ble __Invalid_msg_len 897 873 898 - Loop_aes_middle_1x 874 + SAVE_REGS 875 + LOAD_HASH_TABLE 899 876 900 - xxlor 23+32, 10, 10 877 + # initialize ICB: GHASH( IV ), IV - r7 878 + lxvb16x 30+32, 0, 7 # load IV - v30 901 879 902 - cmpdi 10, 10 903 - beq Do_next_1x 880 + mr 14, 3 881 + mr 9, 4 904 882 905 - # 192 bits 906 - xxlor 24+32, 11, 11 883 + # counter 1 884 + vxor 31, 31, 31 885 + vspltisb 22, 1 886 + vsldoi 31, 31, 22,1 # counter 1 907 887 908 - vcipher 15, 15, 23 909 - vcipher 15, 15, 24 888 + addis 11, 2, permx@toc@ha 889 + addi 11, 11, permx@toc@l 890 + lxv 10, 0(11) # vs10: vpermxor vector 891 + li 11, 0 910 892 911 - xxlor 23+32, 12, 12 893 + # load 9 round keys to VSR 894 + lxv 0, 0(6) # round key 0 895 + lxv 1, 16(6) # round key 1 896 + lxv 2, 32(6) # round key 2 897 + lxv 3, 48(6) # round key 3 898 + lxv 4, 64(6) # round key 4 899 + lxv 5, 80(6) # round key 5 900 + lxv 6, 96(6) # round key 6 901 + lxv 7, 112(6) # round key 7 902 + lxv 8, 128(6) # round key 8 912 903 913 - cmpdi 10, 12 914 - beq Do_next_1x 904 + # load rounds - 10 (128), 12 (192), 14 (256) 905 + lwz 23, 240(6) # n rounds 906 + li 24, 0 # decrypt 915 907 916 - # 256 bits 917 - xxlor 24+32, 13, 13 918 - 919 - vcipher 15, 15, 23 920 - vcipher 15, 15, 24 921 - 922 - xxlor 23+32, 14, 14 923 - 924 - cmpdi 10, 14 925 - beq Do_next_1x 926 - 927 - 
Do_next_1x: 928 - vcipherlast 15, 15, 23 929 - 930 - xxlxor 47, 47, 15 931 - stxvb16x 47, 0, 9 # store output 932 - addi 14, 14, 16 933 - addi 9, 9, 16 934 - 935 - vmr 28, 15 936 - ppc_update_hash_1x 937 - 938 - addi 12, 12, -16 939 - addi 11, 11, 16 940 - xxlor 19+32, 0, 0 941 - vaddudm 30, 30, 31 # IV + counter 942 - vxor 15, 30, 19 # add round key 943 - 944 - bdnz Next_rem_block 945 - 946 - li 15, 0 947 - std 15, 56(7) # clear partial? 948 - stxvb16x 30+32, 0, 7 # update IV 908 + __Process_decrypt: 909 + # 910 + # Process different blocks 911 + # 912 + ld 12, 56(7) 949 913 cmpdi 12, 0 950 - beq aes_gcm_out 914 + bgt __Do_combine_dec 915 + cmpdi 5, 128 916 + blt __Process_more_dec 951 917 952 - Final_block: 953 - lwz 10, 240(6) 954 - Loop_aes_middle_1x 918 + # 919 + # Process 8x AES/GCM blocks 920 + # 921 + __Process_8x_dec: 922 + # 8x blcoks 923 + li 10, 128 924 + divdu 12, 5, 10 # n 128 bytes-blocks 955 925 956 - xxlor 23+32, 10, 10 926 + addi 12, 12, -1 # loop - 1 957 927 958 - cmpdi 10, 10 959 - beq Do_final_1x 928 + vmr 15, 30 # first state: IV 929 + vadduwm 16, 15, 31 # state + counter 930 + vadduwm 17, 16, 31 931 + vadduwm 18, 17, 31 932 + vadduwm 19, 18, 31 933 + vadduwm 20, 19, 31 934 + vadduwm 21, 20, 31 935 + vadduwm 22, 21, 31 936 + xxlor 9, 32+22, 32+22 # save last state 960 937 961 - # 192 bits 962 - xxlor 24+32, 11, 11 963 - 964 - vcipher 15, 15, 23 965 - vcipher 15, 15, 24 966 - 967 - xxlor 23+32, 12, 12 968 - 969 - cmpdi 10, 12 970 - beq Do_final_1x 971 - 972 - # 256 bits 973 - xxlor 24+32, 13, 13 974 - 975 - vcipher 15, 15, 23 976 - vcipher 15, 15, 24 977 - 978 - xxlor 23+32, 14, 14 979 - 980 - cmpdi 10, 14 981 - beq Do_final_1x 982 - 983 - Do_final_1x: 984 - vcipherlast 15, 15, 23 985 - 986 - # check partial block 987 - li 21, 0 # encrypt 988 - ld 15, 56(7) # partial? 
989 - cmpdi 15, 0 990 - beq Normal_block 991 - bl Do_partial_block 992 - 993 - cmpdi 12, 0 994 - ble aes_gcm_out 995 - 996 - b Continue_partial_check 997 - 998 - Normal_block: 999 - lxvb16x 15, 0, 14 # load last block 1000 - xxlxor 47, 47, 15 1001 - 1002 - # create partial block mask 1003 - li 15, 16 1004 - sub 15, 15, 12 # index to the mask 1005 - 1006 - vspltisb 16, -1 # first 16 bytes - 0xffff...ff 1007 - vspltisb 17, 0 # second 16 bytes - 0x0000...00 1008 - li 10, 192 1009 - stvx 16, 10, 1 1010 - addi 10, 10, 16 1011 - stvx 17, 10, 1 1012 - 1013 - addi 10, 1, 192 1014 - lxvb16x 16, 15, 10 # load partial block mask 1015 - xxland 47, 47, 16 1016 - 1017 - vmr 28, 15 1018 - ppc_update_hash_1x 1019 - 1020 - # * should store only the remaining bytes. 1021 - bl Write_partial_block 1022 - 1023 - stxvb16x 30+32, 0, 7 # update IV 1024 - std 12, 56(7) # update partial? 1025 - li 16, 16 1026 - 1027 - stxvb16x 32, 0, 8 # write out Xi 1028 - stxvb16x 32, 16, 8 # write out Xi 1029 - b aes_gcm_out 1030 - 1031 - # 1032 - # Compute data mask 1033 - # 1034 - .macro GEN_MASK _mask _start _end 1035 - vspltisb 16, -1 # first 16 bytes - 0xffff...ff 1036 - vspltisb 17, 0 # second 16 bytes - 0x0000...00 1037 - li 10, 192 1038 - stxvb16x 17+32, 10, 1 1039 - add 10, 10, \_start 1040 - stxvb16x 16+32, 10, 1 1041 - add 10, 10, \_end 1042 - stxvb16x 17+32, 10, 1 1043 - 1044 - addi 10, 1, 192 1045 - lxvb16x \_mask, 0, 10 # load partial block mask 1046 - .endm 1047 - 1048 - # 1049 - # Handle multiple partial blocks for encrypt and decrypt 1050 - # operations. 
1051 - # 1052 - SYM_FUNC_START_LOCAL(Do_partial_block) 1053 - add 17, 15, 5 1054 - cmpdi 17, 16 1055 - bgt Big_block 1056 - GEN_MASK 18, 15, 5 1057 - b _Partial 1058 - SYM_FUNC_END(Do_partial_block) 1059 - Big_block: 1060 - li 16, 16 1061 - GEN_MASK 18, 15, 16 1062 - 1063 - _Partial: 1064 - lxvb16x 17+32, 0, 14 # load last block 1065 - sldi 16, 15, 3 1066 - mtvsrdd 32+16, 0, 16 1067 - vsro 17, 17, 16 1068 - xxlxor 47, 47, 17+32 1069 - xxland 47, 47, 18 1070 - 1071 - vxor 0, 0, 0 # clear Xi 1072 - vmr 28, 15 1073 - 1074 - cmpdi 21, 0 # encrypt/decrypt ops? 1075 - beq Skip_decrypt 1076 - xxland 32+28, 32+17, 18 1077 - 1078 - Skip_decrypt: 1079 - 1080 - ppc_update_hash_1x 1081 - 1082 - li 16, 16 1083 - lxvb16x 32+29, 16, 8 1084 - vxor 0, 0, 29 1085 - stxvb16x 32, 0, 8 # save Xi 1086 - stxvb16x 32, 16, 8 # save Xi 1087 - 1088 - # store partial block 1089 - # loop the rest of the stream if any 1090 - sldi 16, 15, 3 1091 - mtvsrdd 32+16, 0, 16 1092 - vslo 15, 15, 16 1093 - #stxvb16x 15+32, 0, 9 # last block 1094 - 1095 - li 16, 16 1096 - sub 17, 16, 15 # 16 - partial 1097 - 1098 - add 16, 15, 5 1099 - cmpdi 16, 16 1100 - bgt Larger_16 1101 - mr 17, 5 1102 - Larger_16: 1103 - 1104 - # write partial 1105 - li 10, 192 1106 - stxvb16x 15+32, 10, 1 # save current block 1107 - 1108 - addi 10, 9, -1 1109 - addi 16, 1, 191 1110 - mtctr 17 # move partial byte count 1111 - 1112 - Write_last_partial: 1113 - lbzu 18, 1(16) 1114 - stbu 18, 1(10) 1115 - bdnz Write_last_partial 1116 - # Complete loop partial 1117 - 1118 - add 14, 14, 17 1119 - add 9, 9, 17 1120 - sub 12, 12, 17 1121 - add 11, 11, 17 1122 - 1123 - add 15, 15, 5 1124 - cmpdi 15, 16 1125 - blt Save_partial 1126 - 1127 - vaddudm 30, 30, 31 1128 - stxvb16x 30+32, 0, 7 # update IV 938 + # vxor state, state, w # addroundkey 1129 939 xxlor 32+29, 0, 0 1130 - vxor 15, 30, 29 # IV + round key - add round key 0 1131 - li 15, 0 1132 - std 15, 56(7) # partial done - clear 1133 - b Partial_done 1134 - Save_partial: 1135 - std 15, 
56(7) # partial 940 + vxor 15, 15, 29 # IV + round key - add round key 0 941 + vxor 16, 16, 29 942 + vxor 17, 17, 29 943 + vxor 18, 18, 29 944 + vxor 19, 19, 29 945 + vxor 20, 20, 29 946 + vxor 21, 21, 29 947 + vxor 22, 22, 29 1136 948 1137 - Partial_done: 1138 - blr 949 + li 15, 16 950 + li 16, 32 951 + li 17, 48 952 + li 18, 64 953 + li 19, 80 954 + li 20, 96 955 + li 21, 112 1139 956 1140 - # 1141 - # Write partial block 1142 - # r9 - output 1143 - # r12 - remaining bytes 1144 - # v15 - partial input data 1145 - # 1146 - SYM_FUNC_START_LOCAL(Write_partial_block) 1147 - li 10, 192 1148 - stxvb16x 15+32, 10, 1 # last block 957 + # 958 + # Pre-compute first 8 AES state and leave 1/3/5 more rounds 959 + # for the loop. 960 + # 961 + addi 22, 23, -9 # process 8 keys 962 + mtctr 22 # AES key loop 963 + addi 10, 6, 144 1149 964 1150 - addi 10, 9, -1 1151 - addi 16, 1, 191 965 + LOOP_8AES_STATE # process 8 AES keys 1152 966 1153 - mtctr 12 # remaining bytes 1154 - li 15, 0 967 + __PreLoop_aes_state_dec: 968 + lxv 32+1, 0(10) # round key 969 + AES_CIPHER_8x vcipher 15 1 970 + addi 10, 10, 16 971 + bdnz __PreLoop_aes_state_dec 972 + lxv 32+1, 0(10) # last round key (v1) 1155 973 1156 - Write_last_byte: 1157 - lbzu 14, 1(16) 1158 - stbu 14, 1(10) 1159 - bdnz Write_last_byte 1160 - blr 1161 - SYM_FUNC_END(Write_partial_block) 974 + cmpdi 12, 0 # Only one loop (8 block) 975 + beq __Finish_ghash_dec 1162 976 1163 - aes_gcm_out: 1164 - # out = state 1165 - stxvb16x 32, 0, 8 # write out Xi 1166 - add 3, 11, 12 # return count 977 + # 978 + # Loop 8x blocks and compute ghash 979 + # 980 + __Loop_8x_block_dec: 981 + vcipherlast 15, 15, 1 982 + vcipherlast 16, 16, 1 983 + vcipherlast 17, 17, 1 984 + vcipherlast 18, 18, 1 985 + vcipherlast 19, 19, 1 986 + vcipherlast 20, 20, 1 987 + vcipherlast 21, 21, 1 988 + vcipherlast 22, 22, 1 989 + 990 + lxvb16x 32+23, 0, 14 # load block 991 + lxvb16x 32+24, 15, 14 # load block 992 + lxvb16x 32+25, 16, 14 # load block 993 + lxvb16x 32+26, 17, 
14 # load block 994 + lxvb16x 32+27, 18, 14 # load block 995 + lxvb16x 32+28, 19, 14 # load block 996 + lxvb16x 32+29, 20, 14 # load block 997 + lxvb16x 32+30, 21, 14 # load block 998 + addi 14, 14, 128 999 + 1000 + vxor 15, 15, 23 1001 + vxor 16, 16, 24 1002 + vxor 17, 17, 25 1003 + vxor 18, 18, 26 1004 + vxor 19, 19, 27 1005 + vxor 20, 20, 28 1006 + vxor 21, 21, 29 1007 + vxor 22, 22, 30 1008 + 1009 + stxvb16x 47, 0, 9 # store output 1010 + stxvb16x 48, 15, 9 # store output 1011 + stxvb16x 49, 16, 9 # store output 1012 + stxvb16x 50, 17, 9 # store output 1013 + stxvb16x 51, 18, 9 # store output 1014 + stxvb16x 52, 19, 9 # store output 1015 + stxvb16x 53, 20, 9 # store output 1016 + stxvb16x 54, 21, 9 # store output 1017 + 1018 + addi 9, 9, 128 1019 + 1020 + vmr 15, 23 1021 + vmr 16, 24 1022 + vmr 17, 25 1023 + vmr 18, 26 1024 + vmr 19, 27 1025 + vmr 20, 28 1026 + vmr 21, 29 1027 + vmr 22, 30 1028 + 1029 + # ghash here 1030 + vxor 15, 15, 0 1031 + PPC_GHASH4x 0, 15, 16, 17, 18 1032 + 1033 + vxor 19, 19, 0 1034 + PPC_GHASH4x 0, 19, 20, 21, 22 1035 + 1036 + xxlor 32+15, 9, 9 # last state 1037 + vadduwm 15, 15, 31 # state + counter 1038 + vadduwm 16, 15, 31 1039 + vadduwm 17, 16, 31 1040 + vadduwm 18, 17, 31 1041 + vadduwm 19, 18, 31 1042 + vadduwm 20, 19, 31 1043 + vadduwm 21, 20, 31 1044 + vadduwm 22, 21, 31 1045 + xxlor 9, 32+22, 32+22 # save last state 1046 + 1047 + xxlor 32+27, 0, 0 # restore roundkey 0 1048 + vxor 15, 15, 27 # IV + round key - add round key 0 1049 + vxor 16, 16, 27 1050 + vxor 17, 17, 27 1051 + vxor 18, 18, 27 1052 + vxor 19, 19, 27 1053 + vxor 20, 20, 27 1054 + vxor 21, 21, 27 1055 + vxor 22, 22, 27 1056 + 1057 + addi 5, 5, -128 1058 + addi 11, 11, 128 1059 + 1060 + LOOP_8AES_STATE # process 8 AES keys 1061 + mtctr 22 # AES key loop 1062 + addi 10, 6, 144 1063 + __LastLoop_aes_state_dec: 1064 + lxv 32+1, 0(10) # round key 1065 + AES_CIPHER_8x vcipher 15 1 1066 + addi 10, 10, 16 1067 + bdnz __LastLoop_aes_state_dec 1068 + lxv 32+1, 0(10) # last 
round key (v1) 1069 + 1070 + addi 12, 12, -1 1071 + cmpdi 12, 0 1072 + bne __Loop_8x_block_dec 1073 + 1074 + __Finish_ghash_dec: 1075 + vcipherlast 15, 15, 1 1076 + vcipherlast 16, 16, 1 1077 + vcipherlast 17, 17, 1 1078 + vcipherlast 18, 18, 1 1079 + vcipherlast 19, 19, 1 1080 + vcipherlast 20, 20, 1 1081 + vcipherlast 21, 21, 1 1082 + vcipherlast 22, 22, 1 1083 + 1084 + lxvb16x 32+23, 0, 14 # load block 1085 + lxvb16x 32+24, 15, 14 # load block 1086 + lxvb16x 32+25, 16, 14 # load block 1087 + lxvb16x 32+26, 17, 14 # load block 1088 + lxvb16x 32+27, 18, 14 # load block 1089 + lxvb16x 32+28, 19, 14 # load block 1090 + lxvb16x 32+29, 20, 14 # load block 1091 + lxvb16x 32+30, 21, 14 # load block 1092 + addi 14, 14, 128 1093 + 1094 + vxor 15, 15, 23 1095 + vxor 16, 16, 24 1096 + vxor 17, 17, 25 1097 + vxor 18, 18, 26 1098 + vxor 19, 19, 27 1099 + vxor 20, 20, 28 1100 + vxor 21, 21, 29 1101 + vxor 22, 22, 30 1102 + 1103 + stxvb16x 47, 0, 9 # store output 1104 + stxvb16x 48, 15, 9 # store output 1105 + stxvb16x 49, 16, 9 # store output 1106 + stxvb16x 50, 17, 9 # store output 1107 + stxvb16x 51, 18, 9 # store output 1108 + stxvb16x 52, 19, 9 # store output 1109 + stxvb16x 53, 20, 9 # store output 1110 + stxvb16x 54, 21, 9 # store output 1111 + addi 9, 9, 128 1112 + 1113 + #vmr 15, 23 1114 + vxor 15, 23, 0 1115 + vmr 16, 24 1116 + vmr 17, 25 1117 + vmr 18, 26 1118 + vmr 19, 27 1119 + vmr 20, 28 1120 + vmr 21, 29 1121 + vmr 22, 30 1122 + 1123 + #vxor 15, 15, 0 1124 + PPC_GHASH4x 0, 15, 16, 17, 18 1125 + 1126 + vxor 19, 19, 0 1127 + PPC_GHASH4x 0, 19, 20, 21, 22 1128 + 1129 + xxlor 30+32, 9, 9 # last ctr 1130 + vadduwm 30, 30, 31 # increase ctr 1131 + stxvb16x 32+30, 0, 7 # update IV 1132 + stxvb16x 32+0, 0, 8 # update Xi 1133 + 1134 + addi 5, 5, -128 1135 + addi 11, 11, 128 1136 + 1137 + # 1138 + # Done 8x blocks 1139 + # 1140 + 1141 + cmpdi 5, 0 1142 + beq aes_gcm_out 1143 + 1144 + __Process_more_dec: 1145 + li 24, 0 # decrypt 1146 + bl aes_gcm_crypt_1x 1147 + cmpdi 5, 0 
1148 + beq aes_gcm_out 1149 + 1150 + bl __Process_partial 1151 + cmpdi 5, 0 1152 + beq aes_gcm_out 1153 + __Do_combine_dec: 1154 + bl __Combine_partial 1155 + cmpdi 5, 0 1156 + bgt __Process_decrypt 1157 + b aes_gcm_out 1158 + SYM_FUNC_END(aes_p10_gcm_decrypt) 1159 + 1160 + SYM_FUNC_START_LOCAL(aes_gcm_out) 1161 + 1162 + mr 3, 11 # return count 1167 1163 1168 1164 RESTORE_REGS 1169 1165 blr 1170 1166 1171 - # 1172 - # 8x Decrypt 1173 - # 1174 - _GLOBAL(aes_p10_gcm_decrypt) 1175 - .align 5 1176 - 1177 - SAVE_REGS 1178 - 1179 - LOAD_HASH_TABLE 1180 - 1181 - # initialize ICB: GHASH( IV ), IV - r7 1182 - lxvb16x 30+32, 0, 7 # load IV - v30 1183 - 1184 - mr 12, 5 # length 1185 - li 11, 0 # block index 1186 - 1187 - # counter 1 1188 - vxor 31, 31, 31 1189 - vspltisb 22, 1 1190 - vsldoi 31, 31, 22,1 # counter 1 1191 - 1192 - # load round key to VSR 1193 - lxv 0, 0(6) 1194 - lxv 1, 0x10(6) 1195 - lxv 2, 0x20(6) 1196 - lxv 3, 0x30(6) 1197 - lxv 4, 0x40(6) 1198 - lxv 5, 0x50(6) 1199 - lxv 6, 0x60(6) 1200 - lxv 7, 0x70(6) 1201 - lxv 8, 0x80(6) 1202 - lxv 9, 0x90(6) 1203 - lxv 10, 0xa0(6) 1204 - 1205 - # load rounds - 10 (128), 12 (192), 14 (256) 1206 - lwz 9,240(6) 1207 - 1208 - # 1209 - # vxor state, state, w # addroundkey 1210 - xxlor 32+29, 0, 0 1211 - vxor 15, 30, 29 # IV + round key - add round key 0 1212 - 1213 - cmpdi 9, 10 1214 - beq Loop_aes_gcm_8x_dec 1215 - 1216 - # load 2 more round keys (v11, v12) 1217 - lxv 11, 0xb0(6) 1218 - lxv 12, 0xc0(6) 1219 - 1220 - cmpdi 9, 12 1221 - beq Loop_aes_gcm_8x_dec 1222 - 1223 - # load 2 more round keys (v11, v12, v13, v14) 1224 - lxv 13, 0xd0(6) 1225 - lxv 14, 0xe0(6) 1226 - cmpdi 9, 14 1227 - beq Loop_aes_gcm_8x_dec 1228 - 1229 - b aes_gcm_out 1230 - 1231 - .align 5 1232 - Loop_aes_gcm_8x_dec: 1233 - mr 14, 3 1234 - mr 9, 4 1235 - 1236 - # 1237 - # check partial block 1238 - # 1239 - Continue_partial_check_dec: 1240 - ld 15, 56(7) 1241 - cmpdi 15, 0 1242 - beq Continue_dec 1243 - bgt Final_block_dec 1244 - cmpdi 15, 16 1245 - 
blt Final_block_dec 1246 - 1247 - Continue_dec: 1248 - # n blcoks 1249 - li 10, 128 1250 - divdu 10, 12, 10 # n 128 bytes-blocks 1251 - cmpdi 10, 0 1252 - beq Loop_last_block_dec 1253 - 1254 - vaddudm 30, 30, 31 # IV + counter 1255 - vxor 16, 30, 29 1256 - vaddudm 30, 30, 31 1257 - vxor 17, 30, 29 1258 - vaddudm 30, 30, 31 1259 - vxor 18, 30, 29 1260 - vaddudm 30, 30, 31 1261 - vxor 19, 30, 29 1262 - vaddudm 30, 30, 31 1263 - vxor 20, 30, 29 1264 - vaddudm 30, 30, 31 1265 - vxor 21, 30, 29 1266 - vaddudm 30, 30, 31 1267 - vxor 22, 30, 29 1268 - 1269 - mtctr 10 1270 - 1271 - li 15, 16 1272 - li 16, 32 1273 - li 17, 48 1274 - li 18, 64 1275 - li 19, 80 1276 - li 20, 96 1277 - li 21, 112 1278 - 1279 - lwz 10, 240(6) 1280 - 1281 - Loop_8x_block_dec: 1282 - 1283 - lxvb16x 15, 0, 14 # load block 1284 - lxvb16x 16, 15, 14 # load block 1285 - lxvb16x 17, 16, 14 # load block 1286 - lxvb16x 18, 17, 14 # load block 1287 - lxvb16x 19, 18, 14 # load block 1288 - lxvb16x 20, 19, 14 # load block 1289 - lxvb16x 21, 20, 14 # load block 1290 - lxvb16x 22, 21, 14 # load block 1291 - addi 14, 14, 128 1292 - 1293 - Loop_aes_middle8x 1294 - 1295 - xxlor 23+32, 10, 10 1296 - 1297 - cmpdi 10, 10 1298 - beq Do_next_ghash_dec 1299 - 1300 - # 192 bits 1301 - xxlor 24+32, 11, 11 1302 - 1303 - vcipher 15, 15, 23 1304 - vcipher 16, 16, 23 1305 - vcipher 17, 17, 23 1306 - vcipher 18, 18, 23 1307 - vcipher 19, 19, 23 1308 - vcipher 20, 20, 23 1309 - vcipher 21, 21, 23 1310 - vcipher 22, 22, 23 1311 - 1312 - vcipher 15, 15, 24 1313 - vcipher 16, 16, 24 1314 - vcipher 17, 17, 24 1315 - vcipher 18, 18, 24 1316 - vcipher 19, 19, 24 1317 - vcipher 20, 20, 24 1318 - vcipher 21, 21, 24 1319 - vcipher 22, 22, 24 1320 - 1321 - xxlor 23+32, 12, 12 1322 - 1323 - cmpdi 10, 12 1324 - beq Do_next_ghash_dec 1325 - 1326 - # 256 bits 1327 - xxlor 24+32, 13, 13 1328 - 1329 - vcipher 15, 15, 23 1330 - vcipher 16, 16, 23 1331 - vcipher 17, 17, 23 1332 - vcipher 18, 18, 23 1333 - vcipher 19, 19, 23 1334 - vcipher 20, 
20, 23 1335 - vcipher 21, 21, 23 1336 - vcipher 22, 22, 23 1337 - 1338 - vcipher 15, 15, 24 1339 - vcipher 16, 16, 24 1340 - vcipher 17, 17, 24 1341 - vcipher 18, 18, 24 1342 - vcipher 19, 19, 24 1343 - vcipher 20, 20, 24 1344 - vcipher 21, 21, 24 1345 - vcipher 22, 22, 24 1346 - 1347 - xxlor 23+32, 14, 14 1348 - 1349 - cmpdi 10, 14 1350 - beq Do_next_ghash_dec 1351 - b aes_gcm_out 1352 - 1353 - Do_next_ghash_dec: 1354 - 1355 - # 1356 - # last round 1357 - vcipherlast 15, 15, 23 1358 - vcipherlast 16, 16, 23 1359 - 1360 - xxlxor 47, 47, 15 1361 - stxvb16x 47, 0, 9 # store output 1362 - xxlxor 48, 48, 16 1363 - stxvb16x 48, 15, 9 # store output 1364 - 1365 - vcipherlast 17, 17, 23 1366 - vcipherlast 18, 18, 23 1367 - 1368 - xxlxor 49, 49, 17 1369 - stxvb16x 49, 16, 9 # store output 1370 - xxlxor 50, 50, 18 1371 - stxvb16x 50, 17, 9 # store output 1372 - 1373 - vcipherlast 19, 19, 23 1374 - vcipherlast 20, 20, 23 1375 - 1376 - xxlxor 51, 51, 19 1377 - stxvb16x 51, 18, 9 # store output 1378 - xxlxor 52, 52, 20 1379 - stxvb16x 52, 19, 9 # store output 1380 - 1381 - vcipherlast 21, 21, 23 1382 - vcipherlast 22, 22, 23 1383 - 1384 - xxlxor 53, 53, 21 1385 - stxvb16x 53, 20, 9 # store output 1386 - xxlxor 54, 54, 22 1387 - stxvb16x 54, 21, 9 # store output 1388 - 1389 - addi 9, 9, 128 1390 - 1391 - xxlor 15+32, 15, 15 1392 - xxlor 16+32, 16, 16 1393 - xxlor 17+32, 17, 17 1394 - xxlor 18+32, 18, 18 1395 - xxlor 19+32, 19, 19 1396 - xxlor 20+32, 20, 20 1397 - xxlor 21+32, 21, 21 1398 - xxlor 22+32, 22, 22 1399 - 1400 - # ghash here 1401 - ppc_aes_gcm_ghash2_4x 1402 - 1403 - xxlor 27+32, 0, 0 1404 - vaddudm 30, 30, 31 # IV + counter 1405 - vmr 29, 30 1406 - vxor 15, 30, 27 # add round key 1407 - vaddudm 30, 30, 31 1408 - vxor 16, 30, 27 1409 - vaddudm 30, 30, 31 1410 - vxor 17, 30, 27 1411 - vaddudm 30, 30, 31 1412 - vxor 18, 30, 27 1413 - vaddudm 30, 30, 31 1414 - vxor 19, 30, 27 1415 - vaddudm 30, 30, 31 1416 - vxor 20, 30, 27 1417 - vaddudm 30, 30, 31 1418 - vxor 21, 30, 
27 1419 - vaddudm 30, 30, 31 1420 - vxor 22, 30, 27 1421 - 1422 - addi 12, 12, -128 1423 - addi 11, 11, 128 1424 - 1425 - bdnz Loop_8x_block_dec 1426 - 1427 - vmr 30, 29 1428 - stxvb16x 30+32, 0, 7 # update IV 1429 - 1430 - Loop_last_block_dec: 1431 - cmpdi 12, 0 1432 - beq aes_gcm_out 1433 - 1434 - # loop last few blocks 1435 - li 10, 16 1436 - divdu 10, 12, 10 1437 - 1438 - mtctr 10 1439 - 1440 - lwz 10, 240(6) 1441 - 1442 - cmpdi 12, 16 1443 - blt Final_block_dec 1444 - 1445 - Next_rem_block_dec: 1446 - lxvb16x 15, 0, 14 # load block 1447 - 1448 - Loop_aes_middle_1x 1449 - 1450 - xxlor 23+32, 10, 10 1451 - 1452 - cmpdi 10, 10 1453 - beq Do_next_1x_dec 1454 - 1455 - # 192 bits 1456 - xxlor 24+32, 11, 11 1457 - 1458 - vcipher 15, 15, 23 1459 - vcipher 15, 15, 24 1460 - 1461 - xxlor 23+32, 12, 12 1462 - 1463 - cmpdi 10, 12 1464 - beq Do_next_1x_dec 1465 - 1466 - # 256 bits 1467 - xxlor 24+32, 13, 13 1468 - 1469 - vcipher 15, 15, 23 1470 - vcipher 15, 15, 24 1471 - 1472 - xxlor 23+32, 14, 14 1473 - 1474 - cmpdi 10, 14 1475 - beq Do_next_1x_dec 1476 - 1477 - Do_next_1x_dec: 1478 - vcipherlast 15, 15, 23 1479 - 1480 - xxlxor 47, 47, 15 1481 - stxvb16x 47, 0, 9 # store output 1482 - addi 14, 14, 16 1483 - addi 9, 9, 16 1484 - 1485 - xxlor 28+32, 15, 15 1486 - #vmr 28, 15 1487 - ppc_update_hash_1x 1488 - 1489 - addi 12, 12, -16 1490 - addi 11, 11, 16 1491 - xxlor 19+32, 0, 0 1492 - vaddudm 30, 30, 31 # IV + counter 1493 - vxor 15, 30, 19 # add round key 1494 - 1495 - bdnz Next_rem_block_dec 1496 - 1497 - li 15, 0 1498 - std 15, 56(7) # clear partial? 
1499 - stxvb16x 30+32, 0, 7 # update IV 1500 - cmpdi 12, 0 1501 - beq aes_gcm_out 1502 - 1503 - Final_block_dec: 1504 - lwz 10, 240(6) 1505 - Loop_aes_middle_1x 1506 - 1507 - xxlor 23+32, 10, 10 1508 - 1509 - cmpdi 10, 10 1510 - beq Do_final_1x_dec 1511 - 1512 - # 192 bits 1513 - xxlor 24+32, 11, 11 1514 - 1515 - vcipher 15, 15, 23 1516 - vcipher 15, 15, 24 1517 - 1518 - xxlor 23+32, 12, 12 1519 - 1520 - cmpdi 10, 12 1521 - beq Do_final_1x_dec 1522 - 1523 - # 256 bits 1524 - xxlor 24+32, 13, 13 1525 - 1526 - vcipher 15, 15, 23 1527 - vcipher 15, 15, 24 1528 - 1529 - xxlor 23+32, 14, 14 1530 - 1531 - cmpdi 10, 14 1532 - beq Do_final_1x_dec 1533 - 1534 - Do_final_1x_dec: 1535 - vcipherlast 15, 15, 23 1536 - 1537 - # check partial block 1538 - li 21, 1 # decrypt 1539 - ld 15, 56(7) # partial? 1540 - cmpdi 15, 0 1541 - beq Normal_block_dec 1542 - bl Do_partial_block 1543 - cmpdi 12, 0 1544 - ble aes_gcm_out 1545 - 1546 - b Continue_partial_check_dec 1547 - 1548 - Normal_block_dec: 1549 - lxvb16x 15, 0, 14 # load last block 1550 - xxlxor 47, 47, 15 1551 - 1552 - # create partial block mask 1553 - li 15, 16 1554 - sub 15, 15, 12 # index to the mask 1555 - 1556 - vspltisb 16, -1 # first 16 bytes - 0xffff...ff 1557 - vspltisb 17, 0 # second 16 bytes - 0x0000...00 1558 - li 10, 192 1559 - stvx 16, 10, 1 1560 - addi 10, 10, 16 1561 - stvx 17, 10, 1 1562 - 1563 - addi 10, 1, 192 1564 - lxvb16x 16, 15, 10 # load partial block mask 1565 - xxland 47, 47, 16 1566 - 1567 - xxland 32+28, 15, 16 1568 - #vmr 28, 15 1569 - ppc_update_hash_1x 1570 - 1571 - # * should store only the remaining bytes. 1572 - bl Write_partial_block 1573 - 1574 - stxvb16x 30+32, 0, 7 # update IV 1575 - std 12, 56(7) # update partial? 
1576 - li 16, 16 1577 - 1578 - stxvb16x 32, 0, 8 # write out Xi 1579 - stxvb16x 32, 16, 8 # write out Xi 1580 - b aes_gcm_out 1167 + __Invalid_msg_len: 1168 + li 3, 0 1169 + blr 1170 + SYM_FUNC_END(aes_gcm_out) 1171 + 1172 + SYM_DATA_START_LOCAL(PERMX) 1173 + .align 4 1174 + # for vector permute and xor 1175 + permx: 1176 + .long 0x4c5d6e7f, 0x08192a3b, 0xc4d5e6f7, 0x8091a2b3 1177 + SYM_DATA_END(permx)