/* SPDX-License-Identifier: GPL-2.0-or-later */
#
# Accelerated AES-GCM stitched implementation for ppc64le.
#
# Copyright 2024- IBM Inc.
#
#===================================================================================
# Written by Danny Tsen <dtsen@us.ibm.com>
#
# GHASH is based on the Karatsuba multiplication method.
#
# Xi xor X1
#
# X1 * H^4 + X2 * H^3 + X3 * H^2 + X4 * H =
# (X1.h * H^4.h + X1.l * H^4.l + X1 * H^4) +
# (X2.h * H^3.h + X2.l * H^3.l + X2 * H^3) +
# (X3.h * H^2.h + X3.l * H^2.l + X3 * H^2) +
# (X4.h * H.h + X4.l * H.l + X4 * H)
#
# Xi = v0
# H Poly = v2
# Hash keys = v3 - v14
# ( H.l, H, H.h)
# ( H^2.l, H^2, H^2.h)
# ( H^3.l, H^3, H^3.h)
# ( H^4.l, H^4, H^4.h)
#
# v30 is IV
# v31 - counter 1
#
# AES registers used:
# vs0 - round key 0
# v15, v16, v17, v18, v19, v20, v21, v22 for 8 blocks (encrypted)
#
# This implementation uses a stitched AES-GCM approach to improve overall performance.
# AES is implemented with 8x blocks and GHASH uses two 4x blocks.
#
# ===================================================================================
#

#include <asm/ppc_asm.h>
#include <linux/linkage.h>

.machine "any"
.text

.macro SAVE_GPR GPR OFFSET FRAME
        std \GPR,\OFFSET(\FRAME)
.endm

.macro SAVE_VRS VRS OFFSET FRAME
        stxv \VRS+32, \OFFSET(\FRAME)
.endm

.macro RESTORE_GPR GPR OFFSET FRAME
        ld \GPR,\OFFSET(\FRAME)
.endm

.macro RESTORE_VRS VRS OFFSET FRAME
        lxv \VRS+32, \OFFSET(\FRAME)
.endm

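#
# SAVE_REGS/RESTORE_REGS manage a 512-byte stack frame: the link register
# is saved at 16(r1), non-volatile GPRs r14-r24 at offsets 112-192 and
# vector registers v20-v31 at offsets 256-432 (addressed through r9).
#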
.macro SAVE_REGS
        mflr 0
        std 0, 16(1)
        stdu 1,-512(1)

        SAVE_GPR 14, 112, 1
        SAVE_GPR 15, 120, 1
        SAVE_GPR 16, 128, 1
        SAVE_GPR 17, 136, 1
        SAVE_GPR 18, 144, 1
        SAVE_GPR 19, 152, 1
        SAVE_GPR 20, 160, 1
        SAVE_GPR 21, 168, 1
        SAVE_GPR 22, 176, 1
        SAVE_GPR 23, 184, 1
        SAVE_GPR 24, 192, 1

        addi 9, 1, 256
        SAVE_VRS 20, 0, 9
        SAVE_VRS 21, 16, 9
        SAVE_VRS 22, 32, 9
        SAVE_VRS 23, 48, 9
        SAVE_VRS 24, 64, 9
        SAVE_VRS 25, 80, 9
        SAVE_VRS 26, 96, 9
        SAVE_VRS 27, 112, 9
        SAVE_VRS 28, 128, 9
        SAVE_VRS 29, 144, 9
        SAVE_VRS 30, 160, 9
        SAVE_VRS 31, 176, 9
.endm # SAVE_REGS

.macro RESTORE_REGS
        addi 9, 1, 256
        RESTORE_VRS 20, 0, 9
        RESTORE_VRS 21, 16, 9
        RESTORE_VRS 22, 32, 9
        RESTORE_VRS 23, 48, 9
        RESTORE_VRS 24, 64, 9
        RESTORE_VRS 25, 80, 9
        RESTORE_VRS 26, 96, 9
        RESTORE_VRS 27, 112, 9
        RESTORE_VRS 28, 128, 9
        RESTORE_VRS 29, 144, 9
        RESTORE_VRS 30, 160, 9
        RESTORE_VRS 31, 176, 9

        RESTORE_GPR 14, 112, 1
        RESTORE_GPR 15, 120, 1
        RESTORE_GPR 16, 128, 1
        RESTORE_GPR 17, 136, 1
        RESTORE_GPR 18, 144, 1
        RESTORE_GPR 19, 152, 1
        RESTORE_GPR 20, 160, 1
        RESTORE_GPR 21, 168, 1
        RESTORE_GPR 22, 176, 1
        RESTORE_GPR 23, 184, 1
        RESTORE_GPR 24, 192, 1

        addi 1, 1, 512
        ld 0, 16(1)
        mtlr 0
.endm # RESTORE_REGS

# 4x loops
.macro AES_CIPHER_4x _VCIPHER ST r
        \_VCIPHER \ST, \ST, \r
        \_VCIPHER \ST+1, \ST+1, \r
        \_VCIPHER \ST+2, \ST+2, \r
        \_VCIPHER \ST+3, \ST+3, \r
.endm

# 8x loops
.macro AES_CIPHER_8x _VCIPHER ST r
        \_VCIPHER \ST, \ST, \r
        \_VCIPHER \ST+1, \ST+1, \r
        \_VCIPHER \ST+2, \ST+2, \r
        \_VCIPHER \ST+3, \ST+3, \r
        \_VCIPHER \ST+4, \ST+4, \r
        \_VCIPHER \ST+5, \ST+5, \r
        \_VCIPHER \ST+6, \ST+6, \r
        \_VCIPHER \ST+7, \ST+7, \r
.endm

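#
# LOOP_8AES_STATE: apply AES rounds 1-8 (round keys preloaded in vs1-vs8)
# to the eight state vectors v15-v22, staging each round key in v23-v26.
#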
.macro LOOP_8AES_STATE
        xxlor 32+23, 1, 1
        xxlor 32+24, 2, 2
        xxlor 32+25, 3, 3
        xxlor 32+26, 4, 4
        AES_CIPHER_8x vcipher, 15, 23
        AES_CIPHER_8x vcipher, 15, 24
        AES_CIPHER_8x vcipher, 15, 25
        AES_CIPHER_8x vcipher, 15, 26
        xxlor 32+23, 5, 5
        xxlor 32+24, 6, 6
        xxlor 32+25, 7, 7
        xxlor 32+26, 8, 8
        AES_CIPHER_8x vcipher, 15, 23
        AES_CIPHER_8x vcipher, 15, 24
        AES_CIPHER_8x vcipher, 15, 25
        AES_CIPHER_8x vcipher, 15, 26
.endm

#
# PPC_GHASH4x(H, S1, S2, S3, S4): Compute 4x hash values based on the Karatsuba method.
# H: returned digest
# S#: states
#
# S1 should be xor'ed with the previous digest.
#
# Xi = v0
# H Poly = v2
# Hash keys = v3 - v14
# Scratch: v23 - v29
#
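# The low, middle and high Karatsuba partial products of the four blocks
# are summed and folded into the digest with two reductions by the H Poly
# (v2); vpermxor performs the doubleword swap and xor of the first
# reduction in a single instruction.
#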
.macro PPC_GHASH4x H S1 S2 S3 S4

        vpmsumd 23, 12, \S1 # H4.L * X.L
        vpmsumd 24, 9, \S2
        vpmsumd 25, 6, \S3
        vpmsumd 26, 3, \S4

        vpmsumd 27, 13, \S1 # H4.L * X.H + H4.H * X.L
        vpmsumd 28, 10, \S2 # H3.L * X1.H + H3.H * X1.L

        vxor 23, 23, 24
        vxor 23, 23, 25
        vxor 23, 23, 26 # L

        vxor 24, 27, 28
        vpmsumd 25, 7, \S3
        vpmsumd 26, 4, \S4

        vxor 24, 24, 25
        vxor 24, 24, 26 # M

        # sum hash and reduction with H Poly
        vpmsumd 28, 23, 2 # reduction

        vxor 1, 1, 1
        vsldoi 25, 24, 1, 8 # mL
        vsldoi 1, 1, 24, 8 # mH
        vxor 23, 23, 25 # mL + L

        # This performs swap and xor like,
        # vsldoi 23, 23, 23, 8 # swap
        # vxor 23, 23, 28
        xxlor 32+25, 10, 10
        vpermxor 23, 23, 28, 25

        vpmsumd 26, 14, \S1 # H4.H * X.H
        vpmsumd 27, 11, \S2
        vpmsumd 28, 8, \S3
        vpmsumd 29, 5, \S4

        vxor 24, 26, 27
        vxor 24, 24, 28
        vxor 24, 24, 29

        vxor 24, 24, 1

        # sum hash and reduction with H Poly
        vsldoi 25, 23, 23, 8 # swap
        vpmsumd 23, 23, 2
        vxor 27, 25, 24
        vxor \H, 23, 27
.endm

#
# Compute a single GHASH update.
# scratch: v1, v22..v27
#
.macro PPC_GHASH1x H S1

        vxor 1, 1, 1

        vpmsumd 22, 3, \S1 # L
        vpmsumd 23, 4, \S1 # M
        vpmsumd 24, 5, \S1 # H

        vpmsumd 27, 22, 2 # reduction

        vsldoi 25, 23, 1, 8 # mL
        vsldoi 26, 1, 23, 8 # mH
        vxor 22, 22, 25 # L + mL
        vxor 24, 24, 26 # H + mH

        xxlor 32+25, 10, 10
        vpermxor 22, 22, 27, 25

        vsldoi 23, 22, 22, 8 # swap
        vpmsumd 22, 22, 2 # reduction
        vxor 23, 23, 24
        vxor \H, 22, 23
.endm

#
# LOAD_HASH_TABLE
# Xi = v0
# H Poly = v2
# Hash keys = v3 - v14
#
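# gcm_table layout (r8): Xi at offset 0, H Poly at offset 32, then the
# (H^n.l, H^n, H^n.h) triples for H, H^2, H^3 and H^4 at a 16-byte stride
# starting at offset 48.
#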
.macro LOAD_HASH_TABLE
        # Load Xi
        lxvb16x 32, 0, 8 # load Xi

        # load Hash - h^4, h^3, h^2, h
        li 10, 32
        lxvd2x 2+32, 10, 8 # H Poly
        li 10, 48
        lxvd2x 3+32, 10, 8 # Hl
        li 10, 64
        lxvd2x 4+32, 10, 8 # H
        li 10, 80
        lxvd2x 5+32, 10, 8 # Hh

        li 10, 96
        lxvd2x 6+32, 10, 8 # H^2l
        li 10, 112
        lxvd2x 7+32, 10, 8 # H^2
        li 10, 128
        lxvd2x 8+32, 10, 8 # H^2h

        li 10, 144
        lxvd2x 9+32, 10, 8 # H^3l
        li 10, 160
        lxvd2x 10+32, 10, 8 # H^3
        li 10, 176
        lxvd2x 11+32, 10, 8 # H^3h

        li 10, 192
        lxvd2x 12+32, 10, 8 # H^4l
        li 10, 208
        lxvd2x 13+32, 10, 8 # H^4
        li 10, 224
        lxvd2x 14+32, 10, 8 # H^4h
.endm

################################################################################
# Compute AES and ghash one block at a time.
# r23: AES rounds
# v30: current IV
# vs0: roundkey 0
#
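# r5:  remaining message bytes (updated)
# r12: number of 16-byte blocks to process
# r14 / r9: input / output pointers (updated)
# r24: 1 = encrypt, 0 = decrypt
#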
################################################################################
SYM_FUNC_START_LOCAL(aes_gcm_crypt_1x)

        cmpdi 5, 16
        bge __More_1x
        blr
__More_1x:
        li 10, 16
        divdu 12, 5, 10

        xxlxor 32+15, 32+30, 0

        # Pre-load 8 AES rounds to scratch vectors.
        xxlor 32+16, 1, 1
        xxlor 32+17, 2, 2
        xxlor 32+18, 3, 3
        xxlor 32+19, 4, 4
        xxlor 32+20, 5, 5
        xxlor 32+21, 6, 6
        xxlor 32+28, 7, 7
        xxlor 32+29, 8, 8
        lwz 23, 240(6) # n rounds
        addi 22, 23, -9 # remaining AES rounds

        cmpdi 12, 0
        bgt __Loop_1x
        blr

__Loop_1x:
        mtctr 22
        addi 10, 6, 144
        vcipher 15, 15, 16
        vcipher 15, 15, 17
        vcipher 15, 15, 18
        vcipher 15, 15, 19
        vcipher 15, 15, 20
        vcipher 15, 15, 21
        vcipher 15, 15, 28
        vcipher 15, 15, 29

__Loop_aes_1state:
        lxv 32+1, 0(10)
        vcipher 15, 15, 1
        addi 10, 10, 16
        bdnz __Loop_aes_1state
        lxv 32+1, 0(10) # last round key
        lxvb16x 11, 0, 14 # load input block
        vcipherlast 15, 15, 1

        xxlxor 32+15, 32+15, 11
        stxvb16x 32+15, 0, 9 # store output
        addi 14, 14, 16
        addi 9, 9, 16

        cmpdi 24, 0 # decrypt?
        bne __Encrypt_1x
        xxlor 15+32, 11, 11
__Encrypt_1x:
        vxor 15, 15, 0
        PPC_GHASH1x 0, 15

        addi 5, 5, -16
        addi 11, 11, 16

        vadduwm 30, 30, 31 # IV + counter
        xxlxor 32+15, 32+30, 0
        addi 12, 12, -1
        cmpdi 12, 0
        bgt __Loop_1x

        stxvb16x 32+30, 0, 7 # update IV
        stxvb16x 32+0, 0, 8 # update Xi
        blr
SYM_FUNC_END(aes_gcm_crypt_1x)

################################################################################
# Process a normal partial block when we come here.
# Compute the partial mask, then load and store the partial block via the stack.
# Update partial_len and pblock.
# pblock is (encrypted ^ AES state) for encrypt
# and (input ^ AES state) for decrypt.
#
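# The partial mask is built by shifting an all-ones vector left by
# (16 - len) bytes so that only the valid message bytes are kept.
#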
################################################################################
SYM_FUNC_START_LOCAL(__Process_partial)

        # create partial mask
        vspltisb 16, -1
        li 12, 16
        sub 12, 12, 5
        sldi 12, 12, 3
        mtvsrdd 32+17, 0, 12
        vslo 16, 16, 17 # partial block mask

        lxvb16x 11, 0, 14 # load partial block
        xxland 11, 11, 32+16

        # AES crypt partial
        xxlxor 32+15, 32+30, 0
        lwz 23, 240(6) # n rounds
        addi 22, 23, -1 # loop - 1
        mtctr 22
        addi 10, 6, 16

__Loop_aes_pstate:
        lxv 32+1, 0(10)
        vcipher 15, 15, 1
        addi 10, 10, 16
        bdnz __Loop_aes_pstate
        lxv 32+1, 0(10) # last round key
        vcipherlast 15, 15, 1

        xxlxor 32+15, 32+15, 11
        vand 15, 15, 16

        # AES crypt output v15
        # Write partial
        li 10, 224
        stxvb16x 15+32, 10, 1 # write v15 to stack
        addi 10, 1, 223
        addi 12, 9, -1
        mtctr 5 # partial block len
__Write_partial:
        lbzu 22, 1(10)
        stbu 22, 1(12)
        bdnz __Write_partial

        cmpdi 24, 0 # decrypt?
        bne __Encrypt_partial
        xxlor 32+15, 11, 11 # decrypt using the input block
__Encrypt_partial:
        #vxor 15, 15, 0 # ^ previous hash
        #PPC_GHASH1x 0, 15

        add 14, 14, 5
        add 9, 9, 5
        std 5, 56(7) # update partial
        sub 11, 11, 5
        li 5, 0 # done last byte

        #
        # Don't increase the IV since this is the last partial block.
        # It will be updated in gcm_update if there are no more data blocks.
        #vadduwm 30, 30, 31 # increase IV
        stxvb16x 32+30, 0, 7 # update IV
        li 10, 64
        stxvb16x 32+0, 0, 8 # Update Xi
        stxvb16x 32+15, 10, 7 # Update pblock
        blr
SYM_FUNC_END(__Process_partial)

################################################################################
# Combine partial blocks and ghash when we come here.
#
# The partial block has to be shifted to the right location to encrypt/decrypt
# and compute ghash if combining with the previous partial block is needed.
# - Compute ghash for a full block. Clear partial_len and pblock. Update IV.
#   Write Xi.
# - Don't compute ghash if not a full block. gcm_update will take care of it
#   if it is the last block. Update partial_len and pblock.
#
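# r12: bytes already buffered in pblock (partial_len at 56(r7))
# r21: bytes consumed from the new input in this call
# r22: bytes still needed to complete a 16-byte block
#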
################################################################################
SYM_FUNC_START_LOCAL(__Combine_partial)

        ld 12, 56(7)
        mr 21, 5 # these bytes to be processed

        li 17, 0
        li 16, 16
        sub 22, 16, 12 # bytes to complete a block
        sub 17, 22, 5 # remaining bytes in a block
        cmpdi 5, 16
        ble __Inp_msg_less16
        li 17, 0
        mr 21, 22
        b __Combine_continue
__Inp_msg_less16:
        cmpd 22, 5
        bgt __Combine_continue
        li 17, 0
        mr 21, 22 # these bytes to be processed

__Combine_continue:
        # load msg and shift to the proper location and mask
        vspltisb 16, -1
        sldi 15, 12, 3
        mtvsrdd 32+17, 0, 15
        vslo 16, 16, 17
        vsro 16, 16, 17
        sldi 15, 17, 3
        mtvsrdd 32+17, 0, 15
        vsro 16, 16, 17
        vslo 16, 16, 17 # mask

        lxvb16x 32+19, 0, 14 # load partial block
        sldi 15, 12, 3
        mtvsrdd 32+17, 0, 15
        vsro 19, 19, 17 # 0x00..xxxx??..??
        sldi 15, 17, 3
        mtvsrdd 32+17, 0, 15
        vsro 19, 19, 17 # 0x00..xxxx
        vslo 19, 19, 17 # shift back to form 0x00..xxxx00..00

        # AES crypt partial
        xxlxor 32+15, 32+30, 0
        lwz 23, 240(6) # n rounds
        addi 22, 23, -1 # loop - 1
        mtctr 22
        addi 10, 6, 16

__Loop_aes_cpstate:
        lxv 32+1, 0(10)
        vcipher 15, 15, 1
        addi 10, 10, 16
        bdnz __Loop_aes_cpstate
        lxv 32+1, 0(10) # last round key
        vcipherlast 15, 15, 1

        vxor 15, 15, 19
        vand 15, 15, 16

        # AES crypt output v15
        # Write partial
        li 10, 224
        stxvb16x 15+32, 10, 1 # write v15 to stack
        addi 10, 1, 223
        add 10, 10, 12 # add offset
        addi 15, 9, -1
        mtctr 21 # partial block len
__Write_combine_partial:
        lbzu 22, 1(10)
        stbu 22, 1(15)
        bdnz __Write_combine_partial

        add 14, 14, 21
        add 11, 11, 21
        add 9, 9, 21
        sub 5, 5, 21

        # Encrypt/Decrypt?
        cmpdi 24, 0 # decrypt?
        bne __Encrypt_combine_partial
        vmr 15, 19 # decrypt using the input block

__Encrypt_combine_partial:
        #
        # Update partial flag and combine ghash.
__Update_partial_ghash:
        li 10, 64
        lxvb16x 32+17, 10, 7 # load previous pblock
        add 12, 12, 21 # combined processed bytes
        vxor 15, 15, 17 # combined pblock

        cmpdi 12, 16
        beq __Clear_partial_flag
        std 12, 56(7) # update partial len
        stxvb16x 32+15, 10, 7 # Update current pblock
        blr

__Clear_partial_flag:
        li 12, 0
        std 12, 56(7)
        # Update IV and ghash here
        vadduwm 30, 30, 31 # increase IV
        stxvb16x 32+30, 0, 7 # update IV

        # v15 is either (input block or encrypted block) ^ (AES state)
        vxor 15, 15, 0
        PPC_GHASH1x 0, 15
        stxvb16x 32+0, 10, 7 # update pblock for debug?
        stxvb16x 32+0, 0, 8 # update Xi
        blr
SYM_FUNC_END(__Combine_partial)

################################################################################
# gcm_update(iv, Xi) - compute last hash
#
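# If partial_len (at 56(r3)) is non-zero, the pending partial block at
# 64(r3) is xor'ed with Xi, run through one GHASH step, and the result is
# written back to Xi; partial_len is then cleared.
#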
################################################################################
SYM_FUNC_START(gcm_update)

        ld 10, 56(3)
        cmpdi 10, 0
        beq __no_update

        lxvb16x 32, 0, 4 # load Xi
        # load Hash - h^4, h^3, h^2, h
        li 10, 32
        lxvd2x 2+32, 10, 4 # H Poly
        li 10, 48
        lxvd2x 3+32, 10, 4 # Hl
        li 10, 64
        lxvd2x 4+32, 10, 4 # H
        li 10, 80
        lxvd2x 5+32, 10, 4 # Hh

        addis 11, 2, permx@toc@ha
        addi 11, 11, permx@toc@l
        lxv 10, 0(11) # vs10: vpermxor vector

        li 9, 64
        lxvb16x 32+6, 9, 3 # load pblock
        vxor 6, 6, 0

        vxor 1, 1, 1
        vpmsumd 12, 3, 6 # L
        vpmsumd 13, 4, 6 # M
        vpmsumd 14, 5, 6 # H
        vpmsumd 17, 12, 2 # reduction
        vsldoi 15, 13, 1, 8 # mL
        vsldoi 16, 1, 13, 8 # mH
        vxor 12, 12, 15 # L + mL
        vxor 14, 14, 16 # H + mH
        xxlor 32+15, 10, 10
        vpermxor 12, 12, 17, 15
        vsldoi 13, 12, 12, 8 # swap
        vpmsumd 12, 12, 2 # reduction
        vxor 13, 13, 14
        vxor 7, 12, 13

        #vxor 0, 0, 0
        #stxvb16x 32+0, 9, 3
        li 10, 0
        std 10, 56(3)
        stxvb16x 32+7, 0, 4

__no_update:
        blr
SYM_FUNC_END(gcm_update)

################################################################################
# aes_p10_gcm_encrypt (const void *inp, void *out, size_t len,
#               const char *rk, unsigned char iv[16], void *Xip);
#
# r3 - inp
# r4 - out
# r5 - len
# r6 - AES round keys
# r7 - iv and other data
# r8 - Xi, H Poly, hash keys
#
# rounds is at offset 240 in rk
# Xi is at 0 in gcm_table (Xip).
#
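# The number of processed bytes is accumulated in r11 and returned in r3
# (see aes_gcm_out). A trailing partial block is encrypted and buffered in
# pblock; its GHASH is completed later (__Combine_partial or gcm_update).
#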
################################################################################
SYM_FUNC_START(aes_p10_gcm_encrypt)

        cmpdi 5, 0
        ble __Invalid_msg_len

        SAVE_REGS
        LOAD_HASH_TABLE

        # initialize ICB: GHASH( IV ), IV - r7
        lxvb16x 30+32, 0, 7 # load IV - v30

        mr 14, 3
        mr 9, 4

        # counter 1
        vxor 31, 31, 31
        vspltisb 22, 1
        vsldoi 31, 31, 22, 1 # counter 1

        addis 11, 2, permx@toc@ha
        addi 11, 11, permx@toc@l
        lxv 10, 0(11) # vs10: vpermxor vector
        li 11, 0

        # load 9 round keys to VSR
        lxv 0, 0(6) # round key 0
        lxv 1, 16(6) # round key 1
        lxv 2, 32(6) # round key 2
        lxv 3, 48(6) # round key 3
        lxv 4, 64(6) # round key 4
        lxv 5, 80(6) # round key 5
        lxv 6, 96(6) # round key 6
        lxv 7, 112(6) # round key 7
        lxv 8, 128(6) # round key 8

        # load rounds - 10 (128), 12 (192), 14 (256)
        lwz 23, 240(6) # n rounds
        li 24, 1 # encrypt

__Process_encrypt:
        #
        # Process different blocks
        #
        ld 12, 56(7)
        cmpdi 12, 0
        bgt __Do_combine_enc
        cmpdi 5, 128
        blt __Process_more_enc

#
# Process 8x AES/GCM blocks
#
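# v30 holds the current counter block and v31 the constant increment of 1
# (in the last word), so v15-v22 below become eight consecutive counter
# blocks; the last one is kept in vs9 for the next iteration.
#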
__Process_8x_enc:
        # 8x blocks
        li 10, 128
        divdu 12, 5, 10 # n 128-byte blocks

        addi 12, 12, -1 # loop - 1

        vmr 15, 30 # first state: IV
        vadduwm 16, 15, 31 # state + counter
        vadduwm 17, 16, 31
        vadduwm 18, 17, 31
        vadduwm 19, 18, 31
        vadduwm 20, 19, 31
        vadduwm 21, 20, 31
        vadduwm 22, 21, 31
        xxlor 9, 32+22, 32+22 # save last state

        # vxor state, state, w # addroundkey
        xxlor 32+29, 0, 0
        vxor 15, 15, 29 # IV + round key - add round key 0
        vxor 16, 16, 29
        vxor 17, 17, 29
        vxor 18, 18, 29
        vxor 19, 19, 29
        vxor 20, 20, 29
        vxor 21, 21, 29
        vxor 22, 22, 29

        li 15, 16
        li 16, 32
        li 17, 48
        li 18, 64
        li 19, 80
        li 20, 96
        li 21, 112

        #
        # Pre-compute the first 8 AES states and leave 1/3/5 more rounds
        # for the loop.
        #
        addi 22, 23, -9 # process 8 keys
        mtctr 22 # AES key loop
        addi 10, 6, 144

        LOOP_8AES_STATE # process 8 AES keys

__PreLoop_aes_state:
        lxv 32+1, 0(10) # round key
        AES_CIPHER_8x vcipher 15 1
        addi 10, 10, 16
        bdnz __PreLoop_aes_state
        lxv 32+1, 0(10) # last round key (v1)

        cmpdi 12, 0 # Only one loop (8 blocks)
        beq __Finish_ghash

#
# Loop 8x blocks and compute ghash
#
__Loop_8x_block_enc:
        vcipherlast 15, 15, 1
        vcipherlast 16, 16, 1
        vcipherlast 17, 17, 1
        vcipherlast 18, 18, 1
        vcipherlast 19, 19, 1
        vcipherlast 20, 20, 1
        vcipherlast 21, 21, 1
        vcipherlast 22, 22, 1

        lxvb16x 32+23, 0, 14 # load block
        lxvb16x 32+24, 15, 14 # load block
        lxvb16x 32+25, 16, 14 # load block
        lxvb16x 32+26, 17, 14 # load block
        lxvb16x 32+27, 18, 14 # load block
        lxvb16x 32+28, 19, 14 # load block
        lxvb16x 32+29, 20, 14 # load block
        lxvb16x 32+30, 21, 14 # load block
        addi 14, 14, 128

        vxor 15, 15, 23
        vxor 16, 16, 24
        vxor 17, 17, 25
        vxor 18, 18, 26
        vxor 19, 19, 27
        vxor 20, 20, 28
        vxor 21, 21, 29
        vxor 22, 22, 30

        stxvb16x 47, 0, 9 # store output
        stxvb16x 48, 15, 9 # store output
        stxvb16x 49, 16, 9 # store output
        stxvb16x 50, 17, 9 # store output
        stxvb16x 51, 18, 9 # store output
        stxvb16x 52, 19, 9 # store output
        stxvb16x 53, 20, 9 # store output
        stxvb16x 54, 21, 9 # store output
        addi 9, 9, 128

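        # GHASH runs over the ciphertext, so the just-stored encrypted
        # blocks in v15-v22 are hashed directly; the first block of each
        # 4x group is xor'ed with the current digest (v0).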
        # ghash here
        vxor 15, 15, 0
        PPC_GHASH4x 0, 15, 16, 17, 18

        vxor 19, 19, 0
        PPC_GHASH4x 0, 19, 20, 21, 22

        xxlor 32+15, 9, 9 # last state
        vadduwm 15, 15, 31 # state + counter
        vadduwm 16, 15, 31
        vadduwm 17, 16, 31
        vadduwm 18, 17, 31
        vadduwm 19, 18, 31
        vadduwm 20, 19, 31
        vadduwm 21, 20, 31
        vadduwm 22, 21, 31
        xxlor 9, 32+22, 32+22 # save last state

        xxlor 32+27, 0, 0 # restore roundkey 0
        vxor 15, 15, 27 # IV + round key - add round key 0
        vxor 16, 16, 27
        vxor 17, 17, 27
        vxor 18, 18, 27
        vxor 19, 19, 27
        vxor 20, 20, 27
        vxor 21, 21, 27
        vxor 22, 22, 27

        addi 5, 5, -128
        addi 11, 11, 128

        LOOP_8AES_STATE # process 8 AES keys
        mtctr 22 # AES key loop
        addi 10, 6, 144
__LastLoop_aes_state:
        lxv 32+1, 0(10) # round key
        AES_CIPHER_8x vcipher 15 1
        addi 10, 10, 16
        bdnz __LastLoop_aes_state
        lxv 32+1, 0(10) # last round key (v1)

        addi 12, 12, -1
        cmpdi 12, 0
        bne __Loop_8x_block_enc

__Finish_ghash:
        vcipherlast 15, 15, 1
        vcipherlast 16, 16, 1
        vcipherlast 17, 17, 1
        vcipherlast 18, 18, 1
        vcipherlast 19, 19, 1
        vcipherlast 20, 20, 1
        vcipherlast 21, 21, 1
        vcipherlast 22, 22, 1

        lxvb16x 32+23, 0, 14 # load block
        lxvb16x 32+24, 15, 14 # load block
        lxvb16x 32+25, 16, 14 # load block
        lxvb16x 32+26, 17, 14 # load block
        lxvb16x 32+27, 18, 14 # load block
        lxvb16x 32+28, 19, 14 # load block
        lxvb16x 32+29, 20, 14 # load block
        lxvb16x 32+30, 21, 14 # load block
        addi 14, 14, 128

        vxor 15, 15, 23
        vxor 16, 16, 24
        vxor 17, 17, 25
        vxor 18, 18, 26
        vxor 19, 19, 27
        vxor 20, 20, 28
        vxor 21, 21, 29
        vxor 22, 22, 30

        stxvb16x 47, 0, 9 # store output
        stxvb16x 48, 15, 9 # store output
        stxvb16x 49, 16, 9 # store output
        stxvb16x 50, 17, 9 # store output
        stxvb16x 51, 18, 9 # store output
        stxvb16x 52, 19, 9 # store output
        stxvb16x 53, 20, 9 # store output
        stxvb16x 54, 21, 9 # store output
        addi 9, 9, 128

        vxor 15, 15, 0
        PPC_GHASH4x 0, 15, 16, 17, 18

        vxor 19, 19, 0
        PPC_GHASH4x 0, 19, 20, 21, 22

        xxlor 30+32, 9, 9 # last ctr
        vadduwm 30, 30, 31 # increase ctr
        stxvb16x 32+30, 0, 7 # update IV
        stxvb16x 32+0, 0, 8 # update Xi

        addi 5, 5, -128
        addi 11, 11, 128

        #
        # Done 8x blocks
        #

        cmpdi 5, 0
        beq aes_gcm_out

__Process_more_enc:
        li 24, 1 # encrypt
        bl aes_gcm_crypt_1x
        cmpdi 5, 0
        beq aes_gcm_out

        bl __Process_partial
        cmpdi 5, 0
        beq aes_gcm_out
__Do_combine_enc:
        bl __Combine_partial
        cmpdi 5, 0
        bgt __Process_encrypt
        b aes_gcm_out

SYM_FUNC_END(aes_p10_gcm_encrypt)

################################################################################
# aes_p10_gcm_decrypt (const void *inp, void *out, size_t len,
#               const char *rk, unsigned char iv[16], void *Xip);
# 8x Decrypt
#
################################################################################
SYM_FUNC_START(aes_p10_gcm_decrypt)

        cmpdi 5, 0
        ble __Invalid_msg_len

        SAVE_REGS
        LOAD_HASH_TABLE

        # initialize ICB: GHASH( IV ), IV - r7
        lxvb16x 30+32, 0, 7 # load IV - v30

        mr 14, 3
        mr 9, 4

        # counter 1
        vxor 31, 31, 31
        vspltisb 22, 1
        vsldoi 31, 31, 22, 1 # counter 1

        addis 11, 2, permx@toc@ha
        addi 11, 11, permx@toc@l
        lxv 10, 0(11) # vs10: vpermxor vector
        li 11, 0

        # load 9 round keys to VSR
        lxv 0, 0(6) # round key 0
        lxv 1, 16(6) # round key 1
        lxv 2, 32(6) # round key 2
        lxv 3, 48(6) # round key 3
        lxv 4, 64(6) # round key 4
        lxv 5, 80(6) # round key 5
        lxv 6, 96(6) # round key 6
        lxv 7, 112(6) # round key 7
        lxv 8, 128(6) # round key 8

        # load rounds - 10 (128), 12 (192), 14 (256)
        lwz 23, 240(6) # n rounds
        li 24, 0 # decrypt

__Process_decrypt:
        #
        # Process different blocks
        #
        ld 12, 56(7)
        cmpdi 12, 0
        bgt __Do_combine_dec
        cmpdi 5, 128
        blt __Process_more_dec

#
# Process 8x AES/GCM blocks
#
__Process_8x_dec:
        # 8x blocks
        li 10, 128
        divdu 12, 5, 10 # n 128-byte blocks

        addi 12, 12, -1 # loop - 1

        vmr 15, 30 # first state: IV
        vadduwm 16, 15, 31 # state + counter
        vadduwm 17, 16, 31
        vadduwm 18, 17, 31
        vadduwm 19, 18, 31
        vadduwm 20, 19, 31
        vadduwm 21, 20, 31
        vadduwm 22, 21, 31
        xxlor 9, 32+22, 32+22 # save last state

        # vxor state, state, w # addroundkey
        xxlor 32+29, 0, 0
        vxor 15, 15, 29 # IV + round key - add round key 0
        vxor 16, 16, 29
        vxor 17, 17, 29
        vxor 18, 18, 29
        vxor 19, 19, 29
        vxor 20, 20, 29
        vxor 21, 21, 29
        vxor 22, 22, 29

        li 15, 16
        li 16, 32
        li 17, 48
        li 18, 64
        li 19, 80
        li 20, 96
        li 21, 112

        #
        # Pre-compute the first 8 AES states and leave 1/3/5 more rounds
        # for the loop.
        #
        addi 22, 23, -9 # process 8 keys
        mtctr 22 # AES key loop
        addi 10, 6, 144

        LOOP_8AES_STATE # process 8 AES keys

__PreLoop_aes_state_dec:
        lxv 32+1, 0(10) # round key
        AES_CIPHER_8x vcipher 15 1
        addi 10, 10, 16
        bdnz __PreLoop_aes_state_dec
        lxv 32+1, 0(10) # last round key (v1)

        cmpdi 12, 0 # Only one loop (8 blocks)
        beq __Finish_ghash_dec

#
# Loop 8x blocks and compute ghash
#
__Loop_8x_block_dec:
        vcipherlast 15, 15, 1
        vcipherlast 16, 16, 1
        vcipherlast 17, 17, 1
        vcipherlast 18, 18, 1
        vcipherlast 19, 19, 1
        vcipherlast 20, 20, 1
        vcipherlast 21, 21, 1
        vcipherlast 22, 22, 1

        lxvb16x 32+23, 0, 14 # load block
        lxvb16x 32+24, 15, 14 # load block
        lxvb16x 32+25, 16, 14 # load block
        lxvb16x 32+26, 17, 14 # load block
        lxvb16x 32+27, 18, 14 # load block
        lxvb16x 32+28, 19, 14 # load block
        lxvb16x 32+29, 20, 14 # load block
        lxvb16x 32+30, 21, 14 # load block
        addi 14, 14, 128

        vxor 15, 15, 23
        vxor 16, 16, 24
        vxor 17, 17, 25
        vxor 18, 18, 26
        vxor 19, 19, 27
        vxor 20, 20, 28
        vxor 21, 21, 29
        vxor 22, 22, 30

        stxvb16x 47, 0, 9 # store output
        stxvb16x 48, 15, 9 # store output
        stxvb16x 49, 16, 9 # store output
        stxvb16x 50, 17, 9 # store output
        stxvb16x 51, 18, 9 # store output
        stxvb16x 52, 19, 9 # store output
        stxvb16x 53, 20, 9 # store output
        stxvb16x 54, 21, 9 # store output

        addi 9, 9, 128

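        # GHASH runs over the ciphertext, so for decryption the original
        # input blocks (still in v23-v30) are copied back into v15-v22
        # before hashing.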
        vmr 15, 23
        vmr 16, 24
        vmr 17, 25
        vmr 18, 26
        vmr 19, 27
        vmr 20, 28
        vmr 21, 29
        vmr 22, 30

        # ghash here
        vxor 15, 15, 0
        PPC_GHASH4x 0, 15, 16, 17, 18

        vxor 19, 19, 0
        PPC_GHASH4x 0, 19, 20, 21, 22

        xxlor 32+15, 9, 9 # last state
        vadduwm 15, 15, 31 # state + counter
        vadduwm 16, 15, 31
        vadduwm 17, 16, 31
        vadduwm 18, 17, 31
        vadduwm 19, 18, 31
        vadduwm 20, 19, 31
        vadduwm 21, 20, 31
        vadduwm 22, 21, 31
        xxlor 9, 32+22, 32+22 # save last state

        xxlor 32+27, 0, 0 # restore roundkey 0
        vxor 15, 15, 27 # IV + round key - add round key 0
        vxor 16, 16, 27
        vxor 17, 17, 27
        vxor 18, 18, 27
        vxor 19, 19, 27
        vxor 20, 20, 27
        vxor 21, 21, 27
        vxor 22, 22, 27

        addi 5, 5, -128
        addi 11, 11, 128

        LOOP_8AES_STATE # process 8 AES keys
        mtctr 22 # AES key loop
        addi 10, 6, 144
__LastLoop_aes_state_dec:
        lxv 32+1, 0(10) # round key
        AES_CIPHER_8x vcipher 15 1
        addi 10, 10, 16
        bdnz __LastLoop_aes_state_dec
        lxv 32+1, 0(10) # last round key (v1)

        addi 12, 12, -1
        cmpdi 12, 0
        bne __Loop_8x_block_dec

__Finish_ghash_dec:
        vcipherlast 15, 15, 1
        vcipherlast 16, 16, 1
        vcipherlast 17, 17, 1
        vcipherlast 18, 18, 1
        vcipherlast 19, 19, 1
        vcipherlast 20, 20, 1
        vcipherlast 21, 21, 1
        vcipherlast 22, 22, 1

        lxvb16x 32+23, 0, 14 # load block
        lxvb16x 32+24, 15, 14 # load block
        lxvb16x 32+25, 16, 14 # load block
        lxvb16x 32+26, 17, 14 # load block
        lxvb16x 32+27, 18, 14 # load block
        lxvb16x 32+28, 19, 14 # load block
        lxvb16x 32+29, 20, 14 # load block
        lxvb16x 32+30, 21, 14 # load block
        addi 14, 14, 128

        vxor 15, 15, 23
        vxor 16, 16, 24
        vxor 17, 17, 25
        vxor 18, 18, 26
        vxor 19, 19, 27
        vxor 20, 20, 28
        vxor 21, 21, 29
        vxor 22, 22, 30

        stxvb16x 47, 0, 9 # store output
        stxvb16x 48, 15, 9 # store output
        stxvb16x 49, 16, 9 # store output
        stxvb16x 50, 17, 9 # store output
        stxvb16x 51, 18, 9 # store output
        stxvb16x 52, 19, 9 # store output
        stxvb16x 53, 20, 9 # store output
        stxvb16x 54, 21, 9 # store output
        addi 9, 9, 128

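        # The copy of the first ciphertext block and its xor with the
        # previous digest are folded into the single vxor below
        # (v15 = first input block ^ v0).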
        #vmr 15, 23
        vxor 15, 23, 0
        vmr 16, 24
        vmr 17, 25
        vmr 18, 26
        vmr 19, 27
        vmr 20, 28
        vmr 21, 29
        vmr 22, 30

        #vxor 15, 15, 0
        PPC_GHASH4x 0, 15, 16, 17, 18

        vxor 19, 19, 0
        PPC_GHASH4x 0, 19, 20, 21, 22

        xxlor 30+32, 9, 9 # last ctr
        vadduwm 30, 30, 31 # increase ctr
        stxvb16x 32+30, 0, 7 # update IV
        stxvb16x 32+0, 0, 8 # update Xi

        addi 5, 5, -128
        addi 11, 11, 128

        #
        # Done 8x blocks
        #

        cmpdi 5, 0
        beq aes_gcm_out

__Process_more_dec:
        li 24, 0 # decrypt
        bl aes_gcm_crypt_1x
        cmpdi 5, 0
        beq aes_gcm_out

        bl __Process_partial
        cmpdi 5, 0
        beq aes_gcm_out
__Do_combine_dec:
        bl __Combine_partial
        cmpdi 5, 0
        bgt __Process_decrypt
        b aes_gcm_out
SYM_FUNC_END(aes_p10_gcm_decrypt)

SYM_FUNC_START_LOCAL(aes_gcm_out)

        mr 3, 11 # return count

        RESTORE_REGS
        blr

__Invalid_msg_len:
        li 3, 0
        blr
SYM_FUNC_END(aes_gcm_out)

SYM_DATA_START_LOCAL(PERMX)
.align 4
# for vector permute and xor
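# Used by vpermxor in the GHASH macros to swap the two doublewords of the
# accumulated product and xor in the reduction value in a single step.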
permx:
.long 0x4c5d6e7f, 0x08192a3b, 0xc4d5e6f7, 0x8091a2b3
SYM_DATA_END(permx)