Linux kernel mirror (for testing)
git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel
os
linux
1/* SPDX-License-Identifier: GPL-2.0-or-later */
2#
3# Accelerated chacha20 implementation for ppc64le.
4#
5# Copyright 2023- IBM Corp. All rights reserved
6#
7#===================================================================================
8# Written by Danny Tsen <dtsen@us.ibm.com>
9#
10# do rounds, 8 quarter rounds
11# 1. a += b; d ^= a; d <<<= 16;
12# 2. c += d; b ^= c; b <<<= 12;
13# 3. a += b; d ^= a; d <<<= 8;
14# 4. c += d; b ^= c; b <<<= 7
15#
16# row1 = (row1 + row2), row4 = row1 xor row4, row4 rotate each word by 16
17# row3 = (row3 + row4), row2 = row3 xor row2, row2 rotate each word by 12
18# row1 = (row1 + row2), row4 = row1 xor row4, row4 rotate each word by 8
19# row3 = (row3 + row4), row2 = row3 xor row2, row2 rotate each word by 7
20#
21# 4 blocks (a b c d)
22#
23# a0 b0 c0 d0
24# a1 b1 c1 d1
25# ...
26# a4 b4 c4 d4
27# ...
28# a8 b8 c8 d8
29# ...
30# a12 b12 c12 d12
31# a13 ...
32# a14 ...
33# a15 b15 c15 d15
34#
35# Column round (v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
# Diagonal round (v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14)
37#
38
39#include <asm/ppc_asm.h>
40#include <asm/asm-offsets.h>
41#include <asm/asm-compat.h>
42#include <linux/linkage.h>
43
44.machine "any"
45.text
46
# Save one general-purpose register \GPR to the stack slot \OFFSET(\FRAME).
.macro SAVE_GPR GPR OFFSET FRAME
	std	\GPR,\OFFSET(\FRAME)
.endm
50
# Save vector register \VRS to \OFFSET(\FRAME).
# Clobbers r16 (used as the index register for stvx).
.macro SAVE_VRS VRS OFFSET FRAME
	li	16, \OFFSET
	stvx	\VRS, 16, \FRAME
.endm
55
# Save VSX register \VSX to \OFFSET(\FRAME).
# Clobbers r16 (used as the index register for stxvx).
.macro SAVE_VSX VSX OFFSET FRAME
	li	16, \OFFSET
	stxvx	\VSX, 16, \FRAME
.endm
60
# Reload general-purpose register \GPR from the stack slot \OFFSET(\FRAME).
.macro RESTORE_GPR GPR OFFSET FRAME
	ld	\GPR,\OFFSET(\FRAME)
.endm
64
# Reload vector register \VRS from \OFFSET(\FRAME).
# Clobbers r16 (used as the index register for lvx).
.macro RESTORE_VRS VRS OFFSET FRAME
	li	16, \OFFSET
	lvx	\VRS, 16, \FRAME
.endm
69
# Reload VSX register \VSX from \OFFSET(\FRAME).
# Clobbers r16 (used as the index register for lxvx).
.macro RESTORE_VSX VSX OFFSET FRAME
	li	16, \OFFSET
	lxvx	\VSX, 16, \FRAME
.endm
74
# Function prologue: allocate a 752-byte stack frame and save everything the
# body clobbers that the ABI makes nonvolatile:
#   - LR (to the caller's frame at 16(r1))
#   - GPRs r14-r31 at offsets 112..248
#   - VRs  v20-v31 at offsets 256..432 (via r9 = r1 + 256)
#   - VSX  vs14-vs31 at offsets 448..720; vs14-vs31 overlap FPRs f14-f31,
#     which are nonvolatile, and the code uses them as full 128-bit scratch.
# Clobbers r9 and r16 (via SAVE_VRS/SAVE_VSX).
.macro SAVE_REGS
	mflr 0
	std 0, 16(1)
	stdu 1,-752(1)

	SAVE_GPR 14, 112, 1
	SAVE_GPR 15, 120, 1
	SAVE_GPR 16, 128, 1
	SAVE_GPR 17, 136, 1
	SAVE_GPR 18, 144, 1
	SAVE_GPR 19, 152, 1
	SAVE_GPR 20, 160, 1
	SAVE_GPR 21, 168, 1
	SAVE_GPR 22, 176, 1
	SAVE_GPR 23, 184, 1
	SAVE_GPR 24, 192, 1
	SAVE_GPR 25, 200, 1
	SAVE_GPR 26, 208, 1
	SAVE_GPR 27, 216, 1
	SAVE_GPR 28, 224, 1
	SAVE_GPR 29, 232, 1
	SAVE_GPR 30, 240, 1
	SAVE_GPR 31, 248, 1

	addi	9, 1, 256
	SAVE_VRS 20, 0, 9
	SAVE_VRS 21, 16, 9
	SAVE_VRS 22, 32, 9
	SAVE_VRS 23, 48, 9
	SAVE_VRS 24, 64, 9
	SAVE_VRS 25, 80, 9
	SAVE_VRS 26, 96, 9
	SAVE_VRS 27, 112, 9
	SAVE_VRS 28, 128, 9
	SAVE_VRS 29, 144, 9
	SAVE_VRS 30, 160, 9
	SAVE_VRS 31, 176, 9

	SAVE_VSX 14, 192, 9
	SAVE_VSX 15, 208, 9
	SAVE_VSX 16, 224, 9
	SAVE_VSX 17, 240, 9
	SAVE_VSX 18, 256, 9
	SAVE_VSX 19, 272, 9
	SAVE_VSX 20, 288, 9
	SAVE_VSX 21, 304, 9
	SAVE_VSX 22, 320, 9
	SAVE_VSX 23, 336, 9
	SAVE_VSX 24, 352, 9
	SAVE_VSX 25, 368, 9
	SAVE_VSX 26, 384, 9
	SAVE_VSX 27, 400, 9
	SAVE_VSX 28, 416, 9
	SAVE_VSX 29, 432, 9
	SAVE_VSX 30, 448, 9
	SAVE_VSX 31, 464, 9
.endm # SAVE_REGS
132
# Function epilogue: restore everything SAVE_REGS saved (same offsets, reverse
# direction), pop the 752-byte frame and reload LR.
# Clobbers r9 and r16 (via RESTORE_VRS/RESTORE_VSX) before they are themselves
# restored from the GPR save area.
.macro RESTORE_REGS
	addi	9, 1, 256
	RESTORE_VRS 20, 0, 9
	RESTORE_VRS 21, 16, 9
	RESTORE_VRS 22, 32, 9
	RESTORE_VRS 23, 48, 9
	RESTORE_VRS 24, 64, 9
	RESTORE_VRS 25, 80, 9
	RESTORE_VRS 26, 96, 9
	RESTORE_VRS 27, 112, 9
	RESTORE_VRS 28, 128, 9
	RESTORE_VRS 29, 144, 9
	RESTORE_VRS 30, 160, 9
	RESTORE_VRS 31, 176, 9

	RESTORE_VSX 14, 192, 9
	RESTORE_VSX 15, 208, 9
	RESTORE_VSX 16, 224, 9
	RESTORE_VSX 17, 240, 9
	RESTORE_VSX 18, 256, 9
	RESTORE_VSX 19, 272, 9
	RESTORE_VSX 20, 288, 9
	RESTORE_VSX 21, 304, 9
	RESTORE_VSX 22, 320, 9
	RESTORE_VSX 23, 336, 9
	RESTORE_VSX 24, 352, 9
	RESTORE_VSX 25, 368, 9
	RESTORE_VSX 26, 384, 9
	RESTORE_VSX 27, 400, 9
	RESTORE_VSX 28, 416, 9
	RESTORE_VSX 29, 432, 9
	RESTORE_VSX 30, 448, 9
	RESTORE_VSX 31, 464, 9

	RESTORE_GPR 14, 112, 1
	RESTORE_GPR 15, 120, 1
	RESTORE_GPR 16, 128, 1
	RESTORE_GPR 17, 136, 1
	RESTORE_GPR 18, 144, 1
	RESTORE_GPR 19, 152, 1
	RESTORE_GPR 20, 160, 1
	RESTORE_GPR 21, 168, 1
	RESTORE_GPR 22, 176, 1
	RESTORE_GPR 23, 184, 1
	RESTORE_GPR 24, 192, 1
	RESTORE_GPR 25, 200, 1
	RESTORE_GPR 26, 208, 1
	RESTORE_GPR 27, 216, 1
	RESTORE_GPR 28, 224, 1
	RESTORE_GPR 29, 232, 1
	RESTORE_GPR 30, 240, 1
	RESTORE_GPR 31, 248, 1

	addi	1, 1, 752
	ld	0, 16(1)
	mtlr	0
.endm # RESTORE_REGS
190
# One double round (column round + diagonal round) over EIGHT blocks at once:
# blocks 0-3 live in v0-v15, blocks 4-7 in v16-v31.  Because all 32 VRs hold
# state, the rotate constants are kept in VSX scratch (vs20=rot16 mask,
# vs21=12, vs22=rot8 mask, vs23=7) and are swapped into v25/v28 around each
# use, spilling the displaced state word to vs0.
# The <<<16 and <<<8 rotations are done with vpermxor (xor + byte permute in
# one instruction); <<<12 and <<<7 use vrlw.
.macro QT_loop_8x
	# QR(v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
	xxlor	0, 32+25, 32+25		# spill v25 state to vs0
	xxlor	32+25, 20, 20		# v25 = rot16 permute mask
	vadduwm 0, 0, 4			# a += b (blocks 0-3)
	vadduwm 1, 1, 5
	vadduwm 2, 2, 6
	vadduwm 3, 3, 7
	vadduwm 16, 16, 20		# a += b (blocks 4-7)
	vadduwm 17, 17, 21
	vadduwm 18, 18, 22
	vadduwm 19, 19, 23

	vpermxor 12, 12, 0, 25		# d = (d ^ a) <<< 16
	vpermxor 13, 13, 1, 25
	vpermxor 14, 14, 2, 25
	vpermxor 15, 15, 3, 25
	vpermxor 28, 28, 16, 25
	vpermxor 29, 29, 17, 25
	vpermxor 30, 30, 18, 25
	vpermxor 31, 31, 19, 25
	xxlor	32+25, 0, 0		# restore v25 state
	vadduwm 8, 8, 12		# c += d
	vadduwm 9, 9, 13
	vadduwm 10, 10, 14
	vadduwm 11, 11, 15
	vadduwm 24, 24, 28
	vadduwm 25, 25, 29
	vadduwm 26, 26, 30
	vadduwm 27, 27, 31
	vxor 4, 4, 8			# b ^= c
	vxor 5, 5, 9
	vxor 6, 6, 10
	vxor 7, 7, 11
	vxor 20, 20, 24
	vxor 21, 21, 25
	vxor 22, 22, 26
	vxor 23, 23, 27

	xxlor	0, 32+25, 32+25		# spill v25, load rotate count 12
	xxlor	32+25, 21, 21
	vrlw 4, 4, 25			# b <<<= 12 (blocks 0-3)
	vrlw 5, 5, 25
	vrlw 6, 6, 25
	vrlw 7, 7, 25
	vrlw 20, 20, 25			# b <<<= 12 (blocks 4-7)
	vrlw 21, 21, 25
	vrlw 22, 22, 25
	vrlw 23, 23, 25
	xxlor	32+25, 0, 0		# restore v25 state
	vadduwm 0, 0, 4			# a += b
	vadduwm 1, 1, 5
	vadduwm 2, 2, 6
	vadduwm 3, 3, 7
	vadduwm 16, 16, 20
	vadduwm 17, 17, 21
	vadduwm 18, 18, 22
	vadduwm 19, 19, 23

	xxlor	0, 32+25, 32+25		# spill v25, load rot8 permute mask
	xxlor	32+25, 22, 22
	vpermxor 12, 12, 0, 25		# d = (d ^ a) <<< 8
	vpermxor 13, 13, 1, 25
	vpermxor 14, 14, 2, 25
	vpermxor 15, 15, 3, 25
	vpermxor 28, 28, 16, 25
	vpermxor 29, 29, 17, 25
	vpermxor 30, 30, 18, 25
	vpermxor 31, 31, 19, 25
	xxlor	32+25, 0, 0		# restore v25 state
	vadduwm 8, 8, 12		# c += d
	vadduwm 9, 9, 13
	vadduwm 10, 10, 14
	vadduwm 11, 11, 15
	vadduwm 24, 24, 28
	vadduwm 25, 25, 29
	vadduwm 26, 26, 30
	vadduwm 27, 27, 31
	xxlor	0, 32+28, 32+28		# spill v28, load rotate count 7
	xxlor	32+28, 23, 23
	vxor 4, 4, 8			# b ^= c
	vxor 5, 5, 9
	vxor 6, 6, 10
	vxor 7, 7, 11
	vxor 20, 20, 24
	vxor 21, 21, 25
	vxor 22, 22, 26
	vxor 23, 23, 27
	vrlw 4, 4, 28			# b <<<= 7 (blocks 0-3)
	vrlw 5, 5, 28
	vrlw 6, 6, 28
	vrlw 7, 7, 28
	vrlw 20, 20, 28			# b <<<= 7 (blocks 4-7)
	vrlw 21, 21, 28
	vrlw 22, 22, 28
	vrlw 23, 23, 28
	xxlor	32+28, 0, 0		# restore v28 state

	# QR(v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14)
	xxlor	0, 32+25, 32+25		# spill v25, load rot16 permute mask
	xxlor	32+25, 20, 20
	vadduwm 0, 0, 5			# a += b (diagonal pairing)
	vadduwm 1, 1, 6
	vadduwm 2, 2, 7
	vadduwm 3, 3, 4
	vadduwm 16, 16, 21
	vadduwm 17, 17, 22
	vadduwm 18, 18, 23
	vadduwm 19, 19, 20

	vpermxor 15, 15, 0, 25		# d = (d ^ a) <<< 16
	vpermxor 12, 12, 1, 25
	vpermxor 13, 13, 2, 25
	vpermxor 14, 14, 3, 25
	vpermxor 31, 31, 16, 25
	vpermxor 28, 28, 17, 25
	vpermxor 29, 29, 18, 25
	vpermxor 30, 30, 19, 25

	xxlor	32+25, 0, 0		# restore v25 state
	vadduwm 10, 10, 15		# c += d
	vadduwm 11, 11, 12
	vadduwm 8, 8, 13
	vadduwm 9, 9, 14
	vadduwm 26, 26, 31
	vadduwm 27, 27, 28
	vadduwm 24, 24, 29
	vadduwm 25, 25, 30
	vxor 5, 5, 10			# b ^= c
	vxor 6, 6, 11
	vxor 7, 7, 8
	vxor 4, 4, 9
	vxor 21, 21, 26
	vxor 22, 22, 27
	vxor 23, 23, 24
	vxor 20, 20, 25

	xxlor	0, 32+25, 32+25		# spill v25, load rotate count 12
	xxlor	32+25, 21, 21
	vrlw 5, 5, 25			# b <<<= 12
	vrlw 6, 6, 25
	vrlw 7, 7, 25
	vrlw 4, 4, 25
	vrlw 21, 21, 25
	vrlw 22, 22, 25
	vrlw 23, 23, 25
	vrlw 20, 20, 25
	xxlor	32+25, 0, 0		# restore v25 state

	vadduwm 0, 0, 5			# a += b
	vadduwm 1, 1, 6
	vadduwm 2, 2, 7
	vadduwm 3, 3, 4
	vadduwm 16, 16, 21
	vadduwm 17, 17, 22
	vadduwm 18, 18, 23
	vadduwm 19, 19, 20

	xxlor	0, 32+25, 32+25		# spill v25, load rot8 permute mask
	xxlor	32+25, 22, 22
	vpermxor 15, 15, 0, 25		# d = (d ^ a) <<< 8
	vpermxor 12, 12, 1, 25
	vpermxor 13, 13, 2, 25
	vpermxor 14, 14, 3, 25
	vpermxor 31, 31, 16, 25
	vpermxor 28, 28, 17, 25
	vpermxor 29, 29, 18, 25
	vpermxor 30, 30, 19, 25
	xxlor	32+25, 0, 0		# restore v25 state

	vadduwm 10, 10, 15		# c += d
	vadduwm 11, 11, 12
	vadduwm 8, 8, 13
	vadduwm 9, 9, 14
	vadduwm 26, 26, 31
	vadduwm 27, 27, 28
	vadduwm 24, 24, 29
	vadduwm 25, 25, 30

	xxlor	0, 32+28, 32+28		# spill v28, load rotate count 7
	xxlor	32+28, 23, 23
	vxor 5, 5, 10			# b ^= c
	vxor 6, 6, 11
	vxor 7, 7, 8
	vxor 4, 4, 9
	vxor 21, 21, 26
	vxor 22, 22, 27
	vxor 23, 23, 24
	vxor 20, 20, 25
	vrlw 5, 5, 28			# b <<<= 7
	vrlw 6, 6, 28
	vrlw 7, 7, 28
	vrlw 4, 4, 28
	vrlw 21, 21, 28
	vrlw 22, 22, 28
	vrlw 23, 23, 28
	vrlw 20, 20, 28
	xxlor	32+28, 0, 0		# restore v28 state
.endm
390
# One double round (column + diagonal) over FOUR blocks held in v0-v15.
# Rotate helpers live directly in VRs here (no spilling needed):
# v20 = rot16 permute mask, v21 = 12, v22 = rot8 permute mask, v23 = 7.
.macro QT_loop_4x
	# QR(v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
	vadduwm 0, 0, 4			# a += b
	vadduwm 1, 1, 5
	vadduwm 2, 2, 6
	vadduwm 3, 3, 7
	vpermxor 12, 12, 0, 20		# d = (d ^ a) <<< 16
	vpermxor 13, 13, 1, 20
	vpermxor 14, 14, 2, 20
	vpermxor 15, 15, 3, 20
	vadduwm 8, 8, 12		# c += d
	vadduwm 9, 9, 13
	vadduwm 10, 10, 14
	vadduwm 11, 11, 15
	vxor 4, 4, 8			# b ^= c
	vxor 5, 5, 9
	vxor 6, 6, 10
	vxor 7, 7, 11
	vrlw 4, 4, 21			# b <<<= 12
	vrlw 5, 5, 21
	vrlw 6, 6, 21
	vrlw 7, 7, 21
	vadduwm 0, 0, 4			# a += b
	vadduwm 1, 1, 5
	vadduwm 2, 2, 6
	vadduwm 3, 3, 7
	vpermxor 12, 12, 0, 22		# d = (d ^ a) <<< 8
	vpermxor 13, 13, 1, 22
	vpermxor 14, 14, 2, 22
	vpermxor 15, 15, 3, 22
	vadduwm 8, 8, 12		# c += d
	vadduwm 9, 9, 13
	vadduwm 10, 10, 14
	vadduwm 11, 11, 15
	vxor 4, 4, 8			# b ^= c
	vxor 5, 5, 9
	vxor 6, 6, 10
	vxor 7, 7, 11
	vrlw 4, 4, 23			# b <<<= 7
	vrlw 5, 5, 23
	vrlw 6, 6, 23
	vrlw 7, 7, 23

	# QR(v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14)
	vadduwm 0, 0, 5			# a += b (diagonal pairing)
	vadduwm 1, 1, 6
	vadduwm 2, 2, 7
	vadduwm 3, 3, 4
	vpermxor 15, 15, 0, 20		# d = (d ^ a) <<< 16
	vpermxor 12, 12, 1, 20
	vpermxor 13, 13, 2, 20
	vpermxor 14, 14, 3, 20
	vadduwm 10, 10, 15		# c += d
	vadduwm 11, 11, 12
	vadduwm 8, 8, 13
	vadduwm 9, 9, 14
	vxor 5, 5, 10			# b ^= c
	vxor 6, 6, 11
	vxor 7, 7, 8
	vxor 4, 4, 9
	vrlw 5, 5, 21			# b <<<= 12
	vrlw 6, 6, 21
	vrlw 7, 7, 21
	vrlw 4, 4, 21
	vadduwm 0, 0, 5			# a += b
	vadduwm 1, 1, 6
	vadduwm 2, 2, 7
	vadduwm 3, 3, 4
	vpermxor 15, 15, 0, 22		# d = (d ^ a) <<< 8
	vpermxor 12, 12, 1, 22
	vpermxor 13, 13, 2, 22
	vpermxor 14, 14, 3, 22
	vadduwm 10, 10, 15		# c += d
	vadduwm 11, 11, 12
	vadduwm 8, 8, 13
	vadduwm 9, 9, 14
	vxor 5, 5, 10			# b ^= c
	vxor 6, 6, 11
	vxor 7, 7, 8
	vxor 4, 4, 9
	vrlw 5, 5, 23			# b <<<= 7
	vrlw 6, 6, 23
	vrlw 7, 7, 23
	vrlw 4, 4, 23
.endm
476
477# Transpose
# Transpose a 4x4 matrix of 32-bit words held across the four vector
# registers \a0..\a3, so each register ends up holding one block's row.
# Clobbers vs10-vs13 as scratch.
.macro TP_4x a0 a1 a2 a3
	xxmrghw	10, 32+\a0, 32+\a1	# a0, a1, b0, b1
	xxmrghw	11, 32+\a2, 32+\a3	# a2, a3, b2, b3
	xxmrglw	12, 32+\a0, 32+\a1	# c0, c1, d0, d1
	xxmrglw	13, 32+\a2, 32+\a3	# c2, c3, d2, d3
	xxpermdi	32+\a0, 10, 11, 0	# a0, a1, a2, a3
	xxpermdi	32+\a1, 10, 11, 3	# b0, b1, b2, b3
	xxpermdi	32+\a2, 12, 13, 0	# c0, c1, c2, c3
	xxpermdi	32+\a3, 12, 13, 3	# d0, d1, d2, d3
.endm
488
489# key stream = working state + state
# key stream = working state + state
# Add the original input state rows (held in v16-v19 when \S=0, or in
# v0-v3 when \S=16) to the 16 working-state vectors v\S+0 .. v\S+15.
# Row k of every block gets state row k (vectors \S+k, \S+4+k, \S+8+k,
# \S+12+k all receive row 16-\S + ... pattern below).
.macro Add_state S
	vadduwm \S+0, \S+0, 16-\S
	vadduwm \S+4, \S+4, 17-\S
	vadduwm \S+8, \S+8, 18-\S
	vadduwm \S+12, \S+12, 19-\S

	vadduwm \S+1, \S+1, 16-\S
	vadduwm \S+5, \S+5, 17-\S
	vadduwm \S+9, \S+9, 18-\S
	vadduwm \S+13, \S+13, 19-\S

	vadduwm \S+2, \S+2, 16-\S
	vadduwm \S+6, \S+6, 17-\S
	vadduwm \S+10, \S+10, 18-\S
	vadduwm \S+14, \S+14, 19-\S

	vadduwm \S+3, \S+3, 16-\S
	vadduwm \S+7, \S+7, 17-\S
	vadduwm \S+11, \S+11, 18-\S
	vadduwm \S+15, \S+15, 19-\S
.endm
511
512#
513# write 256 bytes
514#
#
# Write 256 bytes of output: load 16 vectors of src (r5) at the current
# offset (r14), xor them with the key-stream vectors vs\S+32..vs\S+47, and
# store to dst (r4) at the same offset.
# r17-r31 hold the byte offsets 16, 32, ..., 240 (set up by the caller).
# Clobbers r9 (src pointer), r16 (dst pointer) and vs0-vs15.
#
.macro Write_256 S
	add 9, 14, 5			# r9  = src + offset
	add 16, 14, 4			# r16 = dst + offset
	lxvw4x 0, 0, 9
	lxvw4x 1, 17, 9
	lxvw4x 2, 18, 9
	lxvw4x 3, 19, 9
	lxvw4x 4, 20, 9
	lxvw4x 5, 21, 9
	lxvw4x 6, 22, 9
	lxvw4x 7, 23, 9
	lxvw4x 8, 24, 9
	lxvw4x 9, 25, 9
	lxvw4x 10, 26, 9
	lxvw4x 11, 27, 9
	lxvw4x 12, 28, 9
	lxvw4x 13, 29, 9
	lxvw4x 14, 30, 9
	lxvw4x 15, 31, 9

	# xor key stream with plaintext; note the interleaved order matches
	# the block layout left by TP_4x (rows of block n are \S+n, \S+4+n, ...)
	xxlxor \S+32, \S+32, 0
	xxlxor \S+36, \S+36, 1
	xxlxor \S+40, \S+40, 2
	xxlxor \S+44, \S+44, 3
	xxlxor \S+33, \S+33, 4
	xxlxor \S+37, \S+37, 5
	xxlxor \S+41, \S+41, 6
	xxlxor \S+45, \S+45, 7
	xxlxor \S+34, \S+34, 8
	xxlxor \S+38, \S+38, 9
	xxlxor \S+42, \S+42, 10
	xxlxor \S+46, \S+46, 11
	xxlxor \S+35, \S+35, 12
	xxlxor \S+39, \S+39, 13
	xxlxor \S+43, \S+43, 14
	xxlxor \S+47, \S+47, 15

	stxvw4x \S+32, 0, 16
	stxvw4x \S+36, 17, 16
	stxvw4x \S+40, 18, 16
	stxvw4x \S+44, 19, 16

	stxvw4x \S+33, 20, 16
	stxvw4x \S+37, 21, 16
	stxvw4x \S+41, 22, 16
	stxvw4x \S+45, 23, 16

	stxvw4x \S+34, 24, 16
	stxvw4x \S+38, 25, 16
	stxvw4x \S+42, 26, 16
	stxvw4x \S+46, 27, 16

	stxvw4x \S+35, 28, 16
	stxvw4x \S+39, 29, 16
	stxvw4x \S+43, 30, 16
	stxvw4x \S+47, 31, 16

.endm
573
574#
575# void chacha_p10le_8x(const struct chacha_state *state, u8 *dst, const u8 *src,
576# unsigned int len, int nrounds);
577#
SYM_FUNC_START(chacha_p10le_8x)
.align 5
	# In (ELFv2): r3 = state, r4 = dst, r5 = src, r6 = len (bytes),
	#             r7 = nrounds.
	# Processes data in 256-byte chunks (8 blocks per Loop_8x pass,
	# 4 blocks per Loop_4x pass); any tail smaller than 256 bytes is not
	# consumed here — presumably handled by the C caller; confirm there.
	cmpdi	6, 0
	ble	Out_no_chacha		# len <= 0: return 0, nothing to do

	SAVE_REGS

	# r17 - r31 mainly for Write_256 macro.
	li	17, 16
	li	18, 32
	li	19, 48
	li	20, 64
	li	21, 80
	li	22, 96
	li	23, 112
	li	24, 128
	li	25, 144
	li	26, 160
	li	27, 176
	li	28, 192
	li	29, 208
	li	30, 224
	li	31, 240

	mr 15, 6			# len
	li 14, 0			# offset to inp and outp

	lxvw4x	48, 0, 3		# vr16, constants
	lxvw4x	49, 17, 3		# vr17, key 1
	lxvw4x	50, 18, 3		# vr18, key 2
	lxvw4x	51, 19, 3		# vr19, counter, nonce

	# create (0, 1, 2, 3) counters
	vspltisw 0, 0
	vspltisw 1, 1
	vspltisw 2, 2
	vspltisw 3, 3
	vmrghw	4, 0, 1
	vmrglw	5, 2, 3
	vsldoi	30, 4, 5, 8		# vr30 counter, 4 (0, 1, 2, 3)

	vspltisw 21, 12			# v21 = rotate count 12
	vspltisw 23, 7			# v23 = rotate count 7

	addis	11, 2, permx@toc@ha	# load vpermxor rotate masks
	addi	11, 11, permx@toc@l
	lxvw4x	32+20, 0, 11		# v20 = rot16 mask
	lxvw4x	32+22, 17, 11		# v22 = rot8 mask

	sradi	8, 7, 1			# r8 = nrounds / 2 (double rounds)

	mtctr 8

	# save constants to vsx
	xxlor	16, 48, 48
	xxlor	17, 49, 49
	xxlor	18, 50, 50
	xxlor	19, 51, 51

	vspltisw 25, 4
	vspltisw 26, 8

	xxlor	25, 32+26, 32+26	# vs25 = (8,8,8,8) counter stride
	xxlor	24, 32+25, 32+25	# vs24 = (4,4,4,4)

	vadduwm	31, 30, 25		# counter = (0, 1, 2, 3) + (4, 4, 4, 4)
	xxlor	30, 32+30, 32+30	# vs30 = counters for blocks 0-3
	xxlor	31, 32+31, 32+31	# vs31 = counters for blocks 4-7

	xxlor	20, 32+20, 32+20	# stash rotate helpers in vs20-vs23
	xxlor	21, 32+21, 32+21	# (QT_loop_8x reloads them from here)
	xxlor	22, 32+22, 32+22
	xxlor	23, 32+23, 32+23

	cmpdi	6, 512
	blt	Loop_last		# < 512 bytes: use the 4-block path

Loop_8x:
	# splat state words into v0-v15 (blocks 0-3) ...
	xxspltw  32+0, 16, 0
	xxspltw  32+1, 16, 1
	xxspltw  32+2, 16, 2
	xxspltw  32+3, 16, 3

	xxspltw  32+4, 17, 0
	xxspltw  32+5, 17, 1
	xxspltw  32+6, 17, 2
	xxspltw  32+7, 17, 3
	xxspltw  32+8, 18, 0
	xxspltw  32+9, 18, 1
	xxspltw  32+10, 18, 2
	xxspltw  32+11, 18, 3
	xxspltw  32+12, 19, 0
	xxspltw  32+13, 19, 1
	xxspltw  32+14, 19, 2
	xxspltw  32+15, 19, 3
	vadduwm	12, 12, 30		# increase counter

	# ... and into v16-v31 (blocks 4-7)
	xxspltw  32+16, 16, 0
	xxspltw  32+17, 16, 1
	xxspltw  32+18, 16, 2
	xxspltw  32+19, 16, 3

	xxspltw  32+20, 17, 0
	xxspltw  32+21, 17, 1
	xxspltw  32+22, 17, 2
	xxspltw  32+23, 17, 3
	xxspltw  32+24, 18, 0
	xxspltw  32+25, 18, 1
	xxspltw  32+26, 18, 2
	xxspltw  32+27, 18, 3
	xxspltw  32+28, 19, 0
	xxspltw  32+29, 19, 1
	vadduwm	28, 28, 31		# increase counter
	xxspltw  32+30, 19, 2
	xxspltw  32+31, 19, 3

.align 5
quarter_loop_8x:
	QT_loop_8x

	bdnz	quarter_loop_8x

	# working state += input state; counter words get the block offsets
	xxlor	0, 32+30, 32+30
	xxlor	32+30, 30, 30
	vadduwm	12, 12, 30
	xxlor	32+30, 0, 0
	TP_4x 0, 1, 2, 3
	TP_4x 4, 5, 6, 7
	TP_4x 8, 9, 10, 11
	TP_4x 12, 13, 14, 15

	xxlor	0, 48, 48		# preserve v16-v19 working state in vs0-3
	xxlor	1, 49, 49
	xxlor	2, 50, 50
	xxlor	3, 51, 51
	xxlor	48, 16, 16		# v16-v19 = input state for Add_state
	xxlor	49, 17, 17
	xxlor	50, 18, 18
	xxlor	51, 19, 19
	Add_state 0
	xxlor	48, 0, 0		# restore v16-v19 working state
	xxlor	49, 1, 1
	xxlor	50, 2, 2
	xxlor	51, 3, 3
	Write_256 0
	addi	14, 14, 256		# offset += 256
	addi	15, 15, -256		# len -= 256

	# second half: blocks 4-7 in v16-v31
	xxlor	5, 32+31, 32+31
	xxlor	32+31, 31, 31
	vadduwm	28, 28, 31
	xxlor	32+31, 5, 5
	TP_4x 16+0, 16+1, 16+2, 16+3
	TP_4x 16+4, 16+5, 16+6, 16+7
	TP_4x 16+8, 16+9, 16+10, 16+11
	TP_4x 16+12, 16+13, 16+14, 16+15

	xxlor	32, 16, 16		# v0-v3 = input state for Add_state 16
	xxlor	33, 17, 17
	xxlor	34, 18, 18
	xxlor	35, 19, 19
	Add_state 16
	Write_256 16
	addi	14, 14, 256		# offset += 256
	addi	15, 15, -256		# len -= 256

	# advance both counter vectors by 8 blocks
	xxlor	32+24, 24, 24
	xxlor	32+25, 25, 25
	xxlor	32+30, 30, 30
	vadduwm	30, 30, 25
	vadduwm	31, 30, 24
	xxlor	30, 32+30, 32+30
	xxlor	31, 32+31, 32+31

	cmpdi	15, 0
	beq	Out_loop

	cmpdi	15, 512
	blt	Loop_last

	mtctr 8
	b	Loop_8x

Loop_last:
	lxvw4x	48, 0, 3		# vr16, constants
	lxvw4x	49, 17, 3		# vr17, key 1
	lxvw4x	50, 18, 3		# vr18, key 2
	lxvw4x	51, 19, 3		# vr19, counter, nonce

	vspltisw 21, 12			# re-establish rotate helpers in VRs
	vspltisw 23, 7			# (the 8x path clobbered v20-v23)
	addis	11, 2, permx@toc@ha
	addi	11, 11, permx@toc@l
	lxvw4x	32+20, 0, 11
	lxvw4x	32+22, 17, 11

	sradi	8, 7, 1
	mtctr 8

Loop_4x:
	# splat state words into v0-v15 (4 blocks)
	vspltw	0, 16, 0
	vspltw  1, 16, 1
	vspltw  2, 16, 2
	vspltw  3, 16, 3

	vspltw  4, 17, 0
	vspltw  5, 17, 1
	vspltw  6, 17, 2
	vspltw  7, 17, 3
	vspltw  8, 18, 0
	vspltw  9, 18, 1
	vspltw  10, 18, 2
	vspltw  11, 18, 3
	vspltw  12, 19, 0
	vadduwm	12, 12, 30		# increase counter
	vspltw  13, 19, 1
	vspltw  14, 19, 2
	vspltw  15, 19, 3

.align 5
quarter_loop:
	QT_loop_4x

	bdnz	quarter_loop

	vadduwm	12, 12, 30
	TP_4x 0, 1, 2, 3
	TP_4x 4, 5, 6, 7
	TP_4x 8, 9, 10, 11
	TP_4x 12, 13, 14, 15

	Add_state 0
	Write_256 0
	addi	14, 14, 256		# offset += 256
	addi	15, 15, -256		# len -= 256

	# Update state counter
	vspltisw 25, 4
	vadduwm	30, 30, 25

	cmpdi	15, 0
	beq	Out_loop
	cmpdi	15, 256
	blt	Out_loop		# tail < 256 bytes is left unprocessed

	mtctr 8
	b	Loop_4x

Out_loop:
	RESTORE_REGS
	blr

Out_no_chacha:
	li	3, 0
	blr
SYM_FUNC_END(chacha_p10le_8x)
834
# Byte-permute masks for vpermxor: the first quadword rotates each 32-bit
# word left by 16 bits, the second rotates each word left by 8 bits
# (byte indices are little-endian within each word).
SYM_DATA_START_LOCAL(PERMX)
.align 5
permx:
.long 0x22330011, 0x66774455, 0xaabb8899, 0xeeffccdd
.long 0x11223300, 0x55667744, 0x99aabb88, 0xddeeffcc
SYM_DATA_END(PERMX)