#ifndef __KERNEL__
# include "arm_arch.h"
.extern	OPENSSL_armcap_P
#endif

.text

// forward "declarations" are required for Apple
.globl	poly1305_blocks
.globl	poly1305_emit

.globl	poly1305_init
.type	poly1305_init,%function
.align	5
poly1305_init:
	cmp	x1,xzr
	stp	xzr,xzr,[x0]		// zero hash value
	stp	xzr,xzr,[x0,#16]	// [along with is_base2_26]

	csel	x0,xzr,x0,eq
	b.eq	.Lno_key

#ifndef __KERNEL__
	adrp	x17,OPENSSL_armcap_P
	ldr	w17,[x17,#:lo12:OPENSSL_armcap_P]
#endif

	ldp	x7,x8,[x1]		// load key
	mov	x9,#0xfffffffc0fffffff
	movk	x9,#0x0fff,lsl#48
#ifdef	__AARCH64EB__
	rev	x7,x7			// flip bytes
	rev	x8,x8
#endif
	and	x7,x7,x9		// &=0ffffffc0fffffff
	and	x9,x9,#-4
	and	x8,x8,x9		// &=0ffffffc0ffffffc
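	// The two masks implement the Poly1305 clamp (RFC 7539): taken
	// together, r &= 0x0ffffffc0ffffffc0ffffffc0fffffff, i.e. the top
	// 4 bits of every 32-bit word of r and the low 2 bits of the upper
	// three words are cleared (x9 = 0x0ffffffc0fffffff for the low
	// half, x9 & -4 = 0x0ffffffc0ffffffc for the high half).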
	mov	w9,#-1
	stp	x7,x8,[x0,#32]		// save key value
	str	w9,[x0,#48]		// impossible key power value
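	// No clamped key power can have a limb equal to -1, so this value
	// marks the r^n table as not yet computed; poly1305_blocks_neon
	// tests this slot (cmp w17,#-1 below), and the slot is overwritten
	// once the table is filled in.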

#ifndef __KERNEL__
	tst	w17,#ARMV7_NEON

	adr	x12,.Lpoly1305_blocks
	adr	x7,.Lpoly1305_blocks_neon
	adr	x13,.Lpoly1305_emit

	csel	x12,x12,x7,eq

# ifdef	__ILP32__
	stp	w12,w13,[x2]
# else
	stp	x12,x13,[x2]
# endif
#endif
	mov	x0,#1
.Lno_key:
	ret
.size	poly1305_init,.-poly1305_init
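
// State layout as used by the code in this file (offsets from the
// context pointer in x0):
//   [0]  hash h: two 64-bit limbs plus h2 at #16 (base 2^64), or five
//        32-bit limbs h0..h4 (base 2^26)
//   [24] is_base2_26 flag selecting between the two radixes
//   [32] clamped key r0,r1
//   [48] table of r^1..r^4 in base 2^26, filled by poly1305_splat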

.type	poly1305_blocks,%function
.align	5
poly1305_blocks:
.Lpoly1305_blocks:
	ands	x2,x2,#-16
	b.eq	.Lno_data

	ldp	x4,x5,[x0]		// load hash value
	ldp	x6,x17,[x0,#16]		// [along with is_base2_26]
	ldp	x7,x8,[x0,#32]		// load key value

#ifdef	__AARCH64EB__
	lsr	x12,x4,#32
	mov	w13,w4
	lsr	x14,x5,#32
	mov	w15,w5
	lsr	x16,x6,#32
#else
	mov	w12,w4
	lsr	x13,x4,#32
	mov	w14,w5
	lsr	x15,x5,#32
	mov	w16,w6
#endif

	add	x12,x12,x13,lsl#26	// base 2^26 -> base 2^64
	lsr	x13,x14,#12
	adds	x12,x12,x14,lsl#52
	add	x13,x13,x15,lsl#14
	adc	x13,x13,xzr
	lsr	x14,x16,#24
	adds	x13,x13,x16,lsl#40
	adc	x14,x14,xzr
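	// The conversion above packs five 26-bit limbs h0..h4 into 2.5
	// 64-bit words:
	//   d0 = h0 | h1<<26 | h2<<52
	//   d1 = h2>>12 | h3<<14 | h4<<40	(plus carry out of d0)
	//   d2 = h4>>24			(plus carry out of d1)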

	cmp	x17,#0			// is_base2_26?
	add	x9,x8,x8,lsr#2		// s1 = r1 + (r1 >> 2)
	csel	x4,x4,x12,eq		// choose between radixes
	csel	x5,x5,x13,eq
	csel	x6,x6,x14,eq

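	// Per 16-byte block: h = (h + m + padbit*2^128) * r (mod 2^130-5),
	// with the pad bit supplied by the caller in x3.  s1 above is
	// r1*5/4: the h1*r1 term carries weight 2^128 = 2^130/4 = 5/4
	// (mod p), and r1 is a multiple of 4 after clamping, so
	// r1 + (r1 >> 2) is exact.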
.Loop:
	ldp	x10,x11,[x1],#16	// load input
	sub	x2,x2,#16
#ifdef	__AARCH64EB__
	rev	x10,x10
	rev	x11,x11
#endif
	adds	x4,x4,x10		// accumulate input
	adcs	x5,x5,x11

	mul	x12,x4,x7		// h0*r0
	adc	x6,x6,x3
	umulh	x13,x4,x7

	mul	x10,x5,x9		// h1*5*r1
	umulh	x11,x5,x9

	adds	x12,x12,x10
	mul	x10,x4,x8		// h0*r1
	adc	x13,x13,x11
	umulh	x14,x4,x8

	adds	x13,x13,x10
	mul	x10,x5,x7		// h1*r0
	adc	x14,x14,xzr
	umulh	x11,x5,x7

	adds	x13,x13,x10
	mul	x10,x6,x9		// h2*5*r1
	adc	x14,x14,x11
	mul	x11,x6,x7		// h2*r0

	adds	x13,x13,x10
	adc	x14,x14,x11

	and	x10,x14,#-4		// final reduction
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x12,x10
	adcs	x5,x13,xzr
	adc	x6,x6,xzr
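	// Final reduction: write x14 = 4*c + t with t = x14 & 3.  The
	// 2^128 limb then splits as c*2^130 + t*2^128, and 2^130 = 5
	// (mod p), so 5*c = (x14 & -4) + (x14 >> 2) is folded back into
	// the low limb while t stays behind as h2.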

	cbnz	x2,.Loop

	stp	x4,x5,[x0]		// store hash value
	stp	x6,xzr,[x0,#16]		// [and clear is_base2_26]

.Lno_data:
	ret
.size	poly1305_blocks,.-poly1305_blocks

.type	poly1305_emit,%function
.align	5
poly1305_emit:
.Lpoly1305_emit:
	ldp	x4,x5,[x0]		// load hash base 2^64
	ldp	x6,x7,[x0,#16]		// [along with is_base2_26]
	ldp	x10,x11,[x2]		// load nonce

#ifdef	__AARCH64EB__
	lsr	x12,x4,#32
	mov	w13,w4
	lsr	x14,x5,#32
	mov	w15,w5
	lsr	x16,x6,#32
#else
	mov	w12,w4
	lsr	x13,x4,#32
	mov	w14,w5
	lsr	x15,x5,#32
	mov	w16,w6
#endif

	add	x12,x12,x13,lsl#26	// base 2^26 -> base 2^64
	lsr	x13,x14,#12
	adds	x12,x12,x14,lsl#52
	add	x13,x13,x15,lsl#14
	adc	x13,x13,xzr
	lsr	x14,x16,#24
	adds	x13,x13,x16,lsl#40
	adc	x14,x14,xzr

	cmp	x7,#0			// is_base2_26?
	csel	x4,x4,x12,eq		// choose between radixes
	csel	x5,x5,x13,eq
	csel	x6,x6,x14,eq

	adds	x12,x4,#5		// compare to modulus
	adcs	x13,x5,xzr
	adc	x14,x6,xzr
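	// h is only partially reduced, so test h >= p by adding 5: if h+5
	// reaches 2^130 (any bit at position 130 or above set, i.e.
	// x14 & -4 != 0), the low limbs of h+5 equal h-p and are selected
	// below; otherwise h is kept as is.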

	tst	x14,#-4			// see if it's carried/borrowed

	csel	x4,x4,x12,eq
	csel	x5,x5,x13,eq

#ifdef	__AARCH64EB__
	ror	x10,x10,#32		// flip nonce words
	ror	x11,x11,#32
#endif
	adds	x4,x4,x10		// accumulate nonce
	adc	x5,x5,x11
#ifdef	__AARCH64EB__
	rev	x4,x4			// flip output bytes
	rev	x5,x5
#endif
	stp	x4,x5,[x1]		// write result

	ret
.size	poly1305_emit,.-poly1305_emit
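
// Internal helper with a bespoke register convention: accumulator in
// x4-x6 (h0,h1,h2), key in x7,x8 (r0,r1), s1 = r1 + (r1 >> 2) in x9;
// returns h*r mod 2^130-5, partially reduced, in x4-x6.  Same sequence
// as the body of .Loop above.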
.type	poly1305_mult,%function
.align	5
poly1305_mult:
	mul	x12,x4,x7		// h0*r0
	umulh	x13,x4,x7

	mul	x10,x5,x9		// h1*5*r1
	umulh	x11,x5,x9

	adds	x12,x12,x10
	mul	x10,x4,x8		// h0*r1
	adc	x13,x13,x11
	umulh	x14,x4,x8

	adds	x13,x13,x10
	mul	x10,x5,x7		// h1*r0
	adc	x14,x14,xzr
	umulh	x11,x5,x7

	adds	x13,x13,x10
	mul	x10,x6,x9		// h2*5*r1
	adc	x14,x14,x11
	mul	x11,x6,x7		// h2*r0

	adds	x13,x13,x10
	adc	x14,x14,x11

	and	x10,x14,#-4		// final reduction
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x12,x10
	adcs	x5,x13,xzr
	adc	x6,x6,xzr

	ret
.size	poly1305_mult,.-poly1305_mult
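
// Splits the base 2^64 value in x4-x6 into five 26-bit limbs and stores
// them, along with the premultiplied 5*r1..5*r4, at a 16-byte stride
// from x0.  The table-init code below calls this four times at
// decreasing offsets, so each 16-byte row packs the same coefficient of
// r^4,r^3,r^2,r^1 into the four lanes of one vector register.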
.type	poly1305_splat,%function
.align	4
poly1305_splat:
	and	x12,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x13,x4,#26,#26
	extr	x14,x5,x4,#52
	and	x14,x14,#0x03ffffff
	ubfx	x15,x5,#14,#26
	extr	x16,x6,x5,#40

	str	w12,[x0,#16*0]		// r0
	add	w12,w13,w13,lsl#2	// r1*5
	str	w13,[x0,#16*1]		// r1
	add	w13,w14,w14,lsl#2	// r2*5
	str	w12,[x0,#16*2]		// s1
	str	w14,[x0,#16*3]		// r2
	add	w14,w15,w15,lsl#2	// r3*5
	str	w13,[x0,#16*4]		// s2
	str	w15,[x0,#16*5]		// r3
	add	w15,w16,w16,lsl#2	// r4*5
	str	w14,[x0,#16*6]		// s3
	str	w16,[x0,#16*7]		// r4
	str	w15,[x0,#16*8]		// s4

	ret
.size	poly1305_splat,.-poly1305_splat

#ifdef	__KERNEL__
.globl	poly1305_blocks_neon
#endif
.type	poly1305_blocks_neon,%function
.align	5
poly1305_blocks_neon:
.Lpoly1305_blocks_neon:
	ldr	x17,[x0,#24]
	cmp	x2,#128
	b.lo	.Lpoly1305_blocks

	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	cbz	x17,.Lbase2_64_neon

	ldp	w10,w11,[x0]		// load hash value base 2^26
	ldp	w12,w13,[x0,#8]
	ldr	w14,[x0,#16]

	tst	x2,#31
	b.eq	.Leven_neon

	ldp	x7,x8,[x0,#32]		// load key value

	add	x4,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	x5,x12,#12
	adds	x4,x4,x12,lsl#52
	add	x5,x5,x13,lsl#14
	adc	x5,x5,xzr
	lsr	x6,x14,#24
	adds	x5,x5,x14,lsl#40
	adc	x6,x6,xzr		// can be partially reduced...

	ldp	x12,x13,[x1],#16	// load input
	sub	x2,x2,#16
	add	x9,x8,x8,lsr#2		// s1 = r1 + (r1 >> 2)

#ifdef	__AARCH64EB__
	rev	x12,x12
	rev	x13,x13
#endif
	adds	x4,x4,x12		// accumulate input
	adcs	x5,x5,x13
	adc	x6,x6,x3

	bl	poly1305_mult

	and	x10,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,x4,#26,#26
	extr	x12,x5,x4,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,x5,#14,#26
	extr	x14,x6,x5,#40

	b	.Leven_neon

.align	4
.Lbase2_64_neon:
	ldp	x7,x8,[x0,#32]		// load key value

	ldp	x4,x5,[x0]		// load hash value base 2^64
	ldr	x6,[x0,#16]

	tst	x2,#31
	b.eq	.Linit_neon

	ldp	x12,x13,[x1],#16	// load input
	sub	x2,x2,#16
	add	x9,x8,x8,lsr#2		// s1 = r1 + (r1 >> 2)
#ifdef	__AARCH64EB__
	rev	x12,x12
	rev	x13,x13
#endif
	adds	x4,x4,x12		// accumulate input
	adcs	x5,x5,x13
	adc	x6,x6,x3

	bl	poly1305_mult

.Linit_neon:
	ldr	w17,[x0,#48]		// first table element
	and	x10,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,x4,#26,#26
	extr	x12,x5,x4,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,x5,#14,#26
	extr	x14,x6,x5,#40

	cmp	w17,#-1			// is value impossible?
	b.ne	.Leven_neon

	fmov	d24,x10
	fmov	d25,x11
	fmov	d26,x12
	fmov	d27,x13
	fmov	d28,x14

	////////////////////////////////// initialize r^n table
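	// Seed h = r, then each poly1305_mult call yields the next power
	// r^2, r^3, r^4; "sub x0,#4" steps back one 32-bit lane between
	// splats, so consecutive powers land in adjacent lanes of the same
	// table rows (r^4 in lane 0 ... r^1 in lane 3).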
	mov	x4,x7			// r^1
	add	x9,x8,x8,lsr#2		// s1 = r1 + (r1 >> 2)
	mov	x5,x8
	mov	x6,xzr
	add	x0,x0,#48+12
	bl	poly1305_splat

	bl	poly1305_mult		// r^2
	sub	x0,x0,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^3
	sub	x0,x0,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^4
	sub	x0,x0,#4
	bl	poly1305_splat
	sub	x0,x0,#48		// restore original x0
	b	.Ldo_neon

.align	4
.Leven_neon:
	fmov	d24,x10
	fmov	d25,x11
	fmov	d26,x12
	fmov	d27,x13
	fmov	d28,x14

.Ldo_neon:
	ldp	x8,x12,[x1,#32]		// inp[2:3]
	subs	x2,x2,#64
	ldp	x9,x13,[x1,#48]
	add	x16,x1,#96
	adr	x17,.Lzeros

	lsl	x3,x3,#24
	add	x15,x0,#48

#ifdef	__AARCH64EB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi x6,x7,#32,#32
	fmov	d14,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,x3,x12,lsr#40
	add	x13,x3,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi x8,x9,#32,#32
	fmov	d15,x6
	add	x10,x10,x11,lsl#32	// bfi x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi x12,x13,#32,#32
	fmov	d16,x8
	fmov	d17,x10
	fmov	d18,x12

	ldp	x8,x12,[x1],#16		// inp[0:1]
	ldp	x9,x13,[x1],#48

	ld1	{v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64
	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64
	ld1	{v8.4s},[x15]

#ifdef	__AARCH64EB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi x6,x7,#32,#32
	fmov	d9,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,x3,x12,lsr#40
	add	x13,x3,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi x8,x9,#32,#32
	fmov	d10,x6
	add	x10,x10,x11,lsl#32	// bfi x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi x12,x13,#32,#32
	movi	v31.2d,#-1
	fmov	d11,x8
	fmov	d12,x10
	fmov	d13,x12
	ushr	v31.2d,v31.2d,#38
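	// v31 = 2^26-1 in each 64-bit lane (all-ones shifted right by 38),
	// the limb mask for the lazy reductions below.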

	b.ls	.Lskip_loop

.align	4
.Loop_neon:
	////////////////////////////////////////////////////////////////
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
	//   ___________________/
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
	//   ___________________/ ____________________/
	//
	// Note that we start with inp[2:3]*r^2. This is because it
	// doesn't depend on reduction in previous iteration.
	////////////////////////////////////////////////////////////////
	// d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
	// d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
	// d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
	// d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
	// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1

	subs	x2,x2,#64
	umull	v23.2d,v14.2s,v7.s[2]
	csel	x16,x17,x16,lo
	umull	v22.2d,v14.2s,v5.s[2]
	umull	v21.2d,v14.2s,v3.s[2]
	ldp	x8,x12,[x16],#16	// inp[2:3] (or zero)
	umull	v20.2d,v14.2s,v1.s[2]
	ldp	x9,x13,[x16],#48
	umull	v19.2d,v14.2s,v0.s[2]
#ifdef	__AARCH64EB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	umlal	v23.2d,v15.2s,v5.s[2]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	v22.2d,v15.2s,v3.s[2]
	and	x5,x9,#0x03ffffff
	umlal	v21.2d,v15.2s,v1.s[2]
	ubfx	x6,x8,#26,#26
	umlal	v20.2d,v15.2s,v0.s[2]
	ubfx	x7,x9,#26,#26
	umlal	v19.2d,v15.2s,v8.s[2]
	add	x4,x4,x5,lsl#32		// bfi x4,x5,#32,#32

	umlal	v23.2d,v16.2s,v3.s[2]
	extr	x8,x12,x8,#52
	umlal	v22.2d,v16.2s,v1.s[2]
	extr	x9,x13,x9,#52
	umlal	v21.2d,v16.2s,v0.s[2]
	add	x6,x6,x7,lsl#32		// bfi x6,x7,#32,#32
	umlal	v20.2d,v16.2s,v8.s[2]
	fmov	d14,x4
	umlal	v19.2d,v16.2s,v6.s[2]
	and	x8,x8,#0x03ffffff

	umlal	v23.2d,v17.2s,v1.s[2]
	and	x9,x9,#0x03ffffff
	umlal	v22.2d,v17.2s,v0.s[2]
	ubfx	x10,x12,#14,#26
	umlal	v21.2d,v17.2s,v8.s[2]
	ubfx	x11,x13,#14,#26
	umlal	v20.2d,v17.2s,v6.s[2]
	add	x8,x8,x9,lsl#32		// bfi x8,x9,#32,#32
	umlal	v19.2d,v17.2s,v4.s[2]
	fmov	d15,x6

	add	v11.2s,v11.2s,v26.2s
	add	x12,x3,x12,lsr#40
	umlal	v23.2d,v18.2s,v0.s[2]
	add	x13,x3,x13,lsr#40
	umlal	v22.2d,v18.2s,v8.s[2]
	add	x10,x10,x11,lsl#32	// bfi x10,x11,#32,#32
	umlal	v21.2d,v18.2s,v6.s[2]
	add	x12,x12,x13,lsl#32	// bfi x12,x13,#32,#32
	umlal	v20.2d,v18.2s,v4.s[2]
	fmov	d16,x8
	umlal	v19.2d,v18.2s,v2.s[2]
	fmov	d17,x10

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4 and accumulate

	add	v9.2s,v9.2s,v24.2s
	fmov	d18,x12
	umlal	v22.2d,v11.2s,v1.s[0]
	ldp	x8,x12,[x1],#16		// inp[0:1]
	umlal	v19.2d,v11.2s,v6.s[0]
	ldp	x9,x13,[x1],#48
	umlal	v23.2d,v11.2s,v3.s[0]
	umlal	v20.2d,v11.2s,v8.s[0]
	umlal	v21.2d,v11.2s,v0.s[0]
#ifdef	__AARCH64EB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	add	v10.2s,v10.2s,v25.2s
	umlal	v22.2d,v9.2s,v5.s[0]
	umlal	v23.2d,v9.2s,v7.s[0]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	v21.2d,v9.2s,v3.s[0]
	and	x5,x9,#0x03ffffff
	umlal	v19.2d,v9.2s,v0.s[0]
	ubfx	x6,x8,#26,#26
	umlal	v20.2d,v9.2s,v1.s[0]
	ubfx	x7,x9,#26,#26

	add	v12.2s,v12.2s,v27.2s
	add	x4,x4,x5,lsl#32		// bfi x4,x5,#32,#32
	umlal	v22.2d,v10.2s,v3.s[0]
	extr	x8,x12,x8,#52
	umlal	v23.2d,v10.2s,v5.s[0]
	extr	x9,x13,x9,#52
	umlal	v19.2d,v10.2s,v8.s[0]
	add	x6,x6,x7,lsl#32		// bfi x6,x7,#32,#32
	umlal	v21.2d,v10.2s,v1.s[0]
	fmov	d9,x4
	umlal	v20.2d,v10.2s,v0.s[0]
	and	x8,x8,#0x03ffffff

	add	v13.2s,v13.2s,v28.2s
	and	x9,x9,#0x03ffffff
	umlal	v22.2d,v12.2s,v0.s[0]
	ubfx	x10,x12,#14,#26
	umlal	v19.2d,v12.2s,v4.s[0]
	ubfx	x11,x13,#14,#26
	umlal	v23.2d,v12.2s,v1.s[0]
	add	x8,x8,x9,lsl#32		// bfi x8,x9,#32,#32
	umlal	v20.2d,v12.2s,v6.s[0]
	fmov	d10,x6
	umlal	v21.2d,v12.2s,v8.s[0]
	add	x12,x3,x12,lsr#40

	umlal	v22.2d,v13.2s,v8.s[0]
	add	x13,x3,x13,lsr#40
	umlal	v19.2d,v13.2s,v2.s[0]
	add	x10,x10,x11,lsl#32	// bfi x10,x11,#32,#32
	umlal	v23.2d,v13.2s,v0.s[0]
	add	x12,x12,x13,lsl#32	// bfi x12,x13,#32,#32
	umlal	v20.2d,v13.2s,v4.s[0]
	fmov	d11,x8
	umlal	v21.2d,v13.2s,v6.s[0]
	fmov	d12,x10
	fmov	d13,x12

	/////////////////////////////////////////////////////////////////
	// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	// and P. Schwabe
	//
	// [see discussion in poly1305-armv4 module]

	ushr	v29.2d,v22.2d,#26
	xtn	v27.2s,v22.2d
	ushr	v30.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b
	add	v23.2d,v23.2d,v29.2d	// h3 -> h4
	bic	v27.2s,#0xfc,lsl#24	// &=0x03ffffff
	add	v20.2d,v20.2d,v30.2d	// h0 -> h1

	ushr	v29.2d,v23.2d,#26
	xtn	v28.2s,v23.2d
	ushr	v30.2d,v20.2d,#26
	xtn	v25.2s,v20.2d
	bic	v28.2s,#0xfc,lsl#24
	add	v21.2d,v21.2d,v30.2d	// h1 -> h2

	add	v19.2d,v19.2d,v29.2d
	shl	v29.2d,v29.2d,#2
	shrn	v30.2s,v21.2d,#26
	xtn	v26.2s,v21.2d
	add	v19.2d,v19.2d,v29.2d	// h4 -> h0
	bic	v25.2s,#0xfc,lsl#24
	add	v27.2s,v27.2s,v30.2s	// h2 -> h3
	bic	v26.2s,#0xfc,lsl#24

	shrn	v29.2s,v19.2d,#26
	xtn	v24.2s,v19.2d
	ushr	v30.2s,v27.2s,#26
	bic	v27.2s,#0xfc,lsl#24
	bic	v24.2s,#0xfc,lsl#24
	add	v25.2s,v25.2s,v29.2s	// h0 -> h1
	add	v28.2s,v28.2s,v30.2s	// h3 -> h4
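	// In the lazy reduction above, the carry out of h4 wraps around to
	// h0 multiplied by 5 (2^130 = 5 mod p): v29 + (v29 << 2).  Limbs
	// are left slightly above 26 bits rather than fully normalized;
	// the 64-bit accumulators absorb the slack on the next iteration.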

	b.hi	.Loop_neon

.Lskip_loop:
	dup	v16.2d,v16.d[0]
	add	v11.2s,v11.2s,v26.2s

	////////////////////////////////////////////////////////////////
	// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

	adds	x2,x2,#32
	b.ne	.Long_tail

	dup	v16.2d,v11.d[0]
	add	v14.2s,v9.2s,v24.2s
	add	v17.2s,v12.2s,v27.2s
	add	v15.2s,v10.2s,v25.2s
	add	v18.2s,v13.2s,v28.2s

.Long_tail:
	dup	v14.2d,v14.d[0]
	umull2	v19.2d,v16.4s,v6.4s
	umull2	v22.2d,v16.4s,v1.4s
	umull2	v23.2d,v16.4s,v3.4s
	umull2	v21.2d,v16.4s,v0.4s
	umull2	v20.2d,v16.4s,v8.4s

	dup	v15.2d,v15.d[0]
	umlal2	v19.2d,v14.4s,v0.4s
	umlal2	v21.2d,v14.4s,v3.4s
	umlal2	v22.2d,v14.4s,v5.4s
	umlal2	v23.2d,v14.4s,v7.4s
	umlal2	v20.2d,v14.4s,v1.4s

	dup	v17.2d,v17.d[0]
	umlal2	v19.2d,v15.4s,v8.4s
	umlal2	v22.2d,v15.4s,v3.4s
	umlal2	v21.2d,v15.4s,v1.4s
	umlal2	v23.2d,v15.4s,v5.4s
	umlal2	v20.2d,v15.4s,v0.4s

	dup	v18.2d,v18.d[0]
	umlal2	v22.2d,v17.4s,v0.4s
	umlal2	v23.2d,v17.4s,v1.4s
	umlal2	v19.2d,v17.4s,v4.4s
	umlal2	v20.2d,v17.4s,v6.4s
	umlal2	v21.2d,v17.4s,v8.4s

	umlal2	v22.2d,v18.4s,v8.4s
	umlal2	v19.2d,v18.4s,v2.4s
	umlal2	v23.2d,v18.4s,v0.4s
	umlal2	v20.2d,v18.4s,v4.4s
	umlal2	v21.2d,v18.4s,v6.4s

	b.eq	.Lshort_tail

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4:r^3 and accumulate

	add	v9.2s,v9.2s,v24.2s
	umlal	v22.2d,v11.2s,v1.2s
	umlal	v19.2d,v11.2s,v6.2s
	umlal	v23.2d,v11.2s,v3.2s
	umlal	v20.2d,v11.2s,v8.2s
	umlal	v21.2d,v11.2s,v0.2s

	add	v10.2s,v10.2s,v25.2s
	umlal	v22.2d,v9.2s,v5.2s
	umlal	v19.2d,v9.2s,v0.2s
	umlal	v23.2d,v9.2s,v7.2s
	umlal	v20.2d,v9.2s,v1.2s
	umlal	v21.2d,v9.2s,v3.2s

	add	v12.2s,v12.2s,v27.2s
	umlal	v22.2d,v10.2s,v3.2s
	umlal	v19.2d,v10.2s,v8.2s
	umlal	v23.2d,v10.2s,v5.2s
	umlal	v20.2d,v10.2s,v0.2s
	umlal	v21.2d,v10.2s,v1.2s

	add	v13.2s,v13.2s,v28.2s
	umlal	v22.2d,v12.2s,v0.2s
	umlal	v19.2d,v12.2s,v4.2s
	umlal	v23.2d,v12.2s,v1.2s
	umlal	v20.2d,v12.2s,v6.2s
	umlal	v21.2d,v12.2s,v8.2s

	umlal	v22.2d,v13.2s,v8.2s
	umlal	v19.2d,v13.2s,v2.2s
	umlal	v23.2d,v13.2s,v0.2s
	umlal	v20.2d,v13.2s,v4.2s
	umlal	v21.2d,v13.2s,v6.2s

.Lshort_tail:
	////////////////////////////////////////////////////////////////
	// horizontal add

	addp	v22.2d,v22.2d,v22.2d
	ldp	d8,d9,[sp,#16]		// meet ABI requirements
	addp	v19.2d,v19.2d,v19.2d
	ldp	d10,d11,[sp,#32]
	addp	v23.2d,v23.2d,v23.2d
	ldp	d12,d13,[sp,#48]
	addp	v20.2d,v20.2d,v20.2d
	ldp	d14,d15,[sp,#64]
	addp	v21.2d,v21.2d,v21.2d
	ldr	x30,[sp,#8]

	////////////////////////////////////////////////////////////////
	// lazy reduction, but without narrowing

	ushr	v29.2d,v22.2d,#26
	and	v22.16b,v22.16b,v31.16b
	ushr	v30.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b

	add	v23.2d,v23.2d,v29.2d	// h3 -> h4
	add	v20.2d,v20.2d,v30.2d	// h0 -> h1

	ushr	v29.2d,v23.2d,#26
	and	v23.16b,v23.16b,v31.16b
	ushr	v30.2d,v20.2d,#26
	and	v20.16b,v20.16b,v31.16b
	add	v21.2d,v21.2d,v30.2d	// h1 -> h2

	add	v19.2d,v19.2d,v29.2d
	shl	v29.2d,v29.2d,#2
	ushr	v30.2d,v21.2d,#26
	and	v21.16b,v21.16b,v31.16b
	add	v19.2d,v19.2d,v29.2d	// h4 -> h0
	add	v22.2d,v22.2d,v30.2d	// h2 -> h3

	ushr	v29.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b
	ushr	v30.2d,v22.2d,#26
	and	v22.16b,v22.16b,v31.16b
	add	v20.2d,v20.2d,v29.2d	// h0 -> h1
	add	v23.2d,v23.2d,v30.2d	// h3 -> h4

	////////////////////////////////////////////////////////////////
	// write the result, can be partially reduced

	st4	{v19.s,v20.s,v21.s,v22.s}[0],[x0],#16
	mov	x4,#1
	st1	{v23.s}[0],[x0]
	str	x4,[x0,#8]		// set is_base2_26

	ldr	x29,[sp],#80
	.inst	0xd50323bf		// autiasp
	ret
.size	poly1305_blocks_neon,.-poly1305_blocks_neon

.align	5
.Lzeros:
.long	0,0,0,0,0,0,0,0
.asciz	"Poly1305 for ARMv8, CRYPTOGAMS by @dot-asm"
.align	2
#if !defined(__KERNEL__) && !defined(_WIN64)
.comm	OPENSSL_armcap_P,4,4
.hidden	OPENSSL_armcap_P
#endif