Linux kernel mirror (for testing)
git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel
os
linux
1/*
2 * AVX2 implementation of MORUS-1280
3 *
4 * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
5 * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
6 *
7 * This program is free software; you can redistribute it and/or modify it
8 * under the terms of the GNU General Public License version 2 as published
9 * by the Free Software Foundation.
10 */
11
12#include <linux/linkage.h>
13#include <asm/frame.h>
14
/*
 * Pack four 2-bit element selectors into one 8-bit shuffle immediate;
 * i0 lands in bits 1:0 and i3 in bits 7:6.
 */
#define SHUFFLE_MASK(i0, i1, i2, i3) \
	(i0 | (i1 << 2) | (i2 << 4) | (i3 << 6))

/* Shuffle immediates used throughout this file. */
#define MASK1		SHUFFLE_MASK(3, 0, 1, 2)
#define MASK2		SHUFFLE_MASK(2, 3, 0, 1)
#define MASK3		SHUFFLE_MASK(1, 2, 3, 0)

/* The five 256-bit state words (plus the 128-bit view of word 0). */
#define STATE0		%ymm0
#define STATE0_LOW	%xmm0
#define STATE1		%ymm1
#define STATE2		%ymm2
#define STATE3		%ymm3
#define STATE4		%ymm4

/* KEY and MSG name the same register (%ymm5). */
#define KEY		%ymm5
#define MSG		%ymm5
#define MSG_LOW		%xmm5

/* Scratch registers. */
#define T0		%ymm6
#define T0_LOW		%xmm6
#define T1		%ymm7
34
/*
 * 32-byte initialisation constant, loaded into STATE4 by the init routine.
 * From the third byte on, each byte is the sum of the two preceding bytes
 * modulo 256.
 */
.section	.rodata.cst32.morus1280_const, "aM", @progbits, 32
.balign 32
.Lmorus1280_const:
	.byte	0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d
	.byte	0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62
	.byte	0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1
	.byte	0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd
42
/*
 * Byte indices 0..31.  The decrypt-tail routine compares a broadcast byte
 * count against this table to build a mask covering the valid bytes.
 */
.section	.rodata.cst32.morus1280_counter, "aM", @progbits, 32
.balign 32
.Lmorus1280_counter:
	.byte	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
	.byte	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
	.byte	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
	.byte	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
50
.text

/*
 * morus1280_round: update one state word.
 *
 *   s0 ^= (s1 & s2) ^ s3
 *   every 64-bit lane of s0 is rotated left by \b bits
 *   the 64-bit lanes of s3 are permuted with immediate \w
 *
 * \s4 is accepted for symmetry at the call sites but is not referenced.
 * The register arguments must be distinct.  Clobbers T0.
 */
.macro morus1280_round s0, s1, s2, s3, s4, b, w
	vpxor		\s3, \s0, \s0
	vpand		\s1, \s2, T0
	vpxor		T0, \s0, \s0
	/* s3 is no longer needed by this round, so it can be permuted now */
	vpermq		$\w, \s3, \s3
	/* rotate left: the two shifted halves have no bits in common */
	vpsllq		$\b, \s0, T0
	vpsrlq		$(64 - \b), \s0, \s0
	vpor		T0, \s0, \s0
.endm
62
/*
 * __morus1280_update: internal ABI
 *
 * Runs the five rounds over the state, XORing the message block into
 * STATE1..STATE4 between consecutive rounds.
 *
 * input:
 *   STATE[0-4] - input state
 *   MSG        - message block (left unmodified)
 * output:
 *   STATE[0-4] - output state
 * changed:
 *   T0
 */
__morus1280_update:
	morus1280_round	STATE0, STATE1, STATE2, STATE3, STATE4, 13, MASK1
	vpxor		STATE1, MSG, STATE1
	morus1280_round	STATE1, STATE2, STATE3, STATE4, STATE0, 46, MASK2
	vpxor		STATE2, MSG, STATE2
	morus1280_round	STATE2, STATE3, STATE4, STATE0, STATE1, 38, MASK3
	vpxor		STATE3, MSG, STATE3
	morus1280_round	STATE3, STATE4, STATE0, STATE1, STATE2, 7, MASK2
	vpxor		STATE4, MSG, STATE4
	morus1280_round	STATE4, STATE0, STATE1, STATE2, STATE3, 4, MASK1
	ret
ENDPROC(__morus1280_update)
85
/*
 * __morus1280_update_zero: internal ABI
 *
 * Same five rounds as __morus1280_update, but with no message block mixed
 * in between them; the MSG/KEY register is neither read nor written.
 *
 * input:
 *   STATE[0-4] - input state
 * output:
 *   STATE[0-4] - output state
 * changed:
 *   T0
 */
__morus1280_update_zero:
	morus1280_round	STATE0, STATE1, STATE2, STATE3, STATE4, 13, MASK1
	morus1280_round	STATE1, STATE2, STATE3, STATE4, STATE0, 46, MASK2
	morus1280_round	STATE2, STATE3, STATE4, STATE0, STATE1, 38, MASK3
	morus1280_round	STATE3, STATE4, STATE0, STATE1, STATE2,  7, MASK2
	morus1280_round	STATE4, STATE0, STATE1, STATE2, STATE3,  4, MASK1
	ret
ENDPROC(__morus1280_update_zero)
103
/*
 * __load_partial: internal ABI
 *
 * Loads the low five bits' worth of bytes (0..31) from src into MSG and
 * zero-fills the rest.  The count is consumed one bit at a time, smallest
 * piece first; each larger piece pushes the already-loaded data towards
 * the high end of the register before being placed at the low end.
 *
 * input:
 *   %rsi - src
 *   %rcx - bytes
 * output:
 *   MSG  - message block
 * changed:
 *   %r8
 *   %r9
 */
__load_partial:
	vpxor		MSG, MSG, MSG
	xor		%r9d, %r9d

	/* 1 byte, located after all the larger pieces */
	test		$0x1, %ecx
	jz		.Lld_partial_1

	mov		%ecx, %r8d
	and		$0x1E, %r8d
	mov		(%rsi, %r8), %r9b

.Lld_partial_1:
	/* 2 bytes */
	test		$0x2, %ecx
	jz		.Lld_partial_2

	mov		%ecx, %r8d
	and		$0x1C, %r8d
	shl		$16, %r9
	mov		(%rsi, %r8), %r9w

.Lld_partial_2:
	/* 4 bytes */
	test		$0x4, %ecx
	jz		.Lld_partial_4

	mov		%ecx, %r8d
	and		$0x18, %r8d
	shl		$32, %r9
	mov		(%rsi, %r8), %r8d
	/* the low 32 bits of %r9 are zero after the shift */
	or		%r8, %r9

.Lld_partial_4:
	/* up to 7 bytes collected so far go into the low quadword */
	movq		%r9, MSG_LOW

	/* 8 bytes */
	test		$0x8, %ecx
	jz		.Lld_partial_8

	mov		%ecx, %r8d
	and		$0x10, %r8d
	/* swap the two quadwords, then fill the low one from memory */
	pshufd		$MASK2, MSG_LOW, MSG_LOW
	pinsrq		$0, (%rsi, %r8), MSG_LOW

.Lld_partial_8:
	/* 16 bytes */
	test		$0x10, %ecx
	jz		.Lld_partial_16

	/* swap the 128-bit halves, then fill the low half from memory */
	vpermq		$MASK2, MSG, MSG
	/*
	 * Deliberately the non-VEX form: it leaves the upper 128 bits,
	 * which now hold the previously loaded data, untouched.
	 */
	movdqu		(%rsi), MSG_LOW

.Lld_partial_16:
	ret
ENDPROC(__load_partial)
175
/*
 * __store_partial: internal ABI
 *
 * Writes the first `bytes` bytes of T0 to dst, in pieces of 16, 8, 4, 2
 * and 1 bytes.
 *
 * input:
 *   %rdx - dst
 *   %rcx - bytes (only the low 32 bits are read)
 *   T0   - message block
 * changed:
 *   %r8
 *   %r9
 *   %r10
 *   T0   (its 128-bit halves are swapped when bytes >= 16)
 */
__store_partial:
	/*
	 * The callers receive the count as a 32-bit C argument, so take it
	 * from %ecx; the 32-bit move zero-extends into %r8.
	 */
	mov		%ecx, %r8d
	mov		%rdx, %r9

	cmp		$16, %r8
	jb		.Lst_partial_16

	vmovdqu		T0_LOW, (%r9)
	/* bring the upper 128 bits down for the remaining pieces */
	vpermq		$MASK2, T0, T0

	sub		$16, %r8
	add		$16, %r9

.Lst_partial_16:
	/* %r10 holds the next (up to) 8 bytes to be written */
	vmovq		T0_LOW, %r10

	cmp		$8, %r8
	jb		.Lst_partial_8

	mov		%r10, (%r9)
	vpextrq		$1, T0_LOW, %r10

	sub		$8, %r8
	add		$8, %r9

.Lst_partial_8:
	cmp		$4, %r8
	jb		.Lst_partial_4

	mov		%r10d, (%r9)
	shr		$32, %r10

	sub		$4, %r8
	add		$4, %r9

.Lst_partial_4:
	cmp		$2, %r8
	jb		.Lst_partial_2

	mov		%r10w, (%r9)
	shr		$16, %r10

	sub		$2, %r8
	add		$2, %r9

.Lst_partial_2:
	test		%r8, %r8
	jz		.Lst_partial_1

	mov		%r10b, (%r9)

.Lst_partial_1:
	ret
ENDPROC(__store_partial)
242
/*
 * void crypto_morus1280_avx2_init(void *state, const void *key,
 *                                 const void *iv);
 *
 * Builds the initial state from a 32-byte key and a 16-byte IV and writes
 * the five 32-byte state words to `state`.
 *
 *   %rdi - state (output, 5 * 32 bytes)
 *   %rsi - key
 *   %rdx - iv
 */
ENTRY(crypto_morus1280_avx2_init)
	FRAME_BEGIN

	/* load IV: the VEX 128-bit load clears the upper half of STATE0 */
	vmovdqu		(%rdx), STATE0_LOW
	/* load key: */
	vmovdqu		(%rsi), KEY
	vmovdqa		KEY, STATE1
	/* load all ones: */
	vpcmpeqd	STATE2, STATE2, STATE2
	/* load all zeros: */
	vpxor		STATE3, STATE3, STATE3
	/* load the constant (RIP-relative, so no absolute relocation): */
	vmovdqa		.Lmorus1280_const(%rip), STATE4

	/*
	 * update 16 times with zero; __morus1280_update_zero only changes
	 * the state and T0, so KEY is still intact afterwards
	 */
	.rept 16
	call		__morus1280_update_zero
	.endr

	/* xor-in the key again after updates: */
	vpxor		KEY, STATE1, STATE1

	/* store the state: */
	vmovdqu		STATE0, (0 * 32)(%rdi)
	vmovdqu		STATE1, (1 * 32)(%rdi)
	vmovdqu		STATE2, (2 * 32)(%rdi)
	vmovdqu		STATE3, (3 * 32)(%rdi)
	vmovdqu		STATE4, (4 * 32)(%rdi)

	FRAME_END
	ret
ENDPROC(crypto_morus1280_avx2_init)
294
/*
 * void crypto_morus1280_avx2_ad(void *state, const void *data,
 *                               unsigned int length);
 *
 * Absorbs associated data into the state, one whole 32-byte block at a
 * time.  Returns without touching the state when length < 32; any
 * trailing bytes short of a full block are not consumed here.
 *
 *   %rdi - state
 *   %rsi - data
 *   %edx - length (a 32-bit argument: the upper half of %rdx is not
 *          guaranteed to be zero, so only %edx is ever read)
 */
ENTRY(crypto_morus1280_avx2_ad)
	FRAME_BEGIN

	cmp		$32, %edx
	jb		.Lad_out

	/* load the state: */
	vmovdqu		(0 * 32)(%rdi), STATE0
	vmovdqu		(1 * 32)(%rdi), STATE1
	vmovdqu		(2 * 32)(%rdi), STATE2
	vmovdqu		(3 * 32)(%rdi), STATE3
	vmovdqu		(4 * 32)(%rdi), STATE4

	/* pick the aligned-load loop only if data is 32-byte aligned */
	mov		%rsi, %r8
	and		$0x1F, %r8
	jnz		.Lad_u_loop

.align 4
.Lad_a_loop:
	vmovdqa		(%rsi), MSG
	call		__morus1280_update
	sub		$32, %edx
	add		$32, %rsi
	/* the length is unsigned: use an unsigned condition, as at entry */
	cmp		$32, %edx
	jae		.Lad_a_loop

	jmp		.Lad_cont
.align 4
.Lad_u_loop:
	vmovdqu		(%rsi), MSG
	call		__morus1280_update
	sub		$32, %edx
	add		$32, %rsi
	cmp		$32, %edx
	jae		.Lad_u_loop

.Lad_cont:
	/* store the state: */
	vmovdqu		STATE0, (0 * 32)(%rdi)
	vmovdqu		STATE1, (1 * 32)(%rdi)
	vmovdqu		STATE2, (2 * 32)(%rdi)
	vmovdqu		STATE3, (3 * 32)(%rdi)
	vmovdqu		STATE4, (4 * 32)(%rdi)

.Lad_out:
	FRAME_END
	ret
ENDPROC(crypto_morus1280_avx2_ad)
347
/*
 * void crypto_morus1280_avx2_enc(void *state, const void *src, void *dst,
 *                                unsigned int length);
 *
 * Encrypts whole 32-byte blocks from src to dst and absorbs each
 * plaintext block into the state.  Returns without touching anything when
 * length < 32; trailing bytes short of a full block are not processed.
 *
 *   %rdi - state
 *   %rsi - src
 *   %rdx - dst
 *   %ecx - length (a 32-bit argument: the upper half of %rcx is not
 *          guaranteed to be zero, so only %ecx is ever read)
 */
ENTRY(crypto_morus1280_avx2_enc)
	FRAME_BEGIN

	cmp		$32, %ecx
	jb		.Lenc_out

	/* load the state: */
	vmovdqu		(0 * 32)(%rdi), STATE0
	vmovdqu		(1 * 32)(%rdi), STATE1
	vmovdqu		(2 * 32)(%rdi), STATE2
	vmovdqu		(3 * 32)(%rdi), STATE3
	vmovdqu		(4 * 32)(%rdi), STATE4

	/* use the aligned loop only if both src and dst are 32-byte aligned */
	mov		%rsi, %r8
	or		%rdx, %r8
	and		$0x1F, %r8
	jnz		.Lenc_u_loop

.align 4
.Lenc_a_loop:
	vmovdqa		(%rsi), MSG
	/* T0 = MSG ^ state[0] ^ perm(state[1]) ^ (state[2] & state[3]) */
	vpxor		STATE0, MSG, T0
	vpermq		$MASK3, STATE1, T1
	vpxor		T1, T0, T0
	vpand		STATE2, STATE3, T1
	vpxor		T1, T0, T0
	vmovdqa		T0, (%rdx)

	/* absorb the plaintext block, which is still in MSG */
	call		__morus1280_update
	sub		$32, %ecx
	add		$32, %rsi
	add		$32, %rdx
	/* the length is unsigned: use an unsigned condition, as at entry */
	cmp		$32, %ecx
	jae		.Lenc_a_loop

	jmp		.Lenc_cont
.align 4
.Lenc_u_loop:
	vmovdqu		(%rsi), MSG
	vpxor		STATE0, MSG, T0
	vpermq		$MASK3, STATE1, T1
	vpxor		T1, T0, T0
	vpand		STATE2, STATE3, T1
	vpxor		T1, T0, T0
	vmovdqu		T0, (%rdx)

	call		__morus1280_update
	sub		$32, %ecx
	add		$32, %rsi
	add		$32, %rdx
	cmp		$32, %ecx
	jae		.Lenc_u_loop

.Lenc_cont:
	/* store the state: */
	vmovdqu		STATE0, (0 * 32)(%rdi)
	vmovdqu		STATE1, (1 * 32)(%rdi)
	vmovdqu		STATE2, (2 * 32)(%rdi)
	vmovdqu		STATE3, (3 * 32)(%rdi)
	vmovdqu		STATE4, (4 * 32)(%rdi)

.Lenc_out:
	FRAME_END
	ret
ENDPROC(crypto_morus1280_avx2_enc)
419
/*
 * void crypto_morus1280_avx2_enc_tail(void *state, const void *src, void *dst,
 *                                     unsigned int length);
 *
 * Encrypts a final partial block: the input is loaded zero-padded with
 * __load_partial, `length` bytes of ciphertext are written with
 * __store_partial, and the zero-padded plaintext is absorbed into the
 * state.
 *
 *   %rdi - state
 *   %rsi - src
 *   %rdx - dst
 *   %rcx - length
 */
ENTRY(crypto_morus1280_avx2_enc_tail)
	FRAME_BEGIN

	/* bring the five state words into registers */
	vmovdqu		(0 * 32)(%rdi), STATE0
	vmovdqu		(1 * 32)(%rdi), STATE1
	vmovdqu		(2 * 32)(%rdi), STATE2
	vmovdqu		(3 * 32)(%rdi), STATE3
	vmovdqu		(4 * 32)(%rdi), STATE4

	/* MSG = zero-padded plaintext */
	call		__load_partial

	/* T0 = MSG ^ state[0] ^ perm(state[1]) ^ (state[2] & state[3]) */
	vpxor		STATE0, MSG, T0
	vpermq		$MASK3, STATE1, T1
	vpxor		T1, T0, T0
	vpand		STATE2, STATE3, T1
	vpxor		T1, T0, T0

	/* write out only the requested number of ciphertext bytes */
	call		__store_partial

	/* absorb the plaintext block, which is still in MSG */
	call		__morus1280_update

	/* write the updated state back */
	vmovdqu		STATE0, (0 * 32)(%rdi)
	vmovdqu		STATE1, (1 * 32)(%rdi)
	vmovdqu		STATE2, (2 * 32)(%rdi)
	vmovdqu		STATE3, (3 * 32)(%rdi)
	vmovdqu		STATE4, (4 * 32)(%rdi)

	FRAME_END
	ret
ENDPROC(crypto_morus1280_avx2_enc_tail)
458
/*
 * void crypto_morus1280_avx2_dec(void *state, const void *src, void *dst,
 *                                unsigned int length);
 *
 * Decrypts whole 32-byte blocks from src to dst and absorbs each
 * recovered plaintext block into the state.  Returns without touching
 * anything when length < 32; trailing bytes short of a full block are not
 * processed.
 *
 *   %rdi - state
 *   %rsi - src
 *   %rdx - dst
 *   %ecx - length (a 32-bit argument: the upper half of %rcx is not
 *          guaranteed to be zero, so only %ecx is ever read)
 */
ENTRY(crypto_morus1280_avx2_dec)
	FRAME_BEGIN

	cmp		$32, %ecx
	jb		.Ldec_out

	/* load the state: */
	vmovdqu		(0 * 32)(%rdi), STATE0
	vmovdqu		(1 * 32)(%rdi), STATE1
	vmovdqu		(2 * 32)(%rdi), STATE2
	vmovdqu		(3 * 32)(%rdi), STATE3
	vmovdqu		(4 * 32)(%rdi), STATE4

	/* use the aligned loop only if both src and dst are 32-byte aligned */
	mov		%rsi, %r8
	or		%rdx, %r8
	and		$0x1F, %r8
	jnz		.Ldec_u_loop

.align 4
.Ldec_a_loop:
	vmovdqa		(%rsi), MSG
	/* MSG ^= state[0] ^ perm(state[1]) ^ (state[2] & state[3]) */
	vpxor		STATE0, MSG, MSG
	vpermq		$MASK3, STATE1, T0
	vpxor		T0, MSG, MSG
	vpand		STATE2, STATE3, T0
	vpxor		T0, MSG, MSG
	vmovdqa		MSG, (%rdx)

	/* absorb the recovered plaintext block */
	call		__morus1280_update
	sub		$32, %ecx
	add		$32, %rsi
	add		$32, %rdx
	/* the length is unsigned: use an unsigned condition, as at entry */
	cmp		$32, %ecx
	jae		.Ldec_a_loop

	jmp		.Ldec_cont
.align 4
.Ldec_u_loop:
	vmovdqu		(%rsi), MSG
	vpxor		STATE0, MSG, MSG
	vpermq		$MASK3, STATE1, T0
	vpxor		T0, MSG, MSG
	vpand		STATE2, STATE3, T0
	vpxor		T0, MSG, MSG
	vmovdqu		MSG, (%rdx)

	call		__morus1280_update
	sub		$32, %ecx
	add		$32, %rsi
	add		$32, %rdx
	cmp		$32, %ecx
	jae		.Ldec_u_loop

.Ldec_cont:
	/* store the state: */
	vmovdqu		STATE0, (0 * 32)(%rdi)
	vmovdqu		STATE1, (1 * 32)(%rdi)
	vmovdqu		STATE2, (2 * 32)(%rdi)
	vmovdqu		STATE3, (3 * 32)(%rdi)
	vmovdqu		STATE4, (4 * 32)(%rdi)

.Ldec_out:
	FRAME_END
	ret
ENDPROC(crypto_morus1280_avx2_dec)
528
/*
 * void crypto_morus1280_avx2_dec_tail(void *state, const void *src, void *dst,
 *                                     unsigned int length);
 *
 * Decrypts a final partial block: the input is loaded zero-padded with
 * __load_partial, `length` bytes of plaintext are written with
 * __store_partial, and the plaintext, with every byte at index >= length
 * cleared, is absorbed into the state.
 *
 *   %rdi - state
 *   %rsi - src
 *   %rdx - dst
 *   %rcx - length (only its low byte is used for the mask)
 */
ENTRY(crypto_morus1280_avx2_dec_tail)
	FRAME_BEGIN

	/* load the state: */
	vmovdqu		(0 * 32)(%rdi), STATE0
	vmovdqu		(1 * 32)(%rdi), STATE1
	vmovdqu		(2 * 32)(%rdi), STATE2
	vmovdqu		(3 * 32)(%rdi), STATE3
	vmovdqu		(4 * 32)(%rdi), STATE4

	/* decrypt message: */
	call		__load_partial

	vpxor		STATE0, MSG, MSG
	vpermq		$MASK3, STATE1, T0
	vpxor		T0, MSG, MSG
	vpand		STATE2, STATE3, T0
	vpxor		T0, MSG, MSG
	/* __store_partial takes its data in T0 and may modify it */
	vmovdqa		MSG, T0

	call		__store_partial

	/*
	 * mask with byte count: byte i of T0 becomes all-ones when
	 * length > i, so the AND keeps just the first `length` bytes
	 */
	vmovq		%rcx, T0_LOW
	vpbroadcastb	T0_LOW, T0
	vmovdqa		.Lmorus1280_counter(%rip), T1
	vpcmpgtb	T1, T0, T0
	vpand		T0, MSG, MSG

	call		__morus1280_update

	/* store the state: */
	vmovdqu		STATE0, (0 * 32)(%rdi)
	vmovdqu		STATE1, (1 * 32)(%rdi)
	vmovdqu		STATE2, (2 * 32)(%rdi)
	vmovdqu		STATE3, (3 * 32)(%rdi)
	vmovdqu		STATE4, (4 * 32)(%rdi)

	FRAME_END
	ret
ENDPROC(crypto_morus1280_avx2_dec_tail)
574
/*
 * void crypto_morus1280_avx2_final(void *state, void *tag_xor,
 *                                  u64 assoclen, u64 cryptlen);
 *
 * Mixes the two lengths into a register copy of the state and XORs the
 * resulting 32-byte value into the buffer at tag_xor.  The state in
 * memory is only read, never written back.
 *
 *   %rdi - state
 *   %rsi - tag_xor (32 bytes, updated in place)
 *   %rdx - assoclen (bytes)
 *   %rcx - cryptlen (bytes)
 */
ENTRY(crypto_morus1280_avx2_final)
	FRAME_BEGIN

	/* bring the five state words into registers */
	vmovdqu		(0 * 32)(%rdi), STATE0
	vmovdqu		(1 * 32)(%rdi), STATE1
	vmovdqu		(2 * 32)(%rdi), STATE2
	vmovdqu		(3 * 32)(%rdi), STATE3
	vmovdqu		(4 * 32)(%rdi), STATE4

	/* state[4] ^= state[0] */
	vpxor		STATE0, STATE4, STATE4

	/*
	 * length block: qword 0 = assoclen, qword 1 = cryptlen, the rest
	 * zero; shifting left by 3 turns the byte counts into bit counts
	 */
	vpxor		MSG, MSG, MSG
	vpinsrq		$0, %rdx, MSG_LOW, MSG_LOW
	vpinsrq		$1, %rcx, MSG_LOW, MSG_LOW
	vpsllq		$3, MSG, MSG

	/* run ten updates with the length block */
	.rept 10
	call		__morus1280_update
	.endr

	/* tag ^= state[0] ^ perm(state[1]) ^ (state[2] & state[3]) */
	vmovdqu		(%rsi), MSG
	vpermq		$MASK3, STATE1, T0
	vpxor		T0, MSG, MSG
	vpxor		STATE0, MSG, MSG
	vpand		STATE2, STATE3, T0
	vpxor		T0, MSG, MSG
	vmovdqu		MSG, (%rsi)

	FRAME_END
	ret
ENDPROC(crypto_morus1280_avx2_final)