/*
 * Linux kernel mirror (for testing)
 * git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
 * tags: kernel, os, linux
 */
1/* SPDX-License-Identifier: GPL-2.0-or-later */
2/*
3 * SM4 Cipher Algorithm for ARMv8 with Crypto Extensions
4 * as specified in
5 * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
6 *
7 * Copyright (C) 2022, Alibaba Group.
8 * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
9 */
10
11#include <linux/linkage.h>
12#include <asm/assembler.h>
13#include "sm4-ce-asm.h"
14
15.arch armv8-a+crypto
16
/*
 * Map the vector register names onto their encoding numbers so the
 * hand-assembled .inst words below can look them up as .Lv<N>.4s.
 * Only the registers this file ever passes to sm4e/sm4ekey are listed
 * (v0-v15 data, v20 = RIV/RMAC, v24-v31 round keys).
 */
.irp b, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \
	20, 24, 25, 26, 27, 28, 29, 30, 31
	.set .Lv\b\().4s, \b
.endr

/*
 * sm4e <Vd>.4s, <Vn>.4s — SM4 round function, emitted as a raw
 * encoding so the file assembles even on toolchains that lack the
 * SM4 extension mnemonics.  Vd is the state, Vn holds round keys.
 */
.macro sm4e, vd, vn
	.inst 0xcec08400 | (.L\vn << 5) | .L\vd
.endm

/* sm4ekey <Vd>.4s, <Vn>.4s, <Vm>.4s — SM4 key-schedule step, raw encoding */
.macro sm4ekey, vd, vn, vm
	.inst 0xce60c800 | (.L\vm << 16) | (.L\vn << 5) | .L\vd
.endm
29
/* Register macros */

/* scratch vectors, also used as temporaries by the tweak_next() macro */
#define RTMP0 v16
#define RTMP1 v17
#define RTMP2 v18
#define RTMP3 v19

/*
 * RIV and RMAC both alias v20: RIV is used by the CBC/CFB/CTR/XTS
 * entry points, RMAC only by sm4_ce_mac_update, so the two uses
 * never overlap.  RMASK holds the XTS reduction mask.
 */
#define RIV v20
#define RMAC v20
#define RMASK v21
40
41
.align 3
SYM_FUNC_START(sm4_ce_expand_key)
	/* Expand a 128-bit SM4 key into 32 encryption and 32 decryption
	 * round keys using the sm4ekey instruction.
	 *
	 * input:
	 *   x0: 128-bit key
	 *   x1: rkey_enc
	 *   x2: rkey_dec
	 *   x3: fk array
	 *   x4: ck array
	 */
	ld1 {v0.16b}, [x0];
	rev32 v0.16b, v0.16b;		/* key bytes -> big-endian words */
	ld1 {v1.16b}, [x3];
	/* load ck */
	ld1 {v24.16b-v27.16b}, [x4], #64;
	ld1 {v28.16b-v31.16b}, [x4];

	/* input ^ fk */
	eor v0.16b, v0.16b, v1.16b;

	/* each sm4ekey step derives 4 round keys from the previous 4 */
	sm4ekey v0.4s, v0.4s, v24.4s;
	sm4ekey v1.4s, v0.4s, v25.4s;
	sm4ekey v2.4s, v1.4s, v26.4s;
	sm4ekey v3.4s, v2.4s, v27.4s;
	sm4ekey v4.4s, v3.4s, v28.4s;
	sm4ekey v5.4s, v4.4s, v29.4s;
	sm4ekey v6.4s, v5.4s, v30.4s;
	sm4ekey v7.4s, v6.4s, v31.4s;

	adr_l x5, .Lbswap128_mask
	ld1 {v24.16b}, [x5]

	/* store the 32 encryption round keys (v0..v7) */
	st1 {v0.16b-v3.16b}, [x1], #64;
	st1 {v4.16b-v7.16b}, [x1];

	/* Decryption round keys are the encryption keys in reverse
	 * order: tbl with .Lbswap128_mask reverses the word order
	 * inside each vector, and the vectors are stored last-first
	 * (v7 down to v0).
	 */
	tbl v16.16b, {v7.16b}, v24.16b
	tbl v17.16b, {v6.16b}, v24.16b
	tbl v18.16b, {v5.16b}, v24.16b
	tbl v19.16b, {v4.16b}, v24.16b
	tbl v20.16b, {v3.16b}, v24.16b
	tbl v21.16b, {v2.16b}, v24.16b
	tbl v22.16b, {v1.16b}, v24.16b
	tbl v23.16b, {v0.16b}, v24.16b

	st1 {v16.16b-v19.16b}, [x2], #64
	st1 {v20.16b-v23.16b}, [x2]

	ret;
SYM_FUNC_END(sm4_ce_expand_key)
90
.align 3
SYM_FUNC_START(sm4_ce_crypt_block)
	/* Encrypt/decrypt a single 16-byte block in place through the
	 * round keys loaded by SM4_PREPARE (macro from sm4-ce-asm.h).
	 *
	 * input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 */
	SM4_PREPARE(x0)

	ld1 {v0.16b}, [x2];
	SM4_CRYPT_BLK(v0);
	st1 {v0.16b}, [x1];

	ret;
SYM_FUNC_END(sm4_ce_crypt_block)
106
.align 3
SYM_FUNC_START(sm4_ce_crypt)
	/* ECB-style bulk processing: 8 blocks per iteration, then a
	 * 4-block step, then single blocks for the remainder.
	 *
	 * input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   w3: nblocks
	 */
	SM4_PREPARE(x0)

.Lcrypt_loop_blk:
	sub w3, w3, #8;
	tbnz w3, #31, .Lcrypt_tail8;	/* went negative: < 8 blocks left */

	ld1 {v0.16b-v3.16b}, [x2], #64;
	ld1 {v4.16b-v7.16b}, [x2], #64;

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);

	st1 {v0.16b-v3.16b}, [x1], #64;
	st1 {v4.16b-v7.16b}, [x1], #64;

	cbz w3, .Lcrypt_end;
	b .Lcrypt_loop_blk;

.Lcrypt_tail8:
	add w3, w3, #8;			/* undo the speculative subtract */
	cmp w3, #4;
	blt .Lcrypt_tail4;

	sub w3, w3, #4;

	ld1 {v0.16b-v3.16b}, [x2], #64;
	SM4_CRYPT_BLK4(v0, v1, v2, v3);
	st1 {v0.16b-v3.16b}, [x1], #64;

	cbz w3, .Lcrypt_end;

.Lcrypt_tail4:
	/* one block at a time until w3 reaches zero */
	sub w3, w3, #1;

	ld1 {v0.16b}, [x2], #16;
	SM4_CRYPT_BLK(v0);
	st1 {v0.16b}, [x1], #16;

	cbnz w3, .Lcrypt_tail4;

.Lcrypt_end:
	ret;
SYM_FUNC_END(sm4_ce_crypt)
157
.align 3
SYM_FUNC_START(sm4_ce_cbc_enc)
	/* CBC encryption.  Inherently serial: each ciphertext block is
	 * the IV for the next, so blocks are encrypted one at a time
	 * even in the 4x-unrolled loop (the unroll only amortizes the
	 * load/store traffic).
	 *
	 * input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks
	 */
	SM4_PREPARE(x0)

	ld1 {RIV.16b}, [x3]

.Lcbc_enc_loop_4x:
	cmp w4, #4
	blt .Lcbc_enc_loop_1x

	sub w4, w4, #4

	ld1 {v0.16b-v3.16b}, [x2], #64

	/* C[i] = E(P[i] ^ C[i-1]); chain through v0..v3 */
	eor v0.16b, v0.16b, RIV.16b
	SM4_CRYPT_BLK(v0)
	eor v1.16b, v1.16b, v0.16b
	SM4_CRYPT_BLK(v1)
	eor v2.16b, v2.16b, v1.16b
	SM4_CRYPT_BLK(v2)
	eor v3.16b, v3.16b, v2.16b
	SM4_CRYPT_BLK(v3)

	st1 {v0.16b-v3.16b}, [x1], #64
	mov RIV.16b, v3.16b		/* last ciphertext becomes next IV */

	cbz w4, .Lcbc_enc_end
	b .Lcbc_enc_loop_4x

.Lcbc_enc_loop_1x:
	sub w4, w4, #1

	ld1 {v0.16b}, [x2], #16

	eor RIV.16b, RIV.16b, v0.16b
	SM4_CRYPT_BLK(RIV)		/* RIV now holds the new ciphertext */

	st1 {RIV.16b}, [x1], #16

	cbnz w4, .Lcbc_enc_loop_1x

.Lcbc_enc_end:
	/* store new IV */
	st1 {RIV.16b}, [x3]

	ret
SYM_FUNC_END(sm4_ce_cbc_enc)
212
.align 3
SYM_FUNC_START(sm4_ce_cbc_dec)
	/* CBC decryption.  Unlike encryption this parallelizes: up to 8
	 * blocks are decrypted at once.  The ciphertext is kept intact
	 * in v0..v7 (for the XOR chain and the next IV) while byte-
	 * swapped copies in v8..v15 go through the _BE crypt macros,
	 * which appear to expect pre-swapped input (defined in
	 * sm4-ce-asm.h — not visible here).
	 *
	 * input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks
	 */
	SM4_PREPARE(x0)

	ld1 {RIV.16b}, [x3]

.Lcbc_dec_loop_8x:
	sub w4, w4, #8
	tbnz w4, #31, .Lcbc_dec_4x	/* went negative: < 8 blocks left */

	ld1 {v0.16b-v3.16b}, [x2], #64
	ld1 {v4.16b-v7.16b}, [x2], #64

	rev32 v8.16b, v0.16b
	rev32 v9.16b, v1.16b
	rev32 v10.16b, v2.16b
	rev32 v11.16b, v3.16b
	rev32 v12.16b, v4.16b
	rev32 v13.16b, v5.16b
	rev32 v14.16b, v6.16b
	rev32 v15.16b, v7.16b

	SM4_CRYPT_BLK8_BE(v8, v9, v10, v11, v12, v13, v14, v15)

	/* P[i] = D(C[i]) ^ C[i-1]; first block chains from RIV */
	eor v8.16b, v8.16b, RIV.16b
	eor v9.16b, v9.16b, v0.16b
	eor v10.16b, v10.16b, v1.16b
	eor v11.16b, v11.16b, v2.16b
	eor v12.16b, v12.16b, v3.16b
	eor v13.16b, v13.16b, v4.16b
	eor v14.16b, v14.16b, v5.16b
	eor v15.16b, v15.16b, v6.16b

	st1 {v8.16b-v11.16b}, [x1], #64
	st1 {v12.16b-v15.16b}, [x1], #64

	mov RIV.16b, v7.16b		/* last ciphertext becomes next IV */

	cbz w4, .Lcbc_dec_end
	b .Lcbc_dec_loop_8x

.Lcbc_dec_4x:
	add w4, w4, #8			/* undo the speculative subtract */
	cmp w4, #4
	blt .Lcbc_dec_loop_1x

	sub w4, w4, #4

	ld1 {v0.16b-v3.16b}, [x2], #64

	rev32 v8.16b, v0.16b
	rev32 v9.16b, v1.16b
	rev32 v10.16b, v2.16b
	rev32 v11.16b, v3.16b

	SM4_CRYPT_BLK4_BE(v8, v9, v10, v11)

	eor v8.16b, v8.16b, RIV.16b
	eor v9.16b, v9.16b, v0.16b
	eor v10.16b, v10.16b, v1.16b
	eor v11.16b, v11.16b, v2.16b

	st1 {v8.16b-v11.16b}, [x1], #64

	mov RIV.16b, v3.16b

	cbz w4, .Lcbc_dec_end

.Lcbc_dec_loop_1x:
	sub w4, w4, #1

	ld1 {v0.16b}, [x2], #16

	rev32 v8.16b, v0.16b

	SM4_CRYPT_BLK_BE(v8)

	eor v8.16b, v8.16b, RIV.16b
	st1 {v8.16b}, [x1], #16

	mov RIV.16b, v0.16b		/* original ciphertext is next IV */

	cbnz w4, .Lcbc_dec_loop_1x

.Lcbc_dec_end:
	/* store new IV */
	st1 {RIV.16b}, [x3]

	ret
SYM_FUNC_END(sm4_ce_cbc_dec)
310
.align 3
SYM_FUNC_START(sm4_ce_cbc_cts_enc)
	/* CBC ciphertext-stealing encryption for the final partial
	 * chunk: one full block plus a partial block (presumably
	 * 16 < nbytes < 32 — caller contract, confirm against the glue
	 * code).  x5 = nbytes - 16 = length of the partial block Pn.
	 *
	 * input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nbytes
	 */
	SM4_PREPARE(x0)

	sub w5, w4, #16
	uxtw x5, w5

	ld1 {RIV.16b}, [x3]

	/* En-1 = E(Pn-1 ^ IV) */
	ld1 {v0.16b}, [x2]
	eor RIV.16b, RIV.16b, v0.16b
	SM4_CRYPT_BLK(RIV)

	/* load permute table: v3 shifts/truncates, v4 zero-pads */
	adr_l x6, .Lcts_permute_table
	add x7, x6, #32
	add x6, x6, x5
	sub x7, x7, x5
	ld1 {v3.16b}, [x6]
	ld1 {v4.16b}, [x7]

	/* overlapping loads */
	add x2, x2, x5
	ld1 {v1.16b}, [x2]

	/* create Cn from En-1 */
	tbl v0.16b, {RIV.16b}, v3.16b
	/* padding Pn with zeros */
	tbl v1.16b, {v1.16b}, v4.16b

	/* Cn-1 = E(padded Pn ^ En-1) */
	eor v1.16b, v1.16b, RIV.16b
	SM4_CRYPT_BLK(v1)

	/* overlapping stores: full Cn-1 first block, truncated Cn after */
	add x5, x1, x5
	st1 {v0.16b}, [x5]
	st1 {v1.16b}, [x1]

	ret
SYM_FUNC_END(sm4_ce_cbc_cts_enc)
358
.align 3
SYM_FUNC_START(sm4_ce_cbc_cts_dec)
	/* CBC ciphertext-stealing decryption — inverse of
	 * sm4_ce_cbc_cts_enc.  x5 = nbytes - 16 = length of the
	 * partial final block.
	 *
	 * input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nbytes
	 */
	SM4_PREPARE(x0)

	sub w5, w4, #16
	uxtw x5, w5

	ld1 {RIV.16b}, [x3]

	/* load permute table */
	adr_l x6, .Lcts_permute_table
	add x7, x6, #32
	add x6, x6, x5
	sub x7, x7, x5
	ld1 {v3.16b}, [x6]
	ld1 {v4.16b}, [x7]

	/* overlapping loads: v0 = Cn-1 (full), v1 = last 16 bytes (Cn) */
	ld1 {v0.16b}, [x2], x5
	ld1 {v1.16b}, [x2]

	/* Xn = D(Cn-1) */
	SM4_CRYPT_BLK(v0)
	/* select the first Ln bytes of Xn to create Pn */
	tbl v2.16b, {v0.16b}, v3.16b
	eor v2.16b, v2.16b, v1.16b

	/* overwrite the first Ln bytes with Cn to create En-1 */
	tbx v0.16b, {v1.16b}, v4.16b
	SM4_CRYPT_BLK(v0)
	eor v0.16b, v0.16b, RIV.16b	/* Pn-1 = D(En-1) ^ IV */

	/* overlapping stores */
	add x5, x1, x5
	st1 {v2.16b}, [x5]
	st1 {v0.16b}, [x1]

	ret
SYM_FUNC_END(sm4_ce_cbc_cts_dec)
404
.align 3
SYM_FUNC_START(sm4_ce_cfb_enc)
	/* CFB encryption: C[i] = P[i] ^ E(C[i-1]), seeded with the IV.
	 * Serial like CBC encryption; the 4x loop only batches the
	 * loads/stores.
	 *
	 * input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks
	 */
	SM4_PREPARE(x0)

	ld1 {RIV.16b}, [x3]

.Lcfb_enc_loop_4x:
	cmp w4, #4
	blt .Lcfb_enc_loop_1x

	sub w4, w4, #4

	ld1 {v0.16b-v3.16b}, [x2], #64

	/* v8 = E(previous ciphertext); rev32 feeds the _BE macro
	 * variant while leaving the chain value untouched */
	rev32 v8.16b, RIV.16b
	SM4_CRYPT_BLK_BE(v8)
	eor v0.16b, v0.16b, v8.16b

	rev32 v8.16b, v0.16b
	SM4_CRYPT_BLK_BE(v8)
	eor v1.16b, v1.16b, v8.16b

	rev32 v8.16b, v1.16b
	SM4_CRYPT_BLK_BE(v8)
	eor v2.16b, v2.16b, v8.16b

	rev32 v8.16b, v2.16b
	SM4_CRYPT_BLK_BE(v8)
	eor v3.16b, v3.16b, v8.16b

	st1 {v0.16b-v3.16b}, [x1], #64
	mov RIV.16b, v3.16b		/* last ciphertext chains onward */

	cbz w4, .Lcfb_enc_end
	b .Lcfb_enc_loop_4x

.Lcfb_enc_loop_1x:
	sub w4, w4, #1

	ld1 {v0.16b}, [x2], #16

	SM4_CRYPT_BLK(RIV)
	eor RIV.16b, RIV.16b, v0.16b	/* RIV = new ciphertext */

	st1 {RIV.16b}, [x1], #16

	cbnz w4, .Lcfb_enc_loop_1x

.Lcfb_enc_end:
	/* store new IV */
	st1 {RIV.16b}, [x3]

	ret
SYM_FUNC_END(sm4_ce_cfb_enc)
466
.align 3
SYM_FUNC_START(sm4_ce_cfb_dec)
	/* CFB decryption: P[i] = C[i] ^ E(C[i-1]).  All keystream
	 * inputs (IV + first 7 ciphertexts) are known up front, so 8
	 * blocks are processed in parallel; ciphertext stays intact in
	 * v0..v7 while swapped copies run through the _BE macros.
	 *
	 * input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks
	 */
	SM4_PREPARE(x0)

	ld1 {RIV.16b}, [x3]

.Lcfb_dec_loop_8x:
	sub w4, w4, #8
	tbnz w4, #31, .Lcfb_dec_4x	/* went negative: < 8 blocks left */

	ld1 {v0.16b-v3.16b}, [x2], #64
	ld1 {v4.16b-v7.16b}, [x2], #64

	rev32 v8.16b, RIV.16b
	rev32 v9.16b, v0.16b
	rev32 v10.16b, v1.16b
	rev32 v11.16b, v2.16b
	rev32 v12.16b, v3.16b
	rev32 v13.16b, v4.16b
	rev32 v14.16b, v5.16b
	rev32 v15.16b, v6.16b

	SM4_CRYPT_BLK8_BE(v8, v9, v10, v11, v12, v13, v14, v15)

	mov RIV.16b, v7.16b		/* last ciphertext is next IV */

	eor v0.16b, v0.16b, v8.16b
	eor v1.16b, v1.16b, v9.16b
	eor v2.16b, v2.16b, v10.16b
	eor v3.16b, v3.16b, v11.16b
	eor v4.16b, v4.16b, v12.16b
	eor v5.16b, v5.16b, v13.16b
	eor v6.16b, v6.16b, v14.16b
	eor v7.16b, v7.16b, v15.16b

	st1 {v0.16b-v3.16b}, [x1], #64
	st1 {v4.16b-v7.16b}, [x1], #64

	cbz w4, .Lcfb_dec_end
	b .Lcfb_dec_loop_8x

.Lcfb_dec_4x:
	add w4, w4, #8			/* undo the speculative subtract */
	cmp w4, #4
	blt .Lcfb_dec_loop_1x

	sub w4, w4, #4

	ld1 {v0.16b-v3.16b}, [x2], #64

	rev32 v8.16b, RIV.16b
	rev32 v9.16b, v0.16b
	rev32 v10.16b, v1.16b
	rev32 v11.16b, v2.16b

	SM4_CRYPT_BLK4_BE(v8, v9, v10, v11)

	mov RIV.16b, v3.16b

	eor v0.16b, v0.16b, v8.16b
	eor v1.16b, v1.16b, v9.16b
	eor v2.16b, v2.16b, v10.16b
	eor v3.16b, v3.16b, v11.16b

	st1 {v0.16b-v3.16b}, [x1], #64

	cbz w4, .Lcfb_dec_end

.Lcfb_dec_loop_1x:
	sub w4, w4, #1

	ld1 {v0.16b}, [x2], #16

	SM4_CRYPT_BLK(RIV)

	eor RIV.16b, RIV.16b, v0.16b	/* plaintext = C ^ E(prev C) */
	st1 {RIV.16b}, [x1], #16

	mov RIV.16b, v0.16b		/* ciphertext chains onward */

	cbnz w4, .Lcfb_dec_loop_1x

.Lcfb_dec_end:
	/* store new IV */
	st1 {RIV.16b}, [x3]

	ret
SYM_FUNC_END(sm4_ce_cfb_dec)
562
.align 3
SYM_FUNC_START(sm4_ce_ctr_enc)
	/* CTR mode: keystream = E(counter++); fully parallel, 8 blocks
	 * per main-loop iteration.  The 128-bit big-endian counter is
	 * kept CPU-endian in x7 (high 64 bits) : x8 (low 64 bits) and
	 * incremented with adds/adc.
	 *
	 * input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: ctr (big endian, 128 bit)
	 *   w4: nblocks
	 */
	SM4_PREPARE(x0)

	ldp x7, x8, [x3]
	rev x7, x7			/* big-endian -> CPU-endian */
	rev x8, x8

.Lctr_loop_8x:
	sub w4, w4, #8
	tbnz w4, #31, .Lctr_4x		/* went negative: < 8 blocks left */

/* Materialize the current counter into vctr as big-endian bytes
 * (rev64 swaps back) and advance x7:x8 by one with carry. */
#define inc_le128(vctr) \
		mov vctr.d[1], x8; \
		mov vctr.d[0], x7; \
		adds x8, x8, #1; \
		rev64 vctr.16b, vctr.16b; \
		adc x7, x7, xzr;

	/* construct CTRs */
	inc_le128(v0)			/* +0 */
	inc_le128(v1)			/* +1 */
	inc_le128(v2)			/* +2 */
	inc_le128(v3)			/* +3 */
	inc_le128(v4)			/* +4 */
	inc_le128(v5)			/* +5 */
	inc_le128(v6)			/* +6 */
	inc_le128(v7)			/* +7 */

	ld1 {v8.16b-v11.16b}, [x2], #64
	ld1 {v12.16b-v15.16b}, [x2], #64

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	/* ciphertext = keystream ^ plaintext */
	eor v0.16b, v0.16b, v8.16b
	eor v1.16b, v1.16b, v9.16b
	eor v2.16b, v2.16b, v10.16b
	eor v3.16b, v3.16b, v11.16b
	eor v4.16b, v4.16b, v12.16b
	eor v5.16b, v5.16b, v13.16b
	eor v6.16b, v6.16b, v14.16b
	eor v7.16b, v7.16b, v15.16b

	st1 {v0.16b-v3.16b}, [x1], #64
	st1 {v4.16b-v7.16b}, [x1], #64

	cbz w4, .Lctr_end
	b .Lctr_loop_8x

.Lctr_4x:
	add w4, w4, #8			/* undo the speculative subtract */
	cmp w4, #4
	blt .Lctr_loop_1x

	sub w4, w4, #4

	/* construct CTRs */
	inc_le128(v0)			/* +0 */
	inc_le128(v1)			/* +1 */
	inc_le128(v2)			/* +2 */
	inc_le128(v3)			/* +3 */

	ld1 {v8.16b-v11.16b}, [x2], #64

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	eor v0.16b, v0.16b, v8.16b
	eor v1.16b, v1.16b, v9.16b
	eor v2.16b, v2.16b, v10.16b
	eor v3.16b, v3.16b, v11.16b

	st1 {v0.16b-v3.16b}, [x1], #64

	cbz w4, .Lctr_end

.Lctr_loop_1x:
	sub w4, w4, #1

	/* construct CTRs */
	inc_le128(v0)

	ld1 {v8.16b}, [x2], #16

	SM4_CRYPT_BLK(v0)

	eor v0.16b, v0.16b, v8.16b
	st1 {v0.16b}, [x1], #16

	cbnz w4, .Lctr_loop_1x

.Lctr_end:
	/* store new CTR (back to big-endian memory layout) */
	rev x7, x7
	rev x8, x8
	stp x7, x8, [x3]

	ret
SYM_FUNC_END(sm4_ce_ctr_enc)
668
669
/*
 * XTS tweak update: vt = vin * x in GF(2^128) (little-endian layout).
 * Each 64-bit lane is doubled (add vt, vin, vin); sshr broadcasts each
 * lane's carry-out (sign) bit, which is masked by RMASK = {1, 0x87} and
 * cross-applied via ext #8: the low lane's carry becomes +1 in the high
 * lane, and the high lane's carry (bit 127) folds back into the low
 * lane as the 0x87 reduction polynomial.  RTMP is a caller-chosen
 * scratch vector; flags are untouched.
 */
#define tweak_next(vt, vin, RTMP) \
		sshr RTMP.2d, vin.2d, #63; \
		and RTMP.16b, RTMP.16b, RMASK.16b; \
		add vt.2d, vin.2d, vin.2d; \
		ext RTMP.16b, RTMP.16b, RTMP.16b, #8; \
		eor vt.16b, vt.16b, RTMP.16b;
676
.align 3
SYM_FUNC_START(sm4_ce_xts_enc)
	/* XTS encryption with optional ciphertext stealing.  v8 carries
	 * the current tweak throughout; successive tweaks are derived
	 * with tweak_next() into v9..v15 so 8 blocks run in parallel.
	 *
	 * input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: tweak (big endian, 128 bit)
	 *   w4: nbytes
	 *   x5: round key array for IV (NULL to skip tweak encryption)
	 */
	ld1 {v8.16b}, [x3]

	cbz x5, .Lxts_enc_nofirst

	SM4_PREPARE(x5)

	/* Generate first tweak */
	SM4_CRYPT_BLK(v8)

.Lxts_enc_nofirst:
	SM4_PREPARE(x0)

	/* w5 = tail bytes (nbytes % 16), w4 = full blocks; if there is
	 * a tail, reserve the last full block for ciphertext stealing */
	ands w5, w4, #15
	lsr w4, w4, #4
	sub w6, w4, #1
	csel w4, w4, w6, eq
	uxtw x5, w5

	/* RMASK = {1, 0x87}: per-lane carry masks for tweak_next() */
	movi RMASK.2s, #0x1
	movi RTMP0.2s, #0x87
	uzp1 RMASK.4s, RMASK.4s, RTMP0.4s

	cbz w4, .Lxts_enc_cts

.Lxts_enc_loop_8x:
	sub w4, w4, #8
	tbnz w4, #31, .Lxts_enc_4x	/* went negative: < 8 blocks left */

	tweak_next( v9, v8, RTMP0)
	tweak_next(v10, v9, RTMP1)
	tweak_next(v11, v10, RTMP2)
	tweak_next(v12, v11, RTMP3)
	tweak_next(v13, v12, RTMP0)
	tweak_next(v14, v13, RTMP1)
	tweak_next(v15, v14, RTMP2)

	/* C = E(P ^ T) ^ T for each block/tweak pair */
	ld1 {v0.16b-v3.16b}, [x2], #64
	ld1 {v4.16b-v7.16b}, [x2], #64
	eor v0.16b, v0.16b, v8.16b
	eor v1.16b, v1.16b, v9.16b
	eor v2.16b, v2.16b, v10.16b
	eor v3.16b, v3.16b, v11.16b
	eor v4.16b, v4.16b, v12.16b
	eor v5.16b, v5.16b, v13.16b
	eor v6.16b, v6.16b, v14.16b
	eor v7.16b, v7.16b, v15.16b

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	eor v0.16b, v0.16b, v8.16b
	eor v1.16b, v1.16b, v9.16b
	eor v2.16b, v2.16b, v10.16b
	eor v3.16b, v3.16b, v11.16b
	eor v4.16b, v4.16b, v12.16b
	eor v5.16b, v5.16b, v13.16b
	eor v6.16b, v6.16b, v14.16b
	eor v7.16b, v7.16b, v15.16b
	st1 {v0.16b-v3.16b}, [x1], #64
	st1 {v4.16b-v7.16b}, [x1], #64

	tweak_next(v8, v15, RTMP3)	/* advance tweak past this batch */

	cbz w4, .Lxts_enc_cts
	b .Lxts_enc_loop_8x

.Lxts_enc_4x:
	add w4, w4, #8			/* undo the speculative subtract */
	cmp w4, #4
	blt .Lxts_enc_loop_1x

	sub w4, w4, #4

	tweak_next( v9, v8, RTMP0)
	tweak_next(v10, v9, RTMP1)
	tweak_next(v11, v10, RTMP2)

	ld1 {v0.16b-v3.16b}, [x2], #64
	eor v0.16b, v0.16b, v8.16b
	eor v1.16b, v1.16b, v9.16b
	eor v2.16b, v2.16b, v10.16b
	eor v3.16b, v3.16b, v11.16b

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	eor v0.16b, v0.16b, v8.16b
	eor v1.16b, v1.16b, v9.16b
	eor v2.16b, v2.16b, v10.16b
	eor v3.16b, v3.16b, v11.16b
	st1 {v0.16b-v3.16b}, [x1], #64

	tweak_next(v8, v11, RTMP3)

	cbz w4, .Lxts_enc_cts

.Lxts_enc_loop_1x:
	sub w4, w4, #1

	ld1 {v0.16b}, [x2], #16
	eor v0.16b, v0.16b, v8.16b

	SM4_CRYPT_BLK(v0)

	eor v0.16b, v0.16b, v8.16b
	st1 {v0.16b}, [x1], #16

	tweak_next(v8, v8, RTMP0)

	cbnz w4, .Lxts_enc_loop_1x

.Lxts_enc_cts:
	cbz x5, .Lxts_enc_end		/* no tail: done */

	/* cipher text stealing */

	/* encrypt the reserved full block with the current tweak v8 */
	tweak_next(v9, v8, RTMP0)
	ld1 {v0.16b}, [x2]
	eor v0.16b, v0.16b, v8.16b
	SM4_CRYPT_BLK(v0)
	eor v0.16b, v0.16b, v8.16b

	/* load permute table */
	adr_l x6, .Lcts_permute_table
	add x7, x6, #32
	add x6, x6, x5
	sub x7, x7, x5
	ld1 {v3.16b}, [x6]
	ld1 {v4.16b}, [x7]

	/* overlapping loads */
	add x2, x2, x5
	ld1 {v1.16b}, [x2]

	/* create Cn from En-1 */
	tbl v2.16b, {v0.16b}, v3.16b
	/* padding Pn with En-1 at the end */
	tbx v0.16b, {v1.16b}, v4.16b

	/* encrypt the stitched block with the next tweak v9 */
	eor v0.16b, v0.16b, v9.16b
	SM4_CRYPT_BLK(v0)
	eor v0.16b, v0.16b, v9.16b


	/* overlapping stores */
	add x5, x1, x5
	st1 {v2.16b}, [x5]
	st1 {v0.16b}, [x1]

	b .Lxts_enc_ret

.Lxts_enc_end:
	/* store new tweak */
	st1 {v8.16b}, [x3]

.Lxts_enc_ret:
	ret
SYM_FUNC_END(sm4_ce_xts_enc)
843
.align 3
SYM_FUNC_START(sm4_ce_xts_dec)
	/* XTS decryption with optional ciphertext stealing.  Mirrors
	 * sm4_ce_xts_enc; the only asymmetry is in the CTS tail, where
	 * the tweak order is swapped (next tweak v9 on the full block,
	 * current tweak v8 on the stitched block).
	 *
	 * input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: tweak (big endian, 128 bit)
	 *   w4: nbytes
	 *   x5: round key array for IV (NULL to skip tweak encryption)
	 */
	ld1 {v8.16b}, [x3]

	cbz x5, .Lxts_dec_nofirst

	SM4_PREPARE(x5)

	/* Generate first tweak */
	SM4_CRYPT_BLK(v8)

.Lxts_dec_nofirst:
	SM4_PREPARE(x0)

	/* w5 = tail bytes, w4 = full blocks; reserve one block for CTS
	 * when a tail exists */
	ands w5, w4, #15
	lsr w4, w4, #4
	sub w6, w4, #1
	csel w4, w4, w6, eq
	uxtw x5, w5

	/* RMASK = {1, 0x87}: per-lane carry masks for tweak_next() */
	movi RMASK.2s, #0x1
	movi RTMP0.2s, #0x87
	uzp1 RMASK.4s, RMASK.4s, RTMP0.4s

	cbz w4, .Lxts_dec_cts

.Lxts_dec_loop_8x:
	sub w4, w4, #8
	tbnz w4, #31, .Lxts_dec_4x	/* went negative: < 8 blocks left */

	tweak_next( v9, v8, RTMP0)
	tweak_next(v10, v9, RTMP1)
	tweak_next(v11, v10, RTMP2)
	tweak_next(v12, v11, RTMP3)
	tweak_next(v13, v12, RTMP0)
	tweak_next(v14, v13, RTMP1)
	tweak_next(v15, v14, RTMP2)

	/* P = D(C ^ T) ^ T for each block/tweak pair */
	ld1 {v0.16b-v3.16b}, [x2], #64
	ld1 {v4.16b-v7.16b}, [x2], #64
	eor v0.16b, v0.16b, v8.16b
	eor v1.16b, v1.16b, v9.16b
	eor v2.16b, v2.16b, v10.16b
	eor v3.16b, v3.16b, v11.16b
	eor v4.16b, v4.16b, v12.16b
	eor v5.16b, v5.16b, v13.16b
	eor v6.16b, v6.16b, v14.16b
	eor v7.16b, v7.16b, v15.16b

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	eor v0.16b, v0.16b, v8.16b
	eor v1.16b, v1.16b, v9.16b
	eor v2.16b, v2.16b, v10.16b
	eor v3.16b, v3.16b, v11.16b
	eor v4.16b, v4.16b, v12.16b
	eor v5.16b, v5.16b, v13.16b
	eor v6.16b, v6.16b, v14.16b
	eor v7.16b, v7.16b, v15.16b
	st1 {v0.16b-v3.16b}, [x1], #64
	st1 {v4.16b-v7.16b}, [x1], #64

	tweak_next(v8, v15, RTMP3)	/* advance tweak past this batch */

	cbz w4, .Lxts_dec_cts
	b .Lxts_dec_loop_8x

.Lxts_dec_4x:
	add w4, w4, #8			/* undo the speculative subtract */
	cmp w4, #4
	blt .Lxts_dec_loop_1x

	sub w4, w4, #4

	tweak_next( v9, v8, RTMP0)
	tweak_next(v10, v9, RTMP1)
	tweak_next(v11, v10, RTMP2)

	ld1 {v0.16b-v3.16b}, [x2], #64
	eor v0.16b, v0.16b, v8.16b
	eor v1.16b, v1.16b, v9.16b
	eor v2.16b, v2.16b, v10.16b
	eor v3.16b, v3.16b, v11.16b

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	eor v0.16b, v0.16b, v8.16b
	eor v1.16b, v1.16b, v9.16b
	eor v2.16b, v2.16b, v10.16b
	eor v3.16b, v3.16b, v11.16b
	st1 {v0.16b-v3.16b}, [x1], #64

	tweak_next(v8, v11, RTMP3)

	cbz w4, .Lxts_dec_cts

.Lxts_dec_loop_1x:
	sub w4, w4, #1

	ld1 {v0.16b}, [x2], #16
	eor v0.16b, v0.16b, v8.16b

	SM4_CRYPT_BLK(v0)

	eor v0.16b, v0.16b, v8.16b
	st1 {v0.16b}, [x1], #16

	tweak_next(v8, v8, RTMP0)

	cbnz w4, .Lxts_dec_loop_1x

.Lxts_dec_cts:
	cbz x5, .Lxts_dec_end		/* no tail: done */

	/* cipher text stealing */

	/* decrypt the reserved full block with the NEXT tweak v9
	 * (order swapped vs. encryption) */
	tweak_next(v9, v8, RTMP0)
	ld1 {v0.16b}, [x2]
	eor v0.16b, v0.16b, v9.16b
	SM4_CRYPT_BLK(v0)
	eor v0.16b, v0.16b, v9.16b

	/* load permute table */
	adr_l x6, .Lcts_permute_table
	add x7, x6, #32
	add x6, x6, x5
	sub x7, x7, x5
	ld1 {v3.16b}, [x6]
	ld1 {v4.16b}, [x7]

	/* overlapping loads */
	add x2, x2, x5
	ld1 {v1.16b}, [x2]

	/* create Cn from En-1 */
	tbl v2.16b, {v0.16b}, v3.16b
	/* padding Pn with En-1 at the end */
	tbx v0.16b, {v1.16b}, v4.16b

	/* decrypt the stitched block with the current tweak v8 */
	eor v0.16b, v0.16b, v8.16b
	SM4_CRYPT_BLK(v0)
	eor v0.16b, v0.16b, v8.16b


	/* overlapping stores */
	add x5, x1, x5
	st1 {v2.16b}, [x5]
	st1 {v0.16b}, [x1]

	b .Lxts_dec_ret

.Lxts_dec_end:
	/* store new tweak */
	st1 {v8.16b}, [x3]

.Lxts_dec_ret:
	ret
SYM_FUNC_END(sm4_ce_xts_dec)
1010
.align 3
SYM_FUNC_START(sm4_ce_mac_update)
	/* CBC-MAC style digest update: RMAC = E(RMAC ^ block) for each
	 * input block.  enc_before forces an initial encryption of the
	 * incoming digest; when enc_after is zero the final block is
	 * only XORed in (left unencrypted for the caller), which is why
	 * one block is reserved below.
	 *
	 * input:
	 *   x0: round key array, CTX
	 *   x1: digest
	 *   x2: src
	 *   w3: nblocks
	 *   w4: enc_before
	 *   w5: enc_after
	 */
	SM4_PREPARE(x0)

	ld1 {RMAC.16b}, [x1]

	cbz w4, .Lmac_update

	SM4_CRYPT_BLK(RMAC)		/* enc_before: encrypt digest first */

.Lmac_update:
	cbz w3, .Lmac_ret

	/* if enc_after == 0, hold back the last block (w3 -= 1) */
	sub w6, w3, #1
	cmp w5, wzr
	csel w3, w3, w6, ne

	cbz w3, .Lmac_end

.Lmac_loop_4x:
	cmp w3, #4
	blt .Lmac_loop_1x

	sub w3, w3, #4

	ld1 {v0.16b-v3.16b}, [x2], #64

	/* serial chain: each block folds into the running MAC */
	eor RMAC.16b, RMAC.16b, v0.16b
	SM4_CRYPT_BLK(RMAC)
	eor RMAC.16b, RMAC.16b, v1.16b
	SM4_CRYPT_BLK(RMAC)
	eor RMAC.16b, RMAC.16b, v2.16b
	SM4_CRYPT_BLK(RMAC)
	eor RMAC.16b, RMAC.16b, v3.16b
	SM4_CRYPT_BLK(RMAC)

	cbz w3, .Lmac_end
	b .Lmac_loop_4x

.Lmac_loop_1x:
	sub w3, w3, #1

	ld1 {v0.16b}, [x2], #16

	eor RMAC.16b, RMAC.16b, v0.16b
	SM4_CRYPT_BLK(RMAC)

	cbnz w3, .Lmac_loop_1x


.Lmac_end:
	cbnz w5, .Lmac_ret

	/* enc_after == 0: XOR in the reserved block without encrypting */
	ld1 {v0.16b}, [x2], #16
	eor RMAC.16b, RMAC.16b, v0.16b

.Lmac_ret:
	st1 {RMAC.16b}, [x1]
	ret
SYM_FUNC_END(sm4_ce_mac_update)
1079
1080
	.section ".rodata", "a"
	.align 4
/* tbl index vector that reverses the order of the four 32-bit words in
 * a 128-bit vector (byte order within each word is preserved); used by
 * sm4_ce_expand_key to build the decryption key schedule. */
.Lbswap128_mask:
	.byte 0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b
	.byte 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03

/* Sliding tbl/tbx window for ciphertext stealing: indexing at offset
 * Ln selects a left-shifted-by-Ln view of the source (0xff entries
 * yield zero with tbl, or leave the destination byte with tbx). */
.Lcts_permute_table:
	.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
	.byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf
	.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff