/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Bit sliced AES using NEON instructions
 *
 * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
 */

/*
 * The algorithm implemented here is described in detail by the paper
 * 'Faster and Timing-Attack Resistant AES-GCM' by Emilia Kaesper and
 * Peter Schwabe (https://eprint.iacr.org/2009/129.pdf)
 *
 * This implementation is based primarily on the OpenSSL implementation
 * for 32-bit ARM written by Andy Polyakov <appro@openssl.org>
 */
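
/*
 * A note on the representation (a summary of the paper's construction):
 * eight 128-bit NEON registers together hold eight AES blocks in
 * "bit-sliced" form, i.e. register n holds bit n of every byte of every
 * block. In this form the S-box can be evaluated as a fixed sequence of
 * EOR/AND/ORR instructions on whole registers, processing all eight
 * blocks in parallel without any data-dependent table lookups, which is
 * what makes the implementation constant-time.
 */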

#include <linux/linkage.h>
#include <asm/assembler.h>

        .text

        rounds          .req    x11
        bskey           .req    x12

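        /*
         * in_bs_ch/out_bs_ch and their inv_ counterparts are the linear
         * input/output basis changes that bracket the GF(2^8) inversion
         * in the S-box (and inverse S-box) circuits from the paper;
         * they consist of XORs only.
         */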
        .macro  in_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
        eor     \b2, \b2, \b1
        eor     \b5, \b5, \b6
        eor     \b3, \b3, \b0
        eor     \b6, \b6, \b2
        eor     \b5, \b5, \b0
        eor     \b6, \b6, \b3
        eor     \b3, \b3, \b7
        eor     \b7, \b7, \b5
        eor     \b3, \b3, \b4
        eor     \b4, \b4, \b5
        eor     \b2, \b2, \b7
        eor     \b3, \b3, \b1
        eor     \b1, \b1, \b5
        .endm

        .macro  out_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
        eor     \b0, \b0, \b6
        eor     \b1, \b1, \b4
        eor     \b4, \b4, \b6
        eor     \b2, \b2, \b0
        eor     \b6, \b6, \b1
        eor     \b1, \b1, \b5
        eor     \b5, \b5, \b3
        eor     \b3, \b3, \b7
        eor     \b7, \b7, \b5
        eor     \b2, \b2, \b5
        eor     \b4, \b4, \b7
        .endm

        .macro  inv_in_bs_ch, b6, b1, b2, b4, b7, b0, b3, b5
        eor     \b1, \b1, \b7
        eor     \b4, \b4, \b7
        eor     \b7, \b7, \b5
        eor     \b1, \b1, \b3
        eor     \b2, \b2, \b5
        eor     \b3, \b3, \b7
        eor     \b6, \b6, \b1
        eor     \b2, \b2, \b0
        eor     \b5, \b5, \b3
        eor     \b4, \b4, \b6
        eor     \b0, \b0, \b6
        eor     \b1, \b1, \b4
        .endm

        .macro  inv_out_bs_ch, b6, b5, b0, b3, b7, b1, b4, b2
        eor     \b1, \b1, \b5
        eor     \b2, \b2, \b7
        eor     \b3, \b3, \b1
        eor     \b4, \b4, \b5
        eor     \b7, \b7, \b5
        eor     \b3, \b3, \b4
        eor     \b5, \b5, \b0
        eor     \b3, \b3, \b7
        eor     \b6, \b6, \b2
        eor     \b2, \b2, \b1
        eor     \b6, \b6, \b3
        eor     \b3, \b3, \b0
        eor     \b5, \b5, \b6
        .endm

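        /*
         * Multiplication and inversion in the paper's tower-field
         * construction: mul_gf4 multiplies two bit-sliced GF(2^2)
         * elements (\x1:\x0 by \y1:\y0), mul_gf4_n_gf4 interleaves two
         * such multiplications, mul_gf16_2 builds two GF(2^4)
         * multiplications out of them, and inv_gf256 composes the lot
         * into an inversion in GF(2^8) using only EOR/AND/ORR (plus BSL
         * and NOT), keeping the S-box free of lookups.
         */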
        .macro  mul_gf4, x0, x1, y0, y1, t0, t1
        eor     \t0, \y0, \y1
        and     \t0, \t0, \x0
        eor     \x0, \x0, \x1
        and     \t1, \x1, \y0
        and     \x0, \x0, \y1
        eor     \x1, \t1, \t0
        eor     \x0, \x0, \t1
        .endm

        .macro  mul_gf4_n_gf4, x0, x1, y0, y1, t0, x2, x3, y2, y3, t1
        eor     \t0, \y0, \y1
        eor     \t1, \y2, \y3
        and     \t0, \t0, \x0
        and     \t1, \t1, \x2
        eor     \x0, \x0, \x1
        eor     \x2, \x2, \x3
        and     \x1, \x1, \y0
        and     \x3, \x3, \y2
        and     \x0, \x0, \y1
        and     \x2, \x2, \y3
        eor     \x1, \x1, \x0
        eor     \x2, \x2, \x3
        eor     \x0, \x0, \t0
        eor     \x3, \x3, \t1
        .endm

        .macro  mul_gf16_2, x0, x1, x2, x3, x4, x5, x6, x7, \
                            y0, y1, y2, y3, t0, t1, t2, t3
        eor     \t0, \x0, \x2
        eor     \t1, \x1, \x3
        mul_gf4 \x0, \x1, \y0, \y1, \t2, \t3
        eor     \y0, \y0, \y2
        eor     \y1, \y1, \y3
        mul_gf4_n_gf4 \t0, \t1, \y0, \y1, \t3, \x2, \x3, \y2, \y3, \t2
        eor     \x0, \x0, \t0
        eor     \x2, \x2, \t0
        eor     \x1, \x1, \t1
        eor     \x3, \x3, \t1
        eor     \t0, \x4, \x6
        eor     \t1, \x5, \x7
        mul_gf4_n_gf4 \t0, \t1, \y0, \y1, \t3, \x6, \x7, \y2, \y3, \t2
        eor     \y0, \y0, \y2
        eor     \y1, \y1, \y3
        mul_gf4 \x4, \x5, \y0, \y1, \t2, \t3
        eor     \x4, \x4, \t0
        eor     \x6, \x6, \t0
        eor     \x5, \x5, \t1
        eor     \x7, \x7, \t1
        .endm

        .macro  inv_gf256, x0, x1, x2, x3, x4, x5, x6, x7, \
                           t0, t1, t2, t3, s0, s1, s2, s3
        eor     \t3, \x4, \x6
        eor     \t0, \x5, \x7
        eor     \t1, \x1, \x3
        eor     \s1, \x7, \x6
        eor     \s0, \x0, \x2
        eor     \s3, \t3, \t0
        orr     \t2, \t0, \t1
        and     \s2, \t3, \s0
        orr     \t3, \t3, \s0
        eor     \s0, \s0, \t1
        and     \t0, \t0, \t1
        eor     \t1, \x3, \x2
        and     \s3, \s3, \s0
        and     \s1, \s1, \t1
        eor     \t1, \x4, \x5
        eor     \s0, \x1, \x0
        eor     \t3, \t3, \s1
        eor     \t2, \t2, \s1
        and     \s1, \t1, \s0
        orr     \t1, \t1, \s0
        eor     \t3, \t3, \s3
        eor     \t0, \t0, \s1
        eor     \t2, \t2, \s2
        eor     \t1, \t1, \s3
        eor     \t0, \t0, \s2
        and     \s0, \x7, \x3
        eor     \t1, \t1, \s2
        and     \s1, \x6, \x2
        and     \s2, \x5, \x1
        orr     \s3, \x4, \x0
        eor     \t3, \t3, \s0
        eor     \t1, \t1, \s2
        eor     \s0, \t0, \s3
        eor     \t2, \t2, \s1
        and     \s2, \t3, \t1
        eor     \s1, \t2, \s2
        eor     \s3, \s0, \s2
        bsl     \s1, \t1, \s0
        not     \t0, \s0
        bsl     \s0, \s1, \s3
        bsl     \t0, \s1, \s3
        bsl     \s3, \t3, \t2
        eor     \t3, \t3, \t2
        and     \s2, \s0, \s3
        eor     \t1, \t1, \t0
        eor     \s2, \s2, \t3
        mul_gf16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
                   \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
        .endm

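        /*
         * The full (inverse) S-box: an input basis change, inversion in
         * GF(2^8), and an output basis change. The shuffled register
         * order between the three steps implements the circuit's fixed
         * bit permutations without any extra instructions.
         */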
        .macro  sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
                      t0, t1, t2, t3, s0, s1, s2, s3
        in_bs_ch        \b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \
                        \b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b
        inv_gf256       \b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b, \
                        \b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \
                        \t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \
                        \s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b
        out_bs_ch       \b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \
                        \b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b
        .endm

        .macro  inv_sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
                          t0, t1, t2, t3, s0, s1, s2, s3
        inv_in_bs_ch    \b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \
                        \b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b
        inv_gf256       \b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b, \
                        \b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \
                        \t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \
                        \s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b
        inv_out_bs_ch   \b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \
                        \b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b
        .endm

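        /*
         * Load the next round key: each round's bit-sliced key occupies
         * eight q registers, i.e. 128 bytes. Encryption walks the key
         * schedule forwards, decryption walks it backwards.
         */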
        .macro  enc_next_rk
        ldp     q16, q17, [bskey], #128
        ldp     q18, q19, [bskey, #-96]
        ldp     q20, q21, [bskey, #-64]
        ldp     q22, q23, [bskey, #-32]
        .endm

        .macro  dec_next_rk
        ldp     q16, q17, [bskey, #-128]!
        ldp     q18, q19, [bskey, #32]
        ldp     q20, q21, [bskey, #64]
        ldp     q22, q23, [bskey, #96]
        .endm

        .macro  add_round_key, x0, x1, x2, x3, x4, x5, x6, x7
        eor     \x0\().16b, \x0\().16b, v16.16b
        eor     \x1\().16b, \x1\().16b, v17.16b
        eor     \x2\().16b, \x2\().16b, v18.16b
        eor     \x3\().16b, \x3\().16b, v19.16b
        eor     \x4\().16b, \x4\().16b, v20.16b
        eor     \x5\().16b, \x5\().16b, v21.16b
        eor     \x6\().16b, \x6\().16b, v22.16b
        eor     \x7\().16b, \x7\().16b, v23.16b
        .endm

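        /*
         * In bit-sliced form, (Inv)ShiftRows reduces to the same byte
         * permutation applied to each of the eight bit planes, so it is
         * a single tbl per register using a mask from the tables below.
         */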
        .macro  shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, mask
        tbl     \x0\().16b, {\x0\().16b}, \mask\().16b
        tbl     \x1\().16b, {\x1\().16b}, \mask\().16b
        tbl     \x2\().16b, {\x2\().16b}, \mask\().16b
        tbl     \x3\().16b, {\x3\().16b}, \mask\().16b
        tbl     \x4\().16b, {\x4\().16b}, \mask\().16b
        tbl     \x5\().16b, {\x5\().16b}, \mask\().16b
        tbl     \x6\().16b, {\x6\().16b}, \mask\().16b
        tbl     \x7\().16b, {\x7\().16b}, \mask\().16b
        .endm

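        /*
         * MixColumns on the bit-sliced state: each output plane is a
         * XOR of byte-rotated copies of the input planes (the ext
         * #12/#8 rotations below), with the shift between bit planes
         * supplying the GF(2^8) multiplications by {02}, so no lookups
         * or multiplies are needed. The \inv argument selects the final
         * recombination used when the macro is reused by inv_mix_cols.
         */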
        .macro  mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
                          t0, t1, t2, t3, t4, t5, t6, t7, inv
        ext     \t0\().16b, \x0\().16b, \x0\().16b, #12
        ext     \t1\().16b, \x1\().16b, \x1\().16b, #12
        eor     \x0\().16b, \x0\().16b, \t0\().16b
        ext     \t2\().16b, \x2\().16b, \x2\().16b, #12
        eor     \x1\().16b, \x1\().16b, \t1\().16b
        ext     \t3\().16b, \x3\().16b, \x3\().16b, #12
        eor     \x2\().16b, \x2\().16b, \t2\().16b
        ext     \t4\().16b, \x4\().16b, \x4\().16b, #12
        eor     \x3\().16b, \x3\().16b, \t3\().16b
        ext     \t5\().16b, \x5\().16b, \x5\().16b, #12
        eor     \x4\().16b, \x4\().16b, \t4\().16b
        ext     \t6\().16b, \x6\().16b, \x6\().16b, #12
        eor     \x5\().16b, \x5\().16b, \t5\().16b
        ext     \t7\().16b, \x7\().16b, \x7\().16b, #12
        eor     \x6\().16b, \x6\().16b, \t6\().16b
        eor     \t1\().16b, \t1\().16b, \x0\().16b
        eor     \x7\().16b, \x7\().16b, \t7\().16b
        ext     \x0\().16b, \x0\().16b, \x0\().16b, #8
        eor     \t2\().16b, \t2\().16b, \x1\().16b
        eor     \t0\().16b, \t0\().16b, \x7\().16b
        eor     \t1\().16b, \t1\().16b, \x7\().16b
        ext     \x1\().16b, \x1\().16b, \x1\().16b, #8
        eor     \t5\().16b, \t5\().16b, \x4\().16b
        eor     \x0\().16b, \x0\().16b, \t0\().16b
        eor     \t6\().16b, \t6\().16b, \x5\().16b
        eor     \x1\().16b, \x1\().16b, \t1\().16b
        ext     \t0\().16b, \x4\().16b, \x4\().16b, #8
        eor     \t4\().16b, \t4\().16b, \x3\().16b
        ext     \t1\().16b, \x5\().16b, \x5\().16b, #8
        eor     \t7\().16b, \t7\().16b, \x6\().16b
        ext     \x4\().16b, \x3\().16b, \x3\().16b, #8
        eor     \t3\().16b, \t3\().16b, \x2\().16b
        ext     \x5\().16b, \x7\().16b, \x7\().16b, #8
        eor     \t4\().16b, \t4\().16b, \x7\().16b
        ext     \x3\().16b, \x6\().16b, \x6\().16b, #8
        eor     \t3\().16b, \t3\().16b, \x7\().16b
        ext     \x6\().16b, \x2\().16b, \x2\().16b, #8
        eor     \x7\().16b, \t1\().16b, \t5\().16b
        .ifb    \inv
        eor     \x2\().16b, \t0\().16b, \t4\().16b
        eor     \x4\().16b, \x4\().16b, \t3\().16b
        eor     \x5\().16b, \x5\().16b, \t7\().16b
        eor     \x3\().16b, \x3\().16b, \t6\().16b
        eor     \x6\().16b, \x6\().16b, \t2\().16b
        .else
        eor     \t3\().16b, \t3\().16b, \x4\().16b
        eor     \x5\().16b, \x5\().16b, \t7\().16b
        eor     \x2\().16b, \x3\().16b, \t6\().16b
        eor     \x3\().16b, \t0\().16b, \t4\().16b
        eor     \x4\().16b, \x6\().16b, \t2\().16b
        mov     \x6\().16b, \t3\().16b
        .endif
        .endm

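        /*
         * InvMixColumns, computed as a preprocessing pass of XORs with
         * half-register rotations (ext #8) followed by the forward
         * mix_cols (invoked with \inv set): the inverse MixColumns
         * matrix can be written as the forward matrix times a simpler
         * correction, and the XOR/ext pass applies that correction.
         */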
        .macro  inv_mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
                              t0, t1, t2, t3, t4, t5, t6, t7
        ext     \t0\().16b, \x0\().16b, \x0\().16b, #8
        ext     \t6\().16b, \x6\().16b, \x6\().16b, #8
        ext     \t7\().16b, \x7\().16b, \x7\().16b, #8
        eor     \t0\().16b, \t0\().16b, \x0\().16b
        ext     \t1\().16b, \x1\().16b, \x1\().16b, #8
        eor     \t6\().16b, \t6\().16b, \x6\().16b
        ext     \t2\().16b, \x2\().16b, \x2\().16b, #8
        eor     \t7\().16b, \t7\().16b, \x7\().16b
        ext     \t3\().16b, \x3\().16b, \x3\().16b, #8
        eor     \t1\().16b, \t1\().16b, \x1\().16b
        ext     \t4\().16b, \x4\().16b, \x4\().16b, #8
        eor     \t2\().16b, \t2\().16b, \x2\().16b
        ext     \t5\().16b, \x5\().16b, \x5\().16b, #8
        eor     \t3\().16b, \t3\().16b, \x3\().16b
        eor     \t4\().16b, \t4\().16b, \x4\().16b
        eor     \t5\().16b, \t5\().16b, \x5\().16b
        eor     \x0\().16b, \x0\().16b, \t6\().16b
        eor     \x1\().16b, \x1\().16b, \t6\().16b
        eor     \x2\().16b, \x2\().16b, \t0\().16b
        eor     \x4\().16b, \x4\().16b, \t2\().16b
        eor     \x3\().16b, \x3\().16b, \t1\().16b
        eor     \x1\().16b, \x1\().16b, \t7\().16b
        eor     \x2\().16b, \x2\().16b, \t7\().16b
        eor     \x4\().16b, \x4\().16b, \t6\().16b
        eor     \x5\().16b, \x5\().16b, \t3\().16b
        eor     \x3\().16b, \x3\().16b, \t6\().16b
        eor     \x6\().16b, \x6\().16b, \t4\().16b
        eor     \x4\().16b, \x4\().16b, \t7\().16b
        eor     \x5\().16b, \x5\().16b, \t7\().16b
        eor     \x7\().16b, \x7\().16b, \t5\().16b
        mix_cols \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
                 \t0, \t1, \t2, \t3, \t4, \t5, \t6, \t7, 1
        .endm

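        /*
         * The classic SWAPMOVE bit-permutation step, applied to two
         * register pairs at once:
         *
         *     T = ((B >> n) ^ A) & mask
         *     A ^= T
         *     B ^= T << n
         *
         * i.e. swap the bits of A selected by mask with the bits of B
         * selected by (mask << n).
         */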
        .macro  swapmove_2x, a0, b0, a1, b1, n, mask, t0, t1
        ushr    \t0\().2d, \b0\().2d, #\n
        ushr    \t1\().2d, \b1\().2d, #\n
        eor     \t0\().16b, \t0\().16b, \a0\().16b
        eor     \t1\().16b, \t1\().16b, \a1\().16b
        and     \t0\().16b, \t0\().16b, \mask\().16b
        and     \t1\().16b, \t1\().16b, \mask\().16b
        eor     \a0\().16b, \a0\().16b, \t0\().16b
        shl     \t0\().2d, \t0\().2d, #\n
        eor     \a1\().16b, \a1\().16b, \t1\().16b
        shl     \t1\().2d, \t1\().2d, #\n
        eor     \b0\().16b, \b0\().16b, \t0\().16b
        eor     \b1\().16b, \b1\().16b, \t1\().16b
        .endm

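        /*
         * Convert eight vectors between byte-oriented and bit-sliced
         * form: three SWAPMOVE passes at strides 1, 2 and 4 (masks
         * 0x55, 0x33, 0x0f) transpose the 8x8 bit matrix formed by the
         * corresponding bytes of the eight registers. The same macro
         * converts back at the end of aesbs_encrypt8/aesbs_decrypt8.
         */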
        .macro  bitslice, x7, x6, x5, x4, x3, x2, x1, x0, t0, t1, t2, t3
        movi    \t0\().16b, #0x55
        movi    \t1\().16b, #0x33
        swapmove_2x \x0, \x1, \x2, \x3, 1, \t0, \t2, \t3
        swapmove_2x \x4, \x5, \x6, \x7, 1, \t0, \t2, \t3
        movi    \t0\().16b, #0x0f
        swapmove_2x \x0, \x2, \x1, \x3, 2, \t1, \t2, \t3
        swapmove_2x \x4, \x6, \x5, \x7, 2, \t1, \t2, \t3
        swapmove_2x \x0, \x4, \x1, \x5, 4, \t0, \t2, \t3
        swapmove_2x \x2, \x6, \x3, \x7, 4, \t0, \t2, \t3
        .endm

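        /*
         * tbl permutation masks: M0 is the byte reordering applied
         * ahead of bit-slicing, SR/ISR implement (Inv)ShiftRows, and
         * the fused M0SR/SRM0/M0ISR/ISRM0 variants combine the two for
         * the first and last rounds.
         */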
        .align  6
M0:     .octa   0x0004080c0105090d02060a0e03070b0f

M0SR:   .octa   0x0004080c05090d010a0e02060f03070b
SR:     .octa   0x0f0e0d0c0a09080b0504070600030201
SRM0:   .octa   0x01060b0c0207080d0304090e00050a0f

M0ISR:  .octa   0x0004080c0d0105090a0e0206070b0f03
ISR:    .octa   0x0f0e0d0c080b0a090504070602010003
ISRM0:  .octa   0x0306090c00070a0d01040b0e0205080f

        /*
         * void aesbs_convert_key(u8 out[], u32 const rk[], int rounds)
         */
SYM_FUNC_START(aesbs_convert_key)
        ld1     {v7.4s}, [x1], #16              // load round 0 key
        ld1     {v17.4s}, [x1], #16             // load round 1 key

        movi    v8.16b, #0x01                   // bit masks
        movi    v9.16b, #0x02
        movi    v10.16b, #0x04
        movi    v11.16b, #0x08
        movi    v12.16b, #0x10
        movi    v13.16b, #0x20
        movi    v14.16b, #0x40
        movi    v15.16b, #0x80
        ldr     q16, M0

        sub     x2, x2, #1
        str     q7, [x0], #16                   // save round 0 key

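        /*
         * Bit-slice one round key per iteration: tbl applies the M0
         * byte permutation, then cmtst against the 0x01..0x80 masks
         * expands bit n of each key byte into byte plane vn. Planes 0,
         * 1, 5 and 6 are inverted so that the XOR with the S-box
         * constant 0x63 (binary 01100011) is folded into the key
         * schedule.
         */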
.Lkey_loop:
        tbl     v7.16b, {v17.16b}, v16.16b
        ld1     {v17.4s}, [x1], #16             // load next round key

        cmtst   v0.16b, v7.16b, v8.16b
        cmtst   v1.16b, v7.16b, v9.16b
        cmtst   v2.16b, v7.16b, v10.16b
        cmtst   v3.16b, v7.16b, v11.16b
        cmtst   v4.16b, v7.16b, v12.16b
        cmtst   v5.16b, v7.16b, v13.16b
        cmtst   v6.16b, v7.16b, v14.16b
        cmtst   v7.16b, v7.16b, v15.16b
        not     v0.16b, v0.16b
        not     v1.16b, v1.16b
        not     v5.16b, v5.16b
        not     v6.16b, v6.16b

        subs    x2, x2, #1
        stp     q0, q1, [x0], #128
        stp     q2, q3, [x0, #-96]
        stp     q4, q5, [x0, #-64]
        stp     q6, q7, [x0, #-32]
        b.ne    .Lkey_loop

        movi    v7.16b, #0x63                   // fold S-box constant 0x63
        eor     v17.16b, v17.16b, v7.16b        // into the last round key
        str     q17, [x0]
        ret
SYM_FUNC_END(aesbs_convert_key)

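        /*
         * aesbs_encrypt8: encrypt the eight blocks in v0-v7 in place.
         * Expects bskey to point at the bit-sliced key schedule and
         * rounds to hold the round count; the round-0 XOR is fused with
         * the M0SR permutation before the state is bit-sliced.
         */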
        .align  4
SYM_FUNC_START_LOCAL(aesbs_encrypt8)
        ldr     q9, [bskey], #16                // round 0 key
        ldr     q8, M0SR
        ldr     q24, SR

        eor     v10.16b, v0.16b, v9.16b         // xor with round0 key
        eor     v11.16b, v1.16b, v9.16b
        tbl     v0.16b, {v10.16b}, v8.16b
        eor     v12.16b, v2.16b, v9.16b
        tbl     v1.16b, {v11.16b}, v8.16b
        eor     v13.16b, v3.16b, v9.16b
        tbl     v2.16b, {v12.16b}, v8.16b
        eor     v14.16b, v4.16b, v9.16b
        tbl     v3.16b, {v13.16b}, v8.16b
        eor     v15.16b, v5.16b, v9.16b
        tbl     v4.16b, {v14.16b}, v8.16b
        eor     v10.16b, v6.16b, v9.16b
        tbl     v5.16b, {v15.16b}, v8.16b
        eor     v11.16b, v7.16b, v9.16b
        tbl     v6.16b, {v10.16b}, v8.16b
        tbl     v7.16b, {v11.16b}, v8.16b

        bitslice v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11

        sub     rounds, rounds, #1
        b       .Lenc_sbox

.Lenc_loop:
        shift_rows v0, v1, v2, v3, v4, v5, v6, v7, v24
.Lenc_sbox:
        sbox    v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \
                v13, v14, v15
        subs    rounds, rounds, #1
        b.cc    .Lenc_done

        enc_next_rk

        mix_cols v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11, v12, \
                 v13, v14, v15

        add_round_key v0, v1, v2, v3, v4, v5, v6, v7

        b.ne    .Lenc_loop
        ldr     q24, SRM0
        b       .Lenc_loop

.Lenc_done:
        ldr     q12, [bskey]                    // last round key

        bitslice v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11

        eor     v0.16b, v0.16b, v12.16b
        eor     v1.16b, v1.16b, v12.16b
        eor     v4.16b, v4.16b, v12.16b
        eor     v6.16b, v6.16b, v12.16b
        eor     v3.16b, v3.16b, v12.16b
        eor     v7.16b, v7.16b, v12.16b
        eor     v2.16b, v2.16b, v12.16b
        eor     v5.16b, v5.16b, v12.16b
        ret
SYM_FUNC_END(aesbs_encrypt8)

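        /*
         * aesbs_decrypt8: decrypt the eight blocks in v0-v7 in place.
         * The schedule is walked backwards: rounds * 128 - 112 is the
         * offset of the final 16-byte round key produced by
         * aesbs_convert_key, which decryption consumes first.
         */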
        .align  4
SYM_FUNC_START_LOCAL(aesbs_decrypt8)
        lsl     x9, rounds, #7
        add     bskey, bskey, x9

        ldr     q9, [bskey, #-112]!             // round 0 key
        ldr     q8, M0ISR
        ldr     q24, ISR

        eor     v10.16b, v0.16b, v9.16b         // xor with round0 key
        eor     v11.16b, v1.16b, v9.16b
        tbl     v0.16b, {v10.16b}, v8.16b
        eor     v12.16b, v2.16b, v9.16b
        tbl     v1.16b, {v11.16b}, v8.16b
        eor     v13.16b, v3.16b, v9.16b
        tbl     v2.16b, {v12.16b}, v8.16b
        eor     v14.16b, v4.16b, v9.16b
        tbl     v3.16b, {v13.16b}, v8.16b
        eor     v15.16b, v5.16b, v9.16b
        tbl     v4.16b, {v14.16b}, v8.16b
        eor     v10.16b, v6.16b, v9.16b
        tbl     v5.16b, {v15.16b}, v8.16b
        eor     v11.16b, v7.16b, v9.16b
        tbl     v6.16b, {v10.16b}, v8.16b
        tbl     v7.16b, {v11.16b}, v8.16b

        bitslice v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11

        sub     rounds, rounds, #1
        b       .Ldec_sbox

.Ldec_loop:
        shift_rows v0, v1, v2, v3, v4, v5, v6, v7, v24
.Ldec_sbox:
        inv_sbox v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \
                 v13, v14, v15
        subs    rounds, rounds, #1
        b.cc    .Ldec_done

        dec_next_rk

        add_round_key v0, v1, v6, v4, v2, v7, v3, v5

        inv_mix_cols v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11, v12, \
                     v13, v14, v15

        b.ne    .Ldec_loop
        ldr     q24, ISRM0
        b       .Ldec_loop
.Ldec_done:
        ldr     q12, [bskey, #-16]              // last round key

        bitslice v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11

        eor     v0.16b, v0.16b, v12.16b
        eor     v1.16b, v1.16b, v12.16b
        eor     v6.16b, v6.16b, v12.16b
        eor     v4.16b, v4.16b, v12.16b
        eor     v2.16b, v2.16b, v12.16b
        eor     v7.16b, v7.16b, v12.16b
        eor     v3.16b, v3.16b, v12.16b
        eor     v5.16b, v5.16b, v12.16b
        ret
SYM_FUNC_END(aesbs_decrypt8)

        /*
         * aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
         *                   int blocks)
         * aesbs_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
         *                   int blocks)
         */
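        /*
         * The x5 mask handles a final partial batch: while 8 or more
         * blocks remain, x5 is zeroed (the 'mi' csel fails) and all
         * eight loads/stores execute; once fewer than 8 remain,
         * x5 = 1 << blocks, and each tbnz skips out after exactly
         * 'blocks' loads/stores.
         */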
        .macro  __ecb_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
        frame_push 5

        mov     x19, x0
        mov     x20, x1
        mov     x21, x2
        mov     x22, x3
        mov     x23, x4

99:     mov     x5, #1
        lsl     x5, x5, x23
        subs    w23, w23, #8
        csel    x23, x23, xzr, pl
        csel    x5, x5, xzr, mi

        ld1     {v0.16b}, [x20], #16
        tbnz    x5, #1, 0f
        ld1     {v1.16b}, [x20], #16
        tbnz    x5, #2, 0f
        ld1     {v2.16b}, [x20], #16
        tbnz    x5, #3, 0f
        ld1     {v3.16b}, [x20], #16
        tbnz    x5, #4, 0f
        ld1     {v4.16b}, [x20], #16
        tbnz    x5, #5, 0f
        ld1     {v5.16b}, [x20], #16
        tbnz    x5, #6, 0f
        ld1     {v6.16b}, [x20], #16
        tbnz    x5, #7, 0f
        ld1     {v7.16b}, [x20], #16

0:      mov     bskey, x21
        mov     rounds, x22
        bl      \do8

        st1     {\o0\().16b}, [x19], #16
        tbnz    x5, #1, 1f
        st1     {\o1\().16b}, [x19], #16
        tbnz    x5, #2, 1f
        st1     {\o2\().16b}, [x19], #16
        tbnz    x5, #3, 1f
        st1     {\o3\().16b}, [x19], #16
        tbnz    x5, #4, 1f
        st1     {\o4\().16b}, [x19], #16
        tbnz    x5, #5, 1f
        st1     {\o5\().16b}, [x19], #16
        tbnz    x5, #6, 1f
        st1     {\o6\().16b}, [x19], #16
        tbnz    x5, #7, 1f
        st1     {\o7\().16b}, [x19], #16

        cbz     x23, 1f
        b       99b

1:      frame_pop
        ret
        .endm

        .align  4
SYM_FUNC_START(aesbs_ecb_encrypt)
        __ecb_crypt aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
SYM_FUNC_END(aesbs_ecb_encrypt)

        .align  4
SYM_FUNC_START(aesbs_ecb_decrypt)
        __ecb_crypt aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
SYM_FUNC_END(aesbs_ecb_decrypt)

        /*
         * aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
         *                   int blocks, u8 iv[])
         */
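        /*
         * CBC decryption keeps an unmodified copy of each ciphertext
         * block in v25-v31 while v0-v7 are decrypted, then XORs every
         * plaintext with the previous ciphertext (or the IV, carried in
         * v24) before it is stored.
         */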
        .align  4
SYM_FUNC_START(aesbs_cbc_decrypt)
        frame_push 6

        mov     x19, x0
        mov     x20, x1
        mov     x21, x2
        mov     x22, x3
        mov     x23, x4
        mov     x24, x5

99:     mov     x6, #1
        lsl     x6, x6, x23
        subs    w23, w23, #8
        csel    x23, x23, xzr, pl
        csel    x6, x6, xzr, mi

        ld1     {v0.16b}, [x20], #16
        mov     v25.16b, v0.16b
        tbnz    x6, #1, 0f
        ld1     {v1.16b}, [x20], #16
        mov     v26.16b, v1.16b
        tbnz    x6, #2, 0f
        ld1     {v2.16b}, [x20], #16
        mov     v27.16b, v2.16b
        tbnz    x6, #3, 0f
        ld1     {v3.16b}, [x20], #16
        mov     v28.16b, v3.16b
        tbnz    x6, #4, 0f
        ld1     {v4.16b}, [x20], #16
        mov     v29.16b, v4.16b
        tbnz    x6, #5, 0f
        ld1     {v5.16b}, [x20], #16
        mov     v30.16b, v5.16b
        tbnz    x6, #6, 0f
        ld1     {v6.16b}, [x20], #16
        mov     v31.16b, v6.16b
        tbnz    x6, #7, 0f
        ld1     {v7.16b}, [x20]

0:      mov     bskey, x21
        mov     rounds, x22
        bl      aesbs_decrypt8

        ld1     {v24.16b}, [x24]                // load IV

        eor     v1.16b, v1.16b, v25.16b
        eor     v6.16b, v6.16b, v26.16b
        eor     v4.16b, v4.16b, v27.16b
        eor     v2.16b, v2.16b, v28.16b
        eor     v7.16b, v7.16b, v29.16b
        eor     v0.16b, v0.16b, v24.16b
        eor     v3.16b, v3.16b, v30.16b
        eor     v5.16b, v5.16b, v31.16b

        st1     {v0.16b}, [x19], #16
        mov     v24.16b, v25.16b
        tbnz    x6, #1, 1f
        st1     {v1.16b}, [x19], #16
        mov     v24.16b, v26.16b
        tbnz    x6, #2, 1f
        st1     {v6.16b}, [x19], #16
        mov     v24.16b, v27.16b
        tbnz    x6, #3, 1f
        st1     {v4.16b}, [x19], #16
        mov     v24.16b, v28.16b
        tbnz    x6, #4, 1f
        st1     {v2.16b}, [x19], #16
        mov     v24.16b, v29.16b
        tbnz    x6, #5, 1f
        st1     {v7.16b}, [x19], #16
        mov     v24.16b, v30.16b
        tbnz    x6, #6, 1f
        st1     {v3.16b}, [x19], #16
        mov     v24.16b, v31.16b
        tbnz    x6, #7, 1f
        ld1     {v24.16b}, [x20], #16
        st1     {v5.16b}, [x19], #16
1:      st1     {v24.16b}, [x24]                // store IV

        cbz     x23, 2f
        b       99b

2:      frame_pop
        ret
SYM_FUNC_END(aesbs_cbc_decrypt)

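        /*
         * Double the XTS tweak in GF(2^128): 'add' shifts each 64-bit
         * lane left by one, sshr #63 broadcasts each lane's top bit,
         * and the masked, lane-swapped (ext #8) result both carries the
         * low lane's top bit into bit 64 and folds the overall top bit
         * back in as the reduction constant 0x87
         * (x^128 = x^7 + x^2 + x + 1).
         */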
        .macro  next_tweak, out, in, const, tmp
        sshr    \tmp\().2d, \in\().2d, #63
        and     \tmp\().16b, \tmp\().16b, \const\().16b
        add     \out\().2d, \in\().2d, \in\().2d
        ext     \tmp\().16b, \tmp\().16b, \tmp\().16b, #8
        eor     \out\().16b, \out\().16b, \tmp\().16b
        .endm

        /*
         * aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
         *                   int blocks, u8 iv[])
         * aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
         *                   int blocks, u8 iv[])
         */
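        /*
         * __xts_crypt8: with the current tweak in v25, derive the seven
         * following tweaks, XOR the eight input blocks with tweaks 1-8,
         * stash tweaks 8 and 9 (q16/q17) in the caller's stack frame
         * for use after the cipher, and tail-call the do8 routine whose
         * address the caller placed in x16.
         */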
SYM_FUNC_START_LOCAL(__xts_crypt8)
        movi    v18.2s, #0x1
        movi    v19.2s, #0x87
        uzp1    v18.4s, v18.4s, v19.4s

        ld1     {v0.16b-v3.16b}, [x1], #64
        ld1     {v4.16b-v7.16b}, [x1], #64

        next_tweak v26, v25, v18, v19
        next_tweak v27, v26, v18, v19
        next_tweak v28, v27, v18, v19
        next_tweak v29, v28, v18, v19
        next_tweak v30, v29, v18, v19
        next_tweak v31, v30, v18, v19
        next_tweak v16, v31, v18, v19
        next_tweak v17, v16, v18, v19

        eor     v0.16b, v0.16b, v25.16b
        eor     v1.16b, v1.16b, v26.16b
        eor     v2.16b, v2.16b, v27.16b
        eor     v3.16b, v3.16b, v28.16b
        eor     v4.16b, v4.16b, v29.16b
        eor     v5.16b, v5.16b, v30.16b
        eor     v6.16b, v6.16b, v31.16b
        eor     v7.16b, v7.16b, v16.16b

        stp     q16, q17, [sp, #16]

        mov     bskey, x2
        mov     rounds, x3
        br      x16
SYM_FUNC_END(__xts_crypt8)

        .macro  __xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
        stp     x29, x30, [sp, #-48]!
        mov     x29, sp

        ld1     {v25.16b}, [x5]

0:      adr     x16, \do8
        bl      __xts_crypt8

        eor     v16.16b, \o0\().16b, v25.16b
        eor     v17.16b, \o1\().16b, v26.16b
        eor     v18.16b, \o2\().16b, v27.16b
        eor     v19.16b, \o3\().16b, v28.16b

        ldp     q24, q25, [sp, #16]

        eor     v20.16b, \o4\().16b, v29.16b
        eor     v21.16b, \o5\().16b, v30.16b
        eor     v22.16b, \o6\().16b, v31.16b
        eor     v23.16b, \o7\().16b, v24.16b

        st1     {v16.16b-v19.16b}, [x0], #64
        st1     {v20.16b-v23.16b}, [x0], #64

        subs    x4, x4, #8
        b.gt    0b

        st1     {v25.16b}, [x5]
        ldp     x29, x30, [sp], #48
        ret
        .endm

SYM_FUNC_START(aesbs_xts_encrypt)
        __xts_crypt aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
SYM_FUNC_END(aesbs_xts_encrypt)

SYM_FUNC_START(aesbs_xts_decrypt)
        __xts_crypt aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
SYM_FUNC_END(aesbs_xts_decrypt)

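        /*
         * Materialise the next big-endian counter block in \v: x7:x8
         * hold the 128-bit counter in native byte order (x7 high, x8
         * low), rev64 restores big-endian order within each half, and
         * adds/adc propagate the increment's carry into x7.
         */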
        .macro  next_ctr, v
        mov     \v\().d[1], x8
        adds    x8, x8, #1
        mov     \v\().d[0], x7
        adc     x7, x7, xzr
        rev64   \v\().16b, \v\().16b
        .endm

        /*
         * aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
         *                   int rounds, int blocks, u8 iv[])
         */
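        /*
         * The current counter block is kept in v0 (loaded once from the
         * IV), and v1-v7 are filled with its seven successors before
         * each aesbs_encrypt8 call; the keystream is then XORed into
         * the input in the register order the cipher leaves its outputs
         * (v0, v1, v4, v6, v3, v7, v2, v5).
         */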
SYM_FUNC_START(aesbs_ctr_encrypt)
        stp     x29, x30, [sp, #-16]!
        mov     x29, sp

        ldp     x7, x8, [x5]
        ld1     {v0.16b}, [x5]
CPU_LE( rev     x7, x7 )
CPU_LE( rev     x8, x8 )
        adds    x8, x8, #1
        adc     x7, x7, xzr

0:      next_ctr v1
        next_ctr v2
        next_ctr v3
        next_ctr v4
        next_ctr v5
        next_ctr v6
        next_ctr v7

        mov     bskey, x2
        mov     rounds, x3
        bl      aesbs_encrypt8

        ld1     {v8.16b-v11.16b}, [x1], #64
        ld1     {v12.16b-v15.16b}, [x1], #64

        eor     v8.16b, v0.16b, v8.16b
        eor     v9.16b, v1.16b, v9.16b
        eor     v10.16b, v4.16b, v10.16b
        eor     v11.16b, v6.16b, v11.16b
        eor     v12.16b, v3.16b, v12.16b
        eor     v13.16b, v7.16b, v13.16b
        eor     v14.16b, v2.16b, v14.16b
        eor     v15.16b, v5.16b, v15.16b

        st1     {v8.16b-v11.16b}, [x0], #64
        st1     {v12.16b-v15.16b}, [x0], #64

        next_ctr v0
        subs    x4, x4, #8
        b.gt    0b

        st1     {v0.16b}, [x5]
        ldp     x29, x30, [sp], #16
        ret
SYM_FUNC_END(aesbs_ctr_encrypt)