/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Core SHA-224/SHA-256 transform using v8 Crypto Extensions
 *
 * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	.text
	.arch		armv8-a+crypto

	dga		.req	q20
	dgav		.req	v20
	dgb		.req	q21
	dgbv		.req	v21

	t0		.req	v22
	t1		.req	v23

	dg0q		.req	q24
	dg0v		.req	v24
	dg1q		.req	q25
	dg1v		.req	v25
	dg2q		.req	q26
	dg2v		.req	v26

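	/*
	 * Do one quartet of SHA-256 rounds for the current block.  The
	 * "message schedule word + round constant" sums are double-buffered
	 * in t0/t1, selected by \ev (even/odd round group): while one is
	 * consumed by sha256h/sha256h2, the sum for the next quartet is
	 * computed into the other.  \s0 is left blank once no schedule
	 * words remain to be added.
	 */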
	.macro		add_only, ev, rc, s0
	mov		dg2v.16b, dg0v.16b
	.ifeq		\ev
	add		t1.4s, v\s0\().4s, \rc\().4s
	sha256h		dg0q, dg1q, t0.4s
	sha256h2	dg1q, dg2q, t0.4s
	.else
	.ifnb		\s0
	add		t0.4s, v\s0\().4s, \rc\().4s
	.endif
	sha256h		dg0q, dg1q, t1.4s
	sha256h2	dg1q, dg2q, t1.4s
	.endif
	.endm

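	/*
	 * As add_only, but also advance the message schedule: sha256su0 and
	 * sha256su1 derive the next four schedule words into \s0 from the
	 * sixteen currently held in \s0-\s3.
	 */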
	.macro		add_update, ev, rc, s0, s1, s2, s3
	sha256su0	v\s0\().4s, v\s1\().4s
	add_only	\ev, \rc, \s1
	sha256su1	v\s0\().4s, v\s2\().4s, v\s3\().4s
	.endm

	/*
	 * The SHA-256 round constants: the first 32 bits of the fractional
	 * parts of the cube roots of the first 64 primes.
	 */
	.section	".rodata", "a"
	.align		4
.Lsha2_rcon:
	.word		0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
	.word		0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
	.word		0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
	.word		0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
	.word		0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
	.word		0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
	.word		0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
	.word		0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
	.word		0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
	.word		0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
	.word		0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
	.word		0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
	.word		0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
	.word		0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
	.word		0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
	.word		0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2

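	/*
	 * Cache all 64 round constants in v0-v15 (sixteen vectors of four
	 * constants each) so the block-processing loops never reload them.
	 */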
	.macro		load_round_constants	tmp
	adr_l		\tmp, .Lsha2_rcon
	ld1		{ v0.4s- v3.4s}, [\tmp], #64
	ld1		{ v4.4s- v7.4s}, [\tmp], #64
	ld1		{ v8.4s-v11.4s}, [\tmp], #64
	ld1		{v12.4s-v15.4s}, [\tmp]
	.endm

	/*
	 * size_t __sha256_ce_transform(struct sha256_block_state *state,
	 *				const u8 *data, size_t nblocks);
	 */
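	/*
	 * The function may return before consuming all blocks when
	 * cond_yield fires; the return value is the number of blocks still
	 * unprocessed.  A minimal sketch of the C glue that would drive it
	 * (illustrative, not part of this file; kernel_neon_begin/end are
	 * the usual arm64 kernel-mode NEON bracket):
	 *
	 *	do {
	 *		size_t rem;
	 *
	 *		kernel_neon_begin();
	 *		rem = __sha256_ce_transform(state, data, nblocks);
	 *		kernel_neon_end();
	 *		data += (nblocks - rem) * SHA256_BLOCK_SIZE;
	 *		nblocks = rem;
	 *	} while (nblocks);
	 */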
	.text
SYM_FUNC_START(__sha256_ce_transform)

	load_round_constants	x8

	/* load state */
	ld1		{dgav.4s, dgbv.4s}, [x0]

	/* load input */
0:	ld1		{v16.4s-v19.4s}, [x1], #64
	sub		x2, x2, #1

CPU_LE(	rev32		v16.16b, v16.16b	)
CPU_LE(	rev32		v17.16b, v17.16b	)
CPU_LE(	rev32		v18.16b, v18.16b	)
CPU_LE(	rev32		v19.16b, v19.16b	)

	add		t0.4s, v16.4s, v0.4s
	mov		dg0v.16b, dgav.16b
	mov		dg1v.16b, dgbv.16b

	add_update	0, v1, 16, 17, 18, 19
	add_update	1, v2, 17, 18, 19, 16
	add_update	0, v3, 18, 19, 16, 17
	add_update	1, v4, 19, 16, 17, 18

	add_update	0, v5, 16, 17, 18, 19
	add_update	1, v6, 17, 18, 19, 16
	add_update	0, v7, 18, 19, 16, 17
	add_update	1, v8, 19, 16, 17, 18

	add_update	0, v9, 16, 17, 18, 19
	add_update	1, v10, 17, 18, 19, 16
	add_update	0, v11, 18, 19, 16, 17
	add_update	1, v12, 19, 16, 17, 18

	add_only	0, v13, 17
	add_only	1, v14, 18
	add_only	0, v15, 19
	add_only	1

	/* update state */
	add		dgav.4s, dgav.4s, dg0v.4s
	add		dgbv.4s, dgbv.4s, dg1v.4s

	/* return early if voluntary preemption is needed */
	cond_yield	1f, x5, x6
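	/*
	 * cond_yield branches to 1f (x5/x6 are scratch) when a reschedule
	 * is pending; x2 still holds the number of unprocessed blocks,
	 * which is returned below so the caller can re-invoke us.
	 */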

	/* handled all input blocks? */
	cbnz		x2, 0b

	/* store new state */
1:	st1		{dgav.4s, dgbv.4s}, [x0]
	mov		x0, x2
	ret
SYM_FUNC_END(__sha256_ce_transform)

	.unreq		dga
	.unreq		dgav
	.unreq		dgb
	.unreq		dgbv
	.unreq		t0
	.unreq		t1
	.unreq		dg0q
	.unreq		dg0v
	.unreq		dg1q
	.unreq		dg1v
	.unreq		dg2q
	.unreq		dg2v

	// parameters for sha256_ce_finup2x()
	ctx		.req	x0
	data1		.req	x1
	data2		.req	x2
	len		.req	w3
	out1		.req	x4
	out2		.req	x5

	// other scalar variables
	count		.req	x6
	final_step	.req	w7

	// x8-x9 are used as temporaries.

	// v0-v15 are used to cache the SHA-256 round constants.
	// v16-v19 are used for the message schedule for the first message.
	// v20-v23 are used for the message schedule for the second message.
	// v24-v31 are used for the state and temporaries as given below.
	// *_a are for the first message and *_b for the second.
	state0_a_q	.req	q24
	state0_a	.req	v24
	state1_a_q	.req	q25
	state1_a	.req	v25
	state0_b_q	.req	q26
	state0_b	.req	v26
	state1_b_q	.req	q27
	state1_b	.req	v27
	t0_a		.req	v28
	t0_b		.req	v29
	t1_a_q		.req	q30
	t1_a		.req	v30
	t1_b_q		.req	q31
	t1_b		.req	v31

#define OFFSETOF_BYTECOUNT	32 // offsetof(struct __sha256_ctx, bytecount)
#define OFFSETOF_BUF		40 // offsetof(struct __sha256_ctx, buf)
// offsetof(struct __sha256_ctx, state) is assumed to be 0.
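//
// Under those assumptions, the C-side context is laid out roughly as follows
// (a sketch for orientation only, not a definition taken from this file):
//
//	struct __sha256_ctx {
//		u32 state[8];	/* offset 0: the 32-byte SHA-256 state */
//		u64 bytecount;	/* offset 32 */
//		u8 buf[64];	/* offset 40: one SHA256_BLOCK_SIZE buffer */
//	};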

	// Do 4 rounds of SHA-256 for each of two messages (interleaved).  m0_a
	// and m0_b contain the current 4 message schedule words for the first
	// and second message respectively.
	//
	// If not all the message schedule words have been computed yet, then
	// this also computes 4 more message schedule words for each message.
	// m1_a-m3_a contain the next 3 groups of 4 message schedule words for
	// the first message, and likewise m1_b-m3_b for the second.  After
	// consuming the current value of m0_a, this macro computes the group
	// after m3_a and writes it to m0_a, and likewise for *_b.  This means
	// that the next (m0_a, m1_a, m2_a, m3_a) is the current (m1_a, m2_a,
	// m3_a, m0_a), and likewise for *_b, so the caller must cycle through
	// the registers accordingly.
	.macro	do_4rounds_2x	i, k,  m0_a, m1_a, m2_a, m3_a,  \
				       m0_b, m1_b, m2_b, m3_b
	add		t0_a\().4s, \m0_a\().4s, \k\().4s
	add		t0_b\().4s, \m0_b\().4s, \k\().4s
	.if \i < 48
	sha256su0	\m0_a\().4s, \m1_a\().4s
	sha256su0	\m0_b\().4s, \m1_b\().4s
	sha256su1	\m0_a\().4s, \m2_a\().4s, \m3_a\().4s
	sha256su1	\m0_b\().4s, \m2_b\().4s, \m3_b\().4s
	.endif
	mov		t1_a.16b, state0_a.16b
	mov		t1_b.16b, state0_b.16b
	sha256h		state0_a_q, state1_a_q, t0_a\().4s
	sha256h		state0_b_q, state1_b_q, t0_b\().4s
	sha256h2	state1_a_q, t1_a_q, t0_a\().4s
	sha256h2	state1_b_q, t1_b_q, t0_b\().4s
	.endm

	.macro	do_16rounds_2x	i, k0, k1, k2, k3
	do_4rounds_2x	\i + 0,  \k0,  v16, v17, v18, v19,  v20, v21, v22, v23
	do_4rounds_2x	\i + 4,  \k1,  v17, v18, v19, v16,  v21, v22, v23, v20
	do_4rounds_2x	\i + 8,  \k2,  v18, v19, v16, v17,  v22, v23, v20, v21
	do_4rounds_2x	\i + 12, \k3,  v19, v16, v17, v18,  v23, v20, v21, v22
	.endm
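
	// Four invocations of do_16rounds_2x, with \k0-\k3 stepping through
	// v0-v15, cover all 64 rounds of both messages; the schedule
	// registers rotate exactly as described above.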

//
// void sha256_ce_finup2x(const struct __sha256_ctx *ctx,
//			  const u8 *data1, const u8 *data2, int len,
//			  u8 out1[SHA256_DIGEST_SIZE],
//			  u8 out2[SHA256_DIGEST_SIZE]);
//
// This function computes the SHA-256 digests of two messages |data1| and
// |data2| that are both |len| bytes long, starting from the initial context
// |ctx|.  |len| must be at least SHA256_BLOCK_SIZE.
//
// The instructions for the two SHA-256 operations are interleaved.  On many
// CPUs, this is almost twice as fast as hashing each message individually due
// to taking better advantage of the CPU's SHA-256 and SIMD throughput.
//
SYM_FUNC_START(sha256_ce_finup2x)
	sub		sp, sp, #128
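	// Stack layout: sp[0..63] stages loaded data and the saved state;
	// sp[64..127] holds the 0x80/zero padding bytes that the final
	// partial block is read through.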
	mov		final_step, #0
	load_round_constants	x8

	// Load the initial state from ctx->state.
	ld1		{state0_a.4s-state1_a.4s}, [ctx]

	// Load ctx->bytecount and reduce it mod 64 to get the number of
	// bytes currently buffered in ctx->buf.  Also save the total
	// bytecount, with len added to it, in "count".
	ldr		x8, [ctx, #OFFSETOF_BYTECOUNT]
	add		count, x8, len, sxtw
	and		x8, x8, #63
	cbz		x8, .Lfinup2x_enter_loop	// No bytes buffered?

	// x8 bytes (1 to 63) are currently buffered in ctx->buf.  Load them
	// followed by the first 64 - x8 bytes of data.  Since len >= 64, we
	// just load 64 bytes from each of ctx->buf, data1, and data2
	// unconditionally and rearrange the data as needed.
	add		x9, ctx, #OFFSETOF_BUF
	ld1		{v16.16b-v19.16b}, [x9]
	st1		{v16.16b-v19.16b}, [sp]

	ld1		{v16.16b-v19.16b}, [data1], #64
	add		x9, sp, x8
	st1		{v16.16b-v19.16b}, [x9]
	ld1		{v16.4s-v19.4s}, [sp]

	ld1		{v20.16b-v23.16b}, [data2], #64
	st1		{v20.16b-v23.16b}, [x9]
	ld1		{v20.4s-v23.4s}, [sp]

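	// data1 and data2 were each advanced by 64, but only 64 - x8 bytes
	// of new data were consumed (the first x8 came from ctx->buf), so
	// rewind both pointers by x8 and fold the difference into len.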
	sub		len, len, #64
	sub		data1, data1, x8
	sub		data2, data2, x8
	add		len, len, w8
	mov		state0_b.16b, state0_a.16b
	mov		state1_b.16b, state1_a.16b
	b		.Lfinup2x_loop_have_data

.Lfinup2x_enter_loop:
	sub		len, len, #64
	mov		state0_b.16b, state0_a.16b
	mov		state1_b.16b, state1_a.16b
.Lfinup2x_loop:
	// Load the next two data blocks.
	ld1		{v16.4s-v19.4s}, [data1], #64
	ld1		{v20.4s-v23.4s}, [data2], #64
.Lfinup2x_loop_have_data:
	// Convert the words of the data blocks from big endian.
CPU_LE(	rev32		v16.16b, v16.16b	)
CPU_LE(	rev32		v17.16b, v17.16b	)
CPU_LE(	rev32		v18.16b, v18.16b	)
CPU_LE(	rev32		v19.16b, v19.16b	)
CPU_LE(	rev32		v20.16b, v20.16b	)
CPU_LE(	rev32		v21.16b, v21.16b	)
CPU_LE(	rev32		v22.16b, v22.16b	)
CPU_LE(	rev32		v23.16b, v23.16b	)
.Lfinup2x_loop_have_bswapped_data:

	// Save the original state for each block.
	st1		{state0_a.4s-state1_b.4s}, [sp]

	// Do the SHA-256 rounds on each block.
	do_16rounds_2x	0,  v0, v1, v2, v3
	do_16rounds_2x	16, v4, v5, v6, v7
	do_16rounds_2x	32, v8, v9, v10, v11
	do_16rounds_2x	48, v12, v13, v14, v15

	// Add the original state for each block.
	ld1		{v16.4s-v19.4s}, [sp]
	add		state0_a.4s, state0_a.4s, v16.4s
	add		state1_a.4s, state1_a.4s, v17.4s
	add		state0_b.4s, state0_b.4s, v18.4s
	add		state1_b.4s, state1_b.4s, v19.4s

	// Update len and loop back if more blocks remain.
	sub		len, len, #64
	tbz		len, #31, .Lfinup2x_loop	// len >= 0?

	// Check if any final blocks need to be handled.
	// final_step = 2: all done
	// final_step = 1: need to do count-only padding block
	// final_step = 0: need to do the block with 0x80 padding byte
	tbnz		final_step, #1, .Lfinup2x_done
	tbnz		final_step, #0, .Lfinup2x_finalize_countonly
	add		len, len, #64
	cbz		len, .Lfinup2x_finalize_blockaligned

	// Not block-aligned; 1 <= len <= 63 data bytes remain.  Pad the
	// block.  To do this, write the padding starting with the 0x80 byte
	// to &sp[64].  Then for each message, copy the last 64 data bytes to
	// sp and load from &sp[64 - len] to get the needed padding block.
	// This code relies on the data buffers being >= 64 bytes in length.
	sub		w8, len, #64		// w8 = len - 64
	add		data1, data1, w8, sxtw	// data1 += len - 64
	add		data2, data2, w8, sxtw	// data2 += len - 64
CPU_LE(	mov		x9, #0x80		)
CPU_LE(	fmov		d16, x9			)
CPU_BE(	movi		v16.16b, #0		)
CPU_BE(	mov		x9, #0x8000000000000000	)
CPU_BE(	mov		v16.d[1], x9		)
	movi		v17.16b, #0
	stp		q16, q17, [sp, #64]
	stp		q17, q17, [sp, #96]
	sub		x9, sp, w8, sxtw	// x9 = &sp[64 - len]
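	// The 8-byte count fits in this block only if len + 1 (for the 0x80
	// byte) + 8 <= 64, i.e. len <= 55.  For len >= 56 the count must go
	// into an extra count-only block.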
	cmp		len, #56
	b.ge		1f		// will count spill into its own block?
	lsl		count, count, #3
CPU_LE(	rev		count, count	)
	str		count, [x9, #56]
	mov		final_step, #2	// won't need count-only block
	b		2f
1:
	mov		final_step, #1	// will need count-only block
2:
	ld1		{v16.16b-v19.16b}, [data1]
	st1		{v16.16b-v19.16b}, [sp]
	ld1		{v16.4s-v19.4s}, [x9]
	ld1		{v20.16b-v23.16b}, [data2]
	st1		{v20.16b-v23.16b}, [sp]
	ld1		{v20.4s-v23.4s}, [x9]
	b		.Lfinup2x_loop_have_data

	// Prepare a padding block, either:
	//
	//	{0x80, 0, 0, 0, ..., count (as __be64)}
	//	This is for a block-aligned message.
	//
	//	{   0, 0, 0, 0, ..., count (as __be64)}
	//	This is for a message whose length mod 64 is >= 56.
	//
	// Pre-swap the endianness of the words.
.Lfinup2x_finalize_countonly:
	movi		v16.2d, #0
	b		1f
.Lfinup2x_finalize_blockaligned:
	mov		x8, #0x80000000
	fmov		d16, x8
1:
	movi		v17.2d, #0
	movi		v18.2d, #0
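	// Since the bytecount is < 2^61, lsl #3 loses no bits, so
	// ror #29 == ror(lsl(count, 3), 32): the bit count with its 32-bit
	// halves swapped, matching the pre-swapped word order used on this
	// path.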
	ror		count, count, #29	// ror(lsl(count, 3), 32)
	mov		v19.d[0], xzr
	mov		v19.d[1], count
	mov		v20.16b, v16.16b
	movi		v21.2d, #0
	movi		v22.2d, #0
	mov		v23.16b, v19.16b
	mov		final_step, #2
	b		.Lfinup2x_loop_have_bswapped_data

.Lfinup2x_done:
	// Write the two digests with all bytes in the correct order.
CPU_LE(	rev32		state0_a.16b, state0_a.16b	)
CPU_LE(	rev32		state1_a.16b, state1_a.16b	)
CPU_LE(	rev32		state0_b.16b, state0_b.16b	)
CPU_LE(	rev32		state1_b.16b, state1_b.16b	)
	st1		{state0_a.4s-state1_a.4s}, [out1]
	st1		{state0_b.4s-state1_b.4s}, [out2]
	add		sp, sp, #128
	ret
SYM_FUNC_END(sha256_ce_finup2x)
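
//
// Illustrative C-side usage (a sketch, not taken from this file; the
// kernel_neon_begin/end bracket is the usual arm64 kernel-mode NEON idiom):
//
//	kernel_neon_begin();
//	sha256_ce_finup2x(ctx, data1, data2, len, out1, out2);
//	kernel_neon_end();
//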