Linux kernel mirror (for testing)
git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel
os
linux
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2022-2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 */

#include <linux/linkage.h>
#include <asm/frame.h>

/*
 * First row of the ChaCha20 state: the ASCII string "expand 32-byte k" stored
 * as a single little-endian 128-bit value. It is 16-byte aligned because the
 * function below loads it with movaps, which requires an aligned operand.
 */
.section	.rodata, "a"
.align 16
CONSTANTS:	.octa 0x6b20657479622d323320646e61707865
.text

/*
 * Very basic SSE2 implementation of ChaCha20. Produces a given positive number
 * of blocks of output with a nonce of 0, taking an input key and 8-byte
 * counter. Importantly does not spill to the stack. Its arguments are:
 *
 *	rdi: output bytes
 *	rsi: 32-byte key input
 *	rdx: 8-byte counter input/output
 *	rcx: number of 64-byte blocks to write to output
 *
 * The block loop only tests nblocks after a block has been written, so rcx
 * must be non-zero on entry. The key and output buffers are accessed with
 * unaligned moves and therefore need no particular alignment.
 *
 * Clobbers: rax, rcx, rdi, xmm0-xmm9 and the flags. Before returning, the
 * state, key and temp registers (xmm0-xmm4, xmm6, xmm7) are zeroed.
 */
SYM_FUNC_START(__arch_chacha20_blocks_nostack)

.set	output,		%rdi
.set	key,		%rsi
.set	counter,	%rdx
.set	nblocks,	%rcx
.set	i,		%al
/* xmm registers are *not* callee-save. */
.set	temp,		%xmm0
.set	state0,		%xmm1
.set	state1,		%xmm2
.set	state2,		%xmm3
.set	state3,		%xmm4
.set	copy0,		%xmm5
.set	copy1,		%xmm6
.set	copy2,		%xmm7
.set	copy3,		%xmm8
.set	one,		%xmm9

	/* copy0 = "expand 32-byte k" */
	movaps		CONSTANTS(%rip),copy0
	/* copy1,copy2 = key */
	movups		0x00(key),copy1
	movups		0x10(key),copy2
	/* copy3 = counter || zero nonce (movq clears the upper 64 bits) */
	movq		0x00(counter),copy3
	/* one = 1 || 0, so adding it only ever touches the counter half */
	movq		$1,%rax
	movq		%rax,one

.Lblock:
	/* state0,state1,state2,state3 = copy0,copy1,copy2,copy3 */
	movdqa		copy0,state0
	movdqa		copy1,state1
	movdqa		copy2,state2
	movdqa		copy3,state3

	/* 10 passes of the loop below, two rounds per pass: 20 rounds */
	movb		$10,i
.Lpermute:
	/*
	 * Each rotl32(x, n) below is built from a copy of x shifted left by n,
	 * x itself shifted right by 32 - n, and an OR of the two halves.
	 */

	/* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */
	paddd		state1,state0
	pxor		state0,state3
	movdqa		state3,temp
	pslld		$16,temp
	psrld		$16,state3
	por		temp,state3

	/* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */
	paddd		state3,state2
	pxor		state2,state1
	movdqa		state1,temp
	pslld		$12,temp
	psrld		$20,state1
	por		temp,state1

	/* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */
	paddd		state1,state0
	pxor		state0,state3
	movdqa		state3,temp
	pslld		$8,temp
	psrld		$24,state3
	por		temp,state3

	/* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */
	paddd		state3,state2
	pxor		state2,state1
	movdqa		state1,temp
	pslld		$7,temp
	psrld		$25,state1
	por		temp,state1

	/*
	 * Rotate the lanes of rows 1-3 by one, two and three positions so that
	 * the same four-lane sequence now operates on the diagonals.
	 */
	/* state1[0,1,2,3] = state1[1,2,3,0] */
	pshufd		$0x39,state1,state1
	/* state2[0,1,2,3] = state2[2,3,0,1] */
	pshufd		$0x4e,state2,state2
	/* state3[0,1,2,3] = state3[3,0,1,2] */
	pshufd		$0x93,state3,state3

	/* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */
	paddd		state1,state0
	pxor		state0,state3
	movdqa		state3,temp
	pslld		$16,temp
	psrld		$16,state3
	por		temp,state3

	/* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */
	paddd		state3,state2
	pxor		state2,state1
	movdqa		state1,temp
	pslld		$12,temp
	psrld		$20,state1
	por		temp,state1

	/* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */
	paddd		state1,state0
	pxor		state0,state3
	movdqa		state3,temp
	pslld		$8,temp
	psrld		$24,state3
	por		temp,state3

	/* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */
	paddd		state3,state2
	pxor		state2,state1
	movdqa		state1,temp
	pslld		$7,temp
	psrld		$25,state1
	por		temp,state1

	/* Undo the lane rotation so the rows line up as columns again. */
	/* state1[0,1,2,3] = state1[3,0,1,2] */
	pshufd		$0x93,state1,state1
	/* state2[0,1,2,3] = state2[2,3,0,1] */
	pshufd		$0x4e,state2,state2
	/* state3[0,1,2,3] = state3[1,2,3,0] */
	pshufd		$0x39,state3,state3

	decb		i
	jnz		.Lpermute

	/* output0 = state0 + copy0 */
	paddd		copy0,state0
	movups		state0,0x00(output)
	/* output1 = state1 + copy1 */
	paddd		copy1,state1
	movups		state1,0x10(output)
	/* output2 = state2 + copy2 */
	paddd		copy2,state2
	movups		state2,0x20(output)
	/* output3 = state3 + copy3 */
	paddd		copy3,state3
	movups		state3,0x30(output)

	/* ++copy3.counter (64-bit lane add; the nonce lane gets + 0) */
	paddq		one,copy3

	/* output += 64, --nblocks */
	addq		$64,output
	decq		nblocks
	jnz		.Lblock

	/* counter = copy3.counter */
	movq		copy3,0x00(counter)

	/* Zero out the potentially sensitive regs, in case nothing uses these again. */
	pxor		state0,state0
	pxor		state1,state1
	pxor		state2,state2
	pxor		state3,state3
	pxor		copy1,copy1
	pxor		copy2,copy2
	pxor		temp,temp

	ret
SYM_FUNC_END(__arch_chacha20_blocks_nostack)