Linux kernel mirror (for testing)
git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel
os
linux
1/* SPDX-License-Identifier: GPL-2.0-or-later */
2//
3// AES block cipher using AES-NI instructions
4//
5// Copyright 2026 Google LLC
6//
7// The code in this file supports 32-bit and 64-bit CPUs, and it doesn't require
8// AVX. It does use up to SSE4.1, which all CPUs with AES-NI have.
9#include <linux/linkage.h>
10
.section .rodata
#ifdef __x86_64__
// On x86_64, .rodata references must be RIP-relative (position-independent
// code); on 32-bit, plain absolute addressing is used instead.
#define RODATA(label)	label(%rip)
#else
#define RODATA(label)	label
#endif

	// A mask for pshufb that extracts the last dword, rotates it right by 8
	// bits, and copies the result to all four dwords.
	// (Dwords are little-endian, so selecting bytes [13,14,15,12] of the
	// last dword [12,13,14,15] yields ror32(dword3, 8).)
.p2align 4
.Lmask:
	.byte	13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12

	// The AES round constants, used during key expansion.  The AES-128
	// path loads all 10 from this table; the AES-256 path loads only the
	// first one and generates the rest by doubling in a register.
.Lrcon:
	.long	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36
28.text
29
// Compute the prefix XOR-sum of the four dwords of \a: transform
// [a0, a1, a2, a3] into [a0, a0^a1, a0^a1^a2, a0^a1^a2^a3].
// \tmp is an xmm register used as scratch; its contents are clobbered.
//
// This works by XORing \a with byte-shifted copies of itself: first a copy
// shifted left by one dword position, then a copy of the partial result
// shifted left by two dword positions.  (A shufps-based sequence could do
// this in four instructions but would need a pre-zeroed register; this form
// needs no setup, and the extra movdqa is usually "free".)
.macro _prefix_sum a, tmp
	movdqa	\a, \tmp	// \tmp = [a0, a1, a2, a3]
	pslldq	$4, \tmp	// \tmp = [0, a0, a1, a2]
	pxor	\tmp, \a	// \a = [a0, a0^a1, a1^a2, a2^a3]
	movdqa	\a, \tmp	// copy the partial sums
	pslldq	$8, \tmp	// \tmp = [0, 0, a0, a0^a1]
	pxor	\tmp, \a	// \a = [a0, a0^a1, a0^a1^a2, a0^a1^a2^a3]
.endm
45
// Generate the next AES round key and store it at (RNDKEYS).
//
// \a holds the round key the prefix sum is taken over (the previous round
// key for AES-128, or the first key of the previous pair for AES-256); on
// return it holds the newly generated round key.  \b holds the round key
// whose last dword feeds the rotate+SubBytes step; for AES-128 \a and \b
// are the same register.
//
// Requires MASK = .Lmask, RCON = four copies of the current round constant,
// and RNDKEYS pointing at the destination slot.  Clobbers %xmm2 and %xmm3.
.macro _gen_round_key a, b
	// Compute four copies of rcon[i] ^ SubBytes(ror32(w, 8)), where w is
	// the last dword of the previous round key (given in \b).
	//
	// 'aesenclast src, dst' does dst = src XOR SubBytes(ShiftRows(dst)).
	// It is used here solely for the SubBytes and the XOR.  The ShiftRows
	// is a no-op because all four columns are the same here.
	//
	// Don't use the 'aeskeygenassist' instruction, since:
	// - On most Intel CPUs it is microcoded, making it have a much higher
	//   latency and use more execution ports than 'aesenclast'.
	// - It cannot be used in a loop, since it requires an immediate.
	// - It doesn't do much more than 'aesenclast' in the first place.
	movdqa	\b, %xmm2
	pshufb	MASK, %xmm2
	aesenclast	RCON, %xmm2

	// XOR in the prefix sum of the four dwords of \a, which is the
	// previous round key (AES-128) or the first round key in the previous
	// pair of round keys (AES-256).  The result is the next round key.
	_prefix_sum	\a, tmp=%xmm3
	pxor	%xmm2, \a

	// Store the next round key to memory.  Also leave it in \a.
	movdqu	\a, (RNDKEYS)
.endm
72
// Expand the raw AES key at IN_KEY into round keys.
//
// Writes the standard round keys to RNDKEYS.  If INV_RNDKEYS is non-NULL,
// also writes the round keys for the Equivalent Inverse Cipher to it.
// \is_aes128 selects between AES-128 and AES-256 expansion.
//
// Clobbers %xmm0-%xmm3 and %xmm5-%xmm7; callee-saved GPRs used on i386
// (%ebx, %esi) are saved and restored below.
.macro _aes_expandkey_aesni is_aes128
#ifdef __x86_64__
	// Arguments
	.set	RNDKEYS,	%rdi
	.set	INV_RNDKEYS,	%rsi
	.set	IN_KEY,		%rdx

	// Other local variables
	.set	RCON_PTR,	%rcx
	.set	COUNTER,	%eax
#else
	// Arguments, assuming -mregparm=3
	.set	RNDKEYS,	%eax
	.set	INV_RNDKEYS,	%edx
	.set	IN_KEY,		%ecx

	// Other local variables
	.set	RCON_PTR,	%ebx
	.set	COUNTER,	%esi
#endif
	.set	RCON,	%xmm6
	.set	MASK,	%xmm7

#ifdef __i386__
	// %ebx and %esi are callee-saved on i386.
	push	%ebx
	push	%esi
#endif

.if \is_aes128
	// AES-128: the first round key is simply a copy of the raw key.
	movdqu	(IN_KEY), %xmm0
	movdqu	%xmm0, (RNDKEYS)
.else
	// AES-256: the first two round keys are simply a copy of the raw key.
	movdqu	(IN_KEY), %xmm0
	movdqu	%xmm0, (RNDKEYS)
	movdqu	16(IN_KEY), %xmm1
	movdqu	%xmm1, 16(RNDKEYS)
	add	$32, RNDKEYS
.endif

	// Generate the remaining round keys.
	movdqa	RODATA(.Lmask), MASK
.if \is_aes128
	lea	RODATA(.Lrcon), RCON_PTR
	mov	$10, COUNTER
.Lgen_next_aes128_round_key:
	add	$16, RNDKEYS		// Advance to the next round key slot
	movd	(RCON_PTR), RCON
	pshufd	$0x00, RCON, RCON	// Broadcast rcon[i] to all four dwords
	add	$4, RCON_PTR
	_gen_round_key	%xmm0, %xmm0
	dec	COUNTER
	jnz	.Lgen_next_aes128_round_key
.else
	// AES-256: only the first 7 round constants are needed, so instead of
	// loading each one from memory, just start by loading [1, 1, 1, 1] and
	// then generate the rest by doubling.
	pshufd	$0x00, RODATA(.Lrcon), RCON
	pxor	%xmm5, %xmm5		// All-zeroes
	mov	$7, COUNTER
.Lgen_next_aes256_round_key_pair:
	// Generate the next AES-256 round key: either the first of a pair of
	// two, or the last one.
	_gen_round_key	%xmm0, %xmm1

	dec	COUNTER
	jz	.Lgen_aes256_round_keys_done

	// Generate the second AES-256 round key of the pair.  Compared to the
	// first, there's no rotation and no XOR of a round constant.
	pshufd	$0xff, %xmm0, %xmm2	// Get four copies of last dword
	aesenclast	%xmm5, %xmm2	// Just does SubBytes
	_prefix_sum	%xmm1, tmp=%xmm3
	pxor	%xmm2, %xmm1
	movdqu	%xmm1, 16(RNDKEYS)
	add	$32, RNDKEYS		// Advance past the completed pair
	paddd	RCON, RCON		// RCON <<= 1
	jmp	.Lgen_next_aes256_round_key_pair
.Lgen_aes256_round_keys_done:
.endif

	// If INV_RNDKEYS is non-NULL, write the round keys for the Equivalent
	// Inverse Cipher to it.  To do that, reverse the standard round keys,
	// and apply aesimc (InvMixColumn) to each except the first and last.
	// RNDKEYS currently points to the last standard round key.
	test	INV_RNDKEYS, INV_RNDKEYS
	jz	.Ldone\@
	movdqu	(RNDKEYS), %xmm0	// Last standard round key
	movdqu	%xmm0, (INV_RNDKEYS)	// => First inverse round key
.if \is_aes128
	mov	$9, COUNTER
.else
	mov	$13, COUNTER
.endif
	// Walk the standard round keys backwards, applying aesimc to each of
	// the COUNTER middle keys.
.Lgen_next_inv_round_key\@:
	sub	$16, RNDKEYS
	add	$16, INV_RNDKEYS
	movdqu	(RNDKEYS), %xmm0
	aesimc	%xmm0, %xmm0
	movdqu	%xmm0, (INV_RNDKEYS)
	dec	COUNTER
	jnz	.Lgen_next_inv_round_key\@
	movdqu	-16(RNDKEYS), %xmm0	// First standard round key
	movdqu	%xmm0, 16(INV_RNDKEYS)	// => Last inverse round key

.Ldone\@:
#ifdef __i386__
	pop	%esi
	pop	%ebx
#endif
	RET
.endm
185
// void aes128_expandkey_aesni(u32 rndkeys[], u32 *inv_rndkeys,
//			       const u8 in_key[AES_KEYSIZE_128]);
//
// Expand a raw AES-128 key.  inv_rndkeys may be NULL if the Equivalent
// Inverse Cipher round keys are not needed.
SYM_FUNC_START(aes128_expandkey_aesni)
	_aes_expandkey_aesni	1
SYM_FUNC_END(aes128_expandkey_aesni)
191
// void aes256_expandkey_aesni(u32 rndkeys[], u32 *inv_rndkeys,
//			       const u8 in_key[AES_KEYSIZE_256]);
//
// Expand a raw AES-256 key.  inv_rndkeys may be NULL if the Equivalent
// Inverse Cipher round keys are not needed.
SYM_FUNC_START(aes256_expandkey_aesni)
	_aes_expandkey_aesni	0
SYM_FUNC_END(aes256_expandkey_aesni)
197
// En/decrypt one 16-byte block using pre-expanded round keys.
//
// \enc selects encryption (aesenc/aesenclast) vs. decryption
// (aesdec/aesdeclast; RNDKEYS must then be the inverse-cipher round keys).
// The code performs the initial whitening XOR, then NROUNDS-1 full rounds,
// then one final round, consuming NROUNDS+1 round keys in total.
//
// Clobbers %xmm0 and %xmm1; the i386 version saves/restores %ebx.
.macro _aes_crypt_aesni enc
#ifdef __x86_64__
	.set	RNDKEYS,	%rdi
	.set	NROUNDS,	%esi
	.set	OUT,		%rdx
	.set	IN,		%rcx
#else
	// Assuming -mregparm=3
	.set	RNDKEYS,	%eax
	.set	NROUNDS,	%edx
	.set	OUT,		%ecx
	.set	IN,		%ebx	// Passed on stack
#endif

#ifdef __i386__
	// The fourth argument is on the stack; after the push it sits above
	// the saved %ebx and the return address, i.e. at 8(%esp).
	push	%ebx
	mov	8(%esp), %ebx
#endif

	// Zero-th round: whitening XOR with the first round key
	movdqu	(IN), %xmm0
	movdqu	(RNDKEYS), %xmm1
	pxor	%xmm1, %xmm0

	// Normal rounds
	add	$16, RNDKEYS
	dec	NROUNDS
.Lnext_round\@:
	movdqu	(RNDKEYS), %xmm1
.if \enc
	aesenc	%xmm1, %xmm0
.else
	aesdec	%xmm1, %xmm0
.endif
	add	$16, RNDKEYS
	dec	NROUNDS
	jne	.Lnext_round\@

	// Last round (no MixColumns)
	movdqu	(RNDKEYS), %xmm1
.if \enc
	aesenclast	%xmm1, %xmm0
.else
	aesdeclast	%xmm1, %xmm0
.endif
	movdqu	%xmm0, (OUT)

#ifdef __i386__
	pop	%ebx
#endif
	RET
.endm
250
// void aes_encrypt_aesni(const u32 rndkeys[], int nrounds,
//			  u8 out[AES_BLOCK_SIZE], const u8 in[AES_BLOCK_SIZE]);
//
// Encrypt one block with the standard (forward) round keys.
SYM_FUNC_START(aes_encrypt_aesni)
	_aes_crypt_aesni	1
SYM_FUNC_END(aes_encrypt_aesni)
256
// void aes_decrypt_aesni(const u32 inv_rndkeys[], int nrounds,
//			  u8 out[AES_BLOCK_SIZE], const u8 in[AES_BLOCK_SIZE]);
//
// Decrypt one block.  Requires the Equivalent Inverse Cipher round keys
// (as produced by the expandkey functions when inv_rndkeys is non-NULL).
SYM_FUNC_START(aes_decrypt_aesni)
	_aes_crypt_aesni	0
SYM_FUNC_END(aes_decrypt_aesni)