
lib/crypto: x86/sha512: Migrate optimized SHA-512 code to library

Instead of exposing the x86-optimized SHA-512 code via x86-specific
crypto_shash algorithms, just implement the sha512_blocks() library
function. This is much simpler, it makes the SHA-512 (and SHA-384)
library functions x86-optimized, and it fixes the longstanding issue
where the x86-optimized SHA-512 code was disabled by default. SHA-512
remains available through crypto_shash, but individual architectures
no longer need to handle it.
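
For context, kernel code now reaches this via the SHA-512 library API in
<crypto/sha2.h> rather than through a crypto_shash handle. A minimal
sketch of a caller (the buffer names here are illustrative):

	#include <crypto/sha2.h>

	u8 digest[SHA512_DIGEST_SIZE];

	/* One-shot helper from lib/crypto; it bottoms out in
	 * sha512_blocks(), which this patch makes x86-optimized. */
	sha512(data, data_len, digest);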

To match sha512_blocks(), change the type of the nblocks parameter of
the assembly functions from int to size_t. The assembly functions
already treated it as size_t.
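
Concretely, taking the AVX variant as an example, the declaration seen by
C code changes as follows (the old form is from the removed glue code,
the new form from lib/crypto/x86/sha512.h below):

	/* before */
	asmlinkage void sha512_transform_avx(struct sha512_state *state,
					     const u8 *data, int blocks);

	/* after */
	asmlinkage void sha512_transform_avx(struct sha512_block_state *state,
					     const u8 *data, size_t nblocks);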

Acked-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20250630160320.2888-15-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>

+395 -675
-13
arch/x86/crypto/Kconfig
···
 	  - AVX2 (Advanced Vector Extensions 2)
 	  - SHA-NI (SHA Extensions New Instructions)

-config CRYPTO_SHA512_SSSE3
-	tristate "Hash functions: SHA-384 and SHA-512 (SSSE3/AVX/AVX2)"
-	depends on 64BIT
-	select CRYPTO_SHA512
-	select CRYPTO_HASH
-	help
-	  SHA-384 and SHA-512 secure hash algorithms (FIPS 180)
-
-	  Architecture: x86_64 using:
-	  - SSSE3 (Supplemental SSE3)
-	  - AVX (Advanced Vector Extensions)
-	  - AVX2 (Advanced Vector Extensions 2)
-
 config CRYPTO_SM3_AVX_X86_64
 	tristate "Hash functions: SM3 (AVX)"
 	depends on 64BIT
-3
arch/x86/crypto/Makefile
···
 obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o
 sha1-ssse3-y := sha1_avx2_x86_64_asm.o sha1_ssse3_asm.o sha1_ni_asm.o sha1_ssse3_glue.o

-obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o
-sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o
-
 obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
 ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
+170 -170
arch/x86/crypto/sha512-avx-asm.S → lib/crypto/x86/sha512-ssse3-asm.S
···
 ########################################################################
-# Implement fast SHA-512 with AVX instructions. (x86_64)
+# Implement fast SHA-512 with SSSE3 instructions. (x86_64)
 #
 # Copyright (C) 2013 Intel Corporation.
 #
···
 ########################################################################

 #include <linux/linkage.h>
-#include <linux/cfi_types.h>

 .text

 # Virtual Registers
 # ARG1
-digest	= %rdi
+digest =	%rdi
 # ARG2
-msg	= %rsi
+msg =		%rsi
 # ARG3
-msglen	= %rdx
-T1	= %rcx
-T2	= %r8
-a_64	= %r9
-b_64	= %r10
-c_64	= %r11
-d_64	= %r12
-e_64	= %r13
-f_64	= %r14
-g_64	= %r15
-h_64	= %rbx
-tmp0	= %rax
+msglen =	%rdx
+T1 =		%rcx
+T2 =		%r8
+a_64 =		%r9
+b_64 =		%r10
+c_64 =		%r11
+d_64 =		%r12
+e_64 =		%r13
+f_64 =		%r14
+g_64 =		%r15
+h_64 =		%rbx
+tmp0 =		%rax

 # Local variables (stack frame)

-# Message Schedule
 W_SIZE = 80*8
-# W[t] + K[t] | W[t+1] + K[t+1]
 WK_SIZE = 2*8

 frame_W = 0
···
 	a_64 = TMP
 .endm

-.macro RORQ p1 p2
-# shld is faster than ror on Sandybridge
-	shld $(64-\p2), \p1, \p1
-.endm
-
 .macro SHA512_Round rnd
+
 	# Compute Round %%t
-	mov	f_64, T1	# T1 = f
-	mov	e_64, tmp0	# tmp = e
-	xor	g_64, T1	# T1 = f ^ g
-	RORQ	tmp0, 23	# 41	# tmp = e ror 23
-	and	e_64, T1	# T1 = (f ^ g) & e
-	xor	e_64, tmp0	# tmp = (e ror 23) ^ e
-	xor	g_64, T1	# T1 = ((f ^ g) & e) ^ g = CH(e,f,g)
+	mov	f_64, T1	# T1 = f
+	mov	e_64, tmp0	# tmp = e
+	xor	g_64, T1	# T1 = f ^ g
+	ror	$23, tmp0	# 41	# tmp = e ror 23
+	and	e_64, T1	# T1 = (f ^ g) & e
+	xor	e_64, tmp0	# tmp = (e ror 23) ^ e
+	xor	g_64, T1	# T1 = ((f ^ g) & e) ^ g = CH(e,f,g)
 	idx = \rnd
-	add	WK_2(idx), T1	# W[t] + K[t] from message scheduler
-	RORQ	tmp0, 4		# 18	# tmp = ((e ror 23) ^ e) ror 4
-	xor	e_64, tmp0	# tmp = (((e ror 23) ^ e) ror 4) ^ e
-	mov	a_64, T2	# T2 = a
-	add	h_64, T1	# T1 = CH(e,f,g) + W[t] + K[t] + h
-	RORQ	tmp0, 14	# 14	# tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e)
-	add	tmp0, T1	# T1 = CH(e,f,g) + W[t] + K[t] + S1(e)
-	mov	a_64, tmp0	# tmp = a
-	xor	c_64, T2	# T2 = a ^ c
-	and	c_64, tmp0	# tmp = a & c
-	and	b_64, T2	# T2 = (a ^ c) & b
-	xor	tmp0, T2	# T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c)
-	mov	a_64, tmp0	# tmp = a
-	RORQ	tmp0, 5		# 39	# tmp = a ror 5
-	xor	a_64, tmp0	# tmp = (a ror 5) ^ a
-	add	T1, d_64	# e(next_state) = d + T1
-	RORQ	tmp0, 6		# 34	# tmp = ((a ror 5) ^ a) ror 6
-	xor	a_64, tmp0	# tmp = (((a ror 5) ^ a) ror 6) ^ a
-	lea	(T1, T2), h_64	# a(next_state) = T1 + Maj(a,b,c)
-	RORQ	tmp0, 28	# 28	# tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a)
-	add	tmp0, h_64	# a(next_state) = T1 + Maj(a,b,c) S0(a)
+	add	WK_2(idx), T1	# W[t] + K[t] from message scheduler
+	ror	$4, tmp0	# 18	# tmp = ((e ror 23) ^ e) ror 4
+	xor	e_64, tmp0	# tmp = (((e ror 23) ^ e) ror 4) ^ e
+	mov	a_64, T2	# T2 = a
+	add	h_64, T1	# T1 = CH(e,f,g) + W[t] + K[t] + h
+	ror	$14, tmp0	# 14	# tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e)
+	add	tmp0, T1	# T1 = CH(e,f,g) + W[t] + K[t] + S1(e)
+	mov	a_64, tmp0	# tmp = a
+	xor	c_64, T2	# T2 = a ^ c
+	and	c_64, tmp0	# tmp = a & c
+	and	b_64, T2	# T2 = (a ^ c) & b
+	xor	tmp0, T2	# T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c)
+	mov	a_64, tmp0	# tmp = a
+	ror	$5, tmp0	# 39	# tmp = a ror 5
+	xor	a_64, tmp0	# tmp = (a ror 5) ^ a
+	add	T1, d_64	# e(next_state) = d + T1
+	ror	$6, tmp0	# 34	# tmp = ((a ror 5) ^ a) ror 6
+	xor	a_64, tmp0	# tmp = (((a ror 5) ^ a) ror 6) ^ a
+	lea	(T1, T2), h_64	# a(next_state) = T1 + Maj(a,b,c)
+	ror	$28, tmp0	# 28	# tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a)
+	add	tmp0, h_64	# a(next_state) = T1 + Maj(a,b,c) S0(a)
 	RotateState
 .endm

-.macro SHA512_2Sched_2Round_avx rnd
+.macro SHA512_2Sched_2Round_sse rnd
+
 	# Compute rounds t-2 and t-1
 	# Compute message schedule QWORDS t and t+1

 	# Two rounds are computed based on the values for K[t-2]+W[t-2] and
 	# K[t-1]+W[t-1] which were previously stored at WK_2 by the message
 	# scheduler.
-	# The two new schedule QWORDS are stored at [W_t(t)] and [W_t(t+1)].
+	# The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)].
 	# They are then added to their respective SHA512 constants at
-	# [K_t(t)] and [K_t(t+1)] and stored at dqword [WK_2(t)]
+	# [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)]
 	# For brievity, the comments following vectored instructions only refer to
 	# the first of a pair of QWORDS.
-	# Eg. XMM4=W[t-2] really means XMM4={W[t-2]|W[t-1]}
+	# Eg. XMM2=W[t-2] really means XMM2={W[t-2]|W[t-1]}
 	# The computation of the message schedule and the rounds are tightly
 	# stitched to take advantage of instruction-level parallelism.
+	# For clarity, integer instructions (for the rounds calculation) are indented
+	# by one tab. Vectored instructions (for the message scheduler) are indented
+	# by two tabs.

-	idx = \rnd - 2
-	vmovdqa	W_t(idx), %xmm4		# XMM4 = W[t-2]
-	idx = \rnd - 15
-	vmovdqu	W_t(idx), %xmm5		# XMM5 = W[t-15]
 	mov	f_64, T1
-	vpsrlq	$61, %xmm4, %xmm0	# XMM0 = W[t-2]>>61
-	mov	e_64, tmp0
-	vpsrlq	$1, %xmm5, %xmm6	# XMM6 = W[t-15]>>1
+	idx = \rnd -2
+		movdqa	W_t(idx), %xmm2		# XMM2 = W[t-2]
 	xor	g_64, T1
-	RORQ	tmp0, 23 # 41
-	vpsrlq	$19, %xmm4, %xmm1	# XMM1 = W[t-2]>>19
 	and	e_64, T1
-	xor	e_64, tmp0
-	vpxor	%xmm1, %xmm0, %xmm0	# XMM0 = W[t-2]>>61 ^ W[t-2]>>19
+		movdqa	%xmm2, %xmm0		# XMM0 = W[t-2]
 	xor	g_64, T1
 	idx = \rnd
-	add	WK_2(idx), T1#
-	vpsrlq	$8, %xmm5, %xmm7	# XMM7 = W[t-15]>>8
-	RORQ	tmp0, 4 # 18
-	vpsrlq	$6, %xmm4, %xmm2	# XMM2 = W[t-2]>>6
-	xor	e_64, tmp0
-	mov	a_64, T2
-	add	h_64, T1
-	vpxor	%xmm7, %xmm6, %xmm6	# XMM6 = W[t-15]>>1 ^ W[t-15]>>8
-	RORQ	tmp0, 14 # 14
-	add	tmp0, T1
-	vpsrlq	$7, %xmm5, %xmm8	# XMM8 = W[t-15]>>7
-	mov	a_64, tmp0
-	xor	c_64, T2
-	vpsllq	$(64-61), %xmm4, %xmm3	# XMM3 = W[t-2]<<3
-	and	c_64, tmp0
-	and	b_64, T2
-	vpxor	%xmm3, %xmm2, %xmm2	# XMM2 = W[t-2]>>6 ^ W[t-2]<<3
-	xor	tmp0, T2
-	mov	a_64, tmp0
-	vpsllq	$(64-1), %xmm5, %xmm9	# XMM9 = W[t-15]<<63
-	RORQ	tmp0, 5 # 39
-	vpxor	%xmm9, %xmm8, %xmm8	# XMM8 = W[t-15]>>7 ^ W[t-15]<<63
-	xor	a_64, tmp0
-	add	T1, d_64
-	RORQ	tmp0, 6 # 34
-	xor	a_64, tmp0
-	vpxor	%xmm8, %xmm6, %xmm6	# XMM6 = W[t-15]>>1 ^ W[t-15]>>8 ^
-					#	W[t-15]>>7 ^ W[t-15]<<63
-	lea	(T1, T2), h_64
-	RORQ	tmp0, 28 # 28
-	vpsllq	$(64-19), %xmm4, %xmm4	# XMM4 = W[t-2]<<25
-	add	tmp0, h_64
-	RotateState
-	vpxor	%xmm4, %xmm0, %xmm0	# XMM0 = W[t-2]>>61 ^ W[t-2]>>19 ^
-					#	 W[t-2]<<25
-	mov	f_64, T1
-	vpxor	%xmm2, %xmm0, %xmm0	# XMM0 = s1(W[t-2])
+	add	WK_2(idx), T1
+	idx = \rnd - 15
+		movdqu	W_t(idx), %xmm5		# XMM5 = W[t-15]
 	mov	e_64, tmp0
-	xor	g_64, T1
-	idx = \rnd - 16
-	vpaddq	W_t(idx), %xmm0, %xmm0	# XMM0 = s1(W[t-2]) + W[t-16]
-	idx = \rnd - 7
-	vmovdqu	W_t(idx), %xmm1		# XMM1 = W[t-7]
-	RORQ	tmp0, 23 # 41
-	and	e_64, T1
+	ror	$23, tmp0 # 41
+		movdqa	%xmm5, %xmm3		# XMM3 = W[t-15]
 	xor	e_64, tmp0
+	ror	$4, tmp0 # 18
+		psrlq	$61-19, %xmm0		# XMM0 = W[t-2] >> 42
+	xor	e_64, tmp0
+	ror	$14, tmp0 # 14
+		psrlq	$(8-7), %xmm3		# XMM3 = W[t-15] >> 1
+	add	tmp0, T1
+	add	h_64, T1
+		pxor	%xmm2, %xmm0		# XMM0 = (W[t-2] >> 42) ^ W[t-2]
+	mov	a_64, T2
+	xor	c_64, T2
+		pxor	%xmm5, %xmm3		# XMM3 = (W[t-15] >> 1) ^ W[t-15]
+	and	b_64, T2
+	mov	a_64, tmp0
+		psrlq	$(19-6), %xmm0		# XMM0 = ((W[t-2]>>42)^W[t-2])>>13
+	and	c_64, tmp0
+	xor	tmp0, T2
+		psrlq	$(7-1), %xmm3		# XMM3 = ((W[t-15]>>1)^W[t-15])>>6
+	mov	a_64, tmp0
+	ror	$5, tmp0 # 39
+		pxor	%xmm2, %xmm0		# XMM0 = (((W[t-2]>>42)^W[t-2])>>13)^W[t-2]
+	xor	a_64, tmp0
+	ror	$6, tmp0 # 34
+		pxor	%xmm5, %xmm3		# XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]
+	xor	a_64, tmp0
+	ror	$28, tmp0 # 28
+		psrlq	$6, %xmm0		# XMM0 = ((((W[t-2]>>42)^W[t-2])>>13)^W[t-2])>>6
+	add	tmp0, T2
+	add	T1, d_64
+		psrlq	$1, %xmm3		# XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]>>1
+	lea	(T1, T2), h_64
+	RotateState
+		movdqa	%xmm2, %xmm1		# XMM1 = W[t-2]
+	mov	f_64, T1
 	xor	g_64, T1
-	vpsllq	$(64-8), %xmm5, %xmm5	# XMM5 = W[t-15]<<56
+		movdqa	%xmm5, %xmm4		# XMM4 = W[t-15]
+	and	e_64, T1
+	xor	g_64, T1
+		psllq	$(64-19)-(64-61) , %xmm1 # XMM1 = W[t-2] << 42
 	idx = \rnd + 1
 	add	WK_2(idx), T1
-	vpxor	%xmm5, %xmm6, %xmm6	# XMM6 = s0(W[t-15])
-	RORQ	tmp0, 4 # 18
-	vpaddq	%xmm6, %xmm0, %xmm0	# XMM0 = s1(W[t-2]) + W[t-16] + s0(W[t-15])
+	mov	e_64, tmp0
+		psllq	$(64-1)-(64-8), %xmm4	# XMM4 = W[t-15] << 7
+	ror	$23, tmp0 # 41
 	xor	e_64, tmp0
-	vpaddq	%xmm1, %xmm0, %xmm0	# XMM0 = W[t] = s1(W[t-2]) + W[t-7] +
-					#	s0(W[t-15]) + W[t-16]
-	mov	a_64, T2
-	add	h_64, T1
-	RORQ	tmp0, 14 # 14
+		pxor	%xmm2, %xmm1		# XMM1 = (W[t-2] << 42)^W[t-2]
+	ror	$4, tmp0 # 18
+	xor	e_64, tmp0
+		pxor	%xmm5, %xmm4		# XMM4 = (W[t-15]<<7)^W[t-15]
+	ror	$14, tmp0 # 14
 	add	tmp0, T1
-	idx = \rnd
-	vmovdqa	%xmm0, W_t(idx)		# Store W[t]
-	vpaddq	K_t(idx), %xmm0, %xmm0	# Compute W[t]+K[t]
-	vmovdqa	%xmm0, WK_2(idx)	# Store W[t]+K[t] for next rounds
-	mov	a_64, tmp0
+		psllq	$(64-61), %xmm1		# XMM1 = ((W[t-2] << 42)^W[t-2])<<3
+	add	h_64, T1
+	mov	a_64, T2
+		psllq	$(64-8), %xmm4		# XMM4 = ((W[t-15]<<7)^W[t-15])<<56
 	xor	c_64, T2
-	and	c_64, tmp0
 	and	b_64, T2
-	xor	tmp0, T2
+		pxor	%xmm1, %xmm0		# XMM0 = s1(W[t-2])
 	mov	a_64, tmp0
-	RORQ	tmp0, 5 # 39
+	and	c_64, tmp0
+	idx = \rnd - 7
+		movdqu	W_t(idx), %xmm1		# XMM1 = W[t-7]
+	xor	tmp0, T2
+		pxor	%xmm4, %xmm3		# XMM3 = s0(W[t-15])
+	mov	a_64, tmp0
+		paddq	%xmm3, %xmm0		# XMM0 = s1(W[t-2]) + s0(W[t-15])
+	ror	$5, tmp0 # 39
+	idx =\rnd-16
+		paddq	W_t(idx), %xmm0		# XMM0 = s1(W[t-2]) + s0(W[t-15]) + W[t-16]
 	xor	a_64, tmp0
+		paddq	%xmm1, %xmm0		# XMM0 = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16]
+	ror	$6, tmp0 # 34
+		movdqa	%xmm0, W_t(\rnd)	# Store scheduled qwords
+	xor	a_64, tmp0
+		paddq	K_t(\rnd), %xmm0	# Compute W[t]+K[t]
+	ror	$28, tmp0 # 28
+	idx = \rnd
+		movdqa	%xmm0, WK_2(idx)	# Store W[t]+K[t] for next rounds
+	add	tmp0, T2
 	add	T1, d_64
-	RORQ	tmp0, 6 # 34
-	xor	a_64, tmp0
 	lea	(T1, T2), h_64
-	RORQ	tmp0, 28 # 28
-	add	tmp0, h_64
 	RotateState
 .endm

 ########################################################################
-# void sha512_transform_avx(sha512_state *state, const u8 *data, int blocks)
+# void sha512_transform_ssse3(struct sha512_block_state *state,
+#			      const u8 *data, size_t nblocks);
 # Purpose: Updates the SHA512 digest stored at "state" with the message
 # stored in "data".
 # The size of the message pointed to by "data" must be an integer multiple
 # of SHA512 message blocks.
-# "blocks" is the message length in SHA512 blocks
+# "nblocks" is the message length in SHA512 blocks
 ########################################################################
-SYM_TYPED_FUNC_START(sha512_transform_avx)
+SYM_FUNC_START(sha512_transform_ssse3)
+
 	test msglen, msglen
 	je .Lnowork
···
 	# Allocate Stack Space
 	push	%rbp
 	mov	%rsp, %rbp
-	sub     $frame_size, %rsp
+	sub	$frame_size, %rsp
 	and	$~(0x20 - 1), %rsp

 .Lupdateblock:

-	# Load state variables
-	mov     DIGEST(0), a_64
-	mov     DIGEST(1), b_64
-	mov     DIGEST(2), c_64
-	mov     DIGEST(3), d_64
-	mov     DIGEST(4), e_64
-	mov     DIGEST(5), f_64
-	mov     DIGEST(6), g_64
-	mov     DIGEST(7), h_64
+	# Load state variables
+	mov	DIGEST(0), a_64
+	mov	DIGEST(1), b_64
+	mov	DIGEST(2), c_64
+	mov	DIGEST(3), d_64
+	mov	DIGEST(4), e_64
+	mov	DIGEST(5), f_64
+	mov	DIGEST(6), g_64
+	mov	DIGEST(7), h_64

 	t = 0
 	.rept 80/2 + 1
···
 	# +1 iteration because the scheduler leads hashing by 1 iteration
 	.if t < 2
 		# BSWAP 2 QWORDS
-		vmovdqa	XMM_QWORD_BSWAP(%rip), %xmm1
-		vmovdqu	MSG(t), %xmm0
-		vpshufb	%xmm1, %xmm0, %xmm0	# BSWAP
-		vmovdqa	%xmm0, W_t(t)		# Store Scheduled Pair
-		vpaddq	K_t(t), %xmm0, %xmm0	# Compute W[t]+K[t]
-		vmovdqa	%xmm0, WK_2(t)		# Store into WK for rounds
+		movdqa	XMM_QWORD_BSWAP(%rip), %xmm1
+		movdqu	MSG(t), %xmm0
+		pshufb	%xmm1, %xmm0		# BSWAP
+		movdqa	%xmm0, W_t(t)		# Store Scheduled Pair
+		paddq	K_t(t), %xmm0		# Compute W[t]+K[t]
+		movdqa	%xmm0, WK_2(t)		# Store into WK for rounds
 	.elseif t < 16
 		# BSWAP 2 QWORDS# Compute 2 Rounds
-		vmovdqu	MSG(t), %xmm0
-		vpshufb	%xmm1, %xmm0, %xmm0	# BSWAP
-		SHA512_Round t-2		# Round t-2
-		vmovdqa	%xmm0, W_t(t)		# Store Scheduled Pair
-		vpaddq	K_t(t), %xmm0, %xmm0	# Compute W[t]+K[t]
-		SHA512_Round t-1		# Round t-1
-		vmovdqa	%xmm0, WK_2(t)# Store W[t]+K[t] into WK
+		movdqu	MSG(t), %xmm0
+		pshufb	%xmm1, %xmm0		# BSWAP
+		SHA512_Round t-2		# Round t-2
+		movdqa	%xmm0, W_t(t)		# Store Scheduled Pair
+		paddq	K_t(t), %xmm0		# Compute W[t]+K[t]
+		SHA512_Round t-1		# Round t-1
+		movdqa	%xmm0, WK_2(t)		# Store W[t]+K[t] into WK
 	.elseif t < 79
 		# Schedule 2 QWORDS# Compute 2 Rounds
-		SHA512_2Sched_2Round_avx t
+		SHA512_2Sched_2Round_sse t
 	.else
 		# Compute 2 Rounds
 		SHA512_Round t-2
···
 	.endr

 	# Update digest
-	add     a_64, DIGEST(0)
-	add     b_64, DIGEST(1)
-	add     c_64, DIGEST(2)
-	add     d_64, DIGEST(3)
-	add     e_64, DIGEST(4)
-	add     f_64, DIGEST(5)
-	add     g_64, DIGEST(6)
-	add     h_64, DIGEST(7)
+	add	a_64, DIGEST(0)
+	add	b_64, DIGEST(1)
+	add	c_64, DIGEST(2)
+	add	d_64, DIGEST(3)
+	add	e_64, DIGEST(4)
+	add	f_64, DIGEST(5)
+	add	g_64, DIGEST(6)
+	add	h_64, DIGEST(7)

 	# Advance to next message block
-	add     $16*8, msg
-	dec     msglen
-	jnz     .Lupdateblock
+	add	$16*8, msg
+	dec	msglen
+	jnz	.Lupdateblock

 	# Restore Stack Pointer
 	mov	%rbp, %rsp
···

 .Lnowork:
 	RET
-SYM_FUNC_END(sha512_transform_avx)
+SYM_FUNC_END(sha512_transform_ssse3)

 ########################################################################
 ### Binary Data
+5 -4
arch/x86/crypto/sha512-avx2-asm.S → lib/crypto/x86/sha512-avx2-asm.S
···
 ########################################################################

 #include <linux/linkage.h>
-#include <linux/cfi_types.h>

 .text
···
 .endm

 ########################################################################
-# void sha512_transform_rorx(sha512_state *state, const u8 *data, int blocks)
+# void sha512_transform_rorx(struct sha512_block_state *state,
+#			     const u8 *data, size_t nblocks);
 # Purpose: Updates the SHA512 digest stored at "state" with the message
 # stored in "data".
 # The size of the message pointed to by "data" must be an integer multiple
 # of SHA512 message blocks.
-# "blocks" is the message length in SHA512 blocks
+# "nblocks" is the message length in SHA512 blocks
 ########################################################################
-SYM_TYPED_FUNC_START(sha512_transform_rorx)
+SYM_FUNC_START(sha512_transform_rorx)
+
 	# Save GPRs
 	push	%rbx
 	push	%r12
+162 -163
arch/x86/crypto/sha512-ssse3-asm.S → lib/crypto/x86/sha512-avx-asm.S
···
 ########################################################################
-# Implement fast SHA-512 with SSSE3 instructions. (x86_64)
+# Implement fast SHA-512 with AVX instructions. (x86_64)
 #
 # Copyright (C) 2013 Intel Corporation.
 #
···
 ########################################################################

 #include <linux/linkage.h>
-#include <linux/cfi_types.h>

 .text

 # Virtual Registers
 # ARG1
-digest =	%rdi
+digest	= %rdi
 # ARG2
-msg =		%rsi
+msg	= %rsi
 # ARG3
-msglen =	%rdx
-T1 =		%rcx
-T2 =		%r8
-a_64 =		%r9
-b_64 =		%r10
-c_64 =		%r11
-d_64 =		%r12
-e_64 =		%r13
-f_64 =		%r14
-g_64 =		%r15
-h_64 =		%rbx
-tmp0 =		%rax
+msglen	= %rdx
+T1	= %rcx
+T2	= %r8
+a_64	= %r9
+b_64	= %r10
+c_64	= %r11
+d_64	= %r12
+e_64	= %r13
+f_64	= %r14
+g_64	= %r15
+h_64	= %rbx
+tmp0	= %rax

 # Local variables (stack frame)

+# Message Schedule
 W_SIZE = 80*8
+# W[t] + K[t] | W[t+1] + K[t+1]
 WK_SIZE = 2*8

 frame_W = 0
···
 	a_64 = TMP
 .endm

-.macro SHA512_Round rnd
+.macro RORQ p1 p2
+# shld is faster than ror on Sandybridge
+	shld $(64-\p2), \p1, \p1
+.endm

+.macro SHA512_Round rnd
 	# Compute Round %%t
-	mov	f_64, T1	# T1 = f
-	mov	e_64, tmp0	# tmp = e
-	xor	g_64, T1	# T1 = f ^ g
-	ror	$23, tmp0	# 41	# tmp = e ror 23
-	and	e_64, T1	# T1 = (f ^ g) & e
-	xor	e_64, tmp0	# tmp = (e ror 23) ^ e
-	xor	g_64, T1	# T1 = ((f ^ g) & e) ^ g = CH(e,f,g)
+	mov	f_64, T1	# T1 = f
+	mov	e_64, tmp0	# tmp = e
+	xor	g_64, T1	# T1 = f ^ g
+	RORQ	tmp0, 23	# 41	# tmp = e ror 23
+	and	e_64, T1	# T1 = (f ^ g) & e
+	xor	e_64, tmp0	# tmp = (e ror 23) ^ e
+	xor	g_64, T1	# T1 = ((f ^ g) & e) ^ g = CH(e,f,g)
 	idx = \rnd
-	add	WK_2(idx), T1	# W[t] + K[t] from message scheduler
-	ror	$4, tmp0	# 18	# tmp = ((e ror 23) ^ e) ror 4
-	xor	e_64, tmp0	# tmp = (((e ror 23) ^ e) ror 4) ^ e
-	mov	a_64, T2	# T2 = a
-	add	h_64, T1	# T1 = CH(e,f,g) + W[t] + K[t] + h
-	ror	$14, tmp0	# 14	# tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e)
-	add	tmp0, T1	# T1 = CH(e,f,g) + W[t] + K[t] + S1(e)
-	mov	a_64, tmp0	# tmp = a
-	xor	c_64, T2	# T2 = a ^ c
-	and	c_64, tmp0	# tmp = a & c
-	and	b_64, T2	# T2 = (a ^ c) & b
-	xor	tmp0, T2	# T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c)
-	mov	a_64, tmp0	# tmp = a
-	ror	$5, tmp0	# 39	# tmp = a ror 5
-	xor	a_64, tmp0	# tmp = (a ror 5) ^ a
-	add	T1, d_64	# e(next_state) = d + T1
-	ror	$6, tmp0	# 34	# tmp = ((a ror 5) ^ a) ror 6
-	xor	a_64, tmp0	# tmp = (((a ror 5) ^ a) ror 6) ^ a
-	lea	(T1, T2), h_64	# a(next_state) = T1 + Maj(a,b,c)
-	ror	$28, tmp0	# 28	# tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a)
-	add	tmp0, h_64	# a(next_state) = T1 + Maj(a,b,c) S0(a)
+	add	WK_2(idx), T1	# W[t] + K[t] from message scheduler
+	RORQ	tmp0, 4		# 18	# tmp = ((e ror 23) ^ e) ror 4
+	xor	e_64, tmp0	# tmp = (((e ror 23) ^ e) ror 4) ^ e
+	mov	a_64, T2	# T2 = a
+	add	h_64, T1	# T1 = CH(e,f,g) + W[t] + K[t] + h
+	RORQ	tmp0, 14	# 14	# tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e)
+	add	tmp0, T1	# T1 = CH(e,f,g) + W[t] + K[t] + S1(e)
+	mov	a_64, tmp0	# tmp = a
+	xor	c_64, T2	# T2 = a ^ c
+	and	c_64, tmp0	# tmp = a & c
+	and	b_64, T2	# T2 = (a ^ c) & b
+	xor	tmp0, T2	# T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c)
+	mov	a_64, tmp0	# tmp = a
+	RORQ	tmp0, 5		# 39	# tmp = a ror 5
+	xor	a_64, tmp0	# tmp = (a ror 5) ^ a
+	add	T1, d_64	# e(next_state) = d + T1
+	RORQ	tmp0, 6		# 34	# tmp = ((a ror 5) ^ a) ror 6
+	xor	a_64, tmp0	# tmp = (((a ror 5) ^ a) ror 6) ^ a
+	lea	(T1, T2), h_64	# a(next_state) = T1 + Maj(a,b,c)
+	RORQ	tmp0, 28	# 28	# tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a)
+	add	tmp0, h_64	# a(next_state) = T1 + Maj(a,b,c) S0(a)
 	RotateState
 .endm

-.macro SHA512_2Sched_2Round_sse rnd
-
+.macro SHA512_2Sched_2Round_avx rnd
 	# Compute rounds t-2 and t-1
 	# Compute message schedule QWORDS t and t+1

 	# Two rounds are computed based on the values for K[t-2]+W[t-2] and
 	# K[t-1]+W[t-1] which were previously stored at WK_2 by the message
 	# scheduler.
-	# The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)].
+	# The two new schedule QWORDS are stored at [W_t(t)] and [W_t(t+1)].
 	# They are then added to their respective SHA512 constants at
-	# [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)]
+	# [K_t(t)] and [K_t(t+1)] and stored at dqword [WK_2(t)]
 	# For brievity, the comments following vectored instructions only refer to
 	# the first of a pair of QWORDS.
-	# Eg. XMM2=W[t-2] really means XMM2={W[t-2]|W[t-1]}
+	# Eg. XMM4=W[t-2] really means XMM4={W[t-2]|W[t-1]}
 	# The computation of the message schedule and the rounds are tightly
 	# stitched to take advantage of instruction-level parallelism.
-	# For clarity, integer instructions (for the rounds calculation) are indented
-	# by one tab. Vectored instructions (for the message scheduler) are indented
-	# by two tabs.

+	idx = \rnd - 2
+	vmovdqa	W_t(idx), %xmm4		# XMM4 = W[t-2]
+	idx = \rnd - 15
+	vmovdqu	W_t(idx), %xmm5		# XMM5 = W[t-15]
 	mov	f_64, T1
-	idx = \rnd -2
-		movdqa	W_t(idx), %xmm2		# XMM2 = W[t-2]
+	vpsrlq	$61, %xmm4, %xmm0	# XMM0 = W[t-2]>>61
+	mov	e_64, tmp0
+	vpsrlq	$1, %xmm5, %xmm6	# XMM6 = W[t-15]>>1
 	xor	g_64, T1
+	RORQ	tmp0, 23 # 41
+	vpsrlq	$19, %xmm4, %xmm1	# XMM1 = W[t-2]>>19
 	and	e_64, T1
-		movdqa	%xmm2, %xmm0		# XMM0 = W[t-2]
+	xor	e_64, tmp0
+	vpxor	%xmm1, %xmm0, %xmm0	# XMM0 = W[t-2]>>61 ^ W[t-2]>>19
 	xor	g_64, T1
 	idx = \rnd
-	add	WK_2(idx), T1
-	idx = \rnd - 15
-		movdqu	W_t(idx), %xmm5		# XMM5 = W[t-15]
-	mov	e_64, tmp0
-	ror	$23, tmp0 # 41
-		movdqa	%xmm5, %xmm3		# XMM3 = W[t-15]
+	add	WK_2(idx), T1#
+	vpsrlq	$8, %xmm5, %xmm7	# XMM7 = W[t-15]>>8
+	RORQ	tmp0, 4 # 18
+	vpsrlq	$6, %xmm4, %xmm2	# XMM2 = W[t-2]>>6
 	xor	e_64, tmp0
-	ror	$4, tmp0 # 18
-		psrlq	$61-19, %xmm0		# XMM0 = W[t-2] >> 42
-	xor	e_64, tmp0
-	ror	$14, tmp0 # 14
-		psrlq	$(8-7), %xmm3		# XMM3 = W[t-15] >> 1
-	add	tmp0, T1
-	add	h_64, T1
-		pxor	%xmm2, %xmm0		# XMM0 = (W[t-2] >> 42) ^ W[t-2]
 	mov	a_64, T2
+	add	h_64, T1
+	vpxor	%xmm7, %xmm6, %xmm6	# XMM6 = W[t-15]>>1 ^ W[t-15]>>8
+	RORQ	tmp0, 14 # 14
+	add	tmp0, T1
+	vpsrlq	$7, %xmm5, %xmm8	# XMM8 = W[t-15]>>7
+	mov	a_64, tmp0
 	xor	c_64, T2
-		pxor	%xmm5, %xmm3		# XMM3 = (W[t-15] >> 1) ^ W[t-15]
-	and	b_64, T2
-	mov	a_64, tmp0
-		psrlq	$(19-6), %xmm0		# XMM0 = ((W[t-2]>>42)^W[t-2])>>13
+	vpsllq	$(64-61), %xmm4, %xmm3	# XMM3 = W[t-2]<<3
 	and	c_64, tmp0
+	and	b_64, T2
+	vpxor	%xmm3, %xmm2, %xmm2	# XMM2 = W[t-2]>>6 ^ W[t-2]<<3
 	xor	tmp0, T2
-		psrlq	$(7-1), %xmm3		# XMM3 = ((W[t-15]>>1)^W[t-15])>>6
 	mov	a_64, tmp0
-	ror	$5, tmp0 # 39
-		pxor	%xmm2, %xmm0		# XMM0 = (((W[t-2]>>42)^W[t-2])>>13)^W[t-2]
+	vpsllq	$(64-1), %xmm5, %xmm9	# XMM9 = W[t-15]<<63
+	RORQ	tmp0, 5 # 39
+	vpxor	%xmm9, %xmm8, %xmm8	# XMM8 = W[t-15]>>7 ^ W[t-15]<<63
 	xor	a_64, tmp0
-	ror	$6, tmp0 # 34
-		pxor	%xmm5, %xmm3		# XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]
-	xor	a_64, tmp0
-	ror	$28, tmp0 # 28
-		psrlq	$6, %xmm0		# XMM0 = ((((W[t-2]>>42)^W[t-2])>>13)^W[t-2])>>6
-	add	tmp0, T2
 	add	T1, d_64
-		psrlq	$1, %xmm3		# XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]>>1
+	RORQ	tmp0, 6 # 34
+	xor	a_64, tmp0
+	vpxor	%xmm8, %xmm6, %xmm6	# XMM6 = W[t-15]>>1 ^ W[t-15]>>8 ^
+					#	W[t-15]>>7 ^ W[t-15]<<63
 	lea	(T1, T2), h_64
+	RORQ	tmp0, 28 # 28
+	vpsllq	$(64-19), %xmm4, %xmm4	# XMM4 = W[t-2]<<25
+	add	tmp0, h_64
 	RotateState
-		movdqa	%xmm2, %xmm1		# XMM1 = W[t-2]
+	vpxor	%xmm4, %xmm0, %xmm0	# XMM0 = W[t-2]>>61 ^ W[t-2]>>19 ^
+					#	 W[t-2]<<25
 	mov	f_64, T1
+	vpxor	%xmm2, %xmm0, %xmm0	# XMM0 = s1(W[t-2])
+	mov	e_64, tmp0
 	xor	g_64, T1
-		movdqa	%xmm5, %xmm4		# XMM4 = W[t-15]
+	idx = \rnd - 16
+	vpaddq	W_t(idx), %xmm0, %xmm0	# XMM0 = s1(W[t-2]) + W[t-16]
+	idx = \rnd - 7
+	vmovdqu	W_t(idx), %xmm1		# XMM1 = W[t-7]
+	RORQ	tmp0, 23 # 41
 	and	e_64, T1
+	xor	e_64, tmp0
 	xor	g_64, T1
-		psllq	$(64-19)-(64-61) , %xmm1 # XMM1 = W[t-2] << 42
+	vpsllq	$(64-8), %xmm5, %xmm5	# XMM5 = W[t-15]<<56
 	idx = \rnd + 1
 	add	WK_2(idx), T1
-	mov	e_64, tmp0
-		psllq	$(64-1)-(64-8), %xmm4	# XMM4 = W[t-15] << 7
-	ror	$23, tmp0 # 41
+	vpxor	%xmm5, %xmm6, %xmm6	# XMM6 = s0(W[t-15])
+	RORQ	tmp0, 4 # 18
+	vpaddq	%xmm6, %xmm0, %xmm0	# XMM0 = s1(W[t-2]) + W[t-16] + s0(W[t-15])
 	xor	e_64, tmp0
-		pxor	%xmm2, %xmm1		# XMM1 = (W[t-2] << 42)^W[t-2]
-	ror	$4, tmp0 # 18
-	xor	e_64, tmp0
-		pxor	%xmm5, %xmm4		# XMM4 = (W[t-15]<<7)^W[t-15]
-	ror	$14, tmp0 # 14
+	vpaddq	%xmm1, %xmm0, %xmm0	# XMM0 = W[t] = s1(W[t-2]) + W[t-7] +
+					#	s0(W[t-15]) + W[t-16]
+	mov	a_64, T2
+	add	h_64, T1
+	RORQ	tmp0, 14 # 14
 	add	tmp0, T1
-		psllq	$(64-61), %xmm1		# XMM1 = ((W[t-2] << 42)^W[t-2])<<3
-	add	h_64, T1
-	mov	a_64, T2
-		psllq	$(64-8), %xmm4		# XMM4 = ((W[t-15]<<7)^W[t-15])<<56
+	idx = \rnd
+	vmovdqa	%xmm0, W_t(idx)		# Store W[t]
+	vpaddq	K_t(idx), %xmm0, %xmm0	# Compute W[t]+K[t]
+	vmovdqa	%xmm0, WK_2(idx)	# Store W[t]+K[t] for next rounds
+	mov	a_64, tmp0
 	xor	c_64, T2
-	and	b_64, T2
-		pxor	%xmm1, %xmm0		# XMM0 = s1(W[t-2])
-	mov	a_64, tmp0
-	and	c_64, tmp0
-	idx = \rnd - 7
-		movdqu	W_t(idx), %xmm1		# XMM1 = W[t-7]
-	xor	tmp0, T2
-		pxor	%xmm4, %xmm3		# XMM3 = s0(W[t-15])
-	mov	a_64, tmp0
-		paddq	%xmm3, %xmm0		# XMM0 = s1(W[t-2]) + s0(W[t-15])
-	ror	$5, tmp0 # 39
-	idx =\rnd-16
-		paddq	W_t(idx), %xmm0		# XMM0 = s1(W[t-2]) + s0(W[t-15]) + W[t-16]
-	xor	a_64, tmp0
-		paddq	%xmm1, %xmm0		# XMM0 = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16]
-	ror	$6, tmp0 # 34
-		movdqa	%xmm0, W_t(\rnd)	# Store scheduled qwords
-	xor	a_64, tmp0
-		paddq	K_t(\rnd), %xmm0	# Compute W[t]+K[t]
-	ror	$28, tmp0 # 28
-	idx = \rnd
-		movdqa	%xmm0, WK_2(idx)	# Store W[t]+K[t] for next rounds
-	add	tmp0, T2
+	and	c_64, tmp0
+	and	b_64, T2
+	xor	tmp0, T2
+	mov	a_64, tmp0
+	RORQ	tmp0, 5 # 39
+	xor	a_64, tmp0
 	add	T1, d_64
+	RORQ	tmp0, 6 # 34
+	xor	a_64, tmp0
 	lea	(T1, T2), h_64
+	RORQ	tmp0, 28 # 28
+	add	tmp0, h_64
 	RotateState
 .endm

 ########################################################################
-## void sha512_transform_ssse3(struct sha512_state *state, const u8 *data,
-##			       int blocks);
-# (struct sha512_state is assumed to begin with u64 state[8])
+# void sha512_transform_avx(struct sha512_block_state *state,
+#			    const u8 *data, size_t nblocks);
 # Purpose: Updates the SHA512 digest stored at "state" with the message
 # stored in "data".
 # The size of the message pointed to by "data" must be an integer multiple
 # of SHA512 message blocks.
-# "blocks" is the message length in SHA512 blocks.
+# "nblocks" is the message length in SHA512 blocks
 ########################################################################
-SYM_TYPED_FUNC_START(sha512_transform_ssse3)
+SYM_FUNC_START(sha512_transform_avx)

 	test msglen, msglen
 	je .Lnowork
···
 	# Allocate Stack Space
 	push	%rbp
 	mov	%rsp, %rbp
-	sub	$frame_size, %rsp
+	sub     $frame_size, %rsp
 	and	$~(0x20 - 1), %rsp

 .Lupdateblock:

-	# Load state variables
-	mov	DIGEST(0), a_64
-	mov	DIGEST(1), b_64
-	mov	DIGEST(2), c_64
-	mov	DIGEST(3), d_64
-	mov	DIGEST(4), e_64
-	mov	DIGEST(5), f_64
-	mov	DIGEST(6), g_64
-	mov	DIGEST(7), h_64
+	# Load state variables
+	mov     DIGEST(0), a_64
+	mov     DIGEST(1), b_64
+	mov     DIGEST(2), c_64
+	mov     DIGEST(3), d_64
+	mov     DIGEST(4), e_64
+	mov     DIGEST(5), f_64
+	mov     DIGEST(6), g_64
+	mov     DIGEST(7), h_64

 	t = 0
 	.rept 80/2 + 1
···
 	# +1 iteration because the scheduler leads hashing by 1 iteration
 	.if t < 2
 		# BSWAP 2 QWORDS
-		movdqa	XMM_QWORD_BSWAP(%rip), %xmm1
-		movdqu	MSG(t), %xmm0
-		pshufb	%xmm1, %xmm0		# BSWAP
-		movdqa	%xmm0, W_t(t)		# Store Scheduled Pair
-		paddq	K_t(t), %xmm0		# Compute W[t]+K[t]
-		movdqa	%xmm0, WK_2(t)		# Store into WK for rounds
+		vmovdqa	XMM_QWORD_BSWAP(%rip), %xmm1
+		vmovdqu	MSG(t), %xmm0
+		vpshufb	%xmm1, %xmm0, %xmm0	# BSWAP
+		vmovdqa	%xmm0, W_t(t)		# Store Scheduled Pair
+		vpaddq	K_t(t), %xmm0, %xmm0	# Compute W[t]+K[t]
+		vmovdqa	%xmm0, WK_2(t)		# Store into WK for rounds
 	.elseif t < 16
 		# BSWAP 2 QWORDS# Compute 2 Rounds
-		movdqu	MSG(t), %xmm0
-		pshufb	%xmm1, %xmm0		# BSWAP
-		SHA512_Round t-2		# Round t-2
-		movdqa	%xmm0, W_t(t)		# Store Scheduled Pair
-		paddq	K_t(t), %xmm0		# Compute W[t]+K[t]
-		SHA512_Round t-1		# Round t-1
-		movdqa	%xmm0, WK_2(t)		# Store W[t]+K[t] into WK
+		vmovdqu	MSG(t), %xmm0
+		vpshufb	%xmm1, %xmm0, %xmm0	# BSWAP
+		SHA512_Round t-2		# Round t-2
+		vmovdqa	%xmm0, W_t(t)		# Store Scheduled Pair
+		vpaddq	K_t(t), %xmm0, %xmm0	# Compute W[t]+K[t]
+		SHA512_Round t-1		# Round t-1
+		vmovdqa	%xmm0, WK_2(t)# Store W[t]+K[t] into WK
 	.elseif t < 79
 		# Schedule 2 QWORDS# Compute 2 Rounds
-		SHA512_2Sched_2Round_sse t
+		SHA512_2Sched_2Round_avx t
 	.else
 		# Compute 2 Rounds
 		SHA512_Round t-2
···
 	.endr

 	# Update digest
-	add	a_64, DIGEST(0)
-	add	b_64, DIGEST(1)
-	add	c_64, DIGEST(2)
-	add	d_64, DIGEST(3)
-	add	e_64, DIGEST(4)
-	add	f_64, DIGEST(5)
-	add	g_64, DIGEST(6)
-	add	h_64, DIGEST(7)
+	add     a_64, DIGEST(0)
+	add     b_64, DIGEST(1)
+	add     c_64, DIGEST(2)
+	add     d_64, DIGEST(3)
+	add     e_64, DIGEST(4)
+	add     f_64, DIGEST(5)
+	add     g_64, DIGEST(6)
+	add     h_64, DIGEST(7)

 	# Advance to next message block
-	add	$16*8, msg
-	dec	msglen
-	jnz	.Lupdateblock
+	add     $16*8, msg
+	dec     msglen
+	jnz     .Lupdateblock

 	# Restore Stack Pointer
 	mov	%rbp, %rsp
···

 .Lnowork:
 	RET
-SYM_FUNC_END(sha512_transform_ssse3)
+SYM_FUNC_END(sha512_transform_avx)

 ########################################################################
 ### Binary Data
-322
arch/x86/crypto/sha512_ssse3_glue.c
···
-/*
- * Cryptographic API.
- *
- * Glue code for the SHA512 Secure Hash Algorithm assembler
- * implementation using supplemental SSE3 / AVX / AVX2 instructions.
- *
- * This file is based on sha512_generic.c
- *
- * Copyright (C) 2013 Intel Corporation
- * Author: Tim Chen <tim.c.chen@linux.intel.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <asm/cpu_device_id.h>
-#include <asm/simd.h>
-#include <crypto/internal/hash.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <crypto/sha2.h>
-#include <crypto/sha512_base.h>
-
-asmlinkage void sha512_transform_ssse3(struct sha512_state *state,
-				       const u8 *data, int blocks);
-
-static int sha512_update_x86(struct shash_desc *desc, const u8 *data,
-			     unsigned int len, sha512_block_fn *sha512_xform)
-{
-	int remain;
-
-	/*
-	 * Make sure struct sha512_state begins directly with the SHA512
-	 * 512-bit internal state, as this is what the asm functions expect.
-	 */
-	BUILD_BUG_ON(offsetof(struct sha512_state, state) != 0);
-
-	kernel_fpu_begin();
-	remain = sha512_base_do_update_blocks(desc, data, len, sha512_xform);
-	kernel_fpu_end();
-
-	return remain;
-}
-
-static int sha512_finup(struct shash_desc *desc, const u8 *data,
-			unsigned int len, u8 *out, sha512_block_fn *sha512_xform)
-{
-	kernel_fpu_begin();
-	sha512_base_do_finup(desc, data, len, sha512_xform);
-	kernel_fpu_end();
-
-	return sha512_base_finish(desc, out);
-}
-
-static int sha512_ssse3_update(struct shash_desc *desc, const u8 *data,
-			       unsigned int len)
-{
-	return sha512_update_x86(desc, data, len, sha512_transform_ssse3);
-}
-
-static int sha512_ssse3_finup(struct shash_desc *desc, const u8 *data,
-			      unsigned int len, u8 *out)
-{
-	return sha512_finup(desc, data, len, out, sha512_transform_ssse3);
-}
-
-static struct shash_alg sha512_ssse3_algs[] = { {
-	.digestsize = SHA512_DIGEST_SIZE,
-	.init = sha512_base_init,
-	.update = sha512_ssse3_update,
-	.finup = sha512_ssse3_finup,
-	.descsize = SHA512_STATE_SIZE,
-	.base = {
-		.cra_name = "sha512",
-		.cra_driver_name = "sha512-ssse3",
-		.cra_priority = 150,
-		.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
-			     CRYPTO_AHASH_ALG_FINUP_MAX,
-		.cra_blocksize = SHA512_BLOCK_SIZE,
-		.cra_module = THIS_MODULE,
-	}
-}, {
-	.digestsize = SHA384_DIGEST_SIZE,
-	.init = sha384_base_init,
-	.update = sha512_ssse3_update,
-	.finup = sha512_ssse3_finup,
-	.descsize = SHA512_STATE_SIZE,
-	.base = {
-		.cra_name = "sha384",
-		.cra_driver_name = "sha384-ssse3",
-		.cra_priority = 150,
-		.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
-			     CRYPTO_AHASH_ALG_FINUP_MAX,
-		.cra_blocksize = SHA384_BLOCK_SIZE,
-		.cra_module = THIS_MODULE,
-	}
-} };
-
-static int register_sha512_ssse3(void)
-{
-	if (boot_cpu_has(X86_FEATURE_SSSE3))
-		return crypto_register_shashes(sha512_ssse3_algs,
-			ARRAY_SIZE(sha512_ssse3_algs));
-	return 0;
-}
-
-static void unregister_sha512_ssse3(void)
-{
-	if (boot_cpu_has(X86_FEATURE_SSSE3))
-		crypto_unregister_shashes(sha512_ssse3_algs,
-			ARRAY_SIZE(sha512_ssse3_algs));
-}
-
-asmlinkage void sha512_transform_avx(struct sha512_state *state,
-				     const u8 *data, int blocks);
-static bool avx_usable(void)
-{
-	if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) {
-		if (boot_cpu_has(X86_FEATURE_AVX))
-			pr_info("AVX detected but unusable.\n");
-		return false;
-	}
-
-	return true;
-}
-
-static int sha512_avx_update(struct shash_desc *desc, const u8 *data,
-			     unsigned int len)
-{
-	return sha512_update_x86(desc, data, len, sha512_transform_avx);
-}
-
-static int sha512_avx_finup(struct shash_desc *desc, const u8 *data,
-			    unsigned int len, u8 *out)
-{
-	return sha512_finup(desc, data, len, out, sha512_transform_avx);
-}
-
-static struct shash_alg sha512_avx_algs[] = { {
-	.digestsize = SHA512_DIGEST_SIZE,
-	.init = sha512_base_init,
-	.update = sha512_avx_update,
-	.finup = sha512_avx_finup,
-	.descsize = SHA512_STATE_SIZE,
-	.base = {
-		.cra_name = "sha512",
-		.cra_driver_name = "sha512-avx",
-		.cra_priority = 160,
-		.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
-			     CRYPTO_AHASH_ALG_FINUP_MAX,
-		.cra_blocksize = SHA512_BLOCK_SIZE,
-		.cra_module = THIS_MODULE,
-	}
-}, {
-	.digestsize = SHA384_DIGEST_SIZE,
-	.init = sha384_base_init,
-	.update = sha512_avx_update,
-	.finup = sha512_avx_finup,
-	.descsize = SHA512_STATE_SIZE,
-	.base = {
-		.cra_name = "sha384",
-		.cra_driver_name = "sha384-avx",
-		.cra_priority = 160,
-		.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
-			     CRYPTO_AHASH_ALG_FINUP_MAX,
-		.cra_blocksize = SHA384_BLOCK_SIZE,
-		.cra_module = THIS_MODULE,
-	}
-} };
-
-static int register_sha512_avx(void)
-{
-	if (avx_usable())
-		return crypto_register_shashes(sha512_avx_algs,
-			ARRAY_SIZE(sha512_avx_algs));
-	return 0;
-}
-
-static void unregister_sha512_avx(void)
-{
-	if (avx_usable())
-		crypto_unregister_shashes(sha512_avx_algs,
-			ARRAY_SIZE(sha512_avx_algs));
-}
-
-asmlinkage void sha512_transform_rorx(struct sha512_state *state,
-				      const u8 *data, int blocks);
-
-static int sha512_avx2_update(struct shash_desc *desc, const u8 *data,
-			      unsigned int len)
-{
-	return sha512_update_x86(desc, data, len, sha512_transform_rorx);
-}
-
-static int sha512_avx2_finup(struct shash_desc *desc, const u8 *data,
-			     unsigned int len, u8 *out)
-{
-	return sha512_finup(desc, data, len, out, sha512_transform_rorx);
-}
-
-static struct shash_alg sha512_avx2_algs[] = { {
-	.digestsize = SHA512_DIGEST_SIZE,
-	.init = sha512_base_init,
-	.update = sha512_avx2_update,
-	.finup = sha512_avx2_finup,
-	.descsize = SHA512_STATE_SIZE,
-	.base = {
-		.cra_name = "sha512",
-		.cra_driver_name = "sha512-avx2",
-		.cra_priority = 170,
-		.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
-			     CRYPTO_AHASH_ALG_FINUP_MAX,
-		.cra_blocksize = SHA512_BLOCK_SIZE,
-		.cra_module = THIS_MODULE,
-	}
-}, {
-	.digestsize = SHA384_DIGEST_SIZE,
-	.init = sha384_base_init,
-	.update = sha512_avx2_update,
-	.finup = sha512_avx2_finup,
-	.descsize = SHA512_STATE_SIZE,
-	.base = {
-		.cra_name = "sha384",
-		.cra_driver_name = "sha384-avx2",
-		.cra_priority = 170,
-		.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
-			     CRYPTO_AHASH_ALG_FINUP_MAX,
-		.cra_blocksize = SHA384_BLOCK_SIZE,
-		.cra_module = THIS_MODULE,
-	}
-} };
-
-static bool avx2_usable(void)
-{
-	if (avx_usable() && boot_cpu_has(X86_FEATURE_AVX2) &&
-	    boot_cpu_has(X86_FEATURE_BMI2))
-		return true;
-
-	return false;
-}
-
-static int register_sha512_avx2(void)
-{
-	if (avx2_usable())
-		return crypto_register_shashes(sha512_avx2_algs,
-			ARRAY_SIZE(sha512_avx2_algs));
-	return 0;
-}
-static const struct x86_cpu_id module_cpu_ids[] = {
-	X86_MATCH_FEATURE(X86_FEATURE_AVX2, NULL),
-	X86_MATCH_FEATURE(X86_FEATURE_AVX, NULL),
-	X86_MATCH_FEATURE(X86_FEATURE_SSSE3, NULL),
-	{}
-};
-MODULE_DEVICE_TABLE(x86cpu, module_cpu_ids);
-
-static void unregister_sha512_avx2(void)
-{
-	if (avx2_usable())
-		crypto_unregister_shashes(sha512_avx2_algs,
-			ARRAY_SIZE(sha512_avx2_algs));
-}
-
-static int __init sha512_ssse3_mod_init(void)
-{
-	if (!x86_match_cpu(module_cpu_ids))
-		return -ENODEV;
-
-	if (register_sha512_ssse3())
-		goto fail;
-
-	if (register_sha512_avx()) {
-		unregister_sha512_ssse3();
-		goto fail;
-	}
-
-	if (register_sha512_avx2()) {
-		unregister_sha512_avx();
-		unregister_sha512_ssse3();
-		goto fail;
-	}
-
-	return 0;
-fail:
-	return -ENODEV;
-}
-
-static void __exit sha512_ssse3_mod_fini(void)
-{
-	unregister_sha512_avx2();
-	unregister_sha512_avx();
-	unregister_sha512_ssse3();
-}
-
-module_init(sha512_ssse3_mod_init);
-module_exit(sha512_ssse3_mod_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SHA512 Secure Hash Algorithm, Supplemental SSE3 accelerated");
-
-MODULE_ALIAS_CRYPTO("sha512");
-MODULE_ALIAS_CRYPTO("sha512-ssse3");
-MODULE_ALIAS_CRYPTO("sha512-avx");
-MODULE_ALIAS_CRYPTO("sha512-avx2");
-MODULE_ALIAS_CRYPTO("sha384");
-MODULE_ALIAS_CRYPTO("sha384-ssse3");
-MODULE_ALIAS_CRYPTO("sha384-avx");
-MODULE_ALIAS_CRYPTO("sha384-avx2");
+1
lib/crypto/Kconfig
···
 	default y if RISCV && 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO
 	default y if S390
 	default y if SPARC64
+	default y if X86_64

 config CRYPTO_LIB_SM3
 	tristate
+3
lib/crypto/Makefile
···

 libsha512-$(CONFIG_RISCV) += riscv/sha512-riscv64-zvknhb-zvkb.o
 libsha512-$(CONFIG_SPARC) += sparc/sha512_asm.o
+libsha512-$(CONFIG_X86) += x86/sha512-ssse3-asm.o \
+			   x86/sha512-avx-asm.o \
+			   x86/sha512-avx2-asm.o
 endif # CONFIG_CRYPTO_LIB_SHA512_ARCH

 obj-$(CONFIG_MPILIB) += mpi/
+54
lib/crypto/x86/sha512.h
···
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * x86-optimized SHA-512 block function
+ *
+ * Copyright 2025 Google LLC
+ */
+
+#include <asm/fpu/api.h>
+#include <crypto/internal/simd.h>
+#include <linux/static_call.h>
+
+DEFINE_STATIC_CALL(sha512_blocks_x86, sha512_blocks_generic);
+
+#define DEFINE_X86_SHA512_FN(c_fn, asm_fn)				\
+	asmlinkage void asm_fn(struct sha512_block_state *state,	\
+			       const u8 *data, size_t nblocks);		\
+	static void c_fn(struct sha512_block_state *state, const u8 *data, \
+			 size_t nblocks)				\
+	{								\
+		if (likely(crypto_simd_usable())) {			\
+			kernel_fpu_begin();				\
+			asm_fn(state, data, nblocks);			\
+			kernel_fpu_end();				\
+		} else {						\
+			sha512_blocks_generic(state, data, nblocks);	\
+		}							\
+	}
+
+DEFINE_X86_SHA512_FN(sha512_blocks_ssse3, sha512_transform_ssse3);
+DEFINE_X86_SHA512_FN(sha512_blocks_avx, sha512_transform_avx);
+DEFINE_X86_SHA512_FN(sha512_blocks_avx2, sha512_transform_rorx);
+
+static void sha512_blocks(struct sha512_block_state *state,
+			  const u8 *data, size_t nblocks)
+{
+	static_call(sha512_blocks_x86)(state, data, nblocks);
+}
+
+#define sha512_mod_init_arch sha512_mod_init_arch
+static inline void sha512_mod_init_arch(void)
+{
+	if (cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL) &&
+	    boot_cpu_has(X86_FEATURE_AVX)) {
+		if (boot_cpu_has(X86_FEATURE_AVX2) &&
+		    boot_cpu_has(X86_FEATURE_BMI2))
+			static_call_update(sha512_blocks_x86,
+					   sha512_blocks_avx2);
+		else
+			static_call_update(sha512_blocks_x86,
+					   sha512_blocks_avx);
+	} else if (boot_cpu_has(X86_FEATURE_SSSE3)) {
+		static_call_update(sha512_blocks_x86, sha512_blocks_ssse3);
+	}
+}
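
A note on the dispatch pattern in lib/crypto/x86/sha512.h above:
sha512_blocks() goes through a static call whose target is chosen once by
sha512_mod_init_arch(), so steady-state hashing needs no per-call CPU
feature checks or indirect branches. A self-contained sketch of the same
pattern, with illustrative names that are not from the kernel tree:

	#include <asm/cpufeature.h>
	#include <linux/init.h>
	#include <linux/static_call.h>
	#include <linux/types.h>

	static void my_blocks_generic(const u8 *data, size_t n) { /* ... */ }
	static void my_blocks_simd(const u8 *data, size_t n) { /* ... */ }

	/* Key plus default target; call sites start out calling the
	 * generic implementation. */
	DEFINE_STATIC_CALL(my_blocks, my_blocks_generic);

	static int __init my_init(void)
	{
		if (boot_cpu_has(X86_FEATURE_SSSE3))
			/* Re-patches every static_call(my_blocks) site. */
			static_call_update(my_blocks, my_blocks_simd);
		return 0;
	}

	static void my_hash(const u8 *data, size_t n)
	{
		static_call(my_blocks)(data, n);	/* direct call once patched */
	}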