Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'v6.11-p1' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6

Pull crypto update from Herbert Xu:
"API:
- Test setkey in no-SIMD context
- Add skcipher speed test for user-specified algorithm

Algorithms:
- Add x25519 support on ppc64le
- Add VAES and AVX512 / AVX10 optimized AES-GCM on x86
- Remove sm2 algorithm

Drivers:
- Add Allwinner H616 support to sun8i-ce
- Use DMA in stm32
- Add Exynos850 hwrng support to exynos"

* tag 'v6.11-p1' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6: (81 commits)
hwrng: core - remove (un)register_miscdev()
crypto: lib/mpi - delete unnecessary condition
crypto: testmgr - generate power-of-2 lengths more often
crypto: mxs-dcp - Ensure payload is zero when using key slot
hwrng: Kconfig - Do not enable by default CN10K driver
crypto: starfive - Fix nent assignment in rsa dec
crypto: starfive - Align rsa input data to 32-bit
crypto: qat - fix unintentional re-enabling of error interrupts
crypto: qat - extend scope of lock in adf_cfg_add_key_value_param()
Documentation: qat: fix auto_reset attribute details
crypto: sun8i-ce - add Allwinner H616 support
crypto: sun8i-ce - wrap accesses to descriptor address fields
dt-bindings: crypto: sun8i-ce: Add compatible for H616
hwrng: core - Fix wrong quality calculation at hw rng registration
hwrng: exynos - Enable Exynos850 support
hwrng: exynos - Add SMC based TRNG operation
hwrng: exynos - Implement bus clock control
hwrng: exynos - Use devm_clk_get_enabled() to get the clock
hwrng: exynos - Improve coding style
dt-bindings: rng: Add Exynos850 support to exynos-trng
...

+5480 -5893
+2 -2
Documentation/ABI/testing/sysfs-driver-qat
··· 143 143 This attribute is only available for qat_4xxx devices. 144 144 145 145 What: /sys/bus/pci/devices/<BDF>/qat/auto_reset 146 - Date: March 2024 147 - KernelVersion: 6.8 146 + Date: May 2024 147 + KernelVersion: 6.9 148 148 Contact: qat-linux@intel.com 149 149 Description: (RW) Reports the current state of the autoreset feature 150 150 for a QAT device
+2
Documentation/devicetree/bindings/crypto/allwinner,sun8i-ce.yaml
··· 18 18 - allwinner,sun50i-a64-crypto 19 19 - allwinner,sun50i-h5-crypto 20 20 - allwinner,sun50i-h6-crypto 21 + - allwinner,sun50i-h616-crypto 21 22 22 23 reg: 23 24 maxItems: 1 ··· 50 49 compatible: 51 50 enum: 52 51 - allwinner,sun20i-d1-crypto 52 + - allwinner,sun50i-h616-crypto 53 53 then: 54 54 properties: 55 55 clocks:
+3
Documentation/devicetree/bindings/rng/amlogic,meson-rng.yaml
··· 26 26 items: 27 27 - const: core 28 28 29 + power-domains: 30 + maxItems: 1 31 + 29 32 required: 30 33 - compatible 31 34 - reg
+36 -4
Documentation/devicetree/bindings/rng/samsung,exynos5250-trng.yaml
··· 12 12 13 13 properties: 14 14 compatible: 15 - const: samsung,exynos5250-trng 15 + enum: 16 + - samsung,exynos5250-trng 17 + - samsung,exynos850-trng 16 18 17 19 clocks: 18 - maxItems: 1 20 + minItems: 1 21 + maxItems: 2 19 22 20 23 clock-names: 21 - items: 22 - - const: secss 24 + minItems: 1 25 + maxItems: 2 23 26 24 27 reg: 25 28 maxItems: 1 ··· 32 29 - clocks 33 30 - clock-names 34 31 - reg 32 + 33 + allOf: 34 + - if: 35 + properties: 36 + compatible: 37 + contains: 38 + const: samsung,exynos850-trng 39 + 40 + then: 41 + properties: 42 + clocks: 43 + items: 44 + - description: SSS (Security Sub System) operating clock 45 + - description: SSS (Security Sub System) bus clock 46 + 47 + clock-names: 48 + items: 49 + - const: secss 50 + - const: pclk 51 + 52 + else: 53 + properties: 54 + clocks: 55 + items: 56 + - description: SSS (Security Sub System) operating clock 57 + 58 + clock-names: 59 + items: 60 + - const: secss 35 61 36 62 additionalProperties: false 37 63
+6
MAINTAINERS
··· 980 980 F: tools/crypto/ccp/*.c 981 981 F: tools/crypto/ccp/*.py 982 982 983 + AMD CRYPTOGRAPHIC COPROCESSOR (CCP) DRIVER - HSTI SUPPORT 984 + M: Mario Limonciello <mario.limonciello@amd.com> 985 + L: linux-crypto@vger.kernel.org 986 + S: Supported 987 + F: drivers/crypto/ccp/hsti.* 988 + 983 989 AMD DISPLAY CORE 984 990 M: Harry Wentland <harry.wentland@amd.com> 985 991 M: Leo Li <sunpeng.li@amd.com>
+1
arch/arm/crypto/aes-neonbs-glue.c
··· 17 17 #include <linux/module.h> 18 18 19 19 MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); 20 + MODULE_DESCRIPTION("Bit sliced AES using NEON instructions"); 20 21 MODULE_LICENSE("GPL v2"); 21 22 22 23 MODULE_ALIAS_CRYPTO("ecb(aes)");
+9 -8
arch/arm/crypto/crc32-ce-core.S
··· 48 48 */ 49 49 50 50 #include <linux/linkage.h> 51 + #include <linux/cfi_types.h> 51 52 #include <asm/assembler.h> 52 53 53 54 .text ··· 124 123 * uint crc32_pmull_le(unsigned char const *buffer, 125 124 * size_t len, uint crc32) 126 125 */ 127 - ENTRY(crc32_pmull_le) 126 + SYM_FUNC_START(crc32_pmull_le) 128 127 adr r3, .Lcrc32_constants 129 128 b 0f 129 + SYM_FUNC_END(crc32_pmull_le) 130 130 131 - ENTRY(crc32c_pmull_le) 131 + SYM_FUNC_START(crc32c_pmull_le) 132 132 adr r3, .Lcrc32c_constants 133 133 134 134 0: bic LEN, LEN, #15 ··· 238 236 vmov r0, s5 239 237 240 238 bx lr 241 - ENDPROC(crc32_pmull_le) 242 - ENDPROC(crc32c_pmull_le) 239 + SYM_FUNC_END(crc32c_pmull_le) 243 240 244 241 .macro __crc32, c 245 242 subs ip, r2, #8 ··· 297 296 .endm 298 297 299 298 .align 5 300 - ENTRY(crc32_armv8_le) 299 + SYM_TYPED_FUNC_START(crc32_armv8_le) 301 300 __crc32 302 - ENDPROC(crc32_armv8_le) 301 + SYM_FUNC_END(crc32_armv8_le) 303 302 304 303 .align 5 305 - ENTRY(crc32c_armv8_le) 304 + SYM_TYPED_FUNC_START(crc32c_armv8_le) 306 305 __crc32 c 307 - ENDPROC(crc32c_armv8_le) 306 + SYM_FUNC_END(crc32c_armv8_le)
+1
arch/arm/crypto/crc32-ce-glue.c
··· 241 241 module_exit(crc32_pmull_mod_exit); 242 242 243 243 MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); 244 + MODULE_DESCRIPTION("Accelerated CRC32(C) using ARM CRC, NEON and Crypto Extensions"); 244 245 MODULE_LICENSE("GPL v2"); 245 246 MODULE_ALIAS_CRYPTO("crc32"); 246 247 MODULE_ALIAS_CRYPTO("crc32c");
+1
arch/arm/crypto/crct10dif-ce-glue.c
··· 84 84 module_exit(crc_t10dif_mod_exit); 85 85 86 86 MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); 87 + MODULE_DESCRIPTION("Accelerated CRC-T10DIF using ARM NEON and Crypto Extensions"); 87 88 MODULE_LICENSE("GPL v2"); 88 89 MODULE_ALIAS_CRYPTO("crct10dif");
+1
arch/arm/crypto/curve25519-glue.c
··· 133 133 134 134 MODULE_ALIAS_CRYPTO("curve25519"); 135 135 MODULE_ALIAS_CRYPTO("curve25519-neon"); 136 + MODULE_DESCRIPTION("Public key crypto: Curve25519 (NEON-accelerated)"); 136 137 MODULE_LICENSE("GPL v2");
+1
arch/arm/crypto/poly1305-glue.c
··· 267 267 module_init(arm_poly1305_mod_init); 268 268 module_exit(arm_poly1305_mod_exit); 269 269 270 + MODULE_DESCRIPTION("Accelerated Poly1305 transform for ARM"); 270 271 MODULE_LICENSE("GPL v2"); 271 272 MODULE_ALIAS_CRYPTO("poly1305"); 272 273 MODULE_ALIAS_CRYPTO("poly1305-arm");
+1
arch/arm64/crypto/aes-neonbs-glue.c
··· 16 16 #include <linux/module.h> 17 17 18 18 MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); 19 + MODULE_DESCRIPTION("Bit sliced AES using NEON instructions"); 19 20 MODULE_LICENSE("GPL v2"); 20 21 21 22 MODULE_ALIAS_CRYPTO("ecb(aes)");
+2 -1
arch/arm64/crypto/crct10dif-ce-glue.c
··· 98 98 99 99 .base.cra_name = "crct10dif", 100 100 .base.cra_driver_name = "crct10dif-arm64-neon", 101 - .base.cra_priority = 100, 101 + .base.cra_priority = 150, 102 102 .base.cra_blocksize = CRC_T10DIF_BLOCK_SIZE, 103 103 .base.cra_module = THIS_MODULE, 104 104 }, { ··· 138 138 module_exit(crc_t10dif_mod_exit); 139 139 140 140 MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); 141 + MODULE_DESCRIPTION("CRC-T10DIF using arm64 NEON and Crypto Extensions"); 141 142 MODULE_LICENSE("GPL v2"); 142 143 MODULE_ALIAS_CRYPTO("crct10dif"); 143 144 MODULE_ALIAS_CRYPTO("crct10dif-arm64-ce");
+1
arch/arm64/crypto/poly1305-glue.c
··· 226 226 module_init(neon_poly1305_mod_init); 227 227 module_exit(neon_poly1305_mod_exit); 228 228 229 + MODULE_DESCRIPTION("Poly1305 transform using NEON instructions"); 229 230 MODULE_LICENSE("GPL v2"); 230 231 MODULE_ALIAS_CRYPTO("poly1305"); 231 232 MODULE_ALIAS_CRYPTO("poly1305-neon");
+11
arch/powerpc/crypto/Kconfig
··· 2 2 3 3 menu "Accelerated Cryptographic Algorithms for CPU (powerpc)" 4 4 5 + config CRYPTO_CURVE25519_PPC64 6 + tristate "Public key crypto: Curve25519 (PowerPC64)" 7 + depends on PPC64 && CPU_LITTLE_ENDIAN 8 + select CRYPTO_LIB_CURVE25519_GENERIC 9 + select CRYPTO_ARCH_HAVE_LIB_CURVE25519 10 + help 11 + Curve25519 algorithm 12 + 13 + Architecture: PowerPC64 14 + - Little-endian 15 + 5 16 config CRYPTO_CRC32C_VPMSUM 6 17 tristate "CRC32c" 7 18 depends on PPC64 && ALTIVEC
+2
arch/powerpc/crypto/Makefile
··· 17 17 obj-$(CONFIG_CRYPTO_CHACHA20_P10) += chacha-p10-crypto.o 18 18 obj-$(CONFIG_CRYPTO_POLY1305_P10) += poly1305-p10-crypto.o 19 19 obj-$(CONFIG_CRYPTO_DEV_VMX_ENCRYPT) += vmx-crypto.o 20 + obj-$(CONFIG_CRYPTO_CURVE25519_PPC64) += curve25519-ppc64le.o 20 21 21 22 aes-ppc-spe-y := aes-spe-core.o aes-spe-keys.o aes-tab-4k.o aes-spe-modes.o aes-spe-glue.o 22 23 md5-ppc-y := md5-asm.o md5-glue.o ··· 30 29 chacha-p10-crypto-y := chacha-p10-glue.o chacha-p10le-8x.o 31 30 poly1305-p10-crypto-y := poly1305-p10-glue.o poly1305-p10le_64.o 32 31 vmx-crypto-objs := vmx.o aesp8-ppc.o ghashp8-ppc.o aes.o aes_cbc.o aes_ctr.o aes_xts.o ghash.o 32 + curve25519-ppc64le-y := curve25519-ppc64le-core.o curve25519-ppc64le_asm.o 33 33 34 34 ifeq ($(CONFIG_CPU_LITTLE_ENDIAN),y) 35 35 override flavour := linux-ppc64le
+299
arch/powerpc/crypto/curve25519-ppc64le-core.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-or-later 2 + /* 3 + * Copyright 2024- IBM Corp. 4 + * 5 + * X25519 scalar multiplication with 51 bits limbs for PPC64le. 6 + * Based on RFC7748 and AArch64 optimized implementation for X25519 7 + * - Algorithm 1 Scalar multiplication of a variable point 8 + */ 9 + 10 + #include <crypto/curve25519.h> 11 + #include <crypto/internal/kpp.h> 12 + 13 + #include <linux/types.h> 14 + #include <linux/jump_label.h> 15 + #include <linux/kernel.h> 16 + #include <linux/module.h> 17 + #include <linux/scatterlist.h> 18 + 19 + #include <linux/cpufeature.h> 20 + #include <linux/processor.h> 21 + 22 + typedef uint64_t fe51[5]; 23 + 24 + asmlinkage void x25519_fe51_mul(fe51 h, const fe51 f, const fe51 g); 25 + asmlinkage void x25519_fe51_sqr(fe51 h, const fe51 f); 26 + asmlinkage void x25519_fe51_mul121666(fe51 h, fe51 f); 27 + asmlinkage void x25519_fe51_sqr_times(fe51 h, const fe51 f, int n); 28 + asmlinkage void x25519_fe51_frombytes(fe51 h, const uint8_t *s); 29 + asmlinkage void x25519_fe51_tobytes(uint8_t *s, const fe51 h); 30 + asmlinkage void x25519_cswap(fe51 p, fe51 q, unsigned int bit); 31 + 32 + #define fmul x25519_fe51_mul 33 + #define fsqr x25519_fe51_sqr 34 + #define fmul121666 x25519_fe51_mul121666 35 + #define fe51_tobytes x25519_fe51_tobytes 36 + 37 + static void fadd(fe51 h, const fe51 f, const fe51 g) 38 + { 39 + h[0] = f[0] + g[0]; 40 + h[1] = f[1] + g[1]; 41 + h[2] = f[2] + g[2]; 42 + h[3] = f[3] + g[3]; 43 + h[4] = f[4] + g[4]; 44 + } 45 + 46 + /* 47 + * Prime = 2 ** 255 - 19, 255 bits 48 + * (0x7fffffff ffffffff ffffffff ffffffff ffffffff ffffffff ffffffff ffffffed) 49 + * 50 + * Prime in 5 51-bit limbs 51 + */ 52 + static fe51 prime51 = { 0x7ffffffffffed, 0x7ffffffffffff, 0x7ffffffffffff, 0x7ffffffffffff, 0x7ffffffffffff}; 53 + 54 + static void fsub(fe51 h, const fe51 f, const fe51 g) 55 + { 56 + h[0] = (f[0] + ((prime51[0] * 2))) - g[0]; 57 + h[1] = (f[1] + ((prime51[1] * 2))) - g[1]; 58 + h[2] = (f[2] + 
((prime51[2] * 2))) - g[2]; 59 + h[3] = (f[3] + ((prime51[3] * 2))) - g[3]; 60 + h[4] = (f[4] + ((prime51[4] * 2))) - g[4]; 61 + } 62 + 63 + static void fe51_frombytes(fe51 h, const uint8_t *s) 64 + { 65 + /* 66 + * Make sure 64-bit aligned. 67 + */ 68 + unsigned char sbuf[32+8]; 69 + unsigned char *sb = PTR_ALIGN((void *)sbuf, 8); 70 + 71 + memcpy(sb, s, 32); 72 + x25519_fe51_frombytes(h, sb); 73 + } 74 + 75 + static void finv(fe51 o, const fe51 i) 76 + { 77 + fe51 a0, b, c, t00; 78 + 79 + fsqr(a0, i); 80 + x25519_fe51_sqr_times(t00, a0, 2); 81 + 82 + fmul(b, t00, i); 83 + fmul(a0, b, a0); 84 + 85 + fsqr(t00, a0); 86 + 87 + fmul(b, t00, b); 88 + x25519_fe51_sqr_times(t00, b, 5); 89 + 90 + fmul(b, t00, b); 91 + x25519_fe51_sqr_times(t00, b, 10); 92 + 93 + fmul(c, t00, b); 94 + x25519_fe51_sqr_times(t00, c, 20); 95 + 96 + fmul(t00, t00, c); 97 + x25519_fe51_sqr_times(t00, t00, 10); 98 + 99 + fmul(b, t00, b); 100 + x25519_fe51_sqr_times(t00, b, 50); 101 + 102 + fmul(c, t00, b); 103 + x25519_fe51_sqr_times(t00, c, 100); 104 + 105 + fmul(t00, t00, c); 106 + x25519_fe51_sqr_times(t00, t00, 50); 107 + 108 + fmul(t00, t00, b); 109 + x25519_fe51_sqr_times(t00, t00, 5); 110 + 111 + fmul(o, t00, a0); 112 + } 113 + 114 + static void curve25519_fe51(uint8_t out[32], const uint8_t scalar[32], 115 + const uint8_t point[32]) 116 + { 117 + fe51 x1, x2, z2, x3, z3; 118 + uint8_t s[32]; 119 + unsigned int swap = 0; 120 + int i; 121 + 122 + memcpy(s, scalar, 32); 123 + s[0] &= 0xf8; 124 + s[31] &= 0x7f; 125 + s[31] |= 0x40; 126 + fe51_frombytes(x1, point); 127 + 128 + z2[0] = z2[1] = z2[2] = z2[3] = z2[4] = 0; 129 + x3[0] = x1[0]; 130 + x3[1] = x1[1]; 131 + x3[2] = x1[2]; 132 + x3[3] = x1[3]; 133 + x3[4] = x1[4]; 134 + 135 + x2[0] = z3[0] = 1; 136 + x2[1] = z3[1] = 0; 137 + x2[2] = z3[2] = 0; 138 + x2[3] = z3[3] = 0; 139 + x2[4] = z3[4] = 0; 140 + 141 + for (i = 254; i >= 0; --i) { 142 + unsigned int k_t = 1 & (s[i / 8] >> (i & 7)); 143 + fe51 a, b, c, d, e; 144 + fe51 da, cb, aa, 
bb; 145 + fe51 dacb_p, dacb_m; 146 + 147 + swap ^= k_t; 148 + x25519_cswap(x2, x3, swap); 149 + x25519_cswap(z2, z3, swap); 150 + swap = k_t; 151 + 152 + fsub(b, x2, z2); // B = x_2 - z_2 153 + fadd(a, x2, z2); // A = x_2 + z_2 154 + fsub(d, x3, z3); // D = x_3 - z_3 155 + fadd(c, x3, z3); // C = x_3 + z_3 156 + 157 + fsqr(bb, b); // BB = B^2 158 + fsqr(aa, a); // AA = A^2 159 + fmul(da, d, a); // DA = D * A 160 + fmul(cb, c, b); // CB = C * B 161 + 162 + fsub(e, aa, bb); // E = AA - BB 163 + fmul(x2, aa, bb); // x2 = AA * BB 164 + fadd(dacb_p, da, cb); // DA + CB 165 + fsub(dacb_m, da, cb); // DA - CB 166 + 167 + fmul121666(z3, e); // 121666 * E 168 + fsqr(z2, dacb_m); // (DA - CB)^2 169 + fsqr(x3, dacb_p); // x3 = (DA + CB)^2 170 + fadd(b, bb, z3); // BB + 121666 * E 171 + fmul(z3, x1, z2); // z3 = x1 * (DA - CB)^2 172 + fmul(z2, e, b); // z2 = e * (BB + (DA + CB)^2) 173 + } 174 + 175 + finv(z2, z2); 176 + fmul(x2, x2, z2); 177 + fe51_tobytes(out, x2); 178 + } 179 + 180 + void curve25519_arch(u8 mypublic[CURVE25519_KEY_SIZE], 181 + const u8 secret[CURVE25519_KEY_SIZE], 182 + const u8 basepoint[CURVE25519_KEY_SIZE]) 183 + { 184 + curve25519_fe51(mypublic, secret, basepoint); 185 + } 186 + EXPORT_SYMBOL(curve25519_arch); 187 + 188 + void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE], 189 + const u8 secret[CURVE25519_KEY_SIZE]) 190 + { 191 + curve25519_fe51(pub, secret, curve25519_base_point); 192 + } 193 + EXPORT_SYMBOL(curve25519_base_arch); 194 + 195 + static int curve25519_set_secret(struct crypto_kpp *tfm, const void *buf, 196 + unsigned int len) 197 + { 198 + u8 *secret = kpp_tfm_ctx(tfm); 199 + 200 + if (!len) 201 + curve25519_generate_secret(secret); 202 + else if (len == CURVE25519_KEY_SIZE && 203 + crypto_memneq(buf, curve25519_null_point, CURVE25519_KEY_SIZE)) 204 + memcpy(secret, buf, CURVE25519_KEY_SIZE); 205 + else 206 + return -EINVAL; 207 + return 0; 208 + } 209 + 210 + static int curve25519_generate_public_key(struct kpp_request *req) 211 + { 212 
+ struct crypto_kpp *tfm = crypto_kpp_reqtfm(req); 213 + const u8 *secret = kpp_tfm_ctx(tfm); 214 + u8 buf[CURVE25519_KEY_SIZE]; 215 + int copied, nbytes; 216 + 217 + if (req->src) 218 + return -EINVAL; 219 + 220 + curve25519_base_arch(buf, secret); 221 + 222 + /* might want less than we've got */ 223 + nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len); 224 + copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst, 225 + nbytes), 226 + buf, nbytes); 227 + if (copied != nbytes) 228 + return -EINVAL; 229 + return 0; 230 + } 231 + 232 + static int curve25519_compute_shared_secret(struct kpp_request *req) 233 + { 234 + struct crypto_kpp *tfm = crypto_kpp_reqtfm(req); 235 + const u8 *secret = kpp_tfm_ctx(tfm); 236 + u8 public_key[CURVE25519_KEY_SIZE]; 237 + u8 buf[CURVE25519_KEY_SIZE]; 238 + int copied, nbytes; 239 + 240 + if (!req->src) 241 + return -EINVAL; 242 + 243 + copied = sg_copy_to_buffer(req->src, 244 + sg_nents_for_len(req->src, 245 + CURVE25519_KEY_SIZE), 246 + public_key, CURVE25519_KEY_SIZE); 247 + if (copied != CURVE25519_KEY_SIZE) 248 + return -EINVAL; 249 + 250 + curve25519_arch(buf, secret, public_key); 251 + 252 + /* might want less than we've got */ 253 + nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len); 254 + copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst, 255 + nbytes), 256 + buf, nbytes); 257 + if (copied != nbytes) 258 + return -EINVAL; 259 + return 0; 260 + } 261 + 262 + static unsigned int curve25519_max_size(struct crypto_kpp *tfm) 263 + { 264 + return CURVE25519_KEY_SIZE; 265 + } 266 + 267 + static struct kpp_alg curve25519_alg = { 268 + .base.cra_name = "curve25519", 269 + .base.cra_driver_name = "curve25519-ppc64le", 270 + .base.cra_priority = 200, 271 + .base.cra_module = THIS_MODULE, 272 + .base.cra_ctxsize = CURVE25519_KEY_SIZE, 273 + 274 + .set_secret = curve25519_set_secret, 275 + .generate_public_key = curve25519_generate_public_key, 276 + .compute_shared_secret = 
curve25519_compute_shared_secret, 277 + .max_size = curve25519_max_size, 278 + }; 279 + 280 + 281 + static int __init curve25519_mod_init(void) 282 + { 283 + return IS_REACHABLE(CONFIG_CRYPTO_KPP) ? 284 + crypto_register_kpp(&curve25519_alg) : 0; 285 + } 286 + 287 + static void __exit curve25519_mod_exit(void) 288 + { 289 + if (IS_REACHABLE(CONFIG_CRYPTO_KPP)) 290 + crypto_unregister_kpp(&curve25519_alg); 291 + } 292 + 293 + module_init(curve25519_mod_init); 294 + module_exit(curve25519_mod_exit); 295 + 296 + MODULE_ALIAS_CRYPTO("curve25519"); 297 + MODULE_ALIAS_CRYPTO("curve25519-ppc64le"); 298 + MODULE_LICENSE("GPL v2"); 299 + MODULE_AUTHOR("Danny Tsen <dtsen@us.ibm.com>");
+671
arch/powerpc/crypto/curve25519-ppc64le_asm.S
··· 1 + /* SPDX-License-Identifier: GPL-2.0-or-later */ 2 + # 3 + # This code is taken from CRYPTOGAMs[1] and is included here using the option 4 + # in the license to distribute the code under the GPL. Therefore this program 5 + # is free software; you can redistribute it and/or modify it under the terms of 6 + # the GNU General Public License version 2 as published by the Free Software 7 + # Foundation. 8 + # 9 + # [1] https://github.com/dot-asm/cryptogams/ 10 + 11 + # Copyright (c) 2006-2017, CRYPTOGAMS by <appro@openssl.org> 12 + # All rights reserved. 13 + # 14 + # Redistribution and use in source and binary forms, with or without 15 + # modification, are permitted provided that the following conditions 16 + # are met: 17 + # 18 + # * Redistributions of source code must retain copyright notices, 19 + # this list of conditions and the following disclaimer. 20 + # 21 + # * Redistributions in binary form must reproduce the above 22 + # copyright notice, this list of conditions and the following 23 + # disclaimer in the documentation and/or other materials 24 + # provided with the distribution. 25 + # 26 + # * Neither the name of the CRYPTOGAMS nor the names of its 27 + # copyright holder and contributors may be used to endorse or 28 + # promote products derived from this software without specific 29 + # prior written permission. 30 + # 31 + # ALTERNATIVELY, provided that this notice is retained in full, this 32 + # product may be distributed under the terms of the GNU General Public 33 + # License (GPL), in which case the provisions of the GPL apply INSTEAD OF 34 + # those given above. 35 + # 36 + # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS 37 + # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 38 + # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 39 + # A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 40 + # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 41 + # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 42 + # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 43 + # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 44 + # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 45 + # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 46 + # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 47 + 48 + # ==================================================================== 49 + # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 50 + # project. The module is, however, dual licensed under OpenSSL and 51 + # CRYPTOGAMS licenses depending on where you obtain it. For further 52 + # details see https://www.openssl.org/~appro/cryptogams/. 53 + # ==================================================================== 54 + 55 + # 56 + # ==================================================================== 57 + # Written and Modified by Danny Tsen <dtsen@us.ibm.com> 58 + # - Added x25519_fe51_sqr_times, x25519_fe51_frombytes, x25519_fe51_tobytes 59 + # and x25519_cswap 60 + # 61 + # Copyright 2024- IBM Corp. 62 + # 63 + # X25519 lower-level primitives for PPC64. 
64 + # 65 + 66 + #include <linux/linkage.h> 67 + 68 + .text 69 + 70 + .align 5 71 + SYM_FUNC_START(x25519_fe51_mul) 72 + 73 + stdu 1,-144(1) 74 + std 21,56(1) 75 + std 22,64(1) 76 + std 23,72(1) 77 + std 24,80(1) 78 + std 25,88(1) 79 + std 26,96(1) 80 + std 27,104(1) 81 + std 28,112(1) 82 + std 29,120(1) 83 + std 30,128(1) 84 + std 31,136(1) 85 + 86 + ld 6,0(5) 87 + ld 7,0(4) 88 + ld 8,8(4) 89 + ld 9,16(4) 90 + ld 10,24(4) 91 + ld 11,32(4) 92 + 93 + mulld 22,7,6 94 + mulhdu 23,7,6 95 + 96 + mulld 24,8,6 97 + mulhdu 25,8,6 98 + 99 + mulld 30,11,6 100 + mulhdu 31,11,6 101 + ld 4,8(5) 102 + mulli 11,11,19 103 + 104 + mulld 26,9,6 105 + mulhdu 27,9,6 106 + 107 + mulld 28,10,6 108 + mulhdu 29,10,6 109 + mulld 12,11,4 110 + mulhdu 21,11,4 111 + addc 22,22,12 112 + adde 23,23,21 113 + 114 + mulld 12,7,4 115 + mulhdu 21,7,4 116 + addc 24,24,12 117 + adde 25,25,21 118 + 119 + mulld 12,10,4 120 + mulhdu 21,10,4 121 + ld 6,16(5) 122 + mulli 10,10,19 123 + addc 30,30,12 124 + adde 31,31,21 125 + 126 + mulld 12,8,4 127 + mulhdu 21,8,4 128 + addc 26,26,12 129 + adde 27,27,21 130 + 131 + mulld 12,9,4 132 + mulhdu 21,9,4 133 + addc 28,28,12 134 + adde 29,29,21 135 + mulld 12,10,6 136 + mulhdu 21,10,6 137 + addc 22,22,12 138 + adde 23,23,21 139 + 140 + mulld 12,11,6 141 + mulhdu 21,11,6 142 + addc 24,24,12 143 + adde 25,25,21 144 + 145 + mulld 12,9,6 146 + mulhdu 21,9,6 147 + ld 4,24(5) 148 + mulli 9,9,19 149 + addc 30,30,12 150 + adde 31,31,21 151 + 152 + mulld 12,7,6 153 + mulhdu 21,7,6 154 + addc 26,26,12 155 + adde 27,27,21 156 + 157 + mulld 12,8,6 158 + mulhdu 21,8,6 159 + addc 28,28,12 160 + adde 29,29,21 161 + mulld 12,9,4 162 + mulhdu 21,9,4 163 + addc 22,22,12 164 + adde 23,23,21 165 + 166 + mulld 12,10,4 167 + mulhdu 21,10,4 168 + addc 24,24,12 169 + adde 25,25,21 170 + 171 + mulld 12,8,4 172 + mulhdu 21,8,4 173 + ld 6,32(5) 174 + mulli 8,8,19 175 + addc 30,30,12 176 + adde 31,31,21 177 + 178 + mulld 12,11,4 179 + mulhdu 21,11,4 180 + addc 26,26,12 181 + adde 27,27,21 182 
+ 183 + mulld 12,7,4 184 + mulhdu 21,7,4 185 + addc 28,28,12 186 + adde 29,29,21 187 + mulld 12,8,6 188 + mulhdu 21,8,6 189 + addc 22,22,12 190 + adde 23,23,21 191 + 192 + mulld 12,9,6 193 + mulhdu 21,9,6 194 + addc 24,24,12 195 + adde 25,25,21 196 + 197 + mulld 12,10,6 198 + mulhdu 21,10,6 199 + addc 26,26,12 200 + adde 27,27,21 201 + 202 + mulld 12,11,6 203 + mulhdu 21,11,6 204 + addc 28,28,12 205 + adde 29,29,21 206 + 207 + mulld 12,7,6 208 + mulhdu 21,7,6 209 + addc 30,30,12 210 + adde 31,31,21 211 + 212 + .Lfe51_reduce: 213 + li 0,-1 214 + srdi 0,0,13 215 + 216 + srdi 12,26,51 217 + and 9,26,0 218 + insrdi 12,27,51,0 219 + srdi 21,22,51 220 + and 7,22,0 221 + insrdi 21,23,51,0 222 + addc 28,28,12 223 + addze 29,29 224 + addc 24,24,21 225 + addze 25,25 226 + 227 + srdi 12,28,51 228 + and 10,28,0 229 + insrdi 12,29,51,0 230 + srdi 21,24,51 231 + and 8,24,0 232 + insrdi 21,25,51,0 233 + addc 30,30,12 234 + addze 31,31 235 + add 9,9,21 236 + 237 + srdi 12,30,51 238 + and 11,30,0 239 + insrdi 12,31,51,0 240 + mulli 12,12,19 241 + 242 + add 7,7,12 243 + 244 + srdi 21,9,51 245 + and 9,9,0 246 + add 10,10,21 247 + 248 + srdi 12,7,51 249 + and 7,7,0 250 + add 8,8,12 251 + 252 + std 9,16(3) 253 + std 10,24(3) 254 + std 11,32(3) 255 + std 7,0(3) 256 + std 8,8(3) 257 + 258 + ld 21,56(1) 259 + ld 22,64(1) 260 + ld 23,72(1) 261 + ld 24,80(1) 262 + ld 25,88(1) 263 + ld 26,96(1) 264 + ld 27,104(1) 265 + ld 28,112(1) 266 + ld 29,120(1) 267 + ld 30,128(1) 268 + ld 31,136(1) 269 + addi 1,1,144 270 + blr 271 + SYM_FUNC_END(x25519_fe51_mul) 272 + 273 + .align 5 274 + SYM_FUNC_START(x25519_fe51_sqr) 275 + 276 + stdu 1,-144(1) 277 + std 21,56(1) 278 + std 22,64(1) 279 + std 23,72(1) 280 + std 24,80(1) 281 + std 25,88(1) 282 + std 26,96(1) 283 + std 27,104(1) 284 + std 28,112(1) 285 + std 29,120(1) 286 + std 30,128(1) 287 + std 31,136(1) 288 + 289 + ld 7,0(4) 290 + ld 8,8(4) 291 + ld 9,16(4) 292 + ld 10,24(4) 293 + ld 11,32(4) 294 + 295 + add 6,7,7 296 + mulli 21,11,19 297 + 298 + 
mulld 22,7,7 299 + mulhdu 23,7,7 300 + mulld 24,8,6 301 + mulhdu 25,8,6 302 + mulld 26,9,6 303 + mulhdu 27,9,6 304 + mulld 28,10,6 305 + mulhdu 29,10,6 306 + mulld 30,11,6 307 + mulhdu 31,11,6 308 + add 6,8,8 309 + mulld 12,11,21 310 + mulhdu 11,11,21 311 + addc 28,28,12 312 + adde 29,29,11 313 + 314 + mulli 5,10,19 315 + 316 + mulld 12,8,8 317 + mulhdu 11,8,8 318 + addc 26,26,12 319 + adde 27,27,11 320 + mulld 12,9,6 321 + mulhdu 11,9,6 322 + addc 28,28,12 323 + adde 29,29,11 324 + mulld 12,10,6 325 + mulhdu 11,10,6 326 + addc 30,30,12 327 + adde 31,31,11 328 + mulld 12,21,6 329 + mulhdu 11,21,6 330 + add 6,10,10 331 + addc 22,22,12 332 + adde 23,23,11 333 + mulld 12,10,5 334 + mulhdu 10,10,5 335 + addc 24,24,12 336 + adde 25,25,10 337 + mulld 12,6,21 338 + mulhdu 10,6,21 339 + add 6,9,9 340 + addc 26,26,12 341 + adde 27,27,10 342 + 343 + mulld 12,9,9 344 + mulhdu 10,9,9 345 + addc 30,30,12 346 + adde 31,31,10 347 + mulld 12,5,6 348 + mulhdu 10,5,6 349 + addc 22,22,12 350 + adde 23,23,10 351 + mulld 12,21,6 352 + mulhdu 10,21,6 353 + addc 24,24,12 354 + adde 25,25,10 355 + 356 + b .Lfe51_reduce 357 + SYM_FUNC_END(x25519_fe51_sqr) 358 + 359 + .align 5 360 + SYM_FUNC_START(x25519_fe51_mul121666) 361 + 362 + stdu 1,-144(1) 363 + std 21,56(1) 364 + std 22,64(1) 365 + std 23,72(1) 366 + std 24,80(1) 367 + std 25,88(1) 368 + std 26,96(1) 369 + std 27,104(1) 370 + std 28,112(1) 371 + std 29,120(1) 372 + std 30,128(1) 373 + std 31,136(1) 374 + 375 + lis 6,1 376 + ori 6,6,56130 377 + ld 7,0(4) 378 + ld 8,8(4) 379 + ld 9,16(4) 380 + ld 10,24(4) 381 + ld 11,32(4) 382 + 383 + mulld 22,7,6 384 + mulhdu 23,7,6 385 + mulld 24,8,6 386 + mulhdu 25,8,6 387 + mulld 26,9,6 388 + mulhdu 27,9,6 389 + mulld 28,10,6 390 + mulhdu 29,10,6 391 + mulld 30,11,6 392 + mulhdu 31,11,6 393 + 394 + b .Lfe51_reduce 395 + SYM_FUNC_END(x25519_fe51_mul121666) 396 + 397 + .align 5 398 + SYM_FUNC_START(x25519_fe51_sqr_times) 399 + 400 + stdu 1,-144(1) 401 + std 21,56(1) 402 + std 22,64(1) 403 + std 
23,72(1) 404 + std 24,80(1) 405 + std 25,88(1) 406 + std 26,96(1) 407 + std 27,104(1) 408 + std 28,112(1) 409 + std 29,120(1) 410 + std 30,128(1) 411 + std 31,136(1) 412 + 413 + ld 7,0(4) 414 + ld 8,8(4) 415 + ld 9,16(4) 416 + ld 10,24(4) 417 + ld 11,32(4) 418 + 419 + mtctr 5 420 + 421 + .Lsqr_times_loop: 422 + add 6,7,7 423 + mulli 21,11,19 424 + 425 + mulld 22,7,7 426 + mulhdu 23,7,7 427 + mulld 24,8,6 428 + mulhdu 25,8,6 429 + mulld 26,9,6 430 + mulhdu 27,9,6 431 + mulld 28,10,6 432 + mulhdu 29,10,6 433 + mulld 30,11,6 434 + mulhdu 31,11,6 435 + add 6,8,8 436 + mulld 12,11,21 437 + mulhdu 11,11,21 438 + addc 28,28,12 439 + adde 29,29,11 440 + 441 + mulli 5,10,19 442 + 443 + mulld 12,8,8 444 + mulhdu 11,8,8 445 + addc 26,26,12 446 + adde 27,27,11 447 + mulld 12,9,6 448 + mulhdu 11,9,6 449 + addc 28,28,12 450 + adde 29,29,11 451 + mulld 12,10,6 452 + mulhdu 11,10,6 453 + addc 30,30,12 454 + adde 31,31,11 455 + mulld 12,21,6 456 + mulhdu 11,21,6 457 + add 6,10,10 458 + addc 22,22,12 459 + adde 23,23,11 460 + mulld 12,10,5 461 + mulhdu 10,10,5 462 + addc 24,24,12 463 + adde 25,25,10 464 + mulld 12,6,21 465 + mulhdu 10,6,21 466 + add 6,9,9 467 + addc 26,26,12 468 + adde 27,27,10 469 + 470 + mulld 12,9,9 471 + mulhdu 10,9,9 472 + addc 30,30,12 473 + adde 31,31,10 474 + mulld 12,5,6 475 + mulhdu 10,5,6 476 + addc 22,22,12 477 + adde 23,23,10 478 + mulld 12,21,6 479 + mulhdu 10,21,6 480 + addc 24,24,12 481 + adde 25,25,10 482 + 483 + # fe51_reduce 484 + li 0,-1 485 + srdi 0,0,13 486 + 487 + srdi 12,26,51 488 + and 9,26,0 489 + insrdi 12,27,51,0 490 + srdi 21,22,51 491 + and 7,22,0 492 + insrdi 21,23,51,0 493 + addc 28,28,12 494 + addze 29,29 495 + addc 24,24,21 496 + addze 25,25 497 + 498 + srdi 12,28,51 499 + and 10,28,0 500 + insrdi 12,29,51,0 501 + srdi 21,24,51 502 + and 8,24,0 503 + insrdi 21,25,51,0 504 + addc 30,30,12 505 + addze 31,31 506 + add 9,9,21 507 + 508 + srdi 12,30,51 509 + and 11,30,0 510 + insrdi 12,31,51,0 511 + mulli 12,12,19 512 + 513 + add 7,7,12 
514 + 515 + srdi 21,9,51 516 + and 9,9,0 517 + add 10,10,21 518 + 519 + srdi 12,7,51 520 + and 7,7,0 521 + add 8,8,12 522 + 523 + bdnz .Lsqr_times_loop 524 + 525 + std 9,16(3) 526 + std 10,24(3) 527 + std 11,32(3) 528 + std 7,0(3) 529 + std 8,8(3) 530 + 531 + ld 21,56(1) 532 + ld 22,64(1) 533 + ld 23,72(1) 534 + ld 24,80(1) 535 + ld 25,88(1) 536 + ld 26,96(1) 537 + ld 27,104(1) 538 + ld 28,112(1) 539 + ld 29,120(1) 540 + ld 30,128(1) 541 + ld 31,136(1) 542 + addi 1,1,144 543 + blr 544 + SYM_FUNC_END(x25519_fe51_sqr_times) 545 + 546 + .align 5 547 + SYM_FUNC_START(x25519_fe51_frombytes) 548 + 549 + li 12, -1 550 + srdi 12, 12, 13 # 0x7ffffffffffff 551 + 552 + ld 5, 0(4) 553 + ld 6, 8(4) 554 + ld 7, 16(4) 555 + ld 8, 24(4) 556 + 557 + srdi 10, 5, 51 558 + and 5, 5, 12 # h0 559 + 560 + sldi 11, 6, 13 561 + or 11, 10, 11 # h1t 562 + srdi 10, 6, 38 563 + and 6, 11, 12 # h1 564 + 565 + sldi 11, 7, 26 566 + or 10, 10, 11 # h2t 567 + 568 + srdi 11, 7, 25 569 + and 7, 10, 12 # h2 570 + sldi 10, 8, 39 571 + or 11, 11, 10 # h3t 572 + 573 + srdi 9, 8, 12 574 + and 8, 11, 12 # h3 575 + and 9, 9, 12 # h4 576 + 577 + std 5, 0(3) 578 + std 6, 8(3) 579 + std 7, 16(3) 580 + std 8, 24(3) 581 + std 9, 32(3) 582 + 583 + blr 584 + SYM_FUNC_END(x25519_fe51_frombytes) 585 + 586 + .align 5 587 + SYM_FUNC_START(x25519_fe51_tobytes) 588 + 589 + ld 5, 0(4) 590 + ld 6, 8(4) 591 + ld 7, 16(4) 592 + ld 8, 24(4) 593 + ld 9, 32(4) 594 + 595 + li 12, -1 596 + srdi 12, 12, 13 # 0x7ffffffffffff 597 + 598 + # Full reducuction 599 + addi 10, 5, 19 600 + srdi 10, 10, 51 601 + add 10, 10, 6 602 + srdi 10, 10, 51 603 + add 10, 10, 7 604 + srdi 10, 10, 51 605 + add 10, 10, 8 606 + srdi 10, 10, 51 607 + add 10, 10, 9 608 + srdi 10, 10, 51 609 + 610 + mulli 10, 10, 19 611 + add 5, 5, 10 612 + srdi 11, 5, 51 613 + add 6, 6, 11 614 + srdi 11, 6, 51 615 + add 7, 7, 11 616 + srdi 11, 7, 51 617 + add 8, 8, 11 618 + srdi 11, 8, 51 619 + add 9, 9, 11 620 + 621 + and 5, 5, 12 622 + and 6, 6, 12 623 + and 7, 7, 12 
624 + and 8, 8, 12 625 + and 9, 9, 12 626 + 627 + sldi 10, 6, 51 628 + or 5, 5, 10 # s0 629 + 630 + srdi 11, 6, 13 631 + sldi 10, 7, 38 632 + or 6, 11, 10 # s1 633 + 634 + srdi 11, 7, 26 635 + sldi 10, 8, 25 636 + or 7, 11, 10 # s2 637 + 638 + srdi 11, 8, 39 639 + sldi 10, 9, 12 640 + or 8, 11, 10 # s4 641 + 642 + std 5, 0(3) 643 + std 6, 8(3) 644 + std 7, 16(3) 645 + std 8, 24(3) 646 + 647 + blr 648 + SYM_FUNC_END(x25519_fe51_tobytes) 649 + 650 + .align 5 651 + SYM_FUNC_START(x25519_cswap) 652 + 653 + li 7, 5 654 + neg 6, 5 655 + mtctr 7 656 + 657 + .Lswap_loop: 658 + ld 8, 0(3) 659 + ld 9, 0(4) 660 + xor 10, 8, 9 661 + and 10, 10, 6 662 + xor 11, 8, 10 663 + xor 12, 9, 10 664 + std 11, 0(3) 665 + addi 3, 3, 8 666 + std 12, 0(4) 667 + addi 4, 4, 8 668 + bdnz .Lswap_loop 669 + 670 + blr 671 + SYM_FUNC_END(x25519_cswap)
+1
arch/x86/crypto/Kconfig
··· 18 18 depends on X86 19 19 select CRYPTO_AEAD 20 20 select CRYPTO_LIB_AES 21 + select CRYPTO_LIB_GF128MUL 21 22 select CRYPTO_ALGAPI 22 23 select CRYPTO_SKCIPHER 23 24 select CRYPTO_SIMD
+6 -2
arch/x86/crypto/Makefile
··· 48 48 49 49 obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o 50 50 aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o 51 - aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o \ 52 - aes_ctrby8_avx-x86_64.o aes-xts-avx-x86_64.o 51 + aesni-intel-$(CONFIG_64BIT) += aes_ctrby8_avx-x86_64.o \ 52 + aes-gcm-aesni-x86_64.o \ 53 + aes-xts-avx-x86_64.o 54 + ifeq ($(CONFIG_AS_VAES)$(CONFIG_AS_VPCLMULQDQ),yy) 55 + aesni-intel-$(CONFIG_64BIT) += aes-gcm-avx10-x86_64.o 56 + endif 53 57 54 58 obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o 55 59 sha1-ssse3-y := sha1_avx2_x86_64_asm.o sha1_ssse3_asm.o sha1_ssse3_glue.o
+1128
arch/x86/crypto/aes-gcm-aesni-x86_64.S
··· 1 + /* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */ 2 + // 3 + // AES-NI optimized AES-GCM for x86_64 4 + // 5 + // Copyright 2024 Google LLC 6 + // 7 + // Author: Eric Biggers <ebiggers@google.com> 8 + // 9 + //------------------------------------------------------------------------------ 10 + // 11 + // This file is dual-licensed, meaning that you can use it under your choice of 12 + // either of the following two licenses: 13 + // 14 + // Licensed under the Apache License 2.0 (the "License"). You may obtain a copy 15 + // of the License at 16 + // 17 + // http://www.apache.org/licenses/LICENSE-2.0 18 + // 19 + // Unless required by applicable law or agreed to in writing, software 20 + // distributed under the License is distributed on an "AS IS" BASIS, 21 + // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 22 + // See the License for the specific language governing permissions and 23 + // limitations under the License. 24 + // 25 + // or 26 + // 27 + // Redistribution and use in source and binary forms, with or without 28 + // modification, are permitted provided that the following conditions are met: 29 + // 30 + // 1. Redistributions of source code must retain the above copyright notice, 31 + // this list of conditions and the following disclaimer. 32 + // 33 + // 2. Redistributions in binary form must reproduce the above copyright 34 + // notice, this list of conditions and the following disclaimer in the 35 + // documentation and/or other materials provided with the distribution. 36 + // 37 + // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 38 + // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 39 + // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 40 + // ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 41 + // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 42 + // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 43 + // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 44 + // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 45 + // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 46 + // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 47 + // POSSIBILITY OF SUCH DAMAGE. 48 + // 49 + //------------------------------------------------------------------------------ 50 + // 51 + // This file implements AES-GCM (Galois/Counter Mode) for x86_64 CPUs that 52 + // support the original set of AES instructions, i.e. AES-NI. Two 53 + // implementations are provided, one that uses AVX and one that doesn't. They 54 + // are very similar, being generated by the same macros. The only difference is 55 + // that the AVX implementation takes advantage of VEX-coded instructions in some 56 + // places to avoid some 'movdqu' and 'movdqa' instructions. The AVX 57 + // implementation does *not* use 256-bit vectors, as AES is not supported on 58 + // 256-bit vectors until the VAES feature (which this file doesn't target). 59 + // 60 + // The specific CPU feature prerequisites are AES-NI and PCLMULQDQ, plus SSE4.1 61 + // for the *_aesni functions or AVX for the *_aesni_avx ones. (But it seems 62 + // there are no CPUs that support AES-NI without also PCLMULQDQ and SSE4.1.) 63 + // 64 + // The design generally follows that of aes-gcm-avx10-x86_64.S, and that file is 65 + // more thoroughly commented. This file has the following notable changes: 66 + // 67 + // - The vector length is fixed at 128-bit, i.e. xmm registers. This means 68 + // there is only one AES block (and GHASH block) per register. 
69 + // 70 + // - Without AVX512 / AVX10, only 16 SIMD registers are available instead of 71 + // 32. We work around this by being much more careful about using 72 + // registers, relying heavily on loads to load values as they are needed. 73 + // 74 + // - Masking is not available either. We work around this by implementing 75 + // partial block loads and stores using overlapping scalar loads and stores 76 + // combined with shifts and SSE4.1 insertion and extraction instructions. 77 + // 78 + // - The main loop is organized differently due to the different design 79 + // constraints. First, with just one AES block per SIMD register, on some 80 + // CPUs 4 registers don't saturate the 'aesenc' throughput. We therefore 81 + // do an 8-register wide loop. Considering that and the fact that we have 82 + // just 16 SIMD registers to work with, it's not feasible to cache AES 83 + // round keys and GHASH key powers in registers across loop iterations. 84 + // That's not ideal, but also not actually that bad, since loads can run in 85 + // parallel with other instructions. Significantly, this also makes it 86 + // possible to roll up the inner loops, relying on hardware loop unrolling 87 + // instead of software loop unrolling, greatly reducing code size. 88 + // 89 + // - We implement the GHASH multiplications in the main loop using Karatsuba 90 + // multiplication instead of schoolbook multiplication. This saves one 91 + // pclmulqdq instruction per block, at the cost of one 64-bit load, one 92 + // pshufd, and 0.25 pxors per block. (This is without the three-argument 93 + // XOR support that would be provided by AVX512 / AVX10, which would be 94 + // more beneficial to schoolbook than Karatsuba.) 95 + // 96 + // As a rough approximation, we can assume that Karatsuba multiplication is 97 + // faster than schoolbook multiplication in this context if one pshufd and 98 + // 0.25 pxors are cheaper than a pclmulqdq. 
(We assume that the 64-bit 99 + // load is "free" due to running in parallel with arithmetic instructions.) 100 + // This is true on AMD CPUs, including all that support pclmulqdq up to at 101 + // least Zen 3. It's also true on older Intel CPUs: Westmere through 102 + // Haswell on the Core side, and Silvermont through Goldmont Plus on the 103 + // low-power side. On some of these CPUs, pclmulqdq is quite slow, and the 104 + // benefit of Karatsuba should be substantial. On newer Intel CPUs, 105 + // schoolbook multiplication should be faster, but only marginally. 106 + // 107 + // Not all these CPUs were available to be tested. However, benchmarks on 108 + // available CPUs suggest that this approximation is plausible. Switching 109 + // to Karatsuba showed negligible change (< 1%) on Intel Broadwell, 110 + // Skylake, and Cascade Lake, but it improved AMD Zen 1-3 by 6-7%. 111 + // Considering that and the fact that Karatsuba should be even more 112 + // beneficial on older Intel CPUs, it seems like the right choice here. 113 + // 114 + // An additional 0.25 pclmulqdq per block (2 per 8 blocks) could be 115 + // saved by using a multiplication-less reduction method. We don't do that 116 + // because it would require a large number of shift and xor instructions, 117 + // making it less worthwhile and likely harmful on newer CPUs. 118 + // 119 + // It does make sense to sometimes use a different reduction optimization 120 + // that saves a pclmulqdq, though: precompute the hash key times x^64, and 121 + // multiply the low half of the data block by the hash key with the extra 122 + // factor of x^64. This eliminates one step of the reduction. However, 123 + // this is incompatible with Karatsuba multiplication. Therefore, for 124 + // multi-block processing we use Karatsuba multiplication with a regular 125 + // reduction. For single-block processing, we use the x^64 optimization. 
126 + 127 + #include <linux/linkage.h> 128 + 129 + .section .rodata 130 + .p2align 4 131 + .Lbswap_mask: 132 + .octa 0x000102030405060708090a0b0c0d0e0f 133 + .Lgfpoly: 134 + .quad 0xc200000000000000 135 + .Lone: 136 + .quad 1 137 + .Lgfpoly_and_internal_carrybit: 138 + .octa 0xc2000000000000010000000000000001 139 + // Loading 16 bytes from '.Lzeropad_mask + 16 - len' produces a mask of 140 + // 'len' 0xff bytes and the rest zeroes. 141 + .Lzeropad_mask: 142 + .octa 0xffffffffffffffffffffffffffffffff 143 + .octa 0 144 + 145 + // Offsets in struct aes_gcm_key_aesni 146 + #define OFFSETOF_AESKEYLEN 480 147 + #define OFFSETOF_H_POWERS 496 148 + #define OFFSETOF_H_POWERS_XORED 624 149 + #define OFFSETOF_H_TIMES_X64 688 150 + 151 + .text 152 + 153 + // Do a vpclmulqdq, or fall back to a movdqa and a pclmulqdq. The fallback 154 + // assumes that all operands are distinct and that any mem operand is aligned. 155 + .macro _vpclmulqdq imm, src1, src2, dst 156 + .if USE_AVX 157 + vpclmulqdq \imm, \src1, \src2, \dst 158 + .else 159 + movdqa \src2, \dst 160 + pclmulqdq \imm, \src1, \dst 161 + .endif 162 + .endm 163 + 164 + // Do a vpshufb, or fall back to a movdqa and a pshufb. The fallback assumes 165 + // that all operands are distinct and that any mem operand is aligned. 166 + .macro _vpshufb src1, src2, dst 167 + .if USE_AVX 168 + vpshufb \src1, \src2, \dst 169 + .else 170 + movdqa \src2, \dst 171 + pshufb \src1, \dst 172 + .endif 173 + .endm 174 + 175 + // Do a vpand, or fall back to a movdqu and a pand. The fallback assumes that 176 + // all operands are distinct. 177 + .macro _vpand src1, src2, dst 178 + .if USE_AVX 179 + vpand \src1, \src2, \dst 180 + .else 181 + movdqu \src1, \dst 182 + pand \src2, \dst 183 + .endif 184 + .endm 185 + 186 + // XOR the unaligned memory operand \mem into the xmm register \reg. \tmp must 187 + // be a temporary xmm register. 
188 + .macro _xor_mem_to_reg mem, reg, tmp 189 + .if USE_AVX 190 + vpxor \mem, \reg, \reg 191 + .else 192 + movdqu \mem, \tmp 193 + pxor \tmp, \reg 194 + .endif 195 + .endm 196 + 197 + // Test the unaligned memory operand \mem against the xmm register \reg. \tmp 198 + // must be a temporary xmm register. 199 + .macro _test_mem mem, reg, tmp 200 + .if USE_AVX 201 + vptest \mem, \reg 202 + .else 203 + movdqu \mem, \tmp 204 + ptest \tmp, \reg 205 + .endif 206 + .endm 207 + 208 + // Load 1 <= %ecx <= 15 bytes from the pointer \src into the xmm register \dst 209 + // and zeroize any remaining bytes. Clobbers %rax, %rcx, and \tmp{64,32}. 210 + .macro _load_partial_block src, dst, tmp64, tmp32 211 + sub $8, %ecx // LEN - 8 212 + jle .Lle8\@ 213 + 214 + // Load 9 <= LEN <= 15 bytes. 215 + movq (\src), \dst // Load first 8 bytes 216 + mov (\src, %rcx), %rax // Load last 8 bytes 217 + neg %ecx 218 + shl $3, %ecx 219 + shr %cl, %rax // Discard overlapping bytes 220 + pinsrq $1, %rax, \dst 221 + jmp .Ldone\@ 222 + 223 + .Lle8\@: 224 + add $4, %ecx // LEN - 4 225 + jl .Llt4\@ 226 + 227 + // Load 4 <= LEN <= 8 bytes. 228 + mov (\src), %eax // Load first 4 bytes 229 + mov (\src, %rcx), \tmp32 // Load last 4 bytes 230 + jmp .Lcombine\@ 231 + 232 + .Llt4\@: 233 + // Load 1 <= LEN <= 3 bytes. 234 + add $2, %ecx // LEN - 2 235 + movzbl (\src), %eax // Load first byte 236 + jl .Lmovq\@ 237 + movzwl (\src, %rcx), \tmp32 // Load last 2 bytes 238 + .Lcombine\@: 239 + shl $3, %ecx 240 + shl %cl, \tmp64 241 + or \tmp64, %rax // Combine the two parts 242 + .Lmovq\@: 243 + movq %rax, \dst 244 + .Ldone\@: 245 + .endm 246 + 247 + // Store 1 <= %ecx <= 15 bytes from the xmm register \src to the pointer \dst. 248 + // Clobbers %rax, %rcx, and %rsi. 249 + .macro _store_partial_block src, dst 250 + sub $8, %ecx // LEN - 8 251 + jl .Llt8\@ 252 + 253 + // Store 8 <= LEN <= 15 bytes. 
254 + pextrq $1, \src, %rax 255 + mov %ecx, %esi 256 + shl $3, %ecx 257 + ror %cl, %rax 258 + mov %rax, (\dst, %rsi) // Store last LEN - 8 bytes 259 + movq \src, (\dst) // Store first 8 bytes 260 + jmp .Ldone\@ 261 + 262 + .Llt8\@: 263 + add $4, %ecx // LEN - 4 264 + jl .Llt4\@ 265 + 266 + // Store 4 <= LEN <= 7 bytes. 267 + pextrd $1, \src, %eax 268 + mov %ecx, %esi 269 + shl $3, %ecx 270 + ror %cl, %eax 271 + mov %eax, (\dst, %rsi) // Store last LEN - 4 bytes 272 + movd \src, (\dst) // Store first 4 bytes 273 + jmp .Ldone\@ 274 + 275 + .Llt4\@: 276 + // Store 1 <= LEN <= 3 bytes. 277 + pextrb $0, \src, 0(\dst) 278 + cmp $-2, %ecx // LEN - 4 == -2, i.e. LEN == 2? 279 + jl .Ldone\@ 280 + pextrb $1, \src, 1(\dst) 281 + je .Ldone\@ 282 + pextrb $2, \src, 2(\dst) 283 + .Ldone\@: 284 + .endm 285 + 286 + // Do one step of GHASH-multiplying \a by \b and storing the reduced product in 287 + // \b. To complete all steps, this must be invoked with \i=0 through \i=9. 288 + // \a_times_x64 must contain \a * x^64 in reduced form, \gfpoly must contain the 289 + // .Lgfpoly constant, and \t0-\t1 must be temporary registers. 290 + .macro _ghash_mul_step i, a, a_times_x64, b, gfpoly, t0, t1 291 + 292 + // MI = (a_L * b_H) + ((a*x^64)_L * b_L) 293 + .if \i == 0 294 + _vpclmulqdq $0x01, \a, \b, \t0 295 + .elseif \i == 1 296 + _vpclmulqdq $0x00, \a_times_x64, \b, \t1 297 + .elseif \i == 2 298 + pxor \t1, \t0 299 + 300 + // HI = (a_H * b_H) + ((a*x^64)_H * b_L) 301 + .elseif \i == 3 302 + _vpclmulqdq $0x11, \a, \b, \t1 303 + .elseif \i == 4 304 + pclmulqdq $0x10, \a_times_x64, \b 305 + .elseif \i == 5 306 + pxor \t1, \b 307 + .elseif \i == 6 308 + 309 + // Fold MI into HI. 
310 + pshufd $0x4e, \t0, \t1 // Swap halves of MI 311 + .elseif \i == 7 312 + pclmulqdq $0x00, \gfpoly, \t0 // MI_L*(x^63 + x^62 + x^57) 313 + .elseif \i == 8 314 + pxor \t1, \b 315 + .elseif \i == 9 316 + pxor \t0, \b 317 + .endif 318 + .endm 319 + 320 + // GHASH-multiply \a by \b and store the reduced product in \b. 321 + // See _ghash_mul_step for details. 322 + .macro _ghash_mul a, a_times_x64, b, gfpoly, t0, t1 323 + .irp i, 0,1,2,3,4,5,6,7,8,9 324 + _ghash_mul_step \i, \a, \a_times_x64, \b, \gfpoly, \t0, \t1 325 + .endr 326 + .endm 327 + 328 + // GHASH-multiply \a by \b and add the unreduced product to \lo, \mi, and \hi. 329 + // This does Karatsuba multiplication and must be paired with _ghash_reduce. On 330 + // the first call, \lo, \mi, and \hi must be zero. \a_xored must contain the 331 + // two halves of \a XOR'd together, i.e. a_L + a_H. \b is clobbered. 332 + .macro _ghash_mul_noreduce a, a_xored, b, lo, mi, hi, t0 333 + 334 + // LO += a_L * b_L 335 + _vpclmulqdq $0x00, \a, \b, \t0 336 + pxor \t0, \lo 337 + 338 + // b_L + b_H 339 + pshufd $0x4e, \b, \t0 340 + pxor \b, \t0 341 + 342 + // HI += a_H * b_H 343 + pclmulqdq $0x11, \a, \b 344 + pxor \b, \hi 345 + 346 + // MI += (a_L + a_H) * (b_L + b_H) 347 + pclmulqdq $0x00, \a_xored, \t0 348 + pxor \t0, \mi 349 + .endm 350 + 351 + // Reduce the product from \lo, \mi, and \hi, and store the result in \dst. 352 + // This assumes that _ghash_mul_noreduce was used. 353 + .macro _ghash_reduce lo, mi, hi, dst, t0 354 + 355 + movq .Lgfpoly(%rip), \t0 356 + 357 + // MI += LO + HI (needed because we used Karatsuba multiplication) 358 + pxor \lo, \mi 359 + pxor \hi, \mi 360 + 361 + // Fold LO into MI. 362 + pshufd $0x4e, \lo, \dst 363 + pclmulqdq $0x00, \t0, \lo 364 + pxor \dst, \mi 365 + pxor \lo, \mi 366 + 367 + // Fold MI into HI. 
368 + pshufd $0x4e, \mi, \dst 369 + pclmulqdq $0x00, \t0, \mi 370 + pxor \hi, \dst 371 + pxor \mi, \dst 372 + .endm 373 + 374 + // Do the first step of the GHASH update of a set of 8 ciphertext blocks. 375 + // 376 + // The whole GHASH update does: 377 + // 378 + // GHASH_ACC = (blk0+GHASH_ACC)*H^8 + blk1*H^7 + blk2*H^6 + blk3*H^5 + 379 + // blk4*H^4 + blk5*H^3 + blk6*H^2 + blk7*H^1 380 + // 381 + // This macro just does the first step: it does the unreduced multiplication 382 + // (blk0+GHASH_ACC)*H^8 and starts gathering the unreduced product in the xmm 383 + // registers LO, MI, and GHASH_ACC a.k.a. HI. It also zero-initializes the 384 + // inner block counter in %rax, which is a value that counts up by 8 for each 385 + // block in the set of 8 and is used later to index by 8*blknum and 16*blknum. 386 + // 387 + // To reduce the number of pclmulqdq instructions required, both this macro and 388 + // _ghash_update_continue_8x use Karatsuba multiplication instead of schoolbook 389 + // multiplication. See the file comment for more details about this choice. 390 + // 391 + // Both macros expect the ciphertext blocks blk[0-7] to be available at DST if 392 + // encrypting, or SRC if decrypting. They also expect the precomputed hash key 393 + // powers H^i and their XOR'd-together halves to be available in the struct 394 + // pointed to by KEY. Both macros clobber TMP[0-2]. 395 + .macro _ghash_update_begin_8x enc 396 + 397 + // Initialize the inner block counter. 398 + xor %eax, %eax 399 + 400 + // Load the highest hash key power, H^8. 401 + movdqa OFFSETOF_H_POWERS(KEY), TMP0 402 + 403 + // Load the first ciphertext block and byte-reflect it. 404 + .if \enc 405 + movdqu (DST), TMP1 406 + .else 407 + movdqu (SRC), TMP1 408 + .endif 409 + pshufb BSWAP_MASK, TMP1 410 + 411 + // Add the GHASH accumulator to the ciphertext block to get the block 412 + // 'b' that needs to be multiplied with the hash key power 'a'. 
413 + pxor TMP1, GHASH_ACC 414 + 415 + // b_L + b_H 416 + pshufd $0x4e, GHASH_ACC, MI 417 + pxor GHASH_ACC, MI 418 + 419 + // LO = a_L * b_L 420 + _vpclmulqdq $0x00, TMP0, GHASH_ACC, LO 421 + 422 + // HI = a_H * b_H 423 + pclmulqdq $0x11, TMP0, GHASH_ACC 424 + 425 + // MI = (a_L + a_H) * (b_L + b_H) 426 + pclmulqdq $0x00, OFFSETOF_H_POWERS_XORED(KEY), MI 427 + .endm 428 + 429 + // Continue the GHASH update of 8 ciphertext blocks as described above by doing 430 + // an unreduced multiplication of the next ciphertext block by the next lowest 431 + // key power and accumulating the result into LO, MI, and GHASH_ACC a.k.a. HI. 432 + .macro _ghash_update_continue_8x enc 433 + add $8, %eax 434 + 435 + // Load the next lowest key power. 436 + movdqa OFFSETOF_H_POWERS(KEY,%rax,2), TMP0 437 + 438 + // Load the next ciphertext block and byte-reflect it. 439 + .if \enc 440 + movdqu (DST,%rax,2), TMP1 441 + .else 442 + movdqu (SRC,%rax,2), TMP1 443 + .endif 444 + pshufb BSWAP_MASK, TMP1 445 + 446 + // LO += a_L * b_L 447 + _vpclmulqdq $0x00, TMP0, TMP1, TMP2 448 + pxor TMP2, LO 449 + 450 + // b_L + b_H 451 + pshufd $0x4e, TMP1, TMP2 452 + pxor TMP1, TMP2 453 + 454 + // HI += a_H * b_H 455 + pclmulqdq $0x11, TMP0, TMP1 456 + pxor TMP1, GHASH_ACC 457 + 458 + // MI += (a_L + a_H) * (b_L + b_H) 459 + movq OFFSETOF_H_POWERS_XORED(KEY,%rax), TMP1 460 + pclmulqdq $0x00, TMP1, TMP2 461 + pxor TMP2, MI 462 + .endm 463 + 464 + // Reduce LO, MI, and GHASH_ACC a.k.a. HI into GHASH_ACC. This is similar to 465 + // _ghash_reduce, but it's hardcoded to use the registers of the main loop and 466 + // it uses the same register for HI and the destination. It's also divided into 467 + // two steps. TMP1 must be preserved across steps. 468 + // 469 + // One pshufd could be saved by shuffling MI and XOR'ing LO into it, instead of 470 + // shuffling LO, XOR'ing LO into MI, and shuffling MI. However, this would 471 + // increase the critical path length, and it seems to slightly hurt performance. 
472 + .macro _ghash_update_end_8x_step i 473 + .if \i == 0 474 + movq .Lgfpoly(%rip), TMP1 475 + pxor LO, MI 476 + pxor GHASH_ACC, MI 477 + pshufd $0x4e, LO, TMP2 478 + pclmulqdq $0x00, TMP1, LO 479 + pxor TMP2, MI 480 + pxor LO, MI 481 + .elseif \i == 1 482 + pshufd $0x4e, MI, TMP2 483 + pclmulqdq $0x00, TMP1, MI 484 + pxor TMP2, GHASH_ACC 485 + pxor MI, GHASH_ACC 486 + .endif 487 + .endm 488 + 489 + // void aes_gcm_precompute_##suffix(struct aes_gcm_key_aesni *key); 490 + // 491 + // Given the expanded AES key, derive the GHASH subkey and initialize the GHASH 492 + // related fields in the key struct. 493 + .macro _aes_gcm_precompute 494 + 495 + // Function arguments 496 + .set KEY, %rdi 497 + 498 + // Additional local variables. 499 + // %xmm0-%xmm1 and %rax are used as temporaries. 500 + .set RNDKEYLAST_PTR, %rsi 501 + .set H_CUR, %xmm2 502 + .set H_POW1, %xmm3 // H^1 503 + .set H_POW1_X64, %xmm4 // H^1 * x^64 504 + .set GFPOLY, %xmm5 505 + 506 + // Encrypt an all-zeroes block to get the raw hash subkey. 507 + movl OFFSETOF_AESKEYLEN(KEY), %eax 508 + lea 6*16(KEY,%rax,4), RNDKEYLAST_PTR 509 + movdqa (KEY), H_POW1 // Zero-th round key XOR all-zeroes block 510 + lea 16(KEY), %rax 511 + 1: 512 + aesenc (%rax), H_POW1 513 + add $16, %rax 514 + cmp %rax, RNDKEYLAST_PTR 515 + jne 1b 516 + aesenclast (RNDKEYLAST_PTR), H_POW1 517 + 518 + // Preprocess the raw hash subkey as needed to operate on GHASH's 519 + // bit-reflected values directly: reflect its bytes, then multiply it by 520 + // x^-1 (using the backwards interpretation of polynomial coefficients 521 + // from the GCM spec) or equivalently x^1 (using the alternative, 522 + // natural interpretation of polynomial coefficients). 523 + pshufb .Lbswap_mask(%rip), H_POW1 524 + movdqa H_POW1, %xmm0 525 + pshufd $0xd3, %xmm0, %xmm0 526 + psrad $31, %xmm0 527 + paddq H_POW1, H_POW1 528 + pand .Lgfpoly_and_internal_carrybit(%rip), %xmm0 529 + pxor %xmm0, H_POW1 530 + 531 + // Store H^1. 
532 + movdqa H_POW1, OFFSETOF_H_POWERS+7*16(KEY) 533 + 534 + // Compute and store H^1 * x^64. 535 + movq .Lgfpoly(%rip), GFPOLY 536 + pshufd $0x4e, H_POW1, %xmm0 537 + _vpclmulqdq $0x00, H_POW1, GFPOLY, H_POW1_X64 538 + pxor %xmm0, H_POW1_X64 539 + movdqa H_POW1_X64, OFFSETOF_H_TIMES_X64(KEY) 540 + 541 + // Compute and store the halves of H^1 XOR'd together. 542 + pxor H_POW1, %xmm0 543 + movq %xmm0, OFFSETOF_H_POWERS_XORED+7*8(KEY) 544 + 545 + // Compute and store the remaining key powers H^2 through H^8. 546 + movdqa H_POW1, H_CUR 547 + mov $6*8, %eax 548 + .Lprecompute_next\@: 549 + // Compute H^i = H^{i-1} * H^1. 550 + _ghash_mul H_POW1, H_POW1_X64, H_CUR, GFPOLY, %xmm0, %xmm1 551 + // Store H^i. 552 + movdqa H_CUR, OFFSETOF_H_POWERS(KEY,%rax,2) 553 + // Compute and store the halves of H^i XOR'd together. 554 + pshufd $0x4e, H_CUR, %xmm0 555 + pxor H_CUR, %xmm0 556 + movq %xmm0, OFFSETOF_H_POWERS_XORED(KEY,%rax) 557 + sub $8, %eax 558 + jge .Lprecompute_next\@ 559 + 560 + RET 561 + .endm 562 + 563 + // void aes_gcm_aad_update_aesni(const struct aes_gcm_key_aesni *key, 564 + // u8 ghash_acc[16], const u8 *aad, int aadlen); 565 + // 566 + // This function processes the AAD (Additional Authenticated Data) in GCM. 567 + // Using the key |key|, it updates the GHASH accumulator |ghash_acc| with the 568 + // data given by |aad| and |aadlen|. On the first call, |ghash_acc| must be all 569 + // zeroes. |aadlen| must be a multiple of 16, except on the last call where it 570 + // can be any length. The caller must do any buffering needed to ensure this. 571 + .macro _aes_gcm_aad_update 572 + 573 + // Function arguments 574 + .set KEY, %rdi 575 + .set GHASH_ACC_PTR, %rsi 576 + .set AAD, %rdx 577 + .set AADLEN, %ecx 578 + // Note: _load_partial_block relies on AADLEN being in %ecx. 579 + 580 + // Additional local variables. 581 + // %rax, %r10, and %xmm0-%xmm1 are used as temporary registers. 
582 + .set BSWAP_MASK, %xmm2 583 + .set GHASH_ACC, %xmm3 584 + .set H_POW1, %xmm4 // H^1 585 + .set H_POW1_X64, %xmm5 // H^1 * x^64 586 + .set GFPOLY, %xmm6 587 + 588 + movdqa .Lbswap_mask(%rip), BSWAP_MASK 589 + movdqu (GHASH_ACC_PTR), GHASH_ACC 590 + movdqa OFFSETOF_H_POWERS+7*16(KEY), H_POW1 591 + movdqa OFFSETOF_H_TIMES_X64(KEY), H_POW1_X64 592 + movq .Lgfpoly(%rip), GFPOLY 593 + 594 + // Process the AAD one full block at a time. 595 + sub $16, AADLEN 596 + jl .Laad_loop_1x_done\@ 597 + .Laad_loop_1x\@: 598 + movdqu (AAD), %xmm0 599 + pshufb BSWAP_MASK, %xmm0 600 + pxor %xmm0, GHASH_ACC 601 + _ghash_mul H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm0, %xmm1 602 + add $16, AAD 603 + sub $16, AADLEN 604 + jge .Laad_loop_1x\@ 605 + .Laad_loop_1x_done\@: 606 + // Check whether there is a partial block at the end. 607 + add $16, AADLEN 608 + jz .Laad_done\@ 609 + 610 + // Process a partial block of length 1 <= AADLEN <= 15. 611 + // _load_partial_block assumes that %ecx contains AADLEN. 612 + _load_partial_block AAD, %xmm0, %r10, %r10d 613 + pshufb BSWAP_MASK, %xmm0 614 + pxor %xmm0, GHASH_ACC 615 + _ghash_mul H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm0, %xmm1 616 + 617 + .Laad_done\@: 618 + movdqu GHASH_ACC, (GHASH_ACC_PTR) 619 + RET 620 + .endm 621 + 622 + // Increment LE_CTR eight times to generate eight little-endian counter blocks, 623 + // swap each to big-endian, and store them in AESDATA[0-7]. Also XOR them with 624 + // the zero-th AES round key. Clobbers TMP0 and TMP1. 625 + .macro _ctr_begin_8x 626 + movq .Lone(%rip), TMP0 627 + movdqa (KEY), TMP1 // zero-th round key 628 + .irp i, 0,1,2,3,4,5,6,7 629 + _vpshufb BSWAP_MASK, LE_CTR, AESDATA\i 630 + pxor TMP1, AESDATA\i 631 + paddd TMP0, LE_CTR 632 + .endr 633 + .endm 634 + 635 + // Do a non-last round of AES on AESDATA[0-7] using \round_key. 
636 + .macro _aesenc_8x round_key 637 + .irp i, 0,1,2,3,4,5,6,7 638 + aesenc \round_key, AESDATA\i 639 + .endr 640 + .endm 641 + 642 + // Do the last round of AES on AESDATA[0-7] using \round_key. 643 + .macro _aesenclast_8x round_key 644 + .irp i, 0,1,2,3,4,5,6,7 645 + aesenclast \round_key, AESDATA\i 646 + .endr 647 + .endm 648 + 649 + // XOR eight blocks from SRC with the keystream blocks in AESDATA[0-7], and 650 + // store the result to DST. Clobbers TMP0. 651 + .macro _xor_data_8x 652 + .irp i, 0,1,2,3,4,5,6,7 653 + _xor_mem_to_reg \i*16(SRC), AESDATA\i, tmp=TMP0 654 + .endr 655 + .irp i, 0,1,2,3,4,5,6,7 656 + movdqu AESDATA\i, \i*16(DST) 657 + .endr 658 + .endm 659 + 660 + // void aes_gcm_{enc,dec}_update_##suffix(const struct aes_gcm_key_aesni *key, 661 + // const u32 le_ctr[4], u8 ghash_acc[16], 662 + // const u8 *src, u8 *dst, int datalen); 663 + // 664 + // This macro generates a GCM encryption or decryption update function with the 665 + // above prototype (with \enc selecting which one). 666 + // 667 + // This function computes the next portion of the CTR keystream, XOR's it with 668 + // |datalen| bytes from |src|, and writes the resulting encrypted or decrypted 669 + // data to |dst|. It also updates the GHASH accumulator |ghash_acc| using the 670 + // next |datalen| ciphertext bytes. 671 + // 672 + // |datalen| must be a multiple of 16, except on the last call where it can be 673 + // any length. The caller must do any buffering needed to ensure this. Both 674 + // in-place and out-of-place en/decryption are supported. 675 + // 676 + // |le_ctr| must give the current counter in little-endian format. For a new 677 + // message, the low word of the counter must be 2. This function loads the 678 + // counter from |le_ctr| and increments the loaded counter as needed, but it 679 + // does *not* store the updated counter back to |le_ctr|. The caller must 680 + // update |le_ctr| if any more data segments follow. 
Internally, only the low 681 + // 32-bit word of the counter is incremented, following the GCM standard. 682 + .macro _aes_gcm_update enc 683 + 684 + // Function arguments 685 + .set KEY, %rdi 686 + .set LE_CTR_PTR, %rsi // Note: overlaps with usage as temp reg 687 + .set GHASH_ACC_PTR, %rdx 688 + .set SRC, %rcx 689 + .set DST, %r8 690 + .set DATALEN, %r9d 691 + .set DATALEN64, %r9 // Zero-extend DATALEN before using! 692 + // Note: the code setting up for _load_partial_block assumes that SRC is 693 + // in %rcx (and that DATALEN is *not* in %rcx). 694 + 695 + // Additional local variables 696 + 697 + // %rax and %rsi are used as temporary registers. Note: %rsi overlaps 698 + // with LE_CTR_PTR, which is used only at the beginning. 699 + 700 + .set AESKEYLEN, %r10d // AES key length in bytes 701 + .set AESKEYLEN64, %r10 702 + .set RNDKEYLAST_PTR, %r11 // Pointer to last AES round key 703 + 704 + // Put the most frequently used values in %xmm0-%xmm7 to reduce code 705 + // size. (%xmm0-%xmm7 take fewer bytes to encode than %xmm8-%xmm15.) 
706 + .set TMP0, %xmm0 707 + .set TMP1, %xmm1 708 + .set TMP2, %xmm2 709 + .set LO, %xmm3 // Low part of unreduced product 710 + .set MI, %xmm4 // Middle part of unreduced product 711 + .set GHASH_ACC, %xmm5 // GHASH accumulator; in main loop also 712 + // the high part of unreduced product 713 + .set BSWAP_MASK, %xmm6 // Shuffle mask for reflecting bytes 714 + .set LE_CTR, %xmm7 // Little-endian counter value 715 + .set AESDATA0, %xmm8 716 + .set AESDATA1, %xmm9 717 + .set AESDATA2, %xmm10 718 + .set AESDATA3, %xmm11 719 + .set AESDATA4, %xmm12 720 + .set AESDATA5, %xmm13 721 + .set AESDATA6, %xmm14 722 + .set AESDATA7, %xmm15 723 + 724 + movdqa .Lbswap_mask(%rip), BSWAP_MASK 725 + movdqu (GHASH_ACC_PTR), GHASH_ACC 726 + movdqu (LE_CTR_PTR), LE_CTR 727 + 728 + movl OFFSETOF_AESKEYLEN(KEY), AESKEYLEN 729 + lea 6*16(KEY,AESKEYLEN64,4), RNDKEYLAST_PTR 730 + 731 + // If there are at least 8*16 bytes of data, then continue into the main 732 + // loop, which processes 8*16 bytes of data per iteration. 733 + // 734 + // The main loop interleaves AES and GHASH to improve performance on 735 + // CPUs that can execute these instructions in parallel. When 736 + // decrypting, the GHASH input (the ciphertext) is immediately 737 + // available. When encrypting, we instead encrypt a set of 8 blocks 738 + // first and then GHASH those blocks while encrypting the next set of 8, 739 + // repeat that as needed, and finally GHASH the last set of 8 blocks. 740 + // 741 + // Code size optimization: Prefer adding or subtracting -8*16 over 8*16, 742 + // as this makes the immediate fit in a signed byte, saving 3 bytes. 743 + add $-8*16, DATALEN 744 + jl .Lcrypt_loop_8x_done\@ 745 + .if \enc 746 + // Encrypt the first 8 plaintext blocks. 
747 + _ctr_begin_8x 748 + lea 16(KEY), %rsi 749 + .p2align 4 750 + 1: 751 + movdqa (%rsi), TMP0 752 + _aesenc_8x TMP0 753 + add $16, %rsi 754 + cmp %rsi, RNDKEYLAST_PTR 755 + jne 1b 756 + movdqa (%rsi), TMP0 757 + _aesenclast_8x TMP0 758 + _xor_data_8x 759 + // Don't increment DST until the ciphertext blocks have been hashed. 760 + sub $-8*16, SRC 761 + add $-8*16, DATALEN 762 + jl .Lghash_last_ciphertext_8x\@ 763 + .endif 764 + 765 + .p2align 4 766 + .Lcrypt_loop_8x\@: 767 + 768 + // Generate the next set of 8 counter blocks and start encrypting them. 769 + _ctr_begin_8x 770 + lea 16(KEY), %rsi 771 + 772 + // Do a round of AES, and start the GHASH update of 8 ciphertext blocks 773 + // by doing the unreduced multiplication for the first ciphertext block. 774 + movdqa (%rsi), TMP0 775 + add $16, %rsi 776 + _aesenc_8x TMP0 777 + _ghash_update_begin_8x \enc 778 + 779 + // Do 7 more rounds of AES, and continue the GHASH update by doing the 780 + // unreduced multiplication for the remaining ciphertext blocks. 781 + .p2align 4 782 + 1: 783 + movdqa (%rsi), TMP0 784 + add $16, %rsi 785 + _aesenc_8x TMP0 786 + _ghash_update_continue_8x \enc 787 + cmp $7*8, %eax 788 + jne 1b 789 + 790 + // Do the remaining AES rounds. 791 + .p2align 4 792 + 1: 793 + movdqa (%rsi), TMP0 794 + add $16, %rsi 795 + _aesenc_8x TMP0 796 + cmp %rsi, RNDKEYLAST_PTR 797 + jne 1b 798 + 799 + // Do the GHASH reduction and the last round of AES. 800 + movdqa (RNDKEYLAST_PTR), TMP0 801 + _ghash_update_end_8x_step 0 802 + _aesenclast_8x TMP0 803 + _ghash_update_end_8x_step 1 804 + 805 + // XOR the data with the AES-CTR keystream blocks. 806 + .if \enc 807 + sub $-8*16, DST 808 + .endif 809 + _xor_data_8x 810 + sub $-8*16, SRC 811 + .if !\enc 812 + sub $-8*16, DST 813 + .endif 814 + add $-8*16, DATALEN 815 + jge .Lcrypt_loop_8x\@ 816 + 817 + .if \enc 818 + .Lghash_last_ciphertext_8x\@: 819 + // Update GHASH with the last set of 8 ciphertext blocks. 
820 + _ghash_update_begin_8x \enc 821 + .p2align 4 822 + 1: 823 + _ghash_update_continue_8x \enc 824 + cmp $7*8, %eax 825 + jne 1b 826 + _ghash_update_end_8x_step 0 827 + _ghash_update_end_8x_step 1 828 + sub $-8*16, DST 829 + .endif 830 + 831 + .Lcrypt_loop_8x_done\@: 832 + 833 + sub $-8*16, DATALEN 834 + jz .Ldone\@ 835 + 836 + // Handle the remainder of length 1 <= DATALEN < 8*16 bytes. We keep 837 + // things simple and keep the code size down by just going one block at 838 + // a time, again taking advantage of hardware loop unrolling. Since 839 + // there are enough key powers available for all remaining data, we do 840 + // the GHASH multiplications unreduced, and only reduce at the very end. 841 + 842 + .set HI, TMP2 843 + .set H_POW, AESDATA0 844 + .set H_POW_XORED, AESDATA1 845 + .set ONE, AESDATA2 846 + 847 + movq .Lone(%rip), ONE 848 + 849 + // Start collecting the unreduced GHASH intermediate value LO, MI, HI. 850 + pxor LO, LO 851 + pxor MI, MI 852 + pxor HI, HI 853 + 854 + // Set up a block counter %rax to contain 8*(8-n), where n is the number 855 + // of blocks that remain, counting any partial block. This will be used 856 + // to access the key powers H^n through H^1. 857 + mov DATALEN, %eax 858 + neg %eax 859 + and $~15, %eax 860 + sar $1, %eax 861 + add $64, %eax 862 + 863 + sub $16, DATALEN 864 + jl .Lcrypt_loop_1x_done\@ 865 + 866 + // Process the data one full block at a time. 867 + .Lcrypt_loop_1x\@: 868 + 869 + // Encrypt the next counter block. 870 + _vpshufb BSWAP_MASK, LE_CTR, TMP0 871 + paddd ONE, LE_CTR 872 + pxor (KEY), TMP0 873 + lea -6*16(RNDKEYLAST_PTR), %rsi // Reduce code size 874 + cmp $24, AESKEYLEN 875 + jl 128f // AES-128? 876 + je 192f // AES-192? 
877 + // AES-256 878 + aesenc -7*16(%rsi), TMP0 879 + aesenc -6*16(%rsi), TMP0 880 + 192: 881 + aesenc -5*16(%rsi), TMP0 882 + aesenc -4*16(%rsi), TMP0 883 + 128: 884 + .irp i, -3,-2,-1,0,1,2,3,4,5 885 + aesenc \i*16(%rsi), TMP0 886 + .endr 887 + aesenclast (RNDKEYLAST_PTR), TMP0 888 + 889 + // Load the next key power H^i. 890 + movdqa OFFSETOF_H_POWERS(KEY,%rax,2), H_POW 891 + movq OFFSETOF_H_POWERS_XORED(KEY,%rax), H_POW_XORED 892 + 893 + // XOR the keystream block that was just generated in TMP0 with the next 894 + // source data block and store the resulting en/decrypted data to DST. 895 + .if \enc 896 + _xor_mem_to_reg (SRC), TMP0, tmp=TMP1 897 + movdqu TMP0, (DST) 898 + .else 899 + movdqu (SRC), TMP1 900 + pxor TMP1, TMP0 901 + movdqu TMP0, (DST) 902 + .endif 903 + 904 + // Update GHASH with the ciphertext block. 905 + .if \enc 906 + pshufb BSWAP_MASK, TMP0 907 + pxor TMP0, GHASH_ACC 908 + .else 909 + pshufb BSWAP_MASK, TMP1 910 + pxor TMP1, GHASH_ACC 911 + .endif 912 + _ghash_mul_noreduce H_POW, H_POW_XORED, GHASH_ACC, LO, MI, HI, TMP0 913 + pxor GHASH_ACC, GHASH_ACC 914 + 915 + add $8, %eax 916 + add $16, SRC 917 + add $16, DST 918 + sub $16, DATALEN 919 + jge .Lcrypt_loop_1x\@ 920 + .Lcrypt_loop_1x_done\@: 921 + // Check whether there is a partial block at the end. 922 + add $16, DATALEN 923 + jz .Lghash_reduce\@ 924 + 925 + // Process a partial block of length 1 <= DATALEN <= 15. 926 + 927 + // Encrypt a counter block for the last time. 928 + pshufb BSWAP_MASK, LE_CTR 929 + pxor (KEY), LE_CTR 930 + lea 16(KEY), %rsi 931 + 1: 932 + aesenc (%rsi), LE_CTR 933 + add $16, %rsi 934 + cmp %rsi, RNDKEYLAST_PTR 935 + jne 1b 936 + aesenclast (RNDKEYLAST_PTR), LE_CTR 937 + 938 + // Load the lowest key power, H^1. 939 + movdqa OFFSETOF_H_POWERS(KEY,%rax,2), H_POW 940 + movq OFFSETOF_H_POWERS_XORED(KEY,%rax), H_POW_XORED 941 + 942 + // Load and zero-pad 1 <= DATALEN <= 15 bytes of data from SRC. 
SRC is 943 + // in %rcx, but _load_partial_block needs DATALEN in %rcx instead. 944 + // RNDKEYLAST_PTR is no longer needed, so reuse it for SRC. 945 + mov SRC, RNDKEYLAST_PTR 946 + mov DATALEN, %ecx 947 + _load_partial_block RNDKEYLAST_PTR, TMP0, %rsi, %esi 948 + 949 + // XOR the keystream block that was just generated in LE_CTR with the 950 + // source data block and store the resulting en/decrypted data to DST. 951 + pxor TMP0, LE_CTR 952 + mov DATALEN, %ecx 953 + _store_partial_block LE_CTR, DST 954 + 955 + // If encrypting, zero-pad the final ciphertext block for GHASH. (If 956 + // decrypting, this was already done by _load_partial_block.) 957 + .if \enc 958 + lea .Lzeropad_mask+16(%rip), %rax 959 + sub DATALEN64, %rax 960 + _vpand (%rax), LE_CTR, TMP0 961 + .endif 962 + 963 + // Update GHASH with the final ciphertext block. 964 + pshufb BSWAP_MASK, TMP0 965 + pxor TMP0, GHASH_ACC 966 + _ghash_mul_noreduce H_POW, H_POW_XORED, GHASH_ACC, LO, MI, HI, TMP0 967 + 968 + .Lghash_reduce\@: 969 + // Finally, do the GHASH reduction. 970 + _ghash_reduce LO, MI, HI, GHASH_ACC, TMP0 971 + 972 + .Ldone\@: 973 + // Store the updated GHASH accumulator back to memory. 974 + movdqu GHASH_ACC, (GHASH_ACC_PTR) 975 + 976 + RET 977 + .endm 978 + 979 + // void aes_gcm_enc_final_##suffix(const struct aes_gcm_key_aesni *key, 980 + // const u32 le_ctr[4], u8 ghash_acc[16], 981 + // u64 total_aadlen, u64 total_datalen); 982 + // bool aes_gcm_dec_final_##suffix(const struct aes_gcm_key_aesni *key, 983 + // const u32 le_ctr[4], const u8 ghash_acc[16], 984 + // u64 total_aadlen, u64 total_datalen, 985 + // const u8 tag[16], int taglen); 986 + // 987 + // This macro generates one of the above two functions (with \enc selecting 988 + // which one). Both functions finish computing the GCM authentication tag by 989 + // updating GHASH with the lengths block and encrypting the GHASH accumulator. 
990 + // |total_aadlen| and |total_datalen| must be the total length of the additional 991 + // authenticated data and the en/decrypted data in bytes, respectively. 992 + // 993 + // The encryption function then stores the full-length (16-byte) computed 994 + // authentication tag to |ghash_acc|. The decryption function instead loads the 995 + // expected authentication tag (the one that was transmitted) from the 16-byte 996 + // buffer |tag|, compares the first 4 <= |taglen| <= 16 bytes of it to the 997 + // computed tag in constant time, and returns true if and only if they match. 998 + .macro _aes_gcm_final enc 999 + 1000 + // Function arguments 1001 + .set KEY, %rdi 1002 + .set LE_CTR_PTR, %rsi 1003 + .set GHASH_ACC_PTR, %rdx 1004 + .set TOTAL_AADLEN, %rcx 1005 + .set TOTAL_DATALEN, %r8 1006 + .set TAG, %r9 1007 + .set TAGLEN, %r10d // Originally at 8(%rsp) 1008 + .set TAGLEN64, %r10 1009 + 1010 + // Additional local variables. 1011 + // %rax and %xmm0-%xmm2 are used as temporary registers. 1012 + .set AESKEYLEN, %r11d 1013 + .set AESKEYLEN64, %r11 1014 + .set BSWAP_MASK, %xmm3 1015 + .set GHASH_ACC, %xmm4 1016 + .set H_POW1, %xmm5 // H^1 1017 + .set H_POW1_X64, %xmm6 // H^1 * x^64 1018 + .set GFPOLY, %xmm7 1019 + 1020 + movdqa .Lbswap_mask(%rip), BSWAP_MASK 1021 + movl OFFSETOF_AESKEYLEN(KEY), AESKEYLEN 1022 + 1023 + // Set up a counter block with 1 in the low 32-bit word. This is the 1024 + // counter that produces the ciphertext needed to encrypt the auth tag. 1025 + movdqu (LE_CTR_PTR), %xmm0 1026 + mov $1, %eax 1027 + pinsrd $0, %eax, %xmm0 1028 + 1029 + // Build the lengths block and XOR it into the GHASH accumulator. 
1030 + movq TOTAL_DATALEN, GHASH_ACC 1031 + pinsrq $1, TOTAL_AADLEN, GHASH_ACC 1032 + psllq $3, GHASH_ACC // Bytes to bits 1033 + _xor_mem_to_reg (GHASH_ACC_PTR), GHASH_ACC, %xmm1 1034 + 1035 + movdqa OFFSETOF_H_POWERS+7*16(KEY), H_POW1 1036 + movdqa OFFSETOF_H_TIMES_X64(KEY), H_POW1_X64 1037 + movq .Lgfpoly(%rip), GFPOLY 1038 + 1039 + // Make %rax point to the 6th from last AES round key. (Using signed 1040 + // byte offsets -7*16 through 6*16 decreases code size.) 1041 + lea (KEY,AESKEYLEN64,4), %rax 1042 + 1043 + // AES-encrypt the counter block and also multiply GHASH_ACC by H^1. 1044 + // Interleave the AES and GHASH instructions to improve performance. 1045 + pshufb BSWAP_MASK, %xmm0 1046 + pxor (KEY), %xmm0 1047 + cmp $24, AESKEYLEN 1048 + jl 128f // AES-128? 1049 + je 192f // AES-192? 1050 + // AES-256 1051 + aesenc -7*16(%rax), %xmm0 1052 + aesenc -6*16(%rax), %xmm0 1053 + 192: 1054 + aesenc -5*16(%rax), %xmm0 1055 + aesenc -4*16(%rax), %xmm0 1056 + 128: 1057 + .irp i, 0,1,2,3,4,5,6,7,8 1058 + aesenc (\i-3)*16(%rax), %xmm0 1059 + _ghash_mul_step \i, H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm1, %xmm2 1060 + .endr 1061 + aesenclast 6*16(%rax), %xmm0 1062 + _ghash_mul_step 9, H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm1, %xmm2 1063 + 1064 + // Undo the byte reflection of the GHASH accumulator. 1065 + pshufb BSWAP_MASK, GHASH_ACC 1066 + 1067 + // Encrypt the GHASH accumulator. 1068 + pxor %xmm0, GHASH_ACC 1069 + 1070 + .if \enc 1071 + // Return the computed auth tag. 1072 + movdqu GHASH_ACC, (GHASH_ACC_PTR) 1073 + .else 1074 + .set ZEROPAD_MASK_PTR, TOTAL_AADLEN // Reusing TOTAL_AADLEN! 1075 + 1076 + // Verify the auth tag in constant time by XOR'ing the transmitted and 1077 + // computed auth tags together and using the ptest instruction to check 1078 + // whether the first TAGLEN bytes of the result are zero. 
1079 + _xor_mem_to_reg (TAG), GHASH_ACC, tmp=%xmm0 1080 + movl 8(%rsp), TAGLEN 1081 + lea .Lzeropad_mask+16(%rip), ZEROPAD_MASK_PTR 1082 + sub TAGLEN64, ZEROPAD_MASK_PTR 1083 + xor %eax, %eax 1084 + _test_mem (ZEROPAD_MASK_PTR), GHASH_ACC, tmp=%xmm0 1085 + sete %al 1086 + .endif 1087 + RET 1088 + .endm 1089 + 1090 + .set USE_AVX, 0 1091 + SYM_FUNC_START(aes_gcm_precompute_aesni) 1092 + _aes_gcm_precompute 1093 + SYM_FUNC_END(aes_gcm_precompute_aesni) 1094 + SYM_FUNC_START(aes_gcm_aad_update_aesni) 1095 + _aes_gcm_aad_update 1096 + SYM_FUNC_END(aes_gcm_aad_update_aesni) 1097 + SYM_FUNC_START(aes_gcm_enc_update_aesni) 1098 + _aes_gcm_update 1 1099 + SYM_FUNC_END(aes_gcm_enc_update_aesni) 1100 + SYM_FUNC_START(aes_gcm_dec_update_aesni) 1101 + _aes_gcm_update 0 1102 + SYM_FUNC_END(aes_gcm_dec_update_aesni) 1103 + SYM_FUNC_START(aes_gcm_enc_final_aesni) 1104 + _aes_gcm_final 1 1105 + SYM_FUNC_END(aes_gcm_enc_final_aesni) 1106 + SYM_FUNC_START(aes_gcm_dec_final_aesni) 1107 + _aes_gcm_final 0 1108 + SYM_FUNC_END(aes_gcm_dec_final_aesni) 1109 + 1110 + .set USE_AVX, 1 1111 + SYM_FUNC_START(aes_gcm_precompute_aesni_avx) 1112 + _aes_gcm_precompute 1113 + SYM_FUNC_END(aes_gcm_precompute_aesni_avx) 1114 + SYM_FUNC_START(aes_gcm_aad_update_aesni_avx) 1115 + _aes_gcm_aad_update 1116 + SYM_FUNC_END(aes_gcm_aad_update_aesni_avx) 1117 + SYM_FUNC_START(aes_gcm_enc_update_aesni_avx) 1118 + _aes_gcm_update 1 1119 + SYM_FUNC_END(aes_gcm_enc_update_aesni_avx) 1120 + SYM_FUNC_START(aes_gcm_dec_update_aesni_avx) 1121 + _aes_gcm_update 0 1122 + SYM_FUNC_END(aes_gcm_dec_update_aesni_avx) 1123 + SYM_FUNC_START(aes_gcm_enc_final_aesni_avx) 1124 + _aes_gcm_final 1 1125 + SYM_FUNC_END(aes_gcm_enc_final_aesni_avx) 1126 + SYM_FUNC_START(aes_gcm_dec_final_aesni_avx) 1127 + _aes_gcm_final 0 1128 + SYM_FUNC_END(aes_gcm_dec_final_aesni_avx)
+1222
arch/x86/crypto/aes-gcm-avx10-x86_64.S
··· 1 + /* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */ 2 + // 3 + // VAES and VPCLMULQDQ optimized AES-GCM for x86_64 4 + // 5 + // Copyright 2024 Google LLC 6 + // 7 + // Author: Eric Biggers <ebiggers@google.com> 8 + // 9 + //------------------------------------------------------------------------------ 10 + // 11 + // This file is dual-licensed, meaning that you can use it under your choice of 12 + // either of the following two licenses: 13 + // 14 + // Licensed under the Apache License 2.0 (the "License"). You may obtain a copy 15 + // of the License at 16 + // 17 + // http://www.apache.org/licenses/LICENSE-2.0 18 + // 19 + // Unless required by applicable law or agreed to in writing, software 20 + // distributed under the License is distributed on an "AS IS" BASIS, 21 + // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 22 + // See the License for the specific language governing permissions and 23 + // limitations under the License. 24 + // 25 + // or 26 + // 27 + // Redistribution and use in source and binary forms, with or without 28 + // modification, are permitted provided that the following conditions are met: 29 + // 30 + // 1. Redistributions of source code must retain the above copyright notice, 31 + // this list of conditions and the following disclaimer. 32 + // 33 + // 2. Redistributions in binary form must reproduce the above copyright 34 + // notice, this list of conditions and the following disclaimer in the 35 + // documentation and/or other materials provided with the distribution. 36 + // 37 + // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 38 + // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 39 + // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 40 + // ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 41 + // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 42 + // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 43 + // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 44 + // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 45 + // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 46 + // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 47 + // POSSIBILITY OF SUCH DAMAGE. 48 + // 49 + //------------------------------------------------------------------------------ 50 + // 51 + // This file implements AES-GCM (Galois/Counter Mode) for x86_64 CPUs that 52 + // support VAES (vector AES), VPCLMULQDQ (vector carryless multiplication), and 53 + // either AVX512 or AVX10. Some of the functions, notably the encryption and 54 + // decryption update functions which are the most performance-critical, are 55 + // provided in two variants generated from a macro: one using 256-bit vectors 56 + // (suffix: vaes_avx10_256) and one using 512-bit vectors (vaes_avx10_512). The 57 + // other, "shared" functions (vaes_avx10) use at most 256-bit vectors. 58 + // 59 + // The functions that use 512-bit vectors are intended for CPUs that support 60 + // 512-bit vectors *and* where using them doesn't cause significant 61 + // downclocking. They require the following CPU features: 62 + // 63 + // VAES && VPCLMULQDQ && BMI2 && ((AVX512BW && AVX512VL) || AVX10/512) 64 + // 65 + // The other functions require the following CPU features: 66 + // 67 + // VAES && VPCLMULQDQ && BMI2 && ((AVX512BW && AVX512VL) || AVX10/256) 68 + // 69 + // All functions use the "System V" ABI. The Windows ABI is not supported. 70 + // 71 + // Note that we use "avx10" in the names of the functions as a shorthand to 72 + // really mean "AVX10 or a certain set of AVX512 features". 
Due to Intel's 73 + // introduction of AVX512 and then its replacement by AVX10, there doesn't seem 74 + // to be a simple way to name things that makes sense on all CPUs. 75 + // 76 + // Note that the macros that support both 256-bit and 512-bit vectors could 77 + // fairly easily be changed to support 128-bit too. However, this would *not* 78 + // be sufficient to allow the code to run on CPUs without AVX512 or AVX10, 79 + // because the code heavily uses several features of these extensions other than 80 + // the vector length: the increase in the number of SIMD registers from 16 to 81 + // 32, masking support, and new instructions such as vpternlogd (which can do a 82 + // three-argument XOR). These features are very useful for AES-GCM. 83 + 84 + #include <linux/linkage.h> 85 + 86 + .section .rodata 87 + .p2align 6 88 + 89 + // A shuffle mask that reflects the bytes of 16-byte blocks 90 + .Lbswap_mask: 91 + .octa 0x000102030405060708090a0b0c0d0e0f 92 + 93 + // This is the GHASH reducing polynomial without its constant term, i.e. 94 + // x^128 + x^7 + x^2 + x, represented using the backwards mapping 95 + // between bits and polynomial coefficients. 96 + // 97 + // Alternatively, it can be interpreted as the naturally-ordered 98 + // representation of the polynomial x^127 + x^126 + x^121 + 1, i.e. the 99 + // "reversed" GHASH reducing polynomial without its x^128 term. 100 + .Lgfpoly: 101 + .octa 0xc2000000000000000000000000000001 102 + 103 + // Same as above, but with the (1 << 64) bit set. 104 + .Lgfpoly_and_internal_carrybit: 105 + .octa 0xc2000000000000010000000000000001 106 + 107 + // The below constants are used for incrementing the counter blocks. 108 + // ctr_pattern points to the four 128-bit values [0, 1, 2, 3]. 109 + // inc_2blocks and inc_4blocks point to the single 128-bit values 2 and 110 + // 4. Note that the same '2' is reused in ctr_pattern and inc_2blocks. 
111 + .Lctr_pattern: 112 + .octa 0 113 + .octa 1 114 + .Linc_2blocks: 115 + .octa 2 116 + .octa 3 117 + .Linc_4blocks: 118 + .octa 4 119 + 120 + // Number of powers of the hash key stored in the key struct. The powers are 121 + // stored from highest (H^NUM_H_POWERS) to lowest (H^1). 122 + #define NUM_H_POWERS 16 123 + 124 + // Offset to AES key length (in bytes) in the key struct 125 + #define OFFSETOF_AESKEYLEN 480 126 + 127 + // Offset to start of hash key powers array in the key struct 128 + #define OFFSETOF_H_POWERS 512 129 + 130 + // Offset to end of hash key powers array in the key struct. 131 + // 132 + // This is immediately followed by three zeroized padding blocks, which are 133 + // included so that partial vectors can be handled more easily. E.g. if VL=64 134 + // and two blocks remain, we load the 4 values [H^2, H^1, 0, 0]. The most 135 + // padding blocks needed is 3, which occurs if [H^1, 0, 0, 0] is loaded. 136 + #define OFFSETOFEND_H_POWERS (OFFSETOF_H_POWERS + (NUM_H_POWERS * 16)) 137 + 138 + .text 139 + 140 + // Set the vector length in bytes. This sets the VL variable and defines 141 + // register aliases V0-V31 that map to the ymm or zmm registers. 142 + .macro _set_veclen vl 143 + .set VL, \vl 144 + .irp i, 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \ 145 + 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31 146 + .if VL == 32 147 + .set V\i, %ymm\i 148 + .elseif VL == 64 149 + .set V\i, %zmm\i 150 + .else 151 + .error "Unsupported vector length" 152 + .endif 153 + .endr 154 + .endm 155 + 156 + // The _ghash_mul_step macro does one step of GHASH multiplication of the 157 + // 128-bit lanes of \a by the corresponding 128-bit lanes of \b and storing the 158 + // reduced products in \dst. \t0, \t1, and \t2 are temporary registers of the 159 + // same size as \a and \b. To complete all steps, this must invoked with \i=0 160 + // through \i=9. 
The division into steps allows users of this macro to 161 + // optionally interleave the computation with other instructions. Users of this 162 + // macro must preserve the parameter registers across steps. 163 + // 164 + // The multiplications are done in GHASH's representation of the finite field 165 + // GF(2^128). Elements of GF(2^128) are represented as binary polynomials 166 + // (i.e. polynomials whose coefficients are bits) modulo a reducing polynomial 167 + // G. The GCM specification uses G = x^128 + x^7 + x^2 + x + 1. Addition is 168 + // just XOR, while multiplication is more complex and has two parts: (a) do 169 + // carryless multiplication of two 128-bit input polynomials to get a 256-bit 170 + // intermediate product polynomial, and (b) reduce the intermediate product to 171 + // 128 bits by adding multiples of G that cancel out terms in it. (Adding 172 + // multiples of G doesn't change which field element the polynomial represents.) 173 + // 174 + // Unfortunately, the GCM specification maps bits to/from polynomial 175 + // coefficients backwards from the natural order. In each byte it specifies the 176 + // highest bit to be the lowest order polynomial coefficient, *not* the highest! 177 + // This makes it nontrivial to work with the GHASH polynomials. We could 178 + // reflect the bits, but x86 doesn't have an instruction that does that. 179 + // 180 + // Instead, we operate on the values without bit-reflecting them. This *mostly* 181 + // just works, since XOR and carryless multiplication are symmetric with respect 182 + // to bit order, but it has some consequences. First, due to GHASH's byte 183 + // order, by skipping bit reflection, *byte* reflection becomes necessary to 184 + // give the polynomial terms a consistent order. 
E.g., considering an N-bit 185 + // value interpreted using the G = x^128 + x^7 + x^2 + x + 1 convention, bits 0 186 + // through N-1 of the byte-reflected value represent the coefficients of x^(N-1) 187 + // through x^0, whereas bits 0 through N-1 of the non-byte-reflected value 188 + // represent x^7...x^0, x^15...x^8, ..., x^(N-1)...x^(N-8) which can't be worked 189 + // with. Fortunately, x86's vpshufb instruction can do byte reflection. 190 + // 191 + // Second, forgoing the bit reflection causes an extra multiple of x (still 192 + // using the G = x^128 + x^7 + x^2 + x + 1 convention) to be introduced by each 193 + // multiplication. This is because an M-bit by N-bit carryless multiplication 194 + // really produces a (M+N-1)-bit product, but in practice it's zero-extended to 195 + // M+N bits. In the G = x^128 + x^7 + x^2 + x + 1 convention, which maps bits 196 + // to polynomial coefficients backwards, this zero-extension actually changes 197 + // the product by introducing an extra factor of x. Therefore, users of this 198 + // macro must ensure that one of the inputs has an extra factor of x^-1, i.e. 199 + // the multiplicative inverse of x, to cancel out the extra x. 200 + // 201 + // Third, the backwards coefficients convention is just confusing to work with, 202 + // since it makes "low" and "high" in the polynomial math mean the opposite of 203 + // their normal meaning in computer programming. This can be solved by using an 204 + // alternative interpretation: the polynomial coefficients are understood to be 205 + // in the natural order, and the multiplication is actually \a * \b * x^-128 mod 206 + // x^128 + x^127 + x^126 + x^121 + 1. This doesn't change the inputs, outputs, 207 + // or the implementation at all; it just changes the mathematical interpretation 208 + // of what each instruction is doing. Starting from here, we'll use this 209 + // alternative interpretation, as it's easier to understand the code that way. 
210 + // 211 + // Moving onto the implementation, the vpclmulqdq instruction does 64 x 64 => 212 + // 128-bit carryless multiplication, so we break the 128 x 128 multiplication 213 + // into parts as follows (the _L and _H suffixes denote low and high 64 bits): 214 + // 215 + // LO = a_L * b_L 216 + // MI = (a_L * b_H) + (a_H * b_L) 217 + // HI = a_H * b_H 218 + // 219 + // The 256-bit product is x^128*HI + x^64*MI + LO. LO, MI, and HI are 128-bit. 220 + // Note that MI "overlaps" with LO and HI. We don't consolidate MI into LO and 221 + // HI right away, since the way the reduction works makes that unnecessary. 222 + // 223 + // For the reduction, we cancel out the low 128 bits by adding multiples of G = 224 + // x^128 + x^127 + x^126 + x^121 + 1. This is done by two iterations, each of 225 + // which cancels out the next lowest 64 bits. Consider a value x^64*A + B, 226 + // where A and B are 128-bit. Adding B_L*G to that value gives: 227 + // 228 + // x^64*A + B + B_L*G 229 + // = x^64*A + x^64*B_H + B_L + B_L*(x^128 + x^127 + x^126 + x^121 + 1) 230 + // = x^64*A + x^64*B_H + B_L + x^128*B_L + x^64*B_L*(x^63 + x^62 + x^57) + B_L 231 + // = x^64*A + x^64*B_H + x^128*B_L + x^64*B_L*(x^63 + x^62 + x^57) + B_L + B_L 232 + // = x^64*(A + B_H + x^64*B_L + B_L*(x^63 + x^62 + x^57)) 233 + // 234 + // So: if we sum A, B with its halves swapped, and the low half of B times x^63 235 + // + x^62 + x^57, we get a 128-bit value C where x^64*C is congruent to the 236 + // original value x^64*A + B. I.e., the low 64 bits got canceled out. 237 + // 238 + // We just need to apply this twice: first to fold LO into MI, and second to 239 + // fold the updated MI into HI. 240 + // 241 + // The needed three-argument XORs are done using the vpternlogd instruction with 242 + // immediate 0x96, since this is faster than two vpxord instructions. 
243 + // 244 + // A potential optimization, assuming that b is fixed per-key (if a is fixed 245 + // per-key it would work the other way around), is to use one iteration of the 246 + // reduction described above to precompute a value c such that x^64*c = b mod G, 247 + // and then multiply a_L by c (and implicitly by x^64) instead of by b: 248 + // 249 + // MI = (a_L * c_L) + (a_H * b_L) 250 + // HI = (a_L * c_H) + (a_H * b_H) 251 + // 252 + // This would eliminate the LO part of the intermediate product, which would 253 + // eliminate the need to fold LO into MI. This would save two instructions, 254 + // including a vpclmulqdq. However, we currently don't use this optimization 255 + // because it would require twice as many per-key precomputed values. 256 + // 257 + // Using Karatsuba multiplication instead of "schoolbook" multiplication 258 + // similarly would save a vpclmulqdq but does not seem to be worth it. 259 + .macro _ghash_mul_step i, a, b, dst, gfpoly, t0, t1, t2 260 + .if \i == 0 261 + vpclmulqdq $0x00, \a, \b, \t0 // LO = a_L * b_L 262 + vpclmulqdq $0x01, \a, \b, \t1 // MI_0 = a_L * b_H 263 + .elseif \i == 1 264 + vpclmulqdq $0x10, \a, \b, \t2 // MI_1 = a_H * b_L 265 + .elseif \i == 2 266 + vpxord \t2, \t1, \t1 // MI = MI_0 + MI_1 267 + .elseif \i == 3 268 + vpclmulqdq $0x01, \t0, \gfpoly, \t2 // LO_L*(x^63 + x^62 + x^57) 269 + .elseif \i == 4 270 + vpshufd $0x4e, \t0, \t0 // Swap halves of LO 271 + .elseif \i == 5 272 + vpternlogd $0x96, \t2, \t0, \t1 // Fold LO into MI 273 + .elseif \i == 6 274 + vpclmulqdq $0x11, \a, \b, \dst // HI = a_H * b_H 275 + .elseif \i == 7 276 + vpclmulqdq $0x01, \t1, \gfpoly, \t0 // MI_L*(x^63 + x^62 + x^57) 277 + .elseif \i == 8 278 + vpshufd $0x4e, \t1, \t1 // Swap halves of MI 279 + .elseif \i == 9 280 + vpternlogd $0x96, \t0, \t1, \dst // Fold MI into HI 281 + .endif 282 + .endm 283 + 284 + // GHASH-multiply the 128-bit lanes of \a by the 128-bit lanes of \b and store 285 + // the reduced products in \dst. 
See _ghash_mul_step for full explanation. 286 + .macro _ghash_mul a, b, dst, gfpoly, t0, t1, t2 287 + .irp i, 0,1,2,3,4,5,6,7,8,9 288 + _ghash_mul_step \i, \a, \b, \dst, \gfpoly, \t0, \t1, \t2 289 + .endr 290 + .endm 291 + 292 + // GHASH-multiply the 128-bit lanes of \a by the 128-bit lanes of \b and add the 293 + // *unreduced* products to \lo, \mi, and \hi. 294 + .macro _ghash_mul_noreduce a, b, lo, mi, hi, t0, t1, t2, t3 295 + vpclmulqdq $0x00, \a, \b, \t0 // a_L * b_L 296 + vpclmulqdq $0x01, \a, \b, \t1 // a_L * b_H 297 + vpclmulqdq $0x10, \a, \b, \t2 // a_H * b_L 298 + vpclmulqdq $0x11, \a, \b, \t3 // a_H * b_H 299 + vpxord \t0, \lo, \lo 300 + vpternlogd $0x96, \t2, \t1, \mi 301 + vpxord \t3, \hi, \hi 302 + .endm 303 + 304 + // Reduce the unreduced products from \lo, \mi, and \hi and store the 128-bit 305 + // reduced products in \hi. See _ghash_mul_step for explanation of reduction. 306 + .macro _ghash_reduce lo, mi, hi, gfpoly, t0 307 + vpclmulqdq $0x01, \lo, \gfpoly, \t0 308 + vpshufd $0x4e, \lo, \lo 309 + vpternlogd $0x96, \t0, \lo, \mi 310 + vpclmulqdq $0x01, \mi, \gfpoly, \t0 311 + vpshufd $0x4e, \mi, \mi 312 + vpternlogd $0x96, \t0, \mi, \hi 313 + .endm 314 + 315 + // void aes_gcm_precompute_##suffix(struct aes_gcm_key_avx10 *key); 316 + // 317 + // Given the expanded AES key |key->aes_key|, this function derives the GHASH 318 + // subkey and initializes |key->ghash_key_powers| with powers of it. 319 + // 320 + // The number of key powers initialized is NUM_H_POWERS, and they are stored in 321 + // the order H^NUM_H_POWERS to H^1. The zeroized padding blocks after the key 322 + // powers themselves are also initialized. 323 + // 324 + // This macro supports both VL=32 and VL=64. _set_veclen must have been invoked 325 + // with the desired length. In the VL=32 case, the function computes twice as 326 + // many key powers than are actually used by the VL=32 GCM update functions. 
327 + // This is done to keep the key format the same regardless of vector length. 328 + .macro _aes_gcm_precompute 329 + 330 + // Function arguments 331 + .set KEY, %rdi 332 + 333 + // Additional local variables. V0-V2 and %rax are used as temporaries. 334 + .set POWERS_PTR, %rsi 335 + .set RNDKEYLAST_PTR, %rdx 336 + .set H_CUR, V3 337 + .set H_CUR_YMM, %ymm3 338 + .set H_CUR_XMM, %xmm3 339 + .set H_INC, V4 340 + .set H_INC_YMM, %ymm4 341 + .set H_INC_XMM, %xmm4 342 + .set GFPOLY, V5 343 + .set GFPOLY_YMM, %ymm5 344 + .set GFPOLY_XMM, %xmm5 345 + 346 + // Get pointer to lowest set of key powers (located at end of array). 347 + lea OFFSETOFEND_H_POWERS-VL(KEY), POWERS_PTR 348 + 349 + // Encrypt an all-zeroes block to get the raw hash subkey. 350 + movl OFFSETOF_AESKEYLEN(KEY), %eax 351 + lea 6*16(KEY,%rax,4), RNDKEYLAST_PTR 352 + vmovdqu (KEY), %xmm0 // Zero-th round key XOR all-zeroes block 353 + add $16, KEY 354 + 1: 355 + vaesenc (KEY), %xmm0, %xmm0 356 + add $16, KEY 357 + cmp KEY, RNDKEYLAST_PTR 358 + jne 1b 359 + vaesenclast (RNDKEYLAST_PTR), %xmm0, %xmm0 360 + 361 + // Reflect the bytes of the raw hash subkey. 362 + vpshufb .Lbswap_mask(%rip), %xmm0, H_CUR_XMM 363 + 364 + // Zeroize the padding blocks. 365 + vpxor %xmm0, %xmm0, %xmm0 366 + vmovdqu %ymm0, VL(POWERS_PTR) 367 + vmovdqu %xmm0, VL+2*16(POWERS_PTR) 368 + 369 + // Finish preprocessing the first key power, H^1. Since this GHASH 370 + // implementation operates directly on values with the backwards bit 371 + // order specified by the GCM standard, it's necessary to preprocess the 372 + // raw key as follows. First, reflect its bytes. Second, multiply it 373 + // by x^-1 mod x^128 + x^7 + x^2 + x + 1 (if using the backwards 374 + // interpretation of polynomial coefficients), which can also be 375 + // interpreted as multiplication by x mod x^128 + x^127 + x^126 + x^121 376 + // + 1 using the alternative, natural interpretation of polynomial 377 + // coefficients. 
For details, see the comment above _ghash_mul_step. 378 + // 379 + // Either way, for the multiplication the concrete operation performed 380 + // is a left shift of the 128-bit value by 1 bit, then an XOR with (0xc2 381 + // << 120) | 1 if a 1 bit was carried out. However, there's no 128-bit 382 + // wide shift instruction, so instead double each of the two 64-bit 383 + // halves and incorporate the internal carry bit into the value XOR'd. 384 + vpshufd $0xd3, H_CUR_XMM, %xmm0 385 + vpsrad $31, %xmm0, %xmm0 386 + vpaddq H_CUR_XMM, H_CUR_XMM, H_CUR_XMM 387 + vpand .Lgfpoly_and_internal_carrybit(%rip), %xmm0, %xmm0 388 + vpxor %xmm0, H_CUR_XMM, H_CUR_XMM 389 + 390 + // Load the gfpoly constant. 391 + vbroadcasti32x4 .Lgfpoly(%rip), GFPOLY 392 + 393 + // Square H^1 to get H^2. 394 + // 395 + // Note that as with H^1, all higher key powers also need an extra 396 + // factor of x^-1 (or x using the natural interpretation). Nothing 397 + // special needs to be done to make this happen, though: H^1 * H^1 would 398 + // end up with two factors of x^-1, but the multiplication consumes one. 399 + // So the product H^2 ends up with the desired one factor of x^-1. 400 + _ghash_mul H_CUR_XMM, H_CUR_XMM, H_INC_XMM, GFPOLY_XMM, \ 401 + %xmm0, %xmm1, %xmm2 402 + 403 + // Create H_CUR_YMM = [H^2, H^1] and H_INC_YMM = [H^2, H^2]. 404 + vinserti128 $1, H_CUR_XMM, H_INC_YMM, H_CUR_YMM 405 + vinserti128 $1, H_INC_XMM, H_INC_YMM, H_INC_YMM 406 + 407 + .if VL == 64 408 + // Create H_CUR = [H^4, H^3, H^2, H^1] and H_INC = [H^4, H^4, H^4, H^4]. 409 + _ghash_mul H_INC_YMM, H_CUR_YMM, H_INC_YMM, GFPOLY_YMM, \ 410 + %ymm0, %ymm1, %ymm2 411 + vinserti64x4 $1, H_CUR_YMM, H_INC, H_CUR 412 + vshufi64x2 $0, H_INC, H_INC, H_INC 413 + .endif 414 + 415 + // Store the lowest set of key powers. 416 + vmovdqu8 H_CUR, (POWERS_PTR) 417 + 418 + // Compute and store the remaining key powers. With VL=32, repeatedly 419 + // multiply [H^(i+1), H^i] by [H^2, H^2] to get [H^(i+3), H^(i+2)]. 
420 + // With VL=64, repeatedly multiply [H^(i+3), H^(i+2), H^(i+1), H^i] by 421 + // [H^4, H^4, H^4, H^4] to get [H^(i+7), H^(i+6), H^(i+5), H^(i+4)]. 422 + mov $(NUM_H_POWERS*16/VL) - 1, %eax 423 + .Lprecompute_next\@: 424 + sub $VL, POWERS_PTR 425 + _ghash_mul H_INC, H_CUR, H_CUR, GFPOLY, V0, V1, V2 426 + vmovdqu8 H_CUR, (POWERS_PTR) 427 + dec %eax 428 + jnz .Lprecompute_next\@ 429 + 430 + vzeroupper // This is needed after using ymm or zmm registers. 431 + RET 432 + .endm 433 + 434 + // XOR together the 128-bit lanes of \src (whose low lane is \src_xmm) and store 435 + // the result in \dst_xmm. This implicitly zeroizes the other lanes of dst. 436 + .macro _horizontal_xor src, src_xmm, dst_xmm, t0_xmm, t1_xmm, t2_xmm 437 + vextracti32x4 $1, \src, \t0_xmm 438 + .if VL == 32 439 + vpxord \t0_xmm, \src_xmm, \dst_xmm 440 + .elseif VL == 64 441 + vextracti32x4 $2, \src, \t1_xmm 442 + vextracti32x4 $3, \src, \t2_xmm 443 + vpxord \t0_xmm, \src_xmm, \dst_xmm 444 + vpternlogd $0x96, \t1_xmm, \t2_xmm, \dst_xmm 445 + .else 446 + .error "Unsupported vector length" 447 + .endif 448 + .endm 449 + 450 + // Do one step of the GHASH update of the data blocks given in the vector 451 + // registers GHASHDATA[0-3]. \i specifies the step to do, 0 through 9. The 452 + // division into steps allows users of this macro to optionally interleave the 453 + // computation with other instructions. This macro uses the vector register 454 + // GHASH_ACC as input/output; GHASHDATA[0-3] as inputs that are clobbered; 455 + // H_POW[4-1], GFPOLY, and BSWAP_MASK as inputs that aren't clobbered; and 456 + // GHASHTMP[0-2] as temporaries. This macro handles the byte-reflection of the 457 + // data blocks. The parameter registers must be preserved across steps. 458 + // 459 + // The GHASH update does: GHASH_ACC = H_POW4*(GHASHDATA0 + GHASH_ACC) + 460 + // H_POW3*GHASHDATA1 + H_POW2*GHASHDATA2 + H_POW1*GHASHDATA3, where the 461 + // operations are vectorized operations on vectors of 16-byte blocks. 
E.g., 462 + // with VL=32 there are 2 blocks per vector and the vectorized terms correspond 463 + // to the following non-vectorized terms: 464 + // 465 + // H_POW4*(GHASHDATA0 + GHASH_ACC) => H^8*(blk0 + GHASH_ACC_XMM) and H^7*(blk1 + 0) 466 + // H_POW3*GHASHDATA1 => H^6*blk2 and H^5*blk3 467 + // H_POW2*GHASHDATA2 => H^4*blk4 and H^3*blk5 468 + // H_POW1*GHASHDATA3 => H^2*blk6 and H^1*blk7 469 + // 470 + // With VL=64, we use 4 blocks/vector, H^16 through H^1, and blk0 through blk15. 471 + // 472 + // More concretely, this code does: 473 + // - Do vectorized "schoolbook" multiplications to compute the intermediate 474 + // 256-bit product of each block and its corresponding hash key power. 475 + // There are 4*VL/16 of these intermediate products. 476 + // - Sum (XOR) the intermediate 256-bit products across vectors. This leaves 477 + // VL/16 256-bit intermediate values. 478 + // - Do a vectorized reduction of these 256-bit intermediate values to 479 + // 128-bits each. This leaves VL/16 128-bit intermediate values. 480 + // - Sum (XOR) these values and store the 128-bit result in GHASH_ACC_XMM. 481 + // 482 + // See _ghash_mul_step for the full explanation of the operations performed for 483 + // each individual finite field multiplication and reduction. 
484 + .macro _ghash_step_4x i 485 + .if \i == 0 486 + vpshufb BSWAP_MASK, GHASHDATA0, GHASHDATA0 487 + vpxord GHASH_ACC, GHASHDATA0, GHASHDATA0 488 + vpshufb BSWAP_MASK, GHASHDATA1, GHASHDATA1 489 + vpshufb BSWAP_MASK, GHASHDATA2, GHASHDATA2 490 + .elseif \i == 1 491 + vpshufb BSWAP_MASK, GHASHDATA3, GHASHDATA3 492 + vpclmulqdq $0x00, H_POW4, GHASHDATA0, GHASH_ACC // LO_0 493 + vpclmulqdq $0x00, H_POW3, GHASHDATA1, GHASHTMP0 // LO_1 494 + vpclmulqdq $0x00, H_POW2, GHASHDATA2, GHASHTMP1 // LO_2 495 + .elseif \i == 2 496 + vpxord GHASHTMP0, GHASH_ACC, GHASH_ACC // sum(LO_{1,0}) 497 + vpclmulqdq $0x00, H_POW1, GHASHDATA3, GHASHTMP2 // LO_3 498 + vpternlogd $0x96, GHASHTMP2, GHASHTMP1, GHASH_ACC // LO = sum(LO_{3,2,1,0}) 499 + vpclmulqdq $0x01, H_POW4, GHASHDATA0, GHASHTMP0 // MI_0 500 + .elseif \i == 3 501 + vpclmulqdq $0x01, H_POW3, GHASHDATA1, GHASHTMP1 // MI_1 502 + vpclmulqdq $0x01, H_POW2, GHASHDATA2, GHASHTMP2 // MI_2 503 + vpternlogd $0x96, GHASHTMP2, GHASHTMP1, GHASHTMP0 // sum(MI_{2,1,0}) 504 + vpclmulqdq $0x01, H_POW1, GHASHDATA3, GHASHTMP1 // MI_3 505 + .elseif \i == 4 506 + vpclmulqdq $0x10, H_POW4, GHASHDATA0, GHASHTMP2 // MI_4 507 + vpternlogd $0x96, GHASHTMP2, GHASHTMP1, GHASHTMP0 // sum(MI_{4,3,2,1,0}) 508 + vpclmulqdq $0x10, H_POW3, GHASHDATA1, GHASHTMP1 // MI_5 509 + vpclmulqdq $0x10, H_POW2, GHASHDATA2, GHASHTMP2 // MI_6 510 + .elseif \i == 5 511 + vpternlogd $0x96, GHASHTMP2, GHASHTMP1, GHASHTMP0 // sum(MI_{6,5,4,3,2,1,0}) 512 + vpclmulqdq $0x01, GHASH_ACC, GFPOLY, GHASHTMP2 // LO_L*(x^63 + x^62 + x^57) 513 + vpclmulqdq $0x10, H_POW1, GHASHDATA3, GHASHTMP1 // MI_7 514 + vpxord GHASHTMP1, GHASHTMP0, GHASHTMP0 // MI = sum(MI_{7,6,5,4,3,2,1,0}) 515 + .elseif \i == 6 516 + vpshufd $0x4e, GHASH_ACC, GHASH_ACC // Swap halves of LO 517 + vpclmulqdq $0x11, H_POW4, GHASHDATA0, GHASHDATA0 // HI_0 518 + vpclmulqdq $0x11, H_POW3, GHASHDATA1, GHASHDATA1 // HI_1 519 + vpclmulqdq $0x11, H_POW2, GHASHDATA2, GHASHDATA2 // HI_2 520 + .elseif \i == 7 521 + vpternlogd 
$0x96, GHASHTMP2, GHASH_ACC, GHASHTMP0 // Fold LO into MI 522 + vpclmulqdq $0x11, H_POW1, GHASHDATA3, GHASHDATA3 // HI_3 523 + vpternlogd $0x96, GHASHDATA2, GHASHDATA1, GHASHDATA0 // sum(HI_{2,1,0}) 524 + vpclmulqdq $0x01, GHASHTMP0, GFPOLY, GHASHTMP1 // MI_L*(x^63 + x^62 + x^57) 525 + .elseif \i == 8 526 + vpxord GHASHDATA3, GHASHDATA0, GHASH_ACC // HI = sum(HI_{3,2,1,0}) 527 + vpshufd $0x4e, GHASHTMP0, GHASHTMP0 // Swap halves of MI 528 + vpternlogd $0x96, GHASHTMP1, GHASHTMP0, GHASH_ACC // Fold MI into HI 529 + .elseif \i == 9 530 + _horizontal_xor GHASH_ACC, GHASH_ACC_XMM, GHASH_ACC_XMM, \ 531 + GHASHDATA0_XMM, GHASHDATA1_XMM, GHASHDATA2_XMM 532 + .endif 533 + .endm 534 + 535 + // Do one non-last round of AES encryption on the counter blocks in V0-V3 using 536 + // the round key that has been broadcast to all 128-bit lanes of \round_key. 537 + .macro _vaesenc_4x round_key 538 + vaesenc \round_key, V0, V0 539 + vaesenc \round_key, V1, V1 540 + vaesenc \round_key, V2, V2 541 + vaesenc \round_key, V3, V3 542 + .endm 543 + 544 + // Start the AES encryption of four vectors of counter blocks. 545 + .macro _ctr_begin_4x 546 + 547 + // Increment LE_CTR four times to generate four vectors of little-endian 548 + // counter blocks, swap each to big-endian, and store them in V0-V3. 549 + vpshufb BSWAP_MASK, LE_CTR, V0 550 + vpaddd LE_CTR_INC, LE_CTR, LE_CTR 551 + vpshufb BSWAP_MASK, LE_CTR, V1 552 + vpaddd LE_CTR_INC, LE_CTR, LE_CTR 553 + vpshufb BSWAP_MASK, LE_CTR, V2 554 + vpaddd LE_CTR_INC, LE_CTR, LE_CTR 555 + vpshufb BSWAP_MASK, LE_CTR, V3 556 + vpaddd LE_CTR_INC, LE_CTR, LE_CTR 557 + 558 + // AES "round zero": XOR in the zero-th round key. 
559 + vpxord RNDKEY0, V0, V0 560 + vpxord RNDKEY0, V1, V1 561 + vpxord RNDKEY0, V2, V2 562 + vpxord RNDKEY0, V3, V3 563 + .endm 564 + 565 + // void aes_gcm_{enc,dec}_update_##suffix(const struct aes_gcm_key_avx10 *key, 566 + // const u32 le_ctr[4], u8 ghash_acc[16], 567 + // const u8 *src, u8 *dst, int datalen); 568 + // 569 + // This macro generates a GCM encryption or decryption update function with the 570 + // above prototype (with \enc selecting which one). This macro supports both 571 + // VL=32 and VL=64. _set_veclen must have been invoked with the desired length. 572 + // 573 + // This function computes the next portion of the CTR keystream, XOR's it with 574 + // |datalen| bytes from |src|, and writes the resulting encrypted or decrypted 575 + // data to |dst|. It also updates the GHASH accumulator |ghash_acc| using the 576 + // next |datalen| ciphertext bytes. 577 + // 578 + // |datalen| must be a multiple of 16, except on the last call where it can be 579 + // any length. The caller must do any buffering needed to ensure this. Both 580 + // in-place and out-of-place en/decryption are supported. 581 + // 582 + // |le_ctr| must give the current counter in little-endian format. For a new 583 + // message, the low word of the counter must be 2. This function loads the 584 + // counter from |le_ctr| and increments the loaded counter as needed, but it 585 + // does *not* store the updated counter back to |le_ctr|. The caller must 586 + // update |le_ctr| if any more data segments follow. Internally, only the low 587 + // 32-bit word of the counter is incremented, following the GCM standard. 588 + .macro _aes_gcm_update enc 589 + 590 + // Function arguments 591 + .set KEY, %rdi 592 + .set LE_CTR_PTR, %rsi 593 + .set GHASH_ACC_PTR, %rdx 594 + .set SRC, %rcx 595 + .set DST, %r8 596 + .set DATALEN, %r9d 597 + .set DATALEN64, %r9 // Zero-extend DATALEN before using! 
598 + 599 + // Additional local variables 600 + 601 + // %rax and %k1 are used as temporary registers. LE_CTR_PTR is also 602 + // available as a temporary register after the counter is loaded. 603 + 604 + // AES key length in bytes 605 + .set AESKEYLEN, %r10d 606 + .set AESKEYLEN64, %r10 607 + 608 + // Pointer to the last AES round key for the chosen AES variant 609 + .set RNDKEYLAST_PTR, %r11 610 + 611 + // In the main loop, V0-V3 are used as AES input and output. Elsewhere 612 + // they are used as temporary registers. 613 + 614 + // GHASHDATA[0-3] hold the ciphertext blocks and GHASH input data. 615 + .set GHASHDATA0, V4 616 + .set GHASHDATA0_XMM, %xmm4 617 + .set GHASHDATA1, V5 618 + .set GHASHDATA1_XMM, %xmm5 619 + .set GHASHDATA2, V6 620 + .set GHASHDATA2_XMM, %xmm6 621 + .set GHASHDATA3, V7 622 + 623 + // BSWAP_MASK is the shuffle mask for byte-reflecting 128-bit values 624 + // using vpshufb, copied to all 128-bit lanes. 625 + .set BSWAP_MASK, V8 626 + 627 + // RNDKEY temporarily holds the next AES round key. 628 + .set RNDKEY, V9 629 + 630 + // GHASH_ACC is the accumulator variable for GHASH. When fully reduced, 631 + // only the lowest 128-bit lane can be nonzero. When not fully reduced, 632 + // more than one lane may be used, and they need to be XOR'd together. 633 + .set GHASH_ACC, V10 634 + .set GHASH_ACC_XMM, %xmm10 635 + 636 + // LE_CTR_INC is the vector of 32-bit words that need to be added to a 637 + // vector of little-endian counter blocks to advance it forwards. 638 + .set LE_CTR_INC, V11 639 + 640 + // LE_CTR contains the next set of little-endian counter blocks. 641 + .set LE_CTR, V12 642 + 643 + // RNDKEY0, RNDKEYLAST, and RNDKEY_M[9-5] contain cached AES round keys, 644 + // copied to all 128-bit lanes. RNDKEY0 is the zero-th round key, 645 + // RNDKEYLAST the last, and RNDKEY_M\i the one \i-th from the last. 
646 + .set RNDKEY0, V13 647 + .set RNDKEYLAST, V14 648 + .set RNDKEY_M9, V15 649 + .set RNDKEY_M8, V16 650 + .set RNDKEY_M7, V17 651 + .set RNDKEY_M6, V18 652 + .set RNDKEY_M5, V19 653 + 654 + // RNDKEYLAST[0-3] temporarily store the last AES round key XOR'd with 655 + // the corresponding block of source data. This is useful because 656 + // vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a), and key ^ b can 657 + // be computed in parallel with the AES rounds. 658 + .set RNDKEYLAST0, V20 659 + .set RNDKEYLAST1, V21 660 + .set RNDKEYLAST2, V22 661 + .set RNDKEYLAST3, V23 662 + 663 + // GHASHTMP[0-2] are temporary variables used by _ghash_step_4x. These 664 + // cannot coincide with anything used for AES encryption, since for 665 + // performance reasons GHASH and AES encryption are interleaved. 666 + .set GHASHTMP0, V24 667 + .set GHASHTMP1, V25 668 + .set GHASHTMP2, V26 669 + 670 + // H_POW[4-1] contain the powers of the hash key H^(4*VL/16)...H^1. The 671 + // descending numbering reflects the order of the key powers. 672 + .set H_POW4, V27 673 + .set H_POW3, V28 674 + .set H_POW2, V29 675 + .set H_POW1, V30 676 + 677 + // GFPOLY contains the .Lgfpoly constant, copied to all 128-bit lanes. 678 + .set GFPOLY, V31 679 + 680 + // Load some constants. 681 + vbroadcasti32x4 .Lbswap_mask(%rip), BSWAP_MASK 682 + vbroadcasti32x4 .Lgfpoly(%rip), GFPOLY 683 + 684 + // Load the GHASH accumulator and the starting counter. 685 + vmovdqu (GHASH_ACC_PTR), GHASH_ACC_XMM 686 + vbroadcasti32x4 (LE_CTR_PTR), LE_CTR 687 + 688 + // Load the AES key length in bytes. 689 + movl OFFSETOF_AESKEYLEN(KEY), AESKEYLEN 690 + 691 + // Make RNDKEYLAST_PTR point to the last AES round key. This is the 692 + // round key with index 10, 12, or 14 for AES-128, AES-192, or AES-256 693 + // respectively. Then load the zero-th and last round keys. 
694 + lea 6*16(KEY,AESKEYLEN64,4), RNDKEYLAST_PTR 695 + vbroadcasti32x4 (KEY), RNDKEY0 696 + vbroadcasti32x4 (RNDKEYLAST_PTR), RNDKEYLAST 697 + 698 + // Finish initializing LE_CTR by adding [0, 1, ...] to its low words. 699 + vpaddd .Lctr_pattern(%rip), LE_CTR, LE_CTR 700 + 701 + // Initialize LE_CTR_INC to contain VL/16 in all 128-bit lanes. 702 + .if VL == 32 703 + vbroadcasti32x4 .Linc_2blocks(%rip), LE_CTR_INC 704 + .elseif VL == 64 705 + vbroadcasti32x4 .Linc_4blocks(%rip), LE_CTR_INC 706 + .else 707 + .error "Unsupported vector length" 708 + .endif 709 + 710 + // If there are at least 4*VL bytes of data, then continue into the loop 711 + // that processes 4*VL bytes of data at a time. Otherwise skip it. 712 + // 713 + // Pre-subtracting 4*VL from DATALEN saves an instruction from the main 714 + // loop and also ensures that at least one write always occurs to 715 + // DATALEN, zero-extending it and allowing DATALEN64 to be used later. 716 + sub $4*VL, DATALEN 717 + jl .Lcrypt_loop_4x_done\@ 718 + 719 + // Load powers of the hash key. 720 + vmovdqu8 OFFSETOFEND_H_POWERS-4*VL(KEY), H_POW4 721 + vmovdqu8 OFFSETOFEND_H_POWERS-3*VL(KEY), H_POW3 722 + vmovdqu8 OFFSETOFEND_H_POWERS-2*VL(KEY), H_POW2 723 + vmovdqu8 OFFSETOFEND_H_POWERS-1*VL(KEY), H_POW1 724 + 725 + // Main loop: en/decrypt and hash 4 vectors at a time. 726 + // 727 + // When possible, interleave the AES encryption of the counter blocks 728 + // with the GHASH update of the ciphertext blocks. This improves 729 + // performance on many CPUs because the execution ports used by the VAES 730 + // instructions often differ from those used by vpclmulqdq and other 731 + // instructions used in GHASH. For example, many Intel CPUs dispatch 732 + // vaesenc to ports 0 and 1 and vpclmulqdq to port 5. 733 + // 734 + // The interleaving is easiest to do during decryption, since during 735 + // decryption the ciphertext blocks are immediately available. 
For 736 + // encryption, instead encrypt the first set of blocks, then hash those 737 + // blocks while encrypting the next set of blocks, repeat that as 738 + // needed, and finally hash the last set of blocks. 739 + 740 + .if \enc 741 + // Encrypt the first 4 vectors of plaintext blocks. Leave the resulting 742 + // ciphertext in GHASHDATA[0-3] for GHASH. 743 + _ctr_begin_4x 744 + lea 16(KEY), %rax 745 + 1: 746 + vbroadcasti32x4 (%rax), RNDKEY 747 + _vaesenc_4x RNDKEY 748 + add $16, %rax 749 + cmp %rax, RNDKEYLAST_PTR 750 + jne 1b 751 + vpxord 0*VL(SRC), RNDKEYLAST, RNDKEYLAST0 752 + vpxord 1*VL(SRC), RNDKEYLAST, RNDKEYLAST1 753 + vpxord 2*VL(SRC), RNDKEYLAST, RNDKEYLAST2 754 + vpxord 3*VL(SRC), RNDKEYLAST, RNDKEYLAST3 755 + vaesenclast RNDKEYLAST0, V0, GHASHDATA0 756 + vaesenclast RNDKEYLAST1, V1, GHASHDATA1 757 + vaesenclast RNDKEYLAST2, V2, GHASHDATA2 758 + vaesenclast RNDKEYLAST3, V3, GHASHDATA3 759 + vmovdqu8 GHASHDATA0, 0*VL(DST) 760 + vmovdqu8 GHASHDATA1, 1*VL(DST) 761 + vmovdqu8 GHASHDATA2, 2*VL(DST) 762 + vmovdqu8 GHASHDATA3, 3*VL(DST) 763 + add $4*VL, SRC 764 + add $4*VL, DST 765 + sub $4*VL, DATALEN 766 + jl .Lghash_last_ciphertext_4x\@ 767 + .endif 768 + 769 + // Cache as many additional AES round keys as possible. 770 + .irp i, 9,8,7,6,5 771 + vbroadcasti32x4 -\i*16(RNDKEYLAST_PTR), RNDKEY_M\i 772 + .endr 773 + 774 + .Lcrypt_loop_4x\@: 775 + 776 + // If decrypting, load more ciphertext blocks into GHASHDATA[0-3]. If 777 + // encrypting, GHASHDATA[0-3] already contain the previous ciphertext. 778 + .if !\enc 779 + vmovdqu8 0*VL(SRC), GHASHDATA0 780 + vmovdqu8 1*VL(SRC), GHASHDATA1 781 + vmovdqu8 2*VL(SRC), GHASHDATA2 782 + vmovdqu8 3*VL(SRC), GHASHDATA3 783 + .endif 784 + 785 + // Start the AES encryption of the counter blocks. 786 + _ctr_begin_4x 787 + cmp $24, AESKEYLEN 788 + jl 128f // AES-128? 789 + je 192f // AES-192? 
790 + // AES-256 791 + vbroadcasti32x4 -13*16(RNDKEYLAST_PTR), RNDKEY 792 + _vaesenc_4x RNDKEY 793 + vbroadcasti32x4 -12*16(RNDKEYLAST_PTR), RNDKEY 794 + _vaesenc_4x RNDKEY 795 + 192: 796 + vbroadcasti32x4 -11*16(RNDKEYLAST_PTR), RNDKEY 797 + _vaesenc_4x RNDKEY 798 + vbroadcasti32x4 -10*16(RNDKEYLAST_PTR), RNDKEY 799 + _vaesenc_4x RNDKEY 800 + 128: 801 + 802 + // XOR the source data with the last round key, saving the result in 803 + // RNDKEYLAST[0-3]. This reduces latency by taking advantage of the 804 + // property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a). 805 + .if \enc 806 + vpxord 0*VL(SRC), RNDKEYLAST, RNDKEYLAST0 807 + vpxord 1*VL(SRC), RNDKEYLAST, RNDKEYLAST1 808 + vpxord 2*VL(SRC), RNDKEYLAST, RNDKEYLAST2 809 + vpxord 3*VL(SRC), RNDKEYLAST, RNDKEYLAST3 810 + .else 811 + vpxord GHASHDATA0, RNDKEYLAST, RNDKEYLAST0 812 + vpxord GHASHDATA1, RNDKEYLAST, RNDKEYLAST1 813 + vpxord GHASHDATA2, RNDKEYLAST, RNDKEYLAST2 814 + vpxord GHASHDATA3, RNDKEYLAST, RNDKEYLAST3 815 + .endif 816 + 817 + // Finish the AES encryption of the counter blocks in V0-V3, interleaved 818 + // with the GHASH update of the ciphertext blocks in GHASHDATA[0-3]. 819 + .irp i, 9,8,7,6,5 820 + _vaesenc_4x RNDKEY_M\i 821 + _ghash_step_4x (9 - \i) 822 + .endr 823 + .irp i, 4,3,2,1 824 + vbroadcasti32x4 -\i*16(RNDKEYLAST_PTR), RNDKEY 825 + _vaesenc_4x RNDKEY 826 + _ghash_step_4x (9 - \i) 827 + .endr 828 + _ghash_step_4x 9 829 + 830 + // Do the last AES round. This handles the XOR with the source data 831 + // too, as per the optimization described above. 832 + vaesenclast RNDKEYLAST0, V0, GHASHDATA0 833 + vaesenclast RNDKEYLAST1, V1, GHASHDATA1 834 + vaesenclast RNDKEYLAST2, V2, GHASHDATA2 835 + vaesenclast RNDKEYLAST3, V3, GHASHDATA3 836 + 837 + // Store the en/decrypted data to DST. 
838 + vmovdqu8 GHASHDATA0, 0*VL(DST) 839 + vmovdqu8 GHASHDATA1, 1*VL(DST) 840 + vmovdqu8 GHASHDATA2, 2*VL(DST) 841 + vmovdqu8 GHASHDATA3, 3*VL(DST) 842 + 843 + add $4*VL, SRC 844 + add $4*VL, DST 845 + sub $4*VL, DATALEN 846 + jge .Lcrypt_loop_4x\@ 847 + 848 + .if \enc 849 + .Lghash_last_ciphertext_4x\@: 850 + // Update GHASH with the last set of ciphertext blocks. 851 + .irp i, 0,1,2,3,4,5,6,7,8,9 852 + _ghash_step_4x \i 853 + .endr 854 + .endif 855 + 856 + .Lcrypt_loop_4x_done\@: 857 + 858 + // Undo the extra subtraction by 4*VL and check whether data remains. 859 + add $4*VL, DATALEN 860 + jz .Ldone\@ 861 + 862 + // The data length isn't a multiple of 4*VL. Process the remaining data 863 + // of length 1 <= DATALEN < 4*VL, up to one vector (VL bytes) at a time. 864 + // Going one vector at a time may seem inefficient compared to having 865 + // separate code paths for each possible number of vectors remaining. 866 + // However, using a loop keeps the code size down, and it performs 867 + // surprisingly well; modern CPUs will start executing the next iteration 868 + // before the previous one finishes and also predict the number of loop 869 + // iterations. For a similar reason, we roll up the AES rounds. 870 + // 871 + // On the last iteration, the remaining length may be less than VL. 872 + // Handle this using masking. 873 + // 874 + // Since there are enough key powers available for all remaining data, 875 + // there is no need to do a GHASH reduction after each iteration. 876 + // Instead, multiply each remaining block by its own key power, and only 877 + // do a GHASH reduction at the very end. 878 + 879 + // Make POWERS_PTR point to the key powers [H^N, H^(N-1), ...] where N 880 + // is the number of blocks that remain. 881 + .set POWERS_PTR, LE_CTR_PTR // LE_CTR_PTR is free to be reused. 
882 + mov DATALEN, %eax 883 + neg %rax 884 + and $~15, %rax // -round_up(DATALEN, 16) 885 + lea OFFSETOFEND_H_POWERS(KEY,%rax), POWERS_PTR 886 + 887 + // Start collecting the unreduced GHASH intermediate value LO, MI, HI. 888 + .set LO, GHASHDATA0 889 + .set LO_XMM, GHASHDATA0_XMM 890 + .set MI, GHASHDATA1 891 + .set MI_XMM, GHASHDATA1_XMM 892 + .set HI, GHASHDATA2 893 + .set HI_XMM, GHASHDATA2_XMM 894 + vpxor LO_XMM, LO_XMM, LO_XMM 895 + vpxor MI_XMM, MI_XMM, MI_XMM 896 + vpxor HI_XMM, HI_XMM, HI_XMM 897 + 898 + .Lcrypt_loop_1x\@: 899 + 900 + // Select the appropriate mask for this iteration: all 1's if 901 + // DATALEN >= VL, otherwise DATALEN 1's. Do this branchlessly using the 902 + // bzhi instruction from BMI2. (This relies on DATALEN <= 255.) 903 + .if VL < 64 904 + mov $-1, %eax 905 + bzhi DATALEN, %eax, %eax 906 + kmovd %eax, %k1 907 + .else 908 + mov $-1, %rax 909 + bzhi DATALEN64, %rax, %rax 910 + kmovq %rax, %k1 911 + .endif 912 + 913 + // Encrypt a vector of counter blocks. This does not need to be masked. 914 + vpshufb BSWAP_MASK, LE_CTR, V0 915 + vpaddd LE_CTR_INC, LE_CTR, LE_CTR 916 + vpxord RNDKEY0, V0, V0 917 + lea 16(KEY), %rax 918 + 1: 919 + vbroadcasti32x4 (%rax), RNDKEY 920 + vaesenc RNDKEY, V0, V0 921 + add $16, %rax 922 + cmp %rax, RNDKEYLAST_PTR 923 + jne 1b 924 + vaesenclast RNDKEYLAST, V0, V0 925 + 926 + // XOR the data with the appropriate number of keystream bytes. 927 + vmovdqu8 (SRC), V1{%k1}{z} 928 + vpxord V1, V0, V0 929 + vmovdqu8 V0, (DST){%k1} 930 + 931 + // Update GHASH with the ciphertext block(s), without reducing. 932 + // 933 + // In the case of DATALEN < VL, the ciphertext is zero-padded to VL. 934 + // (If decrypting, it's done by the above masked load. If encrypting, 935 + // it's done by the below masked register-to-register move.) 
Note that 936 + // if DATALEN <= VL - 16, there will be additional padding beyond the 937 + // padding of the last block specified by GHASH itself; i.e., there may 938 + // be whole block(s) that get processed by the GHASH multiplication and 939 + // reduction instructions but should not actually be included in the 940 + // GHASH. However, any such blocks are all-zeroes, and the values that 941 + // they're multiplied with are also all-zeroes. Therefore they just add 942 + // 0 * 0 = 0 to the final GHASH result, which makes no difference. 943 + vmovdqu8 (POWERS_PTR), H_POW1 944 + .if \enc 945 + vmovdqu8 V0, V1{%k1}{z} 946 + .endif 947 + vpshufb BSWAP_MASK, V1, V0 948 + vpxord GHASH_ACC, V0, V0 949 + _ghash_mul_noreduce H_POW1, V0, LO, MI, HI, GHASHDATA3, V1, V2, V3 950 + vpxor GHASH_ACC_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM 951 + 952 + add $VL, POWERS_PTR 953 + add $VL, SRC 954 + add $VL, DST 955 + sub $VL, DATALEN 956 + jg .Lcrypt_loop_1x\@ 957 + 958 + // Finally, do the GHASH reduction. 959 + _ghash_reduce LO, MI, HI, GFPOLY, V0 960 + _horizontal_xor HI, HI_XMM, GHASH_ACC_XMM, %xmm0, %xmm1, %xmm2 961 + 962 + .Ldone\@: 963 + // Store the updated GHASH accumulator back to memory. 964 + vmovdqu GHASH_ACC_XMM, (GHASH_ACC_PTR) 965 + 966 + vzeroupper // This is needed after using ymm or zmm registers. 967 + RET 968 + .endm 969 + 970 + // void aes_gcm_enc_final_vaes_avx10(const struct aes_gcm_key_avx10 *key, 971 + // const u32 le_ctr[4], u8 ghash_acc[16], 972 + // u64 total_aadlen, u64 total_datalen); 973 + // bool aes_gcm_dec_final_vaes_avx10(const struct aes_gcm_key_avx10 *key, 974 + // const u32 le_ctr[4], 975 + // const u8 ghash_acc[16], 976 + // u64 total_aadlen, u64 total_datalen, 977 + // const u8 tag[16], int taglen); 978 + // 979 + // This macro generates one of the above two functions (with \enc selecting 980 + // which one). 
Both functions finish computing the GCM authentication tag by 981 + // updating GHASH with the lengths block and encrypting the GHASH accumulator. 982 + // |total_aadlen| and |total_datalen| must be the total length of the additional 983 + // authenticated data and the en/decrypted data in bytes, respectively. 984 + // 985 + // The encryption function then stores the full-length (16-byte) computed 986 + // authentication tag to |ghash_acc|. The decryption function instead loads the 987 + // expected authentication tag (the one that was transmitted) from the 16-byte 988 + // buffer |tag|, compares the first 4 <= |taglen| <= 16 bytes of it to the 989 + // computed tag in constant time, and returns true if and only if they match. 990 + .macro _aes_gcm_final enc 991 + 992 + // Function arguments 993 + .set KEY, %rdi 994 + .set LE_CTR_PTR, %rsi 995 + .set GHASH_ACC_PTR, %rdx 996 + .set TOTAL_AADLEN, %rcx 997 + .set TOTAL_DATALEN, %r8 998 + .set TAG, %r9 999 + .set TAGLEN, %r10d // Originally at 8(%rsp) 1000 + 1001 + // Additional local variables. 1002 + // %rax, %xmm0-%xmm3, and %k1 are used as temporary registers. 1003 + .set AESKEYLEN, %r11d 1004 + .set AESKEYLEN64, %r11 1005 + .set GFPOLY, %xmm4 1006 + .set BSWAP_MASK, %xmm5 1007 + .set LE_CTR, %xmm6 1008 + .set GHASH_ACC, %xmm7 1009 + .set H_POW1, %xmm8 1010 + 1011 + // Load some constants. 1012 + vmovdqa .Lgfpoly(%rip), GFPOLY 1013 + vmovdqa .Lbswap_mask(%rip), BSWAP_MASK 1014 + 1015 + // Load the AES key length in bytes. 1016 + movl OFFSETOF_AESKEYLEN(KEY), AESKEYLEN 1017 + 1018 + // Set up a counter block with 1 in the low 32-bit word. This is the 1019 + // counter that produces the ciphertext needed to encrypt the auth tag. 1020 + // GFPOLY has 1 in the low word, so grab the 1 from there using a blend. 1021 + vpblendd $0xe, (LE_CTR_PTR), GFPOLY, LE_CTR 1022 + 1023 + // Build the lengths block and XOR it with the GHASH accumulator. 
1024 + // Although the lengths block is defined as the AAD length followed by 1025 + // the en/decrypted data length, both in big-endian byte order, a byte 1026 + // reflection of the full block is needed because of the way we compute 1027 + // GHASH (see _ghash_mul_step). By using little-endian values in the 1028 + // opposite order, we avoid having to reflect any bytes here. 1029 + vmovq TOTAL_DATALEN, %xmm0 1030 + vpinsrq $1, TOTAL_AADLEN, %xmm0, %xmm0 1031 + vpsllq $3, %xmm0, %xmm0 // Bytes to bits 1032 + vpxor (GHASH_ACC_PTR), %xmm0, GHASH_ACC 1033 + 1034 + // Load the first hash key power (H^1), which is stored last. 1035 + vmovdqu8 OFFSETOFEND_H_POWERS-16(KEY), H_POW1 1036 + 1037 + .if !\enc 1038 + // Prepare a mask of TAGLEN one bits. 1039 + movl 8(%rsp), TAGLEN 1040 + mov $-1, %eax 1041 + bzhi TAGLEN, %eax, %eax 1042 + kmovd %eax, %k1 1043 + .endif 1044 + 1045 + // Make %rax point to the last AES round key for the chosen AES variant. 1046 + lea 6*16(KEY,AESKEYLEN64,4), %rax 1047 + 1048 + // Start the AES encryption of the counter block by swapping the counter 1049 + // block to big-endian and XOR-ing it with the zero-th AES round key. 1050 + vpshufb BSWAP_MASK, LE_CTR, %xmm0 1051 + vpxor (KEY), %xmm0, %xmm0 1052 + 1053 + // Complete the AES encryption and multiply GHASH_ACC by H^1. 1054 + // Interleave the AES and GHASH instructions to improve performance. 1055 + cmp $24, AESKEYLEN 1056 + jl 128f // AES-128? 1057 + je 192f // AES-192? 
1058 + // AES-256 1059 + vaesenc -13*16(%rax), %xmm0, %xmm0 1060 + vaesenc -12*16(%rax), %xmm0, %xmm0 1061 + 192: 1062 + vaesenc -11*16(%rax), %xmm0, %xmm0 1063 + vaesenc -10*16(%rax), %xmm0, %xmm0 1064 + 128: 1065 + .irp i, 0,1,2,3,4,5,6,7,8 1066 + _ghash_mul_step \i, H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \ 1067 + %xmm1, %xmm2, %xmm3 1068 + vaesenc (\i-9)*16(%rax), %xmm0, %xmm0 1069 + .endr 1070 + _ghash_mul_step 9, H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \ 1071 + %xmm1, %xmm2, %xmm3 1072 + 1073 + // Undo the byte reflection of the GHASH accumulator. 1074 + vpshufb BSWAP_MASK, GHASH_ACC, GHASH_ACC 1075 + 1076 + // Do the last AES round and XOR the resulting keystream block with the 1077 + // GHASH accumulator to produce the full computed authentication tag. 1078 + // 1079 + // Reduce latency by taking advantage of the property vaesenclast(key, 1080 + // a) ^ b == vaesenclast(key ^ b, a). I.e., XOR GHASH_ACC into the last 1081 + // round key, instead of XOR'ing the final AES output with GHASH_ACC. 1082 + // 1083 + // enc_final then returns the computed auth tag, while dec_final 1084 + // compares it with the transmitted one and returns a bool. To compare 1085 + // the tags, dec_final XORs them together and uses vptest to check 1086 + // whether the result is all-zeroes. This should be constant-time. 1087 + // dec_final applies the vaesenclast optimization to this additional 1088 + // value XOR'd too, using vpternlogd to XOR the last round key, GHASH 1089 + // accumulator, and transmitted auth tag together in one instruction. 
1090 + .if \enc 1091 + vpxor (%rax), GHASH_ACC, %xmm1 1092 + vaesenclast %xmm1, %xmm0, GHASH_ACC 1093 + vmovdqu GHASH_ACC, (GHASH_ACC_PTR) 1094 + .else 1095 + vmovdqu (TAG), %xmm1 1096 + vpternlogd $0x96, (%rax), GHASH_ACC, %xmm1 1097 + vaesenclast %xmm1, %xmm0, %xmm0 1098 + xor %eax, %eax 1099 + vmovdqu8 %xmm0, %xmm0{%k1}{z} // Truncate to TAGLEN bytes 1100 + vptest %xmm0, %xmm0 1101 + sete %al 1102 + .endif 1103 + // No need for vzeroupper here, since only xmm registers were used. 1104 + RET 1105 + .endm 1106 + 1107 + _set_veclen 32 1108 + SYM_FUNC_START(aes_gcm_precompute_vaes_avx10_256) 1109 + _aes_gcm_precompute 1110 + SYM_FUNC_END(aes_gcm_precompute_vaes_avx10_256) 1111 + SYM_FUNC_START(aes_gcm_enc_update_vaes_avx10_256) 1112 + _aes_gcm_update 1 1113 + SYM_FUNC_END(aes_gcm_enc_update_vaes_avx10_256) 1114 + SYM_FUNC_START(aes_gcm_dec_update_vaes_avx10_256) 1115 + _aes_gcm_update 0 1116 + SYM_FUNC_END(aes_gcm_dec_update_vaes_avx10_256) 1117 + 1118 + _set_veclen 64 1119 + SYM_FUNC_START(aes_gcm_precompute_vaes_avx10_512) 1120 + _aes_gcm_precompute 1121 + SYM_FUNC_END(aes_gcm_precompute_vaes_avx10_512) 1122 + SYM_FUNC_START(aes_gcm_enc_update_vaes_avx10_512) 1123 + _aes_gcm_update 1 1124 + SYM_FUNC_END(aes_gcm_enc_update_vaes_avx10_512) 1125 + SYM_FUNC_START(aes_gcm_dec_update_vaes_avx10_512) 1126 + _aes_gcm_update 0 1127 + SYM_FUNC_END(aes_gcm_dec_update_vaes_avx10_512) 1128 + 1129 + // void aes_gcm_aad_update_vaes_avx10(const struct aes_gcm_key_avx10 *key, 1130 + // u8 ghash_acc[16], 1131 + // const u8 *aad, int aadlen); 1132 + // 1133 + // This function processes the AAD (Additional Authenticated Data) in GCM. 1134 + // Using the key |key|, it updates the GHASH accumulator |ghash_acc| with the 1135 + // data given by |aad| and |aadlen|. |key->ghash_key_powers| must have been 1136 + // initialized. On the first call, |ghash_acc| must be all zeroes. |aadlen| 1137 + // must be a multiple of 16, except on the last call where it can be any length. 
1138 + // The caller must do any buffering needed to ensure this. 1139 + // 1140 + // AES-GCM is almost always used with small amounts of AAD, less than 32 bytes. 1141 + // Therefore, for AAD processing we currently only provide this implementation 1142 + // which uses 256-bit vectors (ymm registers) and only has a 1x-wide loop. This 1143 + // keeps the code size down, and it enables some micro-optimizations, e.g. using 1144 + // VEX-coded instructions instead of EVEX-coded to save some instruction bytes. 1145 + // To optimize for large amounts of AAD, we could implement a 4x-wide loop and 1146 + // provide a version using 512-bit vectors, but that doesn't seem to be useful. 1147 + SYM_FUNC_START(aes_gcm_aad_update_vaes_avx10) 1148 + 1149 + // Function arguments 1150 + .set KEY, %rdi 1151 + .set GHASH_ACC_PTR, %rsi 1152 + .set AAD, %rdx 1153 + .set AADLEN, %ecx 1154 + .set AADLEN64, %rcx // Zero-extend AADLEN before using! 1155 + 1156 + // Additional local variables. 1157 + // %rax, %ymm0-%ymm3, and %k1 are used as temporary registers. 1158 + .set BSWAP_MASK, %ymm4 1159 + .set GFPOLY, %ymm5 1160 + .set GHASH_ACC, %ymm6 1161 + .set GHASH_ACC_XMM, %xmm6 1162 + .set H_POW1, %ymm7 1163 + 1164 + // Load some constants. 1165 + vbroadcasti128 .Lbswap_mask(%rip), BSWAP_MASK 1166 + vbroadcasti128 .Lgfpoly(%rip), GFPOLY 1167 + 1168 + // Load the GHASH accumulator. 1169 + vmovdqu (GHASH_ACC_PTR), GHASH_ACC_XMM 1170 + 1171 + // Update GHASH with 32 bytes of AAD at a time. 1172 + // 1173 + // Pre-subtracting 32 from AADLEN saves an instruction from the loop and 1174 + // also ensures that at least one write always occurs to AADLEN, 1175 + // zero-extending it and allowing AADLEN64 to be used later. 
1176 + sub $32, AADLEN 1177 + jl .Laad_loop_1x_done 1178 + vmovdqu8 OFFSETOFEND_H_POWERS-32(KEY), H_POW1 // [H^2, H^1] 1179 + .Laad_loop_1x: 1180 + vmovdqu (AAD), %ymm0 1181 + vpshufb BSWAP_MASK, %ymm0, %ymm0 1182 + vpxor %ymm0, GHASH_ACC, GHASH_ACC 1183 + _ghash_mul H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \ 1184 + %ymm0, %ymm1, %ymm2 1185 + vextracti128 $1, GHASH_ACC, %xmm0 1186 + vpxor %xmm0, GHASH_ACC_XMM, GHASH_ACC_XMM 1187 + add $32, AAD 1188 + sub $32, AADLEN 1189 + jge .Laad_loop_1x 1190 + .Laad_loop_1x_done: 1191 + add $32, AADLEN 1192 + jz .Laad_done 1193 + 1194 + // Update GHASH with the remaining 1 <= AADLEN < 32 bytes of AAD. 1195 + mov $-1, %eax 1196 + bzhi AADLEN, %eax, %eax 1197 + kmovd %eax, %k1 1198 + vmovdqu8 (AAD), %ymm0{%k1}{z} 1199 + neg AADLEN64 1200 + and $~15, AADLEN64 // -round_up(AADLEN, 16) 1201 + vmovdqu8 OFFSETOFEND_H_POWERS(KEY,AADLEN64), H_POW1 1202 + vpshufb BSWAP_MASK, %ymm0, %ymm0 1203 + vpxor %ymm0, GHASH_ACC, GHASH_ACC 1204 + _ghash_mul H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \ 1205 + %ymm0, %ymm1, %ymm2 1206 + vextracti128 $1, GHASH_ACC, %xmm0 1207 + vpxor %xmm0, GHASH_ACC_XMM, GHASH_ACC_XMM 1208 + 1209 + .Laad_done: 1210 + // Store the updated GHASH accumulator back to memory. 1211 + vmovdqu GHASH_ACC_XMM, (GHASH_ACC_PTR) 1212 + 1213 + vzeroupper // This is needed after using ymm or zmm registers. 1214 + RET 1215 + SYM_FUNC_END(aes_gcm_aad_update_vaes_avx10) 1216 + 1217 + SYM_FUNC_START(aes_gcm_enc_final_vaes_avx10) 1218 + _aes_gcm_final 1 1219 + SYM_FUNC_END(aes_gcm_enc_final_vaes_avx10) 1220 + SYM_FUNC_START(aes_gcm_dec_final_vaes_avx10) 1221 + _aes_gcm_final 0 1222 + SYM_FUNC_END(aes_gcm_dec_final_vaes_avx10)
+1 -1502
arch/x86/crypto/aesni-intel_asm.S
··· 10 10 * Vinodh Gopal <vinodh.gopal@intel.com> 11 11 * Kahraman Akdemir 12 12 * 13 - * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD 14 - * interface for 64-bit kernels. 15 - * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com) 16 - * Aidan O'Mahony (aidan.o.mahony@intel.com) 17 - * Adrian Hoban <adrian.hoban@intel.com> 18 - * James Guilford (james.guilford@intel.com) 19 - * Gabriele Paoloni <gabriele.paoloni@intel.com> 20 - * Tadeusz Struk (tadeusz.struk@intel.com) 21 - * Wajdi Feghali (wajdi.k.feghali@intel.com) 22 - * Copyright (c) 2010, Intel Corporation. 13 + * Copyright (c) 2010, Intel Corporation. 23 14 * 24 15 * Ported x86_64 version to x86: 25 16 * Author: Mathias Krause <minipli@googlemail.com> ··· 18 27 19 28 #include <linux/linkage.h> 20 29 #include <asm/frame.h> 21 - #include <asm/nospec-branch.h> 22 - 23 - /* 24 - * The following macros are used to move an (un)aligned 16 byte value to/from 25 - * an XMM register. This can done for either FP or integer values, for FP use 26 - * movaps (move aligned packed single) or integer use movdqa (move double quad 27 - * aligned). It doesn't make a performance difference which instruction is used 28 - * since Nehalem (original Core i7) was released. However, the movaps is a byte 29 - * shorter, so that is the one we'll use for now. (same for unaligned). 
30 - */ 31 - #define MOVADQ movaps 32 - #define MOVUDQ movups 33 - 34 - #ifdef __x86_64__ 35 - 36 - # constants in mergeable sections, linker can reorder and merge 37 - .section .rodata.cst16.POLY, "aM", @progbits, 16 38 - .align 16 39 - POLY: .octa 0xC2000000000000000000000000000001 40 - .section .rodata.cst16.TWOONE, "aM", @progbits, 16 41 - .align 16 42 - TWOONE: .octa 0x00000001000000000000000000000001 43 - 44 - .section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16 45 - .align 16 46 - SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F 47 - .section .rodata.cst16.MASK1, "aM", @progbits, 16 48 - .align 16 49 - MASK1: .octa 0x0000000000000000ffffffffffffffff 50 - .section .rodata.cst16.MASK2, "aM", @progbits, 16 51 - .align 16 52 - MASK2: .octa 0xffffffffffffffff0000000000000000 53 - .section .rodata.cst16.ONE, "aM", @progbits, 16 54 - .align 16 55 - ONE: .octa 0x00000000000000000000000000000001 56 - .section .rodata.cst16.F_MIN_MASK, "aM", @progbits, 16 57 - .align 16 58 - F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0 59 - .section .rodata.cst16.dec, "aM", @progbits, 16 60 - .align 16 61 - dec: .octa 0x1 62 - .section .rodata.cst16.enc, "aM", @progbits, 16 63 - .align 16 64 - enc: .octa 0x2 65 - 66 - # order of these constants should not change. 
67 - # more specifically, ALL_F should follow SHIFT_MASK, 68 - # and zero should follow ALL_F 69 - .section .rodata, "a", @progbits 70 - .align 16 71 - SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100 72 - ALL_F: .octa 0xffffffffffffffffffffffffffffffff 73 - .octa 0x00000000000000000000000000000000 74 - 75 - .text 76 - 77 - #define AadHash 16*0 78 - #define AadLen 16*1 79 - #define InLen (16*1)+8 80 - #define PBlockEncKey 16*2 81 - #define OrigIV 16*3 82 - #define CurCount 16*4 83 - #define PBlockLen 16*5 84 - #define HashKey 16*6 // store HashKey <<1 mod poly here 85 - #define HashKey_2 16*7 // store HashKey^2 <<1 mod poly here 86 - #define HashKey_3 16*8 // store HashKey^3 <<1 mod poly here 87 - #define HashKey_4 16*9 // store HashKey^4 <<1 mod poly here 88 - #define HashKey_k 16*10 // store XOR of High 64 bits and Low 64 89 - // bits of HashKey <<1 mod poly here 90 - //(for Karatsuba purposes) 91 - #define HashKey_2_k 16*11 // store XOR of High 64 bits and Low 64 92 - // bits of HashKey^2 <<1 mod poly here 93 - // (for Karatsuba purposes) 94 - #define HashKey_3_k 16*12 // store XOR of High 64 bits and Low 64 95 - // bits of HashKey^3 <<1 mod poly here 96 - // (for Karatsuba purposes) 97 - #define HashKey_4_k 16*13 // store XOR of High 64 bits and Low 64 98 - // bits of HashKey^4 <<1 mod poly here 99 - // (for Karatsuba purposes) 100 - 101 - #define arg1 rdi 102 - #define arg2 rsi 103 - #define arg3 rdx 104 - #define arg4 rcx 105 - #define arg5 r8 106 - #define arg6 r9 107 - #define keysize 2*15*16(%arg1) 108 - #endif 109 - 110 30 111 31 #define STATE1 %xmm0 112 32 #define STATE2 %xmm4 ··· 62 160 #define KLEN %ebx 63 161 #define T1 %ecx 64 162 #define TKEYP T1 65 - #endif 66 - 67 - .macro FUNC_SAVE 68 - push %r12 69 - push %r13 70 - push %r14 71 - # 72 - # states of %xmm registers %xmm6:%xmm15 not saved 73 - # all %xmm registers are clobbered 74 - # 75 - .endm 76 - 77 - 78 - .macro FUNC_RESTORE 79 - pop %r14 80 - pop %r13 81 - pop %r12 82 - .endm 83 - 84 - # 
Precompute hashkeys. 85 - # Input: Hash subkey. 86 - # Output: HashKeys stored in gcm_context_data. Only needs to be called 87 - # once per key. 88 - # clobbers r12, and tmp xmm registers. 89 - .macro PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7 90 - mov \SUBKEY, %r12 91 - movdqu (%r12), \TMP3 92 - movdqa SHUF_MASK(%rip), \TMP2 93 - pshufb \TMP2, \TMP3 94 - 95 - # precompute HashKey<<1 mod poly from the HashKey (required for GHASH) 96 - 97 - movdqa \TMP3, \TMP2 98 - psllq $1, \TMP3 99 - psrlq $63, \TMP2 100 - movdqa \TMP2, \TMP1 101 - pslldq $8, \TMP2 102 - psrldq $8, \TMP1 103 - por \TMP2, \TMP3 104 - 105 - # reduce HashKey<<1 106 - 107 - pshufd $0x24, \TMP1, \TMP2 108 - pcmpeqd TWOONE(%rip), \TMP2 109 - pand POLY(%rip), \TMP2 110 - pxor \TMP2, \TMP3 111 - movdqu \TMP3, HashKey(%arg2) 112 - 113 - movdqa \TMP3, \TMP5 114 - pshufd $78, \TMP3, \TMP1 115 - pxor \TMP3, \TMP1 116 - movdqu \TMP1, HashKey_k(%arg2) 117 - 118 - GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 119 - # TMP5 = HashKey^2<<1 (mod poly) 120 - movdqu \TMP5, HashKey_2(%arg2) 121 - # HashKey_2 = HashKey^2<<1 (mod poly) 122 - pshufd $78, \TMP5, \TMP1 123 - pxor \TMP5, \TMP1 124 - movdqu \TMP1, HashKey_2_k(%arg2) 125 - 126 - GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 127 - # TMP5 = HashKey^3<<1 (mod poly) 128 - movdqu \TMP5, HashKey_3(%arg2) 129 - pshufd $78, \TMP5, \TMP1 130 - pxor \TMP5, \TMP1 131 - movdqu \TMP1, HashKey_3_k(%arg2) 132 - 133 - GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 134 - # TMP5 = HashKey^3<<1 (mod poly) 135 - movdqu \TMP5, HashKey_4(%arg2) 136 - pshufd $78, \TMP5, \TMP1 137 - pxor \TMP5, \TMP1 138 - movdqu \TMP1, HashKey_4_k(%arg2) 139 - .endm 140 - 141 - # GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding. 
142 - # Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13 143 - .macro GCM_INIT Iv SUBKEY AAD AADLEN 144 - mov \AADLEN, %r11 145 - mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length 146 - xor %r11d, %r11d 147 - mov %r11, InLen(%arg2) # ctx_data.in_length = 0 148 - mov %r11, PBlockLen(%arg2) # ctx_data.partial_block_length = 0 149 - mov %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0 150 - mov \Iv, %rax 151 - movdqu (%rax), %xmm0 152 - movdqu %xmm0, OrigIV(%arg2) # ctx_data.orig_IV = iv 153 - 154 - movdqa SHUF_MASK(%rip), %xmm2 155 - pshufb %xmm2, %xmm0 156 - movdqu %xmm0, CurCount(%arg2) # ctx_data.current_counter = iv 157 - 158 - PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7 159 - movdqu HashKey(%arg2), %xmm13 160 - 161 - CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \ 162 - %xmm4, %xmm5, %xmm6 163 - .endm 164 - 165 - # GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context 166 - # struct has been initialized by GCM_INIT. 
167 - # Requires the input data be at least 1 byte long because of READ_PARTIAL_BLOCK 168 - # Clobbers rax, r10-r13, and xmm0-xmm15 169 - .macro GCM_ENC_DEC operation 170 - movdqu AadHash(%arg2), %xmm8 171 - movdqu HashKey(%arg2), %xmm13 172 - add %arg5, InLen(%arg2) 173 - 174 - xor %r11d, %r11d # initialise the data pointer offset as zero 175 - PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation 176 - 177 - sub %r11, %arg5 # sub partial block data used 178 - mov %arg5, %r13 # save the number of bytes 179 - 180 - and $-16, %r13 # %r13 = %r13 - (%r13 mod 16) 181 - mov %r13, %r12 182 - # Encrypt/Decrypt first few blocks 183 - 184 - and $(3<<4), %r12 185 - jz .L_initial_num_blocks_is_0_\@ 186 - cmp $(2<<4), %r12 187 - jb .L_initial_num_blocks_is_1_\@ 188 - je .L_initial_num_blocks_is_2_\@ 189 - .L_initial_num_blocks_is_3_\@: 190 - INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 191 - %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation 192 - sub $48, %r13 193 - jmp .L_initial_blocks_\@ 194 - .L_initial_num_blocks_is_2_\@: 195 - INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 196 - %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation 197 - sub $32, %r13 198 - jmp .L_initial_blocks_\@ 199 - .L_initial_num_blocks_is_1_\@: 200 - INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 201 - %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation 202 - sub $16, %r13 203 - jmp .L_initial_blocks_\@ 204 - .L_initial_num_blocks_is_0_\@: 205 - INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 206 - %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation 207 - .L_initial_blocks_\@: 208 - 209 - # Main loop - Encrypt/Decrypt remaining blocks 210 - 211 - test %r13, %r13 212 - je .L_zero_cipher_left_\@ 213 - sub $64, %r13 214 - je .L_four_cipher_left_\@ 215 - .L_crypt_by_4_\@: 216 - GHASH_4_ENCRYPT_4_PARALLEL_\operation %xmm9, %xmm10, %xmm11, %xmm12, \ 
217 - %xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \ 218 - %xmm7, %xmm8, enc 219 - add $64, %r11 220 - sub $64, %r13 221 - jne .L_crypt_by_4_\@ 222 - .L_four_cipher_left_\@: 223 - GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \ 224 - %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8 225 - .L_zero_cipher_left_\@: 226 - movdqu %xmm8, AadHash(%arg2) 227 - movdqu %xmm0, CurCount(%arg2) 228 - 229 - mov %arg5, %r13 230 - and $15, %r13 # %r13 = arg5 (mod 16) 231 - je .L_multiple_of_16_bytes_\@ 232 - 233 - mov %r13, PBlockLen(%arg2) 234 - 235 - # Handle the last <16 Byte block separately 236 - paddd ONE(%rip), %xmm0 # INCR CNT to get Yn 237 - movdqu %xmm0, CurCount(%arg2) 238 - movdqa SHUF_MASK(%rip), %xmm10 239 - pshufb %xmm10, %xmm0 240 - 241 - ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn) 242 - movdqu %xmm0, PBlockEncKey(%arg2) 243 - 244 - cmp $16, %arg5 245 - jge .L_large_enough_update_\@ 246 - 247 - lea (%arg4,%r11,1), %r10 248 - mov %r13, %r12 249 - READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1 250 - jmp .L_data_read_\@ 251 - 252 - .L_large_enough_update_\@: 253 - sub $16, %r11 254 - add %r13, %r11 255 - 256 - # receive the last <16 Byte block 257 - movdqu (%arg4, %r11, 1), %xmm1 258 - 259 - sub %r13, %r11 260 - add $16, %r11 261 - 262 - lea SHIFT_MASK+16(%rip), %r12 263 - # adjust the shuffle mask pointer to be able to shift 16-r13 bytes 264 - # (r13 is the number of bytes in plaintext mod 16) 265 - sub %r13, %r12 266 - # get the appropriate shuffle mask 267 - movdqu (%r12), %xmm2 268 - # shift right 16-r13 bytes 269 - pshufb %xmm2, %xmm1 270 - 271 - .L_data_read_\@: 272 - lea ALL_F+16(%rip), %r12 273 - sub %r13, %r12 274 - 275 - .ifc \operation, dec 276 - movdqa %xmm1, %xmm2 277 - .endif 278 - pxor %xmm1, %xmm0 # XOR Encrypt(K, Yn) 279 - movdqu (%r12), %xmm1 280 - # get the appropriate mask to mask out top 16-r13 bytes of xmm0 281 - pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0 282 - .ifc \operation, dec 283 - pand %xmm1, %xmm2 284 
- movdqa SHUF_MASK(%rip), %xmm10 285 - pshufb %xmm10 ,%xmm2 286 - 287 - pxor %xmm2, %xmm8 288 - .else 289 - movdqa SHUF_MASK(%rip), %xmm10 290 - pshufb %xmm10,%xmm0 291 - 292 - pxor %xmm0, %xmm8 293 - .endif 294 - 295 - movdqu %xmm8, AadHash(%arg2) 296 - .ifc \operation, enc 297 - # GHASH computation for the last <16 byte block 298 - movdqa SHUF_MASK(%rip), %xmm10 299 - # shuffle xmm0 back to output as ciphertext 300 - pshufb %xmm10, %xmm0 301 - .endif 302 - 303 - # Output %r13 bytes 304 - movq %xmm0, %rax 305 - cmp $8, %r13 306 - jle .L_less_than_8_bytes_left_\@ 307 - mov %rax, (%arg3 , %r11, 1) 308 - add $8, %r11 309 - psrldq $8, %xmm0 310 - movq %xmm0, %rax 311 - sub $8, %r13 312 - .L_less_than_8_bytes_left_\@: 313 - mov %al, (%arg3, %r11, 1) 314 - add $1, %r11 315 - shr $8, %rax 316 - sub $1, %r13 317 - jne .L_less_than_8_bytes_left_\@ 318 - .L_multiple_of_16_bytes_\@: 319 - .endm 320 - 321 - # GCM_COMPLETE Finishes update of tag of last partial block 322 - # Output: Authorization Tag (AUTH_TAG) 323 - # Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15 324 - .macro GCM_COMPLETE AUTHTAG AUTHTAGLEN 325 - movdqu AadHash(%arg2), %xmm8 326 - movdqu HashKey(%arg2), %xmm13 327 - 328 - mov PBlockLen(%arg2), %r12 329 - 330 - test %r12, %r12 331 - je .L_partial_done\@ 332 - 333 - GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 334 - 335 - .L_partial_done\@: 336 - mov AadLen(%arg2), %r12 # %r13 = aadLen (number of bytes) 337 - shl $3, %r12 # convert into number of bits 338 - movd %r12d, %xmm15 # len(A) in %xmm15 339 - mov InLen(%arg2), %r12 340 - shl $3, %r12 # len(C) in bits (*128) 341 - movq %r12, %xmm1 342 - 343 - pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000 344 - pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C) 345 - pxor %xmm15, %xmm8 346 - GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 347 - # final GHASH computation 348 - movdqa SHUF_MASK(%rip), %xmm10 349 - pshufb %xmm10, %xmm8 350 - 351 - movdqu OrigIV(%arg2), %xmm0 # %xmm0 = Y0 
352 - ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0) 353 - pxor %xmm8, %xmm0 354 - .L_return_T_\@: 355 - mov \AUTHTAG, %r10 # %r10 = authTag 356 - mov \AUTHTAGLEN, %r11 # %r11 = auth_tag_len 357 - cmp $16, %r11 358 - je .L_T_16_\@ 359 - cmp $8, %r11 360 - jl .L_T_4_\@ 361 - .L_T_8_\@: 362 - movq %xmm0, %rax 363 - mov %rax, (%r10) 364 - add $8, %r10 365 - sub $8, %r11 366 - psrldq $8, %xmm0 367 - test %r11, %r11 368 - je .L_return_T_done_\@ 369 - .L_T_4_\@: 370 - movd %xmm0, %eax 371 - mov %eax, (%r10) 372 - add $4, %r10 373 - sub $4, %r11 374 - psrldq $4, %xmm0 375 - test %r11, %r11 376 - je .L_return_T_done_\@ 377 - .L_T_123_\@: 378 - movd %xmm0, %eax 379 - cmp $2, %r11 380 - jl .L_T_1_\@ 381 - mov %ax, (%r10) 382 - cmp $2, %r11 383 - je .L_return_T_done_\@ 384 - add $2, %r10 385 - sar $16, %eax 386 - .L_T_1_\@: 387 - mov %al, (%r10) 388 - jmp .L_return_T_done_\@ 389 - .L_T_16_\@: 390 - movdqu %xmm0, (%r10) 391 - .L_return_T_done_\@: 392 - .endm 393 - 394 - #ifdef __x86_64__ 395 - /* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) 396 - * 397 - * 398 - * Input: A and B (128-bits each, bit-reflected) 399 - * Output: C = A*B*x mod poly, (i.e. >>1 ) 400 - * To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input 401 - * GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. 
402 - * 403 - */ 404 - .macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5 405 - movdqa \GH, \TMP1 406 - pshufd $78, \GH, \TMP2 407 - pshufd $78, \HK, \TMP3 408 - pxor \GH, \TMP2 # TMP2 = a1+a0 409 - pxor \HK, \TMP3 # TMP3 = b1+b0 410 - pclmulqdq $0x11, \HK, \TMP1 # TMP1 = a1*b1 411 - pclmulqdq $0x00, \HK, \GH # GH = a0*b0 412 - pclmulqdq $0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0) 413 - pxor \GH, \TMP2 414 - pxor \TMP1, \TMP2 # TMP2 = (a0*b0)+(a1*b0) 415 - movdqa \TMP2, \TMP3 416 - pslldq $8, \TMP3 # left shift TMP3 2 DWs 417 - psrldq $8, \TMP2 # right shift TMP2 2 DWs 418 - pxor \TMP3, \GH 419 - pxor \TMP2, \TMP1 # TMP2:GH holds the result of GH*HK 420 - 421 - # first phase of the reduction 422 - 423 - movdqa \GH, \TMP2 424 - movdqa \GH, \TMP3 425 - movdqa \GH, \TMP4 # copy GH into TMP2,TMP3 and TMP4 426 - # in in order to perform 427 - # independent shifts 428 - pslld $31, \TMP2 # packed right shift <<31 429 - pslld $30, \TMP3 # packed right shift <<30 430 - pslld $25, \TMP4 # packed right shift <<25 431 - pxor \TMP3, \TMP2 # xor the shifted versions 432 - pxor \TMP4, \TMP2 433 - movdqa \TMP2, \TMP5 434 - psrldq $4, \TMP5 # right shift TMP5 1 DW 435 - pslldq $12, \TMP2 # left shift TMP2 3 DWs 436 - pxor \TMP2, \GH 437 - 438 - # second phase of the reduction 439 - 440 - movdqa \GH,\TMP2 # copy GH into TMP2,TMP3 and TMP4 441 - # in in order to perform 442 - # independent shifts 443 - movdqa \GH,\TMP3 444 - movdqa \GH,\TMP4 445 - psrld $1,\TMP2 # packed left shift >>1 446 - psrld $2,\TMP3 # packed left shift >>2 447 - psrld $7,\TMP4 # packed left shift >>7 448 - pxor \TMP3,\TMP2 # xor the shifted versions 449 - pxor \TMP4,\TMP2 450 - pxor \TMP5, \TMP2 451 - pxor \TMP2, \GH 452 - pxor \TMP1, \GH # result is in TMP1 453 - .endm 454 - 455 - # Reads DLEN bytes starting at DPTR and stores in XMMDst 456 - # where 0 < DLEN < 16 457 - # Clobbers %rax, DLEN and XMM1 458 - .macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst 459 - cmp $8, \DLEN 460 - jl .L_read_lt8_\@ 461 - mov 
(\DPTR), %rax 462 - movq %rax, \XMMDst 463 - sub $8, \DLEN 464 - jz .L_done_read_partial_block_\@ 465 - xor %eax, %eax 466 - .L_read_next_byte_\@: 467 - shl $8, %rax 468 - mov 7(\DPTR, \DLEN, 1), %al 469 - dec \DLEN 470 - jnz .L_read_next_byte_\@ 471 - movq %rax, \XMM1 472 - pslldq $8, \XMM1 473 - por \XMM1, \XMMDst 474 - jmp .L_done_read_partial_block_\@ 475 - .L_read_lt8_\@: 476 - xor %eax, %eax 477 - .L_read_next_byte_lt8_\@: 478 - shl $8, %rax 479 - mov -1(\DPTR, \DLEN, 1), %al 480 - dec \DLEN 481 - jnz .L_read_next_byte_lt8_\@ 482 - movq %rax, \XMMDst 483 - .L_done_read_partial_block_\@: 484 - .endm 485 - 486 - # CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted. 487 - # clobbers r10-11, xmm14 488 - .macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \ 489 - TMP6 TMP7 490 - MOVADQ SHUF_MASK(%rip), %xmm14 491 - mov \AAD, %r10 # %r10 = AAD 492 - mov \AADLEN, %r11 # %r11 = aadLen 493 - pxor \TMP7, \TMP7 494 - pxor \TMP6, \TMP6 495 - 496 - cmp $16, %r11 497 - jl .L_get_AAD_rest\@ 498 - .L_get_AAD_blocks\@: 499 - movdqu (%r10), \TMP7 500 - pshufb %xmm14, \TMP7 # byte-reflect the AAD data 501 - pxor \TMP7, \TMP6 502 - GHASH_MUL \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5 503 - add $16, %r10 504 - sub $16, %r11 505 - cmp $16, %r11 506 - jge .L_get_AAD_blocks\@ 507 - 508 - movdqu \TMP6, \TMP7 509 - 510 - /* read the last <16B of AAD */ 511 - .L_get_AAD_rest\@: 512 - test %r11, %r11 513 - je .L_get_AAD_done\@ 514 - 515 - READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7 516 - pshufb %xmm14, \TMP7 # byte-reflect the AAD data 517 - pxor \TMP6, \TMP7 518 - GHASH_MUL \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5 519 - movdqu \TMP7, \TMP6 520 - 521 - .L_get_AAD_done\@: 522 - movdqu \TMP6, AadHash(%arg2) 523 - .endm 524 - 525 - # PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks 526 - # between update calls. 
527 - # Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK 528 - # Outputs encrypted bytes, and updates hash and partial info in gcm_data_context 529 - # Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13 530 - .macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \ 531 - AAD_HASH operation 532 - mov PBlockLen(%arg2), %r13 533 - test %r13, %r13 534 - je .L_partial_block_done_\@ # Leave Macro if no partial blocks 535 - # Read in input data without over reading 536 - cmp $16, \PLAIN_CYPH_LEN 537 - jl .L_fewer_than_16_bytes_\@ 538 - movups (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm 539 - jmp .L_data_read_\@ 540 - 541 - .L_fewer_than_16_bytes_\@: 542 - lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10 543 - mov \PLAIN_CYPH_LEN, %r12 544 - READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1 545 - 546 - mov PBlockLen(%arg2), %r13 547 - 548 - .L_data_read_\@: # Finished reading in data 549 - 550 - movdqu PBlockEncKey(%arg2), %xmm9 551 - movdqu HashKey(%arg2), %xmm13 552 - 553 - lea SHIFT_MASK(%rip), %r12 554 - 555 - # adjust the shuffle mask pointer to be able to shift r13 bytes 556 - # r16-r13 is the number of bytes in plaintext mod 16) 557 - add %r13, %r12 558 - movdqu (%r12), %xmm2 # get the appropriate shuffle mask 559 - pshufb %xmm2, %xmm9 # shift right r13 bytes 560 - 561 - .ifc \operation, dec 562 - movdqa %xmm1, %xmm3 563 - pxor %xmm1, %xmm9 # Ciphertext XOR E(K, Yn) 564 - 565 - mov \PLAIN_CYPH_LEN, %r10 566 - add %r13, %r10 567 - # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling 568 - sub $16, %r10 569 - # Determine if partial block is not being filled and 570 - # shift mask accordingly 571 - jge .L_no_extra_mask_1_\@ 572 - sub %r10, %r12 573 - .L_no_extra_mask_1_\@: 574 - 575 - movdqu ALL_F-SHIFT_MASK(%r12), %xmm1 576 - # get the appropriate mask to mask out bottom r13 bytes of xmm9 577 - pand %xmm1, %xmm9 # mask out bottom r13 bytes of xmm9 578 - 579 - pand %xmm1, %xmm3 580 - movdqa 
SHUF_MASK(%rip), %xmm10 581 - pshufb %xmm10, %xmm3 582 - pshufb %xmm2, %xmm3 583 - pxor %xmm3, \AAD_HASH 584 - 585 - test %r10, %r10 586 - jl .L_partial_incomplete_1_\@ 587 - 588 - # GHASH computation for the last <16 Byte block 589 - GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 590 - xor %eax, %eax 591 - 592 - mov %rax, PBlockLen(%arg2) 593 - jmp .L_dec_done_\@ 594 - .L_partial_incomplete_1_\@: 595 - add \PLAIN_CYPH_LEN, PBlockLen(%arg2) 596 - .L_dec_done_\@: 597 - movdqu \AAD_HASH, AadHash(%arg2) 598 - .else 599 - pxor %xmm1, %xmm9 # Plaintext XOR E(K, Yn) 600 - 601 - mov \PLAIN_CYPH_LEN, %r10 602 - add %r13, %r10 603 - # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling 604 - sub $16, %r10 605 - # Determine if partial block is not being filled and 606 - # shift mask accordingly 607 - jge .L_no_extra_mask_2_\@ 608 - sub %r10, %r12 609 - .L_no_extra_mask_2_\@: 610 - 611 - movdqu ALL_F-SHIFT_MASK(%r12), %xmm1 612 - # get the appropriate mask to mask out bottom r13 bytes of xmm9 613 - pand %xmm1, %xmm9 614 - 615 - movdqa SHUF_MASK(%rip), %xmm1 616 - pshufb %xmm1, %xmm9 617 - pshufb %xmm2, %xmm9 618 - pxor %xmm9, \AAD_HASH 619 - 620 - test %r10, %r10 621 - jl .L_partial_incomplete_2_\@ 622 - 623 - # GHASH computation for the last <16 Byte block 624 - GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 625 - xor %eax, %eax 626 - 627 - mov %rax, PBlockLen(%arg2) 628 - jmp .L_encode_done_\@ 629 - .L_partial_incomplete_2_\@: 630 - add \PLAIN_CYPH_LEN, PBlockLen(%arg2) 631 - .L_encode_done_\@: 632 - movdqu \AAD_HASH, AadHash(%arg2) 633 - 634 - movdqa SHUF_MASK(%rip), %xmm10 635 - # shuffle xmm9 back to output as ciphertext 636 - pshufb %xmm10, %xmm9 637 - pshufb %xmm2, %xmm9 638 - .endif 639 - # output encrypted Bytes 640 - test %r10, %r10 641 - jl .L_partial_fill_\@ 642 - mov %r13, %r12 643 - mov $16, %r13 644 - # Set r13 to be the number of bytes to write out 645 - sub %r12, %r13 646 - jmp .L_count_set_\@ 647 - 
.L_partial_fill_\@: 648 - mov \PLAIN_CYPH_LEN, %r13 649 - .L_count_set_\@: 650 - movdqa %xmm9, %xmm0 651 - movq %xmm0, %rax 652 - cmp $8, %r13 653 - jle .L_less_than_8_bytes_left_\@ 654 - 655 - mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) 656 - add $8, \DATA_OFFSET 657 - psrldq $8, %xmm0 658 - movq %xmm0, %rax 659 - sub $8, %r13 660 - .L_less_than_8_bytes_left_\@: 661 - movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) 662 - add $1, \DATA_OFFSET 663 - shr $8, %rax 664 - sub $1, %r13 665 - jne .L_less_than_8_bytes_left_\@ 666 - .L_partial_block_done_\@: 667 - .endm # PARTIAL_BLOCK 668 - 669 - /* 670 - * if a = number of total plaintext bytes 671 - * b = floor(a/16) 672 - * num_initial_blocks = b mod 4 673 - * encrypt the initial num_initial_blocks blocks and apply ghash on 674 - * the ciphertext 675 - * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers 676 - * are clobbered 677 - * arg1, %arg2, %arg3 are used as a pointer only, not modified 678 - */ 679 - 680 - 681 - .macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \ 682 - XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation 683 - MOVADQ SHUF_MASK(%rip), %xmm14 684 - 685 - movdqu AadHash(%arg2), %xmm\i # XMM0 = Y0 686 - 687 - # start AES for num_initial_blocks blocks 688 - 689 - movdqu CurCount(%arg2), \XMM0 # XMM0 = Y0 690 - 691 - .if (\i == 5) || (\i == 6) || (\i == 7) 692 - 693 - MOVADQ ONE(%RIP),\TMP1 694 - MOVADQ 0(%arg1),\TMP2 695 - .irpc index, \i_seq 696 - paddd \TMP1, \XMM0 # INCR Y0 697 - .ifc \operation, dec 698 - movdqa \XMM0, %xmm\index 699 - .else 700 - MOVADQ \XMM0, %xmm\index 701 - .endif 702 - pshufb %xmm14, %xmm\index # perform a 16 byte swap 703 - pxor \TMP2, %xmm\index 704 - .endr 705 - lea 0x10(%arg1),%r10 706 - mov keysize,%eax 707 - shr $2,%eax # 128->4, 192->6, 256->8 708 - add $5,%eax # 128->9, 192->11, 256->13 709 - 710 - .Laes_loop_initial_\@: 711 - MOVADQ (%r10),\TMP1 712 - .irpc index, \i_seq 713 - aesenc \TMP1, %xmm\index 714 - .endr 715 - add $16,%r10 716 - 
sub $1,%eax 717 - jnz .Laes_loop_initial_\@ 718 - 719 - MOVADQ (%r10), \TMP1 720 - .irpc index, \i_seq 721 - aesenclast \TMP1, %xmm\index # Last Round 722 - .endr 723 - .irpc index, \i_seq 724 - movdqu (%arg4 , %r11, 1), \TMP1 725 - pxor \TMP1, %xmm\index 726 - movdqu %xmm\index, (%arg3 , %r11, 1) 727 - # write back plaintext/ciphertext for num_initial_blocks 728 - add $16, %r11 729 - 730 - .ifc \operation, dec 731 - movdqa \TMP1, %xmm\index 732 - .endif 733 - pshufb %xmm14, %xmm\index 734 - 735 - # prepare plaintext/ciphertext for GHASH computation 736 - .endr 737 - .endif 738 - 739 - # apply GHASH on num_initial_blocks blocks 740 - 741 - .if \i == 5 742 - pxor %xmm5, %xmm6 743 - GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 744 - pxor %xmm6, %xmm7 745 - GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 746 - pxor %xmm7, %xmm8 747 - GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 748 - .elseif \i == 6 749 - pxor %xmm6, %xmm7 750 - GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 751 - pxor %xmm7, %xmm8 752 - GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 753 - .elseif \i == 7 754 - pxor %xmm7, %xmm8 755 - GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 756 - .endif 757 - cmp $64, %r13 758 - jl .L_initial_blocks_done\@ 759 - # no need for precomputed values 760 - /* 761 - * 762 - * Precomputations for HashKey parallel with encryption of first 4 blocks. 
763 - * Haskey_i_k holds XORed values of the low and high parts of the Haskey_i 764 - */ 765 - MOVADQ ONE(%RIP),\TMP1 766 - paddd \TMP1, \XMM0 # INCR Y0 767 - MOVADQ \XMM0, \XMM1 768 - pshufb %xmm14, \XMM1 # perform a 16 byte swap 769 - 770 - paddd \TMP1, \XMM0 # INCR Y0 771 - MOVADQ \XMM0, \XMM2 772 - pshufb %xmm14, \XMM2 # perform a 16 byte swap 773 - 774 - paddd \TMP1, \XMM0 # INCR Y0 775 - MOVADQ \XMM0, \XMM3 776 - pshufb %xmm14, \XMM3 # perform a 16 byte swap 777 - 778 - paddd \TMP1, \XMM0 # INCR Y0 779 - MOVADQ \XMM0, \XMM4 780 - pshufb %xmm14, \XMM4 # perform a 16 byte swap 781 - 782 - MOVADQ 0(%arg1),\TMP1 783 - pxor \TMP1, \XMM1 784 - pxor \TMP1, \XMM2 785 - pxor \TMP1, \XMM3 786 - pxor \TMP1, \XMM4 787 - .irpc index, 1234 # do 4 rounds 788 - movaps 0x10*\index(%arg1), \TMP1 789 - aesenc \TMP1, \XMM1 790 - aesenc \TMP1, \XMM2 791 - aesenc \TMP1, \XMM3 792 - aesenc \TMP1, \XMM4 793 - .endr 794 - .irpc index, 56789 # do next 5 rounds 795 - movaps 0x10*\index(%arg1), \TMP1 796 - aesenc \TMP1, \XMM1 797 - aesenc \TMP1, \XMM2 798 - aesenc \TMP1, \XMM3 799 - aesenc \TMP1, \XMM4 800 - .endr 801 - lea 0xa0(%arg1),%r10 802 - mov keysize,%eax 803 - shr $2,%eax # 128->4, 192->6, 256->8 804 - sub $4,%eax # 128->0, 192->2, 256->4 805 - jz .Laes_loop_pre_done\@ 806 - 807 - .Laes_loop_pre_\@: 808 - MOVADQ (%r10),\TMP2 809 - .irpc index, 1234 810 - aesenc \TMP2, %xmm\index 811 - .endr 812 - add $16,%r10 813 - sub $1,%eax 814 - jnz .Laes_loop_pre_\@ 815 - 816 - .Laes_loop_pre_done\@: 817 - MOVADQ (%r10), \TMP2 818 - aesenclast \TMP2, \XMM1 819 - aesenclast \TMP2, \XMM2 820 - aesenclast \TMP2, \XMM3 821 - aesenclast \TMP2, \XMM4 822 - movdqu 16*0(%arg4 , %r11 , 1), \TMP1 823 - pxor \TMP1, \XMM1 824 - .ifc \operation, dec 825 - movdqu \XMM1, 16*0(%arg3 , %r11 , 1) 826 - movdqa \TMP1, \XMM1 827 - .endif 828 - movdqu 16*1(%arg4 , %r11 , 1), \TMP1 829 - pxor \TMP1, \XMM2 830 - .ifc \operation, dec 831 - movdqu \XMM2, 16*1(%arg3 , %r11 , 1) 832 - movdqa \TMP1, \XMM2 833 - .endif 
834 - movdqu 16*2(%arg4 , %r11 , 1), \TMP1 835 - pxor \TMP1, \XMM3 836 - .ifc \operation, dec 837 - movdqu \XMM3, 16*2(%arg3 , %r11 , 1) 838 - movdqa \TMP1, \XMM3 839 - .endif 840 - movdqu 16*3(%arg4 , %r11 , 1), \TMP1 841 - pxor \TMP1, \XMM4 842 - .ifc \operation, dec 843 - movdqu \XMM4, 16*3(%arg3 , %r11 , 1) 844 - movdqa \TMP1, \XMM4 845 - .else 846 - movdqu \XMM1, 16*0(%arg3 , %r11 , 1) 847 - movdqu \XMM2, 16*1(%arg3 , %r11 , 1) 848 - movdqu \XMM3, 16*2(%arg3 , %r11 , 1) 849 - movdqu \XMM4, 16*3(%arg3 , %r11 , 1) 850 - .endif 851 - 852 - add $64, %r11 853 - pshufb %xmm14, \XMM1 # perform a 16 byte swap 854 - pxor \XMMDst, \XMM1 855 - # combine GHASHed value with the corresponding ciphertext 856 - pshufb %xmm14, \XMM2 # perform a 16 byte swap 857 - pshufb %xmm14, \XMM3 # perform a 16 byte swap 858 - pshufb %xmm14, \XMM4 # perform a 16 byte swap 859 - 860 - .L_initial_blocks_done\@: 861 - 862 - .endm 863 - 864 - /* 865 - * encrypt 4 blocks at a time 866 - * ghash the 4 previously encrypted ciphertext blocks 867 - * arg1, %arg3, %arg4 are used as pointers only, not modified 868 - * %r11 is the data offset value 869 - */ 870 - .macro GHASH_4_ENCRYPT_4_PARALLEL_enc TMP1 TMP2 TMP3 TMP4 TMP5 \ 871 - TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation 872 - 873 - movdqa \XMM1, \XMM5 874 - movdqa \XMM2, \XMM6 875 - movdqa \XMM3, \XMM7 876 - movdqa \XMM4, \XMM8 877 - 878 - movdqa SHUF_MASK(%rip), %xmm15 879 - # multiply TMP5 * HashKey using karatsuba 880 - 881 - movdqa \XMM5, \TMP4 882 - pshufd $78, \XMM5, \TMP6 883 - pxor \XMM5, \TMP6 884 - paddd ONE(%rip), \XMM0 # INCR CNT 885 - movdqu HashKey_4(%arg2), \TMP5 886 - pclmulqdq $0x11, \TMP5, \TMP4 # TMP4 = a1*b1 887 - movdqa \XMM0, \XMM1 888 - paddd ONE(%rip), \XMM0 # INCR CNT 889 - movdqa \XMM0, \XMM2 890 - paddd ONE(%rip), \XMM0 # INCR CNT 891 - movdqa \XMM0, \XMM3 892 - paddd ONE(%rip), \XMM0 # INCR CNT 893 - movdqa \XMM0, \XMM4 894 - pshufb %xmm15, \XMM1 # perform a 16 byte swap 895 - pclmulqdq $0x00, \TMP5, 
\XMM5 # XMM5 = a0*b0 896 - pshufb %xmm15, \XMM2 # perform a 16 byte swap 897 - pshufb %xmm15, \XMM3 # perform a 16 byte swap 898 - pshufb %xmm15, \XMM4 # perform a 16 byte swap 899 - 900 - pxor (%arg1), \XMM1 901 - pxor (%arg1), \XMM2 902 - pxor (%arg1), \XMM3 903 - pxor (%arg1), \XMM4 904 - movdqu HashKey_4_k(%arg2), \TMP5 905 - pclmulqdq $0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0) 906 - movaps 0x10(%arg1), \TMP1 907 - aesenc \TMP1, \XMM1 # Round 1 908 - aesenc \TMP1, \XMM2 909 - aesenc \TMP1, \XMM3 910 - aesenc \TMP1, \XMM4 911 - movaps 0x20(%arg1), \TMP1 912 - aesenc \TMP1, \XMM1 # Round 2 913 - aesenc \TMP1, \XMM2 914 - aesenc \TMP1, \XMM3 915 - aesenc \TMP1, \XMM4 916 - movdqa \XMM6, \TMP1 917 - pshufd $78, \XMM6, \TMP2 918 - pxor \XMM6, \TMP2 919 - movdqu HashKey_3(%arg2), \TMP5 920 - pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1 * b1 921 - movaps 0x30(%arg1), \TMP3 922 - aesenc \TMP3, \XMM1 # Round 3 923 - aesenc \TMP3, \XMM2 924 - aesenc \TMP3, \XMM3 925 - aesenc \TMP3, \XMM4 926 - pclmulqdq $0x00, \TMP5, \XMM6 # XMM6 = a0*b0 927 - movaps 0x40(%arg1), \TMP3 928 - aesenc \TMP3, \XMM1 # Round 4 929 - aesenc \TMP3, \XMM2 930 - aesenc \TMP3, \XMM3 931 - aesenc \TMP3, \XMM4 932 - movdqu HashKey_3_k(%arg2), \TMP5 933 - pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 934 - movaps 0x50(%arg1), \TMP3 935 - aesenc \TMP3, \XMM1 # Round 5 936 - aesenc \TMP3, \XMM2 937 - aesenc \TMP3, \XMM3 938 - aesenc \TMP3, \XMM4 939 - pxor \TMP1, \TMP4 940 - # accumulate the results in TMP4:XMM5, TMP6 holds the middle part 941 - pxor \XMM6, \XMM5 942 - pxor \TMP2, \TMP6 943 - movdqa \XMM7, \TMP1 944 - pshufd $78, \XMM7, \TMP2 945 - pxor \XMM7, \TMP2 946 - movdqu HashKey_2(%arg2), \TMP5 947 - 948 - # Multiply TMP5 * HashKey using karatsuba 949 - 950 - pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 951 - movaps 0x60(%arg1), \TMP3 952 - aesenc \TMP3, \XMM1 # Round 6 953 - aesenc \TMP3, \XMM2 954 - aesenc \TMP3, \XMM3 955 - aesenc \TMP3, \XMM4 956 - pclmulqdq $0x00, \TMP5, \XMM7 # 
XMM7 = a0*b0 957 - movaps 0x70(%arg1), \TMP3 958 - aesenc \TMP3, \XMM1 # Round 7 959 - aesenc \TMP3, \XMM2 960 - aesenc \TMP3, \XMM3 961 - aesenc \TMP3, \XMM4 962 - movdqu HashKey_2_k(%arg2), \TMP5 963 - pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 964 - movaps 0x80(%arg1), \TMP3 965 - aesenc \TMP3, \XMM1 # Round 8 966 - aesenc \TMP3, \XMM2 967 - aesenc \TMP3, \XMM3 968 - aesenc \TMP3, \XMM4 969 - pxor \TMP1, \TMP4 970 - # accumulate the results in TMP4:XMM5, TMP6 holds the middle part 971 - pxor \XMM7, \XMM5 972 - pxor \TMP2, \TMP6 973 - 974 - # Multiply XMM8 * HashKey 975 - # XMM8 and TMP5 hold the values for the two operands 976 - 977 - movdqa \XMM8, \TMP1 978 - pshufd $78, \XMM8, \TMP2 979 - pxor \XMM8, \TMP2 980 - movdqu HashKey(%arg2), \TMP5 981 - pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 982 - movaps 0x90(%arg1), \TMP3 983 - aesenc \TMP3, \XMM1 # Round 9 984 - aesenc \TMP3, \XMM2 985 - aesenc \TMP3, \XMM3 986 - aesenc \TMP3, \XMM4 987 - pclmulqdq $0x00, \TMP5, \XMM8 # XMM8 = a0*b0 988 - lea 0xa0(%arg1),%r10 989 - mov keysize,%eax 990 - shr $2,%eax # 128->4, 192->6, 256->8 991 - sub $4,%eax # 128->0, 192->2, 256->4 992 - jz .Laes_loop_par_enc_done\@ 993 - 994 - .Laes_loop_par_enc\@: 995 - MOVADQ (%r10),\TMP3 996 - .irpc index, 1234 997 - aesenc \TMP3, %xmm\index 998 - .endr 999 - add $16,%r10 1000 - sub $1,%eax 1001 - jnz .Laes_loop_par_enc\@ 1002 - 1003 - .Laes_loop_par_enc_done\@: 1004 - MOVADQ (%r10), \TMP3 1005 - aesenclast \TMP3, \XMM1 # Round 10 1006 - aesenclast \TMP3, \XMM2 1007 - aesenclast \TMP3, \XMM3 1008 - aesenclast \TMP3, \XMM4 1009 - movdqu HashKey_k(%arg2), \TMP5 1010 - pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1011 - movdqu (%arg4,%r11,1), \TMP3 1012 - pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK 1013 - movdqu 16(%arg4,%r11,1), \TMP3 1014 - pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK 1015 - movdqu 32(%arg4,%r11,1), \TMP3 1016 - pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK 1017 - movdqu 48(%arg4,%r11,1), 
\TMP3 1018 - pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK 1019 - movdqu \XMM1, (%arg3,%r11,1) # Write to the ciphertext buffer 1020 - movdqu \XMM2, 16(%arg3,%r11,1) # Write to the ciphertext buffer 1021 - movdqu \XMM3, 32(%arg3,%r11,1) # Write to the ciphertext buffer 1022 - movdqu \XMM4, 48(%arg3,%r11,1) # Write to the ciphertext buffer 1023 - pshufb %xmm15, \XMM1 # perform a 16 byte swap 1024 - pshufb %xmm15, \XMM2 # perform a 16 byte swap 1025 - pshufb %xmm15, \XMM3 # perform a 16 byte swap 1026 - pshufb %xmm15, \XMM4 # perform a 16 byte swap 1027 - 1028 - pxor \TMP4, \TMP1 1029 - pxor \XMM8, \XMM5 1030 - pxor \TMP6, \TMP2 1031 - pxor \TMP1, \TMP2 1032 - pxor \XMM5, \TMP2 1033 - movdqa \TMP2, \TMP3 1034 - pslldq $8, \TMP3 # left shift TMP3 2 DWs 1035 - psrldq $8, \TMP2 # right shift TMP2 2 DWs 1036 - pxor \TMP3, \XMM5 1037 - pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5 1038 - 1039 - # first phase of reduction 1040 - 1041 - movdqa \XMM5, \TMP2 1042 - movdqa \XMM5, \TMP3 1043 - movdqa \XMM5, \TMP4 1044 - # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently 1045 - pslld $31, \TMP2 # packed right shift << 31 1046 - pslld $30, \TMP3 # packed right shift << 30 1047 - pslld $25, \TMP4 # packed right shift << 25 1048 - pxor \TMP3, \TMP2 # xor the shifted versions 1049 - pxor \TMP4, \TMP2 1050 - movdqa \TMP2, \TMP5 1051 - psrldq $4, \TMP5 # right shift T5 1 DW 1052 - pslldq $12, \TMP2 # left shift T2 3 DWs 1053 - pxor \TMP2, \XMM5 1054 - 1055 - # second phase of reduction 1056 - 1057 - movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4 1058 - movdqa \XMM5,\TMP3 1059 - movdqa \XMM5,\TMP4 1060 - psrld $1, \TMP2 # packed left shift >>1 1061 - psrld $2, \TMP3 # packed left shift >>2 1062 - psrld $7, \TMP4 # packed left shift >>7 1063 - pxor \TMP3,\TMP2 # xor the shifted versions 1064 - pxor \TMP4,\TMP2 1065 - pxor \TMP5, \TMP2 1066 - pxor \TMP2, \XMM5 1067 - pxor \TMP1, \XMM5 # result is in TMP1 1068 - 1069 - pxor \XMM5, 
\XMM1 1070 - .endm 1071 - 1072 - /* 1073 - * decrypt 4 blocks at a time 1074 - * ghash the 4 previously decrypted ciphertext blocks 1075 - * arg1, %arg3, %arg4 are used as pointers only, not modified 1076 - * %r11 is the data offset value 1077 - */ 1078 - .macro GHASH_4_ENCRYPT_4_PARALLEL_dec TMP1 TMP2 TMP3 TMP4 TMP5 \ 1079 - TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation 1080 - 1081 - movdqa \XMM1, \XMM5 1082 - movdqa \XMM2, \XMM6 1083 - movdqa \XMM3, \XMM7 1084 - movdqa \XMM4, \XMM8 1085 - 1086 - movdqa SHUF_MASK(%rip), %xmm15 1087 - # multiply TMP5 * HashKey using karatsuba 1088 - 1089 - movdqa \XMM5, \TMP4 1090 - pshufd $78, \XMM5, \TMP6 1091 - pxor \XMM5, \TMP6 1092 - paddd ONE(%rip), \XMM0 # INCR CNT 1093 - movdqu HashKey_4(%arg2), \TMP5 1094 - pclmulqdq $0x11, \TMP5, \TMP4 # TMP4 = a1*b1 1095 - movdqa \XMM0, \XMM1 1096 - paddd ONE(%rip), \XMM0 # INCR CNT 1097 - movdqa \XMM0, \XMM2 1098 - paddd ONE(%rip), \XMM0 # INCR CNT 1099 - movdqa \XMM0, \XMM3 1100 - paddd ONE(%rip), \XMM0 # INCR CNT 1101 - movdqa \XMM0, \XMM4 1102 - pshufb %xmm15, \XMM1 # perform a 16 byte swap 1103 - pclmulqdq $0x00, \TMP5, \XMM5 # XMM5 = a0*b0 1104 - pshufb %xmm15, \XMM2 # perform a 16 byte swap 1105 - pshufb %xmm15, \XMM3 # perform a 16 byte swap 1106 - pshufb %xmm15, \XMM4 # perform a 16 byte swap 1107 - 1108 - pxor (%arg1), \XMM1 1109 - pxor (%arg1), \XMM2 1110 - pxor (%arg1), \XMM3 1111 - pxor (%arg1), \XMM4 1112 - movdqu HashKey_4_k(%arg2), \TMP5 1113 - pclmulqdq $0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0) 1114 - movaps 0x10(%arg1), \TMP1 1115 - aesenc \TMP1, \XMM1 # Round 1 1116 - aesenc \TMP1, \XMM2 1117 - aesenc \TMP1, \XMM3 1118 - aesenc \TMP1, \XMM4 1119 - movaps 0x20(%arg1), \TMP1 1120 - aesenc \TMP1, \XMM1 # Round 2 1121 - aesenc \TMP1, \XMM2 1122 - aesenc \TMP1, \XMM3 1123 - aesenc \TMP1, \XMM4 1124 - movdqa \XMM6, \TMP1 1125 - pshufd $78, \XMM6, \TMP2 1126 - pxor \XMM6, \TMP2 1127 - movdqu HashKey_3(%arg2), \TMP5 1128 - pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 
= a1 * b1 1129 - movaps 0x30(%arg1), \TMP3 1130 - aesenc \TMP3, \XMM1 # Round 3 1131 - aesenc \TMP3, \XMM2 1132 - aesenc \TMP3, \XMM3 1133 - aesenc \TMP3, \XMM4 1134 - pclmulqdq $0x00, \TMP5, \XMM6 # XMM6 = a0*b0 1135 - movaps 0x40(%arg1), \TMP3 1136 - aesenc \TMP3, \XMM1 # Round 4 1137 - aesenc \TMP3, \XMM2 1138 - aesenc \TMP3, \XMM3 1139 - aesenc \TMP3, \XMM4 1140 - movdqu HashKey_3_k(%arg2), \TMP5 1141 - pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1142 - movaps 0x50(%arg1), \TMP3 1143 - aesenc \TMP3, \XMM1 # Round 5 1144 - aesenc \TMP3, \XMM2 1145 - aesenc \TMP3, \XMM3 1146 - aesenc \TMP3, \XMM4 1147 - pxor \TMP1, \TMP4 1148 - # accumulate the results in TMP4:XMM5, TMP6 holds the middle part 1149 - pxor \XMM6, \XMM5 1150 - pxor \TMP2, \TMP6 1151 - movdqa \XMM7, \TMP1 1152 - pshufd $78, \XMM7, \TMP2 1153 - pxor \XMM7, \TMP2 1154 - movdqu HashKey_2(%arg2), \TMP5 1155 - 1156 - # Multiply TMP5 * HashKey using karatsuba 1157 - 1158 - pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 1159 - movaps 0x60(%arg1), \TMP3 1160 - aesenc \TMP3, \XMM1 # Round 6 1161 - aesenc \TMP3, \XMM2 1162 - aesenc \TMP3, \XMM3 1163 - aesenc \TMP3, \XMM4 1164 - pclmulqdq $0x00, \TMP5, \XMM7 # XMM7 = a0*b0 1165 - movaps 0x70(%arg1), \TMP3 1166 - aesenc \TMP3, \XMM1 # Round 7 1167 - aesenc \TMP3, \XMM2 1168 - aesenc \TMP3, \XMM3 1169 - aesenc \TMP3, \XMM4 1170 - movdqu HashKey_2_k(%arg2), \TMP5 1171 - pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1172 - movaps 0x80(%arg1), \TMP3 1173 - aesenc \TMP3, \XMM1 # Round 8 1174 - aesenc \TMP3, \XMM2 1175 - aesenc \TMP3, \XMM3 1176 - aesenc \TMP3, \XMM4 1177 - pxor \TMP1, \TMP4 1178 - # accumulate the results in TMP4:XMM5, TMP6 holds the middle part 1179 - pxor \XMM7, \XMM5 1180 - pxor \TMP2, \TMP6 1181 - 1182 - # Multiply XMM8 * HashKey 1183 - # XMM8 and TMP5 hold the values for the two operands 1184 - 1185 - movdqa \XMM8, \TMP1 1186 - pshufd $78, \XMM8, \TMP2 1187 - pxor \XMM8, \TMP2 1188 - movdqu HashKey(%arg2), \TMP5 1189 - 
pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 1190 - movaps 0x90(%arg1), \TMP3 1191 - aesenc \TMP3, \XMM1 # Round 9 1192 - aesenc \TMP3, \XMM2 1193 - aesenc \TMP3, \XMM3 1194 - aesenc \TMP3, \XMM4 1195 - pclmulqdq $0x00, \TMP5, \XMM8 # XMM8 = a0*b0 1196 - lea 0xa0(%arg1),%r10 1197 - mov keysize,%eax 1198 - shr $2,%eax # 128->4, 192->6, 256->8 1199 - sub $4,%eax # 128->0, 192->2, 256->4 1200 - jz .Laes_loop_par_dec_done\@ 1201 - 1202 - .Laes_loop_par_dec\@: 1203 - MOVADQ (%r10),\TMP3 1204 - .irpc index, 1234 1205 - aesenc \TMP3, %xmm\index 1206 - .endr 1207 - add $16,%r10 1208 - sub $1,%eax 1209 - jnz .Laes_loop_par_dec\@ 1210 - 1211 - .Laes_loop_par_dec_done\@: 1212 - MOVADQ (%r10), \TMP3 1213 - aesenclast \TMP3, \XMM1 # last round 1214 - aesenclast \TMP3, \XMM2 1215 - aesenclast \TMP3, \XMM3 1216 - aesenclast \TMP3, \XMM4 1217 - movdqu HashKey_k(%arg2), \TMP5 1218 - pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1219 - movdqu (%arg4,%r11,1), \TMP3 1220 - pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK 1221 - movdqu \XMM1, (%arg3,%r11,1) # Write to plaintext buffer 1222 - movdqa \TMP3, \XMM1 1223 - movdqu 16(%arg4,%r11,1), \TMP3 1224 - pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK 1225 - movdqu \XMM2, 16(%arg3,%r11,1) # Write to plaintext buffer 1226 - movdqa \TMP3, \XMM2 1227 - movdqu 32(%arg4,%r11,1), \TMP3 1228 - pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK 1229 - movdqu \XMM3, 32(%arg3,%r11,1) # Write to plaintext buffer 1230 - movdqa \TMP3, \XMM3 1231 - movdqu 48(%arg4,%r11,1), \TMP3 1232 - pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK 1233 - movdqu \XMM4, 48(%arg3,%r11,1) # Write to plaintext buffer 1234 - movdqa \TMP3, \XMM4 1235 - pshufb %xmm15, \XMM1 # perform a 16 byte swap 1236 - pshufb %xmm15, \XMM2 # perform a 16 byte swap 1237 - pshufb %xmm15, \XMM3 # perform a 16 byte swap 1238 - pshufb %xmm15, \XMM4 # perform a 16 byte swap 1239 - 1240 - pxor \TMP4, \TMP1 1241 - pxor \XMM8, \XMM5 1242 - pxor \TMP6, \TMP2 1243 - pxor \TMP1, \TMP2 1244 - 
pxor \XMM5, \TMP2 1245 - movdqa \TMP2, \TMP3 1246 - pslldq $8, \TMP3 # left shift TMP3 2 DWs 1247 - psrldq $8, \TMP2 # right shift TMP2 2 DWs 1248 - pxor \TMP3, \XMM5 1249 - pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5 1250 - 1251 - # first phase of reduction 1252 - 1253 - movdqa \XMM5, \TMP2 1254 - movdqa \XMM5, \TMP3 1255 - movdqa \XMM5, \TMP4 1256 - # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently 1257 - pslld $31, \TMP2 # packed right shift << 31 1258 - pslld $30, \TMP3 # packed right shift << 30 1259 - pslld $25, \TMP4 # packed right shift << 25 1260 - pxor \TMP3, \TMP2 # xor the shifted versions 1261 - pxor \TMP4, \TMP2 1262 - movdqa \TMP2, \TMP5 1263 - psrldq $4, \TMP5 # right shift T5 1 DW 1264 - pslldq $12, \TMP2 # left shift T2 3 DWs 1265 - pxor \TMP2, \XMM5 1266 - 1267 - # second phase of reduction 1268 - 1269 - movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4 1270 - movdqa \XMM5,\TMP3 1271 - movdqa \XMM5,\TMP4 1272 - psrld $1, \TMP2 # packed left shift >>1 1273 - psrld $2, \TMP3 # packed left shift >>2 1274 - psrld $7, \TMP4 # packed left shift >>7 1275 - pxor \TMP3,\TMP2 # xor the shifted versions 1276 - pxor \TMP4,\TMP2 1277 - pxor \TMP5, \TMP2 1278 - pxor \TMP2, \XMM5 1279 - pxor \TMP1, \XMM5 # result is in TMP1 1280 - 1281 - pxor \XMM5, \XMM1 1282 - .endm 1283 - 1284 - /* GHASH the last 4 ciphertext blocks. 
*/ 1285 - .macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \ 1286 - TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst 1287 - 1288 - # Multiply TMP6 * HashKey (using Karatsuba) 1289 - 1290 - movdqa \XMM1, \TMP6 1291 - pshufd $78, \XMM1, \TMP2 1292 - pxor \XMM1, \TMP2 1293 - movdqu HashKey_4(%arg2), \TMP5 1294 - pclmulqdq $0x11, \TMP5, \TMP6 # TMP6 = a1*b1 1295 - pclmulqdq $0x00, \TMP5, \XMM1 # XMM1 = a0*b0 1296 - movdqu HashKey_4_k(%arg2), \TMP4 1297 - pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1298 - movdqa \XMM1, \XMMDst 1299 - movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1 1300 - 1301 - # Multiply TMP1 * HashKey (using Karatsuba) 1302 - 1303 - movdqa \XMM2, \TMP1 1304 - pshufd $78, \XMM2, \TMP2 1305 - pxor \XMM2, \TMP2 1306 - movdqu HashKey_3(%arg2), \TMP5 1307 - pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 1308 - pclmulqdq $0x00, \TMP5, \XMM2 # XMM2 = a0*b0 1309 - movdqu HashKey_3_k(%arg2), \TMP4 1310 - pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1311 - pxor \TMP1, \TMP6 1312 - pxor \XMM2, \XMMDst 1313 - pxor \TMP2, \XMM1 1314 - # results accumulated in TMP6, XMMDst, XMM1 1315 - 1316 - # Multiply TMP1 * HashKey (using Karatsuba) 1317 - 1318 - movdqa \XMM3, \TMP1 1319 - pshufd $78, \XMM3, \TMP2 1320 - pxor \XMM3, \TMP2 1321 - movdqu HashKey_2(%arg2), \TMP5 1322 - pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 1323 - pclmulqdq $0x00, \TMP5, \XMM3 # XMM3 = a0*b0 1324 - movdqu HashKey_2_k(%arg2), \TMP4 1325 - pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1326 - pxor \TMP1, \TMP6 1327 - pxor \XMM3, \XMMDst 1328 - pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1 1329 - 1330 - # Multiply TMP1 * HashKey (using Karatsuba) 1331 - movdqa \XMM4, \TMP1 1332 - pshufd $78, \XMM4, \TMP2 1333 - pxor \XMM4, \TMP2 1334 - movdqu HashKey(%arg2), \TMP5 1335 - pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 1336 - pclmulqdq $0x00, \TMP5, \XMM4 # XMM4 = a0*b0 1337 - movdqu HashKey_k(%arg2), \TMP4 1338 - pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 
1339 - pxor \TMP1, \TMP6 1340 - pxor \XMM4, \XMMDst 1341 - pxor \XMM1, \TMP2 1342 - pxor \TMP6, \TMP2 1343 - pxor \XMMDst, \TMP2 1344 - # middle section of the temp results combined as in karatsuba algorithm 1345 - movdqa \TMP2, \TMP4 1346 - pslldq $8, \TMP4 # left shift TMP4 2 DWs 1347 - psrldq $8, \TMP2 # right shift TMP2 2 DWs 1348 - pxor \TMP4, \XMMDst 1349 - pxor \TMP2, \TMP6 1350 - # TMP6:XMMDst holds the result of the accumulated carry-less multiplications 1351 - # first phase of the reduction 1352 - movdqa \XMMDst, \TMP2 1353 - movdqa \XMMDst, \TMP3 1354 - movdqa \XMMDst, \TMP4 1355 - # move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently 1356 - pslld $31, \TMP2 # packed right shifting << 31 1357 - pslld $30, \TMP3 # packed right shifting << 30 1358 - pslld $25, \TMP4 # packed right shifting << 25 1359 - pxor \TMP3, \TMP2 # xor the shifted versions 1360 - pxor \TMP4, \TMP2 1361 - movdqa \TMP2, \TMP7 1362 - psrldq $4, \TMP7 # right shift TMP7 1 DW 1363 - pslldq $12, \TMP2 # left shift TMP2 3 DWs 1364 - pxor \TMP2, \XMMDst 1365 - 1366 - # second phase of the reduction 1367 - movdqa \XMMDst, \TMP2 1368 - # make 3 copies of XMMDst for doing 3 shift operations 1369 - movdqa \XMMDst, \TMP3 1370 - movdqa \XMMDst, \TMP4 1371 - psrld $1, \TMP2 # packed left shift >> 1 1372 - psrld $2, \TMP3 # packed left shift >> 2 1373 - psrld $7, \TMP4 # packed left shift >> 7 1374 - pxor \TMP3, \TMP2 # xor the shifted versions 1375 - pxor \TMP4, \TMP2 1376 - pxor \TMP7, \TMP2 1377 - pxor \TMP2, \XMMDst 1378 - pxor \TMP6, \XMMDst # reduced result is in XMMDst 1379 - .endm 1380 - 1381 - 1382 - /* Encryption of a single block 1383 - * uses eax & r10 1384 - */ 1385 - 1386 - .macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1 1387 - 1388 - pxor (%arg1), \XMM0 1389 - mov keysize,%eax 1390 - shr $2,%eax # 128->4, 192->6, 256->8 1391 - add $5,%eax # 128->9, 192->11, 256->13 1392 - lea 16(%arg1), %r10 # get first expanded key address 1393 - 1394 - _esb_loop_\@: 1395 - MOVADQ 
(%r10),\TMP1 1396 - aesenc \TMP1,\XMM0 1397 - add $16,%r10 1398 - sub $1,%eax 1399 - jnz _esb_loop_\@ 1400 - 1401 - MOVADQ (%r10),\TMP1 1402 - aesenclast \TMP1,\XMM0 1403 - .endm 1404 - 1405 - /***************************************************************************** 1406 - * void aesni_gcm_init(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. 1407 - * struct gcm_context_data *data, 1408 - * // context data 1409 - * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association) 1410 - * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) 1411 - * // concatenated with 0x00000001. 16-byte aligned pointer. 1412 - * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary. 1413 - * const u8 *aad, // Additional Authentication Data (AAD) 1414 - * u64 aad_len) // Length of AAD in bytes. 1415 - */ 1416 - SYM_FUNC_START(aesni_gcm_init) 1417 - FUNC_SAVE 1418 - GCM_INIT %arg3, %arg4,%arg5, %arg6 1419 - FUNC_RESTORE 1420 - RET 1421 - SYM_FUNC_END(aesni_gcm_init) 1422 - 1423 - /***************************************************************************** 1424 - * void aesni_gcm_enc_update(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. 1425 - * struct gcm_context_data *data, 1426 - * // context data 1427 - * u8 *out, // Ciphertext output. Encrypt in-place is allowed. 1428 - * const u8 *in, // Plaintext input 1429 - * u64 plaintext_len, // Length of data in bytes for encryption. 1430 - */ 1431 - SYM_FUNC_START(aesni_gcm_enc_update) 1432 - FUNC_SAVE 1433 - GCM_ENC_DEC enc 1434 - FUNC_RESTORE 1435 - RET 1436 - SYM_FUNC_END(aesni_gcm_enc_update) 1437 - 1438 - /***************************************************************************** 1439 - * void aesni_gcm_dec_update(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. 1440 - * struct gcm_context_data *data, 1441 - * // context data 1442 - * u8 *out, // Ciphertext output. Encrypt in-place is allowed. 
1443 - * const u8 *in, // Plaintext input 1444 - * u64 plaintext_len, // Length of data in bytes for encryption. 1445 - */ 1446 - SYM_FUNC_START(aesni_gcm_dec_update) 1447 - FUNC_SAVE 1448 - GCM_ENC_DEC dec 1449 - FUNC_RESTORE 1450 - RET 1451 - SYM_FUNC_END(aesni_gcm_dec_update) 1452 - 1453 - /***************************************************************************** 1454 - * void aesni_gcm_finalize(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. 1455 - * struct gcm_context_data *data, 1456 - * // context data 1457 - * u8 *auth_tag, // Authenticated Tag output. 1458 - * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely), 1459 - * // 12 or 8. 1460 - */ 1461 - SYM_FUNC_START(aesni_gcm_finalize) 1462 - FUNC_SAVE 1463 - GCM_COMPLETE %arg3 %arg4 1464 - FUNC_RESTORE 1465 - RET 1466 - SYM_FUNC_END(aesni_gcm_finalize) 1467 - 1468 163 #endif 1469 164 1470 165 SYM_FUNC_START_LOCAL(_key_expansion_256a)
-2804
arch/x86/crypto/aesni-intel_avx-x86_64.S
··· 1 - ######################################################################## 2 - # Copyright (c) 2013, Intel Corporation 3 - # 4 - # This software is available to you under a choice of one of two 5 - # licenses. You may choose to be licensed under the terms of the GNU 6 - # General Public License (GPL) Version 2, available from the file 7 - # COPYING in the main directory of this source tree, or the 8 - # OpenIB.org BSD license below: 9 - # 10 - # Redistribution and use in source and binary forms, with or without 11 - # modification, are permitted provided that the following conditions are 12 - # met: 13 - # 14 - # * Redistributions of source code must retain the above copyright 15 - # notice, this list of conditions and the following disclaimer. 16 - # 17 - # * Redistributions in binary form must reproduce the above copyright 18 - # notice, this list of conditions and the following disclaimer in the 19 - # documentation and/or other materials provided with the 20 - # distribution. 21 - # 22 - # * Neither the name of the Intel Corporation nor the names of its 23 - # contributors may be used to endorse or promote products derived from 24 - # this software without specific prior written permission. 25 - # 26 - # 27 - # THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY 28 - # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 29 - # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 30 - # PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL CORPORATION OR 31 - # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 32 - # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 33 - # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES# LOSS OF USE, DATA, OR 34 - # PROFITS# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 35 - # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 36 - # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 37 - # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 38 - ######################################################################## 39 - ## 40 - ## Authors: 41 - ## Erdinc Ozturk <erdinc.ozturk@intel.com> 42 - ## Vinodh Gopal <vinodh.gopal@intel.com> 43 - ## James Guilford <james.guilford@intel.com> 44 - ## Tim Chen <tim.c.chen@linux.intel.com> 45 - ## 46 - ## References: 47 - ## This code was derived and highly optimized from the code described in paper: 48 - ## Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation 49 - ## on Intel Architecture Processors. August, 2010 50 - ## The details of the implementation is explained in: 51 - ## Erdinc Ozturk et. al. Enabling High-Performance Galois-Counter-Mode 52 - ## on Intel Architecture Processors. October, 2012. 
53 - ## 54 - ## Assumptions: 55 - ## 56 - ## 57 - ## 58 - ## iv: 59 - ## 0 1 2 3 60 - ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 61 - ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 62 - ## | Salt (From the SA) | 63 - ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 64 - ## | Initialization Vector | 65 - ## | (This is the sequence number from IPSec header) | 66 - ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 67 - ## | 0x1 | 68 - ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 69 - ## 70 - ## 71 - ## 72 - ## AAD: 73 - ## AAD padded to 128 bits with 0 74 - ## for example, assume AAD is a u32 vector 75 - ## 76 - ## if AAD is 8 bytes: 77 - ## AAD[3] = {A0, A1}# 78 - ## padded AAD in xmm register = {A1 A0 0 0} 79 - ## 80 - ## 0 1 2 3 81 - ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 82 - ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 83 - ## | SPI (A1) | 84 - ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 85 - ## | 32-bit Sequence Number (A0) | 86 - ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 87 - ## | 0x0 | 88 - ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 89 - ## 90 - ## AAD Format with 32-bit Sequence Number 91 - ## 92 - ## if AAD is 12 bytes: 93 - ## AAD[3] = {A0, A1, A2}# 94 - ## padded AAD in xmm register = {A2 A1 A0 0} 95 - ## 96 - ## 0 1 2 3 97 - ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 98 - ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 99 - ## | SPI (A2) | 100 - ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 101 - ## | 64-bit Extended Sequence Number {A1,A0} | 102 - ## | | 103 - ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 104 - ## | 0x0 | 105 - ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 106 - ## 107 - ## AAD Format with 
64-bit Extended Sequence Number 108 - ## 109 - ## 110 - ## aadLen: 111 - ## from the definition of the spec, aadLen can only be 8 or 12 bytes. 112 - ## The code additionally supports aadLen of length 16 bytes. 113 - ## 114 - ## TLen: 115 - ## from the definition of the spec, TLen can only be 8, 12 or 16 bytes. 116 - ## 117 - ## poly = x^128 + x^127 + x^126 + x^121 + 1 118 - ## throughout the code, one tab and two tab indentations are used. one tab is 119 - ## for GHASH part, two tabs is for AES part. 120 - ## 121 - 122 - #include <linux/linkage.h> 123 - 124 - # constants in mergeable sections, linker can reorder and merge 125 - .section .rodata.cst16.POLY, "aM", @progbits, 16 126 - .align 16 127 - POLY: .octa 0xC2000000000000000000000000000001 128 - 129 - .section .rodata.cst16.POLY2, "aM", @progbits, 16 130 - .align 16 131 - POLY2: .octa 0xC20000000000000000000001C2000000 132 - 133 - .section .rodata.cst16.TWOONE, "aM", @progbits, 16 134 - .align 16 135 - TWOONE: .octa 0x00000001000000000000000000000001 136 - 137 - .section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16 138 - .align 16 139 - SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F 140 - 141 - .section .rodata.cst16.ONE, "aM", @progbits, 16 142 - .align 16 143 - ONE: .octa 0x00000000000000000000000000000001 144 - 145 - .section .rodata.cst16.ONEf, "aM", @progbits, 16 146 - .align 16 147 - ONEf: .octa 0x01000000000000000000000000000000 148 - 149 - # order of these constants should not change. 
150 - # more specifically, ALL_F should follow SHIFT_MASK, and zero should follow ALL_F 151 - .section .rodata, "a", @progbits 152 - .align 16 153 - SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100 154 - ALL_F: .octa 0xffffffffffffffffffffffffffffffff 155 - .octa 0x00000000000000000000000000000000 156 - 157 - .text 158 - 159 - 160 - #define AadHash 16*0 161 - #define AadLen 16*1 162 - #define InLen (16*1)+8 163 - #define PBlockEncKey 16*2 164 - #define OrigIV 16*3 165 - #define CurCount 16*4 166 - #define PBlockLen 16*5 167 - 168 - HashKey = 16*6 # store HashKey <<1 mod poly here 169 - HashKey_2 = 16*7 # store HashKey^2 <<1 mod poly here 170 - HashKey_3 = 16*8 # store HashKey^3 <<1 mod poly here 171 - HashKey_4 = 16*9 # store HashKey^4 <<1 mod poly here 172 - HashKey_5 = 16*10 # store HashKey^5 <<1 mod poly here 173 - HashKey_6 = 16*11 # store HashKey^6 <<1 mod poly here 174 - HashKey_7 = 16*12 # store HashKey^7 <<1 mod poly here 175 - HashKey_8 = 16*13 # store HashKey^8 <<1 mod poly here 176 - HashKey_k = 16*14 # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes) 177 - HashKey_2_k = 16*15 # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes) 178 - HashKey_3_k = 16*16 # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes) 179 - HashKey_4_k = 16*17 # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes) 180 - HashKey_5_k = 16*18 # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes) 181 - HashKey_6_k = 16*19 # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes) 182 - HashKey_7_k = 16*20 # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes) 183 - HashKey_8_k = 16*21 # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes) 184 - 185 - #define arg1 %rdi 186 - #define arg2 %rsi 187 - #define arg3 %rdx 188 - #define arg4 %rcx 189 - #define arg5 %r8 190 - #define arg6 %r9 191 - #define keysize 2*15*16(arg1) 192 - 193 - i = 0 194 - j = 0 195 - 196 - out_order = 0 197 
- in_order = 1 198 - DEC = 0 199 - ENC = 1 200 - 201 - .macro define_reg r n 202 - reg_\r = %xmm\n 203 - .endm 204 - 205 - .macro setreg 206 - .altmacro 207 - define_reg i %i 208 - define_reg j %j 209 - .noaltmacro 210 - .endm 211 - 212 - TMP1 = 16*0 # Temporary storage for AAD 213 - TMP2 = 16*1 # Temporary storage for AES State 2 (State 1 is stored in an XMM register) 214 - TMP3 = 16*2 # Temporary storage for AES State 3 215 - TMP4 = 16*3 # Temporary storage for AES State 4 216 - TMP5 = 16*4 # Temporary storage for AES State 5 217 - TMP6 = 16*5 # Temporary storage for AES State 6 218 - TMP7 = 16*6 # Temporary storage for AES State 7 219 - TMP8 = 16*7 # Temporary storage for AES State 8 220 - 221 - VARIABLE_OFFSET = 16*8 222 - 223 - ################################ 224 - # Utility Macros 225 - ################################ 226 - 227 - .macro FUNC_SAVE 228 - push %r12 229 - push %r13 230 - push %r15 231 - 232 - push %rbp 233 - mov %rsp, %rbp 234 - 235 - sub $VARIABLE_OFFSET, %rsp 236 - and $~63, %rsp # align rsp to 64 bytes 237 - .endm 238 - 239 - .macro FUNC_RESTORE 240 - mov %rbp, %rsp 241 - pop %rbp 242 - 243 - pop %r15 244 - pop %r13 245 - pop %r12 246 - .endm 247 - 248 - # Encryption of a single block 249 - .macro ENCRYPT_SINGLE_BLOCK REP XMM0 250 - vpxor (arg1), \XMM0, \XMM0 251 - i = 1 252 - setreg 253 - .rep \REP 254 - vaesenc 16*i(arg1), \XMM0, \XMM0 255 - i = (i+1) 256 - setreg 257 - .endr 258 - vaesenclast 16*i(arg1), \XMM0, \XMM0 259 - .endm 260 - 261 - # combined for GCM encrypt and decrypt functions 262 - # clobbering all xmm registers 263 - # clobbering r10, r11, r12, r13, r15, rax 264 - .macro GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC REP 265 - vmovdqu AadHash(arg2), %xmm8 266 - vmovdqu HashKey(arg2), %xmm13 # xmm13 = HashKey 267 - add arg5, InLen(arg2) 268 - 269 - # initialize the data pointer offset as zero 270 - xor %r11d, %r11d 271 - 272 - PARTIAL_BLOCK \GHASH_MUL, arg3, arg4, arg5, %r11, %xmm8, 
\ENC_DEC 273 - sub %r11, arg5 274 - 275 - mov arg5, %r13 # save the number of bytes of plaintext/ciphertext 276 - and $-16, %r13 # r13 = r13 - (r13 mod 16) 277 - 278 - mov %r13, %r12 279 - shr $4, %r12 280 - and $7, %r12 281 - jz .L_initial_num_blocks_is_0\@ 282 - 283 - cmp $7, %r12 284 - je .L_initial_num_blocks_is_7\@ 285 - cmp $6, %r12 286 - je .L_initial_num_blocks_is_6\@ 287 - cmp $5, %r12 288 - je .L_initial_num_blocks_is_5\@ 289 - cmp $4, %r12 290 - je .L_initial_num_blocks_is_4\@ 291 - cmp $3, %r12 292 - je .L_initial_num_blocks_is_3\@ 293 - cmp $2, %r12 294 - je .L_initial_num_blocks_is_2\@ 295 - 296 - jmp .L_initial_num_blocks_is_1\@ 297 - 298 - .L_initial_num_blocks_is_7\@: 299 - \INITIAL_BLOCKS \REP, 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC 300 - sub $16*7, %r13 301 - jmp .L_initial_blocks_encrypted\@ 302 - 303 - .L_initial_num_blocks_is_6\@: 304 - \INITIAL_BLOCKS \REP, 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC 305 - sub $16*6, %r13 306 - jmp .L_initial_blocks_encrypted\@ 307 - 308 - .L_initial_num_blocks_is_5\@: 309 - \INITIAL_BLOCKS \REP, 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC 310 - sub $16*5, %r13 311 - jmp .L_initial_blocks_encrypted\@ 312 - 313 - .L_initial_num_blocks_is_4\@: 314 - \INITIAL_BLOCKS \REP, 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC 315 - sub $16*4, %r13 316 - jmp .L_initial_blocks_encrypted\@ 317 - 318 - .L_initial_num_blocks_is_3\@: 319 - \INITIAL_BLOCKS \REP, 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC 320 - sub $16*3, %r13 321 - jmp .L_initial_blocks_encrypted\@ 322 - 323 - 
.L_initial_num_blocks_is_2\@: 324 - \INITIAL_BLOCKS \REP, 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC 325 - sub $16*2, %r13 326 - jmp .L_initial_blocks_encrypted\@ 327 - 328 - .L_initial_num_blocks_is_1\@: 329 - \INITIAL_BLOCKS \REP, 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC 330 - sub $16*1, %r13 331 - jmp .L_initial_blocks_encrypted\@ 332 - 333 - .L_initial_num_blocks_is_0\@: 334 - \INITIAL_BLOCKS \REP, 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC 335 - 336 - 337 - .L_initial_blocks_encrypted\@: 338 - test %r13, %r13 339 - je .L_zero_cipher_left\@ 340 - 341 - sub $128, %r13 342 - je .L_eight_cipher_left\@ 343 - 344 - 345 - 346 - 347 - vmovd %xmm9, %r15d 348 - and $255, %r15d 349 - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 350 - 351 - 352 - .L_encrypt_by_8_new\@: 353 - cmp $(255-8), %r15d 354 - jg .L_encrypt_by_8\@ 355 - 356 - 357 - 358 - add $8, %r15b 359 - \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC 360 - add $128, %r11 361 - sub $128, %r13 362 - jne .L_encrypt_by_8_new\@ 363 - 364 - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 365 - jmp .L_eight_cipher_left\@ 366 - 367 - .L_encrypt_by_8\@: 368 - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 369 - add $8, %r15b 370 - \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC 371 - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 372 - add $128, %r11 373 - sub $128, %r13 374 - jne .L_encrypt_by_8_new\@ 375 - 376 - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 377 - 378 - 379 - 380 - 381 - .L_eight_cipher_left\@: 382 - \GHASH_LAST_8 %xmm0, %xmm10, %xmm11, %xmm12, 
%xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8 383 - 384 - 385 - .L_zero_cipher_left\@: 386 - vmovdqu %xmm14, AadHash(arg2) 387 - vmovdqu %xmm9, CurCount(arg2) 388 - 389 - # check for 0 length 390 - mov arg5, %r13 391 - and $15, %r13 # r13 = (arg5 mod 16) 392 - 393 - je .L_multiple_of_16_bytes\@ 394 - 395 - # handle the last <16 Byte block separately 396 - 397 - mov %r13, PBlockLen(arg2) 398 - 399 - vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn 400 - vmovdqu %xmm9, CurCount(arg2) 401 - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 402 - 403 - ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Yn) 404 - vmovdqu %xmm9, PBlockEncKey(arg2) 405 - 406 - cmp $16, arg5 407 - jge .L_large_enough_update\@ 408 - 409 - lea (arg4,%r11,1), %r10 410 - mov %r13, %r12 411 - 412 - READ_PARTIAL_BLOCK %r10 %r12 %xmm1 413 - 414 - lea SHIFT_MASK+16(%rip), %r12 415 - sub %r13, %r12 # adjust the shuffle mask pointer to be 416 - # able to shift 16-r13 bytes (r13 is the 417 - # number of bytes in plaintext mod 16) 418 - 419 - jmp .L_final_ghash_mul\@ 420 - 421 - .L_large_enough_update\@: 422 - sub $16, %r11 423 - add %r13, %r11 424 - 425 - # receive the last <16 Byte block 426 - vmovdqu (arg4, %r11, 1), %xmm1 427 - 428 - sub %r13, %r11 429 - add $16, %r11 430 - 431 - lea SHIFT_MASK+16(%rip), %r12 432 - # adjust the shuffle mask pointer to be able to shift 16-r13 bytes 433 - # (r13 is the number of bytes in plaintext mod 16) 434 - sub %r13, %r12 435 - # get the appropriate shuffle mask 436 - vmovdqu (%r12), %xmm2 437 - # shift right 16-r13 bytes 438 - vpshufb %xmm2, %xmm1, %xmm1 439 - 440 - .L_final_ghash_mul\@: 441 - .if \ENC_DEC == DEC 442 - vmovdqa %xmm1, %xmm2 443 - vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) 444 - vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to 445 - # mask out top 16-r13 bytes of xmm9 446 - vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 447 - vpand %xmm1, %xmm2, %xmm2 448 - vpshufb SHUF_MASK(%rip), %xmm2, 
%xmm2 449 - vpxor %xmm2, %xmm14, %xmm14 450 - 451 - vmovdqu %xmm14, AadHash(arg2) 452 - .else 453 - vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) 454 - vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to 455 - # mask out top 16-r13 bytes of xmm9 456 - vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 457 - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 458 - vpxor %xmm9, %xmm14, %xmm14 459 - 460 - vmovdqu %xmm14, AadHash(arg2) 461 - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext 462 - .endif 463 - 464 - 465 - ############################# 466 - # output r13 Bytes 467 - vmovq %xmm9, %rax 468 - cmp $8, %r13 469 - jle .L_less_than_8_bytes_left\@ 470 - 471 - mov %rax, (arg3 , %r11) 472 - add $8, %r11 473 - vpsrldq $8, %xmm9, %xmm9 474 - vmovq %xmm9, %rax 475 - sub $8, %r13 476 - 477 - .L_less_than_8_bytes_left\@: 478 - movb %al, (arg3 , %r11) 479 - add $1, %r11 480 - shr $8, %rax 481 - sub $1, %r13 482 - jne .L_less_than_8_bytes_left\@ 483 - ############################# 484 - 485 - .L_multiple_of_16_bytes\@: 486 - .endm 487 - 488 - 489 - # GCM_COMPLETE Finishes update of tag of last partial block 490 - # Output: Authorization Tag (AUTH_TAG) 491 - # Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15 492 - .macro GCM_COMPLETE GHASH_MUL REP AUTH_TAG AUTH_TAG_LEN 493 - vmovdqu AadHash(arg2), %xmm14 494 - vmovdqu HashKey(arg2), %xmm13 495 - 496 - mov PBlockLen(arg2), %r12 497 - test %r12, %r12 498 - je .L_partial_done\@ 499 - 500 - #GHASH computation for the last <16 Byte block 501 - \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 502 - 503 - .L_partial_done\@: 504 - mov AadLen(arg2), %r12 # r12 = aadLen (number of bytes) 505 - shl $3, %r12 # convert into number of bits 506 - vmovd %r12d, %xmm15 # len(A) in xmm15 507 - 508 - mov InLen(arg2), %r12 509 - shl $3, %r12 # len(C) in bits (*128) 510 - vmovq %r12, %xmm1 511 - vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000 512 - vpxor %xmm1, 
%xmm15, %xmm15 # xmm15 = len(A)||len(C) 513 - 514 - vpxor %xmm15, %xmm14, %xmm14 515 - \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation 516 - vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap 517 - 518 - vmovdqu OrigIV(arg2), %xmm9 519 - 520 - ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Y0) 521 - 522 - vpxor %xmm14, %xmm9, %xmm9 523 - 524 - 525 - 526 - .L_return_T\@: 527 - mov \AUTH_TAG, %r10 # r10 = authTag 528 - mov \AUTH_TAG_LEN, %r11 # r11 = auth_tag_len 529 - 530 - cmp $16, %r11 531 - je .L_T_16\@ 532 - 533 - cmp $8, %r11 534 - jl .L_T_4\@ 535 - 536 - .L_T_8\@: 537 - vmovq %xmm9, %rax 538 - mov %rax, (%r10) 539 - add $8, %r10 540 - sub $8, %r11 541 - vpsrldq $8, %xmm9, %xmm9 542 - test %r11, %r11 543 - je .L_return_T_done\@ 544 - .L_T_4\@: 545 - vmovd %xmm9, %eax 546 - mov %eax, (%r10) 547 - add $4, %r10 548 - sub $4, %r11 549 - vpsrldq $4, %xmm9, %xmm9 550 - test %r11, %r11 551 - je .L_return_T_done\@ 552 - .L_T_123\@: 553 - vmovd %xmm9, %eax 554 - cmp $2, %r11 555 - jl .L_T_1\@ 556 - mov %ax, (%r10) 557 - cmp $2, %r11 558 - je .L_return_T_done\@ 559 - add $2, %r10 560 - sar $16, %eax 561 - .L_T_1\@: 562 - mov %al, (%r10) 563 - jmp .L_return_T_done\@ 564 - 565 - .L_T_16\@: 566 - vmovdqu %xmm9, (%r10) 567 - 568 - .L_return_T_done\@: 569 - .endm 570 - 571 - .macro CALC_AAD_HASH GHASH_MUL AAD AADLEN T1 T2 T3 T4 T5 T6 T7 T8 572 - 573 - mov \AAD, %r10 # r10 = AAD 574 - mov \AADLEN, %r12 # r12 = aadLen 575 - 576 - 577 - mov %r12, %r11 578 - 579 - vpxor \T8, \T8, \T8 580 - vpxor \T7, \T7, \T7 581 - cmp $16, %r11 582 - jl .L_get_AAD_rest8\@ 583 - .L_get_AAD_blocks\@: 584 - vmovdqu (%r10), \T7 585 - vpshufb SHUF_MASK(%rip), \T7, \T7 586 - vpxor \T7, \T8, \T8 587 - \GHASH_MUL \T8, \T2, \T1, \T3, \T4, \T5, \T6 588 - add $16, %r10 589 - sub $16, %r12 590 - sub $16, %r11 591 - cmp $16, %r11 592 - jge .L_get_AAD_blocks\@ 593 - vmovdqu \T8, \T7 594 - test %r11, %r11 595 - je .L_get_AAD_done\@ 596 - 597 - vpxor \T7, \T7, \T7 
598 - 599 - /* read the last <16B of AAD. since we have at least 4B of 600 - data right after the AAD (the ICV, and maybe some CT), we can 601 - read 4B/8B blocks safely, and then get rid of the extra stuff */ 602 - .L_get_AAD_rest8\@: 603 - cmp $4, %r11 604 - jle .L_get_AAD_rest4\@ 605 - movq (%r10), \T1 606 - add $8, %r10 607 - sub $8, %r11 608 - vpslldq $8, \T1, \T1 609 - vpsrldq $8, \T7, \T7 610 - vpxor \T1, \T7, \T7 611 - jmp .L_get_AAD_rest8\@ 612 - .L_get_AAD_rest4\@: 613 - test %r11, %r11 614 - jle .L_get_AAD_rest0\@ 615 - mov (%r10), %eax 616 - movq %rax, \T1 617 - add $4, %r10 618 - sub $4, %r11 619 - vpslldq $12, \T1, \T1 620 - vpsrldq $4, \T7, \T7 621 - vpxor \T1, \T7, \T7 622 - .L_get_AAD_rest0\@: 623 - /* finalize: shift out the extra bytes we read, and align 624 - left. since pslldq can only shift by an immediate, we use 625 - vpshufb and a pair of shuffle masks */ 626 - leaq ALL_F(%rip), %r11 627 - subq %r12, %r11 628 - vmovdqu 16(%r11), \T1 629 - andq $~3, %r11 630 - vpshufb (%r11), \T7, \T7 631 - vpand \T1, \T7, \T7 632 - .L_get_AAD_rest_final\@: 633 - vpshufb SHUF_MASK(%rip), \T7, \T7 634 - vpxor \T8, \T7, \T7 635 - \GHASH_MUL \T7, \T2, \T1, \T3, \T4, \T5, \T6 636 - 637 - .L_get_AAD_done\@: 638 - vmovdqu \T7, AadHash(arg2) 639 - .endm 640 - 641 - .macro INIT GHASH_MUL PRECOMPUTE 642 - mov arg6, %r11 643 - mov %r11, AadLen(arg2) # ctx_data.aad_length = aad_length 644 - xor %r11d, %r11d 645 - mov %r11, InLen(arg2) # ctx_data.in_length = 0 646 - 647 - mov %r11, PBlockLen(arg2) # ctx_data.partial_block_length = 0 648 - mov %r11, PBlockEncKey(arg2) # ctx_data.partial_block_enc_key = 0 649 - mov arg3, %rax 650 - movdqu (%rax), %xmm0 651 - movdqu %xmm0, OrigIV(arg2) # ctx_data.orig_IV = iv 652 - 653 - vpshufb SHUF_MASK(%rip), %xmm0, %xmm0 654 - movdqu %xmm0, CurCount(arg2) # ctx_data.current_counter = iv 655 - 656 - vmovdqu (arg4), %xmm6 # xmm6 = HashKey 657 - 658 - vpshufb SHUF_MASK(%rip), %xmm6, %xmm6 659 - ############### PRECOMPUTATION of HashKey<<1 
mod poly from the HashKey 660 - vmovdqa %xmm6, %xmm2 661 - vpsllq $1, %xmm6, %xmm6 662 - vpsrlq $63, %xmm2, %xmm2 663 - vmovdqa %xmm2, %xmm1 664 - vpslldq $8, %xmm2, %xmm2 665 - vpsrldq $8, %xmm1, %xmm1 666 - vpor %xmm2, %xmm6, %xmm6 667 - #reduction 668 - vpshufd $0b00100100, %xmm1, %xmm2 669 - vpcmpeqd TWOONE(%rip), %xmm2, %xmm2 670 - vpand POLY(%rip), %xmm2, %xmm2 671 - vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly 672 - ####################################################################### 673 - vmovdqu %xmm6, HashKey(arg2) # store HashKey<<1 mod poly 674 - 675 - CALC_AAD_HASH \GHASH_MUL, arg5, arg6, %xmm2, %xmm6, %xmm3, %xmm4, %xmm5, %xmm7, %xmm1, %xmm0 676 - 677 - \PRECOMPUTE %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5 678 - .endm 679 - 680 - 681 - # Reads DLEN bytes starting at DPTR and stores in XMMDst 682 - # where 0 < DLEN < 16 683 - # Clobbers %rax, DLEN 684 - .macro READ_PARTIAL_BLOCK DPTR DLEN XMMDst 685 - vpxor \XMMDst, \XMMDst, \XMMDst 686 - 687 - cmp $8, \DLEN 688 - jl .L_read_lt8_\@ 689 - mov (\DPTR), %rax 690 - vpinsrq $0, %rax, \XMMDst, \XMMDst 691 - sub $8, \DLEN 692 - jz .L_done_read_partial_block_\@ 693 - xor %eax, %eax 694 - .L_read_next_byte_\@: 695 - shl $8, %rax 696 - mov 7(\DPTR, \DLEN, 1), %al 697 - dec \DLEN 698 - jnz .L_read_next_byte_\@ 699 - vpinsrq $1, %rax, \XMMDst, \XMMDst 700 - jmp .L_done_read_partial_block_\@ 701 - .L_read_lt8_\@: 702 - xor %eax, %eax 703 - .L_read_next_byte_lt8_\@: 704 - shl $8, %rax 705 - mov -1(\DPTR, \DLEN, 1), %al 706 - dec \DLEN 707 - jnz .L_read_next_byte_lt8_\@ 708 - vpinsrq $0, %rax, \XMMDst, \XMMDst 709 - .L_done_read_partial_block_\@: 710 - .endm 711 - 712 - # PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks 713 - # between update calls. 
714 - # Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK 715 - # Outputs encrypted bytes, and updates hash and partial info in gcm_data_context 716 - # Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13 717 - .macro PARTIAL_BLOCK GHASH_MUL CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \ 718 - AAD_HASH ENC_DEC 719 - mov PBlockLen(arg2), %r13 720 - test %r13, %r13 721 - je .L_partial_block_done_\@ # Leave Macro if no partial blocks 722 - # Read in input data without over reading 723 - cmp $16, \PLAIN_CYPH_LEN 724 - jl .L_fewer_than_16_bytes_\@ 725 - vmovdqu (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm 726 - jmp .L_data_read_\@ 727 - 728 - .L_fewer_than_16_bytes_\@: 729 - lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10 730 - mov \PLAIN_CYPH_LEN, %r12 731 - READ_PARTIAL_BLOCK %r10 %r12 %xmm1 732 - 733 - mov PBlockLen(arg2), %r13 734 - 735 - .L_data_read_\@: # Finished reading in data 736 - 737 - vmovdqu PBlockEncKey(arg2), %xmm9 738 - vmovdqu HashKey(arg2), %xmm13 739 - 740 - lea SHIFT_MASK(%rip), %r12 741 - 742 - # adjust the shuffle mask pointer to be able to shift r13 bytes 743 - # r16-r13 is the number of bytes in plaintext mod 16) 744 - add %r13, %r12 745 - vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask 746 - vpshufb %xmm2, %xmm9, %xmm9 # shift right r13 bytes 747 - 748 - .if \ENC_DEC == DEC 749 - vmovdqa %xmm1, %xmm3 750 - pxor %xmm1, %xmm9 # Ciphertext XOR E(K, Yn) 751 - 752 - mov \PLAIN_CYPH_LEN, %r10 753 - add %r13, %r10 754 - # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling 755 - sub $16, %r10 756 - # Determine if partial block is not being filled and 757 - # shift mask accordingly 758 - jge .L_no_extra_mask_1_\@ 759 - sub %r10, %r12 760 - .L_no_extra_mask_1_\@: 761 - 762 - vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 763 - # get the appropriate mask to mask out bottom r13 bytes of xmm9 764 - vpand %xmm1, %xmm9, %xmm9 # mask out bottom r13 bytes of xmm9 765 - 766 - vpand %xmm1, %xmm3, %xmm3 
767 - vmovdqa SHUF_MASK(%rip), %xmm10 768 - vpshufb %xmm10, %xmm3, %xmm3 769 - vpshufb %xmm2, %xmm3, %xmm3 770 - vpxor %xmm3, \AAD_HASH, \AAD_HASH 771 - 772 - test %r10, %r10 773 - jl .L_partial_incomplete_1_\@ 774 - 775 - # GHASH computation for the last <16 Byte block 776 - \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 777 - xor %eax,%eax 778 - 779 - mov %rax, PBlockLen(arg2) 780 - jmp .L_dec_done_\@ 781 - .L_partial_incomplete_1_\@: 782 - add \PLAIN_CYPH_LEN, PBlockLen(arg2) 783 - .L_dec_done_\@: 784 - vmovdqu \AAD_HASH, AadHash(arg2) 785 - .else 786 - vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) 787 - 788 - mov \PLAIN_CYPH_LEN, %r10 789 - add %r13, %r10 790 - # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling 791 - sub $16, %r10 792 - # Determine if partial block is not being filled and 793 - # shift mask accordingly 794 - jge .L_no_extra_mask_2_\@ 795 - sub %r10, %r12 796 - .L_no_extra_mask_2_\@: 797 - 798 - vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 799 - # get the appropriate mask to mask out bottom r13 bytes of xmm9 800 - vpand %xmm1, %xmm9, %xmm9 801 - 802 - vmovdqa SHUF_MASK(%rip), %xmm1 803 - vpshufb %xmm1, %xmm9, %xmm9 804 - vpshufb %xmm2, %xmm9, %xmm9 805 - vpxor %xmm9, \AAD_HASH, \AAD_HASH 806 - 807 - test %r10, %r10 808 - jl .L_partial_incomplete_2_\@ 809 - 810 - # GHASH computation for the last <16 Byte block 811 - \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 812 - xor %eax,%eax 813 - 814 - mov %rax, PBlockLen(arg2) 815 - jmp .L_encode_done_\@ 816 - .L_partial_incomplete_2_\@: 817 - add \PLAIN_CYPH_LEN, PBlockLen(arg2) 818 - .L_encode_done_\@: 819 - vmovdqu \AAD_HASH, AadHash(arg2) 820 - 821 - vmovdqa SHUF_MASK(%rip), %xmm10 822 - # shuffle xmm9 back to output as ciphertext 823 - vpshufb %xmm10, %xmm9, %xmm9 824 - vpshufb %xmm2, %xmm9, %xmm9 825 - .endif 826 - # output encrypted Bytes 827 - test %r10, %r10 828 - jl .L_partial_fill_\@ 829 - mov %r13, %r12 830 - mov $16, %r13 831 - # Set 
r13 to be the number of bytes to write out 832 - sub %r12, %r13 833 - jmp .L_count_set_\@ 834 - .L_partial_fill_\@: 835 - mov \PLAIN_CYPH_LEN, %r13 836 - .L_count_set_\@: 837 - vmovdqa %xmm9, %xmm0 838 - vmovq %xmm0, %rax 839 - cmp $8, %r13 840 - jle .L_less_than_8_bytes_left_\@ 841 - 842 - mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) 843 - add $8, \DATA_OFFSET 844 - psrldq $8, %xmm0 845 - vmovq %xmm0, %rax 846 - sub $8, %r13 847 - .L_less_than_8_bytes_left_\@: 848 - movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) 849 - add $1, \DATA_OFFSET 850 - shr $8, %rax 851 - sub $1, %r13 852 - jne .L_less_than_8_bytes_left_\@ 853 - .L_partial_block_done_\@: 854 - .endm # PARTIAL_BLOCK 855 - 856 - ############################################################################### 857 - # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) 858 - # Input: A and B (128-bits each, bit-reflected) 859 - # Output: C = A*B*x mod poly, (i.e. >>1 ) 860 - # To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input 861 - # GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. 
862 - ############################################################################### 863 - .macro GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5 864 - 865 - vpshufd $0b01001110, \GH, \T2 866 - vpshufd $0b01001110, \HK, \T3 867 - vpxor \GH , \T2, \T2 # T2 = (a1+a0) 868 - vpxor \HK , \T3, \T3 # T3 = (b1+b0) 869 - 870 - vpclmulqdq $0x11, \HK, \GH, \T1 # T1 = a1*b1 871 - vpclmulqdq $0x00, \HK, \GH, \GH # GH = a0*b0 872 - vpclmulqdq $0x00, \T3, \T2, \T2 # T2 = (a1+a0)*(b1+b0) 873 - vpxor \GH, \T2,\T2 874 - vpxor \T1, \T2,\T2 # T2 = a0*b1+a1*b0 875 - 876 - vpslldq $8, \T2,\T3 # shift-L T3 2 DWs 877 - vpsrldq $8, \T2,\T2 # shift-R T2 2 DWs 878 - vpxor \T3, \GH, \GH 879 - vpxor \T2, \T1, \T1 # <T1:GH> = GH x HK 880 - 881 - #first phase of the reduction 882 - vpslld $31, \GH, \T2 # packed right shifting << 31 883 - vpslld $30, \GH, \T3 # packed right shifting shift << 30 884 - vpslld $25, \GH, \T4 # packed right shifting shift << 25 885 - 886 - vpxor \T3, \T2, \T2 # xor the shifted versions 887 - vpxor \T4, \T2, \T2 888 - 889 - vpsrldq $4, \T2, \T5 # shift-R T5 1 DW 890 - 891 - vpslldq $12, \T2, \T2 # shift-L T2 3 DWs 892 - vpxor \T2, \GH, \GH # first phase of the reduction complete 893 - 894 - #second phase of the reduction 895 - 896 - vpsrld $1,\GH, \T2 # packed left shifting >> 1 897 - vpsrld $2,\GH, \T3 # packed left shifting >> 2 898 - vpsrld $7,\GH, \T4 # packed left shifting >> 7 899 - vpxor \T3, \T2, \T2 # xor the shifted versions 900 - vpxor \T4, \T2, \T2 901 - 902 - vpxor \T5, \T2, \T2 903 - vpxor \T2, \GH, \GH 904 - vpxor \T1, \GH, \GH # the result is in GH 905 - 906 - 907 - .endm 908 - 909 - .macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6 910 - 911 - # Haskey_i_k holds XORed values of the low and high parts of the Haskey_i 912 - vmovdqa \HK, \T5 913 - 914 - vpshufd $0b01001110, \T5, \T1 915 - vpxor \T5, \T1, \T1 916 - vmovdqu \T1, HashKey_k(arg2) 917 - 918 - GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly 919 - vmovdqu \T5, HashKey_2(arg2) # 
[HashKey_2] = HashKey^2<<1 mod poly 920 - vpshufd $0b01001110, \T5, \T1 921 - vpxor \T5, \T1, \T1 922 - vmovdqu \T1, HashKey_2_k(arg2) 923 - 924 - GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly 925 - vmovdqu \T5, HashKey_3(arg2) 926 - vpshufd $0b01001110, \T5, \T1 927 - vpxor \T5, \T1, \T1 928 - vmovdqu \T1, HashKey_3_k(arg2) 929 - 930 - GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly 931 - vmovdqu \T5, HashKey_4(arg2) 932 - vpshufd $0b01001110, \T5, \T1 933 - vpxor \T5, \T1, \T1 934 - vmovdqu \T1, HashKey_4_k(arg2) 935 - 936 - GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly 937 - vmovdqu \T5, HashKey_5(arg2) 938 - vpshufd $0b01001110, \T5, \T1 939 - vpxor \T5, \T1, \T1 940 - vmovdqu \T1, HashKey_5_k(arg2) 941 - 942 - GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly 943 - vmovdqu \T5, HashKey_6(arg2) 944 - vpshufd $0b01001110, \T5, \T1 945 - vpxor \T5, \T1, \T1 946 - vmovdqu \T1, HashKey_6_k(arg2) 947 - 948 - GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly 949 - vmovdqu \T5, HashKey_7(arg2) 950 - vpshufd $0b01001110, \T5, \T1 951 - vpxor \T5, \T1, \T1 952 - vmovdqu \T1, HashKey_7_k(arg2) 953 - 954 - GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly 955 - vmovdqu \T5, HashKey_8(arg2) 956 - vpshufd $0b01001110, \T5, \T1 957 - vpxor \T5, \T1, \T1 958 - vmovdqu \T1, HashKey_8_k(arg2) 959 - 960 - .endm 961 - 962 - ## if a = number of total plaintext bytes 963 - ## b = floor(a/16) 964 - ## num_initial_blocks = b mod 4# 965 - ## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext 966 - ## r10, r11, r12, rax are clobbered 967 - ## arg1, arg2, arg3, arg4 are used as pointers only, not modified 968 - 969 - .macro INITIAL_BLOCKS_AVX REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC 970 - i = (8-\num_initial_blocks) 971 - setreg 972 
- vmovdqu AadHash(arg2), reg_i 973 - 974 - # start AES for num_initial_blocks blocks 975 - vmovdqu CurCount(arg2), \CTR 976 - 977 - i = (9-\num_initial_blocks) 978 - setreg 979 - .rep \num_initial_blocks 980 - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 981 - vmovdqa \CTR, reg_i 982 - vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap 983 - i = (i+1) 984 - setreg 985 - .endr 986 - 987 - vmovdqa (arg1), \T_key 988 - i = (9-\num_initial_blocks) 989 - setreg 990 - .rep \num_initial_blocks 991 - vpxor \T_key, reg_i, reg_i 992 - i = (i+1) 993 - setreg 994 - .endr 995 - 996 - j = 1 997 - setreg 998 - .rep \REP 999 - vmovdqa 16*j(arg1), \T_key 1000 - i = (9-\num_initial_blocks) 1001 - setreg 1002 - .rep \num_initial_blocks 1003 - vaesenc \T_key, reg_i, reg_i 1004 - i = (i+1) 1005 - setreg 1006 - .endr 1007 - 1008 - j = (j+1) 1009 - setreg 1010 - .endr 1011 - 1012 - vmovdqa 16*j(arg1), \T_key 1013 - i = (9-\num_initial_blocks) 1014 - setreg 1015 - .rep \num_initial_blocks 1016 - vaesenclast \T_key, reg_i, reg_i 1017 - i = (i+1) 1018 - setreg 1019 - .endr 1020 - 1021 - i = (9-\num_initial_blocks) 1022 - setreg 1023 - .rep \num_initial_blocks 1024 - vmovdqu (arg4, %r11), \T1 1025 - vpxor \T1, reg_i, reg_i 1026 - vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for num_initial_blocks blocks 1027 - add $16, %r11 1028 - .if \ENC_DEC == DEC 1029 - vmovdqa \T1, reg_i 1030 - .endif 1031 - vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations 1032 - i = (i+1) 1033 - setreg 1034 - .endr 1035 - 1036 - 1037 - i = (8-\num_initial_blocks) 1038 - j = (9-\num_initial_blocks) 1039 - setreg 1040 - 1041 - .rep \num_initial_blocks 1042 - vpxor reg_i, reg_j, reg_j 1043 - GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks 1044 - i = (i+1) 1045 - j = (j+1) 1046 - setreg 1047 - .endr 1048 - # XMM8 has the combined result here 1049 - 1050 - vmovdqa \XMM8, TMP1(%rsp) 1051 - vmovdqa \XMM8, \T3 1052 - 1053 - cmp $128, 
%r13 1054 - jl .L_initial_blocks_done\@ # no need for precomputed constants 1055 - 1056 - ############################################################################### 1057 - # Haskey_i_k holds XORed values of the low and high parts of the Haskey_i 1058 - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 1059 - vmovdqa \CTR, \XMM1 1060 - vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap 1061 - 1062 - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 1063 - vmovdqa \CTR, \XMM2 1064 - vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap 1065 - 1066 - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 1067 - vmovdqa \CTR, \XMM3 1068 - vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap 1069 - 1070 - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 1071 - vmovdqa \CTR, \XMM4 1072 - vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap 1073 - 1074 - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 1075 - vmovdqa \CTR, \XMM5 1076 - vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap 1077 - 1078 - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 1079 - vmovdqa \CTR, \XMM6 1080 - vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap 1081 - 1082 - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 1083 - vmovdqa \CTR, \XMM7 1084 - vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap 1085 - 1086 - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 1087 - vmovdqa \CTR, \XMM8 1088 - vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap 1089 - 1090 - vmovdqa (arg1), \T_key 1091 - vpxor \T_key, \XMM1, \XMM1 1092 - vpxor \T_key, \XMM2, \XMM2 1093 - vpxor \T_key, \XMM3, \XMM3 1094 - vpxor \T_key, \XMM4, \XMM4 1095 - vpxor \T_key, \XMM5, \XMM5 1096 - vpxor \T_key, \XMM6, \XMM6 1097 - vpxor \T_key, \XMM7, \XMM7 1098 - vpxor \T_key, \XMM8, \XMM8 1099 - 1100 - i = 1 1101 - setreg 1102 - .rep \REP # do REP rounds 1103 - vmovdqa 16*i(arg1), \T_key 1104 - vaesenc \T_key, \XMM1, \XMM1 1105 - vaesenc \T_key, \XMM2, \XMM2 1106 - vaesenc \T_key, \XMM3, \XMM3 1107 - vaesenc \T_key, \XMM4, \XMM4 1108 - 
vaesenc \T_key, \XMM5, \XMM5 1109 - vaesenc \T_key, \XMM6, \XMM6 1110 - vaesenc \T_key, \XMM7, \XMM7 1111 - vaesenc \T_key, \XMM8, \XMM8 1112 - i = (i+1) 1113 - setreg 1114 - .endr 1115 - 1116 - vmovdqa 16*i(arg1), \T_key 1117 - vaesenclast \T_key, \XMM1, \XMM1 1118 - vaesenclast \T_key, \XMM2, \XMM2 1119 - vaesenclast \T_key, \XMM3, \XMM3 1120 - vaesenclast \T_key, \XMM4, \XMM4 1121 - vaesenclast \T_key, \XMM5, \XMM5 1122 - vaesenclast \T_key, \XMM6, \XMM6 1123 - vaesenclast \T_key, \XMM7, \XMM7 1124 - vaesenclast \T_key, \XMM8, \XMM8 1125 - 1126 - vmovdqu (arg4, %r11), \T1 1127 - vpxor \T1, \XMM1, \XMM1 1128 - vmovdqu \XMM1, (arg3 , %r11) 1129 - .if \ENC_DEC == DEC 1130 - vmovdqa \T1, \XMM1 1131 - .endif 1132 - 1133 - vmovdqu 16*1(arg4, %r11), \T1 1134 - vpxor \T1, \XMM2, \XMM2 1135 - vmovdqu \XMM2, 16*1(arg3 , %r11) 1136 - .if \ENC_DEC == DEC 1137 - vmovdqa \T1, \XMM2 1138 - .endif 1139 - 1140 - vmovdqu 16*2(arg4, %r11), \T1 1141 - vpxor \T1, \XMM3, \XMM3 1142 - vmovdqu \XMM3, 16*2(arg3 , %r11) 1143 - .if \ENC_DEC == DEC 1144 - vmovdqa \T1, \XMM3 1145 - .endif 1146 - 1147 - vmovdqu 16*3(arg4, %r11), \T1 1148 - vpxor \T1, \XMM4, \XMM4 1149 - vmovdqu \XMM4, 16*3(arg3 , %r11) 1150 - .if \ENC_DEC == DEC 1151 - vmovdqa \T1, \XMM4 1152 - .endif 1153 - 1154 - vmovdqu 16*4(arg4, %r11), \T1 1155 - vpxor \T1, \XMM5, \XMM5 1156 - vmovdqu \XMM5, 16*4(arg3 , %r11) 1157 - .if \ENC_DEC == DEC 1158 - vmovdqa \T1, \XMM5 1159 - .endif 1160 - 1161 - vmovdqu 16*5(arg4, %r11), \T1 1162 - vpxor \T1, \XMM6, \XMM6 1163 - vmovdqu \XMM6, 16*5(arg3 , %r11) 1164 - .if \ENC_DEC == DEC 1165 - vmovdqa \T1, \XMM6 1166 - .endif 1167 - 1168 - vmovdqu 16*6(arg4, %r11), \T1 1169 - vpxor \T1, \XMM7, \XMM7 1170 - vmovdqu \XMM7, 16*6(arg3 , %r11) 1171 - .if \ENC_DEC == DEC 1172 - vmovdqa \T1, \XMM7 1173 - .endif 1174 - 1175 - vmovdqu 16*7(arg4, %r11), \T1 1176 - vpxor \T1, \XMM8, \XMM8 1177 - vmovdqu \XMM8, 16*7(arg3 , %r11) 1178 - .if \ENC_DEC == DEC 1179 - vmovdqa \T1, \XMM8 1180 - .endif 1181 - 
1182 - add $128, %r11 1183 - 1184 - vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap 1185 - vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with the corresponding ciphertext 1186 - vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap 1187 - vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap 1188 - vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap 1189 - vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap 1190 - vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap 1191 - vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap 1192 - vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap 1193 - 1194 - ############################################################################### 1195 - 1196 - .L_initial_blocks_done\@: 1197 - 1198 - .endm 1199 - 1200 - # encrypt 8 blocks at a time 1201 - # ghash the 8 previously encrypted ciphertext blocks 1202 - # arg1, arg2, arg3, arg4 are used as pointers only, not modified 1203 - # r11 is the data offset value 1204 - .macro GHASH_8_ENCRYPT_8_PARALLEL_AVX REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC 1205 - 1206 - vmovdqa \XMM1, \T2 1207 - vmovdqa \XMM2, TMP2(%rsp) 1208 - vmovdqa \XMM3, TMP3(%rsp) 1209 - vmovdqa \XMM4, TMP4(%rsp) 1210 - vmovdqa \XMM5, TMP5(%rsp) 1211 - vmovdqa \XMM6, TMP6(%rsp) 1212 - vmovdqa \XMM7, TMP7(%rsp) 1213 - vmovdqa \XMM8, TMP8(%rsp) 1214 - 1215 - .if \loop_idx == in_order 1216 - vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT 1217 - vpaddd ONE(%rip), \XMM1, \XMM2 1218 - vpaddd ONE(%rip), \XMM2, \XMM3 1219 - vpaddd ONE(%rip), \XMM3, \XMM4 1220 - vpaddd ONE(%rip), \XMM4, \XMM5 1221 - vpaddd ONE(%rip), \XMM5, \XMM6 1222 - vpaddd ONE(%rip), \XMM6, \XMM7 1223 - vpaddd ONE(%rip), \XMM7, \XMM8 1224 - vmovdqa \XMM8, \CTR 1225 - 1226 - vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap 1227 - vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap 1228 - vpshufb SHUF_MASK(%rip), 
\XMM3, \XMM3 # perform a 16Byte swap 1229 - vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap 1230 - vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap 1231 - vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap 1232 - vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap 1233 - vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap 1234 - .else 1235 - vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT 1236 - vpaddd ONEf(%rip), \XMM1, \XMM2 1237 - vpaddd ONEf(%rip), \XMM2, \XMM3 1238 - vpaddd ONEf(%rip), \XMM3, \XMM4 1239 - vpaddd ONEf(%rip), \XMM4, \XMM5 1240 - vpaddd ONEf(%rip), \XMM5, \XMM6 1241 - vpaddd ONEf(%rip), \XMM6, \XMM7 1242 - vpaddd ONEf(%rip), \XMM7, \XMM8 1243 - vmovdqa \XMM8, \CTR 1244 - .endif 1245 - 1246 - 1247 - ####################################################################### 1248 - 1249 - vmovdqu (arg1), \T1 1250 - vpxor \T1, \XMM1, \XMM1 1251 - vpxor \T1, \XMM2, \XMM2 1252 - vpxor \T1, \XMM3, \XMM3 1253 - vpxor \T1, \XMM4, \XMM4 1254 - vpxor \T1, \XMM5, \XMM5 1255 - vpxor \T1, \XMM6, \XMM6 1256 - vpxor \T1, \XMM7, \XMM7 1257 - vpxor \T1, \XMM8, \XMM8 1258 - 1259 - ####################################################################### 1260 - 1261 - 1262 - 1263 - 1264 - 1265 - vmovdqu 16*1(arg1), \T1 1266 - vaesenc \T1, \XMM1, \XMM1 1267 - vaesenc \T1, \XMM2, \XMM2 1268 - vaesenc \T1, \XMM3, \XMM3 1269 - vaesenc \T1, \XMM4, \XMM4 1270 - vaesenc \T1, \XMM5, \XMM5 1271 - vaesenc \T1, \XMM6, \XMM6 1272 - vaesenc \T1, \XMM7, \XMM7 1273 - vaesenc \T1, \XMM8, \XMM8 1274 - 1275 - vmovdqu 16*2(arg1), \T1 1276 - vaesenc \T1, \XMM1, \XMM1 1277 - vaesenc \T1, \XMM2, \XMM2 1278 - vaesenc \T1, \XMM3, \XMM3 1279 - vaesenc \T1, \XMM4, \XMM4 1280 - vaesenc \T1, \XMM5, \XMM5 1281 - vaesenc \T1, \XMM6, \XMM6 1282 - vaesenc \T1, \XMM7, \XMM7 1283 - vaesenc \T1, \XMM8, \XMM8 1284 - 1285 - 1286 - ####################################################################### 1287 - 1288 - vmovdqu HashKey_8(arg2), \T5 1289 - 
vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1 1290 - vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0 1291 - 1292 - vpshufd $0b01001110, \T2, \T6 1293 - vpxor \T2, \T6, \T6 1294 - 1295 - vmovdqu HashKey_8_k(arg2), \T5 1296 - vpclmulqdq $0x00, \T5, \T6, \T6 1297 - 1298 - vmovdqu 16*3(arg1), \T1 1299 - vaesenc \T1, \XMM1, \XMM1 1300 - vaesenc \T1, \XMM2, \XMM2 1301 - vaesenc \T1, \XMM3, \XMM3 1302 - vaesenc \T1, \XMM4, \XMM4 1303 - vaesenc \T1, \XMM5, \XMM5 1304 - vaesenc \T1, \XMM6, \XMM6 1305 - vaesenc \T1, \XMM7, \XMM7 1306 - vaesenc \T1, \XMM8, \XMM8 1307 - 1308 - vmovdqa TMP2(%rsp), \T1 1309 - vmovdqu HashKey_7(arg2), \T5 1310 - vpclmulqdq $0x11, \T5, \T1, \T3 1311 - vpxor \T3, \T4, \T4 1312 - vpclmulqdq $0x00, \T5, \T1, \T3 1313 - vpxor \T3, \T7, \T7 1314 - 1315 - vpshufd $0b01001110, \T1, \T3 1316 - vpxor \T1, \T3, \T3 1317 - vmovdqu HashKey_7_k(arg2), \T5 1318 - vpclmulqdq $0x10, \T5, \T3, \T3 1319 - vpxor \T3, \T6, \T6 1320 - 1321 - vmovdqu 16*4(arg1), \T1 1322 - vaesenc \T1, \XMM1, \XMM1 1323 - vaesenc \T1, \XMM2, \XMM2 1324 - vaesenc \T1, \XMM3, \XMM3 1325 - vaesenc \T1, \XMM4, \XMM4 1326 - vaesenc \T1, \XMM5, \XMM5 1327 - vaesenc \T1, \XMM6, \XMM6 1328 - vaesenc \T1, \XMM7, \XMM7 1329 - vaesenc \T1, \XMM8, \XMM8 1330 - 1331 - ####################################################################### 1332 - 1333 - vmovdqa TMP3(%rsp), \T1 1334 - vmovdqu HashKey_6(arg2), \T5 1335 - vpclmulqdq $0x11, \T5, \T1, \T3 1336 - vpxor \T3, \T4, \T4 1337 - vpclmulqdq $0x00, \T5, \T1, \T3 1338 - vpxor \T3, \T7, \T7 1339 - 1340 - vpshufd $0b01001110, \T1, \T3 1341 - vpxor \T1, \T3, \T3 1342 - vmovdqu HashKey_6_k(arg2), \T5 1343 - vpclmulqdq $0x10, \T5, \T3, \T3 1344 - vpxor \T3, \T6, \T6 1345 - 1346 - vmovdqu 16*5(arg1), \T1 1347 - vaesenc \T1, \XMM1, \XMM1 1348 - vaesenc \T1, \XMM2, \XMM2 1349 - vaesenc \T1, \XMM3, \XMM3 1350 - vaesenc \T1, \XMM4, \XMM4 1351 - vaesenc \T1, \XMM5, \XMM5 1352 - vaesenc \T1, \XMM6, \XMM6 1353 - vaesenc \T1, \XMM7, \XMM7 1354 - vaesenc \T1, \XMM8, 
\XMM8 1355 - 1356 - vmovdqa TMP4(%rsp), \T1 1357 - vmovdqu HashKey_5(arg2), \T5 1358 - vpclmulqdq $0x11, \T5, \T1, \T3 1359 - vpxor \T3, \T4, \T4 1360 - vpclmulqdq $0x00, \T5, \T1, \T3 1361 - vpxor \T3, \T7, \T7 1362 - 1363 - vpshufd $0b01001110, \T1, \T3 1364 - vpxor \T1, \T3, \T3 1365 - vmovdqu HashKey_5_k(arg2), \T5 1366 - vpclmulqdq $0x10, \T5, \T3, \T3 1367 - vpxor \T3, \T6, \T6 1368 - 1369 - vmovdqu 16*6(arg1), \T1 1370 - vaesenc \T1, \XMM1, \XMM1 1371 - vaesenc \T1, \XMM2, \XMM2 1372 - vaesenc \T1, \XMM3, \XMM3 1373 - vaesenc \T1, \XMM4, \XMM4 1374 - vaesenc \T1, \XMM5, \XMM5 1375 - vaesenc \T1, \XMM6, \XMM6 1376 - vaesenc \T1, \XMM7, \XMM7 1377 - vaesenc \T1, \XMM8, \XMM8 1378 - 1379 - 1380 - vmovdqa TMP5(%rsp), \T1 1381 - vmovdqu HashKey_4(arg2), \T5 1382 - vpclmulqdq $0x11, \T5, \T1, \T3 1383 - vpxor \T3, \T4, \T4 1384 - vpclmulqdq $0x00, \T5, \T1, \T3 1385 - vpxor \T3, \T7, \T7 1386 - 1387 - vpshufd $0b01001110, \T1, \T3 1388 - vpxor \T1, \T3, \T3 1389 - vmovdqu HashKey_4_k(arg2), \T5 1390 - vpclmulqdq $0x10, \T5, \T3, \T3 1391 - vpxor \T3, \T6, \T6 1392 - 1393 - vmovdqu 16*7(arg1), \T1 1394 - vaesenc \T1, \XMM1, \XMM1 1395 - vaesenc \T1, \XMM2, \XMM2 1396 - vaesenc \T1, \XMM3, \XMM3 1397 - vaesenc \T1, \XMM4, \XMM4 1398 - vaesenc \T1, \XMM5, \XMM5 1399 - vaesenc \T1, \XMM6, \XMM6 1400 - vaesenc \T1, \XMM7, \XMM7 1401 - vaesenc \T1, \XMM8, \XMM8 1402 - 1403 - vmovdqa TMP6(%rsp), \T1 1404 - vmovdqu HashKey_3(arg2), \T5 1405 - vpclmulqdq $0x11, \T5, \T1, \T3 1406 - vpxor \T3, \T4, \T4 1407 - vpclmulqdq $0x00, \T5, \T1, \T3 1408 - vpxor \T3, \T7, \T7 1409 - 1410 - vpshufd $0b01001110, \T1, \T3 1411 - vpxor \T1, \T3, \T3 1412 - vmovdqu HashKey_3_k(arg2), \T5 1413 - vpclmulqdq $0x10, \T5, \T3, \T3 1414 - vpxor \T3, \T6, \T6 1415 - 1416 - 1417 - vmovdqu 16*8(arg1), \T1 1418 - vaesenc \T1, \XMM1, \XMM1 1419 - vaesenc \T1, \XMM2, \XMM2 1420 - vaesenc \T1, \XMM3, \XMM3 1421 - vaesenc \T1, \XMM4, \XMM4 1422 - vaesenc \T1, \XMM5, \XMM5 1423 - vaesenc \T1, \XMM6, 
\XMM6 1424 - vaesenc \T1, \XMM7, \XMM7 1425 - vaesenc \T1, \XMM8, \XMM8 1426 - 1427 - vmovdqa TMP7(%rsp), \T1 1428 - vmovdqu HashKey_2(arg2), \T5 1429 - vpclmulqdq $0x11, \T5, \T1, \T3 1430 - vpxor \T3, \T4, \T4 1431 - vpclmulqdq $0x00, \T5, \T1, \T3 1432 - vpxor \T3, \T7, \T7 1433 - 1434 - vpshufd $0b01001110, \T1, \T3 1435 - vpxor \T1, \T3, \T3 1436 - vmovdqu HashKey_2_k(arg2), \T5 1437 - vpclmulqdq $0x10, \T5, \T3, \T3 1438 - vpxor \T3, \T6, \T6 1439 - 1440 - ####################################################################### 1441 - 1442 - vmovdqu 16*9(arg1), \T5 1443 - vaesenc \T5, \XMM1, \XMM1 1444 - vaesenc \T5, \XMM2, \XMM2 1445 - vaesenc \T5, \XMM3, \XMM3 1446 - vaesenc \T5, \XMM4, \XMM4 1447 - vaesenc \T5, \XMM5, \XMM5 1448 - vaesenc \T5, \XMM6, \XMM6 1449 - vaesenc \T5, \XMM7, \XMM7 1450 - vaesenc \T5, \XMM8, \XMM8 1451 - 1452 - vmovdqa TMP8(%rsp), \T1 1453 - vmovdqu HashKey(arg2), \T5 1454 - vpclmulqdq $0x11, \T5, \T1, \T3 1455 - vpxor \T3, \T4, \T4 1456 - vpclmulqdq $0x00, \T5, \T1, \T3 1457 - vpxor \T3, \T7, \T7 1458 - 1459 - vpshufd $0b01001110, \T1, \T3 1460 - vpxor \T1, \T3, \T3 1461 - vmovdqu HashKey_k(arg2), \T5 1462 - vpclmulqdq $0x10, \T5, \T3, \T3 1463 - vpxor \T3, \T6, \T6 1464 - 1465 - vpxor \T4, \T6, \T6 1466 - vpxor \T7, \T6, \T6 1467 - 1468 - vmovdqu 16*10(arg1), \T5 1469 - 1470 - i = 11 1471 - setreg 1472 - .rep (\REP-9) 1473 - 1474 - vaesenc \T5, \XMM1, \XMM1 1475 - vaesenc \T5, \XMM2, \XMM2 1476 - vaesenc \T5, \XMM3, \XMM3 1477 - vaesenc \T5, \XMM4, \XMM4 1478 - vaesenc \T5, \XMM5, \XMM5 1479 - vaesenc \T5, \XMM6, \XMM6 1480 - vaesenc \T5, \XMM7, \XMM7 1481 - vaesenc \T5, \XMM8, \XMM8 1482 - 1483 - vmovdqu 16*i(arg1), \T5 1484 - i = i + 1 1485 - setreg 1486 - .endr 1487 - 1488 - i = 0 1489 - j = 1 1490 - setreg 1491 - .rep 8 1492 - vpxor 16*i(arg4, %r11), \T5, \T2 1493 - .if \ENC_DEC == ENC 1494 - vaesenclast \T2, reg_j, reg_j 1495 - .else 1496 - vaesenclast \T2, reg_j, \T3 1497 - vmovdqu 16*i(arg4, %r11), reg_j 1498 - vmovdqu \T3, 
16*i(arg3, %r11) 1499 - .endif 1500 - i = (i+1) 1501 - j = (j+1) 1502 - setreg 1503 - .endr 1504 - ####################################################################### 1505 - 1506 - 1507 - vpslldq $8, \T6, \T3 # shift-L T3 2 DWs 1508 - vpsrldq $8, \T6, \T6 # shift-R T2 2 DWs 1509 - vpxor \T3, \T7, \T7 1510 - vpxor \T4, \T6, \T6 # accumulate the results in T6:T7 1511 - 1512 - 1513 - 1514 - ####################################################################### 1515 - #first phase of the reduction 1516 - ####################################################################### 1517 - vpslld $31, \T7, \T2 # packed right shifting << 31 1518 - vpslld $30, \T7, \T3 # packed right shifting shift << 30 1519 - vpslld $25, \T7, \T4 # packed right shifting shift << 25 1520 - 1521 - vpxor \T3, \T2, \T2 # xor the shifted versions 1522 - vpxor \T4, \T2, \T2 1523 - 1524 - vpsrldq $4, \T2, \T1 # shift-R T1 1 DW 1525 - 1526 - vpslldq $12, \T2, \T2 # shift-L T2 3 DWs 1527 - vpxor \T2, \T7, \T7 # first phase of the reduction complete 1528 - ####################################################################### 1529 - .if \ENC_DEC == ENC 1530 - vmovdqu \XMM1, 16*0(arg3,%r11) # Write to the Ciphertext buffer 1531 - vmovdqu \XMM2, 16*1(arg3,%r11) # Write to the Ciphertext buffer 1532 - vmovdqu \XMM3, 16*2(arg3,%r11) # Write to the Ciphertext buffer 1533 - vmovdqu \XMM4, 16*3(arg3,%r11) # Write to the Ciphertext buffer 1534 - vmovdqu \XMM5, 16*4(arg3,%r11) # Write to the Ciphertext buffer 1535 - vmovdqu \XMM6, 16*5(arg3,%r11) # Write to the Ciphertext buffer 1536 - vmovdqu \XMM7, 16*6(arg3,%r11) # Write to the Ciphertext buffer 1537 - vmovdqu \XMM8, 16*7(arg3,%r11) # Write to the Ciphertext buffer 1538 - .endif 1539 - 1540 - ####################################################################### 1541 - #second phase of the reduction 1542 - vpsrld $1, \T7, \T2 # packed left shifting >> 1 1543 - vpsrld $2, \T7, \T3 # packed left shifting >> 2 1544 - vpsrld $7, \T7, \T4 # packed left 
shifting >> 7 1545 - vpxor \T3, \T2, \T2 # xor the shifted versions 1546 - vpxor \T4, \T2, \T2 1547 - 1548 - vpxor \T1, \T2, \T2 1549 - vpxor \T2, \T7, \T7 1550 - vpxor \T7, \T6, \T6 # the result is in T6 1551 - ####################################################################### 1552 - 1553 - vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap 1554 - vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap 1555 - vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap 1556 - vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap 1557 - vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap 1558 - vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap 1559 - vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap 1560 - vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap 1561 - 1562 - 1563 - vpxor \T6, \XMM1, \XMM1 1564 - 1565 - 1566 - 1567 - .endm 1568 - 1569 - 1570 - # GHASH the last 4 ciphertext blocks. 1571 - .macro GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 1572 - 1573 - ## Karatsuba Method 1574 - 1575 - 1576 - vpshufd $0b01001110, \XMM1, \T2 1577 - vpxor \XMM1, \T2, \T2 1578 - vmovdqu HashKey_8(arg2), \T5 1579 - vpclmulqdq $0x11, \T5, \XMM1, \T6 1580 - vpclmulqdq $0x00, \T5, \XMM1, \T7 1581 - 1582 - vmovdqu HashKey_8_k(arg2), \T3 1583 - vpclmulqdq $0x00, \T3, \T2, \XMM1 1584 - 1585 - ###################### 1586 - 1587 - vpshufd $0b01001110, \XMM2, \T2 1588 - vpxor \XMM2, \T2, \T2 1589 - vmovdqu HashKey_7(arg2), \T5 1590 - vpclmulqdq $0x11, \T5, \XMM2, \T4 1591 - vpxor \T4, \T6, \T6 1592 - 1593 - vpclmulqdq $0x00, \T5, \XMM2, \T4 1594 - vpxor \T4, \T7, \T7 1595 - 1596 - vmovdqu HashKey_7_k(arg2), \T3 1597 - vpclmulqdq $0x00, \T3, \T2, \T2 1598 - vpxor \T2, \XMM1, \XMM1 1599 - 1600 - ###################### 1601 - 1602 - vpshufd $0b01001110, \XMM3, \T2 1603 - vpxor \XMM3, \T2, \T2 1604 - vmovdqu HashKey_6(arg2), \T5 1605 - vpclmulqdq $0x11, \T5, \XMM3, \T4 1606 - 
vpxor \T4, \T6, \T6 1607 - 1608 - vpclmulqdq $0x00, \T5, \XMM3, \T4 1609 - vpxor \T4, \T7, \T7 1610 - 1611 - vmovdqu HashKey_6_k(arg2), \T3 1612 - vpclmulqdq $0x00, \T3, \T2, \T2 1613 - vpxor \T2, \XMM1, \XMM1 1614 - 1615 - ###################### 1616 - 1617 - vpshufd $0b01001110, \XMM4, \T2 1618 - vpxor \XMM4, \T2, \T2 1619 - vmovdqu HashKey_5(arg2), \T5 1620 - vpclmulqdq $0x11, \T5, \XMM4, \T4 1621 - vpxor \T4, \T6, \T6 1622 - 1623 - vpclmulqdq $0x00, \T5, \XMM4, \T4 1624 - vpxor \T4, \T7, \T7 1625 - 1626 - vmovdqu HashKey_5_k(arg2), \T3 1627 - vpclmulqdq $0x00, \T3, \T2, \T2 1628 - vpxor \T2, \XMM1, \XMM1 1629 - 1630 - ###################### 1631 - 1632 - vpshufd $0b01001110, \XMM5, \T2 1633 - vpxor \XMM5, \T2, \T2 1634 - vmovdqu HashKey_4(arg2), \T5 1635 - vpclmulqdq $0x11, \T5, \XMM5, \T4 1636 - vpxor \T4, \T6, \T6 1637 - 1638 - vpclmulqdq $0x00, \T5, \XMM5, \T4 1639 - vpxor \T4, \T7, \T7 1640 - 1641 - vmovdqu HashKey_4_k(arg2), \T3 1642 - vpclmulqdq $0x00, \T3, \T2, \T2 1643 - vpxor \T2, \XMM1, \XMM1 1644 - 1645 - ###################### 1646 - 1647 - vpshufd $0b01001110, \XMM6, \T2 1648 - vpxor \XMM6, \T2, \T2 1649 - vmovdqu HashKey_3(arg2), \T5 1650 - vpclmulqdq $0x11, \T5, \XMM6, \T4 1651 - vpxor \T4, \T6, \T6 1652 - 1653 - vpclmulqdq $0x00, \T5, \XMM6, \T4 1654 - vpxor \T4, \T7, \T7 1655 - 1656 - vmovdqu HashKey_3_k(arg2), \T3 1657 - vpclmulqdq $0x00, \T3, \T2, \T2 1658 - vpxor \T2, \XMM1, \XMM1 1659 - 1660 - ###################### 1661 - 1662 - vpshufd $0b01001110, \XMM7, \T2 1663 - vpxor \XMM7, \T2, \T2 1664 - vmovdqu HashKey_2(arg2), \T5 1665 - vpclmulqdq $0x11, \T5, \XMM7, \T4 1666 - vpxor \T4, \T6, \T6 1667 - 1668 - vpclmulqdq $0x00, \T5, \XMM7, \T4 1669 - vpxor \T4, \T7, \T7 1670 - 1671 - vmovdqu HashKey_2_k(arg2), \T3 1672 - vpclmulqdq $0x00, \T3, \T2, \T2 1673 - vpxor \T2, \XMM1, \XMM1 1674 - 1675 - ###################### 1676 - 1677 - vpshufd $0b01001110, \XMM8, \T2 1678 - vpxor \XMM8, \T2, \T2 1679 - vmovdqu HashKey(arg2), \T5 1680 - vpclmulqdq 
$0x11, \T5, \XMM8, \T4 1681 - vpxor \T4, \T6, \T6 1682 - 1683 - vpclmulqdq $0x00, \T5, \XMM8, \T4 1684 - vpxor \T4, \T7, \T7 1685 - 1686 - vmovdqu HashKey_k(arg2), \T3 1687 - vpclmulqdq $0x00, \T3, \T2, \T2 1688 - 1689 - vpxor \T2, \XMM1, \XMM1 1690 - vpxor \T6, \XMM1, \XMM1 1691 - vpxor \T7, \XMM1, \T2 1692 - 1693 - 1694 - 1695 - 1696 - vpslldq $8, \T2, \T4 1697 - vpsrldq $8, \T2, \T2 1698 - 1699 - vpxor \T4, \T7, \T7 1700 - vpxor \T2, \T6, \T6 # <T6:T7> holds the result of 1701 - # the accumulated carry-less multiplications 1702 - 1703 - ####################################################################### 1704 - #first phase of the reduction 1705 - vpslld $31, \T7, \T2 # packed right shifting << 31 1706 - vpslld $30, \T7, \T3 # packed right shifting shift << 30 1707 - vpslld $25, \T7, \T4 # packed right shifting shift << 25 1708 - 1709 - vpxor \T3, \T2, \T2 # xor the shifted versions 1710 - vpxor \T4, \T2, \T2 1711 - 1712 - vpsrldq $4, \T2, \T1 # shift-R T1 1 DW 1713 - 1714 - vpslldq $12, \T2, \T2 # shift-L T2 3 DWs 1715 - vpxor \T2, \T7, \T7 # first phase of the reduction complete 1716 - ####################################################################### 1717 - 1718 - 1719 - #second phase of the reduction 1720 - vpsrld $1, \T7, \T2 # packed left shifting >> 1 1721 - vpsrld $2, \T7, \T3 # packed left shifting >> 2 1722 - vpsrld $7, \T7, \T4 # packed left shifting >> 7 1723 - vpxor \T3, \T2, \T2 # xor the shifted versions 1724 - vpxor \T4, \T2, \T2 1725 - 1726 - vpxor \T1, \T2, \T2 1727 - vpxor \T2, \T7, \T7 1728 - vpxor \T7, \T6, \T6 # the result is in T6 1729 - 1730 - .endm 1731 - 1732 - ############################################################# 1733 - #void aesni_gcm_precomp_avx_gen2 1734 - # (gcm_data *my_ctx_data, 1735 - # gcm_context_data *data, 1736 - # u8 *hash_subkey# /* H, the Hash sub key input. Data starts on a 16-byte boundary. 
*/ 1737 - # u8 *iv, /* Pre-counter block j0: 4 byte salt 1738 - # (from Security Association) concatenated with 8 byte 1739 - # Initialisation Vector (from IPSec ESP Payload) 1740 - # concatenated with 0x00000001. 16-byte aligned pointer. */ 1741 - # const u8 *aad, /* Additional Authentication Data (AAD)*/ 1742 - # u64 aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ 1743 - ############################################################# 1744 - SYM_FUNC_START(aesni_gcm_init_avx_gen2) 1745 - FUNC_SAVE 1746 - INIT GHASH_MUL_AVX, PRECOMPUTE_AVX 1747 - FUNC_RESTORE 1748 - RET 1749 - SYM_FUNC_END(aesni_gcm_init_avx_gen2) 1750 - 1751 - ############################################################################### 1752 - #void aesni_gcm_enc_update_avx_gen2( 1753 - # gcm_data *my_ctx_data, /* aligned to 16 Bytes */ 1754 - # gcm_context_data *data, 1755 - # u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */ 1756 - # const u8 *in, /* Plaintext input */ 1757 - # u64 plaintext_len) /* Length of data in Bytes for encryption. 
*/ 1758 - ############################################################################### 1759 - SYM_FUNC_START(aesni_gcm_enc_update_avx_gen2) 1760 - FUNC_SAVE 1761 - mov keysize, %eax 1762 - cmp $32, %eax 1763 - je key_256_enc_update 1764 - cmp $16, %eax 1765 - je key_128_enc_update 1766 - # must be 192 1767 - GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 11 1768 - FUNC_RESTORE 1769 - RET 1770 - key_128_enc_update: 1771 - GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 9 1772 - FUNC_RESTORE 1773 - RET 1774 - key_256_enc_update: 1775 - GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 13 1776 - FUNC_RESTORE 1777 - RET 1778 - SYM_FUNC_END(aesni_gcm_enc_update_avx_gen2) 1779 - 1780 - ############################################################################### 1781 - #void aesni_gcm_dec_update_avx_gen2( 1782 - # gcm_data *my_ctx_data, /* aligned to 16 Bytes */ 1783 - # gcm_context_data *data, 1784 - # u8 *out, /* Plaintext output. Decrypt in-place is allowed. */ 1785 - # const u8 *in, /* Ciphertext input */ 1786 - # u64 plaintext_len) /* Length of data in Bytes for encryption. 
*/ 1787 - ############################################################################### 1788 - SYM_FUNC_START(aesni_gcm_dec_update_avx_gen2) 1789 - FUNC_SAVE 1790 - mov keysize,%eax 1791 - cmp $32, %eax 1792 - je key_256_dec_update 1793 - cmp $16, %eax 1794 - je key_128_dec_update 1795 - # must be 192 1796 - GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 11 1797 - FUNC_RESTORE 1798 - RET 1799 - key_128_dec_update: 1800 - GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 9 1801 - FUNC_RESTORE 1802 - RET 1803 - key_256_dec_update: 1804 - GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 13 1805 - FUNC_RESTORE 1806 - RET 1807 - SYM_FUNC_END(aesni_gcm_dec_update_avx_gen2) 1808 - 1809 - ############################################################################### 1810 - #void aesni_gcm_finalize_avx_gen2( 1811 - # gcm_data *my_ctx_data, /* aligned to 16 Bytes */ 1812 - # gcm_context_data *data, 1813 - # u8 *auth_tag, /* Authenticated Tag output. */ 1814 - # u64 auth_tag_len)# /* Authenticated Tag Length in bytes. 1815 - # Valid values are 16 (most likely), 12 or 8. 
*/ 1816 - ############################################################################### 1817 - SYM_FUNC_START(aesni_gcm_finalize_avx_gen2) 1818 - FUNC_SAVE 1819 - mov keysize,%eax 1820 - cmp $32, %eax 1821 - je key_256_finalize 1822 - cmp $16, %eax 1823 - je key_128_finalize 1824 - # must be 192 1825 - GCM_COMPLETE GHASH_MUL_AVX, 11, arg3, arg4 1826 - FUNC_RESTORE 1827 - RET 1828 - key_128_finalize: 1829 - GCM_COMPLETE GHASH_MUL_AVX, 9, arg3, arg4 1830 - FUNC_RESTORE 1831 - RET 1832 - key_256_finalize: 1833 - GCM_COMPLETE GHASH_MUL_AVX, 13, arg3, arg4 1834 - FUNC_RESTORE 1835 - RET 1836 - SYM_FUNC_END(aesni_gcm_finalize_avx_gen2) 1837 - 1838 - ############################################################################### 1839 - # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) 1840 - # Input: A and B (128-bits each, bit-reflected) 1841 - # Output: C = A*B*x mod poly, (i.e. >>1 ) 1842 - # To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input 1843 - # GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. 
1844 - ############################################################################### 1845 - .macro GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5 1846 - 1847 - vpclmulqdq $0x11,\HK,\GH,\T1 # T1 = a1*b1 1848 - vpclmulqdq $0x00,\HK,\GH,\T2 # T2 = a0*b0 1849 - vpclmulqdq $0x01,\HK,\GH,\T3 # T3 = a1*b0 1850 - vpclmulqdq $0x10,\HK,\GH,\GH # GH = a0*b1 1851 - vpxor \T3, \GH, \GH 1852 - 1853 - 1854 - vpsrldq $8 , \GH, \T3 # shift-R GH 2 DWs 1855 - vpslldq $8 , \GH, \GH # shift-L GH 2 DWs 1856 - 1857 - vpxor \T3, \T1, \T1 1858 - vpxor \T2, \GH, \GH 1859 - 1860 - ####################################################################### 1861 - #first phase of the reduction 1862 - vmovdqa POLY2(%rip), \T3 1863 - 1864 - vpclmulqdq $0x01, \GH, \T3, \T2 1865 - vpslldq $8, \T2, \T2 # shift-L T2 2 DWs 1866 - 1867 - vpxor \T2, \GH, \GH # first phase of the reduction complete 1868 - ####################################################################### 1869 - #second phase of the reduction 1870 - vpclmulqdq $0x00, \GH, \T3, \T2 1871 - vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) 1872 - 1873 - vpclmulqdq $0x10, \GH, \T3, \GH 1874 - vpslldq $4, \GH, \GH # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts) 1875 - 1876 - vpxor \T2, \GH, \GH # second phase of the reduction complete 1877 - ####################################################################### 1878 - vpxor \T1, \GH, \GH # the result is in GH 1879 - 1880 - 1881 - .endm 1882 - 1883 - .macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6 1884 - 1885 - # Haskey_i_k holds XORed values of the low and high parts of the Haskey_i 1886 - vmovdqa \HK, \T5 1887 - GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly 1888 - vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly 1889 - 1890 - GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly 1891 - vmovdqu \T5, HashKey_3(arg2) 1892 - 1893 - GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 
= HashKey^4<<1 mod poly 1894 - vmovdqu \T5, HashKey_4(arg2) 1895 - 1896 - GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly 1897 - vmovdqu \T5, HashKey_5(arg2) 1898 - 1899 - GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly 1900 - vmovdqu \T5, HashKey_6(arg2) 1901 - 1902 - GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly 1903 - vmovdqu \T5, HashKey_7(arg2) 1904 - 1905 - GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly 1906 - vmovdqu \T5, HashKey_8(arg2) 1907 - 1908 - .endm 1909 - 1910 - ## if a = number of total plaintext bytes 1911 - ## b = floor(a/16) 1912 - ## num_initial_blocks = b mod 4# 1913 - ## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext 1914 - ## r10, r11, r12, rax are clobbered 1915 - ## arg1, arg2, arg3, arg4 are used as pointers only, not modified 1916 - 1917 - .macro INITIAL_BLOCKS_AVX2 REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER 1918 - i = (8-\num_initial_blocks) 1919 - setreg 1920 - vmovdqu AadHash(arg2), reg_i 1921 - 1922 - # start AES for num_initial_blocks blocks 1923 - vmovdqu CurCount(arg2), \CTR 1924 - 1925 - i = (9-\num_initial_blocks) 1926 - setreg 1927 - .rep \num_initial_blocks 1928 - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 1929 - vmovdqa \CTR, reg_i 1930 - vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap 1931 - i = (i+1) 1932 - setreg 1933 - .endr 1934 - 1935 - vmovdqa (arg1), \T_key 1936 - i = (9-\num_initial_blocks) 1937 - setreg 1938 - .rep \num_initial_blocks 1939 - vpxor \T_key, reg_i, reg_i 1940 - i = (i+1) 1941 - setreg 1942 - .endr 1943 - 1944 - j = 1 1945 - setreg 1946 - .rep \REP 1947 - vmovdqa 16*j(arg1), \T_key 1948 - i = (9-\num_initial_blocks) 1949 - setreg 1950 - .rep \num_initial_blocks 1951 - vaesenc \T_key, reg_i, reg_i 1952 - i = (i+1) 1953 - setreg 1954 - .endr 1955 - 1956 - j = (j+1) 1957 - setreg 1958 
- .endr 1959 - 1960 - 1961 - vmovdqa 16*j(arg1), \T_key 1962 - i = (9-\num_initial_blocks) 1963 - setreg 1964 - .rep \num_initial_blocks 1965 - vaesenclast \T_key, reg_i, reg_i 1966 - i = (i+1) 1967 - setreg 1968 - .endr 1969 - 1970 - i = (9-\num_initial_blocks) 1971 - setreg 1972 - .rep \num_initial_blocks 1973 - vmovdqu (arg4, %r11), \T1 1974 - vpxor \T1, reg_i, reg_i 1975 - vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for 1976 - # num_initial_blocks blocks 1977 - add $16, %r11 1978 - .if \ENC_DEC == DEC 1979 - vmovdqa \T1, reg_i 1980 - .endif 1981 - vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations 1982 - i = (i+1) 1983 - setreg 1984 - .endr 1985 - 1986 - 1987 - i = (8-\num_initial_blocks) 1988 - j = (9-\num_initial_blocks) 1989 - setreg 1990 - 1991 - .rep \num_initial_blocks 1992 - vpxor reg_i, reg_j, reg_j 1993 - GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks 1994 - i = (i+1) 1995 - j = (j+1) 1996 - setreg 1997 - .endr 1998 - # XMM8 has the combined result here 1999 - 2000 - vmovdqa \XMM8, TMP1(%rsp) 2001 - vmovdqa \XMM8, \T3 2002 - 2003 - cmp $128, %r13 2004 - jl .L_initial_blocks_done\@ # no need for precomputed constants 2005 - 2006 - ############################################################################### 2007 - # Haskey_i_k holds XORed values of the low and high parts of the Haskey_i 2008 - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 2009 - vmovdqa \CTR, \XMM1 2010 - vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap 2011 - 2012 - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 2013 - vmovdqa \CTR, \XMM2 2014 - vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap 2015 - 2016 - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 2017 - vmovdqa \CTR, \XMM3 2018 - vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap 2019 - 2020 - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 2021 - vmovdqa \CTR, \XMM4 2022 - vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap 
2023 - 2024 - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 2025 - vmovdqa \CTR, \XMM5 2026 - vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap 2027 - 2028 - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 2029 - vmovdqa \CTR, \XMM6 2030 - vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap 2031 - 2032 - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 2033 - vmovdqa \CTR, \XMM7 2034 - vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap 2035 - 2036 - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 2037 - vmovdqa \CTR, \XMM8 2038 - vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap 2039 - 2040 - vmovdqa (arg1), \T_key 2041 - vpxor \T_key, \XMM1, \XMM1 2042 - vpxor \T_key, \XMM2, \XMM2 2043 - vpxor \T_key, \XMM3, \XMM3 2044 - vpxor \T_key, \XMM4, \XMM4 2045 - vpxor \T_key, \XMM5, \XMM5 2046 - vpxor \T_key, \XMM6, \XMM6 2047 - vpxor \T_key, \XMM7, \XMM7 2048 - vpxor \T_key, \XMM8, \XMM8 2049 - 2050 - i = 1 2051 - setreg 2052 - .rep \REP # do REP rounds 2053 - vmovdqa 16*i(arg1), \T_key 2054 - vaesenc \T_key, \XMM1, \XMM1 2055 - vaesenc \T_key, \XMM2, \XMM2 2056 - vaesenc \T_key, \XMM3, \XMM3 2057 - vaesenc \T_key, \XMM4, \XMM4 2058 - vaesenc \T_key, \XMM5, \XMM5 2059 - vaesenc \T_key, \XMM6, \XMM6 2060 - vaesenc \T_key, \XMM7, \XMM7 2061 - vaesenc \T_key, \XMM8, \XMM8 2062 - i = (i+1) 2063 - setreg 2064 - .endr 2065 - 2066 - 2067 - vmovdqa 16*i(arg1), \T_key 2068 - vaesenclast \T_key, \XMM1, \XMM1 2069 - vaesenclast \T_key, \XMM2, \XMM2 2070 - vaesenclast \T_key, \XMM3, \XMM3 2071 - vaesenclast \T_key, \XMM4, \XMM4 2072 - vaesenclast \T_key, \XMM5, \XMM5 2073 - vaesenclast \T_key, \XMM6, \XMM6 2074 - vaesenclast \T_key, \XMM7, \XMM7 2075 - vaesenclast \T_key, \XMM8, \XMM8 2076 - 2077 - vmovdqu (arg4, %r11), \T1 2078 - vpxor \T1, \XMM1, \XMM1 2079 - vmovdqu \XMM1, (arg3 , %r11) 2080 - .if \ENC_DEC == DEC 2081 - vmovdqa \T1, \XMM1 2082 - .endif 2083 - 2084 - vmovdqu 16*1(arg4, %r11), \T1 2085 - vpxor \T1, \XMM2, \XMM2 2086 - vmovdqu \XMM2, 16*1(arg3 , %r11) 2087 
- .if \ENC_DEC == DEC 2088 - vmovdqa \T1, \XMM2 2089 - .endif 2090 - 2091 - vmovdqu 16*2(arg4, %r11), \T1 2092 - vpxor \T1, \XMM3, \XMM3 2093 - vmovdqu \XMM3, 16*2(arg3 , %r11) 2094 - .if \ENC_DEC == DEC 2095 - vmovdqa \T1, \XMM3 2096 - .endif 2097 - 2098 - vmovdqu 16*3(arg4, %r11), \T1 2099 - vpxor \T1, \XMM4, \XMM4 2100 - vmovdqu \XMM4, 16*3(arg3 , %r11) 2101 - .if \ENC_DEC == DEC 2102 - vmovdqa \T1, \XMM4 2103 - .endif 2104 - 2105 - vmovdqu 16*4(arg4, %r11), \T1 2106 - vpxor \T1, \XMM5, \XMM5 2107 - vmovdqu \XMM5, 16*4(arg3 , %r11) 2108 - .if \ENC_DEC == DEC 2109 - vmovdqa \T1, \XMM5 2110 - .endif 2111 - 2112 - vmovdqu 16*5(arg4, %r11), \T1 2113 - vpxor \T1, \XMM6, \XMM6 2114 - vmovdqu \XMM6, 16*5(arg3 , %r11) 2115 - .if \ENC_DEC == DEC 2116 - vmovdqa \T1, \XMM6 2117 - .endif 2118 - 2119 - vmovdqu 16*6(arg4, %r11), \T1 2120 - vpxor \T1, \XMM7, \XMM7 2121 - vmovdqu \XMM7, 16*6(arg3 , %r11) 2122 - .if \ENC_DEC == DEC 2123 - vmovdqa \T1, \XMM7 2124 - .endif 2125 - 2126 - vmovdqu 16*7(arg4, %r11), \T1 2127 - vpxor \T1, \XMM8, \XMM8 2128 - vmovdqu \XMM8, 16*7(arg3 , %r11) 2129 - .if \ENC_DEC == DEC 2130 - vmovdqa \T1, \XMM8 2131 - .endif 2132 - 2133 - add $128, %r11 2134 - 2135 - vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap 2136 - vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with 2137 - # the corresponding ciphertext 2138 - vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap 2139 - vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap 2140 - vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap 2141 - vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap 2142 - vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap 2143 - vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap 2144 - vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap 2145 - 2146 - ############################################################################### 2147 - 2148 - .L_initial_blocks_done\@: 2149 - 2150 - 2151 
- .endm 2152 - 2153 - 2154 - 2155 - # encrypt 8 blocks at a time 2156 - # ghash the 8 previously encrypted ciphertext blocks 2157 - # arg1, arg2, arg3, arg4 are used as pointers only, not modified 2158 - # r11 is the data offset value 2159 - .macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC 2160 - 2161 - vmovdqa \XMM1, \T2 2162 - vmovdqa \XMM2, TMP2(%rsp) 2163 - vmovdqa \XMM3, TMP3(%rsp) 2164 - vmovdqa \XMM4, TMP4(%rsp) 2165 - vmovdqa \XMM5, TMP5(%rsp) 2166 - vmovdqa \XMM6, TMP6(%rsp) 2167 - vmovdqa \XMM7, TMP7(%rsp) 2168 - vmovdqa \XMM8, TMP8(%rsp) 2169 - 2170 - .if \loop_idx == in_order 2171 - vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT 2172 - vpaddd ONE(%rip), \XMM1, \XMM2 2173 - vpaddd ONE(%rip), \XMM2, \XMM3 2174 - vpaddd ONE(%rip), \XMM3, \XMM4 2175 - vpaddd ONE(%rip), \XMM4, \XMM5 2176 - vpaddd ONE(%rip), \XMM5, \XMM6 2177 - vpaddd ONE(%rip), \XMM6, \XMM7 2178 - vpaddd ONE(%rip), \XMM7, \XMM8 2179 - vmovdqa \XMM8, \CTR 2180 - 2181 - vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap 2182 - vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap 2183 - vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap 2184 - vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap 2185 - vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap 2186 - vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap 2187 - vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap 2188 - vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap 2189 - .else 2190 - vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT 2191 - vpaddd ONEf(%rip), \XMM1, \XMM2 2192 - vpaddd ONEf(%rip), \XMM2, \XMM3 2193 - vpaddd ONEf(%rip), \XMM3, \XMM4 2194 - vpaddd ONEf(%rip), \XMM4, \XMM5 2195 - vpaddd ONEf(%rip), \XMM5, \XMM6 2196 - vpaddd ONEf(%rip), \XMM6, \XMM7 2197 - vpaddd ONEf(%rip), \XMM7, \XMM8 2198 - vmovdqa \XMM8, \CTR 2199 - .endif 2200 - 2201 - 2202 - 
####################################################################### 2203 - 2204 - vmovdqu (arg1), \T1 2205 - vpxor \T1, \XMM1, \XMM1 2206 - vpxor \T1, \XMM2, \XMM2 2207 - vpxor \T1, \XMM3, \XMM3 2208 - vpxor \T1, \XMM4, \XMM4 2209 - vpxor \T1, \XMM5, \XMM5 2210 - vpxor \T1, \XMM6, \XMM6 2211 - vpxor \T1, \XMM7, \XMM7 2212 - vpxor \T1, \XMM8, \XMM8 2213 - 2214 - ####################################################################### 2215 - 2216 - 2217 - 2218 - 2219 - 2220 - vmovdqu 16*1(arg1), \T1 2221 - vaesenc \T1, \XMM1, \XMM1 2222 - vaesenc \T1, \XMM2, \XMM2 2223 - vaesenc \T1, \XMM3, \XMM3 2224 - vaesenc \T1, \XMM4, \XMM4 2225 - vaesenc \T1, \XMM5, \XMM5 2226 - vaesenc \T1, \XMM6, \XMM6 2227 - vaesenc \T1, \XMM7, \XMM7 2228 - vaesenc \T1, \XMM8, \XMM8 2229 - 2230 - vmovdqu 16*2(arg1), \T1 2231 - vaesenc \T1, \XMM1, \XMM1 2232 - vaesenc \T1, \XMM2, \XMM2 2233 - vaesenc \T1, \XMM3, \XMM3 2234 - vaesenc \T1, \XMM4, \XMM4 2235 - vaesenc \T1, \XMM5, \XMM5 2236 - vaesenc \T1, \XMM6, \XMM6 2237 - vaesenc \T1, \XMM7, \XMM7 2238 - vaesenc \T1, \XMM8, \XMM8 2239 - 2240 - 2241 - ####################################################################### 2242 - 2243 - vmovdqu HashKey_8(arg2), \T5 2244 - vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1 2245 - vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0 2246 - vpclmulqdq $0x01, \T5, \T2, \T6 # T6 = a1*b0 2247 - vpclmulqdq $0x10, \T5, \T2, \T5 # T5 = a0*b1 2248 - vpxor \T5, \T6, \T6 2249 - 2250 - vmovdqu 16*3(arg1), \T1 2251 - vaesenc \T1, \XMM1, \XMM1 2252 - vaesenc \T1, \XMM2, \XMM2 2253 - vaesenc \T1, \XMM3, \XMM3 2254 - vaesenc \T1, \XMM4, \XMM4 2255 - vaesenc \T1, \XMM5, \XMM5 2256 - vaesenc \T1, \XMM6, \XMM6 2257 - vaesenc \T1, \XMM7, \XMM7 2258 - vaesenc \T1, \XMM8, \XMM8 2259 - 2260 - vmovdqa TMP2(%rsp), \T1 2261 - vmovdqu HashKey_7(arg2), \T5 2262 - vpclmulqdq $0x11, \T5, \T1, \T3 2263 - vpxor \T3, \T4, \T4 2264 - 2265 - vpclmulqdq $0x00, \T5, \T1, \T3 2266 - vpxor \T3, \T7, \T7 2267 - 2268 - vpclmulqdq $0x01, \T5, \T1, 
\T3 2269 - vpxor \T3, \T6, \T6 2270 - 2271 - vpclmulqdq $0x10, \T5, \T1, \T3 2272 - vpxor \T3, \T6, \T6 2273 - 2274 - vmovdqu 16*4(arg1), \T1 2275 - vaesenc \T1, \XMM1, \XMM1 2276 - vaesenc \T1, \XMM2, \XMM2 2277 - vaesenc \T1, \XMM3, \XMM3 2278 - vaesenc \T1, \XMM4, \XMM4 2279 - vaesenc \T1, \XMM5, \XMM5 2280 - vaesenc \T1, \XMM6, \XMM6 2281 - vaesenc \T1, \XMM7, \XMM7 2282 - vaesenc \T1, \XMM8, \XMM8 2283 - 2284 - ####################################################################### 2285 - 2286 - vmovdqa TMP3(%rsp), \T1 2287 - vmovdqu HashKey_6(arg2), \T5 2288 - vpclmulqdq $0x11, \T5, \T1, \T3 2289 - vpxor \T3, \T4, \T4 2290 - 2291 - vpclmulqdq $0x00, \T5, \T1, \T3 2292 - vpxor \T3, \T7, \T7 2293 - 2294 - vpclmulqdq $0x01, \T5, \T1, \T3 2295 - vpxor \T3, \T6, \T6 2296 - 2297 - vpclmulqdq $0x10, \T5, \T1, \T3 2298 - vpxor \T3, \T6, \T6 2299 - 2300 - vmovdqu 16*5(arg1), \T1 2301 - vaesenc \T1, \XMM1, \XMM1 2302 - vaesenc \T1, \XMM2, \XMM2 2303 - vaesenc \T1, \XMM3, \XMM3 2304 - vaesenc \T1, \XMM4, \XMM4 2305 - vaesenc \T1, \XMM5, \XMM5 2306 - vaesenc \T1, \XMM6, \XMM6 2307 - vaesenc \T1, \XMM7, \XMM7 2308 - vaesenc \T1, \XMM8, \XMM8 2309 - 2310 - vmovdqa TMP4(%rsp), \T1 2311 - vmovdqu HashKey_5(arg2), \T5 2312 - vpclmulqdq $0x11, \T5, \T1, \T3 2313 - vpxor \T3, \T4, \T4 2314 - 2315 - vpclmulqdq $0x00, \T5, \T1, \T3 2316 - vpxor \T3, \T7, \T7 2317 - 2318 - vpclmulqdq $0x01, \T5, \T1, \T3 2319 - vpxor \T3, \T6, \T6 2320 - 2321 - vpclmulqdq $0x10, \T5, \T1, \T3 2322 - vpxor \T3, \T6, \T6 2323 - 2324 - vmovdqu 16*6(arg1), \T1 2325 - vaesenc \T1, \XMM1, \XMM1 2326 - vaesenc \T1, \XMM2, \XMM2 2327 - vaesenc \T1, \XMM3, \XMM3 2328 - vaesenc \T1, \XMM4, \XMM4 2329 - vaesenc \T1, \XMM5, \XMM5 2330 - vaesenc \T1, \XMM6, \XMM6 2331 - vaesenc \T1, \XMM7, \XMM7 2332 - vaesenc \T1, \XMM8, \XMM8 2333 - 2334 - 2335 - vmovdqa TMP5(%rsp), \T1 2336 - vmovdqu HashKey_4(arg2), \T5 2337 - vpclmulqdq $0x11, \T5, \T1, \T3 2338 - vpxor \T3, \T4, \T4 2339 - 2340 - vpclmulqdq $0x00, \T5, 
\T1, \T3 2341 - vpxor \T3, \T7, \T7 2342 - 2343 - vpclmulqdq $0x01, \T5, \T1, \T3 2344 - vpxor \T3, \T6, \T6 2345 - 2346 - vpclmulqdq $0x10, \T5, \T1, \T3 2347 - vpxor \T3, \T6, \T6 2348 - 2349 - vmovdqu 16*7(arg1), \T1 2350 - vaesenc \T1, \XMM1, \XMM1 2351 - vaesenc \T1, \XMM2, \XMM2 2352 - vaesenc \T1, \XMM3, \XMM3 2353 - vaesenc \T1, \XMM4, \XMM4 2354 - vaesenc \T1, \XMM5, \XMM5 2355 - vaesenc \T1, \XMM6, \XMM6 2356 - vaesenc \T1, \XMM7, \XMM7 2357 - vaesenc \T1, \XMM8, \XMM8 2358 - 2359 - vmovdqa TMP6(%rsp), \T1 2360 - vmovdqu HashKey_3(arg2), \T5 2361 - vpclmulqdq $0x11, \T5, \T1, \T3 2362 - vpxor \T3, \T4, \T4 2363 - 2364 - vpclmulqdq $0x00, \T5, \T1, \T3 2365 - vpxor \T3, \T7, \T7 2366 - 2367 - vpclmulqdq $0x01, \T5, \T1, \T3 2368 - vpxor \T3, \T6, \T6 2369 - 2370 - vpclmulqdq $0x10, \T5, \T1, \T3 2371 - vpxor \T3, \T6, \T6 2372 - 2373 - vmovdqu 16*8(arg1), \T1 2374 - vaesenc \T1, \XMM1, \XMM1 2375 - vaesenc \T1, \XMM2, \XMM2 2376 - vaesenc \T1, \XMM3, \XMM3 2377 - vaesenc \T1, \XMM4, \XMM4 2378 - vaesenc \T1, \XMM5, \XMM5 2379 - vaesenc \T1, \XMM6, \XMM6 2380 - vaesenc \T1, \XMM7, \XMM7 2381 - vaesenc \T1, \XMM8, \XMM8 2382 - 2383 - vmovdqa TMP7(%rsp), \T1 2384 - vmovdqu HashKey_2(arg2), \T5 2385 - vpclmulqdq $0x11, \T5, \T1, \T3 2386 - vpxor \T3, \T4, \T4 2387 - 2388 - vpclmulqdq $0x00, \T5, \T1, \T3 2389 - vpxor \T3, \T7, \T7 2390 - 2391 - vpclmulqdq $0x01, \T5, \T1, \T3 2392 - vpxor \T3, \T6, \T6 2393 - 2394 - vpclmulqdq $0x10, \T5, \T1, \T3 2395 - vpxor \T3, \T6, \T6 2396 - 2397 - 2398 - ####################################################################### 2399 - 2400 - vmovdqu 16*9(arg1), \T5 2401 - vaesenc \T5, \XMM1, \XMM1 2402 - vaesenc \T5, \XMM2, \XMM2 2403 - vaesenc \T5, \XMM3, \XMM3 2404 - vaesenc \T5, \XMM4, \XMM4 2405 - vaesenc \T5, \XMM5, \XMM5 2406 - vaesenc \T5, \XMM6, \XMM6 2407 - vaesenc \T5, \XMM7, \XMM7 2408 - vaesenc \T5, \XMM8, \XMM8 2409 - 2410 - vmovdqa TMP8(%rsp), \T1 2411 - vmovdqu HashKey(arg2), \T5 2412 - 2413 - vpclmulqdq 
$0x00, \T5, \T1, \T3 2414 - vpxor \T3, \T7, \T7 2415 - 2416 - vpclmulqdq $0x01, \T5, \T1, \T3 2417 - vpxor \T3, \T6, \T6 2418 - 2419 - vpclmulqdq $0x10, \T5, \T1, \T3 2420 - vpxor \T3, \T6, \T6 2421 - 2422 - vpclmulqdq $0x11, \T5, \T1, \T3 2423 - vpxor \T3, \T4, \T1 2424 - 2425 - 2426 - vmovdqu 16*10(arg1), \T5 2427 - 2428 - i = 11 2429 - setreg 2430 - .rep (\REP-9) 2431 - vaesenc \T5, \XMM1, \XMM1 2432 - vaesenc \T5, \XMM2, \XMM2 2433 - vaesenc \T5, \XMM3, \XMM3 2434 - vaesenc \T5, \XMM4, \XMM4 2435 - vaesenc \T5, \XMM5, \XMM5 2436 - vaesenc \T5, \XMM6, \XMM6 2437 - vaesenc \T5, \XMM7, \XMM7 2438 - vaesenc \T5, \XMM8, \XMM8 2439 - 2440 - vmovdqu 16*i(arg1), \T5 2441 - i = i + 1 2442 - setreg 2443 - .endr 2444 - 2445 - i = 0 2446 - j = 1 2447 - setreg 2448 - .rep 8 2449 - vpxor 16*i(arg4, %r11), \T5, \T2 2450 - .if \ENC_DEC == ENC 2451 - vaesenclast \T2, reg_j, reg_j 2452 - .else 2453 - vaesenclast \T2, reg_j, \T3 2454 - vmovdqu 16*i(arg4, %r11), reg_j 2455 - vmovdqu \T3, 16*i(arg3, %r11) 2456 - .endif 2457 - i = (i+1) 2458 - j = (j+1) 2459 - setreg 2460 - .endr 2461 - ####################################################################### 2462 - 2463 - 2464 - vpslldq $8, \T6, \T3 # shift-L T3 2 DWs 2465 - vpsrldq $8, \T6, \T6 # shift-R T2 2 DWs 2466 - vpxor \T3, \T7, \T7 2467 - vpxor \T6, \T1, \T1 # accumulate the results in T1:T7 2468 - 2469 - 2470 - 2471 - ####################################################################### 2472 - #first phase of the reduction 2473 - vmovdqa POLY2(%rip), \T3 2474 - 2475 - vpclmulqdq $0x01, \T7, \T3, \T2 2476 - vpslldq $8, \T2, \T2 # shift-L xmm2 2 DWs 2477 - 2478 - vpxor \T2, \T7, \T7 # first phase of the reduction complete 2479 - ####################################################################### 2480 - .if \ENC_DEC == ENC 2481 - vmovdqu \XMM1, 16*0(arg3,%r11) # Write to the Ciphertext buffer 2482 - vmovdqu \XMM2, 16*1(arg3,%r11) # Write to the Ciphertext buffer 2483 - vmovdqu \XMM3, 16*2(arg3,%r11) # Write to the 
Ciphertext buffer 2484 - vmovdqu \XMM4, 16*3(arg3,%r11) # Write to the Ciphertext buffer 2485 - vmovdqu \XMM5, 16*4(arg3,%r11) # Write to the Ciphertext buffer 2486 - vmovdqu \XMM6, 16*5(arg3,%r11) # Write to the Ciphertext buffer 2487 - vmovdqu \XMM7, 16*6(arg3,%r11) # Write to the Ciphertext buffer 2488 - vmovdqu \XMM8, 16*7(arg3,%r11) # Write to the Ciphertext buffer 2489 - .endif 2490 - 2491 - ####################################################################### 2492 - #second phase of the reduction 2493 - vpclmulqdq $0x00, \T7, \T3, \T2 2494 - vpsrldq $4, \T2, \T2 # shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) 2495 - 2496 - vpclmulqdq $0x10, \T7, \T3, \T4 2497 - vpslldq $4, \T4, \T4 # shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts) 2498 - 2499 - vpxor \T2, \T4, \T4 # second phase of the reduction complete 2500 - ####################################################################### 2501 - vpxor \T4, \T1, \T1 # the result is in T1 2502 - 2503 - vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap 2504 - vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap 2505 - vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap 2506 - vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap 2507 - vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap 2508 - vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap 2509 - vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap 2510 - vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap 2511 - 2512 - 2513 - vpxor \T1, \XMM1, \XMM1 2514 - 2515 - 2516 - 2517 - .endm 2518 - 2519 - 2520 - # GHASH the last 4 ciphertext blocks. 
2521 - .macro GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 2522 - 2523 - ## Karatsuba Method 2524 - 2525 - vmovdqu HashKey_8(arg2), \T5 2526 - 2527 - vpshufd $0b01001110, \XMM1, \T2 2528 - vpshufd $0b01001110, \T5, \T3 2529 - vpxor \XMM1, \T2, \T2 2530 - vpxor \T5, \T3, \T3 2531 - 2532 - vpclmulqdq $0x11, \T5, \XMM1, \T6 2533 - vpclmulqdq $0x00, \T5, \XMM1, \T7 2534 - 2535 - vpclmulqdq $0x00, \T3, \T2, \XMM1 2536 - 2537 - ###################### 2538 - 2539 - vmovdqu HashKey_7(arg2), \T5 2540 - vpshufd $0b01001110, \XMM2, \T2 2541 - vpshufd $0b01001110, \T5, \T3 2542 - vpxor \XMM2, \T2, \T2 2543 - vpxor \T5, \T3, \T3 2544 - 2545 - vpclmulqdq $0x11, \T5, \XMM2, \T4 2546 - vpxor \T4, \T6, \T6 2547 - 2548 - vpclmulqdq $0x00, \T5, \XMM2, \T4 2549 - vpxor \T4, \T7, \T7 2550 - 2551 - vpclmulqdq $0x00, \T3, \T2, \T2 2552 - 2553 - vpxor \T2, \XMM1, \XMM1 2554 - 2555 - ###################### 2556 - 2557 - vmovdqu HashKey_6(arg2), \T5 2558 - vpshufd $0b01001110, \XMM3, \T2 2559 - vpshufd $0b01001110, \T5, \T3 2560 - vpxor \XMM3, \T2, \T2 2561 - vpxor \T5, \T3, \T3 2562 - 2563 - vpclmulqdq $0x11, \T5, \XMM3, \T4 2564 - vpxor \T4, \T6, \T6 2565 - 2566 - vpclmulqdq $0x00, \T5, \XMM3, \T4 2567 - vpxor \T4, \T7, \T7 2568 - 2569 - vpclmulqdq $0x00, \T3, \T2, \T2 2570 - 2571 - vpxor \T2, \XMM1, \XMM1 2572 - 2573 - ###################### 2574 - 2575 - vmovdqu HashKey_5(arg2), \T5 2576 - vpshufd $0b01001110, \XMM4, \T2 2577 - vpshufd $0b01001110, \T5, \T3 2578 - vpxor \XMM4, \T2, \T2 2579 - vpxor \T5, \T3, \T3 2580 - 2581 - vpclmulqdq $0x11, \T5, \XMM4, \T4 2582 - vpxor \T4, \T6, \T6 2583 - 2584 - vpclmulqdq $0x00, \T5, \XMM4, \T4 2585 - vpxor \T4, \T7, \T7 2586 - 2587 - vpclmulqdq $0x00, \T3, \T2, \T2 2588 - 2589 - vpxor \T2, \XMM1, \XMM1 2590 - 2591 - ###################### 2592 - 2593 - vmovdqu HashKey_4(arg2), \T5 2594 - vpshufd $0b01001110, \XMM5, \T2 2595 - vpshufd $0b01001110, \T5, \T3 2596 - vpxor \XMM5, \T2, \T2 2597 - vpxor \T5, \T3, \T3 
2598 - 2599 - vpclmulqdq $0x11, \T5, \XMM5, \T4 2600 - vpxor \T4, \T6, \T6 2601 - 2602 - vpclmulqdq $0x00, \T5, \XMM5, \T4 2603 - vpxor \T4, \T7, \T7 2604 - 2605 - vpclmulqdq $0x00, \T3, \T2, \T2 2606 - 2607 - vpxor \T2, \XMM1, \XMM1 2608 - 2609 - ###################### 2610 - 2611 - vmovdqu HashKey_3(arg2), \T5 2612 - vpshufd $0b01001110, \XMM6, \T2 2613 - vpshufd $0b01001110, \T5, \T3 2614 - vpxor \XMM6, \T2, \T2 2615 - vpxor \T5, \T3, \T3 2616 - 2617 - vpclmulqdq $0x11, \T5, \XMM6, \T4 2618 - vpxor \T4, \T6, \T6 2619 - 2620 - vpclmulqdq $0x00, \T5, \XMM6, \T4 2621 - vpxor \T4, \T7, \T7 2622 - 2623 - vpclmulqdq $0x00, \T3, \T2, \T2 2624 - 2625 - vpxor \T2, \XMM1, \XMM1 2626 - 2627 - ###################### 2628 - 2629 - vmovdqu HashKey_2(arg2), \T5 2630 - vpshufd $0b01001110, \XMM7, \T2 2631 - vpshufd $0b01001110, \T5, \T3 2632 - vpxor \XMM7, \T2, \T2 2633 - vpxor \T5, \T3, \T3 2634 - 2635 - vpclmulqdq $0x11, \T5, \XMM7, \T4 2636 - vpxor \T4, \T6, \T6 2637 - 2638 - vpclmulqdq $0x00, \T5, \XMM7, \T4 2639 - vpxor \T4, \T7, \T7 2640 - 2641 - vpclmulqdq $0x00, \T3, \T2, \T2 2642 - 2643 - vpxor \T2, \XMM1, \XMM1 2644 - 2645 - ###################### 2646 - 2647 - vmovdqu HashKey(arg2), \T5 2648 - vpshufd $0b01001110, \XMM8, \T2 2649 - vpshufd $0b01001110, \T5, \T3 2650 - vpxor \XMM8, \T2, \T2 2651 - vpxor \T5, \T3, \T3 2652 - 2653 - vpclmulqdq $0x11, \T5, \XMM8, \T4 2654 - vpxor \T4, \T6, \T6 2655 - 2656 - vpclmulqdq $0x00, \T5, \XMM8, \T4 2657 - vpxor \T4, \T7, \T7 2658 - 2659 - vpclmulqdq $0x00, \T3, \T2, \T2 2660 - 2661 - vpxor \T2, \XMM1, \XMM1 2662 - vpxor \T6, \XMM1, \XMM1 2663 - vpxor \T7, \XMM1, \T2 2664 - 2665 - 2666 - 2667 - 2668 - vpslldq $8, \T2, \T4 2669 - vpsrldq $8, \T2, \T2 2670 - 2671 - vpxor \T4, \T7, \T7 2672 - vpxor \T2, \T6, \T6 # <T6:T7> holds the result of the 2673 - # accumulated carry-less multiplications 2674 - 2675 - ####################################################################### 2676 - #first phase of the reduction 2677 - vmovdqa 
POLY2(%rip), \T3 2678 - 2679 - vpclmulqdq $0x01, \T7, \T3, \T2 2680 - vpslldq $8, \T2, \T2 # shift-L xmm2 2 DWs 2681 - 2682 - vpxor \T2, \T7, \T7 # first phase of the reduction complete 2683 - ####################################################################### 2684 - 2685 - 2686 - #second phase of the reduction 2687 - vpclmulqdq $0x00, \T7, \T3, \T2 2688 - vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) 2689 - 2690 - vpclmulqdq $0x10, \T7, \T3, \T4 2691 - vpslldq $4, \T4, \T4 # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts) 2692 - 2693 - vpxor \T2, \T4, \T4 # second phase of the reduction complete 2694 - ####################################################################### 2695 - vpxor \T4, \T6, \T6 # the result is in T6 2696 - .endm 2697 - 2698 - 2699 - 2700 - ############################################################# 2701 - #void aesni_gcm_init_avx_gen4 2702 - # (gcm_data *my_ctx_data, 2703 - # gcm_context_data *data, 2704 - # u8 *iv, /* Pre-counter block j0: 4 byte salt 2705 - # (from Security Association) concatenated with 8 byte 2706 - # Initialisation Vector (from IPSec ESP Payload) 2707 - # concatenated with 0x00000001. 16-byte aligned pointer. */ 2708 - # u8 *hash_subkey# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */ 2709 - # const u8 *aad, /* Additional Authentication Data (AAD)*/ 2710 - # u64 aad_len) /* Length of AAD in bytes. 
With RFC4106 this is going to be 8 or 12 Bytes */ 2711 - ############################################################# 2712 - SYM_FUNC_START(aesni_gcm_init_avx_gen4) 2713 - FUNC_SAVE 2714 - INIT GHASH_MUL_AVX2, PRECOMPUTE_AVX2 2715 - FUNC_RESTORE 2716 - RET 2717 - SYM_FUNC_END(aesni_gcm_init_avx_gen4) 2718 - 2719 - ############################################################################### 2720 - #void aesni_gcm_enc_avx_gen4( 2721 - # gcm_data *my_ctx_data, /* aligned to 16 Bytes */ 2722 - # gcm_context_data *data, 2723 - # u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */ 2724 - # const u8 *in, /* Plaintext input */ 2725 - # u64 plaintext_len) /* Length of data in Bytes for encryption. */ 2726 - ############################################################################### 2727 - SYM_FUNC_START(aesni_gcm_enc_update_avx_gen4) 2728 - FUNC_SAVE 2729 - mov keysize,%eax 2730 - cmp $32, %eax 2731 - je key_256_enc_update4 2732 - cmp $16, %eax 2733 - je key_128_enc_update4 2734 - # must be 192 2735 - GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 11 2736 - FUNC_RESTORE 2737 - RET 2738 - key_128_enc_update4: 2739 - GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 9 2740 - FUNC_RESTORE 2741 - RET 2742 - key_256_enc_update4: 2743 - GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 13 2744 - FUNC_RESTORE 2745 - RET 2746 - SYM_FUNC_END(aesni_gcm_enc_update_avx_gen4) 2747 - 2748 - ############################################################################### 2749 - #void aesni_gcm_dec_update_avx_gen4( 2750 - # gcm_data *my_ctx_data, /* aligned to 16 Bytes */ 2751 - # gcm_context_data *data, 2752 - # u8 *out, /* Plaintext output. Decrypt in-place is allowed. */ 2753 - # const u8 *in, /* Ciphertext input */ 2754 - # u64 plaintext_len) /* Length of data in Bytes for encryption. 
*/ 2755 - ############################################################################### 2756 - SYM_FUNC_START(aesni_gcm_dec_update_avx_gen4) 2757 - FUNC_SAVE 2758 - mov keysize,%eax 2759 - cmp $32, %eax 2760 - je key_256_dec_update4 2761 - cmp $16, %eax 2762 - je key_128_dec_update4 2763 - # must be 192 2764 - GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 11 2765 - FUNC_RESTORE 2766 - RET 2767 - key_128_dec_update4: 2768 - GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 9 2769 - FUNC_RESTORE 2770 - RET 2771 - key_256_dec_update4: 2772 - GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 13 2773 - FUNC_RESTORE 2774 - RET 2775 - SYM_FUNC_END(aesni_gcm_dec_update_avx_gen4) 2776 - 2777 - ############################################################################### 2778 - #void aesni_gcm_finalize_avx_gen4( 2779 - # gcm_data *my_ctx_data, /* aligned to 16 Bytes */ 2780 - # gcm_context_data *data, 2781 - # u8 *auth_tag, /* Authenticated Tag output. */ 2782 - # u64 auth_tag_len)# /* Authenticated Tag Length in bytes. 2783 - # Valid values are 16 (most likely), 12 or 8. */ 2784 - ############################################################################### 2785 - SYM_FUNC_START(aesni_gcm_finalize_avx_gen4) 2786 - FUNC_SAVE 2787 - mov keysize,%eax 2788 - cmp $32, %eax 2789 - je key_256_finalize4 2790 - cmp $16, %eax 2791 - je key_128_finalize4 2792 - # must be 192 2793 - GCM_COMPLETE GHASH_MUL_AVX2, 11, arg3, arg4 2794 - FUNC_RESTORE 2795 - RET 2796 - key_128_finalize4: 2797 - GCM_COMPLETE GHASH_MUL_AVX2, 9, arg3, arg4 2798 - FUNC_RESTORE 2799 - RET 2800 - key_256_finalize4: 2801 - GCM_COMPLETE GHASH_MUL_AVX2, 13, arg3, arg4 2802 - FUNC_RESTORE 2803 - RET 2804 - SYM_FUNC_END(aesni_gcm_finalize_avx_gen4)
+767 -502
arch/x86/crypto/aesni-intel_glue.c
··· 1 1 // SPDX-License-Identifier: GPL-2.0-or-later 2 2 /* 3 - * Support for Intel AES-NI instructions. This file contains glue 4 - * code, the real AES implementation is in intel-aes_asm.S. 3 + * Support for AES-NI and VAES instructions. This file contains glue code. 4 + * The real AES implementations are in aesni-intel_asm.S and other .S files. 5 5 * 6 6 * Copyright (C) 2008, Intel Corp. 7 7 * Author: Huang Ying <ying.huang@intel.com> ··· 13 13 * Tadeusz Struk (tadeusz.struk@intel.com) 14 14 * Aidan O'Mahony (aidan.o.mahony@intel.com) 15 15 * Copyright (c) 2010, Intel Corporation. 16 + * 17 + * Copyright 2024 Google LLC 16 18 */ 17 19 18 20 #include <linux/hardirq.h> ··· 46 44 #define CRYPTO_AES_CTX_SIZE (sizeof(struct crypto_aes_ctx) + AESNI_ALIGN_EXTRA) 47 45 #define XTS_AES_CTX_SIZE (sizeof(struct aesni_xts_ctx) + AESNI_ALIGN_EXTRA) 48 46 49 - /* This data is stored at the end of the crypto_tfm struct. 50 - * It's a type of per "session" data storage location. 51 - * This needs to be 16 byte aligned. 
52 - */ 53 - struct aesni_rfc4106_gcm_ctx { 54 - u8 hash_subkey[16] AESNI_ALIGN_ATTR; 55 - struct crypto_aes_ctx aes_key_expanded AESNI_ALIGN_ATTR; 56 - u8 nonce[4]; 57 - }; 58 - 59 - struct generic_gcmaes_ctx { 60 - u8 hash_subkey[16] AESNI_ALIGN_ATTR; 61 - struct crypto_aes_ctx aes_key_expanded AESNI_ALIGN_ATTR; 62 - }; 63 - 64 47 struct aesni_xts_ctx { 65 48 struct crypto_aes_ctx tweak_ctx AESNI_ALIGN_ATTR; 66 49 struct crypto_aes_ctx crypt_ctx AESNI_ALIGN_ATTR; 67 - }; 68 - 69 - #define GCM_BLOCK_LEN 16 70 - 71 - struct gcm_context_data { 72 - /* init, update and finalize context data */ 73 - u8 aad_hash[GCM_BLOCK_LEN]; 74 - u64 aad_length; 75 - u64 in_length; 76 - u8 partial_block_enc_key[GCM_BLOCK_LEN]; 77 - u8 orig_IV[GCM_BLOCK_LEN]; 78 - u8 current_counter[GCM_BLOCK_LEN]; 79 - u64 partial_block_len; 80 - u64 unused; 81 - u8 hash_keys[GCM_BLOCK_LEN * 16]; 82 50 }; 83 51 84 52 static inline void *aes_align_addr(void *addr) ··· 75 103 asmlinkage void aesni_cts_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out, 76 104 const u8 *in, unsigned int len, u8 *iv); 77 105 78 - #define AVX_GEN2_OPTSIZE 640 79 - #define AVX_GEN4_OPTSIZE 4096 80 - 81 106 asmlinkage void aesni_xts_enc(const struct crypto_aes_ctx *ctx, u8 *out, 82 107 const u8 *in, unsigned int len, u8 *iv); 83 108 ··· 86 117 asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out, 87 118 const u8 *in, unsigned int len, u8 *iv); 88 119 DEFINE_STATIC_CALL(aesni_ctr_enc_tfm, aesni_ctr_enc); 89 - 90 - /* Scatter / Gather routines, with args similar to above */ 91 - asmlinkage void aesni_gcm_init(void *ctx, 92 - struct gcm_context_data *gdata, 93 - u8 *iv, 94 - u8 *hash_subkey, const u8 *aad, 95 - unsigned long aad_len); 96 - asmlinkage void aesni_gcm_enc_update(void *ctx, 97 - struct gcm_context_data *gdata, u8 *out, 98 - const u8 *in, unsigned long plaintext_len); 99 - asmlinkage void aesni_gcm_dec_update(void *ctx, 100 - struct gcm_context_data *gdata, u8 *out, 101 - const u8 *in, 102 - unsigned long 
ciphertext_len); 103 - asmlinkage void aesni_gcm_finalize(void *ctx, 104 - struct gcm_context_data *gdata, 105 - u8 *auth_tag, unsigned long auth_tag_len); 106 120 107 121 asmlinkage void aes_ctr_enc_128_avx_by8(const u8 *in, u8 *iv, 108 122 void *keys, u8 *out, unsigned int num_bytes); ··· 106 154 asmlinkage void aes_xctr_enc_256_avx_by8(const u8 *in, const u8 *iv, 107 155 const void *keys, u8 *out, unsigned int num_bytes, 108 156 unsigned int byte_ctr); 109 - 110 - /* 111 - * asmlinkage void aesni_gcm_init_avx_gen2() 112 - * gcm_data *my_ctx_data, context data 113 - * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary. 114 - */ 115 - asmlinkage void aesni_gcm_init_avx_gen2(void *my_ctx_data, 116 - struct gcm_context_data *gdata, 117 - u8 *iv, 118 - u8 *hash_subkey, 119 - const u8 *aad, 120 - unsigned long aad_len); 121 - 122 - asmlinkage void aesni_gcm_enc_update_avx_gen2(void *ctx, 123 - struct gcm_context_data *gdata, u8 *out, 124 - const u8 *in, unsigned long plaintext_len); 125 - asmlinkage void aesni_gcm_dec_update_avx_gen2(void *ctx, 126 - struct gcm_context_data *gdata, u8 *out, 127 - const u8 *in, 128 - unsigned long ciphertext_len); 129 - asmlinkage void aesni_gcm_finalize_avx_gen2(void *ctx, 130 - struct gcm_context_data *gdata, 131 - u8 *auth_tag, unsigned long auth_tag_len); 132 - 133 - /* 134 - * asmlinkage void aesni_gcm_init_avx_gen4() 135 - * gcm_data *my_ctx_data, context data 136 - * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary. 
137 - */ 138 - asmlinkage void aesni_gcm_init_avx_gen4(void *my_ctx_data, 139 - struct gcm_context_data *gdata, 140 - u8 *iv, 141 - u8 *hash_subkey, 142 - const u8 *aad, 143 - unsigned long aad_len); 144 - 145 - asmlinkage void aesni_gcm_enc_update_avx_gen4(void *ctx, 146 - struct gcm_context_data *gdata, u8 *out, 147 - const u8 *in, unsigned long plaintext_len); 148 - asmlinkage void aesni_gcm_dec_update_avx_gen4(void *ctx, 149 - struct gcm_context_data *gdata, u8 *out, 150 - const u8 *in, 151 - unsigned long ciphertext_len); 152 - asmlinkage void aesni_gcm_finalize_avx_gen4(void *ctx, 153 - struct gcm_context_data *gdata, 154 - u8 *auth_tag, unsigned long auth_tag_len); 155 - 156 - static __ro_after_init DEFINE_STATIC_KEY_FALSE(gcm_use_avx); 157 - static __ro_after_init DEFINE_STATIC_KEY_FALSE(gcm_use_avx2); 158 - 159 - static inline struct 160 - aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm) 161 - { 162 - return aes_align_addr(crypto_aead_ctx(tfm)); 163 - } 164 - 165 - static inline struct 166 - generic_gcmaes_ctx *generic_gcmaes_ctx_get(struct crypto_aead *tfm) 167 - { 168 - return aes_align_addr(crypto_aead_ctx(tfm)); 169 - } 170 157 #endif 171 158 172 159 static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx) ··· 479 588 } 480 589 return err; 481 590 } 482 - 483 - static int aes_gcm_derive_hash_subkey(const struct crypto_aes_ctx *aes_key, 484 - u8 hash_subkey[AES_BLOCK_SIZE]) 485 - { 486 - static const u8 zeroes[AES_BLOCK_SIZE]; 487 - 488 - aes_encrypt(aes_key, hash_subkey, zeroes); 489 - return 0; 490 - } 491 - 492 - static int common_rfc4106_set_key(struct crypto_aead *aead, const u8 *key, 493 - unsigned int key_len) 494 - { 495 - struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(aead); 496 - 497 - if (key_len < 4) 498 - return -EINVAL; 499 - 500 - /*Account for 4 byte nonce at the end.*/ 501 - key_len -= 4; 502 - 503 - memcpy(ctx->nonce, key + key_len, sizeof(ctx->nonce)); 504 - 505 - return 
aes_set_key_common(&ctx->aes_key_expanded, key, key_len) ?: 506 - aes_gcm_derive_hash_subkey(&ctx->aes_key_expanded, 507 - ctx->hash_subkey); 508 - } 509 - 510 - /* This is the Integrity Check Value (aka the authentication tag) length and can 511 - * be 8, 12 or 16 bytes long. */ 512 - static int common_rfc4106_set_authsize(struct crypto_aead *aead, 513 - unsigned int authsize) 514 - { 515 - switch (authsize) { 516 - case 8: 517 - case 12: 518 - case 16: 519 - break; 520 - default: 521 - return -EINVAL; 522 - } 523 - 524 - return 0; 525 - } 526 - 527 - static int generic_gcmaes_set_authsize(struct crypto_aead *tfm, 528 - unsigned int authsize) 529 - { 530 - switch (authsize) { 531 - case 4: 532 - case 8: 533 - case 12: 534 - case 13: 535 - case 14: 536 - case 15: 537 - case 16: 538 - break; 539 - default: 540 - return -EINVAL; 541 - } 542 - 543 - return 0; 544 - } 545 - 546 - static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req, 547 - unsigned int assoclen, u8 *hash_subkey, 548 - u8 *iv, void *aes_ctx, u8 *auth_tag, 549 - unsigned long auth_tag_len) 550 - { 551 - u8 databuf[sizeof(struct gcm_context_data) + (AESNI_ALIGN - 8)] __aligned(8); 552 - struct gcm_context_data *data = PTR_ALIGN((void *)databuf, AESNI_ALIGN); 553 - unsigned long left = req->cryptlen; 554 - struct scatter_walk assoc_sg_walk; 555 - struct skcipher_walk walk; 556 - bool do_avx, do_avx2; 557 - u8 *assocmem = NULL; 558 - u8 *assoc; 559 - int err; 560 - 561 - if (!enc) 562 - left -= auth_tag_len; 563 - 564 - do_avx = (left >= AVX_GEN2_OPTSIZE); 565 - do_avx2 = (left >= AVX_GEN4_OPTSIZE); 566 - 567 - /* Linearize assoc, if not already linear */ 568 - if (req->src->length >= assoclen && req->src->length) { 569 - scatterwalk_start(&assoc_sg_walk, req->src); 570 - assoc = scatterwalk_map(&assoc_sg_walk); 571 - } else { 572 - gfp_t flags = (req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP) ? 
573 - GFP_KERNEL : GFP_ATOMIC; 574 - 575 - /* assoc can be any length, so must be on heap */ 576 - assocmem = kmalloc(assoclen, flags); 577 - if (unlikely(!assocmem)) 578 - return -ENOMEM; 579 - assoc = assocmem; 580 - 581 - scatterwalk_map_and_copy(assoc, req->src, 0, assoclen, 0); 582 - } 583 - 584 - kernel_fpu_begin(); 585 - if (static_branch_likely(&gcm_use_avx2) && do_avx2) 586 - aesni_gcm_init_avx_gen4(aes_ctx, data, iv, hash_subkey, assoc, 587 - assoclen); 588 - else if (static_branch_likely(&gcm_use_avx) && do_avx) 589 - aesni_gcm_init_avx_gen2(aes_ctx, data, iv, hash_subkey, assoc, 590 - assoclen); 591 - else 592 - aesni_gcm_init(aes_ctx, data, iv, hash_subkey, assoc, assoclen); 593 - kernel_fpu_end(); 594 - 595 - if (!assocmem) 596 - scatterwalk_unmap(assoc); 597 - else 598 - kfree(assocmem); 599 - 600 - err = enc ? skcipher_walk_aead_encrypt(&walk, req, false) 601 - : skcipher_walk_aead_decrypt(&walk, req, false); 602 - 603 - while (walk.nbytes > 0) { 604 - kernel_fpu_begin(); 605 - if (static_branch_likely(&gcm_use_avx2) && do_avx2) { 606 - if (enc) 607 - aesni_gcm_enc_update_avx_gen4(aes_ctx, data, 608 - walk.dst.virt.addr, 609 - walk.src.virt.addr, 610 - walk.nbytes); 611 - else 612 - aesni_gcm_dec_update_avx_gen4(aes_ctx, data, 613 - walk.dst.virt.addr, 614 - walk.src.virt.addr, 615 - walk.nbytes); 616 - } else if (static_branch_likely(&gcm_use_avx) && do_avx) { 617 - if (enc) 618 - aesni_gcm_enc_update_avx_gen2(aes_ctx, data, 619 - walk.dst.virt.addr, 620 - walk.src.virt.addr, 621 - walk.nbytes); 622 - else 623 - aesni_gcm_dec_update_avx_gen2(aes_ctx, data, 624 - walk.dst.virt.addr, 625 - walk.src.virt.addr, 626 - walk.nbytes); 627 - } else if (enc) { 628 - aesni_gcm_enc_update(aes_ctx, data, walk.dst.virt.addr, 629 - walk.src.virt.addr, walk.nbytes); 630 - } else { 631 - aesni_gcm_dec_update(aes_ctx, data, walk.dst.virt.addr, 632 - walk.src.virt.addr, walk.nbytes); 633 - } 634 - kernel_fpu_end(); 635 - 636 - err = skcipher_walk_done(&walk, 0); 637 
- } 638 - 639 - if (err) 640 - return err; 641 - 642 - kernel_fpu_begin(); 643 - if (static_branch_likely(&gcm_use_avx2) && do_avx2) 644 - aesni_gcm_finalize_avx_gen4(aes_ctx, data, auth_tag, 645 - auth_tag_len); 646 - else if (static_branch_likely(&gcm_use_avx) && do_avx) 647 - aesni_gcm_finalize_avx_gen2(aes_ctx, data, auth_tag, 648 - auth_tag_len); 649 - else 650 - aesni_gcm_finalize(aes_ctx, data, auth_tag, auth_tag_len); 651 - kernel_fpu_end(); 652 - 653 - return 0; 654 - } 655 - 656 - static int gcmaes_encrypt(struct aead_request *req, unsigned int assoclen, 657 - u8 *hash_subkey, u8 *iv, void *aes_ctx) 658 - { 659 - struct crypto_aead *tfm = crypto_aead_reqtfm(req); 660 - unsigned long auth_tag_len = crypto_aead_authsize(tfm); 661 - u8 auth_tag[16]; 662 - int err; 663 - 664 - err = gcmaes_crypt_by_sg(true, req, assoclen, hash_subkey, iv, aes_ctx, 665 - auth_tag, auth_tag_len); 666 - if (err) 667 - return err; 668 - 669 - scatterwalk_map_and_copy(auth_tag, req->dst, 670 - req->assoclen + req->cryptlen, 671 - auth_tag_len, 1); 672 - return 0; 673 - } 674 - 675 - static int gcmaes_decrypt(struct aead_request *req, unsigned int assoclen, 676 - u8 *hash_subkey, u8 *iv, void *aes_ctx) 677 - { 678 - struct crypto_aead *tfm = crypto_aead_reqtfm(req); 679 - unsigned long auth_tag_len = crypto_aead_authsize(tfm); 680 - u8 auth_tag_msg[16]; 681 - u8 auth_tag[16]; 682 - int err; 683 - 684 - err = gcmaes_crypt_by_sg(false, req, assoclen, hash_subkey, iv, aes_ctx, 685 - auth_tag, auth_tag_len); 686 - if (err) 687 - return err; 688 - 689 - /* Copy out original auth_tag */ 690 - scatterwalk_map_and_copy(auth_tag_msg, req->src, 691 - req->assoclen + req->cryptlen - auth_tag_len, 692 - auth_tag_len, 0); 693 - 694 - /* Compare generated tag with passed in tag. 
*/ 695 - if (crypto_memneq(auth_tag_msg, auth_tag, auth_tag_len)) { 696 - memzero_explicit(auth_tag, sizeof(auth_tag)); 697 - return -EBADMSG; 698 - } 699 - return 0; 700 - } 701 - 702 - static int helper_rfc4106_encrypt(struct aead_request *req) 703 - { 704 - struct crypto_aead *tfm = crypto_aead_reqtfm(req); 705 - struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); 706 - void *aes_ctx = &(ctx->aes_key_expanded); 707 - u8 ivbuf[16 + (AESNI_ALIGN - 8)] __aligned(8); 708 - u8 *iv = PTR_ALIGN(&ivbuf[0], AESNI_ALIGN); 709 - unsigned int i; 710 - __be32 counter = cpu_to_be32(1); 711 - 712 - /* Assuming we are supporting rfc4106 64-bit extended */ 713 - /* sequence numbers We need to have the AAD length equal */ 714 - /* to 16 or 20 bytes */ 715 - if (unlikely(req->assoclen != 16 && req->assoclen != 20)) 716 - return -EINVAL; 717 - 718 - /* IV below built */ 719 - for (i = 0; i < 4; i++) 720 - *(iv+i) = ctx->nonce[i]; 721 - for (i = 0; i < 8; i++) 722 - *(iv+4+i) = req->iv[i]; 723 - *((__be32 *)(iv+12)) = counter; 724 - 725 - return gcmaes_encrypt(req, req->assoclen - 8, ctx->hash_subkey, iv, 726 - aes_ctx); 727 - } 728 - 729 - static int helper_rfc4106_decrypt(struct aead_request *req) 730 - { 731 - __be32 counter = cpu_to_be32(1); 732 - struct crypto_aead *tfm = crypto_aead_reqtfm(req); 733 - struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); 734 - void *aes_ctx = &(ctx->aes_key_expanded); 735 - u8 ivbuf[16 + (AESNI_ALIGN - 8)] __aligned(8); 736 - u8 *iv = PTR_ALIGN(&ivbuf[0], AESNI_ALIGN); 737 - unsigned int i; 738 - 739 - if (unlikely(req->assoclen != 16 && req->assoclen != 20)) 740 - return -EINVAL; 741 - 742 - /* Assuming we are supporting rfc4106 64-bit extended */ 743 - /* sequence numbers We need to have the AAD length */ 744 - /* equal to 16 or 20 bytes */ 745 - 746 - /* IV below built */ 747 - for (i = 0; i < 4; i++) 748 - *(iv+i) = ctx->nonce[i]; 749 - for (i = 0; i < 8; i++) 750 - *(iv+4+i) = req->iv[i]; 751 - *((__be32 
*)(iv+12)) = counter; 752 - 753 - return gcmaes_decrypt(req, req->assoclen - 8, ctx->hash_subkey, iv, 754 - aes_ctx); 755 - } 756 591 #endif 757 592 758 593 static int xts_setkey_aesni(struct crypto_skcipher *tfm, const u8 *key, ··· 833 1216 DEFINE_XTS_ALG(vaes_avx10_512, "xts-aes-vaes-avx10_512", 800); 834 1217 #endif 835 1218 1219 + /* The common part of the x86_64 AES-GCM key struct */ 1220 + struct aes_gcm_key { 1221 + /* Expanded AES key and the AES key length in bytes */ 1222 + struct crypto_aes_ctx aes_key; 1223 + 1224 + /* RFC4106 nonce (used only by the rfc4106 algorithms) */ 1225 + u32 rfc4106_nonce; 1226 + }; 1227 + 1228 + /* Key struct used by the AES-NI implementations of AES-GCM */ 1229 + struct aes_gcm_key_aesni { 1230 + /* 1231 + * Common part of the key. The assembly code requires 16-byte alignment 1232 + * for the round keys; we get this by them being located at the start of 1233 + * the struct and the whole struct being 16-byte aligned. 1234 + */ 1235 + struct aes_gcm_key base; 1236 + 1237 + /* 1238 + * Powers of the hash key H^8 through H^1. These are 128-bit values. 1239 + * They all have an extra factor of x^-1 and are byte-reversed. 16-byte 1240 + * alignment is required by the assembly code. 1241 + */ 1242 + u64 h_powers[8][2] __aligned(16); 1243 + 1244 + /* 1245 + * h_powers_xored[i] contains the two 64-bit halves of h_powers[i] XOR'd 1246 + * together. It's used for Karatsuba multiplication. 16-byte alignment 1247 + * is required by the assembly code. 1248 + */ 1249 + u64 h_powers_xored[8] __aligned(16); 1250 + 1251 + /* 1252 + * H^1 times x^64 (and also the usual extra factor of x^-1). 16-byte 1253 + * alignment is required by the assembly code. 
1254 + */ 1255 + u64 h_times_x64[2] __aligned(16); 1256 + }; 1257 + #define AES_GCM_KEY_AESNI(key) \ 1258 + container_of((key), struct aes_gcm_key_aesni, base) 1259 + #define AES_GCM_KEY_AESNI_SIZE \ 1260 + (sizeof(struct aes_gcm_key_aesni) + (15 & ~(CRYPTO_MINALIGN - 1))) 1261 + 1262 + /* Key struct used by the VAES + AVX10 implementations of AES-GCM */ 1263 + struct aes_gcm_key_avx10 { 1264 + /* 1265 + * Common part of the key. The assembly code prefers 16-byte alignment 1266 + * for the round keys; we get this by them being located at the start of 1267 + * the struct and the whole struct being 64-byte aligned. 1268 + */ 1269 + struct aes_gcm_key base; 1270 + 1271 + /* 1272 + * Powers of the hash key H^16 through H^1. These are 128-bit values. 1273 + * They all have an extra factor of x^-1 and are byte-reversed. This 1274 + * array is aligned to a 64-byte boundary to make it naturally aligned 1275 + * for 512-bit loads, which can improve performance. (The assembly code 1276 + * doesn't *need* the alignment; this is just an optimization.) 1277 + */ 1278 + u64 h_powers[16][2] __aligned(64); 1279 + 1280 + /* Three padding blocks required by the assembly code */ 1281 + u64 padding[3][2]; 1282 + }; 1283 + #define AES_GCM_KEY_AVX10(key) \ 1284 + container_of((key), struct aes_gcm_key_avx10, base) 1285 + #define AES_GCM_KEY_AVX10_SIZE \ 1286 + (sizeof(struct aes_gcm_key_avx10) + (63 & ~(CRYPTO_MINALIGN - 1))) 1287 + 1288 + /* 1289 + * These flags are passed to the AES-GCM helper functions to specify the 1290 + * specific version of AES-GCM (RFC4106 or not), whether it's encryption or 1291 + * decryption, and which assembly functions should be called. Assembly 1292 + * functions are selected using flags instead of function pointers to avoid 1293 + * indirect calls (which are very expensive on x86) regardless of inlining. 
1294 + */ 1295 + #define FLAG_RFC4106 BIT(0) 1296 + #define FLAG_ENC BIT(1) 1297 + #define FLAG_AVX BIT(2) 1298 + #if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ) 1299 + # define FLAG_AVX10_256 BIT(3) 1300 + # define FLAG_AVX10_512 BIT(4) 1301 + #else 1302 + /* 1303 + * This should cause all calls to the AVX10 assembly functions to be 1304 + * optimized out, avoiding the need to ifdef each call individually. 1305 + */ 1306 + # define FLAG_AVX10_256 0 1307 + # define FLAG_AVX10_512 0 1308 + #endif 1309 + 1310 + static inline struct aes_gcm_key * 1311 + aes_gcm_key_get(struct crypto_aead *tfm, int flags) 1312 + { 1313 + if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512)) 1314 + return PTR_ALIGN(crypto_aead_ctx(tfm), 64); 1315 + else 1316 + return PTR_ALIGN(crypto_aead_ctx(tfm), 16); 1317 + } 1318 + 1319 + asmlinkage void 1320 + aes_gcm_precompute_aesni(struct aes_gcm_key_aesni *key); 1321 + asmlinkage void 1322 + aes_gcm_precompute_aesni_avx(struct aes_gcm_key_aesni *key); 1323 + asmlinkage void 1324 + aes_gcm_precompute_vaes_avx10_256(struct aes_gcm_key_avx10 *key); 1325 + asmlinkage void 1326 + aes_gcm_precompute_vaes_avx10_512(struct aes_gcm_key_avx10 *key); 1327 + 1328 + static void aes_gcm_precompute(struct aes_gcm_key *key, int flags) 1329 + { 1330 + /* 1331 + * To make things a bit easier on the assembly side, the AVX10 1332 + * implementations use the same key format. Therefore, a single 1333 + * function using 256-bit vectors would suffice here. However, it's 1334 + * straightforward to provide a 512-bit one because of how the assembly 1335 + * code is structured, and it works nicely because the total size of the 1336 + * key powers is a multiple of 512 bits. So we take advantage of that. 1337 + * 1338 + * A similar situation applies to the AES-NI implementations. 
1339 + */ 1340 + if (flags & FLAG_AVX10_512) 1341 + aes_gcm_precompute_vaes_avx10_512(AES_GCM_KEY_AVX10(key)); 1342 + else if (flags & FLAG_AVX10_256) 1343 + aes_gcm_precompute_vaes_avx10_256(AES_GCM_KEY_AVX10(key)); 1344 + else if (flags & FLAG_AVX) 1345 + aes_gcm_precompute_aesni_avx(AES_GCM_KEY_AESNI(key)); 1346 + else 1347 + aes_gcm_precompute_aesni(AES_GCM_KEY_AESNI(key)); 1348 + } 1349 + 1350 + asmlinkage void 1351 + aes_gcm_aad_update_aesni(const struct aes_gcm_key_aesni *key, 1352 + u8 ghash_acc[16], const u8 *aad, int aadlen); 1353 + asmlinkage void 1354 + aes_gcm_aad_update_aesni_avx(const struct aes_gcm_key_aesni *key, 1355 + u8 ghash_acc[16], const u8 *aad, int aadlen); 1356 + asmlinkage void 1357 + aes_gcm_aad_update_vaes_avx10(const struct aes_gcm_key_avx10 *key, 1358 + u8 ghash_acc[16], const u8 *aad, int aadlen); 1359 + 1360 + static void aes_gcm_aad_update(const struct aes_gcm_key *key, u8 ghash_acc[16], 1361 + const u8 *aad, int aadlen, int flags) 1362 + { 1363 + if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512)) 1364 + aes_gcm_aad_update_vaes_avx10(AES_GCM_KEY_AVX10(key), ghash_acc, 1365 + aad, aadlen); 1366 + else if (flags & FLAG_AVX) 1367 + aes_gcm_aad_update_aesni_avx(AES_GCM_KEY_AESNI(key), ghash_acc, 1368 + aad, aadlen); 1369 + else 1370 + aes_gcm_aad_update_aesni(AES_GCM_KEY_AESNI(key), ghash_acc, 1371 + aad, aadlen); 1372 + } 1373 + 1374 + asmlinkage void 1375 + aes_gcm_enc_update_aesni(const struct aes_gcm_key_aesni *key, 1376 + const u32 le_ctr[4], u8 ghash_acc[16], 1377 + const u8 *src, u8 *dst, int datalen); 1378 + asmlinkage void 1379 + aes_gcm_enc_update_aesni_avx(const struct aes_gcm_key_aesni *key, 1380 + const u32 le_ctr[4], u8 ghash_acc[16], 1381 + const u8 *src, u8 *dst, int datalen); 1382 + asmlinkage void 1383 + aes_gcm_enc_update_vaes_avx10_256(const struct aes_gcm_key_avx10 *key, 1384 + const u32 le_ctr[4], u8 ghash_acc[16], 1385 + const u8 *src, u8 *dst, int datalen); 1386 + asmlinkage void 1387 + 
aes_gcm_enc_update_vaes_avx10_512(const struct aes_gcm_key_avx10 *key, 1388 + const u32 le_ctr[4], u8 ghash_acc[16], 1389 + const u8 *src, u8 *dst, int datalen); 1390 + 1391 + asmlinkage void 1392 + aes_gcm_dec_update_aesni(const struct aes_gcm_key_aesni *key, 1393 + const u32 le_ctr[4], u8 ghash_acc[16], 1394 + const u8 *src, u8 *dst, int datalen); 1395 + asmlinkage void 1396 + aes_gcm_dec_update_aesni_avx(const struct aes_gcm_key_aesni *key, 1397 + const u32 le_ctr[4], u8 ghash_acc[16], 1398 + const u8 *src, u8 *dst, int datalen); 1399 + asmlinkage void 1400 + aes_gcm_dec_update_vaes_avx10_256(const struct aes_gcm_key_avx10 *key, 1401 + const u32 le_ctr[4], u8 ghash_acc[16], 1402 + const u8 *src, u8 *dst, int datalen); 1403 + asmlinkage void 1404 + aes_gcm_dec_update_vaes_avx10_512(const struct aes_gcm_key_avx10 *key, 1405 + const u32 le_ctr[4], u8 ghash_acc[16], 1406 + const u8 *src, u8 *dst, int datalen); 1407 + 1408 + /* __always_inline to optimize out the branches based on @flags */ 1409 + static __always_inline void 1410 + aes_gcm_update(const struct aes_gcm_key *key, 1411 + const u32 le_ctr[4], u8 ghash_acc[16], 1412 + const u8 *src, u8 *dst, int datalen, int flags) 1413 + { 1414 + if (flags & FLAG_ENC) { 1415 + if (flags & FLAG_AVX10_512) 1416 + aes_gcm_enc_update_vaes_avx10_512(AES_GCM_KEY_AVX10(key), 1417 + le_ctr, ghash_acc, 1418 + src, dst, datalen); 1419 + else if (flags & FLAG_AVX10_256) 1420 + aes_gcm_enc_update_vaes_avx10_256(AES_GCM_KEY_AVX10(key), 1421 + le_ctr, ghash_acc, 1422 + src, dst, datalen); 1423 + else if (flags & FLAG_AVX) 1424 + aes_gcm_enc_update_aesni_avx(AES_GCM_KEY_AESNI(key), 1425 + le_ctr, ghash_acc, 1426 + src, dst, datalen); 1427 + else 1428 + aes_gcm_enc_update_aesni(AES_GCM_KEY_AESNI(key), le_ctr, 1429 + ghash_acc, src, dst, datalen); 1430 + } else { 1431 + if (flags & FLAG_AVX10_512) 1432 + aes_gcm_dec_update_vaes_avx10_512(AES_GCM_KEY_AVX10(key), 1433 + le_ctr, ghash_acc, 1434 + src, dst, datalen); 1435 + else if (flags & 
FLAG_AVX10_256) 1436 + aes_gcm_dec_update_vaes_avx10_256(AES_GCM_KEY_AVX10(key), 1437 + le_ctr, ghash_acc, 1438 + src, dst, datalen); 1439 + else if (flags & FLAG_AVX) 1440 + aes_gcm_dec_update_aesni_avx(AES_GCM_KEY_AESNI(key), 1441 + le_ctr, ghash_acc, 1442 + src, dst, datalen); 1443 + else 1444 + aes_gcm_dec_update_aesni(AES_GCM_KEY_AESNI(key), 1445 + le_ctr, ghash_acc, 1446 + src, dst, datalen); 1447 + } 1448 + } 1449 + 1450 + asmlinkage void 1451 + aes_gcm_enc_final_aesni(const struct aes_gcm_key_aesni *key, 1452 + const u32 le_ctr[4], u8 ghash_acc[16], 1453 + u64 total_aadlen, u64 total_datalen); 1454 + asmlinkage void 1455 + aes_gcm_enc_final_aesni_avx(const struct aes_gcm_key_aesni *key, 1456 + const u32 le_ctr[4], u8 ghash_acc[16], 1457 + u64 total_aadlen, u64 total_datalen); 1458 + asmlinkage void 1459 + aes_gcm_enc_final_vaes_avx10(const struct aes_gcm_key_avx10 *key, 1460 + const u32 le_ctr[4], u8 ghash_acc[16], 1461 + u64 total_aadlen, u64 total_datalen); 1462 + 1463 + /* __always_inline to optimize out the branches based on @flags */ 1464 + static __always_inline void 1465 + aes_gcm_enc_final(const struct aes_gcm_key *key, 1466 + const u32 le_ctr[4], u8 ghash_acc[16], 1467 + u64 total_aadlen, u64 total_datalen, int flags) 1468 + { 1469 + if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512)) 1470 + aes_gcm_enc_final_vaes_avx10(AES_GCM_KEY_AVX10(key), 1471 + le_ctr, ghash_acc, 1472 + total_aadlen, total_datalen); 1473 + else if (flags & FLAG_AVX) 1474 + aes_gcm_enc_final_aesni_avx(AES_GCM_KEY_AESNI(key), 1475 + le_ctr, ghash_acc, 1476 + total_aadlen, total_datalen); 1477 + else 1478 + aes_gcm_enc_final_aesni(AES_GCM_KEY_AESNI(key), 1479 + le_ctr, ghash_acc, 1480 + total_aadlen, total_datalen); 1481 + } 1482 + 1483 + asmlinkage bool __must_check 1484 + aes_gcm_dec_final_aesni(const struct aes_gcm_key_aesni *key, 1485 + const u32 le_ctr[4], const u8 ghash_acc[16], 1486 + u64 total_aadlen, u64 total_datalen, 1487 + const u8 tag[16], int taglen); 1488 + asmlinkage 
bool __must_check 1489 + aes_gcm_dec_final_aesni_avx(const struct aes_gcm_key_aesni *key, 1490 + const u32 le_ctr[4], const u8 ghash_acc[16], 1491 + u64 total_aadlen, u64 total_datalen, 1492 + const u8 tag[16], int taglen); 1493 + asmlinkage bool __must_check 1494 + aes_gcm_dec_final_vaes_avx10(const struct aes_gcm_key_avx10 *key, 1495 + const u32 le_ctr[4], const u8 ghash_acc[16], 1496 + u64 total_aadlen, u64 total_datalen, 1497 + const u8 tag[16], int taglen); 1498 + 1499 + /* __always_inline to optimize out the branches based on @flags */ 1500 + static __always_inline bool __must_check 1501 + aes_gcm_dec_final(const struct aes_gcm_key *key, const u32 le_ctr[4], 1502 + u8 ghash_acc[16], u64 total_aadlen, u64 total_datalen, 1503 + u8 tag[16], int taglen, int flags) 1504 + { 1505 + if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512)) 1506 + return aes_gcm_dec_final_vaes_avx10(AES_GCM_KEY_AVX10(key), 1507 + le_ctr, ghash_acc, 1508 + total_aadlen, total_datalen, 1509 + tag, taglen); 1510 + else if (flags & FLAG_AVX) 1511 + return aes_gcm_dec_final_aesni_avx(AES_GCM_KEY_AESNI(key), 1512 + le_ctr, ghash_acc, 1513 + total_aadlen, total_datalen, 1514 + tag, taglen); 1515 + else 1516 + return aes_gcm_dec_final_aesni(AES_GCM_KEY_AESNI(key), 1517 + le_ctr, ghash_acc, 1518 + total_aadlen, total_datalen, 1519 + tag, taglen); 1520 + } 1521 + 1522 + /* 1523 + * This is the Integrity Check Value (aka the authentication tag) length and can 1524 + * be 8, 12 or 16 bytes long. 
1525 + */ 1526 + static int common_rfc4106_set_authsize(struct crypto_aead *aead, 1527 + unsigned int authsize) 1528 + { 1529 + switch (authsize) { 1530 + case 8: 1531 + case 12: 1532 + case 16: 1533 + break; 1534 + default: 1535 + return -EINVAL; 1536 + } 1537 + 1538 + return 0; 1539 + } 1540 + 1541 + static int generic_gcmaes_set_authsize(struct crypto_aead *tfm, 1542 + unsigned int authsize) 1543 + { 1544 + switch (authsize) { 1545 + case 4: 1546 + case 8: 1547 + case 12: 1548 + case 13: 1549 + case 14: 1550 + case 15: 1551 + case 16: 1552 + break; 1553 + default: 1554 + return -EINVAL; 1555 + } 1556 + 1557 + return 0; 1558 + } 1559 + 1560 + /* 1561 + * This is the setkey function for the x86_64 implementations of AES-GCM. It 1562 + * saves the RFC4106 nonce if applicable, expands the AES key, and precomputes 1563 + * powers of the hash key. 1564 + * 1565 + * To comply with the crypto_aead API, this has to be usable in no-SIMD context. 1566 + * For that reason, this function includes a portable C implementation of the 1567 + * needed logic. However, the portable C implementation is very slow, taking 1568 + * about the same time as encrypting 37 KB of data. To be ready for users that 1569 + * may set a key even somewhat frequently, we therefore also include a SIMD 1570 + * assembly implementation, expanding the AES key using AES-NI and precomputing 1571 + * the hash key powers using PCLMULQDQ or VPCLMULQDQ. 1572 + */ 1573 + static int gcm_setkey(struct crypto_aead *tfm, const u8 *raw_key, 1574 + unsigned int keylen, int flags) 1575 + { 1576 + struct aes_gcm_key *key = aes_gcm_key_get(tfm, flags); 1577 + int err; 1578 + 1579 + if (flags & FLAG_RFC4106) { 1580 + if (keylen < 4) 1581 + return -EINVAL; 1582 + keylen -= 4; 1583 + key->rfc4106_nonce = get_unaligned_be32(raw_key + keylen); 1584 + } 1585 + 1586 + /* The assembly code assumes the following offsets. 
*/ 1587 + BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, base.aes_key.key_enc) != 0); 1588 + BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, base.aes_key.key_length) != 480); 1589 + BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, h_powers) != 496); 1590 + BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, h_powers_xored) != 624); 1591 + BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, h_times_x64) != 688); 1592 + BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, base.aes_key.key_enc) != 0); 1593 + BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, base.aes_key.key_length) != 480); 1594 + BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, h_powers) != 512); 1595 + BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, padding) != 768); 1596 + 1597 + if (likely(crypto_simd_usable())) { 1598 + err = aes_check_keylen(keylen); 1599 + if (err) 1600 + return err; 1601 + kernel_fpu_begin(); 1602 + aesni_set_key(&key->aes_key, raw_key, keylen); 1603 + aes_gcm_precompute(key, flags); 1604 + kernel_fpu_end(); 1605 + } else { 1606 + static const u8 x_to_the_minus1[16] __aligned(__alignof__(be128)) = { 1607 + [0] = 0xc2, [15] = 1 1608 + }; 1609 + static const u8 x_to_the_63[16] __aligned(__alignof__(be128)) = { 1610 + [7] = 1, 1611 + }; 1612 + be128 h1 = {}; 1613 + be128 h; 1614 + int i; 1615 + 1616 + err = aes_expandkey(&key->aes_key, raw_key, keylen); 1617 + if (err) 1618 + return err; 1619 + 1620 + /* Encrypt the all-zeroes block to get the hash key H^1 */ 1621 + aes_encrypt(&key->aes_key, (u8 *)&h1, (u8 *)&h1); 1622 + 1623 + /* Compute H^1 * x^-1 */ 1624 + h = h1; 1625 + gf128mul_lle(&h, (const be128 *)x_to_the_minus1); 1626 + 1627 + /* Compute the needed key powers */ 1628 + if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512)) { 1629 + struct aes_gcm_key_avx10 *k = AES_GCM_KEY_AVX10(key); 1630 + 1631 + for (i = ARRAY_SIZE(k->h_powers) - 1; i >= 0; i--) { 1632 + k->h_powers[i][0] = be64_to_cpu(h.b); 1633 + k->h_powers[i][1] = be64_to_cpu(h.a); 1634 + gf128mul_lle(&h, &h1); 1635 + } 1636 + 
memset(k->padding, 0, sizeof(k->padding)); 1637 + } else { 1638 + struct aes_gcm_key_aesni *k = AES_GCM_KEY_AESNI(key); 1639 + 1640 + for (i = ARRAY_SIZE(k->h_powers) - 1; i >= 0; i--) { 1641 + k->h_powers[i][0] = be64_to_cpu(h.b); 1642 + k->h_powers[i][1] = be64_to_cpu(h.a); 1643 + k->h_powers_xored[i] = k->h_powers[i][0] ^ 1644 + k->h_powers[i][1]; 1645 + gf128mul_lle(&h, &h1); 1646 + } 1647 + gf128mul_lle(&h1, (const be128 *)x_to_the_63); 1648 + k->h_times_x64[0] = be64_to_cpu(h1.b); 1649 + k->h_times_x64[1] = be64_to_cpu(h1.a); 1650 + } 1651 + } 1652 + return 0; 1653 + } 1654 + 1655 + /* 1656 + * Initialize @ghash_acc, then pass all @assoclen bytes of associated data 1657 + * (a.k.a. additional authenticated data) from @sg_src through the GHASH update 1658 + * assembly function. kernel_fpu_begin() must have already been called. 1659 + */ 1660 + static void gcm_process_assoc(const struct aes_gcm_key *key, u8 ghash_acc[16], 1661 + struct scatterlist *sg_src, unsigned int assoclen, 1662 + int flags) 1663 + { 1664 + struct scatter_walk walk; 1665 + /* 1666 + * The assembly function requires that the length of any non-last 1667 + * segment of associated data be a multiple of 16 bytes, so this 1668 + * function does the buffering needed to achieve that. 
1669 + */ 1670 + unsigned int pos = 0; 1671 + u8 buf[16]; 1672 + 1673 + memset(ghash_acc, 0, 16); 1674 + scatterwalk_start(&walk, sg_src); 1675 + 1676 + while (assoclen) { 1677 + unsigned int len_this_page = scatterwalk_clamp(&walk, assoclen); 1678 + void *mapped = scatterwalk_map(&walk); 1679 + const void *src = mapped; 1680 + unsigned int len; 1681 + 1682 + assoclen -= len_this_page; 1683 + scatterwalk_advance(&walk, len_this_page); 1684 + if (unlikely(pos)) { 1685 + len = min(len_this_page, 16 - pos); 1686 + memcpy(&buf[pos], src, len); 1687 + pos += len; 1688 + src += len; 1689 + len_this_page -= len; 1690 + if (pos < 16) 1691 + goto next; 1692 + aes_gcm_aad_update(key, ghash_acc, buf, 16, flags); 1693 + pos = 0; 1694 + } 1695 + len = len_this_page; 1696 + if (unlikely(assoclen)) /* Not the last segment yet? */ 1697 + len = round_down(len, 16); 1698 + aes_gcm_aad_update(key, ghash_acc, src, len, flags); 1699 + src += len; 1700 + len_this_page -= len; 1701 + if (unlikely(len_this_page)) { 1702 + memcpy(buf, src, len_this_page); 1703 + pos = len_this_page; 1704 + } 1705 + next: 1706 + scatterwalk_unmap(mapped); 1707 + scatterwalk_pagedone(&walk, 0, assoclen); 1708 + if (need_resched()) { 1709 + kernel_fpu_end(); 1710 + kernel_fpu_begin(); 1711 + } 1712 + } 1713 + if (unlikely(pos)) 1714 + aes_gcm_aad_update(key, ghash_acc, buf, pos, flags); 1715 + } 1716 + 1717 + 1718 + /* __always_inline to optimize out the branches based on @flags */ 1719 + static __always_inline int 1720 + gcm_crypt(struct aead_request *req, int flags) 1721 + { 1722 + struct crypto_aead *tfm = crypto_aead_reqtfm(req); 1723 + const struct aes_gcm_key *key = aes_gcm_key_get(tfm, flags); 1724 + unsigned int assoclen = req->assoclen; 1725 + struct skcipher_walk walk; 1726 + unsigned int nbytes; 1727 + u8 ghash_acc[16]; /* GHASH accumulator */ 1728 + u32 le_ctr[4]; /* Counter in little-endian format */ 1729 + int taglen; 1730 + int err; 1731 + 1732 + /* Initialize the counter and determine the 
associated data length. */ 1733 + le_ctr[0] = 2; 1734 + if (flags & FLAG_RFC4106) { 1735 + if (unlikely(assoclen != 16 && assoclen != 20)) 1736 + return -EINVAL; 1737 + assoclen -= 8; 1738 + le_ctr[1] = get_unaligned_be32(req->iv + 4); 1739 + le_ctr[2] = get_unaligned_be32(req->iv + 0); 1740 + le_ctr[3] = key->rfc4106_nonce; /* already byte-swapped */ 1741 + } else { 1742 + le_ctr[1] = get_unaligned_be32(req->iv + 8); 1743 + le_ctr[2] = get_unaligned_be32(req->iv + 4); 1744 + le_ctr[3] = get_unaligned_be32(req->iv + 0); 1745 + } 1746 + 1747 + /* Begin walking through the plaintext or ciphertext. */ 1748 + if (flags & FLAG_ENC) 1749 + err = skcipher_walk_aead_encrypt(&walk, req, false); 1750 + else 1751 + err = skcipher_walk_aead_decrypt(&walk, req, false); 1752 + 1753 + /* 1754 + * Since the AES-GCM assembly code requires that at least three assembly 1755 + * functions be called to process any message (this is needed to support 1756 + * incremental updates cleanly), to reduce overhead we try to do all 1757 + * three calls in the same kernel FPU section if possible. We close the 1758 + * section and start a new one if there are multiple data segments or if 1759 + * rescheduling is needed while processing the associated data. 1760 + */ 1761 + kernel_fpu_begin(); 1762 + 1763 + /* Pass the associated data through GHASH. */ 1764 + gcm_process_assoc(key, ghash_acc, req->src, assoclen, flags); 1765 + 1766 + /* En/decrypt the data and pass the ciphertext through GHASH. */ 1767 + while ((nbytes = walk.nbytes) != 0) { 1768 + if (unlikely(nbytes < walk.total)) { 1769 + /* 1770 + * Non-last segment. In this case, the assembly 1771 + * function requires that the length be a multiple of 16 1772 + * (AES_BLOCK_SIZE) bytes. The needed buffering of up 1773 + * to 16 bytes is handled by the skcipher_walk. Here we 1774 + * just need to round down to a multiple of 16. 
1775 + */ 1776 + nbytes = round_down(nbytes, AES_BLOCK_SIZE); 1777 + aes_gcm_update(key, le_ctr, ghash_acc, 1778 + walk.src.virt.addr, walk.dst.virt.addr, 1779 + nbytes, flags); 1780 + le_ctr[0] += nbytes / AES_BLOCK_SIZE; 1781 + kernel_fpu_end(); 1782 + err = skcipher_walk_done(&walk, walk.nbytes - nbytes); 1783 + kernel_fpu_begin(); 1784 + } else { 1785 + /* Last segment: process all remaining data. */ 1786 + aes_gcm_update(key, le_ctr, ghash_acc, 1787 + walk.src.virt.addr, walk.dst.virt.addr, 1788 + nbytes, flags); 1789 + err = skcipher_walk_done(&walk, 0); 1790 + /* 1791 + * The low word of the counter isn't used by the 1792 + * finalize, so there's no need to increment it here. 1793 + */ 1794 + } 1795 + } 1796 + if (err) 1797 + goto out; 1798 + 1799 + /* Finalize */ 1800 + taglen = crypto_aead_authsize(tfm); 1801 + if (flags & FLAG_ENC) { 1802 + /* Finish computing the auth tag. */ 1803 + aes_gcm_enc_final(key, le_ctr, ghash_acc, assoclen, 1804 + req->cryptlen, flags); 1805 + 1806 + /* Store the computed auth tag in the dst scatterlist. */ 1807 + scatterwalk_map_and_copy(ghash_acc, req->dst, req->assoclen + 1808 + req->cryptlen, taglen, 1); 1809 + } else { 1810 + unsigned int datalen = req->cryptlen - taglen; 1811 + u8 tag[16]; 1812 + 1813 + /* Get the transmitted auth tag from the src scatterlist. */ 1814 + scatterwalk_map_and_copy(tag, req->src, req->assoclen + datalen, 1815 + taglen, 0); 1816 + /* 1817 + * Finish computing the auth tag and compare it to the 1818 + * transmitted one. The assembly function does the actual tag 1819 + * comparison. Here, just check the boolean result. 
1820 + */ 1821 + if (!aes_gcm_dec_final(key, le_ctr, ghash_acc, assoclen, 1822 + datalen, tag, taglen, flags)) 1823 + err = -EBADMSG; 1824 + } 1825 + out: 1826 + kernel_fpu_end(); 1827 + return err; 1828 + } 1829 + 1830 + #define DEFINE_GCM_ALGS(suffix, flags, generic_driver_name, rfc_driver_name, \ 1831 + ctxsize, priority) \ 1832 + \ 1833 + static int gcm_setkey_##suffix(struct crypto_aead *tfm, const u8 *raw_key, \ 1834 + unsigned int keylen) \ 1835 + { \ 1836 + return gcm_setkey(tfm, raw_key, keylen, (flags)); \ 1837 + } \ 1838 + \ 1839 + static int gcm_encrypt_##suffix(struct aead_request *req) \ 1840 + { \ 1841 + return gcm_crypt(req, (flags) | FLAG_ENC); \ 1842 + } \ 1843 + \ 1844 + static int gcm_decrypt_##suffix(struct aead_request *req) \ 1845 + { \ 1846 + return gcm_crypt(req, (flags)); \ 1847 + } \ 1848 + \ 1849 + static int rfc4106_setkey_##suffix(struct crypto_aead *tfm, const u8 *raw_key, \ 1850 + unsigned int keylen) \ 1851 + { \ 1852 + return gcm_setkey(tfm, raw_key, keylen, (flags) | FLAG_RFC4106); \ 1853 + } \ 1854 + \ 1855 + static int rfc4106_encrypt_##suffix(struct aead_request *req) \ 1856 + { \ 1857 + return gcm_crypt(req, (flags) | FLAG_RFC4106 | FLAG_ENC); \ 1858 + } \ 1859 + \ 1860 + static int rfc4106_decrypt_##suffix(struct aead_request *req) \ 1861 + { \ 1862 + return gcm_crypt(req, (flags) | FLAG_RFC4106); \ 1863 + } \ 1864 + \ 1865 + static struct aead_alg aes_gcm_algs_##suffix[] = { { \ 1866 + .setkey = gcm_setkey_##suffix, \ 1867 + .setauthsize = generic_gcmaes_set_authsize, \ 1868 + .encrypt = gcm_encrypt_##suffix, \ 1869 + .decrypt = gcm_decrypt_##suffix, \ 1870 + .ivsize = GCM_AES_IV_SIZE, \ 1871 + .chunksize = AES_BLOCK_SIZE, \ 1872 + .maxauthsize = 16, \ 1873 + .base = { \ 1874 + .cra_name = "__gcm(aes)", \ 1875 + .cra_driver_name = "__" generic_driver_name, \ 1876 + .cra_priority = (priority), \ 1877 + .cra_flags = CRYPTO_ALG_INTERNAL, \ 1878 + .cra_blocksize = 1, \ 1879 + .cra_ctxsize = (ctxsize), \ 1880 + .cra_module = 
THIS_MODULE, \ 1881 + }, \ 1882 + }, { \ 1883 + .setkey = rfc4106_setkey_##suffix, \ 1884 + .setauthsize = common_rfc4106_set_authsize, \ 1885 + .encrypt = rfc4106_encrypt_##suffix, \ 1886 + .decrypt = rfc4106_decrypt_##suffix, \ 1887 + .ivsize = GCM_RFC4106_IV_SIZE, \ 1888 + .chunksize = AES_BLOCK_SIZE, \ 1889 + .maxauthsize = 16, \ 1890 + .base = { \ 1891 + .cra_name = "__rfc4106(gcm(aes))", \ 1892 + .cra_driver_name = "__" rfc_driver_name, \ 1893 + .cra_priority = (priority), \ 1894 + .cra_flags = CRYPTO_ALG_INTERNAL, \ 1895 + .cra_blocksize = 1, \ 1896 + .cra_ctxsize = (ctxsize), \ 1897 + .cra_module = THIS_MODULE, \ 1898 + }, \ 1899 + } }; \ 1900 + \ 1901 + static struct simd_aead_alg *aes_gcm_simdalgs_##suffix[2] \ 1902 + 1903 + /* aes_gcm_algs_aesni */ 1904 + DEFINE_GCM_ALGS(aesni, /* no flags */ 0, 1905 + "generic-gcm-aesni", "rfc4106-gcm-aesni", 1906 + AES_GCM_KEY_AESNI_SIZE, 400); 1907 + 1908 + /* aes_gcm_algs_aesni_avx */ 1909 + DEFINE_GCM_ALGS(aesni_avx, FLAG_AVX, 1910 + "generic-gcm-aesni-avx", "rfc4106-gcm-aesni-avx", 1911 + AES_GCM_KEY_AESNI_SIZE, 500); 1912 + 1913 + #if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ) 1914 + /* aes_gcm_algs_vaes_avx10_256 */ 1915 + DEFINE_GCM_ALGS(vaes_avx10_256, FLAG_AVX10_256, 1916 + "generic-gcm-vaes-avx10_256", "rfc4106-gcm-vaes-avx10_256", 1917 + AES_GCM_KEY_AVX10_SIZE, 700); 1918 + 1919 + /* aes_gcm_algs_vaes_avx10_512 */ 1920 + DEFINE_GCM_ALGS(vaes_avx10_512, FLAG_AVX10_512, 1921 + "generic-gcm-vaes-avx10_512", "rfc4106-gcm-vaes-avx10_512", 1922 + AES_GCM_KEY_AVX10_SIZE, 800); 1923 + #endif /* CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ */ 1924 + 836 1925 /* 837 1926 * This is a list of CPU models that are known to suffer from downclocking when 838 - * zmm registers (512-bit vectors) are used. On these CPUs, the AES-XTS 839 - * implementation with zmm registers won't be used by default. An 840 - * implementation with ymm registers (256-bit vectors) will be used instead. 
1927 + * zmm registers (512-bit vectors) are used. On these CPUs, the AES mode 1928 + * implementations with zmm registers won't be used by default. Implementations 1929 + * with ymm registers (256-bit vectors) will be used by default instead. 841 1930 */ 842 1931 static const struct x86_cpu_id zmm_exclusion_list[] = { 843 1932 X86_MATCH_VFM(INTEL_SKYLAKE_X, 0), ··· 1559 1236 {}, 1560 1237 }; 1561 1238 1562 - static int __init register_xts_algs(void) 1239 + static int __init register_avx_algs(void) 1563 1240 { 1564 1241 int err; 1565 1242 ··· 1567 1244 return 0; 1568 1245 err = simd_register_skciphers_compat(&aes_xts_alg_aesni_avx, 1, 1569 1246 &aes_xts_simdalg_aesni_avx); 1247 + if (err) 1248 + return err; 1249 + err = simd_register_aeads_compat(aes_gcm_algs_aesni_avx, 1250 + ARRAY_SIZE(aes_gcm_algs_aesni_avx), 1251 + aes_gcm_simdalgs_aesni_avx); 1570 1252 if (err) 1571 1253 return err; 1572 1254 #if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ) ··· 1597 1269 &aes_xts_simdalg_vaes_avx10_256); 1598 1270 if (err) 1599 1271 return err; 1272 + err = simd_register_aeads_compat(aes_gcm_algs_vaes_avx10_256, 1273 + ARRAY_SIZE(aes_gcm_algs_vaes_avx10_256), 1274 + aes_gcm_simdalgs_vaes_avx10_256); 1275 + if (err) 1276 + return err; 1600 1277 1601 - if (x86_match_cpu(zmm_exclusion_list)) 1278 + if (x86_match_cpu(zmm_exclusion_list)) { 1279 + int i; 1280 + 1602 1281 aes_xts_alg_vaes_avx10_512.base.cra_priority = 1; 1282 + for (i = 0; i < ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512); i++) 1283 + aes_gcm_algs_vaes_avx10_512[i].base.cra_priority = 1; 1284 + } 1603 1285 1604 1286 err = simd_register_skciphers_compat(&aes_xts_alg_vaes_avx10_512, 1, 1605 1287 &aes_xts_simdalg_vaes_avx10_512); 1288 + if (err) 1289 + return err; 1290 + err = simd_register_aeads_compat(aes_gcm_algs_vaes_avx10_512, 1291 + ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512), 1292 + aes_gcm_simdalgs_vaes_avx10_512); 1606 1293 if (err) 1607 1294 return err; 1608 1295 #endif /* CONFIG_AS_VAES && 
CONFIG_AS_VPCLMULQDQ */ 1609 1296 return 0; 1610 1297 } 1611 1298 1612 - static void unregister_xts_algs(void) 1299 + static void unregister_avx_algs(void) 1613 1300 { 1614 1301 if (aes_xts_simdalg_aesni_avx) 1615 1302 simd_unregister_skciphers(&aes_xts_alg_aesni_avx, 1, 1616 1303 &aes_xts_simdalg_aesni_avx); 1304 + if (aes_gcm_simdalgs_aesni_avx[0]) 1305 + simd_unregister_aeads(aes_gcm_algs_aesni_avx, 1306 + ARRAY_SIZE(aes_gcm_algs_aesni_avx), 1307 + aes_gcm_simdalgs_aesni_avx); 1617 1308 #if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ) 1618 1309 if (aes_xts_simdalg_vaes_avx2) 1619 1310 simd_unregister_skciphers(&aes_xts_alg_vaes_avx2, 1, ··· 1640 1293 if (aes_xts_simdalg_vaes_avx10_256) 1641 1294 simd_unregister_skciphers(&aes_xts_alg_vaes_avx10_256, 1, 1642 1295 &aes_xts_simdalg_vaes_avx10_256); 1296 + if (aes_gcm_simdalgs_vaes_avx10_256[0]) 1297 + simd_unregister_aeads(aes_gcm_algs_vaes_avx10_256, 1298 + ARRAY_SIZE(aes_gcm_algs_vaes_avx10_256), 1299 + aes_gcm_simdalgs_vaes_avx10_256); 1643 1300 if (aes_xts_simdalg_vaes_avx10_512) 1644 1301 simd_unregister_skciphers(&aes_xts_alg_vaes_avx10_512, 1, 1645 1302 &aes_xts_simdalg_vaes_avx10_512); 1303 + if (aes_gcm_simdalgs_vaes_avx10_512[0]) 1304 + simd_unregister_aeads(aes_gcm_algs_vaes_avx10_512, 1305 + ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512), 1306 + aes_gcm_simdalgs_vaes_avx10_512); 1646 1307 #endif 1647 1308 } 1648 1309 #else /* CONFIG_X86_64 */ 1649 - static int __init register_xts_algs(void) 1310 + static struct aead_alg aes_gcm_algs_aesni[0]; 1311 + static struct simd_aead_alg *aes_gcm_simdalgs_aesni[0]; 1312 + 1313 + static int __init register_avx_algs(void) 1650 1314 { 1651 1315 return 0; 1652 1316 } 1653 1317 1654 - static void unregister_xts_algs(void) 1318 + static void unregister_avx_algs(void) 1655 1319 { 1656 1320 } 1657 1321 #endif /* !CONFIG_X86_64 */ 1658 - 1659 - #ifdef CONFIG_X86_64 1660 - static int generic_gcmaes_set_key(struct crypto_aead *aead, const u8 *key, 1661 - unsigned int 
key_len) 1662 - { 1663 - struct generic_gcmaes_ctx *ctx = generic_gcmaes_ctx_get(aead); 1664 - 1665 - return aes_set_key_common(&ctx->aes_key_expanded, key, key_len) ?: 1666 - aes_gcm_derive_hash_subkey(&ctx->aes_key_expanded, 1667 - ctx->hash_subkey); 1668 - } 1669 - 1670 - static int generic_gcmaes_encrypt(struct aead_request *req) 1671 - { 1672 - struct crypto_aead *tfm = crypto_aead_reqtfm(req); 1673 - struct generic_gcmaes_ctx *ctx = generic_gcmaes_ctx_get(tfm); 1674 - void *aes_ctx = &(ctx->aes_key_expanded); 1675 - u8 ivbuf[16 + (AESNI_ALIGN - 8)] __aligned(8); 1676 - u8 *iv = PTR_ALIGN(&ivbuf[0], AESNI_ALIGN); 1677 - __be32 counter = cpu_to_be32(1); 1678 - 1679 - memcpy(iv, req->iv, 12); 1680 - *((__be32 *)(iv+12)) = counter; 1681 - 1682 - return gcmaes_encrypt(req, req->assoclen, ctx->hash_subkey, iv, 1683 - aes_ctx); 1684 - } 1685 - 1686 - static int generic_gcmaes_decrypt(struct aead_request *req) 1687 - { 1688 - __be32 counter = cpu_to_be32(1); 1689 - struct crypto_aead *tfm = crypto_aead_reqtfm(req); 1690 - struct generic_gcmaes_ctx *ctx = generic_gcmaes_ctx_get(tfm); 1691 - void *aes_ctx = &(ctx->aes_key_expanded); 1692 - u8 ivbuf[16 + (AESNI_ALIGN - 8)] __aligned(8); 1693 - u8 *iv = PTR_ALIGN(&ivbuf[0], AESNI_ALIGN); 1694 - 1695 - memcpy(iv, req->iv, 12); 1696 - *((__be32 *)(iv+12)) = counter; 1697 - 1698 - return gcmaes_decrypt(req, req->assoclen, ctx->hash_subkey, iv, 1699 - aes_ctx); 1700 - } 1701 - 1702 - static struct aead_alg aesni_aeads[] = { { 1703 - .setkey = common_rfc4106_set_key, 1704 - .setauthsize = common_rfc4106_set_authsize, 1705 - .encrypt = helper_rfc4106_encrypt, 1706 - .decrypt = helper_rfc4106_decrypt, 1707 - .ivsize = GCM_RFC4106_IV_SIZE, 1708 - .maxauthsize = 16, 1709 - .base = { 1710 - .cra_name = "__rfc4106(gcm(aes))", 1711 - .cra_driver_name = "__rfc4106-gcm-aesni", 1712 - .cra_priority = 400, 1713 - .cra_flags = CRYPTO_ALG_INTERNAL, 1714 - .cra_blocksize = 1, 1715 - .cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx), 1716 
- .cra_alignmask = 0, 1717 - .cra_module = THIS_MODULE, 1718 - }, 1719 - }, { 1720 - .setkey = generic_gcmaes_set_key, 1721 - .setauthsize = generic_gcmaes_set_authsize, 1722 - .encrypt = generic_gcmaes_encrypt, 1723 - .decrypt = generic_gcmaes_decrypt, 1724 - .ivsize = GCM_AES_IV_SIZE, 1725 - .maxauthsize = 16, 1726 - .base = { 1727 - .cra_name = "__gcm(aes)", 1728 - .cra_driver_name = "__generic-gcm-aesni", 1729 - .cra_priority = 400, 1730 - .cra_flags = CRYPTO_ALG_INTERNAL, 1731 - .cra_blocksize = 1, 1732 - .cra_ctxsize = sizeof(struct generic_gcmaes_ctx), 1733 - .cra_alignmask = 0, 1734 - .cra_module = THIS_MODULE, 1735 - }, 1736 - } }; 1737 - #else 1738 - static struct aead_alg aesni_aeads[0]; 1739 - #endif 1740 - 1741 - static struct simd_aead_alg *aesni_simd_aeads[ARRAY_SIZE(aesni_aeads)]; 1742 1322 1743 1323 static const struct x86_cpu_id aesni_cpu_id[] = { 1744 1324 X86_MATCH_FEATURE(X86_FEATURE_AES, NULL), ··· 1680 1406 if (!x86_match_cpu(aesni_cpu_id)) 1681 1407 return -ENODEV; 1682 1408 #ifdef CONFIG_X86_64 1683 - if (boot_cpu_has(X86_FEATURE_AVX2)) { 1684 - pr_info("AVX2 version of gcm_enc/dec engaged.\n"); 1685 - static_branch_enable(&gcm_use_avx); 1686 - static_branch_enable(&gcm_use_avx2); 1687 - } else 1688 - if (boot_cpu_has(X86_FEATURE_AVX)) { 1689 - pr_info("AVX version of gcm_enc/dec engaged.\n"); 1690 - static_branch_enable(&gcm_use_avx); 1691 - } else { 1692 - pr_info("SSE version of gcm_enc/dec engaged.\n"); 1693 - } 1694 1409 if (boot_cpu_has(X86_FEATURE_AVX)) { 1695 1410 /* optimize performance of ctr mode encryption transform */ 1696 1411 static_call_update(aesni_ctr_enc_tfm, aesni_ctr_enc_avx_tfm); ··· 1697 1434 if (err) 1698 1435 goto unregister_cipher; 1699 1436 1700 - err = simd_register_aeads_compat(aesni_aeads, ARRAY_SIZE(aesni_aeads), 1701 - aesni_simd_aeads); 1437 + err = simd_register_aeads_compat(aes_gcm_algs_aesni, 1438 + ARRAY_SIZE(aes_gcm_algs_aesni), 1439 + aes_gcm_simdalgs_aesni); 1702 1440 if (err) 1703 1441 goto 
unregister_skciphers; 1704 1442 ··· 1711 1447 goto unregister_aeads; 1712 1448 #endif /* CONFIG_X86_64 */ 1713 1449 1714 - err = register_xts_algs(); 1450 + err = register_avx_algs(); 1715 1451 if (err) 1716 - goto unregister_xts; 1452 + goto unregister_avx; 1717 1453 1718 1454 return 0; 1719 1455 1720 - unregister_xts: 1721 - unregister_xts_algs(); 1456 + unregister_avx: 1457 + unregister_avx_algs(); 1722 1458 #ifdef CONFIG_X86_64 1723 1459 if (aesni_simd_xctr) 1724 1460 simd_unregister_skciphers(&aesni_xctr, 1, &aesni_simd_xctr); 1725 1461 unregister_aeads: 1726 1462 #endif /* CONFIG_X86_64 */ 1727 - simd_unregister_aeads(aesni_aeads, ARRAY_SIZE(aesni_aeads), 1728 - aesni_simd_aeads); 1729 - 1463 + simd_unregister_aeads(aes_gcm_algs_aesni, 1464 + ARRAY_SIZE(aes_gcm_algs_aesni), 1465 + aes_gcm_simdalgs_aesni); 1730 1466 unregister_skciphers: 1731 1467 simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers), 1732 1468 aesni_simd_skciphers); ··· 1737 1473 1738 1474 static void __exit aesni_exit(void) 1739 1475 { 1740 - simd_unregister_aeads(aesni_aeads, ARRAY_SIZE(aesni_aeads), 1741 - aesni_simd_aeads); 1476 + simd_unregister_aeads(aes_gcm_algs_aesni, 1477 + ARRAY_SIZE(aes_gcm_algs_aesni), 1478 + aes_gcm_simdalgs_aesni); 1742 1479 simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers), 1743 1480 aesni_simd_skciphers); 1744 1481 crypto_unregister_alg(&aesni_cipher_alg); ··· 1747 1482 if (boot_cpu_has(X86_FEATURE_AVX)) 1748 1483 simd_unregister_skciphers(&aesni_xctr, 1, &aesni_simd_xctr); 1749 1484 #endif /* CONFIG_X86_64 */ 1750 - unregister_xts_algs(); 1485 + unregister_avx_algs(); 1751 1486 } 1752 1487 1753 1488 late_initcall(aesni_init);
+1
arch/x86/crypto/crc32-pclmul_glue.c
··· 195 195 module_exit(crc32_pclmul_mod_fini); 196 196 197 197 MODULE_AUTHOR("Alexander Boyko <alexander_boyko@xyratex.com>"); 198 + MODULE_DESCRIPTION("CRC32 algorithm (IEEE 802.3) accelerated with PCLMULQDQ"); 198 199 MODULE_LICENSE("GPL"); 199 200 200 201 MODULE_ALIAS_CRYPTO("crc32");
+1
arch/x86/crypto/curve25519-x86_64.c
··· 1720 1720 1721 1721 MODULE_ALIAS_CRYPTO("curve25519"); 1722 1722 MODULE_ALIAS_CRYPTO("curve25519-x86"); 1723 + MODULE_DESCRIPTION("Curve25519 algorithm, ADX optimized"); 1723 1724 MODULE_LICENSE("GPL v2"); 1724 1725 MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");
+2 -2
arch/x86/crypto/poly1305_glue.c
··· 12 12 #include <linux/kernel.h> 13 13 #include <linux/module.h> 14 14 #include <linux/sizes.h> 15 - #include <asm/intel-family.h> 15 + #include <asm/cpu_device_id.h> 16 16 #include <asm/simd.h> 17 17 18 18 asmlinkage void poly1305_init_x86_64(void *ctx, ··· 269 269 boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX512F) && 270 270 cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_AVX512, NULL) && 271 271 /* Skylake downclocks unacceptably much when using zmm, but later generations are fast. */ 272 - boot_cpu_data.x86_model != INTEL_FAM6_SKYLAKE_X) 272 + boot_cpu_data.x86_vfm != INTEL_SKYLAKE_X) 273 273 static_branch_enable(&poly1305_use_avx512); 274 274 return IS_REACHABLE(CONFIG_CRYPTO_HASH) ? crypto_register_shash(&alg) : 0; 275 275 }
+5 -4
arch/x86/crypto/twofish_glue_3way.c
··· 5 5 * Copyright (c) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> 6 6 */ 7 7 8 + #include <asm/cpu_device_id.h> 8 9 #include <crypto/algapi.h> 9 10 #include <crypto/twofish.h> 10 11 #include <linux/crypto.h> ··· 108 107 if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) 109 108 return false; 110 109 111 - if (boot_cpu_data.x86 == 0x06 && 112 - (boot_cpu_data.x86_model == 0x1c || 113 - boot_cpu_data.x86_model == 0x26 || 114 - boot_cpu_data.x86_model == 0x36)) { 110 + switch (boot_cpu_data.x86_vfm) { 111 + case INTEL_ATOM_BONNELL: 112 + case INTEL_ATOM_BONNELL_MID: 113 + case INTEL_ATOM_SALTWELL: 115 114 /* 116 115 * On Atom, twofish-3way is slower than original assembler 117 116 * implementation. Twofish-3way trades off some performance in
-18
crypto/Kconfig
··· 313 313 One of the Russian cryptographic standard algorithms (called GOST 314 314 algorithms). Only signature verification is implemented. 315 315 316 - config CRYPTO_SM2 317 - tristate "SM2 (ShangMi 2)" 318 - select CRYPTO_SM3 319 - select CRYPTO_AKCIPHER 320 - select CRYPTO_MANAGER 321 - select MPILIB 322 - select ASN1 323 - help 324 - SM2 (ShangMi 2) public key algorithm 325 - 326 - Published by State Encryption Management Bureau, China, 327 - as specified by OSCCA GM/T 0003.1-2012 -- 0003.5-2012. 328 - 329 - References: 330 - https://datatracker.ietf.org/doc/draft-shen-sm2-ecdsa/ 331 - http://www.oscca.gov.cn/sca/xxgk/2010-12/17/content_1002386.shtml 332 - http://www.gmbz.org.cn/main/bzlb.html 333 - 334 316 config CRYPTO_CURVE25519 335 317 tristate "Curve25519" 336 318 select CRYPTO_KPP
-8
crypto/Makefile
··· 50 50 rsa_generic-y += rsa-pkcs1pad.o 51 51 obj-$(CONFIG_CRYPTO_RSA) += rsa_generic.o 52 52 53 - $(obj)/sm2signature.asn1.o: $(obj)/sm2signature.asn1.c $(obj)/sm2signature.asn1.h 54 - $(obj)/sm2.o: $(obj)/sm2signature.asn1.h 55 - 56 - sm2_generic-y += sm2signature.asn1.o 57 - sm2_generic-y += sm2.o 58 - 59 - obj-$(CONFIG_CRYPTO_SM2) += sm2_generic.o 60 - 61 53 $(obj)/ecdsasignature.asn1.o: $(obj)/ecdsasignature.asn1.c $(obj)/ecdsasignature.asn1.h 62 54 $(obj)/ecdsa.o: $(obj)/ecdsasignature.asn1.h 63 55 ecdsa_generic-y += ecdsa.o
+1
crypto/af_alg.c
··· 1317 1317 1318 1318 module_init(af_alg_init); 1319 1319 module_exit(af_alg_exit); 1320 + MODULE_DESCRIPTION("Crypto userspace interface"); 1320 1321 MODULE_LICENSE("GPL"); 1321 1322 MODULE_ALIAS_NETPROTO(AF_ALG);
+3
crypto/algapi.c
··· 1056 1056 1057 1057 static void __init crypto_start_tests(void) 1058 1058 { 1059 + if (!IS_BUILTIN(CONFIG_CRYPTO_ALGAPI)) 1060 + return; 1061 + 1059 1062 if (IS_ENABLED(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS)) 1060 1063 return; 1061 1064
+1
crypto/algif_hash.c
··· 471 471 472 472 module_init(algif_hash_init); 473 473 module_exit(algif_hash_exit); 474 + MODULE_DESCRIPTION("Userspace interface for hash algorithms"); 474 475 MODULE_LICENSE("GPL");
+1
crypto/algif_skcipher.c
··· 437 437 438 438 module_init(algif_skcipher_init); 439 439 module_exit(algif_skcipher_exit); 440 + MODULE_DESCRIPTION("Userspace interface for skcipher algorithms"); 440 441 MODULE_LICENSE("GPL");
+2 -2
crypto/api.c
··· 31 31 BLOCKING_NOTIFIER_HEAD(crypto_chain); 32 32 EXPORT_SYMBOL_GPL(crypto_chain); 33 33 34 - #ifndef CONFIG_CRYPTO_MANAGER_DISABLE_TESTS 34 + #if IS_BUILTIN(CONFIG_CRYPTO_ALGAPI) && \ 35 + !IS_ENABLED(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS) 35 36 DEFINE_STATIC_KEY_FALSE(__crypto_boot_test_finished); 36 - EXPORT_SYMBOL_GPL(__crypto_boot_test_finished); 37 37 #endif 38 38 39 39 static struct crypto_alg *crypto_larval_wait(struct crypto_alg *alg);
-4
crypto/asymmetric_keys/pkcs7_parser.c
··· 292 292 ctx->sinfo->sig->pkey_algo = "ecdsa"; 293 293 ctx->sinfo->sig->encoding = "x962"; 294 294 break; 295 - case OID_SM2_with_SM3: 296 - ctx->sinfo->sig->pkey_algo = "sm2"; 297 - ctx->sinfo->sig->encoding = "raw"; 298 - break; 299 295 case OID_gost2012PKey256: 300 296 case OID_gost2012PKey512: 301 297 ctx->sinfo->sig->pkey_algo = "ecrdsa";
-7
crypto/asymmetric_keys/public_key.c
··· 124 124 strcmp(hash_algo, "sha3-384") != 0 && 125 125 strcmp(hash_algo, "sha3-512") != 0) 126 126 return -EINVAL; 127 - } else if (strcmp(pkey->pkey_algo, "sm2") == 0) { 128 - if (strcmp(encoding, "raw") != 0) 129 - return -EINVAL; 130 - if (!hash_algo) 131 - return -EINVAL; 132 - if (strcmp(hash_algo, "sm3") != 0) 133 - return -EINVAL; 134 127 } else if (strcmp(pkey->pkey_algo, "ecrdsa") == 0) { 135 128 if (strcmp(encoding, "raw") != 0) 136 129 return -EINVAL;
-16
crypto/asymmetric_keys/x509_cert_parser.c
··· 257 257 case OID_gost2012Signature512: 258 258 ctx->cert->sig->hash_algo = "streebog512"; 259 259 goto ecrdsa; 260 - 261 - case OID_SM2_with_SM3: 262 - ctx->cert->sig->hash_algo = "sm3"; 263 - goto sm2; 264 260 } 265 261 266 262 rsa_pkcs1: ··· 266 270 return 0; 267 271 ecrdsa: 268 272 ctx->cert->sig->pkey_algo = "ecrdsa"; 269 - ctx->cert->sig->encoding = "raw"; 270 - ctx->sig_algo = ctx->last_oid; 271 - return 0; 272 - sm2: 273 - ctx->cert->sig->pkey_algo = "sm2"; 274 273 ctx->cert->sig->encoding = "raw"; 275 274 ctx->sig_algo = ctx->last_oid; 276 275 return 0; ··· 300 309 301 310 if (strcmp(ctx->cert->sig->pkey_algo, "rsa") == 0 || 302 311 strcmp(ctx->cert->sig->pkey_algo, "ecrdsa") == 0 || 303 - strcmp(ctx->cert->sig->pkey_algo, "sm2") == 0 || 304 312 strcmp(ctx->cert->sig->pkey_algo, "ecdsa") == 0) { 305 313 /* Discard the BIT STRING metadata */ 306 314 if (vlen < 1 || *(const u8 *)value != 0) ··· 504 514 case OID_gost2012PKey512: 505 515 ctx->cert->pub->pkey_algo = "ecrdsa"; 506 516 break; 507 - case OID_sm2: 508 - ctx->cert->pub->pkey_algo = "sm2"; 509 - break; 510 517 case OID_id_ecPublicKey: 511 518 if (parse_OID(ctx->params, ctx->params_size, &oid) != 0) 512 519 return -EBADMSG; 513 520 514 521 switch (oid) { 515 - case OID_sm2: 516 - ctx->cert->pub->pkey_algo = "sm2"; 517 - break; 518 522 case OID_id_prime192v1: 519 523 ctx->cert->pub->pkey_algo = "ecdsa-nist-p192"; 520 524 break;
+2 -15
crypto/asymmetric_keys/x509_public_key.c
··· 7 7 8 8 #define pr_fmt(fmt) "X.509: "fmt 9 9 #include <crypto/hash.h> 10 - #include <crypto/sm2.h> 11 10 #include <keys/asymmetric-parser.h> 12 11 #include <keys/asymmetric-subtype.h> 13 12 #include <keys/system_keyring.h> ··· 63 64 64 65 desc->tfm = tfm; 65 66 66 - if (strcmp(cert->pub->pkey_algo, "sm2") == 0) { 67 - ret = strcmp(sig->hash_algo, "sm3") != 0 ? -EINVAL : 68 - crypto_shash_init(desc) ?: 69 - sm2_compute_z_digest(desc, cert->pub->key, 70 - cert->pub->keylen, sig->digest) ?: 71 - crypto_shash_init(desc) ?: 72 - crypto_shash_update(desc, sig->digest, 73 - sig->digest_size) ?: 74 - crypto_shash_finup(desc, cert->tbs, cert->tbs_size, 75 - sig->digest); 76 - } else { 77 - ret = crypto_shash_digest(desc, cert->tbs, cert->tbs_size, 78 - sig->digest); 79 - } 67 + ret = crypto_shash_digest(desc, cert->tbs, cert->tbs_size, 68 + sig->digest); 80 69 81 70 if (ret < 0) 82 71 goto error_2;
+1
crypto/cast_common.c
··· 282 282 }; 283 283 EXPORT_SYMBOL_GPL(cast_s4); 284 284 285 + MODULE_DESCRIPTION("Common lookup tables for CAST-128 (cast5) and CAST-256 (cast6)"); 285 286 MODULE_LICENSE("GPL");
+1
crypto/curve25519-generic.c
··· 87 87 88 88 MODULE_ALIAS_CRYPTO("curve25519"); 89 89 MODULE_ALIAS_CRYPTO("curve25519-generic"); 90 + MODULE_DESCRIPTION("Curve25519 elliptic curve (RFC7748)"); 90 91 MODULE_LICENSE("GPL");
+1
crypto/deflate.c
··· 311 311 MODULE_DESCRIPTION("Deflate Compression Algorithm for IPCOMP"); 312 312 MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>"); 313 313 MODULE_ALIAS_CRYPTO("deflate"); 314 + MODULE_ALIAS_CRYPTO("deflate-generic");
+2 -1
crypto/ecc.c
··· 78 78 /* diff > 0: not enough input bytes: set most significant digits to 0 */ 79 79 if (diff > 0) { 80 80 ndigits -= diff; 81 - memset(&out[ndigits - 1], 0, diff * sizeof(u64)); 81 + memset(&out[ndigits], 0, diff * sizeof(u64)); 82 82 } 83 83 84 84 if (o) { ··· 1715 1715 } 1716 1716 EXPORT_SYMBOL(crypto_ecdh_shared_secret); 1717 1717 1718 + MODULE_DESCRIPTION("core elliptic curve module"); 1718 1719 MODULE_LICENSE("Dual BSD/GPL");
+8 -26
crypto/ecdsa.c
··· 38 38 size_t bufsize = ndigits * sizeof(u64); 39 39 ssize_t diff = vlen - bufsize; 40 40 const char *d = value; 41 - u8 rs[ECC_MAX_BYTES]; 42 41 43 42 if (!value || !vlen) 44 43 return -EINVAL; ··· 45 46 /* diff = 0: 'value' has exacly the right size 46 47 * diff > 0: 'value' has too many bytes; one leading zero is allowed that 47 48 * makes the value a positive integer; error on more 48 - * diff < 0: 'value' is missing leading zeros, which we add 49 + * diff < 0: 'value' is missing leading zeros 49 50 */ 50 51 if (diff > 0) { 51 52 /* skip over leading zeros that make 'value' a positive int */ ··· 60 61 if (-diff >= bufsize) 61 62 return -EINVAL; 62 63 63 - if (diff) { 64 - /* leading zeros not given in 'value' */ 65 - memset(rs, 0, -diff); 66 - } 67 - 68 - memcpy(&rs[-diff], d, vlen); 69 - 70 - ecc_swap_digits((u64 *)rs, dest, ndigits); 64 + ecc_digits_from_bytes(d, vlen, dest, ndigits); 71 65 72 66 return 0; 73 67 } ··· 134 142 struct ecdsa_signature_ctx sig_ctx = { 135 143 .curve = ctx->curve, 136 144 }; 137 - u8 rawhash[ECC_MAX_BYTES]; 138 145 u64 hash[ECC_MAX_DIGITS]; 139 146 unsigned char *buffer; 140 - ssize_t diff; 141 147 int ret; 142 148 143 149 if (unlikely(!ctx->pub_key_set)) ··· 154 164 if (ret < 0) 155 165 goto error; 156 166 157 - /* if the hash is shorter then we will add leading zeros to fit to ndigits */ 158 - diff = bufsize - req->dst_len; 159 - if (diff >= 0) { 160 - if (diff) 161 - memset(rawhash, 0, diff); 162 - memcpy(&rawhash[diff], buffer + req->src_len, req->dst_len); 163 - } else if (diff < 0) { 164 - /* given hash is longer, we take the left-most bytes */ 165 - memcpy(&rawhash, buffer + req->src_len, bufsize); 166 - } 167 + if (bufsize > req->dst_len) 168 + bufsize = req->dst_len; 167 169 168 - ecc_swap_digits((u64 *)rawhash, hash, ctx->curve->g.ndigits); 170 + ecc_digits_from_bytes(buffer + req->src_len, bufsize, 171 + hash, ctx->curve->g.ndigits); 169 172 170 173 ret = _ecdsa_verify(ctx, hash, sig_ctx.r, sig_ctx.s); 171 174 ··· 
198 215 } 199 216 200 217 /* 201 - * Set the public key given the raw uncompressed key data from an X509 202 - * certificate. The key data contain the concatenated X and Y coordinates of 203 - * the public key. 218 + * Set the public ECC key as defined by RFC5480 section 2.2 "Subject Public 219 + * Key". Only the uncompressed format is supported. 204 220 */ 205 221 static int ecdsa_set_pub_key(struct crypto_akcipher *tfm, const void *key, unsigned int keylen) 206 222 {
+5 -2
crypto/internal.h
··· 66 66 67 67 int alg_test(const char *driver, const char *alg, u32 type, u32 mask); 68 68 69 - #ifdef CONFIG_CRYPTO_MANAGER_DISABLE_TESTS 69 + #if !IS_BUILTIN(CONFIG_CRYPTO_ALGAPI) || \ 70 + IS_ENABLED(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS) 70 71 static inline bool crypto_boot_test_finished(void) 71 72 { 72 73 return true; ··· 85 84 { 86 85 static_branch_enable(&__crypto_boot_test_finished); 87 86 } 88 - #endif /* !CONFIG_CRYPTO_MANAGER_DISABLE_TESTS */ 87 + #endif /* !IS_BUILTIN(CONFIG_CRYPTO_ALGAPI) || 88 + * IS_ENABLED(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS) 89 + */ 89 90 90 91 #ifdef CONFIG_PROC_FS 91 92 void __init crypto_init_proc(void);
+1
crypto/simd.c
··· 523 523 } 524 524 EXPORT_SYMBOL_GPL(simd_unregister_aeads); 525 525 526 + MODULE_DESCRIPTION("Shared crypto SIMD helpers"); 526 527 MODULE_LICENSE("GPL");
-498
crypto/sm2.c
··· 1 - // SPDX-License-Identifier: GPL-2.0-or-later 2 - /* 3 - * SM2 asymmetric public-key algorithm 4 - * as specified by OSCCA GM/T 0003.1-2012 -- 0003.5-2012 SM2 and 5 - * described at https://tools.ietf.org/html/draft-shen-sm2-ecdsa-02 6 - * 7 - * Copyright (c) 2020, Alibaba Group. 8 - * Authors: Tianjia Zhang <tianjia.zhang@linux.alibaba.com> 9 - */ 10 - 11 - #include <linux/module.h> 12 - #include <linux/mpi.h> 13 - #include <crypto/internal/akcipher.h> 14 - #include <crypto/akcipher.h> 15 - #include <crypto/hash.h> 16 - #include <crypto/rng.h> 17 - #include <crypto/sm2.h> 18 - #include "sm2signature.asn1.h" 19 - 20 - /* The default user id as specified in GM/T 0009-2012 */ 21 - #define SM2_DEFAULT_USERID "1234567812345678" 22 - #define SM2_DEFAULT_USERID_LEN 16 23 - 24 - #define MPI_NBYTES(m) ((mpi_get_nbits(m) + 7) / 8) 25 - 26 - struct ecc_domain_parms { 27 - const char *desc; /* Description of the curve. */ 28 - unsigned int nbits; /* Number of bits. */ 29 - unsigned int fips:1; /* True if this is a FIPS140-2 approved curve */ 30 - 31 - /* The model describing this curve. This is mainly used to select 32 - * the group equation. 33 - */ 34 - enum gcry_mpi_ec_models model; 35 - 36 - /* The actual ECC dialect used. This is used for curve specific 37 - * optimizations and to select encodings etc. 38 - */ 39 - enum ecc_dialects dialect; 40 - 41 - const char *p; /* The prime defining the field. */ 42 - const char *a, *b; /* The coefficients. For Twisted Edwards 43 - * Curves b is used for d. For Montgomery 44 - * Curves (a,b) has ((A-2)/4,B^-1). 45 - */ 46 - const char *n; /* The order of the base point. */ 47 - const char *g_x, *g_y; /* Base point. */ 48 - unsigned int h; /* Cofactor. 
*/ 49 - }; 50 - 51 - static const struct ecc_domain_parms sm2_ecp = { 52 - .desc = "sm2p256v1", 53 - .nbits = 256, 54 - .fips = 0, 55 - .model = MPI_EC_WEIERSTRASS, 56 - .dialect = ECC_DIALECT_STANDARD, 57 - .p = "0xfffffffeffffffffffffffffffffffffffffffff00000000ffffffffffffffff", 58 - .a = "0xfffffffeffffffffffffffffffffffffffffffff00000000fffffffffffffffc", 59 - .b = "0x28e9fa9e9d9f5e344d5a9e4bcf6509a7f39789f515ab8f92ddbcbd414d940e93", 60 - .n = "0xfffffffeffffffffffffffffffffffff7203df6b21c6052b53bbf40939d54123", 61 - .g_x = "0x32c4ae2c1f1981195f9904466a39c9948fe30bbff2660be1715a4589334c74c7", 62 - .g_y = "0xbc3736a2f4f6779c59bdcee36b692153d0a9877cc62a474002df32e52139f0a0", 63 - .h = 1 64 - }; 65 - 66 - static int __sm2_set_pub_key(struct mpi_ec_ctx *ec, 67 - const void *key, unsigned int keylen); 68 - 69 - static int sm2_ec_ctx_init(struct mpi_ec_ctx *ec) 70 - { 71 - const struct ecc_domain_parms *ecp = &sm2_ecp; 72 - MPI p, a, b; 73 - MPI x, y; 74 - int rc = -EINVAL; 75 - 76 - p = mpi_scanval(ecp->p); 77 - a = mpi_scanval(ecp->a); 78 - b = mpi_scanval(ecp->b); 79 - if (!p || !a || !b) 80 - goto free_p; 81 - 82 - x = mpi_scanval(ecp->g_x); 83 - y = mpi_scanval(ecp->g_y); 84 - if (!x || !y) 85 - goto free; 86 - 87 - rc = -ENOMEM; 88 - 89 - ec->Q = mpi_point_new(0); 90 - if (!ec->Q) 91 - goto free; 92 - 93 - /* mpi_ec_setup_elliptic_curve */ 94 - ec->G = mpi_point_new(0); 95 - if (!ec->G) { 96 - mpi_point_release(ec->Q); 97 - goto free; 98 - } 99 - 100 - mpi_set(ec->G->x, x); 101 - mpi_set(ec->G->y, y); 102 - mpi_set_ui(ec->G->z, 1); 103 - 104 - rc = -EINVAL; 105 - ec->n = mpi_scanval(ecp->n); 106 - if (!ec->n) { 107 - mpi_point_release(ec->Q); 108 - mpi_point_release(ec->G); 109 - goto free; 110 - } 111 - 112 - ec->h = ecp->h; 113 - ec->name = ecp->desc; 114 - mpi_ec_init(ec, ecp->model, ecp->dialect, 0, p, a, b); 115 - 116 - rc = 0; 117 - 118 - free: 119 - mpi_free(x); 120 - mpi_free(y); 121 - free_p: 122 - mpi_free(p); 123 - mpi_free(a); 124 - mpi_free(b); 
125 - 126 - return rc; 127 - } 128 - 129 - static void sm2_ec_ctx_deinit(struct mpi_ec_ctx *ec) 130 - { 131 - mpi_ec_deinit(ec); 132 - 133 - memset(ec, 0, sizeof(*ec)); 134 - } 135 - 136 - /* RESULT must have been initialized and is set on success to the 137 - * point given by VALUE. 138 - */ 139 - static int sm2_ecc_os2ec(MPI_POINT result, MPI value) 140 - { 141 - int rc; 142 - size_t n; 143 - unsigned char *buf; 144 - MPI x, y; 145 - 146 - n = MPI_NBYTES(value); 147 - buf = kmalloc(n, GFP_KERNEL); 148 - if (!buf) 149 - return -ENOMEM; 150 - 151 - rc = mpi_print(GCRYMPI_FMT_USG, buf, n, &n, value); 152 - if (rc) 153 - goto err_freebuf; 154 - 155 - rc = -EINVAL; 156 - if (n < 1 || ((n - 1) % 2)) 157 - goto err_freebuf; 158 - /* No support for point compression */ 159 - if (*buf != 0x4) 160 - goto err_freebuf; 161 - 162 - rc = -ENOMEM; 163 - n = (n - 1) / 2; 164 - x = mpi_read_raw_data(buf + 1, n); 165 - if (!x) 166 - goto err_freebuf; 167 - y = mpi_read_raw_data(buf + 1 + n, n); 168 - if (!y) 169 - goto err_freex; 170 - 171 - mpi_normalize(x); 172 - mpi_normalize(y); 173 - mpi_set(result->x, x); 174 - mpi_set(result->y, y); 175 - mpi_set_ui(result->z, 1); 176 - 177 - rc = 0; 178 - 179 - mpi_free(y); 180 - err_freex: 181 - mpi_free(x); 182 - err_freebuf: 183 - kfree(buf); 184 - return rc; 185 - } 186 - 187 - struct sm2_signature_ctx { 188 - MPI sig_r; 189 - MPI sig_s; 190 - }; 191 - 192 - int sm2_get_signature_r(void *context, size_t hdrlen, unsigned char tag, 193 - const void *value, size_t vlen) 194 - { 195 - struct sm2_signature_ctx *sig = context; 196 - 197 - if (!value || !vlen) 198 - return -EINVAL; 199 - 200 - sig->sig_r = mpi_read_raw_data(value, vlen); 201 - if (!sig->sig_r) 202 - return -ENOMEM; 203 - 204 - return 0; 205 - } 206 - 207 - int sm2_get_signature_s(void *context, size_t hdrlen, unsigned char tag, 208 - const void *value, size_t vlen) 209 - { 210 - struct sm2_signature_ctx *sig = context; 211 - 212 - if (!value || !vlen) 213 - return -EINVAL; 
214 - 215 - sig->sig_s = mpi_read_raw_data(value, vlen); 216 - if (!sig->sig_s) 217 - return -ENOMEM; 218 - 219 - return 0; 220 - } 221 - 222 - static int sm2_z_digest_update(struct shash_desc *desc, 223 - MPI m, unsigned int pbytes) 224 - { 225 - static const unsigned char zero[32]; 226 - unsigned char *in; 227 - unsigned int inlen; 228 - int err; 229 - 230 - in = mpi_get_buffer(m, &inlen, NULL); 231 - if (!in) 232 - return -EINVAL; 233 - 234 - if (inlen < pbytes) { 235 - /* padding with zero */ 236 - err = crypto_shash_update(desc, zero, pbytes - inlen) ?: 237 - crypto_shash_update(desc, in, inlen); 238 - } else if (inlen > pbytes) { 239 - /* skip the starting zero */ 240 - err = crypto_shash_update(desc, in + inlen - pbytes, pbytes); 241 - } else { 242 - err = crypto_shash_update(desc, in, inlen); 243 - } 244 - 245 - kfree(in); 246 - return err; 247 - } 248 - 249 - static int sm2_z_digest_update_point(struct shash_desc *desc, 250 - MPI_POINT point, struct mpi_ec_ctx *ec, 251 - unsigned int pbytes) 252 - { 253 - MPI x, y; 254 - int ret = -EINVAL; 255 - 256 - x = mpi_new(0); 257 - y = mpi_new(0); 258 - 259 - ret = mpi_ec_get_affine(x, y, point, ec) ? 
-EINVAL : 260 - sm2_z_digest_update(desc, x, pbytes) ?: 261 - sm2_z_digest_update(desc, y, pbytes); 262 - 263 - mpi_free(x); 264 - mpi_free(y); 265 - return ret; 266 - } 267 - 268 - int sm2_compute_z_digest(struct shash_desc *desc, 269 - const void *key, unsigned int keylen, void *dgst) 270 - { 271 - struct mpi_ec_ctx *ec; 272 - unsigned int bits_len; 273 - unsigned int pbytes; 274 - u8 entl[2]; 275 - int err; 276 - 277 - ec = kmalloc(sizeof(*ec), GFP_KERNEL); 278 - if (!ec) 279 - return -ENOMEM; 280 - 281 - err = sm2_ec_ctx_init(ec); 282 - if (err) 283 - goto out_free_ec; 284 - 285 - err = __sm2_set_pub_key(ec, key, keylen); 286 - if (err) 287 - goto out_deinit_ec; 288 - 289 - bits_len = SM2_DEFAULT_USERID_LEN * 8; 290 - entl[0] = bits_len >> 8; 291 - entl[1] = bits_len & 0xff; 292 - 293 - pbytes = MPI_NBYTES(ec->p); 294 - 295 - /* ZA = H256(ENTLA | IDA | a | b | xG | yG | xA | yA) */ 296 - err = crypto_shash_init(desc); 297 - if (err) 298 - goto out_deinit_ec; 299 - 300 - err = crypto_shash_update(desc, entl, 2); 301 - if (err) 302 - goto out_deinit_ec; 303 - 304 - err = crypto_shash_update(desc, SM2_DEFAULT_USERID, 305 - SM2_DEFAULT_USERID_LEN); 306 - if (err) 307 - goto out_deinit_ec; 308 - 309 - err = sm2_z_digest_update(desc, ec->a, pbytes) ?: 310 - sm2_z_digest_update(desc, ec->b, pbytes) ?: 311 - sm2_z_digest_update_point(desc, ec->G, ec, pbytes) ?: 312 - sm2_z_digest_update_point(desc, ec->Q, ec, pbytes); 313 - if (err) 314 - goto out_deinit_ec; 315 - 316 - err = crypto_shash_final(desc, dgst); 317 - 318 - out_deinit_ec: 319 - sm2_ec_ctx_deinit(ec); 320 - out_free_ec: 321 - kfree(ec); 322 - return err; 323 - } 324 - EXPORT_SYMBOL_GPL(sm2_compute_z_digest); 325 - 326 - static int _sm2_verify(struct mpi_ec_ctx *ec, MPI hash, MPI sig_r, MPI sig_s) 327 - { 328 - int rc = -EINVAL; 329 - struct gcry_mpi_point sG, tP; 330 - MPI t = NULL; 331 - MPI x1 = NULL, y1 = NULL; 332 - 333 - mpi_point_init(&sG); 334 - mpi_point_init(&tP); 335 - x1 = mpi_new(0); 336 - y1 = 
mpi_new(0); 337 - t = mpi_new(0); 338 - 339 - /* r, s in [1, n-1] */ 340 - if (mpi_cmp_ui(sig_r, 1) < 0 || mpi_cmp(sig_r, ec->n) > 0 || 341 - mpi_cmp_ui(sig_s, 1) < 0 || mpi_cmp(sig_s, ec->n) > 0) { 342 - goto leave; 343 - } 344 - 345 - /* t = (r + s) % n, t == 0 */ 346 - mpi_addm(t, sig_r, sig_s, ec->n); 347 - if (mpi_cmp_ui(t, 0) == 0) 348 - goto leave; 349 - 350 - /* sG + tP = (x1, y1) */ 351 - rc = -EBADMSG; 352 - mpi_ec_mul_point(&sG, sig_s, ec->G, ec); 353 - mpi_ec_mul_point(&tP, t, ec->Q, ec); 354 - mpi_ec_add_points(&sG, &sG, &tP, ec); 355 - if (mpi_ec_get_affine(x1, y1, &sG, ec)) 356 - goto leave; 357 - 358 - /* R = (e + x1) % n */ 359 - mpi_addm(t, hash, x1, ec->n); 360 - 361 - /* check R == r */ 362 - rc = -EKEYREJECTED; 363 - if (mpi_cmp(t, sig_r)) 364 - goto leave; 365 - 366 - rc = 0; 367 - 368 - leave: 369 - mpi_point_free_parts(&sG); 370 - mpi_point_free_parts(&tP); 371 - mpi_free(x1); 372 - mpi_free(y1); 373 - mpi_free(t); 374 - 375 - return rc; 376 - } 377 - 378 - static int sm2_verify(struct akcipher_request *req) 379 - { 380 - struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req); 381 - struct mpi_ec_ctx *ec = akcipher_tfm_ctx(tfm); 382 - unsigned char *buffer; 383 - struct sm2_signature_ctx sig; 384 - MPI hash; 385 - int ret; 386 - 387 - if (unlikely(!ec->Q)) 388 - return -EINVAL; 389 - 390 - buffer = kmalloc(req->src_len + req->dst_len, GFP_KERNEL); 391 - if (!buffer) 392 - return -ENOMEM; 393 - 394 - sg_pcopy_to_buffer(req->src, 395 - sg_nents_for_len(req->src, req->src_len + req->dst_len), 396 - buffer, req->src_len + req->dst_len, 0); 397 - 398 - sig.sig_r = NULL; 399 - sig.sig_s = NULL; 400 - ret = asn1_ber_decoder(&sm2signature_decoder, &sig, 401 - buffer, req->src_len); 402 - if (ret) 403 - goto error; 404 - 405 - ret = -ENOMEM; 406 - hash = mpi_read_raw_data(buffer + req->src_len, req->dst_len); 407 - if (!hash) 408 - goto error; 409 - 410 - ret = _sm2_verify(ec, hash, sig.sig_r, sig.sig_s); 411 - 412 - mpi_free(hash); 413 - error: 
414 - mpi_free(sig.sig_r); 415 - mpi_free(sig.sig_s); 416 - kfree(buffer); 417 - return ret; 418 - } 419 - 420 - static int sm2_set_pub_key(struct crypto_akcipher *tfm, 421 - const void *key, unsigned int keylen) 422 - { 423 - struct mpi_ec_ctx *ec = akcipher_tfm_ctx(tfm); 424 - 425 - return __sm2_set_pub_key(ec, key, keylen); 426 - 427 - } 428 - 429 - static int __sm2_set_pub_key(struct mpi_ec_ctx *ec, 430 - const void *key, unsigned int keylen) 431 - { 432 - MPI a; 433 - int rc; 434 - 435 - /* include the uncompressed flag '0x04' */ 436 - a = mpi_read_raw_data(key, keylen); 437 - if (!a) 438 - return -ENOMEM; 439 - 440 - mpi_normalize(a); 441 - rc = sm2_ecc_os2ec(ec->Q, a); 442 - mpi_free(a); 443 - 444 - return rc; 445 - } 446 - 447 - static unsigned int sm2_max_size(struct crypto_akcipher *tfm) 448 - { 449 - /* Unlimited max size */ 450 - return PAGE_SIZE; 451 - } 452 - 453 - static int sm2_init_tfm(struct crypto_akcipher *tfm) 454 - { 455 - struct mpi_ec_ctx *ec = akcipher_tfm_ctx(tfm); 456 - 457 - return sm2_ec_ctx_init(ec); 458 - } 459 - 460 - static void sm2_exit_tfm(struct crypto_akcipher *tfm) 461 - { 462 - struct mpi_ec_ctx *ec = akcipher_tfm_ctx(tfm); 463 - 464 - sm2_ec_ctx_deinit(ec); 465 - } 466 - 467 - static struct akcipher_alg sm2 = { 468 - .verify = sm2_verify, 469 - .set_pub_key = sm2_set_pub_key, 470 - .max_size = sm2_max_size, 471 - .init = sm2_init_tfm, 472 - .exit = sm2_exit_tfm, 473 - .base = { 474 - .cra_name = "sm2", 475 - .cra_driver_name = "sm2-generic", 476 - .cra_priority = 100, 477 - .cra_module = THIS_MODULE, 478 - .cra_ctxsize = sizeof(struct mpi_ec_ctx), 479 - }, 480 - }; 481 - 482 - static int __init sm2_init(void) 483 - { 484 - return crypto_register_akcipher(&sm2); 485 - } 486 - 487 - static void __exit sm2_exit(void) 488 - { 489 - crypto_unregister_akcipher(&sm2); 490 - } 491 - 492 - subsys_initcall(sm2_init); 493 - module_exit(sm2_exit); 494 - 495 - MODULE_LICENSE("GPL"); 496 - MODULE_AUTHOR("Tianjia Zhang 
<tianjia.zhang@linux.alibaba.com>"); 497 - MODULE_DESCRIPTION("SM2 generic algorithm"); 498 - MODULE_ALIAS_CRYPTO("sm2-generic");
-4
crypto/sm2signature.asn1
··· 1 - Sm2Signature ::= SEQUENCE { 2 - sig_r INTEGER ({ sm2_get_signature_r }), 3 - sig_s INTEGER ({ sm2_get_signature_s }) 4 - }
+9
crypto/tcrypt.c
··· 2613 2613 break; 2614 2614 2615 2615 case 600: 2616 + if (alg) { 2617 + u8 speed_template[2] = {klen, 0}; 2618 + test_mb_skcipher_speed(alg, ENCRYPT, sec, NULL, 0, 2619 + speed_template, num_mb); 2620 + test_mb_skcipher_speed(alg, DECRYPT, sec, NULL, 0, 2621 + speed_template, num_mb); 2622 + break; 2623 + } 2624 + 2616 2625 test_mb_skcipher_speed("ecb(aes)", ENCRYPT, sec, NULL, 0, 2617 2626 speed_template_16_24_32, num_mb); 2618 2627 test_mb_skcipher_speed("ecb(aes)", DECRYPT, sec, NULL, 0,
+36 -15
crypto/testmgr.c
··· 293 293 * the @key_offset 294 294 * @finalization_type: what finalization function to use for hashes 295 295 * @nosimd: execute with SIMD disabled? Requires !CRYPTO_TFM_REQ_MAY_SLEEP. 296 + * This applies to the parts of the operation that aren't controlled 297 + * individually by @nosimd_setkey or @src_divs[].nosimd. 298 + * @nosimd_setkey: set the key (if applicable) with SIMD disabled? Requires 299 + * !CRYPTO_TFM_REQ_MAY_SLEEP. 296 300 */ 297 301 struct testvec_config { 298 302 const char *name; ··· 310 306 bool key_offset_relative_to_alignmask; 311 307 enum finalization_type finalization_type; 312 308 bool nosimd; 309 + bool nosimd_setkey; 313 310 }; 314 311 315 312 #define TESTVEC_CONFIG_NAMELEN 192 ··· 538 533 cfg->finalization_type == FINALIZATION_TYPE_DIGEST) 539 534 return false; 540 535 541 - if ((cfg->nosimd || (flags & SGDIVS_HAVE_NOSIMD)) && 536 + if ((cfg->nosimd || cfg->nosimd_setkey || 537 + (flags & SGDIVS_HAVE_NOSIMD)) && 542 538 (cfg->req_flags & CRYPTO_TFM_REQ_MAY_SLEEP)) 543 539 return false; 544 540 ··· 847 841 return 0; 848 842 } 849 843 850 - /* Like setkey_f(tfm, key, ksize), but sometimes misalign the key */ 844 + /* 845 + * Like setkey_f(tfm, key, ksize), but sometimes misalign the key. 846 + * In addition, run the setkey function in no-SIMD context if requested. 
847 + */ 851 848 #define do_setkey(setkey_f, tfm, key, ksize, cfg, alignmask) \ 852 849 ({ \ 853 850 const u8 *keybuf, *keyptr; \ ··· 859 850 err = prepare_keybuf((key), (ksize), (cfg), (alignmask), \ 860 851 &keybuf, &keyptr); \ 861 852 if (err == 0) { \ 853 + if ((cfg)->nosimd_setkey) \ 854 + crypto_disable_simd_for_test(); \ 862 855 err = setkey_f((tfm), keyptr, (ksize)); \ 856 + if ((cfg)->nosimd_setkey) \ 857 + crypto_reenable_simd_for_test(); \ 863 858 kfree(keybuf); \ 864 859 } \ 865 860 err; \ ··· 916 903 917 904 switch (prandom_u32_below(rng, 4)) { 918 905 case 0: 919 - return len % 64; 906 + len %= 64; 907 + break; 920 908 case 1: 921 - return len % 256; 909 + len %= 256; 910 + break; 922 911 case 2: 923 - return len % 1024; 912 + len %= 1024; 913 + break; 924 914 default: 925 - return len; 915 + break; 926 916 } 917 + if (len && prandom_u32_below(rng, 4) == 0) 918 + len = rounddown_pow_of_two(len); 919 + return len; 927 920 } 928 921 929 922 /* Flip a random bit in the given nonempty data buffer */ ··· 1025 1006 1026 1007 if (div == &divs[max_divs - 1] || prandom_bool(rng)) 1027 1008 this_len = remaining; 1009 + else if (prandom_u32_below(rng, 4) == 0) 1010 + this_len = (remaining + 1) / 2; 1028 1011 else 1029 1012 this_len = prandom_u32_inclusive(rng, 1, remaining); 1030 1013 div->proportion_of_total = this_len; ··· 1139 1118 break; 1140 1119 } 1141 1120 1142 - if (!(cfg->req_flags & CRYPTO_TFM_REQ_MAY_SLEEP) && prandom_bool(rng)) { 1143 - cfg->nosimd = true; 1144 - p += scnprintf(p, end - p, " nosimd"); 1121 + if (!(cfg->req_flags & CRYPTO_TFM_REQ_MAY_SLEEP)) { 1122 + if (prandom_bool(rng)) { 1123 + cfg->nosimd = true; 1124 + p += scnprintf(p, end - p, " nosimd"); 1125 + } 1126 + if (prandom_bool(rng)) { 1127 + cfg->nosimd_setkey = true; 1128 + p += scnprintf(p, end - p, " nosimd_setkey"); 1129 + } 1145 1130 } 1146 1131 1147 1132 p += scnprintf(p, end - p, " src_divs=["); ··· 5615 5588 .fips_allowed = 1, 5616 5589 .suite = { 5617 5590 .hash = 
__VECS(sha512_tv_template) 5618 - } 5619 - }, { 5620 - .alg = "sm2", 5621 - .test = alg_test_akcipher, 5622 - .suite = { 5623 - .akcipher = __VECS(sm2_tv_template) 5624 5591 } 5625 5592 }, { 5626 5593 .alg = "sm3",
-59
crypto/testmgr.h
··· 5774 5774 }, 5775 5775 }; 5776 5776 5777 - /* 5778 - * SM2 test vectors. 5779 - */ 5780 - static const struct akcipher_testvec sm2_tv_template[] = { 5781 - { /* Generated from openssl */ 5782 - .key = 5783 - "\x04" 5784 - "\x8e\xa0\x33\x69\x91\x7e\x3d\xec\xad\x8e\xf0\x45\x5e\x13\x3e\x68" 5785 - "\x5b\x8c\xab\x5c\xc6\xc8\x50\xdf\x91\x00\xe0\x24\x73\x4d\x31\xf2" 5786 - "\x2e\xc0\xd5\x6b\xee\xda\x98\x93\xec\xd8\x36\xaa\xb9\xcf\x63\x82" 5787 - "\xef\xa7\x1a\x03\xed\x16\xba\x74\xb8\x8b\xf9\xe5\x70\x39\xa4\x70", 5788 - .key_len = 65, 5789 - .param_len = 0, 5790 - .c = 5791 - "\x30\x45" 5792 - "\x02\x20" 5793 - "\x70\xab\xb6\x7d\xd6\x54\x80\x64\x42\x7e\x2d\x05\x08\x36\xc9\x96" 5794 - "\x25\xc2\xbb\xff\x08\xe5\x43\x15\x5e\xf3\x06\xd9\x2b\x2f\x0a\x9f" 5795 - "\x02\x21" 5796 - "\x00" 5797 - "\xbf\x21\x5f\x7e\x5d\x3f\x1a\x4d\x8f\x84\xc2\xe9\xa6\x4c\xa4\x18" 5798 - "\xb2\xb8\x46\xf4\x32\x96\xfa\x57\xc6\x29\xd4\x89\xae\xcc\xda\xdb", 5799 - .c_size = 71, 5800 - .algo = OID_SM2_with_SM3, 5801 - .m = 5802 - "\x47\xa7\xbf\xd3\xda\xc4\x79\xee\xda\x8b\x4f\xe8\x40\x94\xd4\x32" 5803 - "\x8f\xf1\xcd\x68\x4d\xbd\x9b\x1d\xe0\xd8\x9a\x5d\xad\x85\x47\x5c", 5804 - .m_size = 32, 5805 - .public_key_vec = true, 5806 - .siggen_sigver_test = true, 5807 - }, 5808 - { /* From libgcrypt */ 5809 - .key = 5810 - "\x04" 5811 - "\x87\x59\x38\x9a\x34\xaa\xad\x07\xec\xf4\xe0\xc8\xc2\x65\x0a\x44" 5812 - "\x59\xc8\xd9\x26\xee\x23\x78\x32\x4e\x02\x61\xc5\x25\x38\xcb\x47" 5813 - "\x75\x28\x10\x6b\x1e\x0b\x7c\x8d\xd5\xff\x29\xa9\xc8\x6a\x89\x06" 5814 - "\x56\x56\xeb\x33\x15\x4b\xc0\x55\x60\x91\xef\x8a\xc9\xd1\x7d\x78", 5815 - .key_len = 65, 5816 - .param_len = 0, 5817 - .c = 5818 - "\x30\x44" 5819 - "\x02\x20" 5820 - "\xd9\xec\xef\xe8\x5f\xee\x3c\x59\x57\x8e\x5b\xab\xb3\x02\xe1\x42" 5821 - "\x4b\x67\x2c\x0b\x26\xb6\x51\x2c\x3e\xfc\xc6\x49\xec\xfe\x89\xe5" 5822 - "\x02\x20" 5823 - "\x43\x45\xd0\xa5\xff\xe5\x13\x27\x26\xd0\xec\x37\xad\x24\x1e\x9a" 5824 - 
"\x71\x9a\xa4\x89\xb0\x7e\x0f\xc4\xbb\x2d\x50\xd0\xe5\x7f\x7a\x68", 5825 - .c_size = 70, 5826 - .algo = OID_SM2_with_SM3, 5827 - .m = 5828 - "\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa\xbb\xcc\xdd\xee\xff\x00" 5829 - "\x12\x34\x56\x78\x9a\xbc\xde\xf0\x12\x34\x56\x78\x9a\xbc\xde\xf0", 5830 - .m_size = 32, 5831 - .public_key_vec = true, 5832 - .siggen_sigver_test = true, 5833 - }, 5834 - }; 5835 - 5836 5777 /* Example vectors below taken from 5837 5778 * http://www.oscca.gov.cn/UpFile/20101222141857786.pdf 5838 5779 *
+1
crypto/xor.c
··· 165 165 166 166 static __exit void xor_exit(void) { } 167 167 168 + MODULE_DESCRIPTION("RAID-5 checksumming functions"); 168 169 MODULE_LICENSE("GPL"); 169 170 170 171 #ifndef MODULE
-1
drivers/char/hw_random/Kconfig
··· 555 555 config HW_RANDOM_CN10K 556 556 tristate "Marvell CN10K Random Number Generator support" 557 557 depends on HW_RANDOM && PCI && (ARM64 || (64BIT && COMPILE_TEST)) 558 - default HW_RANDOM 559 558 help 560 559 This driver provides support for the True Random Number 561 560 generator available in Marvell CN10K SoCs.
+3 -1
drivers/char/hw_random/amd-rng.c
··· 143 143 144 144 found: 145 145 err = pci_read_config_dword(pdev, 0x58, &pmbase); 146 - if (err) 146 + if (err) { 147 + err = pcibios_err_to_errno(err); 147 148 goto put_dev; 149 + } 148 150 149 151 pmbase &= 0x0000FF00; 150 152 if (pmbase == 0) {
+1
drivers/char/hw_random/arm_smccc_trng.c
··· 118 118 119 119 MODULE_ALIAS("platform:smccc_trng"); 120 120 MODULE_AUTHOR("Andre Przywara"); 121 + MODULE_DESCRIPTION("Arm SMCCC TRNG firmware interface support"); 121 122 MODULE_LICENSE("GPL");
+1
drivers/char/hw_random/cavium-rng-vf.c
··· 266 266 module_pci_driver(cavium_rng_vf_driver); 267 267 268 268 MODULE_AUTHOR("Omer Khaliq <okhaliq@caviumnetworks.com>"); 269 + MODULE_DESCRIPTION("Cavium ThunderX Random Number Generator VF support"); 269 270 MODULE_LICENSE("GPL v2");
+1
drivers/char/hw_random/cavium-rng.c
··· 88 88 89 89 module_pci_driver(cavium_rng_pf_driver); 90 90 MODULE_AUTHOR("Omer Khaliq <okhaliq@caviumnetworks.com>"); 91 + MODULE_DESCRIPTION("Cavium ThunderX Random Number Generator support"); 91 92 MODULE_LICENSE("GPL v2");
+5 -13
drivers/char/hw_random/core.c
··· 161 161 reinit_completion(&rng->cleanup_done); 162 162 163 163 skip_init: 164 - rng->quality = min_t(u16, min_t(u16, default_quality, 1024), rng->quality ?: 1024); 165 164 current_quality = rng->quality; /* obsolete */ 166 165 167 166 return 0; ··· 469 470 470 471 ATTRIBUTE_GROUPS(rng_dev); 471 472 472 - static void __exit unregister_miscdev(void) 473 - { 474 - misc_deregister(&rng_miscdev); 475 - } 476 - 477 - static int __init register_miscdev(void) 478 - { 479 - return misc_register(&rng_miscdev); 480 - } 481 - 482 473 static int hwrng_fillfn(void *unused) 483 474 { 484 475 size_t entropy, entropy_credit = 0; /* in 1/1024 of a bit */ ··· 533 544 init_completion(&rng->cleanup_done); 534 545 complete(&rng->cleanup_done); 535 546 init_completion(&rng->dying); 547 + 548 + /* Adjust quality field to always have a proper value */ 549 + rng->quality = min_t(u16, min_t(u16, default_quality, 1024), rng->quality ?: 1024); 536 550 537 551 if (!current_rng || 538 552 (!cur_rng_set_by_user && rng->quality > current_rng->quality)) { ··· 660 668 return -ENOMEM; 661 669 } 662 670 663 - ret = register_miscdev(); 671 + ret = misc_register(&rng_miscdev); 664 672 if (ret) { 665 673 kfree(rng_fillbuf); 666 674 kfree(rng_buffer); ··· 677 685 kfree(rng_fillbuf); 678 686 mutex_unlock(&rng_mutex); 679 687 680 - unregister_miscdev(); 688 + misc_deregister(&rng_miscdev); 681 689 } 682 690 683 691 fs_initcall(hwrng_modinit); /* depends on misc_register() */
+166 -51
drivers/char/hw_random/exynos-trng.c
··· 10 10 * Krzysztof Kozłowski <krzk@kernel.org> 11 11 */ 12 12 13 + #include <linux/arm-smccc.h> 13 14 #include <linux/clk.h> 14 15 #include <linux/crypto.h> 15 16 #include <linux/delay.h> ··· 23 22 #include <linux/mod_devicetable.h> 24 23 #include <linux/platform_device.h> 25 24 #include <linux/pm_runtime.h> 25 + #include <linux/property.h> 26 26 27 - #define EXYNOS_TRNG_CLKDIV (0x0) 27 + #define EXYNOS_TRNG_CLKDIV 0x0 28 28 29 - #define EXYNOS_TRNG_CTRL (0x20) 30 - #define EXYNOS_TRNG_CTRL_RNGEN BIT(31) 29 + #define EXYNOS_TRNG_CTRL 0x20 30 + #define EXYNOS_TRNG_CTRL_RNGEN BIT(31) 31 31 32 - #define EXYNOS_TRNG_POST_CTRL (0x30) 33 - #define EXYNOS_TRNG_ONLINE_CTRL (0x40) 34 - #define EXYNOS_TRNG_ONLINE_STAT (0x44) 35 - #define EXYNOS_TRNG_ONLINE_MAXCHI2 (0x48) 36 - #define EXYNOS_TRNG_FIFO_CTRL (0x50) 37 - #define EXYNOS_TRNG_FIFO_0 (0x80) 38 - #define EXYNOS_TRNG_FIFO_1 (0x84) 39 - #define EXYNOS_TRNG_FIFO_2 (0x88) 40 - #define EXYNOS_TRNG_FIFO_3 (0x8c) 41 - #define EXYNOS_TRNG_FIFO_4 (0x90) 42 - #define EXYNOS_TRNG_FIFO_5 (0x94) 43 - #define EXYNOS_TRNG_FIFO_6 (0x98) 44 - #define EXYNOS_TRNG_FIFO_7 (0x9c) 45 - #define EXYNOS_TRNG_FIFO_LEN (8) 46 - #define EXYNOS_TRNG_CLOCK_RATE (500000) 32 + #define EXYNOS_TRNG_POST_CTRL 0x30 33 + #define EXYNOS_TRNG_ONLINE_CTRL 0x40 34 + #define EXYNOS_TRNG_ONLINE_STAT 0x44 35 + #define EXYNOS_TRNG_ONLINE_MAXCHI2 0x48 36 + #define EXYNOS_TRNG_FIFO_CTRL 0x50 37 + #define EXYNOS_TRNG_FIFO_0 0x80 38 + #define EXYNOS_TRNG_FIFO_1 0x84 39 + #define EXYNOS_TRNG_FIFO_2 0x88 40 + #define EXYNOS_TRNG_FIFO_3 0x8c 41 + #define EXYNOS_TRNG_FIFO_4 0x90 42 + #define EXYNOS_TRNG_FIFO_5 0x94 43 + #define EXYNOS_TRNG_FIFO_6 0x98 44 + #define EXYNOS_TRNG_FIFO_7 0x9c 45 + #define EXYNOS_TRNG_FIFO_LEN 8 46 + #define EXYNOS_TRNG_CLOCK_RATE 500000 47 47 48 + /* Driver feature flags */ 49 + #define EXYNOS_SMC BIT(0) 50 + 51 + #define EXYNOS_SMC_CALL_VAL(func_num) \ 52 + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ 53 + ARM_SMCCC_SMC_32, \ 54 + 
ARM_SMCCC_OWNER_SIP, \ 55 + func_num) 56 + 57 + /* SMC command for DTRNG access */ 58 + #define SMC_CMD_RANDOM EXYNOS_SMC_CALL_VAL(0x1012) 59 + 60 + /* SMC_CMD_RANDOM: arguments */ 61 + #define HWRNG_INIT 0x0 62 + #define HWRNG_EXIT 0x1 63 + #define HWRNG_GET_DATA 0x2 64 + #define HWRNG_RESUME 0x3 65 + 66 + /* SMC_CMD_RANDOM: return values */ 67 + #define HWRNG_RET_OK 0x0 68 + #define HWRNG_RET_RETRY_ERROR 0x2 69 + 70 + #define HWRNG_MAX_TRIES 100 48 71 49 72 struct exynos_trng_dev { 50 - struct device *dev; 51 - void __iomem *mem; 52 - struct clk *clk; 53 - struct hwrng rng; 73 + struct device *dev; 74 + void __iomem *mem; 75 + struct clk *clk; /* operating clock */ 76 + struct clk *pclk; /* bus clock */ 77 + struct hwrng rng; 78 + unsigned long flags; 54 79 }; 55 80 56 - static int exynos_trng_do_read(struct hwrng *rng, void *data, size_t max, 57 - bool wait) 81 + static int exynos_trng_do_read_reg(struct hwrng *rng, void *data, size_t max, 82 + bool wait) 58 83 { 59 - struct exynos_trng_dev *trng; 84 + struct exynos_trng_dev *trng = (struct exynos_trng_dev *)rng->priv; 60 85 int val; 61 86 62 87 max = min_t(size_t, max, (EXYNOS_TRNG_FIFO_LEN * 4)); 63 - 64 - trng = (struct exynos_trng_dev *)rng->priv; 65 - 66 88 writel_relaxed(max * 8, trng->mem + EXYNOS_TRNG_FIFO_CTRL); 67 89 val = readl_poll_timeout(trng->mem + EXYNOS_TRNG_FIFO_CTRL, val, 68 90 val == 0, 200, 1000000); ··· 97 73 return max; 98 74 } 99 75 100 - static int exynos_trng_init(struct hwrng *rng) 76 + static int exynos_trng_do_read_smc(struct hwrng *rng, void *data, size_t max, 77 + bool wait) 78 + { 79 + struct arm_smccc_res res; 80 + unsigned int copied = 0; 81 + u32 *buf = data; 82 + int tries = 0; 83 + 84 + while (copied < max) { 85 + arm_smccc_smc(SMC_CMD_RANDOM, HWRNG_GET_DATA, 0, 0, 0, 0, 0, 0, 86 + &res); 87 + switch (res.a0) { 88 + case HWRNG_RET_OK: 89 + *buf++ = res.a2; 90 + *buf++ = res.a3; 91 + copied += 8; 92 + tries = 0; 93 + break; 94 + case HWRNG_RET_RETRY_ERROR: 95 + if (!wait) 96 + 
return copied; 97 + if (++tries >= HWRNG_MAX_TRIES) 98 + return copied; 99 + cond_resched(); 100 + break; 101 + default: 102 + return -EIO; 103 + } 104 + } 105 + 106 + return copied; 107 + } 108 + 109 + static int exynos_trng_init_reg(struct hwrng *rng) 101 110 { 102 111 struct exynos_trng_dev *trng = (struct exynos_trng_dev *)rng->priv; 103 112 unsigned long sss_rate; ··· 144 87 */ 145 88 val = sss_rate / (EXYNOS_TRNG_CLOCK_RATE * 2); 146 89 if (val > 0x7fff) { 147 - dev_err(trng->dev, "clock divider too large: %d", val); 90 + dev_err(trng->dev, "clock divider too large: %d\n", val); 148 91 return -ERANGE; 149 92 } 150 93 val = val << 1; ··· 163 106 return 0; 164 107 } 165 108 109 + static int exynos_trng_init_smc(struct hwrng *rng) 110 + { 111 + struct exynos_trng_dev *trng = (struct exynos_trng_dev *)rng->priv; 112 + struct arm_smccc_res res; 113 + int ret = 0; 114 + 115 + arm_smccc_smc(SMC_CMD_RANDOM, HWRNG_INIT, 0, 0, 0, 0, 0, 0, &res); 116 + if (res.a0 != HWRNG_RET_OK) { 117 + dev_err(trng->dev, "SMC command for TRNG init failed (%d)\n", 118 + (int)res.a0); 119 + ret = -EIO; 120 + } 121 + if ((int)res.a0 == -1) 122 + dev_info(trng->dev, "Make sure LDFW is loaded by your BL\n"); 123 + 124 + return ret; 125 + } 126 + 166 127 static int exynos_trng_probe(struct platform_device *pdev) 167 128 { 168 129 struct exynos_trng_dev *trng; ··· 190 115 if (!trng) 191 116 return ret; 192 117 118 + platform_set_drvdata(pdev, trng); 119 + trng->dev = &pdev->dev; 120 + 121 + trng->flags = (unsigned long)device_get_match_data(&pdev->dev); 122 + 193 123 trng->rng.name = devm_kstrdup(&pdev->dev, dev_name(&pdev->dev), 194 124 GFP_KERNEL); 195 125 if (!trng->rng.name) 196 126 return ret; 197 127 198 - trng->rng.init = exynos_trng_init; 199 - trng->rng.read = exynos_trng_do_read; 200 - trng->rng.priv = (unsigned long) trng; 128 + trng->rng.priv = (unsigned long)trng; 201 129 202 - platform_set_drvdata(pdev, trng); 203 - trng->dev = &pdev->dev; 130 + if (trng->flags & EXYNOS_SMC) { 
131 + trng->rng.init = exynos_trng_init_smc; 132 + trng->rng.read = exynos_trng_do_read_smc; 133 + } else { 134 + trng->rng.init = exynos_trng_init_reg; 135 + trng->rng.read = exynos_trng_do_read_reg; 204 136 205 - trng->mem = devm_platform_ioremap_resource(pdev, 0); 206 - if (IS_ERR(trng->mem)) 207 - return PTR_ERR(trng->mem); 137 + trng->mem = devm_platform_ioremap_resource(pdev, 0); 138 + if (IS_ERR(trng->mem)) 139 + return PTR_ERR(trng->mem); 140 + } 208 141 209 142 pm_runtime_enable(&pdev->dev); 210 143 ret = pm_runtime_resume_and_get(&pdev->dev); ··· 221 138 goto err_pm_get; 222 139 } 223 140 224 - trng->clk = devm_clk_get(&pdev->dev, "secss"); 141 + trng->clk = devm_clk_get_enabled(&pdev->dev, "secss"); 225 142 if (IS_ERR(trng->clk)) { 226 - ret = PTR_ERR(trng->clk); 227 - dev_err(&pdev->dev, "Could not get clock.\n"); 143 + ret = dev_err_probe(&pdev->dev, PTR_ERR(trng->clk), 144 + "Could not get clock\n"); 228 145 goto err_clock; 229 146 } 230 147 231 - ret = clk_prepare_enable(trng->clk); 232 - if (ret) { 233 - dev_err(&pdev->dev, "Could not enable the clk.\n"); 148 + trng->pclk = devm_clk_get_optional_enabled(&pdev->dev, "pclk"); 149 + if (IS_ERR(trng->pclk)) { 150 + ret = dev_err_probe(&pdev->dev, PTR_ERR(trng->pclk), 151 + "Could not get pclk\n"); 234 152 goto err_clock; 235 153 } 236 154 237 155 ret = devm_hwrng_register(&pdev->dev, &trng->rng); 238 156 if (ret) { 239 157 dev_err(&pdev->dev, "Could not register hwrng device.\n"); 240 - goto err_register; 158 + goto err_clock; 241 159 } 242 160 243 161 dev_info(&pdev->dev, "Exynos True Random Number Generator.\n"); 244 162 245 163 return 0; 246 - 247 - err_register: 248 - clk_disable_unprepare(trng->clk); 249 164 250 165 err_clock: 251 166 pm_runtime_put_noidle(&pdev->dev); ··· 256 175 257 176 static void exynos_trng_remove(struct platform_device *pdev) 258 177 { 259 - struct exynos_trng_dev *trng = platform_get_drvdata(pdev); 178 + struct exynos_trng_dev *trng = platform_get_drvdata(pdev); 260 179 261 
- clk_disable_unprepare(trng->clk); 180 + if (trng->flags & EXYNOS_SMC) { 181 + struct arm_smccc_res res; 182 + 183 + arm_smccc_smc(SMC_CMD_RANDOM, HWRNG_EXIT, 0, 0, 0, 0, 0, 0, 184 + &res); 185 + } 262 186 263 187 pm_runtime_put_sync(&pdev->dev); 264 188 pm_runtime_disable(&pdev->dev); ··· 271 185 272 186 static int exynos_trng_suspend(struct device *dev) 273 187 { 188 + struct exynos_trng_dev *trng = dev_get_drvdata(dev); 189 + struct arm_smccc_res res; 190 + 191 + if (trng->flags & EXYNOS_SMC) { 192 + arm_smccc_smc(SMC_CMD_RANDOM, HWRNG_EXIT, 0, 0, 0, 0, 0, 0, 193 + &res); 194 + if (res.a0 != HWRNG_RET_OK) 195 + return -EIO; 196 + } 197 + 274 198 pm_runtime_put_sync(dev); 275 199 276 200 return 0; ··· 288 192 289 193 static int exynos_trng_resume(struct device *dev) 290 194 { 195 + struct exynos_trng_dev *trng = dev_get_drvdata(dev); 291 196 int ret; 292 197 293 198 ret = pm_runtime_resume_and_get(dev); ··· 297 200 return ret; 298 201 } 299 202 203 + if (trng->flags & EXYNOS_SMC) { 204 + struct arm_smccc_res res; 205 + 206 + arm_smccc_smc(SMC_CMD_RANDOM, HWRNG_RESUME, 0, 0, 0, 0, 0, 0, 207 + &res); 208 + if (res.a0 != HWRNG_RET_OK) 209 + return -EIO; 210 + 211 + arm_smccc_smc(SMC_CMD_RANDOM, HWRNG_INIT, 0, 0, 0, 0, 0, 0, 212 + &res); 213 + if (res.a0 != HWRNG_RET_OK) 214 + return -EIO; 215 + } 216 + 300 217 return 0; 301 218 } 302 219 303 220 static DEFINE_SIMPLE_DEV_PM_OPS(exynos_trng_pm_ops, exynos_trng_suspend, 304 - exynos_trng_resume); 221 + exynos_trng_resume); 305 222 306 223 static const struct of_device_id exynos_trng_dt_match[] = { 307 224 { 308 225 .compatible = "samsung,exynos5250-trng", 226 + }, { 227 + .compatible = "samsung,exynos850-trng", 228 + .data = (void *)EXYNOS_SMC, 309 229 }, 310 230 { }, 311 231 }; ··· 339 225 }; 340 226 341 227 module_platform_driver(exynos_trng_driver); 228 + 342 229 MODULE_AUTHOR("Łukasz Stelmach"); 343 230 MODULE_DESCRIPTION("H/W TRNG driver for Exynos chips"); 344 231 MODULE_LICENSE("GPL v2");
+1
drivers/char/hw_random/omap-rng.c
··· 564 564 module_platform_driver(omap_rng_driver); 565 565 MODULE_ALIAS("platform:omap_rng"); 566 566 MODULE_AUTHOR("Deepak Saxena (and others)"); 567 + MODULE_DESCRIPTION("RNG driver for TI OMAP CPU family"); 567 568 MODULE_LICENSE("GPL");
+1
drivers/char/hw_random/omap3-rom-rng.c
··· 178 178 MODULE_ALIAS("platform:omap3-rom-rng"); 179 179 MODULE_AUTHOR("Juha Yrjola"); 180 180 MODULE_AUTHOR("Pali Rohár <pali@kernel.org>"); 181 + MODULE_DESCRIPTION("RNG driver for TI OMAP3 CPU family"); 181 182 MODULE_LICENSE("GPL");
+16 -19
drivers/char/hw_random/stm32-rng.c
··· 70 70 71 71 struct stm32_rng_private { 72 72 struct hwrng rng; 73 + struct device *dev; 73 74 void __iomem *base; 74 75 struct clk *clk; 75 76 struct reset_control *rst; ··· 100 99 */ 101 100 static int stm32_rng_conceal_seed_error_cond_reset(struct stm32_rng_private *priv) 102 101 { 103 - struct device *dev = (struct device *)priv->rng.priv; 102 + struct device *dev = priv->dev; 104 103 u32 sr = readl_relaxed(priv->base + RNG_SR); 105 104 u32 cr = readl_relaxed(priv->base + RNG_CR); 106 105 int err; ··· 172 171 { 173 172 struct stm32_rng_private *priv = container_of(rng, struct stm32_rng_private, rng); 174 173 175 - dev_dbg((struct device *)priv->rng.priv, "Concealing seed error\n"); 174 + dev_dbg(priv->dev, "Concealing seed error\n"); 176 175 177 176 if (priv->data->has_cond_reset) 178 177 return stm32_rng_conceal_seed_error_cond_reset(priv); ··· 188 187 int retval = 0, err = 0; 189 188 u32 sr; 190 189 191 - pm_runtime_get_sync((struct device *) priv->rng.priv); 190 + retval = pm_runtime_resume_and_get(priv->dev); 191 + if (retval) 192 + return retval; 192 193 193 194 if (readl_relaxed(priv->base + RNG_SR) & RNG_SR_SEIS) 194 195 stm32_rng_conceal_seed_error(rng); ··· 207 204 sr, sr, 208 205 10, 50000); 209 206 if (err) { 210 - dev_err((struct device *)priv->rng.priv, 211 - "%s: timeout %x!\n", __func__, sr); 207 + dev_err(priv->dev, "%s: timeout %x!\n", __func__, sr); 212 208 break; 213 209 } 214 210 } else if (!sr) { ··· 220 218 err = stm32_rng_conceal_seed_error(rng); 221 219 i++; 222 220 if (err && i > RNG_NB_RECOVER_TRIES) { 223 - dev_err((struct device *)priv->rng.priv, 224 - "Couldn't recover from seed error\n"); 221 + dev_err(priv->dev, "Couldn't recover from seed error\n"); 225 222 retval = -ENOTRECOVERABLE; 226 223 goto exit_rpm; 227 224 } ··· 238 237 err = stm32_rng_conceal_seed_error(rng); 239 238 i++; 240 239 if (err && i > RNG_NB_RECOVER_TRIES) { 241 - dev_err((struct device *)priv->rng.priv, 242 - "Couldn't recover from seed error"); 240 + 
dev_err(priv->dev, "Couldn't recover from seed error"); 243 241 retval = -ENOTRECOVERABLE; 244 242 goto exit_rpm; 245 243 } ··· 253 253 } 254 254 255 255 exit_rpm: 256 - pm_runtime_mark_last_busy((struct device *) priv->rng.priv); 257 - pm_runtime_put_sync_autosuspend((struct device *) priv->rng.priv); 256 + pm_runtime_mark_last_busy(priv->dev); 257 + pm_runtime_put_sync_autosuspend(priv->dev); 258 258 259 259 return retval || !wait ? retval : -EIO; 260 260 } ··· 329 329 10, 50000); 330 330 if (err) { 331 331 clk_disable_unprepare(priv->clk); 332 - dev_err((struct device *)priv->rng.priv, 333 - "%s: timeout %x!\n", __func__, reg); 332 + dev_err(priv->dev, "%s: timeout %x!\n", __func__, reg); 334 333 return -EINVAL; 335 334 } 336 335 } else { ··· 357 358 10, 100000); 358 359 if (err || (reg & ~RNG_SR_DRDY)) { 359 360 clk_disable_unprepare(priv->clk); 360 - dev_err((struct device *)priv->rng.priv, 361 - "%s: timeout:%x SR: %x!\n", __func__, err, reg); 361 + dev_err(priv->dev, "%s: timeout:%x SR: %x!\n", __func__, err, reg); 362 362 return -EINVAL; 363 363 } 364 364 ··· 463 465 464 466 if (err) { 465 467 clk_disable_unprepare(priv->clk); 466 - dev_err((struct device *)priv->rng.priv, 467 - "%s: timeout:%x CR: %x!\n", __func__, err, reg); 468 + dev_err(priv->dev, "%s: timeout:%x CR: %x!\n", __func__, err, reg); 468 469 return -EINVAL; 469 470 } 470 471 } else { ··· 517 520 struct stm32_rng_private *priv; 518 521 struct resource *res; 519 522 520 - priv = devm_kzalloc(dev, sizeof(struct stm32_rng_private), GFP_KERNEL); 523 + priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); 521 524 if (!priv) 522 525 return -ENOMEM; 523 526 ··· 538 541 539 542 priv->ced = of_property_read_bool(np, "clock-error-detect"); 540 543 priv->lock_conf = of_property_read_bool(np, "st,rng-lock-conf"); 544 + priv->dev = dev; 541 545 542 546 priv->data = of_device_get_match_data(dev); 543 547 if (!priv->data) ··· 549 551 priv->rng.name = dev_driver_string(dev); 550 552 priv->rng.init = 
stm32_rng_init; 551 553 priv->rng.read = stm32_rng_read; 552 - priv->rng.priv = (unsigned long) dev; 553 554 priv->rng.quality = 900; 554 555 555 556 pm_runtime_set_autosuspend_delay(dev, 100);
+4 -4
drivers/crypto/allwinner/sun8i-ce/sun8i-ce-cipher.c
··· 190 190 err = -EFAULT; 191 191 goto theend; 192 192 } 193 - cet->t_key = cpu_to_le32(rctx->addr_key); 193 + cet->t_key = desc_addr_val_le32(ce, rctx->addr_key); 194 194 195 195 ivsize = crypto_skcipher_ivsize(tfm); 196 196 if (areq->iv && crypto_skcipher_ivsize(tfm) > 0) { ··· 208 208 err = -ENOMEM; 209 209 goto theend_iv; 210 210 } 211 - cet->t_iv = cpu_to_le32(rctx->addr_iv); 211 + cet->t_iv = desc_addr_val_le32(ce, rctx->addr_iv); 212 212 } 213 213 214 214 if (areq->src == areq->dst) { ··· 236 236 237 237 len = areq->cryptlen; 238 238 for_each_sg(areq->src, sg, nr_sgs, i) { 239 - cet->t_src[i].addr = cpu_to_le32(sg_dma_address(sg)); 239 + cet->t_src[i].addr = desc_addr_val_le32(ce, sg_dma_address(sg)); 240 240 todo = min(len, sg_dma_len(sg)); 241 241 cet->t_src[i].len = cpu_to_le32(todo / 4); 242 242 dev_dbg(ce->dev, "%s total=%u SG(%d %u off=%d) todo=%u\n", __func__, ··· 251 251 252 252 len = areq->cryptlen; 253 253 for_each_sg(areq->dst, sg, nr_sgd, i) { 254 - cet->t_dst[i].addr = cpu_to_le32(sg_dma_address(sg)); 254 + cet->t_dst[i].addr = desc_addr_val_le32(ce, sg_dma_address(sg)); 255 255 todo = min(len, sg_dma_len(sg)); 256 256 cet->t_dst[i].len = cpu_to_le32(todo / 4); 257 257 dev_dbg(ce->dev, "%s total=%u SG(%d %u off=%d) todo=%u\n", __func__,
+27 -1
drivers/crypto/allwinner/sun8i-ce/sun8i-ce-core.c
··· 92 92 .trng = CE_ALG_TRNG_V2, 93 93 }; 94 94 95 + static const struct ce_variant ce_h616_variant = { 96 + .alg_cipher = { CE_ALG_AES, CE_ALG_DES, CE_ALG_3DES, 97 + }, 98 + .alg_hash = { CE_ALG_MD5, CE_ALG_SHA1, CE_ALG_SHA224, CE_ALG_SHA256, 99 + CE_ALG_SHA384, CE_ALG_SHA512 100 + }, 101 + .op_mode = { CE_OP_ECB, CE_OP_CBC 102 + }, 103 + .cipher_t_dlen_in_bytes = true, 104 + .hash_t_dlen_in_bits = true, 105 + .prng_t_dlen_in_bytes = true, 106 + .trng_t_dlen_in_bytes = true, 107 + .needs_word_addresses = true, 108 + .ce_clks = { 109 + { "bus", 0, 200000000 }, 110 + { "mod", 300000000, 0 }, 111 + { "ram", 0, 400000000 }, 112 + { "trng", 0, 0 }, 113 + }, 114 + .esr = ESR_H6, 115 + .prng = CE_ALG_PRNG_V2, 116 + .trng = CE_ALG_TRNG_V2, 117 + }; 118 + 95 119 static const struct ce_variant ce_a64_variant = { 96 120 .alg_cipher = { CE_ALG_AES, CE_ALG_DES, CE_ALG_3DES, 97 121 }, ··· 196 172 writel(v, ce->base + CE_ICR); 197 173 198 174 reinit_completion(&ce->chanlist[flow].complete); 199 - writel(ce->chanlist[flow].t_phy, ce->base + CE_TDQ); 175 + writel(desc_addr_val(ce, ce->chanlist[flow].t_phy), ce->base + CE_TDQ); 200 176 201 177 ce->chanlist[flow].status = 0; 202 178 /* Be sure all data is written before enabling the task */ ··· 1121 1097 .data = &ce_h5_variant }, 1122 1098 { .compatible = "allwinner,sun50i-h6-crypto", 1123 1099 .data = &ce_h6_variant }, 1100 + { .compatible = "allwinner,sun50i-h616-crypto", 1101 + .data = &ce_h616_variant }, 1124 1102 {} 1125 1103 }; 1126 1104 MODULE_DEVICE_TABLE(of, sun8i_ce_crypto_of_match_table);
+3 -3
drivers/crypto/allwinner/sun8i-ce/sun8i-ce-hash.c
··· 403 403 404 404 len = areq->nbytes; 405 405 for_each_sg(areq->src, sg, nr_sgs, i) { 406 - cet->t_src[i].addr = cpu_to_le32(sg_dma_address(sg)); 406 + cet->t_src[i].addr = desc_addr_val_le32(ce, sg_dma_address(sg)); 407 407 todo = min(len, sg_dma_len(sg)); 408 408 cet->t_src[i].len = cpu_to_le32(todo / 4); 409 409 len -= todo; ··· 414 414 goto theend; 415 415 } 416 416 addr_res = dma_map_single(ce->dev, result, digestsize, DMA_FROM_DEVICE); 417 - cet->t_dst[0].addr = cpu_to_le32(addr_res); 417 + cet->t_dst[0].addr = desc_addr_val_le32(ce, addr_res); 418 418 cet->t_dst[0].len = cpu_to_le32(digestsize / 4); 419 419 if (dma_mapping_error(ce->dev, addr_res)) { 420 420 dev_err(ce->dev, "DMA map dest\n"); ··· 445 445 } 446 446 447 447 addr_pad = dma_map_single(ce->dev, buf, j * 4, DMA_TO_DEVICE); 448 - cet->t_src[i].addr = cpu_to_le32(addr_pad); 448 + cet->t_src[i].addr = desc_addr_val_le32(ce, addr_pad); 449 449 cet->t_src[i].len = cpu_to_le32(j); 450 450 if (dma_mapping_error(ce->dev, addr_pad)) { 451 451 dev_err(ce->dev, "DMA error on padding SG\n");
+3 -3
drivers/crypto/allwinner/sun8i-ce/sun8i-ce-prng.c
··· 132 132 cet->t_sym_ctl = cpu_to_le32(sym); 133 133 cet->t_asym_ctl = 0; 134 134 135 - cet->t_key = cpu_to_le32(dma_iv); 136 - cet->t_iv = cpu_to_le32(dma_iv); 135 + cet->t_key = desc_addr_val_le32(ce, dma_iv); 136 + cet->t_iv = desc_addr_val_le32(ce, dma_iv); 137 137 138 - cet->t_dst[0].addr = cpu_to_le32(dma_dst); 138 + cet->t_dst[0].addr = desc_addr_val_le32(ce, dma_dst); 139 139 cet->t_dst[0].len = cpu_to_le32(todo / 4); 140 140 ce->chanlist[flow].timeout = 2000; 141 141
+1 -1
drivers/crypto/allwinner/sun8i-ce/sun8i-ce-trng.c
··· 77 77 cet->t_sym_ctl = 0; 78 78 cet->t_asym_ctl = 0; 79 79 80 - cet->t_dst[0].addr = cpu_to_le32(dma_dst); 80 + cet->t_dst[0].addr = desc_addr_val_le32(ce, dma_dst); 81 81 cet->t_dst[0].len = cpu_to_le32(todo / 4); 82 82 ce->chanlist[flow].timeout = todo; 83 83
+15
drivers/crypto/allwinner/sun8i-ce/sun8i-ce.h
··· 149 149 bool hash_t_dlen_in_bits; 150 150 bool prng_t_dlen_in_bytes; 151 151 bool trng_t_dlen_in_bytes; 152 + bool needs_word_addresses; 152 153 struct ce_clock ce_clks[CE_MAX_CLOCKS]; 153 154 int esr; 154 155 unsigned char prng; ··· 241 240 #endif 242 241 #endif 243 242 }; 243 + 244 + static inline u32 desc_addr_val(struct sun8i_ce_dev *dev, dma_addr_t addr) 245 + { 246 + if (dev->variant->needs_word_addresses) 247 + return addr / 4; 248 + 249 + return addr; 250 + } 251 + 252 + static inline __le32 desc_addr_val_le32(struct sun8i_ce_dev *dev, 253 + dma_addr_t addr) 254 + { 255 + return cpu_to_le32(desc_addr_val(dev, addr)); 256 + } 244 257 245 258 /* 246 259 * struct sun8i_cipher_req_ctx - context for a skcipher request
+2 -1
drivers/crypto/atmel-sha204a.c
··· 106 106 107 107 if (cmd.data[0] == 0xff) { 108 108 dev_err(&client->dev, "failed, device not ready\n"); 109 - return -ret; 109 + return -EINVAL; 110 110 } 111 111 112 112 memcpy(otp, cmd.data+1, 4); ··· 232 232 module_exit(atmel_sha204a_exit); 233 233 234 234 MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); 235 + MODULE_DESCRIPTION("Microchip / Atmel SHA204A (I2C) driver"); 235 236 MODULE_LICENSE("GPL v2");
-7
drivers/crypto/axis/artpec6_crypto.c
··· 2811 2811 2812 2812 #ifdef CONFIG_DEBUG_FS 2813 2813 2814 - struct dbgfs_u32 { 2815 - char *name; 2816 - mode_t mode; 2817 - u32 *flag; 2818 - char *desc; 2819 - }; 2820 - 2821 2814 static struct dentry *dbgfs_root; 2822 2815 2823 2816 static void artpec6_crypto_init_debugfs(void)
+2 -1
drivers/crypto/ccp/Makefile
··· 12 12 sev-dev.o \ 13 13 tee-dev.o \ 14 14 platform-access.o \ 15 - dbc.o 15 + dbc.o \ 16 + hsti.o 16 17 17 18 obj-$(CONFIG_CRYPTO_DEV_CCP_CRYPTO) += ccp-crypto.o 18 19 ccp-crypto-objs := ccp-crypto-main.o \
+1 -1
drivers/crypto/ccp/dbc.c
··· 223 223 dbc_dev->dev = dev; 224 224 dbc_dev->psp = psp; 225 225 226 - if (PSP_CAPABILITY(psp, DBC_THRU_EXT)) { 226 + if (psp->capability.dbc_thru_ext) { 227 227 dbc_dev->use_ext = true; 228 228 dbc_dev->payload_size = &dbc_dev->mbox->ext_req.header.payload_size; 229 229 dbc_dev->result = &dbc_dev->mbox->ext_req.header.status;
+138
drivers/crypto/ccp/hsti.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + /* 3 + * AMD Secure Processor device driver, security attributes 4 + * 5 + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. 6 + * 7 + * Author: Mario Limonciello <mario.limonciello@amd.com> 8 + */ 9 + 10 + #include <linux/device.h> 11 + 12 + #include "psp-dev.h" 13 + #include "hsti.h" 14 + 15 + #define PSP_CAPABILITY_PSP_SECURITY_OFFSET 8 16 + 17 + struct hsti_request { 18 + struct psp_req_buffer_hdr header; 19 + u32 hsti; 20 + } __packed; 21 + 22 + #define security_attribute_show(name) \ 23 + static ssize_t name##_show(struct device *d, struct device_attribute *attr, \ 24 + char *buf) \ 25 + { \ 26 + struct sp_device *sp = dev_get_drvdata(d); \ 27 + struct psp_device *psp = sp->psp_data; \ 28 + return sysfs_emit(buf, "%d\n", psp->capability.name); \ 29 + } 30 + 31 + security_attribute_show(fused_part) 32 + static DEVICE_ATTR_RO(fused_part); 33 + security_attribute_show(debug_lock_on) 34 + static DEVICE_ATTR_RO(debug_lock_on); 35 + security_attribute_show(tsme_status) 36 + static DEVICE_ATTR_RO(tsme_status); 37 + security_attribute_show(anti_rollback_status) 38 + static DEVICE_ATTR_RO(anti_rollback_status); 39 + security_attribute_show(rpmc_production_enabled) 40 + static DEVICE_ATTR_RO(rpmc_production_enabled); 41 + security_attribute_show(rpmc_spirom_available) 42 + static DEVICE_ATTR_RO(rpmc_spirom_available); 43 + security_attribute_show(hsp_tpm_available) 44 + static DEVICE_ATTR_RO(hsp_tpm_available); 45 + security_attribute_show(rom_armor_enforced) 46 + static DEVICE_ATTR_RO(rom_armor_enforced); 47 + 48 + static struct attribute *psp_security_attrs[] = { 49 + &dev_attr_fused_part.attr, 50 + &dev_attr_debug_lock_on.attr, 51 + &dev_attr_tsme_status.attr, 52 + &dev_attr_anti_rollback_status.attr, 53 + &dev_attr_rpmc_production_enabled.attr, 54 + &dev_attr_rpmc_spirom_available.attr, 55 + &dev_attr_hsp_tpm_available.attr, 56 + &dev_attr_rom_armor_enforced.attr, 57 + NULL 58 + }; 59 + 60 + static umode_t 
psp_security_is_visible(struct kobject *kobj, struct attribute *attr, int idx) 61 + { 62 + struct device *dev = kobj_to_dev(kobj); 63 + struct sp_device *sp = dev_get_drvdata(dev); 64 + struct psp_device *psp = sp->psp_data; 65 + 66 + if (psp && psp->capability.security_reporting) 67 + return 0444; 68 + 69 + return 0; 70 + } 71 + 72 + struct attribute_group psp_security_attr_group = { 73 + .attrs = psp_security_attrs, 74 + .is_visible = psp_security_is_visible, 75 + }; 76 + 77 + static int psp_poulate_hsti(struct psp_device *psp) 78 + { 79 + struct hsti_request *req; 80 + int ret; 81 + 82 + /* Are the security attributes already reported? */ 83 + if (psp->capability.security_reporting) 84 + return 0; 85 + 86 + /* Allocate command-response buffer */ 87 + req = kzalloc(sizeof(*req), GFP_KERNEL | __GFP_ZERO); 88 + if (!req) 89 + return -ENOMEM; 90 + 91 + req->header.payload_size = sizeof(req); 92 + 93 + ret = psp_send_platform_access_msg(PSP_CMD_HSTI_QUERY, (struct psp_request *)req); 94 + if (ret) 95 + goto out; 96 + 97 + if (req->header.status != 0) { 98 + dev_dbg(psp->dev, "failed to populate HSTI state: %d\n", req->header.status); 99 + ret = -EINVAL; 100 + goto out; 101 + } 102 + 103 + psp->capability.security_reporting = 1; 104 + psp->capability.raw |= req->hsti << PSP_CAPABILITY_PSP_SECURITY_OFFSET; 105 + 106 + out: 107 + kfree(req); 108 + 109 + return ret; 110 + } 111 + 112 + int psp_init_hsti(struct psp_device *psp) 113 + { 114 + int ret; 115 + 116 + if (PSP_FEATURE(psp, HSTI)) { 117 + ret = psp_poulate_hsti(psp); 118 + if (ret) 119 + return ret; 120 + } 121 + 122 + /* 123 + * At this stage, if security information hasn't been populated by 124 + * either the PSP or by the driver through the platform command, 125 + * then there is nothing more to do. 
126 + */ 127 + if (!psp->capability.security_reporting) 128 + return 0; 129 + 130 + if (psp->capability.tsme_status) { 131 + if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) 132 + dev_notice(psp->dev, "psp: Both TSME and SME are active, SME is unnecessary when TSME is active.\n"); 133 + else 134 + dev_notice(psp->dev, "psp: TSME enabled\n"); 135 + } 136 + 137 + return 0; 138 + }
+17
drivers/crypto/ccp/hsti.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0-only */ 2 + /* 3 + * AMD Secure Processor device driver, security attributes 4 + * 5 + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. 6 + * 7 + * Author: Mario Limonciello <mario.limonciello@amd.com> 8 + */ 9 + 10 + #ifndef __HSTI_H 11 + #define __HSTI_H 12 + 13 + extern struct attribute_group psp_security_attr_group; 14 + 15 + int psp_init_hsti(struct psp_device *psp); 16 + 17 + #endif /* __HSTI_H */
+10 -13
drivers/crypto/ccp/psp-dev.c
··· 19 19 #include "tee-dev.h" 20 20 #include "platform-access.h" 21 21 #include "dbc.h" 22 + #include "hsti.h" 22 23 23 24 struct psp_device *psp_master; 24 25 ··· 155 154 dev_notice(psp->dev, "psp: unable to access the device: you might be running a broken BIOS.\n"); 156 155 return -ENODEV; 157 156 } 158 - psp->capability = val; 159 - 160 - /* Detect TSME and/or SME status */ 161 - if (PSP_CAPABILITY(psp, PSP_SECURITY_REPORTING) && 162 - psp->capability & (PSP_SECURITY_TSME_STATUS << PSP_CAPABILITY_PSP_SECURITY_OFFSET)) { 163 - if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) 164 - dev_notice(psp->dev, "psp: Both TSME and SME are active, SME is unnecessary when TSME is active.\n"); 165 - else 166 - dev_notice(psp->dev, "psp: TSME enabled\n"); 167 - } 157 + psp->capability.raw = val; 168 158 169 159 return 0; 170 160 } ··· 163 171 static int psp_check_sev_support(struct psp_device *psp) 164 172 { 165 173 /* Check if device supports SEV feature */ 166 - if (!PSP_CAPABILITY(psp, SEV)) { 174 + if (!psp->capability.sev) { 167 175 dev_dbg(psp->dev, "psp does not support SEV\n"); 168 176 return -ENODEV; 169 177 } ··· 174 182 static int psp_check_tee_support(struct psp_device *psp) 175 183 { 176 184 /* Check if device supports TEE feature */ 177 - if (!PSP_CAPABILITY(psp, TEE)) { 185 + if (!psp->capability.tee) { 178 186 dev_dbg(psp->dev, "psp does not support TEE\n"); 179 187 return -ENODEV; 180 188 } ··· 206 214 207 215 /* dbc must come after platform access as it tests the feature */ 208 216 if (PSP_FEATURE(psp, DBC) || 209 - PSP_CAPABILITY(psp, DBC_THRU_EXT)) { 217 + psp->capability.dbc_thru_ext) { 210 218 ret = dbc_dev_init(psp); 211 219 if (ret) 212 220 return ret; 213 221 } 222 + 223 + /* HSTI uses platform access on some systems. */ 224 + ret = psp_init_hsti(psp); 225 + if (ret) 226 + return ret; 214 227 215 228 return 0; 216 229 }
+24 -22
drivers/crypto/ccp/psp-dev.h
··· 26 26 27 27 typedef void (*psp_irq_handler_t)(int, void *, unsigned int); 28 28 29 + union psp_cap_register { 30 + unsigned int raw; 31 + struct { 32 + unsigned int sev :1, 33 + tee :1, 34 + dbc_thru_ext :1, 35 + rsvd1 :4, 36 + security_reporting :1, 37 + fused_part :1, 38 + rsvd2 :1, 39 + debug_lock_on :1, 40 + rsvd3 :2, 41 + tsme_status :1, 42 + rsvd4 :1, 43 + anti_rollback_status :1, 44 + rpmc_production_enabled :1, 45 + rpmc_spirom_available :1, 46 + hsp_tpm_available :1, 47 + rom_armor_enforced :1, 48 + rsvd5 :12; 49 + }; 50 + }; 51 + 29 52 struct psp_device { 30 53 struct list_head entry; 31 54 ··· 69 46 void *platform_access_data; 70 47 void *dbc_data; 71 48 72 - unsigned int capability; 49 + union psp_cap_register capability; 73 50 }; 74 51 75 52 void psp_set_sev_irq_handler(struct psp_device *psp, psp_irq_handler_t handler, ··· 77 54 void psp_clear_sev_irq_handler(struct psp_device *psp); 78 55 79 56 struct psp_device *psp_get_master_device(void); 80 - 81 - #define PSP_CAPABILITY_SEV BIT(0) 82 - #define PSP_CAPABILITY_TEE BIT(1) 83 - #define PSP_CAPABILITY_DBC_THRU_EXT BIT(2) 84 - #define PSP_CAPABILITY_PSP_SECURITY_REPORTING BIT(7) 85 - 86 - #define PSP_CAPABILITY_PSP_SECURITY_OFFSET 8 87 - /* 88 - * The PSP doesn't directly store these bits in the capability register 89 - * but instead copies them from the results of query command. 90 - * 91 - * The offsets from the query command are below, and shifted when used. 92 - */ 93 - #define PSP_SECURITY_FUSED_PART BIT(0) 94 - #define PSP_SECURITY_DEBUG_LOCK_ON BIT(2) 95 - #define PSP_SECURITY_TSME_STATUS BIT(5) 96 - #define PSP_SECURITY_ANTI_ROLLBACK_STATUS BIT(7) 97 - #define PSP_SECURITY_RPMC_PRODUCTION_ENABLED BIT(8) 98 - #define PSP_SECURITY_RPMC_SPIROM_AVAILABLE BIT(9) 99 - #define PSP_SECURITY_HSP_TPM_AVAILABLE BIT(10) 100 - #define PSP_SECURITY_ROM_ARMOR_ENFORCED BIT(11) 101 57 102 58 /** 103 59 * enum psp_cmd - PSP mailbox commands
+7 -1
drivers/crypto/ccp/sev-dev.c
··· 1642 1642 1643 1643 static int __sev_snp_shutdown_locked(int *error, bool panic) 1644 1644 { 1645 - struct sev_device *sev = psp_master->sev_data; 1645 + struct psp_device *psp = psp_master; 1646 + struct sev_device *sev; 1646 1647 struct sev_data_snp_shutdown_ex data; 1647 1648 int ret; 1649 + 1650 + if (!psp || !psp->sev_data) 1651 + return 0; 1652 + 1653 + sev = psp->sev_data; 1648 1654 1649 1655 if (!sev->snp_initialized) 1650 1656 return 0;
+1 -1
drivers/crypto/ccp/sp-dev.h
··· 29 29 #define CACHE_WB_NO_ALLOC 0xb7 30 30 31 31 #define PLATFORM_FEATURE_DBC 0x1 32 + #define PLATFORM_FEATURE_HSTI 0x2 32 33 33 - #define PSP_CAPABILITY(psp, cap) (psp->capability & PSP_CAPABILITY_##cap) 34 34 #define PSP_FEATURE(psp, feat) (psp->vdata && psp->vdata->platform_features & PLATFORM_FEATURE_##feat) 35 35 36 36 /* Structure to hold CCP device data */
+8 -59
drivers/crypto/ccp/sp-pci.c
··· 24 24 25 25 #include "ccp-dev.h" 26 26 #include "psp-dev.h" 27 + #include "hsti.h" 27 28 28 29 /* used for version string AA.BB.CC.DD */ 29 30 #define AA GENMASK(31, 24) ··· 39 38 struct msix_entry msix_entry[MSIX_VECTORS]; 40 39 }; 41 40 static struct sp_device *sp_dev_master; 42 - 43 - #define security_attribute_show(name, def) \ 44 - static ssize_t name##_show(struct device *d, struct device_attribute *attr, \ 45 - char *buf) \ 46 - { \ 47 - struct sp_device *sp = dev_get_drvdata(d); \ 48 - struct psp_device *psp = sp->psp_data; \ 49 - int bit = PSP_SECURITY_##def << PSP_CAPABILITY_PSP_SECURITY_OFFSET; \ 50 - return sysfs_emit(buf, "%d\n", (psp->capability & bit) > 0); \ 51 - } 52 - 53 - security_attribute_show(fused_part, FUSED_PART) 54 - static DEVICE_ATTR_RO(fused_part); 55 - security_attribute_show(debug_lock_on, DEBUG_LOCK_ON) 56 - static DEVICE_ATTR_RO(debug_lock_on); 57 - security_attribute_show(tsme_status, TSME_STATUS) 58 - static DEVICE_ATTR_RO(tsme_status); 59 - security_attribute_show(anti_rollback_status, ANTI_ROLLBACK_STATUS) 60 - static DEVICE_ATTR_RO(anti_rollback_status); 61 - security_attribute_show(rpmc_production_enabled, RPMC_PRODUCTION_ENABLED) 62 - static DEVICE_ATTR_RO(rpmc_production_enabled); 63 - security_attribute_show(rpmc_spirom_available, RPMC_SPIROM_AVAILABLE) 64 - static DEVICE_ATTR_RO(rpmc_spirom_available); 65 - security_attribute_show(hsp_tpm_available, HSP_TPM_AVAILABLE) 66 - static DEVICE_ATTR_RO(hsp_tpm_available); 67 - security_attribute_show(rom_armor_enforced, ROM_ARMOR_ENFORCED) 68 - static DEVICE_ATTR_RO(rom_armor_enforced); 69 - 70 - static struct attribute *psp_security_attrs[] = { 71 - &dev_attr_fused_part.attr, 72 - &dev_attr_debug_lock_on.attr, 73 - &dev_attr_tsme_status.attr, 74 - &dev_attr_anti_rollback_status.attr, 75 - &dev_attr_rpmc_production_enabled.attr, 76 - &dev_attr_rpmc_spirom_available.attr, 77 - &dev_attr_hsp_tpm_available.attr, 78 - &dev_attr_rom_armor_enforced.attr, 79 - NULL 80 - }; 81 - 82 - 
static umode_t psp_security_is_visible(struct kobject *kobj, struct attribute *attr, int idx) 83 - { 84 - struct device *dev = kobj_to_dev(kobj); 85 - struct sp_device *sp = dev_get_drvdata(dev); 86 - struct psp_device *psp = sp->psp_data; 87 - 88 - if (psp && PSP_CAPABILITY(psp, PSP_SECURITY_REPORTING)) 89 - return 0444; 90 - 91 - return 0; 92 - } 93 - 94 - static struct attribute_group psp_security_attr_group = { 95 - .attrs = psp_security_attrs, 96 - .is_visible = psp_security_is_visible, 97 - }; 98 41 99 42 #define version_attribute_show(name, _offset) \ 100 43 static ssize_t name##_show(struct device *d, struct device_attribute *attr, \ ··· 79 134 psp->vdata->bootloader_info_reg) 80 135 val = ioread32(psp->io_regs + psp->vdata->bootloader_info_reg); 81 136 82 - if (attr == &dev_attr_tee_version.attr && 83 - PSP_CAPABILITY(psp, TEE) && 137 + if (attr == &dev_attr_tee_version.attr && psp->capability.tee && 84 138 psp->vdata->tee->info_reg) 85 139 val = ioread32(psp->io_regs + psp->vdata->tee->info_reg); 86 140 ··· 96 152 }; 97 153 98 154 static const struct attribute_group *psp_groups[] = { 155 + #ifdef CONFIG_CRYPTO_DEV_SP_PSP 99 156 &psp_security_attr_group, 157 + #endif 100 158 &psp_firmware_attr_group, 101 159 NULL, 102 160 }; ··· 397 451 398 452 static const struct psp_vdata pspv2 = { 399 453 .sev = &sevv2, 454 + .platform_access = &pa_v1, 400 455 .bootloader_info_reg = 0x109ec, /* C2PMSG_59 */ 401 456 .feature_reg = 0x109fc, /* C2PMSG_63 */ 402 457 .inten_reg = 0x10690, /* P2CMSG_INTEN */ 403 458 .intsts_reg = 0x10694, /* P2CMSG_INTSTS */ 459 + .platform_features = PLATFORM_FEATURE_HSTI, 404 460 }; 405 461 406 462 static const struct psp_vdata pspv3 = { ··· 415 467 .feature_reg = 0x109fc, /* C2PMSG_63 */ 416 468 .inten_reg = 0x10690, /* P2CMSG_INTEN */ 417 469 .intsts_reg = 0x10694, /* P2CMSG_INTSTS */ 418 - .platform_features = PLATFORM_FEATURE_DBC, 470 + .platform_features = PLATFORM_FEATURE_DBC | 471 + PLATFORM_FEATURE_HSTI, 419 472 }; 420 473 421 474 
static const struct psp_vdata pspv4 = {
-6
drivers/crypto/ccree/cc_cipher.c
··· 261 261 kfree_sensitive(ctx_p->user.key); 262 262 } 263 263 264 - struct tdes_keys { 265 - u8 key1[DES_KEY_SIZE]; 266 - u8 key2[DES_KEY_SIZE]; 267 - u8 key3[DES_KEY_SIZE]; 268 - }; 269 - 270 264 static enum cc_hw_crypto_key cc_slot_to_hw_key(u8 slot_num) 271 265 { 272 266 switch (slot_num) {
-17
drivers/crypto/hifn_795x.c
··· 495 495 #define HIFN_CRYPT_CMD_SRCLEN_M 0xc000 496 496 #define HIFN_CRYPT_CMD_SRCLEN_S 14 497 497 498 - /* 499 - * Structure to help build up the command data structure. 500 - */ 501 - struct hifn_mac_command { 502 - volatile __le16 masks; 503 - volatile __le16 header_skip; 504 - volatile __le16 source_count; 505 - volatile __le16 reserved; 506 - }; 507 - 508 498 #define HIFN_MAC_CMD_ALG_MASK 0x0001 509 499 #define HIFN_MAC_CMD_ALG_SHA1 0x0000 510 500 #define HIFN_MAC_CMD_ALG_MD5 0x0001 ··· 515 525 */ 516 526 #define HIFN_MAC_CMD_POS_IPSEC 0x0200 517 527 #define HIFN_MAC_CMD_NEW_KEY 0x0800 518 - 519 - struct hifn_comp_command { 520 - volatile __le16 masks; 521 - volatile __le16 header_skip; 522 - volatile __le16 source_count; 523 - volatile __le16 reserved; 524 - }; 525 528 526 529 #define HIFN_COMP_CMD_SRCLEN_M 0xc000 527 530 #define HIFN_COMP_CMD_SRCLEN_S 14
+3 -8
drivers/crypto/hisilicon/qm.c
··· 3793 3793 goto err_put_sync; 3794 3794 } 3795 3795 3796 - qm->vfs_num = num_vfs; 3797 - 3798 3796 ret = pci_enable_sriov(pdev, num_vfs); 3799 3797 if (ret) { 3800 3798 pci_err(pdev, "Can't enable VF!\n"); 3801 3799 qm_clear_vft_config(qm); 3802 3800 goto err_put_sync; 3803 3801 } 3802 + qm->vfs_num = num_vfs; 3804 3803 3805 3804 pci_info(pdev, "VF enabled, vfs_num(=%d)!\n", num_vfs); 3806 3805 ··· 3821 3822 int hisi_qm_sriov_disable(struct pci_dev *pdev, bool is_frozen) 3822 3823 { 3823 3824 struct hisi_qm *qm = pci_get_drvdata(pdev); 3824 - int ret; 3825 3825 3826 3826 if (pci_vfs_assigned(pdev)) { 3827 3827 pci_err(pdev, "Failed to disable VFs as VFs are assigned!\n"); ··· 3835 3837 3836 3838 pci_disable_sriov(pdev); 3837 3839 3838 - ret = qm_clear_vft_config(qm); 3839 - if (ret) 3840 - return ret; 3841 - 3840 + qm->vfs_num = 0; 3842 3841 qm_pm_put_sync(qm); 3843 3842 3844 - return 0; 3843 + return qm_clear_vft_config(qm); 3845 3844 } 3846 3845 EXPORT_SYMBOL_GPL(hisi_qm_sriov_disable); 3847 3846
+20 -28
drivers/crypto/hisilicon/zip/zip_main.c
··· 37 37 #define HZIP_QM_IDEL_STATUS 0x3040e4 38 38 39 39 #define HZIP_CORE_DFX_BASE 0x301000 40 - #define HZIP_CLOCK_GATED_CONTL 0X301004 40 + #define HZIP_CORE_DFX_DECOMP_BASE 0x304000 41 41 #define HZIP_CORE_DFX_COMP_0 0x302000 42 42 #define HZIP_CORE_DFX_COMP_1 0x303000 43 43 #define HZIP_CORE_DFX_DECOMP_0 0x304000 ··· 48 48 #define HZIP_CORE_DFX_DECOMP_5 0x309000 49 49 #define HZIP_CORE_REGS_BASE_LEN 0xB0 50 50 #define HZIP_CORE_REGS_DFX_LEN 0x28 51 + #define HZIP_CORE_ADDR_INTRVL 0x1000 51 52 52 53 #define HZIP_CORE_INT_SOURCE 0x3010A0 53 54 #define HZIP_CORE_INT_MASK_REG 0x3010A4 ··· 268 267 ZIP_COMP_ENABLE_BITMAP, 269 268 ZIP_DRV_ALG_BITMAP, 270 269 ZIP_DEV_ALG_BITMAP, 271 - }; 272 - 273 - enum { 274 - HZIP_COMP_CORE0, 275 - HZIP_COMP_CORE1, 276 - HZIP_DECOMP_CORE0, 277 - HZIP_DECOMP_CORE1, 278 - HZIP_DECOMP_CORE2, 279 - HZIP_DECOMP_CORE3, 280 - HZIP_DECOMP_CORE4, 281 - HZIP_DECOMP_CORE5, 282 - }; 283 - 284 - static const u64 core_offsets[] = { 285 - [HZIP_COMP_CORE0] = 0x302000, 286 - [HZIP_COMP_CORE1] = 0x303000, 287 - [HZIP_DECOMP_CORE0] = 0x304000, 288 - [HZIP_DECOMP_CORE1] = 0x305000, 289 - [HZIP_DECOMP_CORE2] = 0x306000, 290 - [HZIP_DECOMP_CORE3] = 0x307000, 291 - [HZIP_DECOMP_CORE4] = 0x308000, 292 - [HZIP_DECOMP_CORE5] = 0x309000, 293 270 }; 294 271 295 272 static const struct debugfs_reg32 hzip_dfx_regs[] = { ··· 786 807 787 808 DEFINE_SHOW_ATTRIBUTE(hisi_zip_regs); 788 809 810 + static void __iomem *get_zip_core_addr(struct hisi_qm *qm, int core_num) 811 + { 812 + u32 zip_comp_core_num = qm->cap_tables.dev_cap_table[ZIP_CLUSTER_COMP_NUM_CAP_IDX].cap_val; 813 + 814 + if (core_num < zip_comp_core_num) 815 + return qm->io_base + HZIP_CORE_DFX_BASE + 816 + (core_num + 1) * HZIP_CORE_ADDR_INTRVL; 817 + 818 + return qm->io_base + HZIP_CORE_DFX_DECOMP_BASE + 819 + (core_num - zip_comp_core_num) * HZIP_CORE_ADDR_INTRVL; 820 + } 821 + 789 822 static int hisi_zip_core_debug_init(struct hisi_qm *qm) 790 823 { 791 824 u32 zip_core_num, zip_comp_core_num; ··· 
823 832 824 833 regset->regs = hzip_dfx_regs; 825 834 regset->nregs = ARRAY_SIZE(hzip_dfx_regs); 826 - regset->base = qm->io_base + core_offsets[i]; 835 + regset->base = get_zip_core_addr(qm, i); 827 836 regset->dev = dev; 828 837 829 838 tmp_d = debugfs_create_dir(buf, qm->debug.debug_root); ··· 912 921 /* hisi_zip_debug_regs_clear() - clear the zip debug regs */ 913 922 static void hisi_zip_debug_regs_clear(struct hisi_qm *qm) 914 923 { 924 + u32 zip_core_num = qm->cap_tables.dev_cap_table[ZIP_CORE_NUM_CAP_IDX].cap_val; 915 925 int i, j; 916 926 917 927 /* enable register read_clear bit */ 918 928 writel(HZIP_RD_CNT_CLR_CE_EN, qm->io_base + HZIP_SOFT_CTRL_CNT_CLR_CE); 919 - for (i = 0; i < ARRAY_SIZE(core_offsets); i++) 929 + for (i = 0; i < zip_core_num; i++) 920 930 for (j = 0; j < ARRAY_SIZE(hzip_dfx_regs); j++) 921 - readl(qm->io_base + core_offsets[i] + 931 + readl(get_zip_core_addr(qm, i) + 922 932 hzip_dfx_regs[j].offset); 923 933 924 934 /* disable register read_clear bit */ ··· 962 970 } 963 971 964 972 for (i = 0; i < zip_core_num; i++) { 965 - io_base = qm->io_base + core_offsets[i]; 973 + io_base = get_zip_core_addr(qm, i); 966 974 for (j = 0; j < core_dfx_regs_num; j++) { 967 975 idx = com_dfx_regs_num + i * core_dfx_regs_num + j; 968 976 debug->last_words[idx] = readl_relaxed( ··· 1014 1022 else 1015 1023 scnprintf(buf, sizeof(buf), "Decomp_core-%d", 1016 1024 i - zip_comp_core_num); 1017 - base = qm->io_base + core_offsets[i]; 1025 + base = get_zip_core_addr(qm, i); 1018 1026 1019 1027 pci_info(qm->pdev, "==>%s:\n", buf); 1020 1028 /* dump last word for dfx regs during control resetting */
+1
drivers/crypto/intel/keembay/ocs-hcu.c
··· 837 837 return IRQ_HANDLED; 838 838 } 839 839 840 + MODULE_DESCRIPTION("Intel Keem Bay OCS HCU Crypto Driver"); 840 841 MODULE_LICENSE("GPL");
+4 -2
drivers/crypto/intel/qat/qat_common/adf_cfg.c
··· 290 290 * 3. if the key exists with the same value, then return without doing 291 291 * anything (the newly created key_val is freed). 292 292 */ 293 + down_write(&cfg->lock); 293 294 if (!adf_cfg_key_val_get(accel_dev, section_name, key, temp_val)) { 294 295 if (strncmp(temp_val, key_val->val, sizeof(temp_val))) { 295 296 adf_cfg_keyval_remove(key, section); 296 297 } else { 297 298 kfree(key_val); 298 - return 0; 299 + goto out; 299 300 } 300 301 } 301 302 302 - down_write(&cfg->lock); 303 303 adf_cfg_keyval_add(key_val, section); 304 + 305 + out: 304 306 up_write(&cfg->lock); 305 307 return 0; 306 308 }
+12 -9
drivers/crypto/intel/qat/qat_common/adf_ctl_drv.c
··· 31 31 .compat_ioctl = compat_ptr_ioctl, 32 32 }; 33 33 34 + static const struct class adf_ctl_class = { 35 + .name = DEVICE_NAME, 36 + }; 37 + 34 38 struct adf_ctl_drv_info { 35 39 unsigned int major; 36 40 struct cdev drv_cdev; 37 - struct class *drv_class; 38 41 }; 39 42 40 43 static struct adf_ctl_drv_info adf_ctl_drv; 41 44 42 45 static void adf_chr_drv_destroy(void) 43 46 { 44 - device_destroy(adf_ctl_drv.drv_class, MKDEV(adf_ctl_drv.major, 0)); 47 + device_destroy(&adf_ctl_class, MKDEV(adf_ctl_drv.major, 0)); 45 48 cdev_del(&adf_ctl_drv.drv_cdev); 46 - class_destroy(adf_ctl_drv.drv_class); 49 + class_unregister(&adf_ctl_class); 47 50 unregister_chrdev_region(MKDEV(adf_ctl_drv.major, 0), 1); 48 51 } 49 52 ··· 54 51 { 55 52 dev_t dev_id; 56 53 struct device *drv_device; 54 + int ret; 57 55 58 56 if (alloc_chrdev_region(&dev_id, 0, 1, DEVICE_NAME)) { 59 57 pr_err("QAT: unable to allocate chrdev region\n"); 60 58 return -EFAULT; 61 59 } 62 60 63 - adf_ctl_drv.drv_class = class_create(DEVICE_NAME); 64 - if (IS_ERR(adf_ctl_drv.drv_class)) { 65 - pr_err("QAT: class_create failed for adf_ctl\n"); 61 + ret = class_register(&adf_ctl_class); 62 + if (ret) 66 63 goto err_chrdev_unreg; 67 - } 64 + 68 65 adf_ctl_drv.major = MAJOR(dev_id); 69 66 cdev_init(&adf_ctl_drv.drv_cdev, &adf_ctl_ops); 70 67 if (cdev_add(&adf_ctl_drv.drv_cdev, dev_id, 1)) { ··· 72 69 goto err_class_destr; 73 70 } 74 71 75 - drv_device = device_create(adf_ctl_drv.drv_class, NULL, 72 + drv_device = device_create(&adf_ctl_class, NULL, 76 73 MKDEV(adf_ctl_drv.major, 0), 77 74 NULL, DEVICE_NAME); 78 75 if (IS_ERR(drv_device)) { ··· 83 80 err_cdev_del: 84 81 cdev_del(&adf_ctl_drv.drv_cdev); 85 82 err_class_destr: 86 - class_destroy(adf_ctl_drv.drv_class); 83 + class_unregister(&adf_ctl_class); 87 84 err_chrdev_unreg: 88 85 unregister_chrdev_region(dev_id, 1); 89 86 return -EFAULT;
+1 -1
drivers/crypto/intel/qat/qat_common/adf_dev_mgr.c
··· 59 59 } 60 60 61 61 /** 62 - * adf_clean_vf_map() - Cleans VF id mapings 62 + * adf_clean_vf_map() - Cleans VF id mappings 63 63 * @vf: flag indicating whether mappings is cleaned 64 64 * for vfs only or for vfs and pfs 65 65 *
+3 -1
drivers/crypto/intel/qat/qat_common/adf_gen2_pfvf.c
··· 100 100 errmsk3 |= ADF_GEN2_ERR_MSK_VF2PF(ADF_GEN2_VF_MSK); 101 101 ADF_CSR_WR(pmisc_addr, ADF_GEN2_ERRMSK3, errmsk3); 102 102 103 - errmsk3 &= ADF_GEN2_ERR_MSK_VF2PF(sources | disabled); 103 + /* Update only section of errmsk3 related to VF2PF */ 104 + errmsk3 &= ~ADF_GEN2_ERR_MSK_VF2PF(ADF_GEN2_VF_MSK); 105 + errmsk3 |= ADF_GEN2_ERR_MSK_VF2PF(sources | disabled); 104 106 ADF_CSR_WR(pmisc_addr, ADF_GEN2_ERRMSK3, errmsk3); 105 107 106 108 /* Return the sources of the (new) interrupt(s) */
+1
drivers/crypto/intel/qat/qat_common/adf_rl.c
··· 1106 1106 mutex_init(&rl->rl_lock); 1107 1107 rl->device_data = &accel_dev->hw_device->rl_data; 1108 1108 rl->accel_dev = accel_dev; 1109 + init_rwsem(&rl->user_input.lock); 1109 1110 accel_dev->rate_limiting = rl; 1110 1111 1111 1112 err_ret:
+6 -2
drivers/crypto/intel/qat/qat_dh895xcc/adf_dh895xcc_hw_data.c
··· 193 193 ADF_CSR_WR(pmisc_addr, ADF_GEN2_ERRMSK3, errmsk3); 194 194 ADF_CSR_WR(pmisc_addr, ADF_GEN2_ERRMSK5, errmsk5); 195 195 196 - errmsk3 &= ADF_DH895XCC_ERR_MSK_VF2PF_L(sources | disabled); 197 - errmsk5 &= ADF_DH895XCC_ERR_MSK_VF2PF_U(sources | disabled); 196 + /* Update only section of errmsk3 and errmsk5 related to VF2PF */ 197 + errmsk3 &= ~ADF_DH895XCC_ERR_MSK_VF2PF_L(ADF_DH895XCC_VF_MSK); 198 + errmsk5 &= ~ADF_DH895XCC_ERR_MSK_VF2PF_U(ADF_DH895XCC_VF_MSK); 199 + 200 + errmsk3 |= ADF_DH895XCC_ERR_MSK_VF2PF_L(sources | disabled); 201 + errmsk5 |= ADF_DH895XCC_ERR_MSK_VF2PF_U(sources | disabled); 198 202 ADF_CSR_WR(pmisc_addr, ADF_GEN2_ERRMSK3, errmsk3); 199 203 ADF_CSR_WR(pmisc_addr, ADF_GEN2_ERRMSK5, errmsk5); 200 204
+2 -1
drivers/crypto/mxs-dcp.c
··· 225 225 static int mxs_dcp_run_aes(struct dcp_async_ctx *actx, 226 226 struct skcipher_request *req, int init) 227 227 { 228 - dma_addr_t key_phys, src_phys, dst_phys; 228 + dma_addr_t key_phys = 0; 229 + dma_addr_t src_phys, dst_phys; 229 230 struct dcp *sdcp = global_sdcp; 230 231 struct dcp_dma_desc *desc = &sdcp->coh->desc[actx->chan]; 231 232 struct dcp_aes_req_ctx *rctx = skcipher_request_ctx(req);
-4
drivers/crypto/n2_core.c
··· 720 720 return container_of(alg, struct n2_skcipher_alg, skcipher); 721 721 } 722 722 723 - struct n2_skcipher_request_context { 724 - struct skcipher_walk walk; 725 - }; 726 - 727 723 static int n2_aes_setkey(struct crypto_skcipher *skcipher, const u8 *key, 728 724 unsigned int keylen) 729 725 {
+1
drivers/crypto/sa2ul.c
··· 2496 2496 }, 2497 2497 }; 2498 2498 module_platform_driver(sa_ul_driver); 2499 + MODULE_DESCRIPTION("K3 SA2UL crypto accelerator driver"); 2499 2500 MODULE_LICENSE("GPL v2");
+2 -2
drivers/crypto/starfive/jh7110-cryp.h
··· 30 30 #define MAX_KEY_SIZE SHA512_BLOCK_SIZE 31 31 #define STARFIVE_AES_IV_LEN AES_BLOCK_SIZE 32 32 #define STARFIVE_AES_CTR_LEN AES_BLOCK_SIZE 33 + #define STARFIVE_RSA_MAX_KEYSZ 256 33 34 34 35 union starfive_aes_csr { 35 36 u32 v; ··· 218 217 struct scatterlist *out_sg; 219 218 struct ahash_request ahash_fbk_req; 220 219 size_t total; 221 - size_t nents; 222 220 unsigned int blksize; 223 221 unsigned int digsize; 224 222 unsigned long in_sg_len; 225 223 unsigned char *adata; 226 - u8 rsa_data[] __aligned(sizeof(u32)); 224 + u8 rsa_data[STARFIVE_RSA_MAX_KEYSZ] __aligned(sizeof(u32)); 227 225 }; 228 226 229 227 struct starfive_cryp_dev *starfive_cryp_find_dev(struct starfive_cryp_ctx *ctx);
+9 -6
drivers/crypto/starfive/jh7110-rsa.c
··· 31 31 /* A * A * R mod N ==> A */ 32 32 #define CRYPTO_CMD_AARN 0x7 33 33 34 - #define STARFIVE_RSA_MAX_KEYSZ 256 35 34 #define STARFIVE_RSA_RESET 0x2 36 35 37 36 static inline int starfive_pka_wait_done(struct starfive_cryp_ctx *ctx) ··· 73 74 { 74 75 struct starfive_cryp_dev *cryp = ctx->cryp; 75 76 struct starfive_cryp_request_ctx *rctx = ctx->rctx; 76 - int count = rctx->total / sizeof(u32) - 1; 77 + int count = (ALIGN(rctx->total, 4) / 4) - 1; 77 78 int loop; 78 79 u32 temp; 79 80 u8 opsize; ··· 250 251 struct starfive_cryp_dev *cryp = ctx->cryp; 251 252 struct starfive_cryp_request_ctx *rctx = ctx->rctx; 252 253 struct starfive_rsa_key *key = &ctx->rsa_key; 253 - int ret = 0; 254 + int ret = 0, shift = 0; 254 255 255 256 writel(STARFIVE_RSA_RESET, cryp->base + STARFIVE_PKA_CACR_OFFSET); 256 257 257 - rctx->total = sg_copy_to_buffer(rctx->in_sg, rctx->nents, 258 - rctx->rsa_data, rctx->total); 258 + if (!IS_ALIGNED(rctx->total, sizeof(u32))) { 259 + shift = sizeof(u32) - (rctx->total & 0x3); 260 + memset(rctx->rsa_data, 0, shift); 261 + } 262 + 263 + rctx->total = sg_copy_to_buffer(rctx->in_sg, sg_nents(rctx->in_sg), 264 + rctx->rsa_data + shift, rctx->total); 259 265 260 266 if (enc) { 261 267 key->bitlen = key->e_bitlen; ··· 309 305 rctx->in_sg = req->src; 310 306 rctx->out_sg = req->dst; 311 307 rctx->total = req->src_len; 312 - rctx->nents = sg_nents(rctx->in_sg); 313 308 ctx->rctx = rctx; 314 309 315 310 return starfive_rsa_enc_core(ctx, 1);
+674 -45
drivers/crypto/stm32/stm32-cryp.c
··· 11 11 #include <crypto/internal/des.h> 12 12 #include <crypto/internal/skcipher.h> 13 13 #include <crypto/scatterwalk.h> 14 + #include <linux/bottom_half.h> 14 15 #include <linux/clk.h> 15 16 #include <linux/delay.h> 17 + #include <linux/dma-mapping.h> 18 + #include <linux/dmaengine.h> 16 19 #include <linux/err.h> 17 20 #include <linux/iopoll.h> 18 21 #include <linux/interrupt.h> ··· 43 40 /* Mode mask = bits [15..0] */ 44 41 #define FLG_MODE_MASK GENMASK(15, 0) 45 42 /* Bit [31..16] status */ 43 + #define FLG_IN_OUT_DMA BIT(16) 44 + #define FLG_HEADER_DMA BIT(17) 46 45 47 46 /* Registers */ 48 47 #define CRYP_CR 0x00000000 ··· 126 121 #define CR_PH_MASK 0x00030000 127 122 #define CR_NBPBL_SHIFT 20 128 123 129 - #define SR_BUSY 0x00000010 130 - #define SR_OFNE 0x00000004 124 + #define SR_IFNF BIT(1) 125 + #define SR_OFNE BIT(2) 126 + #define SR_BUSY BIT(8) 127 + 128 + #define DMACR_DIEN BIT(0) 129 + #define DMACR_DOEN BIT(1) 131 130 132 131 #define IMSCR_IN BIT(0) 133 132 #define IMSCR_OUT BIT(1) ··· 142 133 /* Misc */ 143 134 #define AES_BLOCK_32 (AES_BLOCK_SIZE / sizeof(u32)) 144 135 #define GCM_CTR_INIT 2 145 - #define CRYP_AUTOSUSPEND_DELAY 50 136 + #define CRYP_AUTOSUSPEND_DELAY 50 137 + 138 + #define CRYP_DMA_BURST_REG 4 139 + 140 + enum stm32_dma_mode { 141 + NO_DMA, 142 + DMA_PLAIN_SG, 143 + DMA_NEED_SG_TRUNC 144 + }; 146 145 147 146 struct stm32_cryp_caps { 148 147 bool aeads_support; ··· 163 146 u32 sr; 164 147 u32 din; 165 148 u32 dout; 149 + u32 dmacr; 166 150 u32 imsc; 167 151 u32 mis; 168 152 u32 k1l; ··· 190 172 struct list_head list; 191 173 struct device *dev; 192 174 void __iomem *regs; 175 + phys_addr_t phys_base; 193 176 struct clk *clk; 194 177 unsigned long flags; 195 178 u32 irq_status; ··· 209 190 size_t header_in; 210 191 size_t payload_out; 211 192 193 + /* DMA process fields */ 194 + struct scatterlist *in_sg; 195 + struct scatterlist *header_sg; 212 196 struct scatterlist *out_sg; 197 + size_t in_sg_len; 198 + size_t header_sg_len; 
199 + size_t out_sg_len; 200 + struct completion dma_completion; 213 201 202 + struct dma_chan *dma_lch_in; 203 + struct dma_chan *dma_lch_out; 204 + enum stm32_dma_mode dma_mode; 205 + 206 + /* IT process fields */ 214 207 struct scatter_walk in_walk; 215 208 struct scatter_walk out_walk; 216 209 ··· 322 291 !(status & CR_CRYPEN), 10, 100000); 323 292 } 324 293 294 + static inline int stm32_cryp_wait_input(struct stm32_cryp *cryp) 295 + { 296 + u32 status; 297 + 298 + return readl_relaxed_poll_timeout_atomic(cryp->regs + cryp->caps->sr, status, 299 + status & SR_IFNF, 1, 10); 300 + } 301 + 325 302 static inline int stm32_cryp_wait_output(struct stm32_cryp *cryp) 326 303 { 327 304 u32 status; 328 305 329 - return readl_relaxed_poll_timeout(cryp->regs + cryp->caps->sr, status, 330 - status & SR_OFNE, 10, 100000); 306 + return readl_relaxed_poll_timeout_atomic(cryp->regs + cryp->caps->sr, status, 307 + status & SR_OFNE, 1, 10); 331 308 } 332 309 333 310 static inline void stm32_cryp_key_read_enable(struct stm32_cryp *cryp) ··· 350 311 cryp->regs + cryp->caps->cr); 351 312 } 352 313 314 + static void stm32_cryp_irq_read_data(struct stm32_cryp *cryp); 315 + static void stm32_cryp_irq_write_data(struct stm32_cryp *cryp); 316 + static void stm32_cryp_irq_write_gcmccm_header(struct stm32_cryp *cryp); 353 317 static int stm32_cryp_read_auth_tag(struct stm32_cryp *cryp); 354 318 static void stm32_cryp_finish_req(struct stm32_cryp *cryp, int err); 319 + static int stm32_cryp_dma_start(struct stm32_cryp *cryp); 320 + static int stm32_cryp_it_start(struct stm32_cryp *cryp); 355 321 356 322 static struct stm32_cryp *stm32_cryp_find_dev(struct stm32_cryp_ctx *ctx) 357 323 { ··· 857 813 if (is_gcm(cryp) || is_ccm(cryp)) 858 814 crypto_finalize_aead_request(cryp->engine, cryp->areq, err); 859 815 else 860 - crypto_finalize_skcipher_request(cryp->engine, cryp->req, 861 - err); 816 + crypto_finalize_skcipher_request(cryp->engine, cryp->req, err); 862 817 } 863 818 864 - static int 
stm32_cryp_cpu_start(struct stm32_cryp *cryp) 819 + static void stm32_cryp_header_dma_callback(void *param) 820 + { 821 + struct stm32_cryp *cryp = (struct stm32_cryp *)param; 822 + int ret; 823 + u32 reg; 824 + 825 + dma_unmap_sg(cryp->dev, cryp->header_sg, cryp->header_sg_len, DMA_TO_DEVICE); 826 + 827 + reg = stm32_cryp_read(cryp, cryp->caps->dmacr); 828 + stm32_cryp_write(cryp, cryp->caps->dmacr, reg & ~(DMACR_DOEN | DMACR_DIEN)); 829 + 830 + kfree(cryp->header_sg); 831 + 832 + reg = stm32_cryp_read(cryp, cryp->caps->cr); 833 + 834 + if (cryp->header_in) { 835 + stm32_cryp_write(cryp, cryp->caps->cr, reg | CR_CRYPEN); 836 + 837 + ret = stm32_cryp_wait_input(cryp); 838 + if (ret) { 839 + dev_err(cryp->dev, "input header ready timeout after dma\n"); 840 + stm32_cryp_finish_req(cryp, ret); 841 + return; 842 + } 843 + stm32_cryp_irq_write_gcmccm_header(cryp); 844 + WARN_ON(cryp->header_in); 845 + } 846 + 847 + if (stm32_cryp_get_input_text_len(cryp)) { 848 + /* Phase 3 : payload */ 849 + reg = stm32_cryp_read(cryp, cryp->caps->cr); 850 + stm32_cryp_write(cryp, cryp->caps->cr, reg & ~CR_CRYPEN); 851 + 852 + reg &= ~CR_PH_MASK; 853 + reg |= CR_PH_PAYLOAD | CR_CRYPEN; 854 + stm32_cryp_write(cryp, cryp->caps->cr, reg); 855 + 856 + if (cryp->flags & FLG_IN_OUT_DMA) { 857 + ret = stm32_cryp_dma_start(cryp); 858 + if (ret) 859 + stm32_cryp_finish_req(cryp, ret); 860 + } else { 861 + stm32_cryp_it_start(cryp); 862 + } 863 + } else { 864 + /* 865 + * Phase 4 : tag. 
866 + * Nothing to read, nothing to write => end request 867 + */ 868 + stm32_cryp_finish_req(cryp, 0); 869 + } 870 + } 871 + 872 + static void stm32_cryp_dma_callback(void *param) 873 + { 874 + struct stm32_cryp *cryp = (struct stm32_cryp *)param; 875 + int ret; 876 + u32 reg; 877 + 878 + complete(&cryp->dma_completion); /* completion to indicate no timeout */ 879 + 880 + dma_sync_sg_for_device(cryp->dev, cryp->out_sg, cryp->out_sg_len, DMA_FROM_DEVICE); 881 + 882 + if (cryp->in_sg != cryp->out_sg) 883 + dma_unmap_sg(cryp->dev, cryp->in_sg, cryp->in_sg_len, DMA_TO_DEVICE); 884 + 885 + dma_unmap_sg(cryp->dev, cryp->out_sg, cryp->out_sg_len, DMA_FROM_DEVICE); 886 + 887 + reg = stm32_cryp_read(cryp, cryp->caps->dmacr); 888 + stm32_cryp_write(cryp, cryp->caps->dmacr, reg & ~(DMACR_DOEN | DMACR_DIEN)); 889 + 890 + reg = stm32_cryp_read(cryp, cryp->caps->cr); 891 + 892 + if (is_gcm(cryp) || is_ccm(cryp)) { 893 + kfree(cryp->in_sg); 894 + kfree(cryp->out_sg); 895 + } else { 896 + if (cryp->in_sg != cryp->req->src) 897 + kfree(cryp->in_sg); 898 + if (cryp->out_sg != cryp->req->dst) 899 + kfree(cryp->out_sg); 900 + } 901 + 902 + if (cryp->payload_in) { 903 + stm32_cryp_write(cryp, cryp->caps->cr, reg | CR_CRYPEN); 904 + 905 + ret = stm32_cryp_wait_input(cryp); 906 + if (ret) { 907 + dev_err(cryp->dev, "input ready timeout after dma\n"); 908 + stm32_cryp_finish_req(cryp, ret); 909 + return; 910 + } 911 + stm32_cryp_irq_write_data(cryp); 912 + 913 + ret = stm32_cryp_wait_output(cryp); 914 + if (ret) { 915 + dev_err(cryp->dev, "output ready timeout after dma\n"); 916 + stm32_cryp_finish_req(cryp, ret); 917 + return; 918 + } 919 + stm32_cryp_irq_read_data(cryp); 920 + } 921 + 922 + stm32_cryp_finish_req(cryp, 0); 923 + } 924 + 925 + static int stm32_cryp_header_dma_start(struct stm32_cryp *cryp) 926 + { 927 + int ret; 928 + struct dma_async_tx_descriptor *tx_in; 929 + u32 reg; 930 + size_t align_size; 931 + 932 + ret = dma_map_sg(cryp->dev, cryp->header_sg, 
cryp->header_sg_len, DMA_TO_DEVICE); 933 + if (!ret) { 934 + dev_err(cryp->dev, "dma_map_sg() error\n"); 935 + return -ENOMEM; 936 + } 937 + 938 + dma_sync_sg_for_device(cryp->dev, cryp->header_sg, cryp->header_sg_len, DMA_TO_DEVICE); 939 + 940 + tx_in = dmaengine_prep_slave_sg(cryp->dma_lch_in, cryp->header_sg, cryp->header_sg_len, 941 + DMA_MEM_TO_DEV, DMA_PREP_INTERRUPT | DMA_CTRL_ACK); 942 + if (!tx_in) { 943 + dev_err(cryp->dev, "IN prep_slave_sg() failed\n"); 944 + return -EINVAL; 945 + } 946 + 947 + tx_in->callback_param = cryp; 948 + tx_in->callback = stm32_cryp_header_dma_callback; 949 + 950 + /* Advance scatterwalk to not DMA'ed data */ 951 + align_size = ALIGN_DOWN(cryp->header_in, cryp->hw_blocksize); 952 + scatterwalk_copychunks(NULL, &cryp->in_walk, align_size, 2); 953 + cryp->header_in -= align_size; 954 + 955 + ret = dma_submit_error(dmaengine_submit(tx_in)); 956 + if (ret < 0) { 957 + dev_err(cryp->dev, "DMA in submit failed\n"); 958 + return ret; 959 + } 960 + dma_async_issue_pending(cryp->dma_lch_in); 961 + 962 + reg = stm32_cryp_read(cryp, cryp->caps->dmacr); 963 + stm32_cryp_write(cryp, cryp->caps->dmacr, reg | DMACR_DIEN); 964 + 965 + return 0; 966 + } 967 + 968 + static int stm32_cryp_dma_start(struct stm32_cryp *cryp) 969 + { 970 + int ret; 971 + size_t align_size; 972 + struct dma_async_tx_descriptor *tx_in, *tx_out; 973 + u32 reg; 974 + 975 + if (cryp->in_sg != cryp->out_sg) { 976 + ret = dma_map_sg(cryp->dev, cryp->in_sg, cryp->in_sg_len, DMA_TO_DEVICE); 977 + if (!ret) { 978 + dev_err(cryp->dev, "dma_map_sg() error\n"); 979 + return -ENOMEM; 980 + } 981 + } 982 + 983 + ret = dma_map_sg(cryp->dev, cryp->out_sg, cryp->out_sg_len, DMA_FROM_DEVICE); 984 + if (!ret) { 985 + dev_err(cryp->dev, "dma_map_sg() error\n"); 986 + return -ENOMEM; 987 + } 988 + 989 + dma_sync_sg_for_device(cryp->dev, cryp->in_sg, cryp->in_sg_len, DMA_TO_DEVICE); 990 + 991 + tx_in = dmaengine_prep_slave_sg(cryp->dma_lch_in, cryp->in_sg, cryp->in_sg_len, 992 + 
DMA_MEM_TO_DEV, DMA_PREP_INTERRUPT | DMA_CTRL_ACK); 993 + if (!tx_in) { 994 + dev_err(cryp->dev, "IN prep_slave_sg() failed\n"); 995 + return -EINVAL; 996 + } 997 + 998 + /* No callback necessary */ 999 + tx_in->callback_param = cryp; 1000 + tx_in->callback = NULL; 1001 + 1002 + tx_out = dmaengine_prep_slave_sg(cryp->dma_lch_out, cryp->out_sg, cryp->out_sg_len, 1003 + DMA_DEV_TO_MEM, DMA_PREP_INTERRUPT | DMA_CTRL_ACK); 1004 + if (!tx_out) { 1005 + dev_err(cryp->dev, "OUT prep_slave_sg() failed\n"); 1006 + return -EINVAL; 1007 + } 1008 + 1009 + reinit_completion(&cryp->dma_completion); 1010 + tx_out->callback = stm32_cryp_dma_callback; 1011 + tx_out->callback_param = cryp; 1012 + 1013 + /* Advance scatterwalk to not DMA'ed data */ 1014 + align_size = ALIGN_DOWN(cryp->payload_in, cryp->hw_blocksize); 1015 + scatterwalk_copychunks(NULL, &cryp->in_walk, align_size, 2); 1016 + cryp->payload_in -= align_size; 1017 + 1018 + ret = dma_submit_error(dmaengine_submit(tx_in)); 1019 + if (ret < 0) { 1020 + dev_err(cryp->dev, "DMA in submit failed\n"); 1021 + return ret; 1022 + } 1023 + dma_async_issue_pending(cryp->dma_lch_in); 1024 + 1025 + /* Advance scatterwalk to not DMA'ed data */ 1026 + scatterwalk_copychunks(NULL, &cryp->out_walk, align_size, 2); 1027 + cryp->payload_out -= align_size; 1028 + ret = dma_submit_error(dmaengine_submit(tx_out)); 1029 + if (ret < 0) { 1030 + dev_err(cryp->dev, "DMA out submit failed\n"); 1031 + return ret; 1032 + } 1033 + dma_async_issue_pending(cryp->dma_lch_out); 1034 + 1035 + reg = stm32_cryp_read(cryp, cryp->caps->dmacr); 1036 + stm32_cryp_write(cryp, cryp->caps->dmacr, reg | DMACR_DOEN | DMACR_DIEN); 1037 + 1038 + if (!wait_for_completion_timeout(&cryp->dma_completion, msecs_to_jiffies(1000))) { 1039 + dev_err(cryp->dev, "DMA out timed out\n"); 1040 + dmaengine_terminate_sync(cryp->dma_lch_out); 1041 + return -ETIMEDOUT; 1042 + } 1043 + 1044 + return 0; 1045 + } 1046 + 1047 + static int stm32_cryp_it_start(struct stm32_cryp *cryp) 865 
1048 { 866 1049 /* Enable interrupt and let the IRQ handler do everything */ 867 1050 stm32_cryp_write(cryp, cryp->caps->imsc, IMSCR_IN | IMSCR_OUT); ··· 1420 1149 return stm32_cryp_crypt(req, FLG_TDES | FLG_CBC); 1421 1150 } 1422 1151 1152 + static enum stm32_dma_mode stm32_cryp_dma_check_sg(struct scatterlist *test_sg, size_t len, 1153 + size_t block_size) 1154 + { 1155 + struct scatterlist *sg; 1156 + int i; 1157 + 1158 + if (len <= 16) 1159 + return NO_DMA; /* Faster */ 1160 + 1161 + for_each_sg(test_sg, sg, sg_nents(test_sg), i) { 1162 + if (!IS_ALIGNED(sg->length, block_size) && !sg_is_last(sg)) 1163 + return NO_DMA; 1164 + 1165 + if (sg->offset % sizeof(u32)) 1166 + return NO_DMA; 1167 + 1168 + if (sg_is_last(sg) && !IS_ALIGNED(sg->length, AES_BLOCK_SIZE)) 1169 + return DMA_NEED_SG_TRUNC; 1170 + } 1171 + 1172 + return DMA_PLAIN_SG; 1173 + } 1174 + 1175 + static enum stm32_dma_mode stm32_cryp_dma_check(struct stm32_cryp *cryp, struct scatterlist *in_sg, 1176 + struct scatterlist *out_sg) 1177 + { 1178 + enum stm32_dma_mode ret = DMA_PLAIN_SG; 1179 + 1180 + if (!is_aes(cryp)) 1181 + return NO_DMA; 1182 + 1183 + if (!cryp->dma_lch_in || !cryp->dma_lch_out) 1184 + return NO_DMA; 1185 + 1186 + ret = stm32_cryp_dma_check_sg(in_sg, cryp->payload_in, AES_BLOCK_SIZE); 1187 + if (ret == NO_DMA) 1188 + return ret; 1189 + 1190 + ret = stm32_cryp_dma_check_sg(out_sg, cryp->payload_out, AES_BLOCK_SIZE); 1191 + if (ret == NO_DMA) 1192 + return ret; 1193 + 1194 + /* Check CTR counter overflow */ 1195 + if (is_aes(cryp) && is_ctr(cryp)) { 1196 + u32 c; 1197 + __be32 iv3; 1198 + 1199 + memcpy(&iv3, &cryp->req->iv[3 * sizeof(u32)], sizeof(iv3)); 1200 + c = be32_to_cpu(iv3); 1201 + if ((c + cryp->payload_in) < cryp->payload_in) 1202 + return NO_DMA; 1203 + } 1204 + 1205 + /* Workaround */ 1206 + if (is_aes(cryp) && is_ctr(cryp) && ret == DMA_NEED_SG_TRUNC) 1207 + return NO_DMA; 1208 + 1209 + return ret; 1210 + } 1211 + 1212 + static int stm32_cryp_truncate_sg(struct scatterlist 
**new_sg, size_t *new_sg_len, 1213 + struct scatterlist *sg, off_t skip, size_t size) 1214 + { 1215 + struct scatterlist *cur; 1216 + int alloc_sg_len; 1217 + 1218 + *new_sg_len = 0; 1219 + 1220 + if (!sg || !size) { 1221 + *new_sg = NULL; 1222 + return 0; 1223 + } 1224 + 1225 + alloc_sg_len = sg_nents_for_len(sg, skip + size); 1226 + if (alloc_sg_len < 0) 1227 + return alloc_sg_len; 1228 + 1229 + /* We allocate to much sg entry, but it is easier */ 1230 + *new_sg = kmalloc_array((size_t)alloc_sg_len, sizeof(struct scatterlist), GFP_KERNEL); 1231 + if (!*new_sg) 1232 + return -ENOMEM; 1233 + 1234 + sg_init_table(*new_sg, (unsigned int)alloc_sg_len); 1235 + 1236 + cur = *new_sg; 1237 + while (sg && size) { 1238 + unsigned int len = sg->length; 1239 + unsigned int offset = sg->offset; 1240 + 1241 + if (skip > len) { 1242 + skip -= len; 1243 + sg = sg_next(sg); 1244 + continue; 1245 + } 1246 + 1247 + if (skip) { 1248 + len -= skip; 1249 + offset += skip; 1250 + skip = 0; 1251 + } 1252 + 1253 + if (size < len) 1254 + len = size; 1255 + 1256 + if (len > 0) { 1257 + (*new_sg_len)++; 1258 + size -= len; 1259 + sg_set_page(cur, sg_page(sg), len, offset); 1260 + if (size == 0) 1261 + sg_mark_end(cur); 1262 + cur = sg_next(cur); 1263 + } 1264 + 1265 + sg = sg_next(sg); 1266 + } 1267 + 1268 + return 0; 1269 + } 1270 + 1271 + static int stm32_cryp_cipher_prepare(struct stm32_cryp *cryp, struct scatterlist *in_sg, 1272 + struct scatterlist *out_sg) 1273 + { 1274 + size_t align_size; 1275 + int ret; 1276 + 1277 + cryp->dma_mode = stm32_cryp_dma_check(cryp, in_sg, out_sg); 1278 + 1279 + scatterwalk_start(&cryp->in_walk, in_sg); 1280 + scatterwalk_start(&cryp->out_walk, out_sg); 1281 + 1282 + if (cryp->dma_mode == NO_DMA) { 1283 + cryp->flags &= ~FLG_IN_OUT_DMA; 1284 + 1285 + if (is_ctr(cryp)) 1286 + memset(cryp->last_ctr, 0, sizeof(cryp->last_ctr)); 1287 + 1288 + } else if (cryp->dma_mode == DMA_NEED_SG_TRUNC) { 1289 + 1290 + cryp->flags |= FLG_IN_OUT_DMA; 1291 + 1292 + 
align_size = ALIGN_DOWN(cryp->payload_in, cryp->hw_blocksize); 1293 + ret = stm32_cryp_truncate_sg(&cryp->in_sg, &cryp->in_sg_len, in_sg, 0, align_size); 1294 + if (ret) 1295 + return ret; 1296 + 1297 + ret = stm32_cryp_truncate_sg(&cryp->out_sg, &cryp->out_sg_len, out_sg, 0, 1298 + align_size); 1299 + if (ret) { 1300 + kfree(cryp->in_sg); 1301 + return ret; 1302 + } 1303 + } else { 1304 + cryp->flags |= FLG_IN_OUT_DMA; 1305 + 1306 + cryp->in_sg = in_sg; 1307 + cryp->out_sg = out_sg; 1308 + 1309 + ret = sg_nents_for_len(cryp->in_sg, cryp->payload_in); 1310 + if (ret < 0) 1311 + return ret; 1312 + cryp->in_sg_len = (size_t)ret; 1313 + 1314 + ret = sg_nents_for_len(out_sg, cryp->payload_out); 1315 + if (ret < 0) 1316 + return ret; 1317 + cryp->out_sg_len = (size_t)ret; 1318 + } 1319 + 1320 + return 0; 1321 + } 1322 + 1323 + static int stm32_cryp_aead_prepare(struct stm32_cryp *cryp, struct scatterlist *in_sg, 1324 + struct scatterlist *out_sg) 1325 + { 1326 + size_t align_size; 1327 + off_t skip; 1328 + int ret, ret2; 1329 + 1330 + cryp->header_sg = NULL; 1331 + cryp->in_sg = NULL; 1332 + cryp->out_sg = NULL; 1333 + 1334 + if (!cryp->dma_lch_in || !cryp->dma_lch_out) { 1335 + cryp->dma_mode = NO_DMA; 1336 + cryp->flags &= ~(FLG_IN_OUT_DMA | FLG_HEADER_DMA); 1337 + 1338 + return 0; 1339 + } 1340 + 1341 + /* CCM hw_init may have advanced in header */ 1342 + skip = cryp->areq->assoclen - cryp->header_in; 1343 + 1344 + align_size = ALIGN_DOWN(cryp->header_in, cryp->hw_blocksize); 1345 + ret = stm32_cryp_truncate_sg(&cryp->header_sg, &cryp->header_sg_len, in_sg, skip, 1346 + align_size); 1347 + if (ret) 1348 + return ret; 1349 + 1350 + ret = stm32_cryp_dma_check_sg(cryp->header_sg, align_size, AES_BLOCK_SIZE); 1351 + if (ret == NO_DMA) { 1352 + /* We cannot DMA the header */ 1353 + kfree(cryp->header_sg); 1354 + cryp->header_sg = NULL; 1355 + 1356 + cryp->flags &= ~FLG_HEADER_DMA; 1357 + } else { 1358 + cryp->flags |= FLG_HEADER_DMA; 1359 + } 1360 + 1361 + /* Now skip all 
header to be at payload start */ 1362 + skip = cryp->areq->assoclen; 1363 + align_size = ALIGN_DOWN(cryp->payload_in, cryp->hw_blocksize); 1364 + ret = stm32_cryp_truncate_sg(&cryp->in_sg, &cryp->in_sg_len, in_sg, skip, align_size); 1365 + if (ret) { 1366 + kfree(cryp->header_sg); 1367 + return ret; 1368 + } 1369 + 1370 + /* For out buffer align_size is same as in buffer */ 1371 + ret = stm32_cryp_truncate_sg(&cryp->out_sg, &cryp->out_sg_len, out_sg, skip, align_size); 1372 + if (ret) { 1373 + kfree(cryp->header_sg); 1374 + kfree(cryp->in_sg); 1375 + return ret; 1376 + } 1377 + 1378 + ret = stm32_cryp_dma_check_sg(cryp->in_sg, align_size, AES_BLOCK_SIZE); 1379 + ret2 = stm32_cryp_dma_check_sg(cryp->out_sg, align_size, AES_BLOCK_SIZE); 1380 + if (ret == NO_DMA || ret2 == NO_DMA) { 1381 + kfree(cryp->in_sg); 1382 + cryp->in_sg = NULL; 1383 + 1384 + kfree(cryp->out_sg); 1385 + cryp->out_sg = NULL; 1386 + 1387 + cryp->flags &= ~FLG_IN_OUT_DMA; 1388 + } else { 1389 + cryp->flags |= FLG_IN_OUT_DMA; 1390 + } 1391 + 1392 + return 0; 1393 + } 1394 + 1423 1395 static int stm32_cryp_prepare_req(struct skcipher_request *req, 1424 1396 struct aead_request *areq) 1425 1397 { 1426 1398 struct stm32_cryp_ctx *ctx; 1427 1399 struct stm32_cryp *cryp; 1428 1400 struct stm32_cryp_reqctx *rctx; 1429 - struct scatterlist *in_sg; 1401 + struct scatterlist *in_sg, *out_sg; 1430 1402 int ret; 1431 1403 1432 1404 if (!req && !areq) ··· 1683 1169 rctx = req ? skcipher_request_ctx(req) : aead_request_ctx(areq); 1684 1170 rctx->mode &= FLG_MODE_MASK; 1685 1171 1686 - ctx->cryp = cryp; 1687 - 1688 1172 cryp->flags = (cryp->flags & ~FLG_MODE_MASK) | rctx->mode; 1689 1173 cryp->hw_blocksize = is_aes(cryp) ? 
AES_BLOCK_SIZE : DES_BLOCK_SIZE; 1690 1174 cryp->ctx = ctx; ··· 1694 1182 cryp->payload_in = req->cryptlen; 1695 1183 cryp->payload_out = req->cryptlen; 1696 1184 cryp->authsize = 0; 1185 + 1186 + in_sg = req->src; 1187 + out_sg = req->dst; 1188 + 1189 + ret = stm32_cryp_cipher_prepare(cryp, in_sg, out_sg); 1190 + if (ret) 1191 + return ret; 1192 + 1193 + ret = stm32_cryp_hw_init(cryp); 1697 1194 } else { 1698 1195 /* 1699 1196 * Length of input and output data: ··· 1732 1211 cryp->header_in = areq->assoclen; 1733 1212 cryp->payload_out = cryp->payload_in; 1734 1213 } 1735 - } 1736 1214 1737 - in_sg = req ? req->src : areq->src; 1738 - scatterwalk_start(&cryp->in_walk, in_sg); 1215 + in_sg = areq->src; 1216 + out_sg = areq->dst; 1739 1217 1740 - cryp->out_sg = req ? req->dst : areq->dst; 1741 - scatterwalk_start(&cryp->out_walk, cryp->out_sg); 1742 - 1743 - if (is_gcm(cryp) || is_ccm(cryp)) { 1218 + scatterwalk_start(&cryp->in_walk, in_sg); 1219 + scatterwalk_start(&cryp->out_walk, out_sg); 1744 1220 /* In output, jump after assoc data */ 1745 1221 scatterwalk_copychunks(NULL, &cryp->out_walk, cryp->areq->assoclen, 2); 1222 + 1223 + ret = stm32_cryp_hw_init(cryp); 1224 + if (ret) 1225 + return ret; 1226 + 1227 + ret = stm32_cryp_aead_prepare(cryp, in_sg, out_sg); 1746 1228 } 1747 1229 1748 - if (is_ctr(cryp)) 1749 - memset(cryp->last_ctr, 0, sizeof(cryp->last_ctr)); 1750 - 1751 - ret = stm32_cryp_hw_init(cryp); 1752 1230 return ret; 1753 1231 } 1754 1232 ··· 1759 1239 struct stm32_cryp_ctx *ctx = crypto_skcipher_ctx( 1760 1240 crypto_skcipher_reqtfm(req)); 1761 1241 struct stm32_cryp *cryp = ctx->cryp; 1242 + int ret; 1762 1243 1763 1244 if (!cryp) 1764 1245 return -ENODEV; 1765 1246 1766 - return stm32_cryp_prepare_req(req, NULL) ?: 1767 - stm32_cryp_cpu_start(cryp); 1247 + ret = stm32_cryp_prepare_req(req, NULL); 1248 + if (ret) 1249 + return ret; 1250 + 1251 + if (cryp->flags & FLG_IN_OUT_DMA) 1252 + ret = stm32_cryp_dma_start(cryp); 1253 + else 1254 + ret = 
stm32_cryp_it_start(cryp); 1255 + 1256 + if (ret == -ETIMEDOUT) 1257 + stm32_cryp_finish_req(cryp, ret); 1258 + 1259 + return ret; 1768 1260 } 1769 1261 1770 1262 static int stm32_cryp_aead_one_req(struct crypto_engine *engine, void *areq) ··· 1794 1262 if (err) 1795 1263 return err; 1796 1264 1797 - if (unlikely(!cryp->payload_in && !cryp->header_in)) { 1265 + if (!stm32_cryp_get_input_text_len(cryp) && !cryp->header_in && 1266 + !(cryp->flags & FLG_HEADER_DMA)) { 1798 1267 /* No input data to process: get tag and finish */ 1799 1268 stm32_cryp_finish_req(cryp, 0); 1800 1269 return 0; 1801 1270 } 1802 1271 1803 - return stm32_cryp_cpu_start(cryp); 1272 + if (cryp->flags & FLG_HEADER_DMA) 1273 + return stm32_cryp_header_dma_start(cryp); 1274 + 1275 + if (!cryp->header_in && cryp->flags & FLG_IN_OUT_DMA) 1276 + return stm32_cryp_dma_start(cryp); 1277 + 1278 + return stm32_cryp_it_start(cryp); 1804 1279 } 1805 1280 1806 1281 static int stm32_cryp_read_auth_tag(struct stm32_cryp *cryp) ··· 2204 1665 it_mask &= ~IMSCR_OUT; 2205 1666 stm32_cryp_write(cryp, cryp->caps->imsc, it_mask); 2206 1667 2207 - if (!cryp->payload_in && !cryp->header_in && !cryp->payload_out) 1668 + if (!cryp->payload_in && !cryp->header_in && !cryp->payload_out) { 1669 + local_bh_disable(); 2208 1670 stm32_cryp_finish_req(cryp, 0); 1671 + local_bh_enable(); 1672 + } 2209 1673 2210 1674 return IRQ_HANDLED; 2211 1675 } ··· 2222 1680 return IRQ_WAKE_THREAD; 2223 1681 } 2224 1682 1683 + static int stm32_cryp_dma_init(struct stm32_cryp *cryp) 1684 + { 1685 + struct dma_slave_config dma_conf; 1686 + struct dma_chan *chan; 1687 + int ret; 1688 + 1689 + memset(&dma_conf, 0, sizeof(dma_conf)); 1690 + 1691 + dma_conf.direction = DMA_MEM_TO_DEV; 1692 + dma_conf.dst_addr = cryp->phys_base + cryp->caps->din; 1693 + dma_conf.dst_addr_width = DMA_SLAVE_BUSWIDTH_4_BYTES; 1694 + dma_conf.dst_maxburst = CRYP_DMA_BURST_REG; 1695 + dma_conf.device_fc = false; 1696 + 1697 + chan = dma_request_chan(cryp->dev, "in"); 
1698 + if (IS_ERR(chan)) 1699 + return PTR_ERR(chan); 1700 + 1701 + cryp->dma_lch_in = chan; 1702 + ret = dmaengine_slave_config(cryp->dma_lch_in, &dma_conf); 1703 + if (ret) { 1704 + dma_release_channel(cryp->dma_lch_in); 1705 + cryp->dma_lch_in = NULL; 1706 + dev_err(cryp->dev, "Couldn't configure DMA in slave.\n"); 1707 + return ret; 1708 + } 1709 + 1710 + memset(&dma_conf, 0, sizeof(dma_conf)); 1711 + 1712 + dma_conf.direction = DMA_DEV_TO_MEM; 1713 + dma_conf.src_addr = cryp->phys_base + cryp->caps->dout; 1714 + dma_conf.src_addr_width = DMA_SLAVE_BUSWIDTH_4_BYTES; 1715 + dma_conf.src_maxburst = CRYP_DMA_BURST_REG; 1716 + dma_conf.device_fc = false; 1717 + 1718 + chan = dma_request_chan(cryp->dev, "out"); 1719 + if (IS_ERR(chan)) { 1720 + dma_release_channel(cryp->dma_lch_in); 1721 + cryp->dma_lch_in = NULL; 1722 + return PTR_ERR(chan); 1723 + } 1724 + 1725 + cryp->dma_lch_out = chan; 1726 + 1727 + ret = dmaengine_slave_config(cryp->dma_lch_out, &dma_conf); 1728 + if (ret) { 1729 + dma_release_channel(cryp->dma_lch_out); 1730 + cryp->dma_lch_out = NULL; 1731 + dev_err(cryp->dev, "Couldn't configure DMA out slave.\n"); 1732 + dma_release_channel(cryp->dma_lch_in); 1733 + cryp->dma_lch_in = NULL; 1734 + return ret; 1735 + } 1736 + 1737 + init_completion(&cryp->dma_completion); 1738 + 1739 + return 0; 1740 + } 1741 + 2225 1742 static struct skcipher_engine_alg crypto_algs[] = { 2226 1743 { 2227 1744 .base = { 2228 1745 .base.cra_name = "ecb(aes)", 2229 1746 .base.cra_driver_name = "stm32-ecb-aes", 2230 - .base.cra_priority = 200, 2231 - .base.cra_flags = CRYPTO_ALG_ASYNC, 1747 + .base.cra_priority = 300, 1748 + .base.cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_KERN_DRIVER_ONLY, 2232 1749 .base.cra_blocksize = AES_BLOCK_SIZE, 2233 1750 .base.cra_ctxsize = sizeof(struct stm32_cryp_ctx), 2234 1751 .base.cra_alignmask = 0, ··· 2308 1707 .base = { 2309 1708 .base.cra_name = "cbc(aes)", 2310 1709 .base.cra_driver_name = "stm32-cbc-aes", 2311 - .base.cra_priority = 200, 
2312 - .base.cra_flags = CRYPTO_ALG_ASYNC, 1710 + .base.cra_priority = 300, 1711 + .base.cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_KERN_DRIVER_ONLY, 2313 1712 .base.cra_blocksize = AES_BLOCK_SIZE, 2314 1713 .base.cra_ctxsize = sizeof(struct stm32_cryp_ctx), 2315 1714 .base.cra_alignmask = 0, ··· 2331 1730 .base = { 2332 1731 .base.cra_name = "ctr(aes)", 2333 1732 .base.cra_driver_name = "stm32-ctr-aes", 2334 - .base.cra_priority = 200, 2335 - .base.cra_flags = CRYPTO_ALG_ASYNC, 1733 + .base.cra_priority = 300, 1734 + .base.cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_KERN_DRIVER_ONLY, 2336 1735 .base.cra_blocksize = 1, 2337 1736 .base.cra_ctxsize = sizeof(struct stm32_cryp_ctx), 2338 1737 .base.cra_alignmask = 0, ··· 2354 1753 .base = { 2355 1754 .base.cra_name = "ecb(des)", 2356 1755 .base.cra_driver_name = "stm32-ecb-des", 2357 - .base.cra_priority = 200, 2358 - .base.cra_flags = CRYPTO_ALG_ASYNC, 1756 + .base.cra_priority = 300, 1757 + .base.cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_KERN_DRIVER_ONLY, 2359 1758 .base.cra_blocksize = DES_BLOCK_SIZE, 2360 1759 .base.cra_ctxsize = sizeof(struct stm32_cryp_ctx), 2361 1760 .base.cra_alignmask = 0, ··· 2376 1775 .base = { 2377 1776 .base.cra_name = "cbc(des)", 2378 1777 .base.cra_driver_name = "stm32-cbc-des", 2379 - .base.cra_priority = 200, 2380 - .base.cra_flags = CRYPTO_ALG_ASYNC, 1778 + .base.cra_priority = 300, 1779 + .base.cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_KERN_DRIVER_ONLY, 2381 1780 .base.cra_blocksize = DES_BLOCK_SIZE, 2382 1781 .base.cra_ctxsize = sizeof(struct stm32_cryp_ctx), 2383 1782 .base.cra_alignmask = 0, ··· 2399 1798 .base = { 2400 1799 .base.cra_name = "ecb(des3_ede)", 2401 1800 .base.cra_driver_name = "stm32-ecb-des3", 2402 - .base.cra_priority = 200, 2403 - .base.cra_flags = CRYPTO_ALG_ASYNC, 1801 + .base.cra_priority = 300, 1802 + .base.cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_KERN_DRIVER_ONLY, 2404 1803 .base.cra_blocksize = DES_BLOCK_SIZE, 2405 1804 .base.cra_ctxsize = sizeof(struct 
stm32_cryp_ctx), 2406 1805 .base.cra_alignmask = 0, ··· 2421 1820 .base = { 2422 1821 .base.cra_name = "cbc(des3_ede)", 2423 1822 .base.cra_driver_name = "stm32-cbc-des3", 2424 - .base.cra_priority = 200, 2425 - .base.cra_flags = CRYPTO_ALG_ASYNC, 1823 + .base.cra_priority = 300, 1824 + .base.cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_KERN_DRIVER_ONLY, 2426 1825 .base.cra_blocksize = DES_BLOCK_SIZE, 2427 1826 .base.cra_ctxsize = sizeof(struct stm32_cryp_ctx), 2428 1827 .base.cra_alignmask = 0, ··· 2455 1854 .base.base = { 2456 1855 .cra_name = "gcm(aes)", 2457 1856 .cra_driver_name = "stm32-gcm-aes", 2458 - .cra_priority = 200, 2459 - .cra_flags = CRYPTO_ALG_ASYNC, 1857 + .cra_priority = 300, 1858 + .cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_KERN_DRIVER_ONLY, 2460 1859 .cra_blocksize = 1, 2461 1860 .cra_ctxsize = sizeof(struct stm32_cryp_ctx), 2462 1861 .cra_alignmask = 0, ··· 2478 1877 .base.base = { 2479 1878 .cra_name = "ccm(aes)", 2480 1879 .cra_driver_name = "stm32-ccm-aes", 2481 - .cra_priority = 200, 2482 - .cra_flags = CRYPTO_ALG_ASYNC, 1880 + .cra_priority = 300, 1881 + .cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_KERN_DRIVER_ONLY, 2483 1882 .cra_blocksize = 1, 2484 1883 .cra_ctxsize = sizeof(struct stm32_cryp_ctx), 2485 1884 .cra_alignmask = 0, ··· 2502 1901 .sr = UX500_CRYP_SR, 2503 1902 .din = UX500_CRYP_DIN, 2504 1903 .dout = UX500_CRYP_DOUT, 1904 + .dmacr = UX500_CRYP_DMACR, 2505 1905 .imsc = UX500_CRYP_IMSC, 2506 1906 .mis = UX500_CRYP_MIS, 2507 1907 .k1l = UX500_CRYP_K1L, ··· 2525 1923 .sr = CRYP_SR, 2526 1924 .din = CRYP_DIN, 2527 1925 .dout = CRYP_DOUT, 1926 + .dmacr = CRYP_DMACR, 2528 1927 .imsc = CRYP_IMSCR, 2529 1928 .mis = CRYP_MISR, 2530 1929 .k1l = CRYP_K1LR, ··· 2548 1945 .sr = CRYP_SR, 2549 1946 .din = CRYP_DIN, 2550 1947 .dout = CRYP_DOUT, 1948 + .dmacr = CRYP_DMACR, 2551 1949 .imsc = CRYP_IMSCR, 2552 1950 .mis = CRYP_MISR, 2553 1951 .k1l = CRYP_K1LR, ··· 2588 1984 cryp->regs = devm_platform_ioremap_resource(pdev, 0); 2589 1985 if 
(IS_ERR(cryp->regs)) 2590 1986 return PTR_ERR(cryp->regs); 1987 + 1988 + cryp->phys_base = platform_get_resource(pdev, IORESOURCE_MEM, 0)->start; 2591 1989 2592 1990 irq = platform_get_irq(pdev, 0); 2593 1991 if (irq < 0) ··· 2636 2030 2637 2031 platform_set_drvdata(pdev, cryp); 2638 2032 2033 + ret = stm32_cryp_dma_init(cryp); 2034 + switch (ret) { 2035 + case 0: 2036 + break; 2037 + case -ENODEV: 2038 + dev_dbg(dev, "DMA mode not available\n"); 2039 + break; 2040 + default: 2041 + goto err_dma; 2042 + } 2043 + 2639 2044 spin_lock(&cryp_list.lock); 2640 2045 list_add(&cryp->list, &cryp_list.dev_list); 2641 2046 spin_unlock(&cryp_list.lock); ··· 2692 2075 spin_lock(&cryp_list.lock); 2693 2076 list_del(&cryp->list); 2694 2077 spin_unlock(&cryp_list.lock); 2078 + 2079 + if (cryp->dma_lch_in) 2080 + dma_release_channel(cryp->dma_lch_in); 2081 + if (cryp->dma_lch_out) 2082 + dma_release_channel(cryp->dma_lch_out); 2083 + err_dma: 2695 2084 err_rst: 2696 2085 pm_runtime_disable(dev); 2697 2086 pm_runtime_put_noidle(dev); ··· 2723 2100 spin_lock(&cryp_list.lock); 2724 2101 list_del(&cryp->list); 2725 2102 spin_unlock(&cryp_list.lock); 2103 + 2104 + if (cryp->dma_lch_in) 2105 + dma_release_channel(cryp->dma_lch_in); 2106 + 2107 + if (cryp->dma_lch_out) 2108 + dma_release_channel(cryp->dma_lch_out); 2726 2109 2727 2110 pm_runtime_disable(cryp->dev); 2728 2111 pm_runtime_put_noidle(cryp->dev);
-1
drivers/crypto/tegra/tegra-se-main.c
··· 326 326 327 327 crypto_engine_stop(se->engine); 328 328 crypto_engine_exit(se->engine); 329 - iommu_fwspec_free(se->dev); 330 329 host1x_client_unregister(&se->client); 331 330 } 332 331
+1
drivers/crypto/xilinx/zynqmp-aes-gcm.c
··· 446 446 }; 447 447 448 448 module_platform_driver(zynqmp_aes_driver); 449 + MODULE_DESCRIPTION("Xilinx ZynqMP AES Driver"); 449 450 MODULE_LICENSE("GPL");
+3
include/crypto/internal/ecc.h
··· 63 63 * @nbytes Size of input byte array 64 64 * @out Output digits array 65 65 * @ndigits: Number of digits to create from byte array 66 + * 67 + * The first byte in the input byte array is expected to hold the most 68 + * significant bits of the large integer. 66 69 */ 67 70 void ecc_digits_from_bytes(const u8 *in, unsigned int nbytes, 68 71 u64 *out, unsigned int ndigits);
-28
include/crypto/sm2.h
··· 1 - /* SPDX-License-Identifier: GPL-2.0-or-later */ 2 - /* 3 - * sm2.h - SM2 asymmetric public-key algorithm 4 - * as specified by OSCCA GM/T 0003.1-2012 -- 0003.5-2012 SM2 and 5 - * described at https://tools.ietf.org/html/draft-shen-sm2-ecdsa-02 6 - * 7 - * Copyright (c) 2020, Alibaba Group. 8 - * Written by Tianjia Zhang <tianjia.zhang@linux.alibaba.com> 9 - */ 10 - 11 - #ifndef _CRYPTO_SM2_H 12 - #define _CRYPTO_SM2_H 13 - 14 - struct shash_desc; 15 - 16 - #if IS_REACHABLE(CONFIG_CRYPTO_SM2) 17 - int sm2_compute_z_digest(struct shash_desc *desc, 18 - const void *key, unsigned int keylen, void *dgst); 19 - #else 20 - static inline int sm2_compute_z_digest(struct shash_desc *desc, 21 - const void *key, unsigned int keylen, 22 - void *dgst) 23 - { 24 - return -ENOTSUPP; 25 - } 26 - #endif 27 - 28 - #endif /* _CRYPTO_SM2_H */
+1 -2
include/linux/hw_random.h
··· 13 13 #define LINUX_HWRANDOM_H_ 14 14 15 15 #include <linux/completion.h> 16 - #include <linux/types.h> 17 - #include <linux/list.h> 18 16 #include <linux/kref.h> 17 + #include <linux/types.h> 19 18 20 19 /** 21 20 * struct hwrng - Hardware Random Number Generator driver
+3 -2
include/linux/psp-platform-access.h
··· 6 6 #include <linux/psp.h> 7 7 8 8 enum psp_platform_access_msg { 9 - PSP_CMD_NONE = 0x0, 10 - PSP_I2C_REQ_BUS_CMD = 0x64, 9 + PSP_CMD_NONE = 0x0, 10 + PSP_CMD_HSTI_QUERY = 0x14, 11 + PSP_I2C_REQ_BUS_CMD = 0x64, 11 12 PSP_DYNAMIC_BOOST_GET_NONCE, 12 13 PSP_DYNAMIC_BOOST_SET_UID, 13 14 PSP_DYNAMIC_BOOST_GET_PARAMETER,
+1
lib/crypto/arc4.c
··· 71 71 } 72 72 EXPORT_SYMBOL(arc4_crypt); 73 73 74 + MODULE_DESCRIPTION("ARC4 Cipher Algorithm"); 74 75 MODULE_LICENSE("GPL");
+1
lib/crypto/des.c
··· 899 899 } 900 900 EXPORT_SYMBOL_GPL(des3_ede_decrypt); 901 901 902 + MODULE_DESCRIPTION("DES & Triple DES EDE Cipher Algorithms"); 902 903 MODULE_LICENSE("GPL");
+1
lib/crypto/libchacha.c
··· 32 32 } 33 33 EXPORT_SYMBOL(chacha_crypt_generic); 34 34 35 + MODULE_DESCRIPTION("ChaCha stream cipher (RFC7539)"); 35 36 MODULE_LICENSE("GPL");
+2 -4
lib/crypto/mpi/ec.c
··· 1285 1285 sum = &p2_; 1286 1286 1287 1287 for (j = nbits-1; j >= 0; j--) { 1288 - MPI_POINT t; 1289 - 1290 1288 sw = mpi_test_bit(scalar, j); 1291 1289 point_swap_cond(q1, q2, sw, ctx); 1292 1290 montgomery_ladder(prd, sum, q1, q2, point->x, ctx); 1293 1291 point_swap_cond(prd, sum, sw, ctx); 1294 - t = q1; q1 = prd; prd = t; 1295 - t = q2; q2 = sum; sum = t; 1292 + swap(q1, prd); 1293 + swap(q2, sum); 1296 1294 } 1297 1295 1298 1296 mpi_clear(result->y);
+4 -6
lib/crypto/mpi/mpi-bit.c
··· 212 212 return; 213 213 } 214 214 215 - if (nlimbs) { 216 - for (i = 0; i < x->nlimbs - nlimbs; i++) 217 - x->d[i] = x->d[i+nlimbs]; 218 - x->d[i] = 0; 219 - x->nlimbs -= nlimbs; 220 - } 215 + for (i = 0; i < x->nlimbs - nlimbs; i++) 216 + x->d[i] = x->d[i+nlimbs]; 217 + x->d[i] = 0; 218 + x->nlimbs -= nlimbs; 221 219 222 220 if (x->nlimbs && nbits) 223 221 mpihelp_rshift(x->d, x->d, x->nlimbs, nbits);
+2 -7
lib/crypto/mpi/mpi-pow.c
··· 176 176 177 177 for (;;) { 178 178 while (c) { 179 - mpi_ptr_t tp; 180 179 mpi_size_t xsize; 181 180 182 181 /*if (mpihelp_mul_n(xp, rp, rp, rsize) < 0) goto enomem */ ··· 206 207 xsize = msize; 207 208 } 208 209 209 - tp = rp; 210 - rp = xp; 211 - xp = tp; 210 + swap(rp, xp); 212 211 rsize = xsize; 213 212 214 213 if ((mpi_limb_signed_t) e < 0) { ··· 232 235 xsize = msize; 233 236 } 234 237 235 - tp = rp; 236 - rp = xp; 237 - xp = tp; 238 + swap(rp, xp); 238 239 rsize = xsize; 239 240 } 240 241 e <<= 1;
+1
lib/crypto/poly1305.c
··· 76 76 77 77 MODULE_LICENSE("GPL"); 78 78 MODULE_AUTHOR("Martin Willi <martin@strongswan.org>"); 79 + MODULE_DESCRIPTION("Poly1305 authenticator algorithm, RFC7539");
+1
lib/crypto/sha1.c
··· 137 137 } 138 138 EXPORT_SYMBOL(sha1_init); 139 139 140 + MODULE_DESCRIPTION("SHA-1 Algorithm"); 140 141 MODULE_LICENSE("GPL");
+1
lib/crypto/sha256.c
··· 165 165 } 166 166 EXPORT_SYMBOL(sha256); 167 167 168 + MODULE_DESCRIPTION("SHA-256 Algorithm"); 168 169 MODULE_LICENSE("GPL");
+1
lib/crypto/utils.c
··· 85 85 } 86 86 EXPORT_SYMBOL_GPL(__crypto_xor); 87 87 88 + MODULE_DESCRIPTION("Crypto library utility functions"); 88 89 MODULE_LICENSE("GPL");
+1 -2
security/integrity/digsig_asymmetric.c
··· 114 114 } else if (!strncmp(pk->pkey_algo, "ecdsa-", 6)) { 115 115 /* edcsa-nist-p192 etc. */ 116 116 pks.encoding = "x962"; 117 - } else if (!strcmp(pk->pkey_algo, "ecrdsa") || 118 - !strcmp(pk->pkey_algo, "sm2")) { 117 + } else if (!strcmp(pk->pkey_algo, "ecrdsa")) { 119 118 pks.encoding = "raw"; 120 119 } else { 121 120 ret = -ENOPKG;