lib/crypto: arm/blake2b: Migrate optimized code into library

Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

kernel os linux

Migrate the arm-optimized BLAKE2b code from arch/arm/crypto/ to
lib/crypto/arm/. This makes the BLAKE2b library able to use it, and it
also simplifies the code because it's easier to integrate with the
library than crypto_shash.

This temporarily makes the arm-optimized BLAKE2b code unavailable via
crypto_shash. A later commit reimplements the blake2b-* crypto_shash
algorithms on top of the BLAKE2b library API, making it available again.

Note that as per the lib/crypto/ convention, the optimized code is now
enabled by default. So, this also fixes the longstanding issue where
the optimized BLAKE2b code was not enabled by default.

To see the diff from arch/arm/crypto/blake2b-neon-glue.c to
lib/crypto/arm/blake2b.h, view this commit with 'git show -M10'.

Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20251018043106.375964-8-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>

Eric Biggers 5 months ago ba6617bd 23a16c95

+59 -135

7 changed files

expand all

arch

arm

crypto

Kconfig

Makefile

blake2b-neon-glue.c

lib

crypto

Kconfig

Makefile

arm

blake2b-neon-core.S

blake2b.h

-16

arch/arm/crypto/Kconfig

··· 33 33 Architecture: arm using: 34 34 - NEON (Advanced SIMD) extensions 35 35 36 - config CRYPTO_BLAKE2B_NEON 37 - tristate "Hash functions: BLAKE2b (NEON)" 38 - depends on KERNEL_MODE_NEON 39 - select CRYPTO_BLAKE2B 40 - help 41 - BLAKE2b cryptographic hash function (RFC 7693) 42 - 43 - Architecture: arm using 44 - - NEON (Advanced SIMD) extensions 45 - 46 - BLAKE2b digest algorithm optimized with ARM NEON instructions. 47 - On ARM processors that have NEON support but not the ARMv8 48 - Crypto Extensions, typically this BLAKE2b implementation is 49 - much faster than the SHA-2 family and slightly faster than 50 - SHA-1. 51 - 52 36 config CRYPTO_AES_ARM 53 37 tristate "Ciphers: AES" 54 38 select CRYPTO_ALGAPI

-2

arch/arm/crypto/Makefile

··· 5 5 6 6 obj-$(CONFIG_CRYPTO_AES_ARM) += aes-arm.o 7 7 obj-$(CONFIG_CRYPTO_AES_ARM_BS) += aes-arm-bs.o 8 - obj-$(CONFIG_CRYPTO_BLAKE2B_NEON) += blake2b-neon.o 9 8 obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o 10 9 11 10 obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o ··· 12 13 13 14 aes-arm-y := aes-cipher-core.o aes-cipher-glue.o 14 15 aes-arm-bs-y := aes-neonbs-core.o aes-neonbs-glue.o 15 - blake2b-neon-y := blake2b-neon-core.o blake2b-neon-glue.o 16 16 aes-arm-ce-y := aes-ce-core.o aes-ce-glue.o 17 17 ghash-arm-ce-y := ghash-ce-core.o ghash-ce-glue.o 18 18 nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o

+16 -13

arch/arm/crypto/blake2b-neon-core.S lib/crypto/arm/blake2b-neon-core.S

··· 1 1 /* SPDX-License-Identifier: GPL-2.0-or-later */ 2 2 /* 3 - * BLAKE2b digest algorithm, NEON accelerated 3 + * BLAKE2b digest algorithm optimized with ARM NEON instructions. On ARM 4 + * processors that have NEON support but not the ARMv8 Crypto Extensions, 5 + * typically this BLAKE2b implementation is much faster than the SHA-2 family 6 + * and slightly faster than SHA-1. 4 7 * 5 8 * Copyright 2020 Google LLC 6 9 * ··· 16 13 .fpu neon 17 14 18 15 // The arguments to blake2b_compress_neon() 19 - STATE .req r0 20 - BLOCK .req r1 16 + CTX .req r0 17 + DATA .req r1 21 18 NBLOCKS .req r2 22 19 INC .req r3 23 20 ··· 237 234 .endm 238 235 239 236 // 240 - // void blake2b_compress_neon(struct blake2b_state *state, 241 - // const u8 *block, size_t nblocks, u32 inc); 237 + // void blake2b_compress_neon(struct blake2b_ctx *ctx, 238 + // const u8 *data, size_t nblocks, u32 inc); 242 239 // 243 - // Only the first three fields of struct blake2b_state are used: 240 + // Only the first three fields of struct blake2b_ctx are used: 244 241 // u64 h[8]; (inout) 245 242 // u64 t[2]; (inout) 246 243 // u64 f[2]; (in) ··· 258 255 adr ROR24_TABLE, .Lror24_table 259 256 adr ROR16_TABLE, .Lror16_table 260 257 261 - mov ip, STATE 258 + mov ip, CTX 262 259 vld1.64 {q0-q1}, [ip]! // Load h[0..3] 263 260 vld1.64 {q2-q3}, [ip]! // Load h[4..7] 264 261 .Lnext_block: ··· 284 281 // (q8-q9) in an aligned buffer on the stack so that they can be 285 282 // reloaded when needed. (We could just reload directly from the 286 283 // message buffer, but it's faster to use aligned loads.) 287 - vld1.8 {q8-q9}, [BLOCK]! 284 + vld1.8 {q8-q9}, [DATA]! 288 285 veor q6, q6, q14 // v[12..13] = IV[4..5] ^ t[0..1] 289 - vld1.8 {q10-q11}, [BLOCK]! 286 + vld1.8 {q10-q11}, [DATA]! 290 287 veor q7, q7, q15 // v[14..15] = IV[6..7] ^ f[0..1] 291 - vld1.8 {q12-q13}, [BLOCK]! 288 + vld1.8 {q12-q13}, [DATA]! 292 289 vst1.8 {q8-q9}, [sp, :256] 293 - mov ip, STATE 294 - vld1.8 {q14-q15}, [BLOCK]! 290 + mov ip, CTX 291 + vld1.8 {q14-q15}, [DATA]! 295 292 296 293 // Execute the rounds. Each round is provided the order in which it 297 294 // needs to use the message words. ··· 322 319 veor q3, q3, q7 // v[6..7] ^= v[14..15] 323 320 veor q0, q0, q8 // v[0..1] ^= h[0..1] 324 321 veor q1, q1, q9 // v[2..3] ^= h[2..3] 325 - mov ip, STATE 322 + mov ip, CTX 326 323 subs NBLOCKS, NBLOCKS, #1 // nblocks-- 327 324 vst1.64 {q0-q1}, [ip]! // Store new h[0..3] 328 325 veor q2, q2, q10 // v[4..5] ^= h[4..5]

-104

arch/arm/crypto/blake2b-neon-glue.c

··· 1 - // SPDX-License-Identifier: GPL-2.0-or-later 2 - /* 3 - * BLAKE2b digest algorithm, NEON accelerated 4 - * 5 - * Copyright 2020 Google LLC 6 - */ 7 - 8 - #include <crypto/internal/blake2b.h> 9 - #include <crypto/internal/hash.h> 10 - 11 - #include <linux/module.h> 12 - #include <linux/sizes.h> 13 - 14 - #include <asm/neon.h> 15 - #include <asm/simd.h> 16 - 17 - asmlinkage void blake2b_compress_neon(struct blake2b_state *state, 18 - const u8 *block, size_t nblocks, u32 inc); 19 - 20 - static void blake2b_compress_arch(struct blake2b_state *state, 21 - const u8 *block, size_t nblocks, u32 inc) 22 - { 23 - do { 24 - const size_t blocks = min_t(size_t, nblocks, 25 - SZ_4K / BLAKE2B_BLOCK_SIZE); 26 - 27 - kernel_neon_begin(); 28 - blake2b_compress_neon(state, block, blocks, inc); 29 - kernel_neon_end(); 30 - 31 - nblocks -= blocks; 32 - block += blocks * BLAKE2B_BLOCK_SIZE; 33 - } while (nblocks); 34 - } 35 - 36 - static int crypto_blake2b_update_neon(struct shash_desc *desc, 37 - const u8 *in, unsigned int inlen) 38 - { 39 - return crypto_blake2b_update_bo(desc, in, inlen, blake2b_compress_arch); 40 - } 41 - 42 - static int crypto_blake2b_finup_neon(struct shash_desc *desc, const u8 *in, 43 - unsigned int inlen, u8 *out) 44 - { 45 - return crypto_blake2b_finup(desc, in, inlen, out, 46 - blake2b_compress_arch); 47 - } 48 - 49 - #define BLAKE2B_ALG(name, driver_name, digest_size) \ 50 - { \ 51 - .base.cra_name = name, \ 52 - .base.cra_driver_name = driver_name, \ 53 - .base.cra_priority = 200, \ 54 - .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY | \ 55 - CRYPTO_AHASH_ALG_BLOCK_ONLY | \ 56 - CRYPTO_AHASH_ALG_FINAL_NONZERO, \ 57 - .base.cra_blocksize = BLAKE2B_BLOCK_SIZE, \ 58 - .base.cra_ctxsize = sizeof(struct blake2b_tfm_ctx), \ 59 - .base.cra_module = THIS_MODULE, \ 60 - .digestsize = digest_size, \ 61 - .setkey = crypto_blake2b_setkey, \ 62 - .init = crypto_blake2b_init, \ 63 - .update = crypto_blake2b_update_neon, \ 64 - .finup = crypto_blake2b_finup_neon, \ 65 - .descsize = sizeof(struct blake2b_state), \ 66 - .statesize = BLAKE2B_STATE_SIZE, \ 67 - } 68 - 69 - static struct shash_alg blake2b_neon_algs[] = { 70 - BLAKE2B_ALG("blake2b-160", "blake2b-160-neon", BLAKE2B_160_HASH_SIZE), 71 - BLAKE2B_ALG("blake2b-256", "blake2b-256-neon", BLAKE2B_256_HASH_SIZE), 72 - BLAKE2B_ALG("blake2b-384", "blake2b-384-neon", BLAKE2B_384_HASH_SIZE), 73 - BLAKE2B_ALG("blake2b-512", "blake2b-512-neon", BLAKE2B_512_HASH_SIZE), 74 - }; 75 - 76 - static int __init blake2b_neon_mod_init(void) 77 - { 78 - if (!(elf_hwcap & HWCAP_NEON)) 79 - return -ENODEV; 80 - 81 - return crypto_register_shashes(blake2b_neon_algs, 82 - ARRAY_SIZE(blake2b_neon_algs)); 83 - } 84 - 85 - static void __exit blake2b_neon_mod_exit(void) 86 - { 87 - crypto_unregister_shashes(blake2b_neon_algs, 88 - ARRAY_SIZE(blake2b_neon_algs)); 89 - } 90 - 91 - module_init(blake2b_neon_mod_init); 92 - module_exit(blake2b_neon_mod_exit); 93 - 94 - MODULE_DESCRIPTION("BLAKE2b digest algorithm, NEON accelerated"); 95 - MODULE_LICENSE("GPL"); 96 - MODULE_AUTHOR("Eric Biggers <ebiggers@google.com>"); 97 - MODULE_ALIAS_CRYPTO("blake2b-160"); 98 - MODULE_ALIAS_CRYPTO("blake2b-160-neon"); 99 - MODULE_ALIAS_CRYPTO("blake2b-256"); 100 - MODULE_ALIAS_CRYPTO("blake2b-256-neon"); 101 - MODULE_ALIAS_CRYPTO("blake2b-384"); 102 - MODULE_ALIAS_CRYPTO("blake2b-384-neon"); 103 - MODULE_ALIAS_CRYPTO("blake2b-512"); 104 - MODULE_ALIAS_CRYPTO("blake2b-512-neon");

lib/crypto/Kconfig

··· 37 37 config CRYPTO_LIB_BLAKE2B_ARCH 38 38 bool 39 39 depends on CRYPTO_LIB_BLAKE2B && !UML 40 + default y if ARM && KERNEL_MODE_NEON 40 41 41 42 # BLAKE2s support is always built-in, so there's no CRYPTO_LIB_BLAKE2S option. 42 43

lib/crypto/Makefile

··· 36 36 CFLAGS_blake2b.o := -Wframe-larger-than=4096 # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105930 37 37 ifeq ($(CONFIG_CRYPTO_LIB_BLAKE2B_ARCH),y) 38 38 CFLAGS_blake2b.o += -I$(src)/$(SRCARCH) 39 + libblake2b-$(CONFIG_ARM) += arm/blake2b-neon-core.o 39 40 endif # CONFIG_CRYPTO_LIB_BLAKE2B_ARCH 40 41 41 42 ################################################################################

+41

lib/crypto/arm/blake2b.h

··· 1 + /* SPDX-License-Identifier: GPL-2.0-or-later */ 2 + /* 3 + * BLAKE2b digest algorithm, NEON accelerated 4 + * 5 + * Copyright 2020 Google LLC 6 + */ 7 + 8 + #include <asm/neon.h> 9 + #include <asm/simd.h> 10 + 11 + static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); 12 + 13 + asmlinkage void blake2b_compress_neon(struct blake2b_ctx *ctx, 14 + const u8 *data, size_t nblocks, u32 inc); 15 + 16 + static void blake2b_compress(struct blake2b_ctx *ctx, 17 + const u8 *data, size_t nblocks, u32 inc) 18 + { 19 + if (!static_branch_likely(&have_neon) || !may_use_simd()) { 20 + blake2b_compress_generic(ctx, data, nblocks, inc); 21 + return; 22 + } 23 + do { 24 + const size_t blocks = min_t(size_t, nblocks, 25 + SZ_4K / BLAKE2B_BLOCK_SIZE); 26 + 27 + kernel_neon_begin(); 28 + blake2b_compress_neon(ctx, data, blocks, inc); 29 + kernel_neon_end(); 30 + 31 + data += blocks * BLAKE2B_BLOCK_SIZE; 32 + nblocks -= blocks; 33 + } while (nblocks); 34 + } 35 + 36 + #define blake2b_mod_init_arch blake2b_mod_init_arch 37 + static void blake2b_mod_init_arch(void) 38 + { 39 + if (elf_hwcap & HWCAP_NEON) 40 + static_branch_enable(&have_neon); 41 + }