Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

crypto: mips/chacha - wire up accelerated 32r2 code from Zinc

This integrates the accelerated MIPS 32r2 implementation of ChaCha
into both the API and library interfaces of the kernel crypto stack.

The significance of this is that, in addition to becoming available
as an accelerated library implementation, it can also be used by
existing crypto API code such as Adiantum (for block encryption on
ultra-low-performance cores) or IPsec using chacha20poly1305. These
are use cases that have already opted into using the abstract crypto
API. In order to support Adiantum, the core assembler routine has
been adapted to take the round count as a function argument rather
than hardcoding it to 20.

Co-developed-by: René van Dorst <opensource@vdorst.com>
Signed-off-by: René van Dorst <opensource@vdorst.com>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Authored by Ard Biesheuvel and committed by Herbert Xu
3a2f58f3 49aa7c00

+277 -44
+1 -1
arch/mips/Makefile
··· 323 323 # See arch/mips/Kbuild for content of core part of the kernel 324 324 core-y += arch/mips/ 325 325 326 - drivers-$(CONFIG_MIPS_CRC_SUPPORT) += arch/mips/crypto/ 326 + drivers-y += arch/mips/crypto/ 327 327 drivers-$(CONFIG_OPROFILE) += arch/mips/oprofile/ 328 328 329 329 # suspend and hibernation support
+4
arch/mips/crypto/Makefile
··· 4 4 # 5 5 6 6 obj-$(CONFIG_CRYPTO_CRC32_MIPS) += crc32-mips.o 7 + 8 + obj-$(CONFIG_CRYPTO_CHACHA_MIPS) += chacha-mips.o 9 + chacha-mips-y := chacha-core.o chacha-glue.o 10 + AFLAGS_chacha-core.o += -O2 # needed to fill branch delay slots
+116 -43
arch/mips/crypto/chacha-core.S
··· 125 125 #define CONCAT3(a,b,c) _CONCAT3(a,b,c) 126 126 127 127 #define STORE_UNALIGNED(x) \ 128 - CONCAT3(.Lchacha20_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \ 128 + CONCAT3(.Lchacha_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \ 129 129 .if (x != 12); \ 130 130 lw T0, (x*4)(STATE); \ 131 131 .endif; \ ··· 142 142 swr X ## x, (x*4)+LSB ## (OUT); 143 143 144 144 #define STORE_ALIGNED(x) \ 145 - CONCAT3(.Lchacha20_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \ 145 + CONCAT3(.Lchacha_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \ 146 146 .if (x != 12); \ 147 147 lw T0, (x*4)(STATE); \ 148 148 .endif; \ ··· 162 162 * Every jumptable entry must be equal in size. 163 163 */ 164 164 #define JMPTBL_ALIGNED(x) \ 165 - .Lchacha20_mips_jmptbl_aligned_ ## x: ; \ 165 + .Lchacha_mips_jmptbl_aligned_ ## x: ; \ 166 166 .set noreorder; \ 167 - b .Lchacha20_mips_xor_aligned_ ## x ## _b; \ 167 + b .Lchacha_mips_xor_aligned_ ## x ## _b; \ 168 168 .if (x == 12); \ 169 169 addu SAVED_X, X ## x, NONCE_0; \ 170 170 .else; \ ··· 173 173 .set reorder 174 174 175 175 #define JMPTBL_UNALIGNED(x) \ 176 - .Lchacha20_mips_jmptbl_unaligned_ ## x: ; \ 176 + .Lchacha_mips_jmptbl_unaligned_ ## x: ; \ 177 177 .set noreorder; \ 178 - b .Lchacha20_mips_xor_unaligned_ ## x ## _b; \ 178 + b .Lchacha_mips_xor_unaligned_ ## x ## _b; \ 179 179 .if (x == 12); \ 180 180 addu SAVED_X, X ## x, NONCE_0; \ 181 181 .else; \ ··· 200 200 .text 201 201 .set reorder 202 202 .set noat 203 - .globl chacha20_mips 204 - .ent chacha20_mips 205 - chacha20_mips: 203 + .globl chacha_crypt_arch 204 + .ent chacha_crypt_arch 205 + chacha_crypt_arch: 206 206 .frame $sp, STACK_SIZE, $ra 207 + 208 + /* Load number of rounds */ 209 + lw $at, 16($sp) 207 210 208 211 addiu $sp, -STACK_SIZE 209 212 210 213 /* Return bytes = 0. 
*/ 211 - beqz BYTES, .Lchacha20_mips_end 214 + beqz BYTES, .Lchacha_mips_end 212 215 213 216 lw NONCE_0, 48(STATE) 214 217 ··· 231 228 or IS_UNALIGNED, IN, OUT 232 229 andi IS_UNALIGNED, 0x3 233 230 234 - /* Set number of rounds */ 235 - li $at, 20 236 - 237 - b .Lchacha20_rounds_start 231 + b .Lchacha_rounds_start 238 232 239 233 .align 4 240 - .Loop_chacha20_rounds: 234 + .Loop_chacha_rounds: 241 235 addiu IN, CHACHA20_BLOCK_SIZE 242 236 addiu OUT, CHACHA20_BLOCK_SIZE 243 237 addiu NONCE_0, 1 244 238 245 - .Lchacha20_rounds_start: 239 + .Lchacha_rounds_start: 246 240 lw X0, 0(STATE) 247 241 lw X1, 4(STATE) 248 242 lw X2, 8(STATE) ··· 259 259 lw X14, 56(STATE) 260 260 lw X15, 60(STATE) 261 261 262 - .Loop_chacha20_xor_rounds: 262 + .Loop_chacha_xor_rounds: 263 263 addiu $at, -2 264 264 AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16); 265 265 AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12); ··· 269 269 AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12); 270 270 AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8); 271 271 AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7); 272 - bnez $at, .Loop_chacha20_xor_rounds 272 + bnez $at, .Loop_chacha_xor_rounds 273 273 274 274 addiu BYTES, -(CHACHA20_BLOCK_SIZE) 275 275 276 276 /* Is data src/dst unaligned? Jump */ 277 - bnez IS_UNALIGNED, .Loop_chacha20_unaligned 277 + bnez IS_UNALIGNED, .Loop_chacha_unaligned 278 278 279 279 /* Set number rounds here to fill delayslot. */ 280 - li $at, 20 280 + lw $at, (STACK_SIZE+16)($sp) 281 281 282 282 /* BYTES < 0, it has no full block. */ 283 - bltz BYTES, .Lchacha20_mips_no_full_block_aligned 283 + bltz BYTES, .Lchacha_mips_no_full_block_aligned 284 284 285 285 FOR_EACH_WORD_REV(STORE_ALIGNED) 286 286 287 287 /* BYTES > 0? Loop again. */ 288 - bgtz BYTES, .Loop_chacha20_rounds 288 + bgtz BYTES, .Loop_chacha_rounds 289 289 290 290 /* Place this here to fill delay slot */ 291 291 addiu NONCE_0, 1 292 292 293 293 /* BYTES < 0? 
Handle last bytes */ 294 - bltz BYTES, .Lchacha20_mips_xor_bytes 294 + bltz BYTES, .Lchacha_mips_xor_bytes 295 295 296 - .Lchacha20_mips_xor_done: 296 + .Lchacha_mips_xor_done: 297 297 /* Restore used registers */ 298 298 lw $s0, 0($sp) 299 299 lw $s1, 4($sp) ··· 307 307 /* Write NONCE_0 back to right location in state */ 308 308 sw NONCE_0, 48(STATE) 309 309 310 - .Lchacha20_mips_end: 310 + .Lchacha_mips_end: 311 311 addiu $sp, STACK_SIZE 312 312 jr $ra 313 313 314 - .Lchacha20_mips_no_full_block_aligned: 314 + .Lchacha_mips_no_full_block_aligned: 315 315 /* Restore the offset on BYTES */ 316 316 addiu BYTES, CHACHA20_BLOCK_SIZE 317 317 ··· 319 319 andi $at, BYTES, MASK_U32 320 320 321 321 /* Load upper half of jump table addr */ 322 - lui T0, %hi(.Lchacha20_mips_jmptbl_aligned_0) 322 + lui T0, %hi(.Lchacha_mips_jmptbl_aligned_0) 323 323 324 324 /* Calculate lower half jump table offset */ 325 325 ins T0, $at, 1, 6 ··· 328 328 addu T1, STATE, $at 329 329 330 330 /* Add lower half jump table addr */ 331 - addiu T0, %lo(.Lchacha20_mips_jmptbl_aligned_0) 331 + addiu T0, %lo(.Lchacha_mips_jmptbl_aligned_0) 332 332 333 333 /* Read value from STATE */ 334 334 lw SAVED_CA, 0(T1) ··· 342 342 FOR_EACH_WORD(JMPTBL_ALIGNED) 343 343 344 344 345 - .Loop_chacha20_unaligned: 345 + .Loop_chacha_unaligned: 346 346 /* Set number rounds here to fill delayslot. */ 347 - li $at, 20 347 + lw $at, (STACK_SIZE+16)($sp) 348 348 349 349 /* BYTES > 0, it has no full block. */ 350 - bltz BYTES, .Lchacha20_mips_no_full_block_unaligned 350 + bltz BYTES, .Lchacha_mips_no_full_block_unaligned 351 351 352 352 FOR_EACH_WORD_REV(STORE_UNALIGNED) 353 353 354 354 /* BYTES > 0? Loop again. 
*/ 355 - bgtz BYTES, .Loop_chacha20_rounds 355 + bgtz BYTES, .Loop_chacha_rounds 356 356 357 357 /* Write NONCE_0 back to right location in state */ 358 358 sw NONCE_0, 48(STATE) 359 359 360 360 .set noreorder 361 361 /* Fall through to byte handling */ 362 - bgez BYTES, .Lchacha20_mips_xor_done 363 - .Lchacha20_mips_xor_unaligned_0_b: 364 - .Lchacha20_mips_xor_aligned_0_b: 362 + bgez BYTES, .Lchacha_mips_xor_done 363 + .Lchacha_mips_xor_unaligned_0_b: 364 + .Lchacha_mips_xor_aligned_0_b: 365 365 /* Place this here to fill delay slot */ 366 366 addiu NONCE_0, 1 367 367 .set reorder 368 368 369 - .Lchacha20_mips_xor_bytes: 369 + .Lchacha_mips_xor_bytes: 370 370 addu IN, $at 371 371 addu OUT, $at 372 372 /* First byte */ ··· 376 376 ROTR(SAVED_X) 377 377 xor T1, SAVED_X 378 378 sb T1, 0(OUT) 379 - beqz $at, .Lchacha20_mips_xor_done 379 + beqz $at, .Lchacha_mips_xor_done 380 380 /* Second byte */ 381 381 lbu T1, 1(IN) 382 382 addiu $at, BYTES, 2 383 383 ROTx SAVED_X, 8 384 384 xor T1, SAVED_X 385 385 sb T1, 1(OUT) 386 - beqz $at, .Lchacha20_mips_xor_done 386 + beqz $at, .Lchacha_mips_xor_done 387 387 /* Third byte */ 388 388 lbu T1, 2(IN) 389 389 ROTx SAVED_X, 8 390 390 xor T1, SAVED_X 391 391 sb T1, 2(OUT) 392 - b .Lchacha20_mips_xor_done 392 + b .Lchacha_mips_xor_done 393 393 394 - .Lchacha20_mips_no_full_block_unaligned: 394 + .Lchacha_mips_no_full_block_unaligned: 395 395 /* Restore the offset on BYTES */ 396 396 addiu BYTES, CHACHA20_BLOCK_SIZE 397 397 ··· 399 399 andi $at, BYTES, MASK_U32 400 400 401 401 /* Load upper half of jump table addr */ 402 - lui T0, %hi(.Lchacha20_mips_jmptbl_unaligned_0) 402 + lui T0, %hi(.Lchacha_mips_jmptbl_unaligned_0) 403 403 404 404 /* Calculate lower half jump table offset */ 405 405 ins T0, $at, 1, 6 ··· 408 408 addu T1, STATE, $at 409 409 410 410 /* Add lower half jump table addr */ 411 - addiu T0, %lo(.Lchacha20_mips_jmptbl_unaligned_0) 411 + addiu T0, %lo(.Lchacha_mips_jmptbl_unaligned_0) 412 412 413 413 /* Read value from 
STATE */ 414 414 lw SAVED_CA, 0(T1) ··· 420 420 421 421 /* Jump table */ 422 422 FOR_EACH_WORD(JMPTBL_UNALIGNED) 423 - .end chacha20_mips 423 + .end chacha_crypt_arch 424 + .set at 425 + 426 + /* Input arguments 427 + * STATE $a0 428 + * OUT $a1 429 + * NROUND $a2 430 + */ 431 + 432 + #undef X12 433 + #undef X13 434 + #undef X14 435 + #undef X15 436 + 437 + #define X12 $a3 438 + #define X13 $at 439 + #define X14 $v0 440 + #define X15 STATE 441 + 442 + .set noat 443 + .globl hchacha_block_arch 444 + .ent hchacha_block_arch 445 + hchacha_block_arch: 446 + .frame $sp, STACK_SIZE, $ra 447 + 448 + addiu $sp, -STACK_SIZE 449 + 450 + /* Save X11(s6) */ 451 + sw X11, 0($sp) 452 + 453 + lw X0, 0(STATE) 454 + lw X1, 4(STATE) 455 + lw X2, 8(STATE) 456 + lw X3, 12(STATE) 457 + lw X4, 16(STATE) 458 + lw X5, 20(STATE) 459 + lw X6, 24(STATE) 460 + lw X7, 28(STATE) 461 + lw X8, 32(STATE) 462 + lw X9, 36(STATE) 463 + lw X10, 40(STATE) 464 + lw X11, 44(STATE) 465 + lw X12, 48(STATE) 466 + lw X13, 52(STATE) 467 + lw X14, 56(STATE) 468 + lw X15, 60(STATE) 469 + 470 + .Loop_hchacha_xor_rounds: 471 + addiu $a2, -2 472 + AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16); 473 + AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12); 474 + AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8); 475 + AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7); 476 + AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16); 477 + AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12); 478 + AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8); 479 + AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7); 480 + bnez $a2, .Loop_hchacha_xor_rounds 481 + 482 + /* Restore used register */ 483 + lw X11, 0($sp) 484 + 485 + sw X0, 0(OUT) 486 + sw X1, 4(OUT) 487 + sw X2, 8(OUT) 488 + sw X3, 12(OUT) 489 + sw X12, 16(OUT) 490 + sw X13, 20(OUT) 491 + sw X14, 24(OUT) 492 + sw X15, 28(OUT) 493 + 494 + addiu $sp, STACK_SIZE 495 + jr $ra 496 + .end hchacha_block_arch 424 497 .set at
+150
arch/mips/crypto/chacha-glue.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * MIPS accelerated ChaCha and XChaCha stream ciphers, 4 + * including ChaCha20 (RFC7539) 5 + * 6 + * Copyright (C) 2019 Linaro, Ltd. <ard.biesheuvel@linaro.org> 7 + */ 8 + 9 + #include <asm/byteorder.h> 10 + #include <crypto/algapi.h> 11 + #include <crypto/internal/chacha.h> 12 + #include <crypto/internal/skcipher.h> 13 + #include <linux/kernel.h> 14 + #include <linux/module.h> 15 + 16 + asmlinkage void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, 17 + unsigned int bytes, int nrounds); 18 + EXPORT_SYMBOL(chacha_crypt_arch); 19 + 20 + asmlinkage void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds); 21 + EXPORT_SYMBOL(hchacha_block_arch); 22 + 23 + void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv) 24 + { 25 + chacha_init_generic(state, key, iv); 26 + } 27 + EXPORT_SYMBOL(chacha_init_arch); 28 + 29 + static int chacha_mips_stream_xor(struct skcipher_request *req, 30 + const struct chacha_ctx *ctx, const u8 *iv) 31 + { 32 + struct skcipher_walk walk; 33 + u32 state[16]; 34 + int err; 35 + 36 + err = skcipher_walk_virt(&walk, req, false); 37 + 38 + chacha_init_generic(state, ctx->key, iv); 39 + 40 + while (walk.nbytes > 0) { 41 + unsigned int nbytes = walk.nbytes; 42 + 43 + if (nbytes < walk.total) 44 + nbytes = round_down(nbytes, walk.stride); 45 + 46 + chacha_crypt(state, walk.dst.virt.addr, walk.src.virt.addr, 47 + nbytes, ctx->nrounds); 48 + err = skcipher_walk_done(&walk, walk.nbytes - nbytes); 49 + } 50 + 51 + return err; 52 + } 53 + 54 + static int chacha_mips(struct skcipher_request *req) 55 + { 56 + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); 57 + struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); 58 + 59 + return chacha_mips_stream_xor(req, ctx, req->iv); 60 + } 61 + 62 + static int xchacha_mips(struct skcipher_request *req) 63 + { 64 + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); 65 + struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); 66 + 
struct chacha_ctx subctx; 67 + u32 state[16]; 68 + u8 real_iv[16]; 69 + 70 + chacha_init_generic(state, ctx->key, req->iv); 71 + 72 + hchacha_block(state, subctx.key, ctx->nrounds); 73 + subctx.nrounds = ctx->nrounds; 74 + 75 + memcpy(&real_iv[0], req->iv + 24, 8); 76 + memcpy(&real_iv[8], req->iv + 16, 8); 77 + return chacha_mips_stream_xor(req, &subctx, real_iv); 78 + } 79 + 80 + static struct skcipher_alg algs[] = { 81 + { 82 + .base.cra_name = "chacha20", 83 + .base.cra_driver_name = "chacha20-mips", 84 + .base.cra_priority = 200, 85 + .base.cra_blocksize = 1, 86 + .base.cra_ctxsize = sizeof(struct chacha_ctx), 87 + .base.cra_module = THIS_MODULE, 88 + 89 + .min_keysize = CHACHA_KEY_SIZE, 90 + .max_keysize = CHACHA_KEY_SIZE, 91 + .ivsize = CHACHA_IV_SIZE, 92 + .chunksize = CHACHA_BLOCK_SIZE, 93 + .setkey = chacha20_setkey, 94 + .encrypt = chacha_mips, 95 + .decrypt = chacha_mips, 96 + }, { 97 + .base.cra_name = "xchacha20", 98 + .base.cra_driver_name = "xchacha20-mips", 99 + .base.cra_priority = 200, 100 + .base.cra_blocksize = 1, 101 + .base.cra_ctxsize = sizeof(struct chacha_ctx), 102 + .base.cra_module = THIS_MODULE, 103 + 104 + .min_keysize = CHACHA_KEY_SIZE, 105 + .max_keysize = CHACHA_KEY_SIZE, 106 + .ivsize = XCHACHA_IV_SIZE, 107 + .chunksize = CHACHA_BLOCK_SIZE, 108 + .setkey = chacha20_setkey, 109 + .encrypt = xchacha_mips, 110 + .decrypt = xchacha_mips, 111 + }, { 112 + .base.cra_name = "xchacha12", 113 + .base.cra_driver_name = "xchacha12-mips", 114 + .base.cra_priority = 200, 115 + .base.cra_blocksize = 1, 116 + .base.cra_ctxsize = sizeof(struct chacha_ctx), 117 + .base.cra_module = THIS_MODULE, 118 + 119 + .min_keysize = CHACHA_KEY_SIZE, 120 + .max_keysize = CHACHA_KEY_SIZE, 121 + .ivsize = XCHACHA_IV_SIZE, 122 + .chunksize = CHACHA_BLOCK_SIZE, 123 + .setkey = chacha12_setkey, 124 + .encrypt = xchacha_mips, 125 + .decrypt = xchacha_mips, 126 + } 127 + }; 128 + 129 + static int __init chacha_simd_mod_init(void) 130 + { 131 + return 
crypto_register_skciphers(algs, ARRAY_SIZE(algs)); 132 + } 133 + 134 + static void __exit chacha_simd_mod_fini(void) 135 + { 136 + crypto_unregister_skciphers(algs, ARRAY_SIZE(algs)); 137 + } 138 + 139 + module_init(chacha_simd_mod_init); 140 + module_exit(chacha_simd_mod_fini); 141 + 142 + MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (MIPS accelerated)"); 143 + MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); 144 + MODULE_LICENSE("GPL v2"); 145 + MODULE_ALIAS_CRYPTO("chacha20"); 146 + MODULE_ALIAS_CRYPTO("chacha20-mips"); 147 + MODULE_ALIAS_CRYPTO("xchacha20"); 148 + MODULE_ALIAS_CRYPTO("xchacha20-mips"); 149 + MODULE_ALIAS_CRYPTO("xchacha12"); 150 + MODULE_ALIAS_CRYPTO("xchacha12-mips");
+6
crypto/Kconfig
··· 1442 1442 SSSE3, AVX2, and AVX-512VL optimized implementations of the ChaCha20, 1443 1443 XChaCha20, and XChaCha12 stream ciphers. 1444 1444 1445 + config CRYPTO_CHACHA_MIPS 1446 + tristate "ChaCha stream cipher algorithms (MIPS 32r2 optimized)" 1447 + depends on CPU_MIPS32_R2 1448 + select CRYPTO_BLKCIPHER 1449 + select CRYPTO_ARCH_HAVE_LIB_CHACHA 1450 + 1445 1451 config CRYPTO_SEED 1446 1452 tristate "SEED cipher algorithm" 1447 1453 select CRYPTO_ALGAPI