Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
Tags: kernel, os, linux

crypto: arm64/crc-t10dif - move NEON yield to C code

Instead of yielding from the bowels of the asm routine if a reschedule
is needed, divide up the input into 4 KB chunks in the C glue. This
simplifies the code substantially, and avoids scheduling out the task
with the asm routine on the call stack, which is undesirable from a
CFI/instrumentation point of view.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Authored by: Ard Biesheuvel
Committed by: Herbert Xu
Commit IDs (as shown on the page): fc754c02 f0070f4a

Diffstat: 35 insertions(+), 38 deletions(-) total, 2 files changed

arch/arm64/crypto/crct10dif-ce-core.S: 11 insertions(+), 32 deletions(-)
···
 	.text
 	.arch	armv8-a+crypto

-	init_crc	.req	w19
-	buf	.req	x20
-	len	.req	x21
-	fold_consts_ptr	.req	x22
+	init_crc	.req	w0
+	buf	.req	x1
+	len	.req	x2
+	fold_consts_ptr	.req	x3

 	fold_consts	.req	v10

···
 	.endm

 	.macro	crc_t10dif_pmull, p
-	frame_push	4, 128
-
-	mov	init_crc, w0
-	mov	buf, x1
-	mov	len, x2
-
 	__pmull_init_\p

 	// For sizes less than 256 bytes, we can't fold 128 bytes at a time.
···
 	fold_32_bytes	\p, v6, v7

 	subs	len, len, #128
-	b.lt	.Lfold_128_bytes_loop_done_\@
-
-	if_will_cond_yield_neon
-	stp	q0, q1, [sp, #.Lframe_local_offset]
-	stp	q2, q3, [sp, #.Lframe_local_offset + 32]
-	stp	q4, q5, [sp, #.Lframe_local_offset + 64]
-	stp	q6, q7, [sp, #.Lframe_local_offset + 96]
-	do_cond_yield_neon
-	ldp	q0, q1, [sp, #.Lframe_local_offset]
-	ldp	q2, q3, [sp, #.Lframe_local_offset + 32]
-	ldp	q4, q5, [sp, #.Lframe_local_offset + 64]
-	ldp	q6, q7, [sp, #.Lframe_local_offset + 96]
-	ld1	{fold_consts.2d}, [fold_consts_ptr]
-	__pmull_init_\p
-	__pmull_pre_\p	fold_consts
-	endif_yield_neon
-
-	b	.Lfold_128_bytes_loop_\@
-
-.Lfold_128_bytes_loop_done_\@:
+	b.ge	.Lfold_128_bytes_loop_\@

 	// Now fold the 112 bytes in v0-v6 into the 16 bytes in v7.
···
 	// Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0.

 	umov	w0, v0.h[0]
-	frame_pop
+	.ifc	\p, p8
+	ldp	x29, x30, [sp], #16
+	.endif
 	ret

 .Lless_than_256_bytes_\@:
···
 	// Assumes len >= 16.
 	//
 SYM_FUNC_START(crc_t10dif_pmull_p8)
-	crc_t10dif_pmull	p8
+	stp	x29, x30, [sp, #-16]!
+	mov	x29, sp
+	crc_t10dif_pmull	p8
 SYM_FUNC_END(crc_t10dif_pmull_p8)

 	.align	5
arch/arm64/crypto/crct10dif-ce-glue.c: 24 insertions(+), 6 deletions(-)
···
 	u16 *crc = shash_desc_ctx(desc);

 	if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) {
-		kernel_neon_begin();
-		*crc = crc_t10dif_pmull_p8(*crc, data, length);
-		kernel_neon_end();
+		do {
+			unsigned int chunk = length;
+
+			if (chunk > SZ_4K + CRC_T10DIF_PMULL_CHUNK_SIZE)
+				chunk = SZ_4K;
+
+			kernel_neon_begin();
+			*crc = crc_t10dif_pmull_p8(*crc, data, chunk);
+			kernel_neon_end();
+			data += chunk;
+			length -= chunk;
+		} while (length);
 	} else {
 		*crc = crc_t10dif_generic(*crc, data, length);
 	}
···
 	u16 *crc = shash_desc_ctx(desc);

 	if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) {
-		kernel_neon_begin();
-		*crc = crc_t10dif_pmull_p64(*crc, data, length);
-		kernel_neon_end();
+		do {
+			unsigned int chunk = length;
+
+			if (chunk > SZ_4K + CRC_T10DIF_PMULL_CHUNK_SIZE)
+				chunk = SZ_4K;
+
+			kernel_neon_begin();
+			*crc = crc_t10dif_pmull_p64(*crc, data, chunk);
+			kernel_neon_end();
+			data += chunk;
+			length -= chunk;
+		} while (length);
 	} else {
 		*crc = crc_t10dif_generic(*crc, data, length);
 	}