Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

arm64: fpsimd: run kernel mode NEON with softirqs disabled

Kernel mode NEON can be used in task or softirq context, but only in
a non-nesting manner, i.e., softirq context is only permitted if the
interrupt was not taken at a point where the kernel was using the NEON
in task context.

This means all users of kernel mode NEON have to be aware of this
limitation, and either need to provide scalar fallbacks that may be much
slower (up to 20x for AES instructions) and potentially less safe, or
use an asynchronous interface that defers processing to a later time
when the NEON is guaranteed to be available.

Given that grabbing and releasing the NEON is cheap, we can relax this
restriction, by increasing the granularity of kernel mode NEON code, and
always disabling softirq processing while the NEON is being used in task
context.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Acked-by: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/20210302090118.30666-4-ardb@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>

Authored by Ard Biesheuvel; committed by Catalin Marinas.
Commit hashes: 13150149 (parent), 4c4dcd35 (this commit).

+31 -15
+1 -1
arch/arm64/crypto/aes-modes.S
@@ -700,7 +700,7 @@
 	cbz	w5, .Lmacout
 	encrypt_block	v0, w2, x1, x7, w8
 	st1	{v0.16b}, [x4]		/* return dg */
-	cond_yield	.Lmacout, x7
+	cond_yield	.Lmacout, x7, x8
 	b	.Lmacloop4x
 .Lmac1x:
 	add	w3, w3, #4
+1 -1
arch/arm64/crypto/sha1-ce-core.S
@@ -121,7 +121,7 @@
 	add	dgav.4s, dgav.4s, dg0v.4s

 	cbz	w2, 2f
-	cond_yield	3f, x5
+	cond_yield	3f, x5, x6
 	b	0b

 	/*
+1 -1
arch/arm64/crypto/sha2-ce-core.S
@@ -129,7 +129,7 @@

 	/* handled all input blocks? */
 	cbz	w2, 2f
-	cond_yield	3f, x5
+	cond_yield	3f, x5, x6
 	b	0b

 	/*
+2 -2
arch/arm64/crypto/sha3-ce-core.S
@@ -184,11 +184,11 @@
 	eor	v0.16b, v0.16b, v31.16b

 	cbnz	w8, 3b
-	cond_yield	3f, x8
+	cond_yield	4f, x8, x9
 	cbnz	w2, 0b

 	/* save state */
-3:	st1	{ v0.1d- v3.1d}, [x0], #32
+4:	st1	{ v0.1d- v3.1d}, [x0], #32
 	st1	{ v4.1d- v7.1d}, [x0], #32
 	st1	{ v8.1d-v11.1d}, [x0], #32
 	st1	{v12.1d-v15.1d}, [x0], #32
+1 -1
arch/arm64/crypto/sha512-ce-core.S
@@ -195,7 +195,7 @@
 	add	v10.2d, v10.2d, v2.2d
 	add	v11.2d, v11.2d, v3.2d

-	cond_yield	3f, x4
+	cond_yield	3f, x4, x5
 	/* handled all input blocks? */
 	cbnz	w2, 0b

+21 -7
arch/arm64/include/asm/assembler.h
@@ -15,6 +15,7 @@
 #include <asm-generic/export.h>

 #include <asm/asm-offsets.h>
+#include <asm/alternative.h>
 #include <asm/cpufeature.h>
 #include <asm/cputype.h>
 #include <asm/debug-monitors.h>
@@ -702,19 +701,31 @@
 	.endm

 /*
- * Check whether preempt-disabled code should yield as soon as it
- * is able. This is the case if re-enabling preemption a single
- * time results in a preempt count of zero, and the TIF_NEED_RESCHED
- * flag is set. (Note that the latter is stored negated in the
- * top word of the thread_info::preempt_count field)
+ * Check whether preempt/bh-disabled asm code should yield as soon as
+ * it is able. This is the case if we are currently running in task
+ * context, and either a softirq is pending, or the TIF_NEED_RESCHED
+ * flag is set and re-enabling preemption a single time would result in
+ * a preempt count of zero. (Note that the TIF_NEED_RESCHED flag is
+ * stored negated in the top word of the thread_info::preempt_count
+ * field)
  */
-	.macro	cond_yield, lbl:req, tmp:req
-#ifdef CONFIG_PREEMPTION
+	.macro	cond_yield, lbl:req, tmp:req, tmp2:req
 	get_current_task \tmp
 	ldr	\tmp, [\tmp, #TSK_TI_PREEMPT]
+	/*
+	 * If we are serving a softirq, there is no point in yielding: the
+	 * softirq will not be preempted no matter what we do, so we should
+	 * run to completion as quickly as we can.
+	 */
+	tbnz	\tmp, #SOFTIRQ_SHIFT, .Lnoyield_\@
+#ifdef CONFIG_PREEMPTION
 	sub	\tmp, \tmp, #PREEMPT_DISABLE_OFFSET
 	cbz	\tmp, \lbl
 #endif
+	adr_l	\tmp, irq_stat + IRQ_CPUSTAT_SOFTIRQ_PENDING
+	this_cpu_offset \tmp2
+	ldr	w\tmp, [\tmp, \tmp2]
+	cbnz	w\tmp, \lbl	// yield on pending softirq in task context
+.Lnoyield_\@:
 	.endm
+2
arch/arm64/kernel/asm-offsets.c
@@ -95,6 +95,8 @@
   DEFINE(DMA_FROM_DEVICE,	DMA_FROM_DEVICE);
   BLANK();
   DEFINE(PREEMPT_DISABLE_OFFSET, PREEMPT_DISABLE_OFFSET);
+  DEFINE(SOFTIRQ_SHIFT,	SOFTIRQ_SHIFT);
+  DEFINE(IRQ_CPUSTAT_SOFTIRQ_PENDING, offsetof(irq_cpustat_t, __softirq_pending));
   BLANK();
   DEFINE(CPU_BOOT_STACK,	offsetof(struct secondary_data, stack));
   DEFINE(CPU_BOOT_TASK,		offsetof(struct secondary_data, task));
+2 -2
arch/arm64/kernel/fpsimd.c
@@ -180,7 +180,7 @@
  */
 static void get_cpu_fpsimd_context(void)
 {
-	preempt_disable();
+	local_bh_disable();
 	__get_cpu_fpsimd_context();
 }

@@ -201,7 +201,7 @@
 static void put_cpu_fpsimd_context(void)
 {
 	__put_cpu_fpsimd_context();
-	preempt_enable();
+	local_bh_enable();
 }

 static bool have_cpu_fpsimd_context(void)