Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'for-next/sve' into for-next/core

Optimise SVE switching for CPUs with 128-bit implementations.

* for-next/sve:
arm64/sve: Skip flushing Z registers with 128 bit vectors
arm64/sve: Use the sve_flush macros in sve_load_from_fpsimd_state()
arm64/sve: Split _sve_flush macro into separate Z and predicate flushes

+23 -11
+1 -1
arch/arm64/include/asm/fpsimd.h
@@ -69,7 +69,7 @@
 extern void sve_save_state(void *state, u32 *pfpsr);
 extern void sve_load_state(void const *state, u32 const *pfpsr,
			    unsigned long vq_minus_1);
-extern void sve_flush_live(void);
+extern void sve_flush_live(unsigned long vq_minus_1);
 extern void sve_load_from_fpsimd_state(struct user_fpsimd_state const *state,
				       unsigned long vq_minus_1);
 extern unsigned int sve_get_vl(void);
+3 -1
arch/arm64/include/asm/fpsimdmacros.h
@@ -213,8 +213,10 @@
	mov	v\nz\().16b, v\nz\().16b
 .endm

-.macro sve_flush
+.macro sve_flush_z
 _for n, 0, 31, _sve_flush_z \n
+.endm
+.macro sve_flush_p_ffr
 _for n, 0, 15, _sve_pfalse \n
 _sve_wrffr 0
 .endm
+15 -7
arch/arm64/kernel/entry-fpsimd.S
@@ -63,16 +63,24 @@
  * and the rest zeroed. All the other SVE registers will be zeroed.
  */
 SYM_FUNC_START(sve_load_from_fpsimd_state)
-	sve_load_vq	x1, x2, x3
-	fpsimd_restore	x0, 8
-	_for n, 0, 15, _sve_pfalse	\n
-	_sve_wrffr	0
-	ret
+	sve_load_vq	x1, x2, x3
+	fpsimd_restore	x0, 8
+	sve_flush_p_ffr
+	ret
 SYM_FUNC_END(sve_load_from_fpsimd_state)

-/* Zero all SVE registers but the first 128-bits of each vector */
+/*
+ * Zero all SVE registers but the first 128-bits of each vector
+ *
+ * VQ must already be configured by caller, any further updates of VQ
+ * will need to ensure that the register state remains valid.
+ *
+ * x0 = VQ - 1
+ */
 SYM_FUNC_START(sve_flush_live)
-	sve_flush
+	cbz		x0, 1f // A VQ-1 of 0 is 128 bits so no extra Z state
+	sve_flush_z
+1:	sve_flush_p_ffr
	ret
 SYM_FUNC_END(sve_flush_live)
+4 -2
arch/arm64/kernel/fpsimd.c
@@ -957,8 +957,10 @@
	 * disabling the trap, otherwise update our in-memory copy.
	 */
	if (!test_thread_flag(TIF_FOREIGN_FPSTATE)) {
-		sve_set_vq(sve_vq_from_vl(current->thread.sve_vl) - 1);
-		sve_flush_live();
+		unsigned long vq_minus_one =
+			sve_vq_from_vl(current->thread.sve_vl) - 1;
+		sve_set_vq(vq_minus_one);
+		sve_flush_live(vq_minus_one);
		fpsimd_bind_task_to_cpu();
	} else {
		fpsimd_to_sve(current);