Linux kernel mirror: git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

LoongArch: Add SIMD-optimized XOR routines

Add LSX and LASX implementations of the XOR operations, operating on 64
bytes (one L1 cache line) at a time, to balance memory utilization and
instruction mix. Huacai confirmed that all future LoongArch
implementations by Loongson (that we care about) will likely also
feature 64-byte cache lines, and experiments show no throughput
improvement with further unrolling.
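
For reference, each xor_<flavor>_N routine keeps the same semantics as the
generic scalar versions in asm-generic/xor.h. A minimal scalar sketch of the
2-operand case (hypothetical xor_scalar_2, not part of this patch; assumes
bytes is a nonzero multiple of the word size):

    void xor_scalar_2(unsigned long bytes, unsigned long *p1,
                      const unsigned long *p2)
    {
            unsigned long words = bytes / sizeof(unsigned long);

            do {
                    *p1++ ^= *p2++; /* destination accumulates the XOR */
            } while (--words > 0);
    }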

Performance numbers measured during system boot on a 3A5000 @ 2.5GHz:

> 8regs : 12702 MB/sec
> 8regs_prefetch : 10920 MB/sec
> 32regs : 12686 MB/sec
> 32regs_prefetch : 10918 MB/sec
> lsx : 17589 MB/sec
> lasx : 26116 MB/sec

Acked-by: Song Liu <song@kernel.org>
Signed-off-by: WANG Xuerui <git@xen0n.name>
Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>

Authored by WANG Xuerui, committed by Huacai Chen
75ded18a 2478e4b7

7 files changed, 417 insertions(+)

arch/loongarch/include/asm/xor.h (+68)

/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Copyright (C) 2023 WANG Xuerui <git@xen0n.name>
 */
#ifndef _ASM_LOONGARCH_XOR_H
#define _ASM_LOONGARCH_XOR_H

#include <asm/cpu-features.h>
#include <asm/xor_simd.h>

#ifdef CONFIG_CPU_HAS_LSX
static struct xor_block_template xor_block_lsx = {
	.name = "lsx",
	.do_2 = xor_lsx_2,
	.do_3 = xor_lsx_3,
	.do_4 = xor_lsx_4,
	.do_5 = xor_lsx_5,
};

#define XOR_SPEED_LSX()				\
	do {					\
		if (cpu_has_lsx)		\
			xor_speed(&xor_block_lsx);	\
	} while (0)
#else /* CONFIG_CPU_HAS_LSX */
#define XOR_SPEED_LSX()
#endif /* CONFIG_CPU_HAS_LSX */

#ifdef CONFIG_CPU_HAS_LASX
static struct xor_block_template xor_block_lasx = {
	.name = "lasx",
	.do_2 = xor_lasx_2,
	.do_3 = xor_lasx_3,
	.do_4 = xor_lasx_4,
	.do_5 = xor_lasx_5,
};

#define XOR_SPEED_LASX()			\
	do {					\
		if (cpu_has_lasx)		\
			xor_speed(&xor_block_lasx);	\
	} while (0)
#else /* CONFIG_CPU_HAS_LASX */
#define XOR_SPEED_LASX()
#endif /* CONFIG_CPU_HAS_LASX */

/*
 * For grins, also test the generic routines.
 *
 * More importantly: it cannot be ruled out at this point of time, that some
 * future (maybe reduced) models could run the vector algorithms slower than
 * the scalar ones, maybe for errata or micro-op reasons. It may be
 * appropriate to revisit this after one or two more uarch generations.
 */
#include <asm-generic/xor.h>

#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES				\
	do {						\
		xor_speed(&xor_block_8regs);		\
		xor_speed(&xor_block_8regs_p);		\
		xor_speed(&xor_block_32regs);		\
		xor_speed(&xor_block_32regs_p);		\
		XOR_SPEED_LSX();			\
		XOR_SPEED_LASX();			\
	} while (0)

#endif /* _ASM_LOONGARCH_XOR_H */
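
These templates feed the kernel's boot-time XOR calibration, which invokes
XOR_TRY_TEMPLATES and keeps the fastest candidate. A rough, hypothetical
sketch of such a consumer (loosely modeled on crypto/xor.c, not the literal
implementation; measure_one() is made up for illustration):

    static struct xor_block_template *fastest;

    #define xor_speed(t)                                            \
            do {                                                    \
                    measure_one(t);  /* hypothetical; fills (t)->speed */ \
                    if (!fastest || (t)->speed > fastest->speed)    \
                            fastest = (t);                          \
            } while (0)

    static void calibrate_xor_blocks(void)
    {
            XOR_TRY_TEMPLATES; /* tries 8regs, 32regs, then LSX/LASX */
    }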

arch/loongarch/include/asm/xor_simd.h (+34)

/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Copyright (C) 2023 WANG Xuerui <git@xen0n.name>
 */
#ifndef _ASM_LOONGARCH_XOR_SIMD_H
#define _ASM_LOONGARCH_XOR_SIMD_H

#ifdef CONFIG_CPU_HAS_LSX
void xor_lsx_2(unsigned long bytes, unsigned long * __restrict p1,
	       const unsigned long * __restrict p2);
void xor_lsx_3(unsigned long bytes, unsigned long * __restrict p1,
	       const unsigned long * __restrict p2, const unsigned long * __restrict p3);
void xor_lsx_4(unsigned long bytes, unsigned long * __restrict p1,
	       const unsigned long * __restrict p2, const unsigned long * __restrict p3,
	       const unsigned long * __restrict p4);
void xor_lsx_5(unsigned long bytes, unsigned long * __restrict p1,
	       const unsigned long * __restrict p2, const unsigned long * __restrict p3,
	       const unsigned long * __restrict p4, const unsigned long * __restrict p5);
#endif /* CONFIG_CPU_HAS_LSX */

#ifdef CONFIG_CPU_HAS_LASX
void xor_lasx_2(unsigned long bytes, unsigned long * __restrict p1,
		const unsigned long * __restrict p2);
void xor_lasx_3(unsigned long bytes, unsigned long * __restrict p1,
		const unsigned long * __restrict p2, const unsigned long * __restrict p3);
void xor_lasx_4(unsigned long bytes, unsigned long * __restrict p1,
		const unsigned long * __restrict p2, const unsigned long * __restrict p3,
		const unsigned long * __restrict p4);
void xor_lasx_5(unsigned long bytes, unsigned long * __restrict p1,
		const unsigned long * __restrict p2, const unsigned long * __restrict p3,
		const unsigned long * __restrict p4, const unsigned long * __restrict p5);
#endif /* CONFIG_CPU_HAS_LASX */

#endif /* _ASM_LOONGARCH_XOR_SIMD_H */
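
As a hypothetical call site (not part of this patch): accumulating two source
buffers into a destination with the 3-operand LSX routine. The glue wrappers
below handle kernel_fpu_begin()/kernel_fpu_end() internally, so a caller only
has to keep bytes a multiple of the 64-byte line width:

    #include <asm/xor_simd.h>

    /* parity ^= d0 ^ d1 over one 4 KiB page */
    static void update_parity(unsigned long *parity,
                              const unsigned long *d0,
                              const unsigned long *d1)
    {
            xor_lsx_3(4096, parity, d0, d1);
    }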

arch/loongarch/lib/Makefile (+2)

 lib-y += delay.o memset.o memcpy.o memmove.o \
	  clear_user.o copy_user.o csum.o dump_tlb.o unaligned.o

+obj-$(CONFIG_CPU_HAS_LSX) += xor_simd.o xor_simd_glue.o
+
 obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o

arch/loongarch/lib/xor_simd.c (+93)

// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * LoongArch SIMD XOR operations
 *
 * Copyright (C) 2023 WANG Xuerui <git@xen0n.name>
 */

#include "xor_simd.h"

/*
 * Process one cache line (64 bytes) per loop. This is assuming all future
 * popular LoongArch cores are similar performance-characteristics-wise to the
 * current models.
 */
#define LINE_WIDTH 64

#ifdef CONFIG_CPU_HAS_LSX

#define LD(reg, base, offset)	\
	"vld $vr" #reg ", %[" #base "], " #offset "\n\t"
#define ST(reg, base, offset)	\
	"vst $vr" #reg ", %[" #base "], " #offset "\n\t"
#define XOR(dj, k) "vxor.v $vr" #dj ", $vr" #dj ", $vr" #k "\n\t"

#define LD_INOUT_LINE(base)	\
	LD(0, base, 0)		\
	LD(1, base, 16)		\
	LD(2, base, 32)		\
	LD(3, base, 48)

#define LD_AND_XOR_LINE(base)	\
	LD(4, base, 0)		\
	LD(5, base, 16)		\
	LD(6, base, 32)		\
	LD(7, base, 48)		\
	XOR(0, 4)		\
	XOR(1, 5)		\
	XOR(2, 6)		\
	XOR(3, 7)

#define ST_LINE(base)		\
	ST(0, base, 0)		\
	ST(1, base, 16)		\
	ST(2, base, 32)		\
	ST(3, base, 48)

#define XOR_FUNC_NAME(nr) __xor_lsx_##nr
#include "xor_template.c"

#undef LD
#undef ST
#undef XOR
#undef LD_INOUT_LINE
#undef LD_AND_XOR_LINE
#undef ST_LINE
#undef XOR_FUNC_NAME

#endif /* CONFIG_CPU_HAS_LSX */

#ifdef CONFIG_CPU_HAS_LASX

#define LD(reg, base, offset)	\
	"xvld $xr" #reg ", %[" #base "], " #offset "\n\t"
#define ST(reg, base, offset)	\
	"xvst $xr" #reg ", %[" #base "], " #offset "\n\t"
#define XOR(dj, k) "xvxor.v $xr" #dj ", $xr" #dj ", $xr" #k "\n\t"

#define LD_INOUT_LINE(base)	\
	LD(0, base, 0)		\
	LD(1, base, 32)

#define LD_AND_XOR_LINE(base)	\
	LD(2, base, 0)		\
	LD(3, base, 32)		\
	XOR(0, 2)		\
	XOR(1, 3)

#define ST_LINE(base)		\
	ST(0, base, 0)		\
	ST(1, base, 32)

#define XOR_FUNC_NAME(nr) __xor_lasx_##nr
#include "xor_template.c"

#undef LD
#undef ST
#undef XOR
#undef LD_INOUT_LINE
#undef LD_AND_XOR_LINE
#undef ST_LINE
#undef XOR_FUNC_NAME

#endif /* CONFIG_CPU_HAS_LASX */
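
For illustration, the LSX loop body for the 2-operand case (v1 ^= v2) expands
from LD_INOUT_LINE(v1), LD_AND_XOR_LINE(v2) and ST_LINE(v1) into the following
instruction sequence, one 64-byte line per iteration (four 128-bit vectors in,
four XORed, four stored back):

    vld     $vr0, %[v1], 0
    vld     $vr1, %[v1], 16
    vld     $vr2, %[v1], 32
    vld     $vr3, %[v1], 48
    vld     $vr4, %[v2], 0
    vld     $vr5, %[v2], 16
    vld     $vr6, %[v2], 32
    vld     $vr7, %[v2], 48
    vxor.v  $vr0, $vr0, $vr4
    vxor.v  $vr1, $vr1, $vr5
    vxor.v  $vr2, $vr2, $vr6
    vxor.v  $vr3, $vr3, $vr7
    vst     $vr0, %[v1], 0
    vst     $vr1, %[v1], 16
    vst     $vr2, %[v1], 32
    vst     $vr3, %[v1], 48

The LASX variant covers the same 64 bytes with only two 256-bit vectors per
buffer, halving the instruction count.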

arch/loongarch/lib/xor_simd.h (+38)

/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Simple interface to link xor_simd.c and xor_simd_glue.c
 *
 * Separating these files ensures that no SIMD instructions are run outside of
 * the kfpu critical section.
 */

#ifndef __LOONGARCH_LIB_XOR_SIMD_H
#define __LOONGARCH_LIB_XOR_SIMD_H

#ifdef CONFIG_CPU_HAS_LSX
void __xor_lsx_2(unsigned long bytes, unsigned long * __restrict p1,
		 const unsigned long * __restrict p2);
void __xor_lsx_3(unsigned long bytes, unsigned long * __restrict p1,
		 const unsigned long * __restrict p2, const unsigned long * __restrict p3);
void __xor_lsx_4(unsigned long bytes, unsigned long * __restrict p1,
		 const unsigned long * __restrict p2, const unsigned long * __restrict p3,
		 const unsigned long * __restrict p4);
void __xor_lsx_5(unsigned long bytes, unsigned long * __restrict p1,
		 const unsigned long * __restrict p2, const unsigned long * __restrict p3,
		 const unsigned long * __restrict p4, const unsigned long * __restrict p5);
#endif /* CONFIG_CPU_HAS_LSX */

#ifdef CONFIG_CPU_HAS_LASX
void __xor_lasx_2(unsigned long bytes, unsigned long * __restrict p1,
		  const unsigned long * __restrict p2);
void __xor_lasx_3(unsigned long bytes, unsigned long * __restrict p1,
		  const unsigned long * __restrict p2, const unsigned long * __restrict p3);
void __xor_lasx_4(unsigned long bytes, unsigned long * __restrict p1,
		  const unsigned long * __restrict p2, const unsigned long * __restrict p3,
		  const unsigned long * __restrict p4);
void __xor_lasx_5(unsigned long bytes, unsigned long * __restrict p1,
		  const unsigned long * __restrict p2, const unsigned long * __restrict p3,
		  const unsigned long * __restrict p4, const unsigned long * __restrict p5);
#endif /* CONFIG_CPU_HAS_LASX */

#endif /* __LOONGARCH_LIB_XOR_SIMD_H */

arch/loongarch/lib/xor_simd_glue.c (+72)

// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * LoongArch SIMD XOR operations
 *
 * Copyright (C) 2023 WANG Xuerui <git@xen0n.name>
 */

#include <linux/export.h>
#include <linux/sched.h>
#include <asm/fpu.h>
#include <asm/xor_simd.h>
#include "xor_simd.h"

#define MAKE_XOR_GLUE_2(flavor)						\
void xor_##flavor##_2(unsigned long bytes, unsigned long * __restrict p1, \
		      const unsigned long * __restrict p2)		\
{									\
	kernel_fpu_begin();						\
	__xor_##flavor##_2(bytes, p1, p2);				\
	kernel_fpu_end();						\
}									\
EXPORT_SYMBOL_GPL(xor_##flavor##_2)

#define MAKE_XOR_GLUE_3(flavor)						\
void xor_##flavor##_3(unsigned long bytes, unsigned long * __restrict p1, \
		      const unsigned long * __restrict p2,		\
		      const unsigned long * __restrict p3)		\
{									\
	kernel_fpu_begin();						\
	__xor_##flavor##_3(bytes, p1, p2, p3);				\
	kernel_fpu_end();						\
}									\
EXPORT_SYMBOL_GPL(xor_##flavor##_3)

#define MAKE_XOR_GLUE_4(flavor)						\
void xor_##flavor##_4(unsigned long bytes, unsigned long * __restrict p1, \
		      const unsigned long * __restrict p2,		\
		      const unsigned long * __restrict p3,		\
		      const unsigned long * __restrict p4)		\
{									\
	kernel_fpu_begin();						\
	__xor_##flavor##_4(bytes, p1, p2, p3, p4);			\
	kernel_fpu_end();						\
}									\
EXPORT_SYMBOL_GPL(xor_##flavor##_4)

#define MAKE_XOR_GLUE_5(flavor)						\
void xor_##flavor##_5(unsigned long bytes, unsigned long * __restrict p1, \
		      const unsigned long * __restrict p2,		\
		      const unsigned long * __restrict p3,		\
		      const unsigned long * __restrict p4,		\
		      const unsigned long * __restrict p5)		\
{									\
	kernel_fpu_begin();						\
	__xor_##flavor##_5(bytes, p1, p2, p3, p4, p5);			\
	kernel_fpu_end();						\
}									\
EXPORT_SYMBOL_GPL(xor_##flavor##_5)

#define MAKE_XOR_GLUES(flavor)		\
	MAKE_XOR_GLUE_2(flavor);	\
	MAKE_XOR_GLUE_3(flavor);	\
	MAKE_XOR_GLUE_4(flavor);	\
	MAKE_XOR_GLUE_5(flavor)

#ifdef CONFIG_CPU_HAS_LSX
MAKE_XOR_GLUES(lsx);
#endif

#ifdef CONFIG_CPU_HAS_LASX
MAKE_XOR_GLUES(lasx);
#endif
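
For illustration, MAKE_XOR_GLUE_2(lsx) expands to the exported wrapper below;
the SIMD body runs entirely inside the kfpu critical section:

    void xor_lsx_2(unsigned long bytes, unsigned long * __restrict p1,
                   const unsigned long * __restrict p2)
    {
            kernel_fpu_begin();
            __xor_lsx_2(bytes, p1, p2);
            kernel_fpu_end();
    }
    EXPORT_SYMBOL_GPL(xor_lsx_2);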

arch/loongarch/lib/xor_template.c (+110)

// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2023 WANG Xuerui <git@xen0n.name>
 *
 * Template for XOR operations, instantiated in xor_simd.c.
 *
 * Expected preprocessor definitions:
 *
 * - LINE_WIDTH
 * - XOR_FUNC_NAME(nr)
 * - LD_INOUT_LINE(buf)
 * - LD_AND_XOR_LINE(buf)
 * - ST_LINE(buf)
 */

void XOR_FUNC_NAME(2)(unsigned long bytes,
		      unsigned long * __restrict v1,
		      const unsigned long * __restrict v2)
{
	unsigned long lines = bytes / LINE_WIDTH;

	do {
		__asm__ __volatile__ (
			LD_INOUT_LINE(v1)
			LD_AND_XOR_LINE(v2)
			ST_LINE(v1)
		: : [v1] "r"(v1), [v2] "r"(v2) : "memory"
		);

		v1 += LINE_WIDTH / sizeof(unsigned long);
		v2 += LINE_WIDTH / sizeof(unsigned long);
	} while (--lines > 0);
}

void XOR_FUNC_NAME(3)(unsigned long bytes,
		      unsigned long * __restrict v1,
		      const unsigned long * __restrict v2,
		      const unsigned long * __restrict v3)
{
	unsigned long lines = bytes / LINE_WIDTH;

	do {
		__asm__ __volatile__ (
			LD_INOUT_LINE(v1)
			LD_AND_XOR_LINE(v2)
			LD_AND_XOR_LINE(v3)
			ST_LINE(v1)
		: : [v1] "r"(v1), [v2] "r"(v2), [v3] "r"(v3) : "memory"
		);

		v1 += LINE_WIDTH / sizeof(unsigned long);
		v2 += LINE_WIDTH / sizeof(unsigned long);
		v3 += LINE_WIDTH / sizeof(unsigned long);
	} while (--lines > 0);
}

void XOR_FUNC_NAME(4)(unsigned long bytes,
		      unsigned long * __restrict v1,
		      const unsigned long * __restrict v2,
		      const unsigned long * __restrict v3,
		      const unsigned long * __restrict v4)
{
	unsigned long lines = bytes / LINE_WIDTH;

	do {
		__asm__ __volatile__ (
			LD_INOUT_LINE(v1)
			LD_AND_XOR_LINE(v2)
			LD_AND_XOR_LINE(v3)
			LD_AND_XOR_LINE(v4)
			ST_LINE(v1)
		: : [v1] "r"(v1), [v2] "r"(v2), [v3] "r"(v3), [v4] "r"(v4)
		: "memory"
		);

		v1 += LINE_WIDTH / sizeof(unsigned long);
		v2 += LINE_WIDTH / sizeof(unsigned long);
		v3 += LINE_WIDTH / sizeof(unsigned long);
		v4 += LINE_WIDTH / sizeof(unsigned long);
	} while (--lines > 0);
}

void XOR_FUNC_NAME(5)(unsigned long bytes,
		      unsigned long * __restrict v1,
		      const unsigned long * __restrict v2,
		      const unsigned long * __restrict v3,
		      const unsigned long * __restrict v4,
		      const unsigned long * __restrict v5)
{
	unsigned long lines = bytes / LINE_WIDTH;

	do {
		__asm__ __volatile__ (
			LD_INOUT_LINE(v1)
			LD_AND_XOR_LINE(v2)
			LD_AND_XOR_LINE(v3)
			LD_AND_XOR_LINE(v4)
			LD_AND_XOR_LINE(v5)
			ST_LINE(v1)
		: : [v1] "r"(v1), [v2] "r"(v2), [v3] "r"(v3), [v4] "r"(v4),
		    [v5] "r"(v5) : "memory"
		);

		v1 += LINE_WIDTH / sizeof(unsigned long);
		v2 += LINE_WIDTH / sizeof(unsigned long);
		v3 += LINE_WIDTH / sizeof(unsigned long);
		v4 += LINE_WIDTH / sizeof(unsigned long);
		v5 += LINE_WIDTH / sizeof(unsigned long);
	} while (--lines > 0);
}
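
Note the caller contract implied by the loop shape: the do/while executes at
least once and consumes whole 64-byte lines only, so bytes must be a nonzero
multiple of LINE_WIDTH. A hypothetical guard at a call site (the fallback
name is made up for illustration):

    if (bytes == 0 || bytes % 64 != 0)
            xor_scalar_fallback(bytes, p1, p2); /* hypothetical */
    else
            xor_lsx_2(bytes, p1, p2);           /* whole lines only */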