Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

s390/checksum: provide vector register variant of csum_partial()

Provide a faster variant of csum_partial() which uses vector registers
instead of the cksm instruction.

Signed-off-by: Heiko Carstens <hca@linux.ibm.com>

+187 -16
+1 -16
arch/s390/include/asm/checksum.h
··· 30 30 return sum; 31 31 } 32 32 33 - /* 34 - * Computes the checksum of a memory block at buff, length len, 35 - * and adds in "sum" (32-bit). 36 - * 37 - * Returns a 32-bit number suitable for feeding into itself 38 - * or csum_tcpudp_magic. 39 - * 40 - * This function must be called with even lengths, except 41 - * for the last fragment, which may be odd. 42 - * 43 - * It's best to have buff aligned on a 32-bit boundary. 44 - */ 45 - static inline __wsum csum_partial(const void *buff, int len, __wsum sum) 46 - { 47 - return cksm(buff, len, sum); 48 - } 33 + __wsum csum_partial(const void *buff, int len, __wsum sum); 49 34 50 35 /* 51 36 * Fold a partial checksum without adding pseudo headers.
+19
arch/s390/include/asm/fpu-insn-asm.h
··· 521 521 VMRL \vr1, \vr2, \vr3, 3 522 522 .endm 523 523 524 + /* VECTOR LOAD WITH LENGTH */ 525 + .macro VLL v, gr, disp, base 526 + VX_NUM v1, \v 527 + GR_NUM b2, \base 528 + GR_NUM r3, \gr 529 + .word 0xE700 | ((v1&15) << 4) | r3 530 + .word (b2 << 12) | (\disp) 531 + MRXBOPC 0, 0x37, v1 532 + .endm 524 533 525 534 /* Vector integer instructions */ 526 535 ··· 541 532 .word 0xE700 | ((v1&15) << 4) | (v2&15) 542 533 .word ((v3&15) << 12) 543 534 MRXBOPC 0, 0x68, v1, v2, v3 535 + .endm 536 + 537 + /* VECTOR CHECKSUM */ 538 + .macro VCKSM vr1, vr2, vr3 539 + VX_NUM v1, \vr1 540 + VX_NUM v2, \vr2 541 + VX_NUM v3, \vr3 542 + .word 0xE700 | ((v1&15) << 4) | (v2&15) 543 + .word ((v3&15) << 12) 544 + MRXBOPC 0, 0x66, v1, v2, v3 544 545 .endm 545 546 546 547 /* VECTOR EXCLUSIVE OR */
+99
arch/s390/include/asm/fpu-insn.h
··· 108 108 : "memory"); 109 109 } 110 110 111 + static __always_inline void fpu_vcksm(u8 v1, u8 v2, u8 v3) 112 + { 113 + asm volatile("VCKSM %[v1],%[v2],%[v3]" 114 + : 115 + : [v1] "I" (v1), [v2] "I" (v2), [v3] "I" (v3) 116 + : "memory"); 117 + } 118 + 119 + #ifdef CONFIG_CC_IS_CLANG 120 + 121 + static __always_inline void fpu_vl(u8 v1, const void *vxr) 122 + { 123 + instrument_read(vxr, sizeof(__vector128)); 124 + asm volatile("\n" 125 + " la 1,%[vxr]\n" 126 + " VL %[v1],0,,1\n" 127 + : 128 + : [vxr] "R" (*(__vector128 *)vxr), 129 + [v1] "I" (v1) 130 + : "memory", "1"); 131 + } 132 + 133 + #else /* CONFIG_CC_IS_CLANG */ 134 + 135 + static __always_inline void fpu_vl(u8 v1, const void *vxr) 136 + { 137 + instrument_read(vxr, sizeof(__vector128)); 138 + asm volatile("VL %[v1],%O[vxr],,%R[vxr]\n" 139 + : 140 + : [vxr] "Q" (*(__vector128 *)vxr), 141 + [v1] "I" (v1) 142 + : "memory"); 143 + } 144 + 145 + #endif /* CONFIG_CC_IS_CLANG */ 146 + 147 + static __always_inline u64 fpu_vlgvf(u8 v, u16 index) 148 + { 149 + u64 val; 150 + 151 + asm volatile("VLGVF %[val],%[v],%[index]" 152 + : [val] "=d" (val) 153 + : [v] "I" (v), [index] "L" (index) 154 + : "memory"); 155 + return val; 156 + } 157 + 158 + #ifdef CONFIG_CC_IS_CLANG 159 + 160 + static __always_inline void fpu_vll(u8 v1, u32 index, const void *vxr) 161 + { 162 + unsigned int size; 163 + 164 + size = min(index + 1, sizeof(__vector128)); 165 + instrument_read(vxr, size); 166 + asm volatile("\n" 167 + " la 1,%[vxr]\n" 168 + " VLL %[v1],%[index],0,1\n" 169 + : 170 + : [vxr] "R" (*(u8 *)vxr), 171 + [index] "d" (index), 172 + [v1] "I" (v1) 173 + : "memory", "1"); 174 + } 175 + 176 + #else /* CONFIG_CC_IS_CLANG */ 177 + 178 + static __always_inline void fpu_vll(u8 v1, u32 index, const void *vxr) 179 + { 180 + unsigned int size; 181 + 182 + size = min(index + 1, sizeof(__vector128)); 183 + instrument_read(vxr, size); 184 + asm volatile("VLL %[v1],%[index],%O[vxr],%R[vxr]\n" 185 + : 186 + : [vxr] "Q" (*(u8 *)vxr), 187 + [index] "d" (index), 188 + [v1] "I" (v1) 189 + : "memory"); 190 + } 191 + 192 + #endif /* CONFIG_CC_IS_CLANG */ 193 + 111 194 #ifdef CONFIG_CC_IS_CLANG 112 195 113 196 #define fpu_vlm(_v1, _v3, _vxrs) \ ··· 231 148 232 149 #endif /* CONFIG_CC_IS_CLANG */ 233 150 151 + static __always_inline void fpu_vlvgf(u8 v, u32 val, u16 index) 152 + { 153 + asm volatile("VLVGF %[v],%[val],%[index]" 154 + : 155 + : [v] "I" (v), [val] "d" (val), [index] "L" (index) 156 + : "memory"); 157 + } 158 + 234 159 #ifdef CONFIG_CC_IS_CLANG 235 160 236 161 #define fpu_vstm(_v1, _v3, _vxrs) \ ··· 276 185 }) 277 186 278 187 #endif /* CONFIG_CC_IS_CLANG */ 188 + 189 + static __always_inline void fpu_vzero(u8 v) 190 + { 191 + asm volatile("VZERO %[v]" 192 + : 193 + : [v] "I" (v) 194 + : "memory"); 195 + } 279 196 280 197 #endif /* __ASSEMBLY__ */ 281 198 #endif /* __ASM_S390_FPU_INSN_H */
+4
arch/s390/include/asm/fpu-types.h
··· 32 32 __vector128 vxrs[vxr_size] __aligned(8); \ 33 33 } 34 34 35 + KERNEL_FPU_STRUCT(8); 35 36 KERNEL_FPU_STRUCT(16); 36 37 KERNEL_FPU_STRUCT(32); 37 38 38 39 #define DECLARE_KERNEL_FPU_ONSTACK(vxr_size, name) \ 39 40 struct kernel_fpu_##vxr_size name __uninitialized 41 + 42 + #define DECLARE_KERNEL_FPU_ONSTACK8(name) \ 43 + DECLARE_KERNEL_FPU_ONSTACK(8, name) 40 44 41 45 #define DECLARE_KERNEL_FPU_ONSTACK16(name) \ 42 46 DECLARE_KERNEL_FPU_ONSTACK(16, name)
+1
arch/s390/lib/Makefile
··· 4 4 # 5 5 6 6 lib-y += delay.o string.o uaccess.o find.o spinlock.o tishift.o 7 + lib-y += csum-partial.o 7 8 obj-y += mem.o xor.o 8 9 lib-$(CONFIG_KPROBES) += probes.o 9 10 lib-$(CONFIG_UPROBES) += probes.o
+63
arch/s390/lib/csum-partial.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + 3 + #include <linux/export.h> 4 + #include <asm/checksum.h> 5 + #include <asm/fpu.h> 6 + 7 + /* 8 + * Computes the checksum of a memory block at buff, length len, 9 + * and adds in "sum" (32-bit). 10 + * 11 + * Returns a 32-bit number suitable for feeding into itself 12 + * or csum_tcpudp_magic. 13 + * 14 + * This function must be called with even lengths, except 15 + * for the last fragment, which may be odd. 16 + * 17 + * It's best to have buff aligned on a 64-bit boundary. 18 + */ 19 + __wsum csum_partial(const void *buff, int len, __wsum sum) 20 + { 21 + DECLARE_KERNEL_FPU_ONSTACK8(vxstate); 22 + 23 + if (!cpu_has_vx()) 24 + return cksm(buff, len, sum); 25 + kernel_fpu_begin(&vxstate, KERNEL_VXR_V16V23); 26 + fpu_vlvgf(16, (__force u32)sum, 1); 27 + fpu_vzero(17); 28 + fpu_vzero(18); 29 + fpu_vzero(19); 30 + while (len >= 64) { 31 + fpu_vlm(20, 23, buff); 32 + fpu_vcksm(16, 20, 16); 33 + fpu_vcksm(17, 21, 17); 34 + fpu_vcksm(18, 22, 18); 35 + fpu_vcksm(19, 23, 19); 36 + buff += 64; 37 + len -= 64; 38 + } 39 + while (len >= 32) { 40 + fpu_vlm(20, 21, buff); 41 + fpu_vcksm(16, 20, 16); 42 + fpu_vcksm(17, 21, 17); 43 + buff += 32; 44 + len -= 32; 45 + } 46 + while (len >= 16) { 47 + fpu_vl(20, buff); 48 + fpu_vcksm(16, 20, 16); 49 + buff += 16; 50 + len -= 16; 51 + } 52 + if (len) { 53 + fpu_vll(20, len - 1, buff); 54 + fpu_vcksm(16, 20, 16); 55 + } 56 + fpu_vcksm(18, 19, 18); 57 + fpu_vcksm(16, 17, 16); 58 + fpu_vcksm(16, 18, 16); 59 + sum = (__force __wsum)fpu_vlgvf(16, 1); 60 + kernel_fpu_end(&vxstate, KERNEL_VXR_V16V23); 61 + return sum; 62 + } 63 + EXPORT_SYMBOL(csum_partial);