Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

RAID/s390: add SIMD implementation for raid6 gen/xor

Using vector registers is slightly faster:

raid6: vx128x8 gen() 19705 MB/s
raid6: vx128x8 xor() 11886 MB/s
raid6: using algorithm vx128x8 gen() 19705 MB/s
raid6: .... xor() 11886 MB/s, rmw enabled

vs the software algorithms:

raid6: int64x1 gen() 3018 MB/s
raid6: int64x1 xor() 1429 MB/s
raid6: int64x2 gen() 4661 MB/s
raid6: int64x2 xor() 3143 MB/s
raid6: int64x4 gen() 5392 MB/s
raid6: int64x4 xor() 3509 MB/s
raid6: int64x8 gen() 4441 MB/s
raid6: int64x8 xor() 3207 MB/s
raid6: using algorithm int64x4 gen() 5392 MB/s
raid6: .... xor() 3509 MB/s, rmw enabled

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>

+265
+86
arch/s390/include/asm/vx-insn.h
··· 278 278 VLVG \v, \gr, \index, 3 279 279 .endm 280 280 281 + /* VECTOR LOAD REGISTER */ 282 + .macro VLR v1, v2 283 + VX_NUM v1, \v1 284 + VX_NUM v2, \v2 285 + .word 0xE700 | ((v1&15) << 4) | (v2&15) 286 + .word 0 287 + MRXBOPC 0, 0x56, v1, v2 288 + .endm 289 + 281 290 /* VECTOR LOAD */ 282 291 .macro VL v, disp, index="%r0", base 283 292 VX_NUM v1, \v ··· 413 404 414 405 /* Vector integer instructions */ 415 406 407 + /* VECTOR AND */ 408 + .macro VN vr1, vr2, vr3 409 + VX_NUM v1, \vr1 410 + VX_NUM v2, \vr2 411 + VX_NUM v3, \vr3 412 + .word 0xE700 | ((v1&15) << 4) | (v2&15) 413 + .word ((v3&15) << 12) 414 + MRXBOPC 0, 0x68, v1, v2, v3 415 + .endm 416 + 416 417 /* VECTOR EXCLUSIVE OR */ 417 418 .macro VX vr1, vr2, vr3 418 419 VX_NUM v1, \vr1 ··· 488 469 MRXBOPC 0, 0x7D, v1, v2, v3 489 470 .endm 490 471 472 + /* VECTOR REPLICATE IMMEDIATE */ 473 + .macro VREPI vr1, imm2, m3 474 + VX_NUM v1, \vr1 475 + .word 0xE700 | ((v1&15) << 4) 476 + .word \imm2 477 + MRXBOPC \m3, 0x45, v1 478 + .endm 479 + .macro VREPIB vr1, imm2 480 + VREPI \vr1, \imm2, 0 481 + .endm 482 + .macro VREPIH vr1, imm2 483 + VREPI \vr1, \imm2, 1 484 + .endm 485 + .macro VREPIF vr1, imm2 486 + VREPI \vr1, \imm2, 2 487 + .endm 488 + .macro VREPIG vr1, imm2 489 + VREPI \vr1, \imm2, 3 490 + .endm 491 + 492 + /* VECTOR ADD */ 493 + .macro VA vr1, vr2, vr3, m4 494 + VX_NUM v1, \vr1 495 + VX_NUM v2, \vr2 496 + VX_NUM v3, \vr3 497 + .word 0xE700 | ((v1&15) << 4) | (v2&15) 498 + .word ((v3&15) << 12) 499 + MRXBOPC \m4, 0xF3, v1, v2, v3 500 + .endm 501 + .macro VAB vr1, vr2, vr3 502 + VA \vr1, \vr2, \vr3, 0 503 + .endm 504 + .macro VAH vr1, vr2, vr3 505 + VA \vr1, \vr2, \vr3, 1 506 + .endm 507 + .macro VAF vr1, vr2, vr3 508 + VA \vr1, \vr2, \vr3, 2 509 + .endm 510 + .macro VAG vr1, vr2, vr3 511 + VA \vr1, \vr2, \vr3, 3 512 + .endm 513 + .macro VAQ vr1, vr2, vr3 514 + VA \vr1, \vr2, \vr3, 4 515 + .endm 516 + 517 + /* VECTOR ELEMENT SHIFT RIGHT ARITHMETIC */ 518 + .macro VESRAV vr1, vr2, vr3, m4 519 + VX_NUM 
v1, \vr1 520 + VX_NUM v2, \vr2 521 + VX_NUM v3, \vr3 522 + .word 0xE700 | ((v1&15) << 4) | (v2&15) 523 + .word ((v3&15) << 12) 524 + MRXBOPC \m4, 0x7A, v1, v2, v3 525 + .endm 526 + 527 + .macro VESRAVB vr1, vr2, vr3 528 + VESRAV \vr1, \vr2, \vr3, 0 529 + .endm 530 + .macro VESRAVH vr1, vr2, vr3 531 + VESRAV \vr1, \vr2, \vr3, 1 532 + .endm 533 + .macro VESRAVF vr1, vr2, vr3 534 + VESRAV \vr1, \vr2, \vr3, 2 535 + .endm 536 + .macro VESRAVG vr1, vr2, vr3 537 + VESRAV \vr1, \vr2, \vr3, 3 538 + .endm 491 539 492 540 #endif /* __ASSEMBLY__ */ 493 541 #endif /* __ASM_S390_VX_INSN_H */
+1
include/linux/raid/pq.h
··· 103 103 extern const struct raid6_calls raid6_avx2x2; 104 104 extern const struct raid6_calls raid6_avx2x4; 105 105 extern const struct raid6_calls raid6_tilegx8; 106 + extern const struct raid6_calls raid6_s390vx8; 106 107 107 108 struct raid6_recov_calls { 108 109 void (*data2)(int, size_t, int, int, void **);
+1
lib/raid6/.gitignore
··· 3 3 int*.c 4 4 tables.c 5 5 neon?.c 6 + s390vx?.c
+6
lib/raid6/Makefile
··· 7 7 raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o 8 8 raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o 9 9 raid6_pq-$(CONFIG_TILEGX) += tilegx8.o 10 + raid6_pq-$(CONFIG_S390) += s390vx8.o 10 11 11 12 hostprogs-y += mktables 12 13 ··· 115 114 targets += tilegx8.c 116 115 $(obj)/tilegx8.c: UNROLL := 8 117 116 $(obj)/tilegx8.c: $(src)/tilegx.uc $(src)/unroll.awk FORCE 117 + $(call if_changed,unroll) 118 + 119 + targets += s390vx8.c 120 + $(obj)/s390vx8.c: UNROLL := 8 121 + $(obj)/s390vx8.c: $(src)/s390vx.uc $(src)/unroll.awk FORCE 118 122 $(call if_changed,unroll) 119 123 120 124 quiet_cmd_mktable = TABLE $@
+3
lib/raid6/algos.c
··· 69 69 #if defined(CONFIG_TILEGX) 70 70 &raid6_tilegx8, 71 71 #endif 72 + #if defined(CONFIG_S390) 73 + &raid6_s390vx8, 74 + #endif 72 75 &raid6_intx1, 73 76 &raid6_intx2, 74 77 &raid6_intx4,
+168
lib/raid6/s390vx.uc
··· 1 + /* 2 + * raid6_vx$#.c 3 + * 4 + * $#-way unrolled RAID6 gen/xor functions for s390 5 + * based on the vector facility 6 + * 7 + * Copyright IBM Corp. 2016 8 + * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> 9 + * 10 + * This file is postprocessed using unroll.awk. 11 + */ 12 + 13 + #include <linux/raid/pq.h> 14 + #include <asm/fpu/api.h> 15 + 16 + asm(".include \"asm/vx-insn.h\"\n"); 17 + 18 + #define NSIZE 16 19 + 20 + static inline void LOAD_CONST(void) 21 + { 22 + asm volatile("VREPIB %v24,7"); 23 + asm volatile("VREPIB %v25,0x1d"); 24 + } 25 + 26 + /* 27 + * The SHLBYTE() operation shifts each of the 16 bytes in 28 + * vector register y left by 1 bit and stores the result in 29 + * vector register x. 30 + */ 31 + static inline void SHLBYTE(int x, int y) 32 + { 33 + asm volatile ("VAB %0,%1,%1" : : "i" (x), "i" (y)); 34 + } 35 + 36 + /* 37 + * For each of the 16 bytes in the vector register y the MASK() 38 + * operation returns 0xFF if the high bit of the byte is 1, 39 + * or 0x00 if the high bit is 0. The result is stored in vector 40 + * register x. 
41 + */ 42 + static inline void MASK(int x, int y) 43 + { 44 + asm volatile ("VESRAVB %0,%1,24" : : "i" (x), "i" (y)); 45 + } 46 + 47 + static inline void AND(int x, int y, int z) 48 + { 49 + asm volatile ("VN %0,%1,%2" : : "i" (x), "i" (y), "i" (z)); 50 + } 51 + 52 + static inline void XOR(int x, int y, int z) 53 + { 54 + asm volatile ("VX %0,%1,%2" : : "i" (x), "i" (y), "i" (z)); 55 + } 56 + 57 + static inline void LOAD_DATA(int x, int n, u8 *ptr) 58 + { 59 + typedef struct { u8 _[16*n]; } addrtype; 60 + register addrtype *__ptr asm("1") = (addrtype *) ptr; 61 + 62 + asm volatile ("VLM %2,%3,0,%r1" 63 + : : "m" (*__ptr), "a" (__ptr), "i" (x), "i" (x + n - 1)); 64 + } 65 + 66 + static inline void STORE_DATA(int x, int n, u8 *ptr) 67 + { 68 + typedef struct { u8 _[16*n]; } addrtype; 69 + register addrtype *__ptr asm("1") = (addrtype *) ptr; 70 + 71 + asm volatile ("VSTM %2,%3,0,1" 72 + : "=m" (*__ptr) : "a" (__ptr), "i" (x), "i" (x + n - 1)); 73 + } 74 + 75 + static inline void COPY_VEC(int x, int y) 76 + { 77 + asm volatile ("VLR %0,%1" : : "i" (x), "i" (y)); 78 + } 79 + 80 + static void raid6_s390vx$#_gen_syndrome(int disks, size_t bytes, void **ptrs) 81 + { 82 + struct kernel_fpu vxstate; 83 + u8 **dptr, *p, *q; 84 + int d, z, z0; 85 + 86 + kernel_fpu_begin(&vxstate, KERNEL_VXR); 87 + LOAD_CONST(); 88 + 89 + dptr = (u8 **) ptrs; 90 + z0 = disks - 3; /* Highest data disk */ 91 + p = dptr[z0 + 1]; /* XOR parity */ 92 + q = dptr[z0 + 2]; /* RS syndrome */ 93 + 94 + for (d = 0; d < bytes; d += $#*NSIZE) { 95 + LOAD_DATA(0,$#,&dptr[z0][d]); 96 + COPY_VEC(8+$$,0+$$); 97 + for (z = z0 - 1; z >= 0; z--) { 98 + MASK(16+$$,8+$$); 99 + AND(16+$$,16+$$,25); 100 + SHLBYTE(8+$$,8+$$); 101 + XOR(8+$$,8+$$,16+$$); 102 + LOAD_DATA(16,$#,&dptr[z][d]); 103 + XOR(0+$$,0+$$,16+$$); 104 + XOR(8+$$,8+$$,16+$$); 105 + } 106 + STORE_DATA(0,$#,&p[d]); 107 + STORE_DATA(8,$#,&q[d]); 108 + } 109 + kernel_fpu_end(&vxstate, KERNEL_VXR); 110 + } 111 + 112 + static void 
raid6_s390vx$#_xor_syndrome(int disks, int start, int stop, 113 + size_t bytes, void **ptrs) 114 + { 115 + struct kernel_fpu vxstate; 116 + u8 **dptr, *p, *q; 117 + int d, z, z0; 118 + 119 + dptr = (u8 **) ptrs; 120 + z0 = stop; /* P/Q right side optimization */ 121 + p = dptr[disks - 2]; /* XOR parity */ 122 + q = dptr[disks - 1]; /* RS syndrome */ 123 + 124 + kernel_fpu_begin(&vxstate, KERNEL_VXR); 125 + LOAD_CONST(); 126 + 127 + for (d = 0; d < bytes; d += $#*NSIZE) { 128 + /* P/Q data pages */ 129 + LOAD_DATA(0,$#,&dptr[z0][d]); 130 + COPY_VEC(8+$$,0+$$); 131 + for (z = z0 - 1; z >= start; z--) { 132 + MASK(16+$$,8+$$); 133 + AND(16+$$,16+$$,25); 134 + SHLBYTE(8+$$,8+$$); 135 + XOR(8+$$,8+$$,16+$$); 136 + LOAD_DATA(16,$#,&dptr[z][d]); 137 + XOR(0+$$,0+$$,16+$$); 138 + XOR(8+$$,8+$$,16+$$); 139 + } 140 + /* P/Q left side optimization */ 141 + for (z = start - 1; z >= 0; z--) { 142 + MASK(16+$$,8+$$); 143 + AND(16+$$,16+$$,25); 144 + SHLBYTE(8+$$,8+$$); 145 + XOR(8+$$,8+$$,16+$$); 146 + } 147 + LOAD_DATA(16,$#,&p[d]); 148 + XOR(16+$$,16+$$,0+$$); 149 + STORE_DATA(16,$#,&p[d]); 150 + LOAD_DATA(16,$#,&q[d]); 151 + XOR(16+$$,16+$$,8+$$); 152 + STORE_DATA(16,$#,&q[d]); 153 + } 154 + kernel_fpu_end(&vxstate, KERNEL_VXR); 155 + } 156 + 157 + static int raid6_s390vx$#_valid(void) 158 + { 159 + return MACHINE_HAS_VX; 160 + } 161 + 162 + const struct raid6_calls raid6_s390vx$# = { 163 + raid6_s390vx$#_gen_syndrome, 164 + raid6_s390vx$#_xor_syndrome, 165 + raid6_s390vx$#_valid, 166 + "vx128x$#", 167 + 1 168 + };