Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

lib/raid6: Add AVX512 optimized xor_syndrome functions

Optimize RAID6 xor_syndrome functions to take advantage of the 512-bit
ZMM integer instructions introduced in AVX512.
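
As background, the xor_syndrome operation that these routines vectorize 64/128/256 bytes at a time can be sketched in plain C roughly as follows. This is illustrative only; xor_syndrome_scalar() and gf2_mul_x() are local names for this sketch, not kernel functions. For a partial write covering data disks start..stop, the P parity is XORed with the affected data blocks, and the Q syndrome with the same blocks weighted by powers of 2 in GF(2^8) (polynomial 0x11d).

#include <stddef.h>

/* Hypothetical helper: multiply one GF(2^8) byte by x (0x02), polynomial 0x11d */
static inline unsigned char gf2_mul_x(unsigned char v)
{
	return (unsigned char)((v << 1) ^ ((v & 0x80) ? 0x1d : 0));
}

/* Illustrative scalar equivalent of the AVX512 xor_syndrome routines below */
static void xor_syndrome_scalar(int disks, int start, int stop,
				size_t bytes, void **ptrs)
{
	unsigned char **dptr = (unsigned char **)ptrs;
	unsigned char *p = dptr[disks - 2];	/* XOR parity */
	unsigned char *q = dptr[disks - 1];	/* RS syndrome */
	size_t d;
	int z;

	for (d = 0; d < bytes; d++) {
		unsigned char wp = dptr[stop][d];	/* P delta accumulator */
		unsigned char wq = dptr[stop][d];	/* Q delta accumulator */

		/* P/Q data pages */
		for (z = stop - 1; z >= start; z--) {
			wq = gf2_mul_x(wq);
			wp ^= dptr[z][d];
			wq ^= dptr[z][d];
		}
		/* P/Q left side optimization: only the weights advance */
		for (z = start - 1; z >= 0; z--)
			wq = gf2_mul_x(wq);

		p[d] ^= wp;
		q[d] ^= wq;
	}
}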

The AVX512-optimized xor_syndrome functions are closely based on the SSE2
implementation in sse2.c written by hpa.

The patch was tested and benchmarked before submission on hardware that
advertises the AVX512 CPUID feature flags required for these instructions.

Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Jim Kukunas <james.t.kukunas@linux.intel.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Megha Dey <megha.dey@linux.intel.com>
Signed-off-by: Gayatri Kammela <gayatri.kammela@intel.com>
Reviewed-by: Fenghua Yu <fenghua.yu@intel.com>
Signed-off-by: Shaohua Li <shli@fb.com>

Authored by Gayatri Kammela, committed by Shaohua Li
694dda62 161db5d1

+278 -3
lib/raid6/avx512.c
···
 	kernel_fpu_end();
 }
 
+static void raid6_avx5121_xor_syndrome(int disks, int start, int stop,
+				       size_t bytes, void **ptrs)
+{
+	u8 **dptr = (u8 **)ptrs;
+	u8 *p, *q;
+	int d, z, z0;
+
+	z0 = stop;		/* P/Q right side optimization */
+	p = dptr[disks-2];	/* XOR parity */
+	q = dptr[disks-1];	/* RS syndrome */
+
+	kernel_fpu_begin();
+
+	asm volatile("vmovdqa64 %0,%%zmm0"
+		     : : "m" (raid6_avx512_constants.x1d[0]));
+
+	for (d = 0 ; d < bytes ; d += 64) {
+		asm volatile("vmovdqa64 %0,%%zmm4\n\t"
+			     "vmovdqa64 %1,%%zmm2\n\t"
+			     "vpxorq %%zmm4,%%zmm2,%%zmm2"
+			     :
+			     : "m" (dptr[z0][d]), "m" (p[d]));
+		/* P/Q data pages */
+		for (z = z0-1 ; z >= start ; z--) {
+			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
+				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
+				     "vpmovm2b %%k1,%%zmm5\n\t"
+				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
+				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
+				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
+				     "vmovdqa64 %0,%%zmm5\n\t"
+				     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
+				     "vpxorq %%zmm5,%%zmm4,%%zmm4"
+				     :
+				     : "m" (dptr[z][d]));
+		}
+		/* P/Q left side optimization */
+		for (z = start-1 ; z >= 0 ; z--) {
+			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
+				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
+				     "vpmovm2b %%k1,%%zmm5\n\t"
+				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
+				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
+				     "vpxorq %%zmm5,%%zmm4,%%zmm4"
+				     :
+				     : );
+		}
+		asm volatile("vpxorq %0,%%zmm4,%%zmm4\n\t"
+		/* Don't use movntdq for r/w memory area < cache line */
+			     "vmovdqa64 %%zmm4,%0\n\t"
+			     "vmovdqa64 %%zmm2,%1"
+			     :
+			     : "m" (q[d]), "m" (p[d]));
+	}
+
+	asm volatile("sfence" : : : "memory");
+	kernel_fpu_end();
+}
+
 const struct raid6_calls raid6_avx512x1 = {
 	raid6_avx5121_gen_syndrome,
-	NULL,			/* XOR not yet implemented */
+	raid6_avx5121_xor_syndrome,
 	raid6_have_avx512,
 	"avx512x1",
 	1			/* Has cache hints */
···
 	kernel_fpu_end();
 }
 
+static void raid6_avx5122_xor_syndrome(int disks, int start, int stop,
+				       size_t bytes, void **ptrs)
+{
+	u8 **dptr = (u8 **)ptrs;
+	u8 *p, *q;
+	int d, z, z0;
+
+	z0 = stop;		/* P/Q right side optimization */
+	p = dptr[disks-2];	/* XOR parity */
+	q = dptr[disks-1];	/* RS syndrome */
+
+	kernel_fpu_begin();
+
+	asm volatile("vmovdqa64 %0,%%zmm0"
+		     : : "m" (raid6_avx512_constants.x1d[0]));
+
+	for (d = 0 ; d < bytes ; d += 128) {
+		asm volatile("vmovdqa64 %0,%%zmm4\n\t"
+			     "vmovdqa64 %1,%%zmm6\n\t"
+			     "vmovdqa64 %2,%%zmm2\n\t"
+			     "vmovdqa64 %3,%%zmm3\n\t"
+			     "vpxorq %%zmm4,%%zmm2,%%zmm2\n\t"
+			     "vpxorq %%zmm6,%%zmm3,%%zmm3"
+			     :
+			     : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]),
+			       "m" (p[d]), "m" (p[d+64]));
+		/* P/Q data pages */
+		for (z = z0-1 ; z >= start ; z--) {
+			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
+				     "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
+				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
+				     "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
+				     "vpmovm2b %%k1,%%zmm5\n\t"
+				     "vpmovm2b %%k2,%%zmm7\n\t"
+				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
+				     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
+				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
+				     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
+				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
+				     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
+				     "vmovdqa64 %0,%%zmm5\n\t"
+				     "vmovdqa64 %1,%%zmm7\n\t"
+				     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
+				     "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
+				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
+				     "vpxorq %%zmm7,%%zmm6,%%zmm6"
+				     :
+				     : "m" (dptr[z][d]), "m" (dptr[z][d+64]));
+		}
+		/* P/Q left side optimization */
+		for (z = start-1 ; z >= 0 ; z--) {
+			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
+				     "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
+				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
+				     "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
+				     "vpmovm2b %%k1,%%zmm5\n\t"
+				     "vpmovm2b %%k2,%%zmm7\n\t"
+				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
+				     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
+				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
+				     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
+				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
+				     "vpxorq %%zmm7,%%zmm6,%%zmm6"
+				     :
+				     : );
+		}
+		asm volatile("vpxorq %0,%%zmm4,%%zmm4\n\t"
+			     "vpxorq %1,%%zmm6,%%zmm6\n\t"
+			     /* Don't use movntdq for r/w
+			      * memory area < cache line
+			      */
+			     "vmovdqa64 %%zmm4,%0\n\t"
+			     "vmovdqa64 %%zmm6,%1\n\t"
+			     "vmovdqa64 %%zmm2,%2\n\t"
+			     "vmovdqa64 %%zmm3,%3"
+			     :
+			     : "m" (q[d]), "m" (q[d+64]), "m" (p[d]),
+			       "m" (p[d+64]));
+	}
+
+	asm volatile("sfence" : : : "memory");
+	kernel_fpu_end();
+}
+
 const struct raid6_calls raid6_avx512x2 = {
 	raid6_avx5122_gen_syndrome,
-	NULL,			/* XOR not yet implemented */
+	raid6_avx5122_xor_syndrome,
 	raid6_have_avx512,
 	"avx512x2",
 	1			/* Has cache hints */
···
 	kernel_fpu_end();
 }
 
+static void raid6_avx5124_xor_syndrome(int disks, int start, int stop,
+				       size_t bytes, void **ptrs)
+{
+	u8 **dptr = (u8 **)ptrs;
+	u8 *p, *q;
+	int d, z, z0;
+
+	z0 = stop;		/* P/Q right side optimization */
+	p = dptr[disks-2];	/* XOR parity */
+	q = dptr[disks-1];	/* RS syndrome */
+
+	kernel_fpu_begin();
+
+	asm volatile("vmovdqa64 %0,%%zmm0"
+		     :: "m" (raid6_avx512_constants.x1d[0]));
+
+	for (d = 0 ; d < bytes ; d += 256) {
+		asm volatile("vmovdqa64 %0,%%zmm4\n\t"
+			     "vmovdqa64 %1,%%zmm6\n\t"
+			     "vmovdqa64 %2,%%zmm12\n\t"
+			     "vmovdqa64 %3,%%zmm14\n\t"
+			     "vmovdqa64 %4,%%zmm2\n\t"
+			     "vmovdqa64 %5,%%zmm3\n\t"
+			     "vmovdqa64 %6,%%zmm10\n\t"
+			     "vmovdqa64 %7,%%zmm11\n\t"
+			     "vpxorq %%zmm4,%%zmm2,%%zmm2\n\t"
+			     "vpxorq %%zmm6,%%zmm3,%%zmm3\n\t"
+			     "vpxorq %%zmm12,%%zmm10,%%zmm10\n\t"
+			     "vpxorq %%zmm14,%%zmm11,%%zmm11"
+			     :
+			     : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]),
+			       "m" (dptr[z0][d+128]), "m" (dptr[z0][d+192]),
+			       "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]),
+			       "m" (p[d+192]));
+		/* P/Q data pages */
+		for (z = z0-1 ; z >= start ; z--) {
+			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
+				     "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
+				     "vpxorq %%zmm13,%%zmm13,%%zmm13\n\t"
+				     "vpxorq %%zmm15,%%zmm15,%%zmm15\n\t"
+				     "prefetchnta %0\n\t"
+				     "prefetchnta %2\n\t"
+				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
+				     "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
+				     "vpcmpgtb %%zmm12,%%zmm13,%%k3\n\t"
+				     "vpcmpgtb %%zmm14,%%zmm15,%%k4\n\t"
+				     "vpmovm2b %%k1,%%zmm5\n\t"
+				     "vpmovm2b %%k2,%%zmm7\n\t"
+				     "vpmovm2b %%k3,%%zmm13\n\t"
+				     "vpmovm2b %%k4,%%zmm15\n\t"
+				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
+				     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
+				     "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t"
+				     "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t"
+				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
+				     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
+				     "vpandq %%zmm0,%%zmm13,%%zmm13\n\t"
+				     "vpandq %%zmm0,%%zmm15,%%zmm15\n\t"
+				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
+				     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
+				     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
+				     "vpxorq %%zmm15,%%zmm14,%%zmm14\n\t"
+				     "vmovdqa64 %0,%%zmm5\n\t"
+				     "vmovdqa64 %1,%%zmm7\n\t"
+				     "vmovdqa64 %2,%%zmm13\n\t"
+				     "vmovdqa64 %3,%%zmm15\n\t"
+				     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
+				     "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
+				     "vpxorq %%zmm13,%%zmm10,%%zmm10\n\t"
+				     "vpxorq %%zmm15,%%zmm11,%%zmm11\n\t"
+				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
+				     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
+				     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
+				     "vpxorq %%zmm15,%%zmm14,%%zmm14"
+				     :
+				     : "m" (dptr[z][d]), "m" (dptr[z][d+64]),
+				       "m" (dptr[z][d+128]),
+				       "m" (dptr[z][d+192]));
+		}
+		asm volatile("prefetchnta %0\n\t"
+			     "prefetchnta %1\n\t"
+			     :
+			     : "m" (q[d]), "m" (q[d+128]));
+		/* P/Q left side optimization */
+		for (z = start-1 ; z >= 0 ; z--) {
+			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
+				     "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
+				     "vpxorq %%zmm13,%%zmm13,%%zmm13\n\t"
+				     "vpxorq %%zmm15,%%zmm15,%%zmm15\n\t"
+				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
+				     "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
+				     "vpcmpgtb %%zmm12,%%zmm13,%%k3\n\t"
+				     "vpcmpgtb %%zmm14,%%zmm15,%%k4\n\t"
+				     "vpmovm2b %%k1,%%zmm5\n\t"
+				     "vpmovm2b %%k2,%%zmm7\n\t"
+				     "vpmovm2b %%k3,%%zmm13\n\t"
+				     "vpmovm2b %%k4,%%zmm15\n\t"
+				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
+				     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
+				     "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t"
+				     "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t"
+				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
+				     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
+				     "vpandq %%zmm0,%%zmm13,%%zmm13\n\t"
+				     "vpandq %%zmm0,%%zmm15,%%zmm15\n\t"
+				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
+				     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
+				     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
+				     "vpxorq %%zmm15,%%zmm14,%%zmm14"
+				     :
+				     : );
+		}
+		asm volatile("vmovntdq %%zmm2,%0\n\t"
+			     "vmovntdq %%zmm3,%1\n\t"
+			     "vmovntdq %%zmm10,%2\n\t"
+			     "vmovntdq %%zmm11,%3\n\t"
+			     "vpxorq %4,%%zmm4,%%zmm4\n\t"
+			     "vpxorq %5,%%zmm6,%%zmm6\n\t"
+			     "vpxorq %6,%%zmm12,%%zmm12\n\t"
+			     "vpxorq %7,%%zmm14,%%zmm14\n\t"
+			     "vmovntdq %%zmm4,%4\n\t"
+			     "vmovntdq %%zmm6,%5\n\t"
+			     "vmovntdq %%zmm12,%6\n\t"
+			     "vmovntdq %%zmm14,%7"
+			     :
+			     : "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]),
+			       "m" (p[d+192]), "m" (q[d]), "m" (q[d+64]),
+			       "m" (q[d+128]), "m" (q[d+192]));
+	}
+	asm volatile("sfence" : : : "memory");
+	kernel_fpu_end();
+}
 const struct raid6_calls raid6_avx512x4 = {
 	raid6_avx5124_gen_syndrome,
-	NULL,			/* XOR not yet implemented */
+	raid6_avx5124_xor_syndrome,
 	raid6_have_avx512,
 	"avx512x4",
 	1			/* Has cache hints */
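
The recurring vpcmpgtb/vpmovm2b/vpaddb/vpandq/vpxorq sequence in the routines above is the usual RAID-6 multiply-by-2 in GF(2^8), applied to 64 bytes of the Q accumulator at once using the 0x1d constant preloaded into zmm0 from raid6_avx512_constants.x1d. A rough byte-wise C equivalent (illustrative only; gf2_mul_x_block() is not a kernel function):

#include <stddef.h>

/* Byte-wise equivalent of the vectorized multiply-by-2 step:
 * vpxorq/vpcmpgtb/vpmovm2b build a 0xff mask for bytes whose top bit is set,
 * vpaddb shifts each byte left by one, and vpandq/vpxorq XOR in 0x1d where
 * the mask was set, reducing modulo the RAID-6 polynomial 0x11d.
 */
static void gf2_mul_x_block(unsigned char *buf, size_t len)
{
	size_t i;

	for (i = 0; i < len; i++) {
		unsigned char mask = (buf[i] & 0x80) ? 0xff : 0x00;

		buf[i] <<= 1;			/* vpaddb: per-byte shift left */
		buf[i] ^= mask & 0x1d;		/* vpandq + vpxorq: reduction  */
	}
}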