Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

lib/raid6: Add AVX512 optimized recovery functions

Optimize RAID6 recovery functions to take advantage of
the 512-bit ZMM integer instructions introduced in AVX512.

The AVX512 optimized recovery functions are based directly on
recov_avx2.c, written by Jim Kukunas.

This patch was tested and benchmarked before submission on
hardware that advertises the required AVX512 feature flags.
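
For context, the math being vectorized is the same as in the generic
scalar recovery code in lib/raid6/recov.c: rebuild the two lost data
blocks from the P and Q deltas with two GF(2^8) table multiplications.
A rough scalar sketch (paraphrased here for illustration, not part of
this patch; the scalar code indexes the full 256-entry raid6_gfmul
tables, whereas the SIMD versions use the 32-entry nibble tables in
raid6_vgfmul):

/*
 * Scalar sketch of two-data-disk recovery, paraphrased from
 * lib/raid6/recov.c. The AVX512 code in this patch applies the same
 * per-byte formula to 64 (or 2 x 64) bytes per loop iteration.
 */
static void raid6_2data_recov_sketch(size_t bytes, u8 *p, u8 *q,
				     u8 *dp, u8 *dq,
				     const u8 *pbmul, const u8 *qmul)
{
	u8 px, qx, db;

	while (bytes--) {
		px    = *p ^ *dp;		/* P delta */
		qx    = qmul[*q ^ *dq];		/* Q delta, premultiplied */
		*dq++ = db = pbmul[px] ^ qx;	/* reconstructed data B */
		*dp++ = db ^ px;		/* reconstructed data A */
		p++;
		q++;
	}
}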

Cc: Jim Kukunas <james.t.kukunas@linux.intel.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Signed-off-by: Megha Dey <megha.dey@linux.intel.com>
Signed-off-by: Gayatri Kammela <gayatri.kammela@intel.com>
Reviewed-by: Fenghua Yu <fenghua.yu@intel.com>
Signed-off-by: Shaohua Li <shli@fb.com>

Authored by Gayatri Kammela, committed by Shaohua Li
13c520b2 e0a491c1

4 files changed, 393 insertions(+), 1 deletion(-)
include/linux/raid/pq.h (+1)

@@ -118,6 +118,7 @@
 extern const struct raid6_recov_calls raid6_recov_intx1;
 extern const struct raid6_recov_calls raid6_recov_ssse3;
 extern const struct raid6_recov_calls raid6_recov_avx2;
+extern const struct raid6_recov_calls raid6_recov_avx512;
 
 extern const struct raid6_calls raid6_neonx1;
 extern const struct raid6_calls raid6_neonx2;
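
For reference, the raid6_recov_calls structure being declared here is
defined earlier in the same header; paraphrased, it looks roughly like:

/*
 * One entry per recovery implementation (paraphrased from
 * include/linux/raid/pq.h); boot-time selection keeps the
 * highest-priority entry whose ->valid() check passes.
 */
struct raid6_recov_calls {
	void (*data2)(int disks, size_t bytes, int faila, int failb,
		      void **ptrs);
	void (*datap)(int disks, size_t bytes, int faila, void **ptrs);
	int  (*valid)(void);
	const char *name;
	int priority;
};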
lib/raid6/Makefile (+1 -1)

@@ -3,7 +3,7 @@
 raid6_pq-y	+= algos.o recov.o tables.o int1.o int2.o int4.o \
		   int8.o int16.o int32.o
 
-raid6_pq-$(CONFIG_X86) += recov_ssse3.o recov_avx2.o mmx.o sse1.o sse2.o avx2.o avx512.o
+raid6_pq-$(CONFIG_X86) += recov_ssse3.o recov_avx2.o mmx.o sse1.o sse2.o avx2.o avx512.o recov_avx512.o
 raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o
 raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o
 raid6_pq-$(CONFIG_TILEGX) += tilegx8.o
lib/raid6/algos.c (+3)

@@ -98,6 +98,9 @@
 EXPORT_SYMBOL_GPL(raid6_datap_recov);
 
 const struct raid6_recov_calls *const raid6_recov_algos[] = {
+#ifdef CONFIG_AS_AVX512
+	&raid6_recov_avx512,
+#endif
 #ifdef CONFIG_AS_AVX2
 	&raid6_recov_avx2,
 #endif
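
Listing the AVX512 entry first is cosmetic: the boot-time chooser scans
the whole NULL-terminated table and keeps the valid entry with the
highest priority, logging its pick with a "raid6: using ... recovery
algorithm" message. A paraphrased sketch of that selection loop:

/*
 * Paraphrased from raid6_choose_recov() in lib/raid6/algos.c: pick the
 * implementation with the highest ->priority (3 for AVX512, above AVX2
 * and SSSE3) whose ->valid() check passes on this CPU.
 */
static const struct raid6_recov_calls *choose_recov(void)
{
	const struct raid6_recov_calls *const *algo;
	const struct raid6_recov_calls *best = NULL;

	for (algo = raid6_recov_algos; *algo; algo++)
		if (!best || (*algo)->priority > best->priority)
			if (!(*algo)->valid || (*algo)->valid())
				best = *algo;

	return best;
}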
lib/raid6/recov_avx512.c (new file, +388)

/*
 * Copyright (C) 2016 Intel Corporation
 *
 * Author: Gayatri Kammela <gayatri.kammela@intel.com>
 * Author: Megha Dey <megha.dey@linux.intel.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; version 2
 * of the License.
 *
 */

#ifdef CONFIG_AS_AVX512

#include <linux/raid/pq.h>
#include "x86.h"

static int raid6_has_avx512(void)
{
	return boot_cpu_has(X86_FEATURE_AVX2) &&
		boot_cpu_has(X86_FEATURE_AVX) &&
		boot_cpu_has(X86_FEATURE_AVX512F) &&
		boot_cpu_has(X86_FEATURE_AVX512BW) &&
		boot_cpu_has(X86_FEATURE_AVX512VL) &&
		boot_cpu_has(X86_FEATURE_AVX512DQ);
}

static void raid6_2data_recov_avx512(int disks, size_t bytes, int faila,
				     int failb, void **ptrs)
{
	u8 *p, *q, *dp, *dq;
	const u8 *pbmul;	/* P multiplier table for B data */
	const u8 *qmul;		/* Q multiplier table (for both) */
	const u8 x0f = 0x0f;

	p = (u8 *)ptrs[disks-2];
	q = (u8 *)ptrs[disks-1];

	/*
	 * Compute syndrome with zero for the missing data pages
	 * Use the dead data pages as temporary storage for
	 * delta p and delta q
	 */

	dp = (u8 *)ptrs[faila];
	ptrs[faila] = (void *)raid6_empty_zero_page;
	ptrs[disks-2] = dp;
	dq = (u8 *)ptrs[failb];
	ptrs[failb] = (void *)raid6_empty_zero_page;
	ptrs[disks-1] = dq;

	raid6_call.gen_syndrome(disks, bytes, ptrs);

	/* Restore pointer table */
	ptrs[faila] = dp;
	ptrs[failb] = dq;
	ptrs[disks-2] = p;
	ptrs[disks-1] = q;

	/* Now, pick the proper data tables */
	pbmul = raid6_vgfmul[raid6_gfexi[failb-faila]];
	qmul  = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^
		raid6_gfexp[failb]]];

	kernel_fpu_begin();

	/* zmm0 = x0f[16] */
	asm volatile("vpbroadcastb %0, %%zmm7" : : "m" (x0f));

	while (bytes) {
#ifdef CONFIG_X86_64
		asm volatile("vmovdqa64 %0, %%zmm1\n\t"
			     "vmovdqa64 %1, %%zmm9\n\t"
			     "vmovdqa64 %2, %%zmm0\n\t"
			     "vmovdqa64 %3, %%zmm8\n\t"
			     "vpxorq %4, %%zmm1, %%zmm1\n\t"
			     "vpxorq %5, %%zmm9, %%zmm9\n\t"
			     "vpxorq %6, %%zmm0, %%zmm0\n\t"
			     "vpxorq %7, %%zmm8, %%zmm8"
			     :
			     : "m" (q[0]), "m" (q[64]), "m" (p[0]),
			       "m" (p[64]), "m" (dq[0]), "m" (dq[64]),
			       "m" (dp[0]), "m" (dp[64]));

		/*
		 * 1 = dq[0]  ^ q[0]
		 * 9 = dq[64] ^ q[64]
		 * 0 = dp[0]  ^ p[0]
		 * 8 = dp[64] ^ p[64]
		 */

		asm volatile("vbroadcasti64x2 %0, %%zmm4\n\t"
			     "vbroadcasti64x2 %1, %%zmm5"
			     :
			     : "m" (qmul[0]), "m" (qmul[16]));

		asm volatile("vpsraw $4, %%zmm1, %%zmm3\n\t"
			     "vpsraw $4, %%zmm9, %%zmm12\n\t"
			     "vpandq %%zmm7, %%zmm1, %%zmm1\n\t"
			     "vpandq %%zmm7, %%zmm9, %%zmm9\n\t"
			     "vpandq %%zmm7, %%zmm3, %%zmm3\n\t"
			     "vpandq %%zmm7, %%zmm12, %%zmm12\n\t"
			     "vpshufb %%zmm9, %%zmm4, %%zmm14\n\t"
			     "vpshufb %%zmm1, %%zmm4, %%zmm4\n\t"
			     "vpshufb %%zmm12, %%zmm5, %%zmm15\n\t"
			     "vpshufb %%zmm3, %%zmm5, %%zmm5\n\t"
			     "vpxorq %%zmm14, %%zmm15, %%zmm15\n\t"
			     "vpxorq %%zmm4, %%zmm5, %%zmm5"
			     :
			     : );

		/*
		 * 5 = qx[0]
		 * 15 = qx[64]
		 */

		asm volatile("vbroadcasti64x2 %0, %%zmm4\n\t"
			     "vbroadcasti64x2 %1, %%zmm1\n\t"
			     "vpsraw $4, %%zmm0, %%zmm2\n\t"
			     "vpsraw $4, %%zmm8, %%zmm6\n\t"
			     "vpandq %%zmm7, %%zmm0, %%zmm3\n\t"
			     "vpandq %%zmm7, %%zmm8, %%zmm14\n\t"
			     "vpandq %%zmm7, %%zmm2, %%zmm2\n\t"
			     "vpandq %%zmm7, %%zmm6, %%zmm6\n\t"
			     "vpshufb %%zmm14, %%zmm4, %%zmm12\n\t"
			     "vpshufb %%zmm3, %%zmm4, %%zmm4\n\t"
			     "vpshufb %%zmm6, %%zmm1, %%zmm13\n\t"
			     "vpshufb %%zmm2, %%zmm1, %%zmm1\n\t"
			     "vpxorq %%zmm4, %%zmm1, %%zmm1\n\t"
			     "vpxorq %%zmm12, %%zmm13, %%zmm13"
			     :
			     : "m" (pbmul[0]), "m" (pbmul[16]));

		/*
		 * 1  = pbmul[px[0]]
		 * 13 = pbmul[px[64]]
		 */
		asm volatile("vpxorq %%zmm5, %%zmm1, %%zmm1\n\t"
			     "vpxorq %%zmm15, %%zmm13, %%zmm13"
			     :
			     : );

		/*
		 * 1 = db = DQ
		 * 13 = db[64] = DQ[64]
		 */
		asm volatile("vmovdqa64 %%zmm1, %0\n\t"
			     "vmovdqa64 %%zmm13,%1\n\t"
			     "vpxorq %%zmm1, %%zmm0, %%zmm0\n\t"
			     "vpxorq %%zmm13, %%zmm8, %%zmm8"
			     :
			     : "m" (dq[0]), "m" (dq[64]));

		asm volatile("vmovdqa64 %%zmm0, %0\n\t"
			     "vmovdqa64 %%zmm8, %1"
			     :
			     : "m" (dp[0]), "m" (dp[64]));

		bytes -= 128;
		p += 128;
		q += 128;
		dp += 128;
		dq += 128;
#else
		asm volatile("vmovdqa64 %0, %%zmm1\n\t"
			     "vmovdqa64 %1, %%zmm0\n\t"
			     "vpxorq %2, %%zmm1, %%zmm1\n\t"
			     "vpxorq %3, %%zmm0, %%zmm0"
			     :
			     : "m" (*q), "m" (*p), "m" (*dq), "m" (*dp));

		/* 1 = dq ^ q; 0 = dp ^ p */

		asm volatile("vbroadcasti64x2 %0, %%zmm4\n\t"
			     "vbroadcasti64x2 %1, %%zmm5"
			     :
			     : "m" (qmul[0]), "m" (qmul[16]));

		/*
		 * 1 = dq ^ q
		 * 3 = dq ^ p >> 4
		 */
		asm volatile("vpsraw $4, %%zmm1, %%zmm3\n\t"
			     "vpandq %%zmm7, %%zmm1, %%zmm1\n\t"
			     "vpandq %%zmm7, %%zmm3, %%zmm3\n\t"
			     "vpshufb %%zmm1, %%zmm4, %%zmm4\n\t"
			     "vpshufb %%zmm3, %%zmm5, %%zmm5\n\t"
			     "vpxorq %%zmm4, %%zmm5, %%zmm5"
			     :
			     : );

		/* 5 = qx */

		asm volatile("vbroadcasti64x2 %0, %%zmm4\n\t"
			     "vbroadcasti64x2 %1, %%zmm1"
			     :
			     : "m" (pbmul[0]), "m" (pbmul[16]));

		asm volatile("vpsraw $4, %%zmm0, %%zmm2\n\t"
			     "vpandq %%zmm7, %%zmm0, %%zmm3\n\t"
			     "vpandq %%zmm7, %%zmm2, %%zmm2\n\t"
			     "vpshufb %%zmm3, %%zmm4, %%zmm4\n\t"
			     "vpshufb %%zmm2, %%zmm1, %%zmm1\n\t"
			     "vpxorq %%zmm4, %%zmm1, %%zmm1"
			     :
			     : );

		/* 1 = pbmul[px] */
		asm volatile("vpxorq %%zmm5, %%zmm1, %%zmm1\n\t"
			     /* 1 = db = DQ */
			     "vmovdqa64 %%zmm1, %0\n\t"
			     :
			     : "m" (dq[0]));

		asm volatile("vpxorq %%zmm1, %%zmm0, %%zmm0\n\t"
			     "vmovdqa64 %%zmm0, %0"
			     :
			     : "m" (dp[0]));

		bytes -= 64;
		p += 64;
		q += 64;
		dp += 64;
		dq += 64;
#endif
	}

	kernel_fpu_end();
}

static void raid6_datap_recov_avx512(int disks, size_t bytes, int faila,
				     void **ptrs)
{
	u8 *p, *q, *dq;
	const u8 *qmul;		/* Q multiplier table */
	const u8 x0f = 0x0f;

	p = (u8 *)ptrs[disks-2];
	q = (u8 *)ptrs[disks-1];

	/*
	 * Compute syndrome with zero for the missing data page
	 * Use the dead data page as temporary storage for delta q
	 */

	dq = (u8 *)ptrs[faila];
	ptrs[faila] = (void *)raid6_empty_zero_page;
	ptrs[disks-1] = dq;

	raid6_call.gen_syndrome(disks, bytes, ptrs);

	/* Restore pointer table */
	ptrs[faila] = dq;
	ptrs[disks-1] = q;

	/* Now, pick the proper data tables */
	qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]];

	kernel_fpu_begin();

	asm volatile("vpbroadcastb %0, %%zmm7" : : "m" (x0f));

	while (bytes) {
#ifdef CONFIG_X86_64
		asm volatile("vmovdqa64 %0, %%zmm3\n\t"
			     "vmovdqa64 %1, %%zmm8\n\t"
			     "vpxorq %2, %%zmm3, %%zmm3\n\t"
			     "vpxorq %3, %%zmm8, %%zmm8"
			     :
			     : "m" (dq[0]), "m" (dq[64]), "m" (q[0]),
			       "m" (q[64]));

		/*
		 * 3 = q[0] ^ dq[0]
		 * 8 = q[64] ^ dq[64]
		 */
		asm volatile("vbroadcasti64x2 %0, %%zmm0\n\t"
			     "vmovapd %%zmm0, %%zmm13\n\t"
			     "vbroadcasti64x2 %1, %%zmm1\n\t"
			     "vmovapd %%zmm1, %%zmm14"
			     :
			     : "m" (qmul[0]), "m" (qmul[16]));

		asm volatile("vpsraw $4, %%zmm3, %%zmm6\n\t"
			     "vpsraw $4, %%zmm8, %%zmm12\n\t"
			     "vpandq %%zmm7, %%zmm3, %%zmm3\n\t"
			     "vpandq %%zmm7, %%zmm8, %%zmm8\n\t"
			     "vpandq %%zmm7, %%zmm6, %%zmm6\n\t"
			     "vpandq %%zmm7, %%zmm12, %%zmm12\n\t"
			     "vpshufb %%zmm3, %%zmm0, %%zmm0\n\t"
			     "vpshufb %%zmm8, %%zmm13, %%zmm13\n\t"
			     "vpshufb %%zmm6, %%zmm1, %%zmm1\n\t"
			     "vpshufb %%zmm12, %%zmm14, %%zmm14\n\t"
			     "vpxorq %%zmm0, %%zmm1, %%zmm1\n\t"
			     "vpxorq %%zmm13, %%zmm14, %%zmm14"
			     :
			     : );

		/*
		 * 1  = qmul[q[0]  ^ dq[0]]
		 * 14 = qmul[q[64] ^ dq[64]]
		 */
		asm volatile("vmovdqa64 %0, %%zmm2\n\t"
			     "vmovdqa64 %1, %%zmm12\n\t"
			     "vpxorq %%zmm1, %%zmm2, %%zmm2\n\t"
			     "vpxorq %%zmm14, %%zmm12, %%zmm12"
			     :
			     : "m" (p[0]), "m" (p[64]));

		/*
		 * 2  = p[0]  ^ qmul[q[0]  ^ dq[0]]
		 * 12 = p[64] ^ qmul[q[64] ^ dq[64]]
		 */

		asm volatile("vmovdqa64 %%zmm1, %0\n\t"
			     "vmovdqa64 %%zmm14, %1\n\t"
			     "vmovdqa64 %%zmm2, %2\n\t"
			     "vmovdqa64 %%zmm12,%3"
			     :
			     : "m" (dq[0]), "m" (dq[64]), "m" (p[0]),
			       "m" (p[64]));

		bytes -= 128;
		p += 128;
		q += 128;
		dq += 128;
#else
		asm volatile("vmovdqa64 %0, %%zmm3\n\t"
			     "vpxorq %1, %%zmm3, %%zmm3"
			     :
			     : "m" (dq[0]), "m" (q[0]));

		/* 3 = q ^ dq */

		asm volatile("vbroadcasti64x2 %0, %%zmm0\n\t"
			     "vbroadcasti64x2 %1, %%zmm1"
			     :
			     : "m" (qmul[0]), "m" (qmul[16]));

		asm volatile("vpsraw $4, %%zmm3, %%zmm6\n\t"
			     "vpandq %%zmm7, %%zmm3, %%zmm3\n\t"
			     "vpandq %%zmm7, %%zmm6, %%zmm6\n\t"
			     "vpshufb %%zmm3, %%zmm0, %%zmm0\n\t"
			     "vpshufb %%zmm6, %%zmm1, %%zmm1\n\t"
			     "vpxorq %%zmm0, %%zmm1, %%zmm1"
			     :
			     : );

		/* 1 = qmul[q ^ dq] */

		asm volatile("vmovdqa64 %0, %%zmm2\n\t"
			     "vpxorq %%zmm1, %%zmm2, %%zmm2"
			     :
			     : "m" (p[0]));

		/* 2 = p ^ qmul[q ^ dq] */

		asm volatile("vmovdqa64 %%zmm1, %0\n\t"
			     "vmovdqa64 %%zmm2, %1"
			     :
			     : "m" (dq[0]), "m" (p[0]));

		bytes -= 64;
		p += 64;
		q += 64;
		dq += 64;
#endif
	}

	kernel_fpu_end();
}

const struct raid6_recov_calls raid6_recov_avx512 = {
	.data2 = raid6_2data_recov_avx512,
	.datap = raid6_datap_recov_avx512,
	.valid = raid6_has_avx512,
#ifdef CONFIG_X86_64
	.name = "avx512x2",
#else
	.name = "avx512x1",
#endif
	.priority = 3,
};

#else
#warning "your version of binutils lacks AVX512 support"
#endif
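
A note on the recurring vpsraw/vpandq/vpshufb pattern above: it is a
nibble-table GF(2^8) multiplication. Each raid6_vgfmul[c] table holds
two 16-entry lookup tables (low-nibble products at offset 0,
high-nibble products at offset 16), which vbroadcasti64x2 replicates
across a ZMM register so vpshufb can apply them to 64 bytes at once.
The byte-level equivalent, as a sketch:

/*
 * Byte-at-a-time equivalent of the vpshufb sequences: multiply x by a
 * constant c in GF(2^8) using the two 16-entry nibble tables that
 * vbroadcasti64x2 loads from raid6_vgfmul[c][0] and raid6_vgfmul[c][16].
 */
static inline u8 gf_mul_by_const(const u8 *vtbl, u8 x)
{
	const u8 *tbl_lo = vtbl;	/* products of the low nibble */
	const u8 *tbl_hi = vtbl + 16;	/* products of the high nibble */

	return tbl_lo[x & 0x0f] ^ tbl_hi[x >> 4];
}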