Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

md/raid6 algorithms: xor_syndrome() for SSE2

The second (and last) optimized XOR syndrome calculation. This version
supports right- and left-side optimization. All CPUs with an architecture
older than Haswell will benefit from it.
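
For orientation, the update the new routines perform can be sketched in plain C
(the standalone form and the name xor_syndrome_ref are illustrative only, not
part of this patch): P is XOR-updated with the data blocks in the range
[start, stop], while Q accumulates the same blocks weighted by powers of 2 in
GF(2^8). Disks above stop are skipped entirely (right side), and disks below
start cost only the remaining multiplies by 2, with no memory loads (left
side). The pcmpgtb/paddb/pand/pxor sequences in the assembly are the
vectorized form of the multiply-by-2 step.

#include <linux/types.h>	/* u8 */

/* Illustrative scalar reference, processing one byte per iteration
 * where the SSE2 versions below handle 16/32/64 bytes at a time. */
static void xor_syndrome_ref(int disks, int start, int stop,
                             size_t bytes, void **ptrs)
{
    u8 **dptr = (u8 **)ptrs;
    u8 *p = dptr[disks-2];    /* XOR parity */
    u8 *q = dptr[disks-1];    /* RS syndrome */
    size_t d;
    int z;

    for (d = 0; d < bytes; d++) {
        /* Right side: disks above 'stop' contribute nothing, so the
         * running values start at dptr[stop]. */
        u8 wp = dptr[stop][d];
        u8 wq = wp;

        /* Remaining disks down to 'start' feed both P and Q. */
        for (z = stop-1; z >= start; z--) {
            /* GF(2^8) multiply by 2, polynomial 0x11d */
            wq = (wq << 1) ^ ((wq & 0x80) ? 0x1d : 0);
            wp ^= dptr[z][d];
            wq ^= dptr[z][d];
        }
        /* Left side: disks below 'start' are untouched, so Q only
         * needs the remaining multiplies, without data loads. */
        for (z = start-1; z >= 0; z--)
            wq = (wq << 1) ^ ((wq & 0x80) ? 0x1d : 0);

        p[d] ^= wp;    /* fold into existing parity */
        q[d] ^= wq;    /* fold into existing syndrome */
    }
}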

It should be noted that SSE2 movntdq kills performance for memory areas
that are read and written simultaneously in chunks smaller than the cache
line size. So use movdqa instead for the P/Q writes in the sse21 and sse22
XOR functions.
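
Concretely, the sse2x1 and sse2x2 loops touch only 16 or 32 bytes of P and Q
per iteration, less than one 64-byte cache line that is also being read back,
so they store with movdqa; the sse2x4 loop writes a full 64-byte line of P and
of Q per iteration and keeps the non-temporal movntdq. The two store idioms,
as they appear in the hunks below:

    /* sse2x1 / sse2x2: partial cache line, keep it in cache */
    asm volatile("movdqa %%xmm4,%0" : "=m" (q[d]));

    /* sse2x4: a full 64-byte line is written per iteration */
    asm volatile("movntdq %%xmm4,%0" : "=m" (q[d]));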

Signed-off-by: Markus Stockhausen <stockhausen@collogia.de>
Signed-off-by: NeilBrown <neilb@suse.de>

Authored by Markus Stockhausen, committed by NeilBrown
a582564b 9a5ce91d

+227 -3
lib/raid6/sse2.c
···
     kernel_fpu_end();
 }
 
+
+static void raid6_sse21_xor_syndrome(int disks, int start, int stop,
+                                     size_t bytes, void **ptrs)
+{
+    u8 **dptr = (u8 **)ptrs;
+    u8 *p, *q;
+    int d, z, z0;
+
+    z0 = stop;              /* P/Q right side optimization */
+    p = dptr[disks-2];      /* XOR parity */
+    q = dptr[disks-1];      /* RS syndrome */
+
+    kernel_fpu_begin();
+
+    asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0]));
+
+    for ( d = 0 ; d < bytes ; d += 16 ) {
+        asm volatile("movdqa %0,%%xmm4" :: "m" (dptr[z0][d]));
+        asm volatile("movdqa %0,%%xmm2" : : "m" (p[d]));
+        asm volatile("pxor %xmm4,%xmm2");
+        /* P/Q data pages */
+        for ( z = z0-1 ; z >= start ; z-- ) {
+            asm volatile("pxor %xmm5,%xmm5");
+            asm volatile("pcmpgtb %xmm4,%xmm5");
+            asm volatile("paddb %xmm4,%xmm4");
+            asm volatile("pand %xmm0,%xmm5");
+            asm volatile("pxor %xmm5,%xmm4");
+            asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d]));
+            asm volatile("pxor %xmm5,%xmm2");
+            asm volatile("pxor %xmm5,%xmm4");
+        }
+        /* P/Q left side optimization */
+        for ( z = start-1 ; z >= 0 ; z-- ) {
+            asm volatile("pxor %xmm5,%xmm5");
+            asm volatile("pcmpgtb %xmm4,%xmm5");
+            asm volatile("paddb %xmm4,%xmm4");
+            asm volatile("pand %xmm0,%xmm5");
+            asm volatile("pxor %xmm5,%xmm4");
+        }
+        asm volatile("pxor %0,%%xmm4" : : "m" (q[d]));
+        /* Don't use movntdq for r/w memory area < cache line */
+        asm volatile("movdqa %%xmm4,%0" : "=m" (q[d]));
+        asm volatile("movdqa %%xmm2,%0" : "=m" (p[d]));
+    }
+
+    asm volatile("sfence" : : : "memory");
+    kernel_fpu_end();
+}
+
 const struct raid6_calls raid6_sse2x1 = {
     raid6_sse21_gen_syndrome,
-    NULL,               /* XOR not yet implemented */
+    raid6_sse21_xor_syndrome,
     raid6_have_sse2,
     "sse2x1",
     1                   /* Has cache hints */
···
     kernel_fpu_end();
 }
 
+static void raid6_sse22_xor_syndrome(int disks, int start, int stop,
+                                     size_t bytes, void **ptrs)
+{
+    u8 **dptr = (u8 **)ptrs;
+    u8 *p, *q;
+    int d, z, z0;
+
+    z0 = stop;              /* P/Q right side optimization */
+    p = dptr[disks-2];      /* XOR parity */
+    q = dptr[disks-1];      /* RS syndrome */
+
+    kernel_fpu_begin();
+
+    asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0]));
+
+    for ( d = 0 ; d < bytes ; d += 32 ) {
+        asm volatile("movdqa %0,%%xmm4" :: "m" (dptr[z0][d]));
+        asm volatile("movdqa %0,%%xmm6" :: "m" (dptr[z0][d+16]));
+        asm volatile("movdqa %0,%%xmm2" : : "m" (p[d]));
+        asm volatile("movdqa %0,%%xmm3" : : "m" (p[d+16]));
+        asm volatile("pxor %xmm4,%xmm2");
+        asm volatile("pxor %xmm6,%xmm3");
+        /* P/Q data pages */
+        for ( z = z0-1 ; z >= start ; z-- ) {
+            asm volatile("pxor %xmm5,%xmm5");
+            asm volatile("pxor %xmm7,%xmm7");
+            asm volatile("pcmpgtb %xmm4,%xmm5");
+            asm volatile("pcmpgtb %xmm6,%xmm7");
+            asm volatile("paddb %xmm4,%xmm4");
+            asm volatile("paddb %xmm6,%xmm6");
+            asm volatile("pand %xmm0,%xmm5");
+            asm volatile("pand %xmm0,%xmm7");
+            asm volatile("pxor %xmm5,%xmm4");
+            asm volatile("pxor %xmm7,%xmm6");
+            asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d]));
+            asm volatile("movdqa %0,%%xmm7" :: "m" (dptr[z][d+16]));
+            asm volatile("pxor %xmm5,%xmm2");
+            asm volatile("pxor %xmm7,%xmm3");
+            asm volatile("pxor %xmm5,%xmm4");
+            asm volatile("pxor %xmm7,%xmm6");
+        }
+        /* P/Q left side optimization */
+        for ( z = start-1 ; z >= 0 ; z-- ) {
+            asm volatile("pxor %xmm5,%xmm5");
+            asm volatile("pxor %xmm7,%xmm7");
+            asm volatile("pcmpgtb %xmm4,%xmm5");
+            asm volatile("pcmpgtb %xmm6,%xmm7");
+            asm volatile("paddb %xmm4,%xmm4");
+            asm volatile("paddb %xmm6,%xmm6");
+            asm volatile("pand %xmm0,%xmm5");
+            asm volatile("pand %xmm0,%xmm7");
+            asm volatile("pxor %xmm5,%xmm4");
+            asm volatile("pxor %xmm7,%xmm6");
+        }
+        asm volatile("pxor %0,%%xmm4" : : "m" (q[d]));
+        asm volatile("pxor %0,%%xmm6" : : "m" (q[d+16]));
+        /* Don't use movntdq for r/w memory area < cache line */
+        asm volatile("movdqa %%xmm4,%0" : "=m" (q[d]));
+        asm volatile("movdqa %%xmm6,%0" : "=m" (q[d+16]));
+        asm volatile("movdqa %%xmm2,%0" : "=m" (p[d]));
+        asm volatile("movdqa %%xmm3,%0" : "=m" (p[d+16]));
+    }
+
+    asm volatile("sfence" : : : "memory");
+    kernel_fpu_end();
+}
+
 const struct raid6_calls raid6_sse2x2 = {
     raid6_sse22_gen_syndrome,
-    NULL,               /* XOR not yet implemented */
+    raid6_sse22_xor_syndrome,
     raid6_have_sse2,
     "sse2x2",
     1                   /* Has cache hints */
···
     kernel_fpu_end();
 }
 
+static void raid6_sse24_xor_syndrome(int disks, int start, int stop,
+                                     size_t bytes, void **ptrs)
+{
+    u8 **dptr = (u8 **)ptrs;
+    u8 *p, *q;
+    int d, z, z0;
+
+    z0 = stop;              /* P/Q right side optimization */
+    p = dptr[disks-2];      /* XOR parity */
+    q = dptr[disks-1];      /* RS syndrome */
+
+    kernel_fpu_begin();
+
+    asm volatile("movdqa %0,%%xmm0" :: "m" (raid6_sse_constants.x1d[0]));
+
+    for ( d = 0 ; d < bytes ; d += 64 ) {
+        asm volatile("movdqa %0,%%xmm4" :: "m" (dptr[z0][d]));
+        asm volatile("movdqa %0,%%xmm6" :: "m" (dptr[z0][d+16]));
+        asm volatile("movdqa %0,%%xmm12" :: "m" (dptr[z0][d+32]));
+        asm volatile("movdqa %0,%%xmm14" :: "m" (dptr[z0][d+48]));
+        asm volatile("movdqa %0,%%xmm2" : : "m" (p[d]));
+        asm volatile("movdqa %0,%%xmm3" : : "m" (p[d+16]));
+        asm volatile("movdqa %0,%%xmm10" : : "m" (p[d+32]));
+        asm volatile("movdqa %0,%%xmm11" : : "m" (p[d+48]));
+        asm volatile("pxor %xmm4,%xmm2");
+        asm volatile("pxor %xmm6,%xmm3");
+        asm volatile("pxor %xmm12,%xmm10");
+        asm volatile("pxor %xmm14,%xmm11");
+        /* P/Q data pages */
+        for ( z = z0-1 ; z >= start ; z-- ) {
+            asm volatile("prefetchnta %0" :: "m" (dptr[z][d]));
+            asm volatile("prefetchnta %0" :: "m" (dptr[z][d+32]));
+            asm volatile("pxor %xmm5,%xmm5");
+            asm volatile("pxor %xmm7,%xmm7");
+            asm volatile("pxor %xmm13,%xmm13");
+            asm volatile("pxor %xmm15,%xmm15");
+            asm volatile("pcmpgtb %xmm4,%xmm5");
+            asm volatile("pcmpgtb %xmm6,%xmm7");
+            asm volatile("pcmpgtb %xmm12,%xmm13");
+            asm volatile("pcmpgtb %xmm14,%xmm15");
+            asm volatile("paddb %xmm4,%xmm4");
+            asm volatile("paddb %xmm6,%xmm6");
+            asm volatile("paddb %xmm12,%xmm12");
+            asm volatile("paddb %xmm14,%xmm14");
+            asm volatile("pand %xmm0,%xmm5");
+            asm volatile("pand %xmm0,%xmm7");
+            asm volatile("pand %xmm0,%xmm13");
+            asm volatile("pand %xmm0,%xmm15");
+            asm volatile("pxor %xmm5,%xmm4");
+            asm volatile("pxor %xmm7,%xmm6");
+            asm volatile("pxor %xmm13,%xmm12");
+            asm volatile("pxor %xmm15,%xmm14");
+            asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d]));
+            asm volatile("movdqa %0,%%xmm7" :: "m" (dptr[z][d+16]));
+            asm volatile("movdqa %0,%%xmm13" :: "m" (dptr[z][d+32]));
+            asm volatile("movdqa %0,%%xmm15" :: "m" (dptr[z][d+48]));
+            asm volatile("pxor %xmm5,%xmm2");
+            asm volatile("pxor %xmm7,%xmm3");
+            asm volatile("pxor %xmm13,%xmm10");
+            asm volatile("pxor %xmm15,%xmm11");
+            asm volatile("pxor %xmm5,%xmm4");
+            asm volatile("pxor %xmm7,%xmm6");
+            asm volatile("pxor %xmm13,%xmm12");
+            asm volatile("pxor %xmm15,%xmm14");
+        }
+        asm volatile("prefetchnta %0" :: "m" (q[d]));
+        asm volatile("prefetchnta %0" :: "m" (q[d+32]));
+        /* P/Q left side optimization */
+        for ( z = start-1 ; z >= 0 ; z-- ) {
+            asm volatile("pxor %xmm5,%xmm5");
+            asm volatile("pxor %xmm7,%xmm7");
+            asm volatile("pxor %xmm13,%xmm13");
+            asm volatile("pxor %xmm15,%xmm15");
+            asm volatile("pcmpgtb %xmm4,%xmm5");
+            asm volatile("pcmpgtb %xmm6,%xmm7");
+            asm volatile("pcmpgtb %xmm12,%xmm13");
+            asm volatile("pcmpgtb %xmm14,%xmm15");
+            asm volatile("paddb %xmm4,%xmm4");
+            asm volatile("paddb %xmm6,%xmm6");
+            asm volatile("paddb %xmm12,%xmm12");
+            asm volatile("paddb %xmm14,%xmm14");
+            asm volatile("pand %xmm0,%xmm5");
+            asm volatile("pand %xmm0,%xmm7");
+            asm volatile("pand %xmm0,%xmm13");
+            asm volatile("pand %xmm0,%xmm15");
+            asm volatile("pxor %xmm5,%xmm4");
+            asm volatile("pxor %xmm7,%xmm6");
+            asm volatile("pxor %xmm13,%xmm12");
+            asm volatile("pxor %xmm15,%xmm14");
+        }
+        asm volatile("movntdq %%xmm2,%0" : "=m" (p[d]));
+        asm volatile("movntdq %%xmm3,%0" : "=m" (p[d+16]));
+        asm volatile("movntdq %%xmm10,%0" : "=m" (p[d+32]));
+        asm volatile("movntdq %%xmm11,%0" : "=m" (p[d+48]));
+        asm volatile("pxor %0,%%xmm4" : : "m" (q[d]));
+        asm volatile("pxor %0,%%xmm6" : : "m" (q[d+16]));
+        asm volatile("pxor %0,%%xmm12" : : "m" (q[d+32]));
+        asm volatile("pxor %0,%%xmm14" : : "m" (q[d+48]));
+        asm volatile("movntdq %%xmm4,%0" : "=m" (q[d]));
+        asm volatile("movntdq %%xmm6,%0" : "=m" (q[d+16]));
+        asm volatile("movntdq %%xmm12,%0" : "=m" (q[d+32]));
+        asm volatile("movntdq %%xmm14,%0" : "=m" (q[d+48]));
+    }
+    asm volatile("sfence" : : : "memory");
+    kernel_fpu_end();
+}
+
+
 const struct raid6_calls raid6_sse2x4 = {
     raid6_sse24_gen_syndrome,
-    NULL,               /* XOR not yet implemented */
+    raid6_sse24_xor_syndrome,
     raid6_have_sse2,
     "sse2x4",
     1                   /* Has cache hints */