Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

md/raid6: delta syndrome for ARM NEON

This implements XOR syndrome calculation using NEON intrinsics.
As before, the module can be built for ARM and arm64 from the
same source.

Relative performance on a Cortex-A57 based system:

raid6: int64x1 gen() 905 MB/s
raid6: int64x1 xor() 881 MB/s
raid6: int64x2 gen() 1343 MB/s
raid6: int64x2 xor() 1286 MB/s
raid6: int64x4 gen() 1896 MB/s
raid6: int64x4 xor() 1321 MB/s
raid6: int64x8 gen() 1773 MB/s
raid6: int64x8 xor() 1165 MB/s
raid6: neonx1 gen() 1834 MB/s
raid6: neonx1 xor() 1278 MB/s
raid6: neonx2 gen() 2528 MB/s
raid6: neonx2 xor() 1942 MB/s
raid6: neonx4 gen() 2888 MB/s
raid6: neonx4 xor() 2334 MB/s
raid6: neonx8 gen() 2957 MB/s
raid6: neonx8 xor() 2232 MB/s
raid6: using algorithm neonx8 gen() 2957 MB/s
raid6: .... xor() 2232 MB/s, rmw enabled

Cc: Markus Stockhausen <stockhausen@collogia.de>
Cc: Neil Brown <neilb@suse.de>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: NeilBrown <neilb@suse.com>

Authored by Ard Biesheuvel; committed by NeilBrown.
0e833e69 199dc6ed

+58 -1
+12 -1
lib/raid6/neon.c
··· 40 40 (unsigned long)bytes, ptrs); \ 41 41 kernel_neon_end(); \ 42 42 } \ 43 + static void raid6_neon ## _n ## _xor_syndrome(int disks, \ 44 + int start, int stop, \ 45 + size_t bytes, void **ptrs) \ 46 + { \ 47 + void raid6_neon ## _n ## _xor_syndrome_real(int, \ 48 + int, int, unsigned long, void**); \ 49 + kernel_neon_begin(); \ 50 + raid6_neon ## _n ## _xor_syndrome_real(disks, \ 51 + start, stop, (unsigned long)bytes, ptrs); \ 52 + kernel_neon_end(); \ 53 + } \ 43 54 struct raid6_calls const raid6_neonx ## _n = { \ 44 55 raid6_neon ## _n ## _gen_syndrome, \ 45 - NULL, /* XOR not yet implemented */ \ 56 + raid6_neon ## _n ## _xor_syndrome, \ 46 57 raid6_have_neon, \ 47 58 "neonx" #_n, \ 48 59 0 \
+46
lib/raid6/neon.uc
··· 3 3 * neon.uc - RAID-6 syndrome calculation using ARM NEON instructions 4 4 * 5 5 * Copyright (C) 2012 Rob Herring 6 + * Copyright (C) 2015 Linaro Ltd. <ard.biesheuvel@linaro.org> 6 7 * 7 8 * Based on altivec.uc: 8 9 * Copyright 2002-2004 H. Peter Anvin - All Rights Reserved ··· 75 74 w1$$ = veorq_u8(w1$$, w2$$); 76 75 wq$$ = veorq_u8(w1$$, wd$$); 77 76 } 77 + vst1q_u8(&p[d+NSIZE*$$], wp$$); 78 + vst1q_u8(&q[d+NSIZE*$$], wq$$); 79 + } 80 + } 81 + 82 + void raid6_neon$#_xor_syndrome_real(int disks, int start, int stop, 83 + unsigned long bytes, void **ptrs) 84 + { 85 + uint8_t **dptr = (uint8_t **)ptrs; 86 + uint8_t *p, *q; 87 + int d, z, z0; 88 + 89 + register unative_t wd$$, wq$$, wp$$, w1$$, w2$$; 90 + const unative_t x1d = NBYTES(0x1d); 91 + 92 + z0 = stop; /* P/Q right side optimization */ 93 + p = dptr[disks-2]; /* XOR parity */ 94 + q = dptr[disks-1]; /* RS syndrome */ 95 + 96 + for ( d = 0 ; d < bytes ; d += NSIZE*$# ) { 97 + wq$$ = vld1q_u8(&dptr[z0][d+$$*NSIZE]); 98 + wp$$ = veorq_u8(vld1q_u8(&p[d+$$*NSIZE]), wq$$); 99 + 100 + /* P/Q data pages */ 101 + for ( z = z0-1 ; z >= start ; z-- ) { 102 + wd$$ = vld1q_u8(&dptr[z][d+$$*NSIZE]); 103 + wp$$ = veorq_u8(wp$$, wd$$); 104 + w2$$ = MASK(wq$$); 105 + w1$$ = SHLBYTE(wq$$); 106 + 107 + w2$$ = vandq_u8(w2$$, x1d); 108 + w1$$ = veorq_u8(w1$$, w2$$); 109 + wq$$ = veorq_u8(w1$$, wd$$); 110 + } 111 + /* P/Q left side optimization */ 112 + for ( z = start-1 ; z >= 0 ; z-- ) { 113 + w2$$ = MASK(wq$$); 114 + w1$$ = SHLBYTE(wq$$); 115 + 116 + w2$$ = vandq_u8(w2$$, x1d); 117 + wq$$ = veorq_u8(w1$$, w2$$); 118 + } 119 + w1$$ = vld1q_u8(&q[d+NSIZE*$$]); 120 + wq$$ = veorq_u8(wq$$, w1$$); 121 + 78 122 vst1q_u8(&p[d+NSIZE*$$], wp$$); 79 123 vst1q_u8(&q[d+NSIZE*$$], wq$$); 80 124 }