Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

ARM: crypto: add NEON accelerated XOR implementation

Add a source file xor-neon.c (which is really just the reference
C implementation passed through the GCC vectorizer) and hook it
up to the XOR framework.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Acked-by: Nicolas Pitre <nico@linaro.org>

+121
+73
arch/arm/include/asm/xor.h
··· 7 7 * it under the terms of the GNU General Public License version 2 as 8 8 * published by the Free Software Foundation. 9 9 */ 10 + #include <linux/hardirq.h> 10 11 #include <asm-generic/xor.h> 12 + #include <asm/hwcap.h> 13 + #include <asm/neon.h> 11 14 12 15 #define __XOR(a1, a2) a1 ^= a2 13 16 ··· 141 138 xor_speed(&xor_block_arm4regs); \ 142 139 xor_speed(&xor_block_8regs); \ 143 140 xor_speed(&xor_block_32regs); \ 141 + NEON_TEMPLATES; \ 144 142 } while (0) 143 + 144 + #ifdef CONFIG_KERNEL_MODE_NEON 145 + 146 + extern struct xor_block_template const xor_block_neon_inner; 147 + 148 + static void 149 + xor_neon_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) 150 + { 151 + if (in_interrupt()) { 152 + xor_arm4regs_2(bytes, p1, p2); 153 + } else { 154 + kernel_neon_begin(); 155 + xor_block_neon_inner.do_2(bytes, p1, p2); 156 + kernel_neon_end(); 157 + } 158 + } 159 + 160 + static void 161 + xor_neon_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, 162 + unsigned long *p3) 163 + { 164 + if (in_interrupt()) { 165 + xor_arm4regs_3(bytes, p1, p2, p3); 166 + } else { 167 + kernel_neon_begin(); 168 + xor_block_neon_inner.do_3(bytes, p1, p2, p3); 169 + kernel_neon_end(); 170 + } 171 + } 172 + 173 + static void 174 + xor_neon_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, 175 + unsigned long *p3, unsigned long *p4) 176 + { 177 + if (in_interrupt()) { 178 + xor_arm4regs_4(bytes, p1, p2, p3, p4); 179 + } else { 180 + kernel_neon_begin(); 181 + xor_block_neon_inner.do_4(bytes, p1, p2, p3, p4); 182 + kernel_neon_end(); 183 + } 184 + } 185 + 186 + static void 187 + xor_neon_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, 188 + unsigned long *p3, unsigned long *p4, unsigned long *p5) 189 + { 190 + if (in_interrupt()) { 191 + xor_arm4regs_5(bytes, p1, p2, p3, p4, p5); 192 + } else { 193 + kernel_neon_begin(); 194 + xor_block_neon_inner.do_5(bytes, p1, p2, p3, p4, p5); 195 + kernel_neon_end(); 196 + } 197 + } 198 + 199 + 
static struct xor_block_template xor_block_neon = { 200 + .name = "neon", 201 + .do_2 = xor_neon_2, 202 + .do_3 = xor_neon_3, 203 + .do_4 = xor_neon_4, 204 + .do_5 = xor_neon_5 205 + }; 206 + 207 + #define NEON_TEMPLATES \ 208 + do { if (cpu_has_neon()) xor_speed(&xor_block_neon); } while (0) 209 + #else 210 + #define NEON_TEMPLATES 211 + #endif
+6
arch/arm/lib/Makefile
··· 45 45 46 46 $(obj)/csumpartialcopy.o: $(obj)/csumpartialcopygeneric.S 47 47 $(obj)/csumpartialcopyuser.o: $(obj)/csumpartialcopygeneric.S

# NEON accelerated XOR routines, only considered when
# CONFIG_KERNEL_MODE_NEON=y.
#
# NEON_FLAGS is added to the compiler flags of xor-neon.o alone;
# xor-neon.c stops with an #error when __ARM_NEON__ is not defined.
ifeq ($(CONFIG_KERNEL_MODE_NEON),y)
NEON_FLAGS			:= -mfloat-abi=softfp -mfpu=neon
CFLAGS_xor-neon.o		+= $(NEON_FLAGS)
lib-$(CONFIG_XOR_BLOCKS)	+= xor-neon.o
endif
+42
arch/arm/lib/xor-neon.c
··· 1 + /* 2 + * linux/arch/arm/lib/xor-neon.c 3 + * 4 + * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org> 5 + * 6 + * This program is free software; you can redistribute it and/or modify 7 + * it under the terms of the GNU General Public License version 2 as 8 + * published by the Free Software Foundation. 9 + */ 10 + 11 + #include <linux/raid/xor.h> 12 + 13 + #ifndef __ARM_NEON__ 14 + #error You should compile this file with '-mfloat-abi=softfp -mfpu=neon' 15 + #endif 16 + 17 + /* 18 + * Pull in the reference implementations while instructing GCC (through 19 + * -ftree-vectorize) to attempt to exploit implicit parallelism and emit 20 + * NEON instructions. 21 + */ 22 + #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) 23 + #pragma GCC optimize "tree-vectorize" 24 + #else 25 + /* 26 + * While older versions of GCC do not generate incorrect code, they fail to 27 + * recognize the parallel nature of these functions, and emit plain ARM code, 28 + * which is known to be slower than the optimized ARM code in asm-arm/xor.h. 29 + */ 30 + #warning This code requires at least version 4.6 of GCC 31 + #endif 32 + 33 + #pragma GCC diagnostic ignored "-Wunused-variable" 34 + #include <asm-generic/xor.h> 35 + 36 + struct xor_block_template const xor_block_neon_inner = { 37 + .name = "__inner_neon__", 38 + .do_2 = xor_8regs_2, 39 + .do_3 = xor_8regs_3, 40 + .do_4 = xor_8regs_4, 41 + .do_5 = xor_8regs_5, 42 + };