lib/crc/arm/crc32-core.S at nocache-cleanup

tjh.dev / kernel
fork atom
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
fork atom
kernel / lib / crc / arm / crc32-core.S
at nocache-cleanup 306 lines 6.9 kB view raw
wrap content
  1/*
  2 * Accelerated CRC32(C) using ARM CRC, NEON and Crypto Extensions instructions
  3 *
  4 * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
  5 *
  6 * This program is free software; you can redistribute it and/or modify
  7 * it under the terms of the GNU General Public License version 2 as
  8 * published by the Free Software Foundation.
  9 */
 10
 11/* GPL HEADER START
 12 *
 13 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 14 *
 15 * This program is free software; you can redistribute it and/or modify
 16 * it under the terms of the GNU General Public License version 2 only,
 17 * as published by the Free Software Foundation.
 18 *
 19 * This program is distributed in the hope that it will be useful, but
 20 * WITHOUT ANY WARRANTY; without even the implied warranty of
 21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 22 * General Public License version 2 for more details (a copy is included
 23 * in the LICENSE file that accompanied this code).
 24 *
 25 * You should have received a copy of the GNU General Public License
 26 * version 2 along with this program; If not, see http://www.gnu.org/licenses
 27 *
 28 * Please  visit http://www.xyratex.com/contact if you need additional
 29 * information or have any questions.
 30 *
 31 * GPL HEADER END
 32 */
 33
 34/*
 35 * Copyright 2012 Xyratex Technology Limited
 36 *
 37 * Using hardware provided PCLMULQDQ instruction to accelerate the CRC32
 38 * calculation.
 39 * CRC32 polynomial:0x04c11db7(BE)/0xEDB88320(LE)
 40 * PCLMULQDQ is a new instruction in Intel SSE4.2, the reference can be found
 41 * at:
 42 * https://www.intel.com/products/processor/manuals/
 43 * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
 44 * Volume 2B: Instruction Set Reference, N-Z
 45 *
 46 * Authors:   Gregory Prestas <Gregory_Prestas@us.xyratex.com>
 47 *	      Alexander Boyko <Alexander_Boyko@xyratex.com>
 48 */
 49
 50#include <linux/linkage.h>
 51#include <asm/assembler.h>
 52
 53	.text
 54	.align		6			/* 2^6 = 64-byte alignment for the constant tables that follow */
 55	.arch		armv8-a
 56	.arch_extension	crc			/* permits the crc32/crc32c b/h/w instructions used by __crc32 */
 57	.fpu		crypto-neon-fp-armv8	/* permits vmull.p64 used by the *_pmull_le routines */
 58
 59.Lcrc32_constants:			/* CRC32 table: 8 quads, addressed as [r3, #offset] by the pmull code */
 60	/*
 61	 * [(x4*128+32 mod P(x) << 32)]' << 1   = 0x154442bd4
 62	 * #define CONSTANT_R1  0x154442bd4LL
 63	 *
 64	 * [(x4*128-32 mod P(x) << 32)]' << 1   = 0x1c6e41596
 65	 * #define CONSTANT_R2  0x1c6e41596LL
 66	 */
 67	.quad		0x0000000154442bd4	/* +0:  CONSTANT_R1, d0 during loop_64 */
 68	.quad		0x00000001c6e41596	/* +8:  CONSTANT_R2, d1 during loop_64 */
 69
 70	/*
 71	 * [(x128+32 mod P(x) << 32)]'   << 1   = 0x1751997d0
 72	 * #define CONSTANT_R3  0x1751997d0LL
 73	 *
 74	 * [(x128-32 mod P(x) << 32)]'   << 1   = 0x0ccaa009e
 75	 * #define CONSTANT_R4  0x0ccaa009eLL
 76	 */
 77	.quad		0x00000001751997d0	/* +16: CONSTANT_R3, loaded into d0 at less_64 */
 78	.quad		0x00000000ccaa009e	/* +24: CONSTANT_R4, loaded into d1 at less_64 */
 79
 80	/*
 81	 * [(x64 mod P(x) << 32)]'       << 1   = 0x163cd6124
 82	 * #define CONSTANT_R5  0x163cd6124LL
 83	 */
 84	.quad		0x0000000163cd6124	/* +32: CONSTANT_R5, loaded into d0 for the final 32-bit fold */
 85	.quad		0x00000000FFFFFFFF	/* +40: low-32-bit mask, loaded into d6 */
 86
 87	/*
 88	 * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL
 89	 *
 90	 * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))`
 91	 *                                                      = 0x1F7011641LL
 92	 * #define CONSTANT_RU  0x1F7011641LL
 93	 */
 94	.quad		0x00000001DB710641	/* +48: CRCPOLY_TRUE_LE_FULL, d0 during the Barrett step */
 95	.quad		0x00000001F7011641	/* +56: CONSTANT_RU, d1 during the Barrett step */
 96
 97.Lcrc32c_constants:			/* CRC32C table: read through the same r3 offsets, so same slot order as above */
 98	.quad		0x00000000740eef02	/* +0:  slot used like CONSTANT_R1 */
 99	.quad		0x000000009e4addf8	/* +8:  slot used like CONSTANT_R2 */
100	.quad		0x00000000f20c0dfe	/* +16: slot used like CONSTANT_R3 */
101	.quad		0x000000014cd00bd6	/* +24: slot used like CONSTANT_R4 */
102	.quad		0x00000000dd45aab8	/* +32: slot used like CONSTANT_R5 */
103	.quad		0x00000000FFFFFFFF	/* +40: low-32-bit mask */
104	.quad		0x0000000105ec76f0	/* +48: slot used like CRCPOLY_TRUE_LE_FULL */
105	.quad		0x00000000dea713f1	/* +56: slot used like CONSTANT_RU */
106
/*
 * Register aliases used by crc32_pmull_le / crc32c_pmull_le.
 */
107	dCONSTANTl	.req	d0	/* low 64 bits of qCONSTANT */
108	dCONSTANTh	.req	d1	/* high 64 bits of qCONSTANT */
109	qCONSTANT	.req	q0	/* current pair of folding constants (d0:d1) */
110
111	BUF		.req	r0	/* input pointer, post-incremented by every load */
112	LEN		.req	r1	/* byte count still to be consumed */
113	CRC		.req	r2	/* initial CRC value supplied by the caller */
114
115	qzr		.req	q9	/* set to all-zero on entry; zero operand for vext */
116
117	/**
118	 * Calculate crc32
119	 * BUF - buffer
120	 * LEN - sizeof buffer (multiple of 16 bytes), LEN should be > 63
121	 * CRC - initial crc32
122	 * return crc32 in r0
123	 * uint crc32_pmull_le(unsigned char const *buffer,
124	 *                     size_t len, uint crc32)
125	 */
/*
 * crc32_pmull_le and crc32c_pmull_le share one body. Each entry point only
 * selects a constant table in r3; crc32_pmull_le then branches to local
 * label 0 inside crc32c_pmull_le.
 *
 * In:    r0 = BUF (every load uses the :128 qualifier, i.e. the address is
 *             expected to be 16-byte aligned), r1 = LEN, r2 = CRC
 * Out:   r0 = result
 * Clobb: r0, r1, r3, q0-q9 (none of them is saved here; no stack is used)
 *
 * The first 64 bytes are loaded unconditionally, before LEN is examined,
 * which is why LEN must be at least 64.
 * NOTE(review): presumably callers wrap this in kernel_neon_begin()/end(),
 * since q4-q7 are modified without being preserved - confirm at call sites.
 */
126SYM_FUNC_START(crc32_pmull_le)
127	adr		r3, .Lcrc32_constants	/* r3 = CRC32 constant table */
128	b		0f			/* continue in the shared body below */
129SYM_FUNC_END(crc32_pmull_le)
130
131SYM_FUNC_START(crc32c_pmull_le)
132	adr		r3, .Lcrc32c_constants	/* r3 = CRC32C constant table */
133
1340:	bic		LEN, LEN, #15		/* round LEN down to a multiple of 16 */
135	vld1.8		{q1-q2}, [BUF, :128]!	/* q1..q4 = first 64 input bytes */
136	vld1.8		{q3-q4}, [BUF, :128]!
137	vmov.i8		qzr, #0
138	vmov.i8		qCONSTANT, #0
139	vmov.32		dCONSTANTl[0], CRC	/* q0 = initial CRC in bits 31:0, rest zero */
140	veor.8		d2, d2, dCONSTANTl	/* xor the initial CRC into the first input bytes */
141	sub		LEN, LEN, #0x40		/* account for the 64 bytes already loaded */
142	cmp		LEN, #0x40
143	blt		less_64			/* fewer than 64 bytes left: skip the wide loop (signed compare) */
144
145	vld1.64		{qCONSTANT}, [r3]	/* d0/d1 = table quads at +0/+8 */
146
147loop_64:		/* fold the four 128-bit accumulators q1..q4 over the next 64 bytes */
148	sub		LEN, LEN, #0x40
149
150	vmull.p64	q5, d3, dCONSTANTh	/* carry-less multiply: high halves * d1 */
151	vmull.p64	q6, d5, dCONSTANTh
152	vmull.p64	q7, d7, dCONSTANTh
153	vmull.p64	q8, d9, dCONSTANTh
154
155	vmull.p64	q1, d2, dCONSTANTl	/* carry-less multiply: low halves * d0 */
156	vmull.p64	q2, d4, dCONSTANTl
157	vmull.p64	q3, d6, dCONSTANTl
158	vmull.p64	q4, d8, dCONSTANTl
159
160	veor.8		q1, q1, q5		/* combine both products; q5..q8 are then */
161	vld1.8		{q5}, [BUF, :128]!	/* reused to hold the next 64 input bytes */
162	veor.8		q2, q2, q6
163	vld1.8		{q6}, [BUF, :128]!
164	veor.8		q3, q3, q7
165	vld1.8		{q7}, [BUF, :128]!
166	veor.8		q4, q4, q8
167	vld1.8		{q8}, [BUF, :128]!
168
169	veor.8		q1, q1, q5		/* xor the new input into the accumulators */
170	veor.8		q2, q2, q6
171	veor.8		q3, q3, q7
172	veor.8		q4, q4, q8
173
174	cmp		LEN, #0x40
175	bge		loop_64			/* repeat while at least 64 bytes remain */
176
177less_64:		/* reduce the four accumulators to a single 128-bit value in q1 */
178	vldr		dCONSTANTl, [r3, #16]	/* d0/d1 = table quads at +16/+24 */
179	vldr		dCONSTANTh, [r3, #24]
180
181	vmull.p64	q5, d3, dCONSTANTh	/* fold q1 into q2 */
182	vmull.p64	q1, d2, dCONSTANTl
183	veor.8		q1, q1, q5
184	veor.8		q1, q1, q2
185
186	vmull.p64	q5, d3, dCONSTANTh	/* fold the result into q3 */
187	vmull.p64	q1, d2, dCONSTANTl
188	veor.8		q1, q1, q5
189	veor.8		q1, q1, q3
190
191	vmull.p64	q5, d3, dCONSTANTh	/* fold the result into q4 */
192	vmull.p64	q1, d2, dCONSTANTl
193	veor.8		q1, q1, q5
194	veor.8		q1, q1, q4
195
196	teq		LEN, #0
197	beq		fold_64			/* no 16-byte blocks left */
198
199loop_16:		/* fold each remaining 16-byte block into q1 */
200	subs		LEN, LEN, #0x10		/* sets Z for the bne below; the NEON ops in between leave the flags alone */
201
202	vld1.8		{q2}, [BUF, :128]!
203	vmull.p64	q5, d3, dCONSTANTh
204	vmull.p64	q1, d2, dCONSTANTl
205	veor.8		q1, q1, q5
206	veor.8		q1, q1, q2
207
208	bne		loop_16
209
210fold_64:
211	/* perform the last 64 bit fold, also adds 32 zeroes
212	 * to the input stream */
213	vmull.p64	q2, d2, dCONSTANTh	/* low half of q1 * table quad at +24 */
214	vext.8		q1, q1, qzr, #8		/* q1 = old high half in the low 64 bits, zeros above */
215	veor.8		q1, q1, q2
216
217	/* final 32-bit fold */
218	vldr		dCONSTANTl, [r3, #32]	/* d0 = table quad at +32 */
219	vldr		d6, [r3, #40]		/* d6 = 0x00000000FFFFFFFF mask */
220	vmov.i8		d7, #0			/* q3 (d6:d7) = mask for bits 31:0 of a q register */
221
222	vext.8		q2, q1, qzr, #4		/* q2 = q1 shifted right by 32 bits */
223	vand.8		d2, d2, d6		/* keep only bits 31:0 of q1's low half */
224	vmull.p64	q1, d2, dCONSTANTl
225	veor.8		q1, q1, q2
226
227	/* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */
228	vldr		dCONSTANTl, [r3, #48]	/* d0/d1 = table quads at +48/+56 */
229	vldr		dCONSTANTh, [r3, #56]
230
231	vand.8		q2, q1, q3		/* q2 = bits 31:0 of q1 */
232	vext.8		q2, qzr, q2, #8		/* move that value into d5; d4 becomes zero */
233	vmull.p64	q2, d5, dCONSTANTh
234	vand.8		q2, q2, q3		/* keep bits 31:0 of the product */
235	vmull.p64	q2, d4, dCONSTANTl
236	veor.8		q1, q1, q2
237	vmov		r0, s5			/* result = bits 63:32 of q1 (s5 is the upper word of d2) */
238
239	bx		lr
240SYM_FUNC_END(crc32c_pmull_le)
241
/*
 * __crc32 c - body of a byte-wise CRC routine built on the crc32 / crc32c
 * b/h/w instructions. The argument is pasted into the mnemonic and into
 * every label, so it is empty for crc32* and "c" for crc32c*, and the two
 * expansions in this file get distinct labels.
 *
 * In:    r0 = running CRC, r1 = data pointer, r2 = byte count
 * Out:   r0 = updated CRC (the macro returns with bx lr itself)
 * Clobb: r1, r2, r3, ip, flags
 *
 * ip always holds (bytes remaining - 8). Subtracting 8 leaves bits 0-2
 * unchanged, so once ip goes negative the tail code can test those bits
 * directly to find out how many bytes (0..7) are left.
 *
 * ARM_BE8() comes from <asm/assembler.h>; presumably it emits its argument
 * only on big-endian (BE8) builds, to byte-swap loaded data - confirm there.
 */
242	.macro		__crc32, c
243	subs		ip, r2, #8
244	bmi		.Ltail\c		/* fewer than 8 bytes in total */
245
246	tst		r1, #3
247	bne		.Lunaligned\c		/* pointer is not 4-byte aligned */
248
249	teq		ip, #0			/* Z = exactly 8 bytes remain; consumed by bxeq below */
250.Laligned8\c:				/* r1 is 4-byte aligned on every entry to this loop */
251	ldrd		r2, r3, [r1], #8	/* overwrites r2: the count lives in ip from here on */
252ARM_BE8(rev		r2, r2		)
253ARM_BE8(rev		r3, r3		)
254	crc32\c\()w	r0, r0, r2
255	crc32\c\()w	r0, r0, r3
256	bxeq		lr			/* Z still as set before the load: nothing left, return */
257	subs		ip, ip, #8		/* also sets Z for the bxeq of the next iteration */
258	bpl		.Laligned8\c		/* at least 8 more bytes */
259
260.Ltail\c:				/* 0..7 bytes left, count is in bits 0-2 of ip */
261	tst		ip, #4
262	beq		2f
263	ldr		r3, [r1], #4		/* 4 bytes */
264ARM_BE8(rev		r3, r3		)
265	crc32\c\()w	r0, r0, r3
266
2672:	tst		ip, #2
268	beq		1f
269	ldrh		r3, [r1], #2		/* 2 bytes */
270ARM_BE8(rev16		r3, r3		)
271	crc32\c\()h	r0, r0, r3
272
2731:	tst		ip, #1
274	bxeq		lr
275	ldrb		r3, [r1]		/* last byte */
276	crc32\c\()b	r0, r0, r3
277	bx		lr
278
279.Lunaligned\c:				/* only reached with r2 >= 8, so up to 3 bytes can be consumed safely */
280	tst		r1, #1
281	beq		2f			/* even address: only a halfword is needed */
282	ldrb		r3, [r1], #1		/* consume one byte to make r1 even */
283	subs		r2, r2, #1
284	crc32\c\()b	r0, r0, r3
285
286	tst		r1, #2
287	beq		0f			/* already 4-byte aligned */
2882:	ldrh		r3, [r1], #2		/* consume a halfword to reach 4-byte alignment */
289	subs		r2, r2, #2
290ARM_BE8(rev16		r3, r3		)
291	crc32\c\()h	r0, r0, r3
292
2930:	subs		ip, r2, #8		/* recompute ip; Z = exactly 8 bytes remain */
294	bpl		.Laligned8\c
295	b		.Ltail\c
296	.endm
297
/*
 * crc32_armv8_le - CRC32 using the crc32b/h/w instructions.
 * The body is the __crc32 macro expanded with an empty suffix.
 * In:  r0 = running CRC, r1 = data pointer, r2 = byte count
 * Out: r0 = updated CRC
 * NOTE(review): the C prototype is not visible in this file; presumably
 * (u32 crc, const u8 *buf, size_t len) - confirm against the declaration.
 */
298	.align		5			/* 2^5 = 32-byte aligned entry point */
299SYM_FUNC_START(crc32_armv8_le)
300	__crc32
301SYM_FUNC_END(crc32_armv8_le)
302
/*
 * crc32c_armv8_le - CRC32C using the crc32cb/h/w instructions.
 * The body is the __crc32 macro expanded with the suffix "c".
 * In:  r0 = running CRC, r1 = data pointer, r2 = byte count
 * Out: r0 = updated CRC
 * NOTE(review): the C prototype is not visible in this file; presumably
 * (u32 crc, const u8 *buf, size_t len) - confirm against the declaration.
 */
303	.align		5			/* 2^5 = 32-byte aligned entry point */
304SYM_FUNC_START(crc32c_armv8_le)
305	__crc32		c
306SYM_FUNC_END(crc32c_armv8_le)