/*
 * Source: Linux kernel mirror (for testing)
 * git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
 * Topics: kernel, os, linux
 */
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IP/TCP/UDP checksumming routines
 *
 * Xtensa version:  Copyright (C) 2001 Tensilica, Inc. by Kevin Chea
 *                  Optimized by Joe Taylor
 */

#include <linux/errno.h>
#include <linux/linkage.h>
#include <asm/asmmacro.h>
#include <asm/core.h>

/*
 * Computes a partial checksum, e.g. for TCP/UDP fragments.
 */

/*
 * unsigned int csum_partial(const unsigned char *buf, int len,
 *			     unsigned int sum);
 *    a2 = buf
 *    a3 = len
 *    a4 = sum
 *
 * This function is optimized for 2- or 4-byte aligned buffers.  An odd
 * buffer address is handled by a separate, much slower byte-assembling
 * path, so avoid passing one.
 */

/*
 * ONES_ADD converts twos-complement math to ones-complement.
 *
 * Adds val into sum with end-around carry: after the 32-bit add, an
 * unsigned result smaller than val means the add wrapped, so the lost
 * carry is added back as 1.
 *
 * sum and val must be different registers; if they alias, the carry test
 * compares the result with itself and never adds the carry.
 * Defines numeric local label 99 on each expansion.
 */
#define ONES_ADD(sum, val)	  \
	add	sum, sum, val	; \
	bgeu	sum, val, 99f	; \
	addi	sum, sum, 1	; \
99:				;

.text
ENTRY(csum_partial)

	/*
	 * In:      a2 = buf, a3 = len, a4 = running sum
	 * Out:     a2 = updated 32-bit ones-complement sum
	 * Scratch: a5 (chunk count / end pointer), a6-a8 (loaded data)
	 *
	 * Experiments with Ethernet and SLIP connections show that buf
	 * is aligned on either a 2-byte or 4-byte boundary.
	 */
	abi_entry_default
	extui	a5, a2, 0, 2
	bnez	a5, 8f		/* branch if not 4-byte aligned */
	/* Fall-through on common case, 4-byte alignment */
1:
	srli	a5, a3, 5	/* 32-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a5, 2f
#else
	beqz	a5, 2f
	slli	a5, a5, 5
	add	a5, a5, a2	/* a5 = end of last 32-byte chunk */
.Loop1:
#endif
	l32i	a6, a2, 0
	l32i	a7, a2, 4
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	l32i	a6, a2, 8
	l32i	a7, a2, 12
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	l32i	a6, a2, 16
	l32i	a7, a2, 20
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	l32i	a6, a2, 24
	l32i	a7, a2, 28
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	addi	a2, a2, 4*8
#if !XCHAL_HAVE_LOOPS
	/* Pointers are unsigned: a signed compare would end the loop early
	 * for a buffer that straddles address 0x80000000. */
	bltu	a2, a5, .Loop1
#endif
2:
	extui	a5, a3, 2, 3	/* remaining 4-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a5, 3f
#else
	beqz	a5, 3f
	slli	a5, a5, 2
	add	a5, a5, a2	/* a5 = end of last 4-byte chunk */
.Loop2:
#endif
	l32i	a6, a2, 0
	ONES_ADD(a4, a6)
	addi	a2, a2, 4
#if !XCHAL_HAVE_LOOPS
	bltu	a2, a5, .Loop2	/* unsigned address compare */
#endif
3:
	_bbci.l	a3, 1, 5f	/* remaining 2-byte chunk */
	l16ui	a6, a2, 0
	ONES_ADD(a4, a6)
	addi	a2, a2, 2
5:
	_bbci.l	a3, 0, 7f	/* remaining 1-byte chunk */
6:	l8ui	a6, a2, 0
#ifdef __XTENSA_EB__
	slli	a6, a6, 8	/* load byte into bits 8..15 */
#endif
	ONES_ADD(a4, a6)
7:
	mov	a2, a4		/* return the accumulated sum */
	abi_ret_default

	/* uncommon case, buf is not 4-byte aligned */
8:
	beqz	a3, 7b		/* branch if len == 0 */
	beqi	a3, 1, 6b	/* branch if len == 1 */

	extui	a5, a2, 0, 1
	bnez	a5, 8f		/* branch if 1-byte aligned */

	l16ui	a6, a2, 0	/* common case, len >= 2 */
	ONES_ADD(a4, a6)
	addi	a2, a2, 2	/* adjust buf */
	addi	a3, a3, -2	/* adjust len */
	j	1b		/* now buf is 4-byte aligned */

	/* case: odd-byte aligned, len > 1
	 * This case is dog slow, so don't give us an odd address.
	 * (I don't think this ever happens, but just in case.)
	 */
8:
	srli	a5, a3, 2	/* 4-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a5, 2f
#else
	beqz	a5, 2f
	slli	a5, a5, 2
	add	a5, a5, a2	/* a5 = end of last 4-byte chunk */
.Loop3:
#endif
	/*
	 * Assemble one 32-bit word in memory byte order from an odd
	 * address: byte 0, the (2-byte aligned) halfword at bytes 1..2,
	 * and byte 3.  The halfword always lands in bits 8..23; which of
	 * the two single bytes is moved to bits 24..31 depends on the
	 * endianness, the other one stays in bits 0..7.
	 */
	l8ui	a6, a2, 0	/* byte 0 */
	l16ui	a7, a2, 1	/* bytes 1..2 -> bits 8..23 */
	l8ui	a8, a2, 3	/* byte 3 */
#ifdef __XTENSA_EB__
	slli	a6, a6, 24	/* big-endian: byte 0 -> bits 24..31 */
#else
	slli	a8, a8, 24	/* little-endian: byte 3 -> bits 24..31 */
#endif
	slli	a7, a7, 8
	or	a7, a7, a6
	or	a7, a7, a8
	ONES_ADD(a4, a7)
	addi	a2, a2, 4
#if !XCHAL_HAVE_LOOPS
	bltu	a2, a5, .Loop3	/* unsigned address compare */
#endif
2:
	_bbci.l	a3, 1, 3f	/* remaining 2-byte chunk, still odd addr */
	l8ui	a6, a2, 0
	l8ui	a7, a2, 1
#ifdef __XTENSA_EB__
	slli	a6, a6, 8
#else
	slli	a7, a7, 8
#endif
	or	a7, a7, a6
	ONES_ADD(a4, a7)
	addi	a2, a2, 2
3:
	j	5b		/* branch to handle the remaining byte */

ENDPROC(csum_partial)
EXPORT_SYMBOL(csum_partial)

/*
 * Copy from src to dst while checksumming, otherwise like csum_partial
 */

/*
unsigned int csum_partial_copy_generic (const char *src, char *dst, int len)
	a2  = src
	a3  = dst
	a4  = len
	a5  = sum (not an argument; initialized to -1 on entry)
	a8  = temp
	a9  = temp
	a10 = temp

    This function is optimized for 4-byte aligned addresses.  Other
    alignments work, but not nearly as efficiently.
 */

ENTRY(csum_partial_copy_generic)

	/*
	 * In:      a2 = src, a3 = dst, a4 = len
	 * Out:     a2 = 32-bit ones-complement sum of the copied bytes
	 * Scratch: a5 (sum), a8/a9 (data), a10 (alignment bits, chunk
	 *          count / end pointer)
	 *
	 * Every load and store is wrapped in EX(10f).  EX is defined
	 * outside this file; presumably it records an exception-table
	 * entry so that a faulting access resumes at label 10 in the
	 * .fixup section, which returns 0 -- confirm in asm/asmmacro.h.
	 */
	abi_entry_default
	movi	a5, -1		/* initial sum; NOTE(review): presumably
				   chosen so a good result is never 0,
				   the value the fixup path returns */
	or	a10, a2, a3	/* low bits set if either pointer is unaligned */

	/* We optimize the following alignment tests for the 4-byte
	aligned case.  Two bbsi.l instructions might seem more optimal
	(commented out below).  However, both labels 5: and 3: are out
	of the imm8 range, so the assembler relaxes them into
	equivalent bbci.l, j combinations, which is actually
	slower. */

	extui	a9, a10, 0, 2
	beqz	a9, 1f		/* branch if both are 4-byte aligned */
	bbsi.l	a10, 0, 5f	/* branch if one address is odd */
	j	3f		/* one address is 2-byte aligned */

/*	_bbsi.l	a10, 0, 5f */	/* branch if odd address */
/*	_bbsi.l	a10, 1, 3f */	/* branch if 2-byte-aligned address */

1:
	/* src and dst are both 4-byte aligned */
	srli	a10, a4, 5	/* 32-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 2f
#else
	beqz	a10, 2f
	slli	a10, a10, 5
	add	a10, a10, a2	/* a10 = end of last 32-byte src chunk */
.Loop5:
#endif
EX(10f)	l32i	a9, a2, 0
EX(10f)	l32i	a8, a2, 4
EX(10f)	s32i	a9, a3, 0
EX(10f)	s32i	a8, a3, 4
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
EX(10f)	l32i	a9, a2, 8
EX(10f)	l32i	a8, a2, 12
EX(10f)	s32i	a9, a3, 8
EX(10f)	s32i	a8, a3, 12
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
EX(10f)	l32i	a9, a2, 16
EX(10f)	l32i	a8, a2, 20
EX(10f)	s32i	a9, a3, 16
EX(10f)	s32i	a8, a3, 20
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
EX(10f)	l32i	a9, a2, 24
EX(10f)	l32i	a8, a2, 28
EX(10f)	s32i	a9, a3, 24
EX(10f)	s32i	a8, a3, 28
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
	addi	a2, a2, 32
	addi	a3, a3, 32
#if !XCHAL_HAVE_LOOPS
	/* Pointers are unsigned: a signed compare would end the loop early
	 * for a buffer that straddles address 0x80000000. */
	bltu	a2, a10, .Loop5
#endif
2:
	extui	a10, a4, 2, 3	/* remaining 4-byte chunks */
	extui	a4, a4, 0, 2	/* reset len for general-case, 2-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 3f
#else
	beqz	a10, 3f
	slli	a10, a10, 2
	add	a10, a10, a2	/* a10 = end of last 4-byte src chunk */
.Loop6:
#endif
EX(10f)	l32i	a9, a2, 0
EX(10f)	s32i	a9, a3, 0
	ONES_ADD(a5, a9)
	addi	a2, a2, 4
	addi	a3, a3, 4
#if !XCHAL_HAVE_LOOPS
	bltu	a2, a10, .Loop6	/* unsigned address compare */
#endif
3:
	/*
	Control comes to here in two cases: (1) It may fall through
	to here from the 4-byte alignment case to process, at most,
	one 2-byte chunk.  (2) It branches to here from above if
	either src or dst is 2-byte aligned, and we process all bytes
	here, except for perhaps a trailing odd byte.  It's
	inefficient, so align your addresses to 4-byte boundaries.

	a2 = src
	a3 = dst
	a4 = len
	a5 = sum
	*/
	srli	a10, a4, 1	/* 2-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 4f
#else
	beqz	a10, 4f
	slli	a10, a10, 1
	add	a10, a10, a2	/* a10 = end of last 2-byte src chunk */
.Loop7:
#endif
EX(10f)	l16ui	a9, a2, 0
EX(10f)	s16i	a9, a3, 0
	ONES_ADD(a5, a9)
	addi	a2, a2, 2
	addi	a3, a3, 2
#if !XCHAL_HAVE_LOOPS
	bltu	a2, a10, .Loop7	/* unsigned address compare */
#endif
4:
	/* This section processes a possible trailing odd byte. */
	_bbci.l	a4, 0, 8f	/* 1-byte chunk */
EX(10f)	l8ui	a9, a2, 0
EX(10f)	s8i	a9, a3, 0
#ifdef __XTENSA_EB__
	slli	a9, a9, 8	/* shift byte to bits 8..15 */
#endif
	ONES_ADD(a5, a9)
8:
	mov	a2, a5		/* return the accumulated sum */
	abi_ret_default

5:
	/* Control branches to here when either src or dst is odd.  We
	process all bytes using 8-bit accesses.  Grossly inefficient,
	so don't feed us an odd address. */

	srli	a10, a4, 1	/* handle in pairs for 16-bit csum */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 6f
#else
	beqz	a10, 6f
	slli	a10, a10, 1
	add	a10, a10, a2	/* a10 = end of last odd-aligned, 2-byte src chunk */
.Loop8:
#endif
EX(10f)	l8ui	a9, a2, 0
EX(10f)	l8ui	a8, a2, 1
EX(10f)	s8i	a9, a3, 0
EX(10f)	s8i	a8, a3, 1
#ifdef __XTENSA_EB__
	slli	a9, a9, 8	/* combine into a single 16-bit value */
#else				/* for checksum computation */
	slli	a8, a8, 8
#endif
	or	a9, a9, a8
	ONES_ADD(a5, a9)
	addi	a2, a2, 2
	addi	a3, a3, 2
#if !XCHAL_HAVE_LOOPS
	bltu	a2, a10, .Loop8	/* unsigned address compare */
#endif
6:
	j	4b		/* process the possible trailing odd byte */

ENDPROC(csum_partial_copy_generic)
EXPORT_SYMBOL(csum_partial_copy_generic)


/*
 * Exception handler: target of the EX(10f) annotations on the loads and
 * stores in csum_partial_copy_generic.  Returns 0 in a2 to the caller of
 * that function.
 */
.section .fixup, "ax"
10:
	movi	a2, 0
	abi_ret_default

.previous