/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IP/TCP/UDP checksumming routines
 *
 * Xtensa version:  Copyright (C) 2001 Tensilica, Inc. by Kevin Chea
 *		    Optimized by Joe Taylor
 */

#include <linux/errno.h>
#include <linux/linkage.h>
#include <asm/asmmacro.h>
#include <asm/core.h>

/*
 * computes a partial checksum, e.g. for TCP/UDP fragments
 */

/*
 * unsigned int csum_partial(const unsigned char *buf, int len,
 *			     unsigned int sum);
 *	a2 = buf
 *	a3 = len
 *	a4 = sum
 *
 * This function assumes 2- or 4-byte alignment.  Other alignments will fail!
 */
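/*
 * For reference, a minimal C sketch of the computation (hedged: the
 * function name is illustrative, not a kernel API; assumes the
 * 4-byte-aligned case and a little-endian host):
 *
 *	unsigned int csum_partial_sketch(const unsigned char *buf, int len,
 *					 unsigned int sum)
 *	{
 *		const unsigned int *p = (const unsigned int *)buf;
 *
 *		while (len >= 4) {		// 4-byte chunks
 *			unsigned int w = *p++;
 *			sum += w;
 *			if (sum < w)		// carry out of bit 31 ...
 *				sum++;		// ... is wrapped back in
 *			len -= 4;
 *		}
 *		if (len & 2) {			// trailing 2-byte chunk
 *			unsigned int h = *(const unsigned short *)p;
 *			sum += h;
 *			if (sum < h)
 *				sum++;
 *			p = (const void *)((const char *)p + 2);
 *		}
 *		if (len & 1) {			// trailing byte (low bits on LE)
 *			unsigned int b = *(const unsigned char *)p;
 *			sum += b;
 *			if (sum < b)
 *				sum++;
 *		}
 *		return sum;
 *	}
 */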

/* ONES_ADD converts twos-complement math to ones-complement. */
#define ONES_ADD(sum, val)	  \
	add	sum, sum, val	; \
	bgeu	sum, val, 99f	; \
	addi	sum, sum, 1	; \
99:				;
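/*
 * In C terms, ONES_ADD is roughly the following (illustrative helper,
 * not a kernel API): an unsigned two's-complement add whose carry out
 * of bit 31 is detected by the compare and folded back into bit 0,
 * which is exactly ones'-complement (end-around-carry) addition.
 *
 *	static inline unsigned int ones_add(unsigned int sum, unsigned int val)
 *	{
 *		sum += val;		// add  sum, sum, val
 *		if (sum < val)		// bgeu sum, val, 99f
 *			sum++;		// addi sum, sum, 1
 *		return sum;
 *	}
 */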

.text
ENTRY(csum_partial)

	/*
	 * Experiments with Ethernet and SLIP connections show that buf
	 * is aligned on either a 2-byte or 4-byte boundary.
	 */
	abi_entry_default
	extui	a5, a2, 0, 2
	bnez	a5, 8f		/* branch if 2-byte aligned */
	/* Fall-through on common case, 4-byte alignment */
1:
	srli	a5, a3, 5	/* 32-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a5, 2f
#else
	beqz	a5, 2f
	slli	a5, a5, 5
	add	a5, a5, a2	/* a5 = end of last 32-byte chunk */
.Loop1:
#endif
	l32i	a6, a2, 0
	l32i	a7, a2, 4
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	l32i	a6, a2, 8
	l32i	a7, a2, 12
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	l32i	a6, a2, 16
	l32i	a7, a2, 20
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	l32i	a6, a2, 24
	l32i	a7, a2, 28
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	addi	a2, a2, 4*8
#if !XCHAL_HAVE_LOOPS
	blt	a2, a5, .Loop1
#endif
2:
	extui	a5, a3, 2, 3	/* remaining 4-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a5, 3f
#else
	beqz	a5, 3f
	slli	a5, a5, 2
	add	a5, a5, a2	/* a5 = end of last 4-byte chunk */
.Loop2:
#endif
	l32i	a6, a2, 0
	ONES_ADD(a4, a6)
	addi	a2, a2, 4
#if !XCHAL_HAVE_LOOPS
	blt	a2, a5, .Loop2
#endif
3:
	_bbci.l	a3, 1, 5f	/* remaining 2-byte chunk */
	l16ui	a6, a2, 0
	ONES_ADD(a4, a6)
	addi	a2, a2, 2
5:
	_bbci.l	a3, 0, 7f	/* remaining 1-byte chunk */
6:	l8ui	a6, a2, 0
#ifdef __XTENSA_EB__
	slli	a6, a6, 8	/* load byte into bits 8..15 */
#endif
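	/*
	 * Why the __XTENSA_EB__ shift above: the Internet checksum treats
	 * the buffer as 16-bit big-endian words, and a trailing odd byte
	 * is zero-padded on the right, i.e. it occupies the high half of
	 * the last word.  A hedged C sketch of the same placement (the
	 * macro test below is illustrative, not this file's):
	 *
	 *	unsigned int last_byte_term(unsigned char b)
	 *	{
	 *	#ifdef BIG_ENDIAN_HOST
	 *		return (unsigned int)b << 8;	// byte into bits 8..15
	 *	#else
	 *		return b;	// on LE the whole sum is byte-swapped,
	 *	#endif			// so the byte stays in bits 0..7
	 *	}
	 */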
	ONES_ADD(a4, a6)
7:
	mov	a2, a4
	abi_ret_default

	/* uncommon case, buf is 2-byte aligned */
8:
	beqz	a3, 7b		/* branch if len == 0 */
	beqi	a3, 1, 6b	/* branch if len == 1 */

	extui	a5, a2, 0, 1
	bnez	a5, 8f		/* branch if 1-byte aligned */

	l16ui	a6, a2, 0	/* common case, len >= 2 */
	ONES_ADD(a4, a6)
	addi	a2, a2, 2	/* adjust buf */
	addi	a3, a3, -2	/* adjust len */
	j	1b		/* now buf is 4-byte aligned */

	/* case: odd-byte aligned, len > 1
	 * This case is dog slow, so don't give us an odd address.
	 * (I don't think this ever happens, but just in case.)
	 */
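	/*
	 * A C sketch of the byte-assembly trick used below (illustrative,
	 * little-endian case; big-endian instead shifts the first byte
	 * into bits 24..31): an odd-aligned 32-bit chunk is fetched as
	 * byte + halfword + byte, since a2+1 is even and therefore legal
	 * for a 16-bit load.
	 *
	 *	unsigned int load32_odd_le(const unsigned char *p)
	 *	{
	 *		unsigned int b0 = p[0];			// l8ui  at p
	 *		unsigned int h  = p[1] |
	 *			((unsigned int)p[2] << 8);	// l16ui at p+1
	 *		unsigned int b3 = p[3];			// l8ui  at p+3
	 *
	 *		return b0 | (h << 8) | (b3 << 24);
	 *	}
	 */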
8:
	srli	a5, a3, 2	/* 4-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a5, 2f
#else
	beqz	a5, 2f
	slli	a5, a5, 2
	add	a5, a5, a2	/* a5 = end of last 4-byte chunk */
.Loop3:
#endif
	l8ui	a6, a2, 0	/* bits 24..31 */
	l16ui	a7, a2, 1	/* bits  8..23 */
	l8ui	a8, a2, 3	/* bits  0.. 7 */
#ifdef __XTENSA_EB__
	slli	a6, a6, 24
#else
	slli	a8, a8, 24
#endif
	slli	a7, a7, 8
	or	a7, a7, a6
	or	a7, a7, a8
	ONES_ADD(a4, a7)
	addi	a2, a2, 4
#if !XCHAL_HAVE_LOOPS
	blt	a2, a5, .Loop3
#endif
2:
	_bbci.l	a3, 1, 3f	/* remaining 2-byte chunk, still odd addr */
	l8ui	a6, a2, 0
	l8ui	a7, a2, 1
#ifdef __XTENSA_EB__
	slli	a6, a6, 8
#else
	slli	a7, a7, 8
#endif
	or	a7, a7, a6
	ONES_ADD(a4, a7)
	addi	a2, a2, 2
3:
	j	5b		/* branch to handle the remaining byte */

ENDPROC(csum_partial)
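
/*
 * The 32-bit value returned in a2 is still a partial sum; callers fold
 * it to the final 16-bit Internet checksum.  A minimal sketch of that
 * fold, roughly what the kernel's generic csum_fold() does:
 *
 *	unsigned short csum_fold_sketch(unsigned int sum)
 *	{
 *		sum = (sum & 0xffff) + (sum >> 16);	// add the halves
 *		sum += sum >> 16;			// absorb the carry
 *		return (unsigned short)~sum;		// ones' complement
 *	}
 */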

/*
 * Copy from src while checksumming, otherwise like csum_partial
 */

/*
unsigned int csum_partial_copy_generic (const char *src, char *dst, int len)
	a2  = src
	a3  = dst
	a4  = len
	a5  = sum
	a8  = temp
	a9  = temp
	a10 = temp

    This function is optimized for 4-byte aligned addresses.  Other
    alignments work, but not nearly as efficiently.
 */
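
/*
 * A hedged C sketch of the aligned fast path that follows (illustrative
 * name; no fault handling, unlike the EX()-annotated code below):
 *
 *	unsigned int copy_and_csum_sketch(const unsigned int *src,
 *					  unsigned int *dst, int len,
 *					  unsigned int sum)
 *	{
 *		while (len >= 4) {
 *			unsigned int w = *src++;
 *			*dst++ = w;	// copy ...
 *			sum += w;	// ... and checksum in one pass
 *			if (sum < w)
 *				sum++;	// end-around carry
 *			len -= 4;
 *		}
 *		// 2-byte and 1-byte tails follow the csum_partial pattern
 *		return sum;
 *	}
 */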

ENTRY(csum_partial_copy_generic)

	abi_entry_default
	movi	a5, -1
	or	a10, a2, a3

	/* We optimize the following alignment tests for the 4-byte
	   aligned case.  Two bbsi.l instructions might seem more optimal
	   (commented out below).  However, both labels 5: and 3: are out
	   of the imm8 range, so the assembler relaxes them into
	   equivalent bbci.l, j combinations, which is actually
	   slower. */

	extui	a9, a10, 0, 2
	beqz	a9, 1f		/* branch if both are 4-byte aligned */
	bbsi.l	a10, 0, 5f	/* branch if one address is odd */
	j	3f		/* one address is 2-byte aligned */

/*	_bbsi.l	a10, 0, 5f */	/* branch if odd address */
/*	_bbsi.l	a10, 1, 3f */	/* branch if 2-byte-aligned address */
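	/*
	 * In C terms, the dispatch above is roughly (illustrative labels;
	 * src/dst stand for the addresses in a2/a3):
	 *
	 *	unsigned int both = (unsigned int)src | (unsigned int)dst;
	 *
	 *	if ((both & 3) == 0)
	 *		goto aligned4;		// label 1: word fast path
	 *	else if (both & 1)
	 *		goto byte_loop;		// label 5: 8-bit accesses
	 *	else
	 *		goto halfword_loop;	// label 3: 16-bit accesses
	 */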

1:
	/* src and dst are both 4-byte aligned */
	srli	a10, a4, 5	/* 32-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 2f
#else
	beqz	a10, 2f
	slli	a10, a10, 5
	add	a10, a10, a2	/* a10 = end of last 32-byte src chunk */
.Loop5:
#endif
EX(10f)	l32i	a9, a2, 0
EX(10f)	l32i	a8, a2, 4
EX(10f)	s32i	a9, a3, 0
EX(10f)	s32i	a8, a3, 4
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
EX(10f)	l32i	a9, a2, 8
EX(10f)	l32i	a8, a2, 12
EX(10f)	s32i	a9, a3, 8
EX(10f)	s32i	a8, a3, 12
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
EX(10f)	l32i	a9, a2, 16
EX(10f)	l32i	a8, a2, 20
EX(10f)	s32i	a9, a3, 16
EX(10f)	s32i	a8, a3, 20
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
EX(10f)	l32i	a9, a2, 24
EX(10f)	l32i	a8, a2, 28
EX(10f)	s32i	a9, a3, 24
EX(10f)	s32i	a8, a3, 28
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
	addi	a2, a2, 32
	addi	a3, a3, 32
#if !XCHAL_HAVE_LOOPS
	blt	a2, a10, .Loop5
#endif
2:
	extui	a10, a4, 2, 3	/* remaining 4-byte chunks */
	extui	a4, a4, 0, 2	/* reset len for general-case, 2-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 3f
#else
	beqz	a10, 3f
	slli	a10, a10, 2
	add	a10, a10, a2	/* a10 = end of last 4-byte src chunk */
.Loop6:
#endif
EX(10f)	l32i	a9, a2, 0
EX(10f)	s32i	a9, a3, 0
	ONES_ADD(a5, a9)
	addi	a2, a2, 4
	addi	a3, a3, 4
#if !XCHAL_HAVE_LOOPS
	blt	a2, a10, .Loop6
#endif
3:
	/*
	   Control comes to here in two cases: (1) It may fall through
	   to here from the 4-byte alignment case to process, at most,
	   one 2-byte chunk.  (2) It branches to here from above if
	   either src or dst is 2-byte aligned, and we process all bytes
	   here, except for perhaps a trailing odd byte.  It's
	   inefficient, so align your addresses to 4-byte boundaries.

	   a2 = src
	   a3 = dst
	   a4 = len
	   a5 = sum
	 */
	srli	a10, a4, 1	/* 2-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 4f
#else
	beqz	a10, 4f
	slli	a10, a10, 1
	add	a10, a10, a2	/* a10 = end of last 2-byte src chunk */
.Loop7:
#endif
EX(10f)	l16ui	a9, a2, 0
EX(10f)	s16i	a9, a3, 0
	ONES_ADD(a5, a9)
	addi	a2, a2, 2
	addi	a3, a3, 2
#if !XCHAL_HAVE_LOOPS
	blt	a2, a10, .Loop7
#endif
4:
	/* This section processes a possible trailing odd byte. */
	_bbci.l	a4, 0, 8f	/* 1-byte chunk */
EX(10f)	l8ui	a9, a2, 0
EX(10f)	s8i	a9, a3, 0
#ifdef __XTENSA_EB__
	slli	a9, a9, 8	/* shift byte to bits 8..15 */
#endif
	ONES_ADD(a5, a9)
8:
	mov	a2, a5
	abi_ret_default

5:
	/* Control branches to here when either src or dst is odd.  We
	   process all bytes using 8-bit accesses.  Grossly inefficient,
	   so don't feed us an odd address. */

	srli	a10, a4, 1	/* handle in pairs for 16-bit csum */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 6f
#else
	beqz	a10, 6f
	slli	a10, a10, 1
	add	a10, a10, a2	/* a10 = end of last odd-aligned, 2-byte src chunk */
.Loop8:
#endif
EX(10f)	l8ui	a9, a2, 0
EX(10f)	l8ui	a8, a2, 1
EX(10f)	s8i	a9, a3, 0
EX(10f)	s8i	a8, a3, 1
#ifdef __XTENSA_EB__
	slli	a9, a9, 8	/* combine into a single 16-bit value */
#else				/* for checksum computation */
	slli	a8, a8, 8
#endif
	or	a9, a9, a8
	ONES_ADD(a5, a9)
	addi	a2, a2, 2
	addi	a3, a3, 2
#if !XCHAL_HAVE_LOOPS
	blt	a2, a10, .Loop8
#endif
6:
	j	4b		/* process the possible trailing odd byte */

ENDPROC(csum_partial_copy_generic)


# Exception handler:
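/*
 * Each EX(10f) annotation above records its access in the kernel's
 * exception table, so a faulting user-space access lands here instead
 * of oopsing.  Returning 0 signals the fault to the caller: the sum is
 * seeded with -1 (movi a5, -1 above), so a successful checksum can
 * never legitimately be 0.
 */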
.section .fixup, "ax"
10:
	movi	a2, 0
	abi_ret_default

.previous