arch/i386/lib/checksum.S at v2.6.12

tjh.dev / kernel
fork
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
fork
kernel / arch / i386 / lib / checksum.S
at v2.6.12 496 lines 10 kB view raw
wrap content
  1/*
  2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
  3 *		operating system.  INET is implemented using the  BSD Socket
  4 *		interface as the means of communication with the user level.
  5 *
  6 *		IP/TCP/UDP checksumming routines
  7 *
  8 * Authors:	Jorge Cwik, <jorge@laser.satlink.net>
  9 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 10 *		Tom May, <ftom@netcom.com>
 11 *              Pentium Pro/II routines:
 12 *              Alexander Kjeldaas <astor@guardian.no>
 13 *              Finn Arne Gangstad <finnag@guardian.no>
 14 *		Lots of code moved from tcp.c and ip.c; see those files
 15 *		for more names.
 16 *
 17 * Changes:     Ingo Molnar, converted csum_partial_copy() to 2.1 exception
 18 *			     handling.
 19 *		Andi Kleen,  add zeroing on error
 20 *                   converted to pure assembler
 21 *
 22 *		This program is free software; you can redistribute it and/or
 23 *		modify it under the terms of the GNU General Public License
 24 *		as published by the Free Software Foundation; either version
 25 *		2 of the License, or (at your option) any later version.
 26 */
 27
 28#include <linux/config.h>
 29#include <asm/errno.h>
 30				
 31/*
 32 * computes a partial checksum, e.g. for TCP/UDP fragments
 33 */
 34
 35/*	
 36unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum)
 37 */
 38		
 39.text
 40.align 4
 41.globl csum_partial								
 42		
 43#ifndef CONFIG_X86_USE_PPRO_CHECKSUM
 44
 45	  /*		
 46	   * Experiments with Ethernet and SLIP connections show that buff
 47	   * is aligned on either a 2-byte or 4-byte boundary.  We get at
 48	   * least a twofold speedup on 486 and Pentium if it is 4-byte aligned.
 49	   * Fortunately, it is easy to convert 2-byte alignment to 4-byte
 50	   * alignment for the unrolled loop.
 51	   */		
 52csum_partial:	
 53	pushl %esi
 54	pushl %ebx
 55	movl 20(%esp),%eax	# Function arg: unsigned int sum
 56	movl 16(%esp),%ecx	# Function arg: int len
 57	movl 12(%esp),%esi	# Function arg: unsigned char *buff
 58	testl $3, %esi		# Check alignment.
 59	jz 2f			# Jump if alignment is ok.
 60	testl $1, %esi		# Check alignment.
 61	jz 10f			# Jump if alignment is boundary of 2bytes.
 62
 63	# buf is odd
 64	dec %ecx
 65	jl 8f
 66	movzbl (%esi), %ebx
 67	adcl %ebx, %eax
 68	roll $8, %eax
 69	inc %esi
 70	testl $2, %esi
 71	jz 2f
 7210:
 73	subl $2, %ecx		# Alignment uses up two bytes.
 74	jae 1f			# Jump if we had at least two bytes.
 75	addl $2, %ecx		# ecx was < 2.  Deal with it.
 76	jmp 4f
 771:	movw (%esi), %bx
 78	addl $2, %esi
 79	addw %bx, %ax
 80	adcl $0, %eax
 812:
 82	movl %ecx, %edx
 83	shrl $5, %ecx
 84	jz 2f
 85	testl %esi, %esi
 861:	movl (%esi), %ebx
 87	adcl %ebx, %eax
 88	movl 4(%esi), %ebx
 89	adcl %ebx, %eax
 90	movl 8(%esi), %ebx
 91	adcl %ebx, %eax
 92	movl 12(%esi), %ebx
 93	adcl %ebx, %eax
 94	movl 16(%esi), %ebx
 95	adcl %ebx, %eax
 96	movl 20(%esi), %ebx
 97	adcl %ebx, %eax
 98	movl 24(%esi), %ebx
 99	adcl %ebx, %eax
100	movl 28(%esi), %ebx
101	adcl %ebx, %eax
102	lea 32(%esi), %esi
103	dec %ecx
104	jne 1b
105	adcl $0, %eax
1062:	movl %edx, %ecx
107	andl $0x1c, %edx
108	je 4f
109	shrl $2, %edx		# This clears CF
1103:	adcl (%esi), %eax
111	lea 4(%esi), %esi
112	dec %edx
113	jne 3b
114	adcl $0, %eax
1154:	andl $3, %ecx
116	jz 7f
117	cmpl $2, %ecx
118	jb 5f
119	movw (%esi),%cx
120	leal 2(%esi),%esi
121	je 6f
122	shll $16,%ecx
1235:	movb (%esi),%cl
1246:	addl %ecx,%eax
125	adcl $0, %eax 
1267:	
127	testl $1, 12(%esp)
128	jz 8f
129	roll $8, %eax
1308:
131	popl %ebx
132	popl %esi
133	ret
134
135#else
136
137/* Version for PentiumII/PPro */
138
 139csum_partial:
	/*
	 * PentiumII/PPro variant of csum_partial(buf, len, sum).
	 *
	 * Stack arguments after the two pushes: 12(%esp) = buf,
	 * 16(%esp) = len, 20(%esp) = sum.  The 32-bit running sum is kept in
	 * %eax and returned there.  %esi and %ebx are saved and restored;
	 * %ecx and %edx are used as scratch.
	 *
	 * The bulk of the work is one unrolled table of 32 memory-operand
	 * adds (labels 40..45) covering 128 bytes; a partial block is
	 * handled by jumping into the middle of that table.
	 */
 140	pushl %esi
 141	pushl %ebx
 142	movl 20(%esp),%eax	# Function arg: unsigned int sum
 143	movl 16(%esp),%ecx	# Function arg: int len
 144	movl 12(%esp),%esi	# Function arg:	const unsigned char *buf
 145
 146	testl $3, %esi         
 147	jnz 25f                 
 14810:
	/*
	 * %esi is 4-byte aligned here.
	 *   %edx = remaining len (kept for the 1-3 byte tail)
	 *   %ebx = len & 0x7c  -> bytes of the leading partial block
	 *   %ecx = len >> 7    -> number of full 128-byte blocks
	 * %esi is advanced past the partial block, so the last (%ebx / 4)
	 * table entries, which use negative offsets, cover exactly it.
	 * The jump target is 45f - 3 * (%ebx / 4); the factor 3 assumes
	 * every table entry occupies 3 bytes.
	 * testl clears CF so the first adcl executed adds no stray carry.
	 */
 149	movl %ecx, %edx
 150	movl %ecx, %ebx
 151	andl $0x7c, %ebx
 152	shrl $7, %ecx
 153	addl %ebx,%esi
 154	shrl $2, %ebx  
 155	negl %ebx
 156	lea 45f(%ebx,%ebx,2), %ebx
 157	testl %esi, %esi
 158	jmp *%ebx
 159
 160	# Handle 2-byte-aligned regions
 16120:	addw (%esi), %ax
 162	lea 2(%esi), %esi
 163	adcl $0, %eax
 164	jmp 10b
 16525:
 166	testl $1, %esi         
 167	jz 30f                 
 168	# buf is odd
	/*
	 * Consume one byte, fold the carry, then rotate the sum by 8 so
	 * later aligned adds use the right byte lanes; label 80 rotates
	 * again when the original buf was odd.  len == 0 exits via 90
	 * with sum unchanged.
	 */
 169	dec %ecx
 170	jl 90f
 171	movzbl (%esi), %ebx
 172	addl %ebx, %eax
 173	adcl $0, %eax
 174	roll $8, %eax
 175	inc %esi
 176	testl $2, %esi
 177	jz 10b
 178
	/*
	 * %esi is 2-byte but not 4-byte aligned.  With more than 2 bytes
	 * left, add one word at 20 and continue aligned; with exactly 2,
	 * 1 or 0 bytes left, finish here and go straight to 80.
	 */
 17930:	subl $2, %ecx          
 180	ja 20b                 
 181	je 32f
 182	addl $2, %ecx
 183	jz 80f
 184	movzbl (%esi),%ebx	# csumming 1 byte, 2-aligned
 185	addl %ebx, %eax
 186	adcl $0, %eax
 187	jmp 80f
 18832:
 189	addw (%esi), %ax	# csumming 2 bytes, 2-aligned
 190	adcl $0, %eax
 191	jmp 80f
 192
	/*
	 * 128-byte add table.  Entered at 40 for a full block, or part-way
	 * through via the computed jump above.  Do not insert anything
	 * between the entries: the computed jump counts on their layout.
	 */
 19340: 
 194	addl -128(%esi), %eax
 195	adcl -124(%esi), %eax
 196	adcl -120(%esi), %eax
 197	adcl -116(%esi), %eax   
 198	adcl -112(%esi), %eax   
 199	adcl -108(%esi), %eax
 200	adcl -104(%esi), %eax
 201	adcl -100(%esi), %eax
 202	adcl -96(%esi), %eax
 203	adcl -92(%esi), %eax
 204	adcl -88(%esi), %eax
 205	adcl -84(%esi), %eax
 206	adcl -80(%esi), %eax
 207	adcl -76(%esi), %eax
 208	adcl -72(%esi), %eax
 209	adcl -68(%esi), %eax
 210	adcl -64(%esi), %eax     
 211	adcl -60(%esi), %eax     
 212	adcl -56(%esi), %eax     
 213	adcl -52(%esi), %eax   
 214	adcl -48(%esi), %eax   
 215	adcl -44(%esi), %eax
 216	adcl -40(%esi), %eax
 217	adcl -36(%esi), %eax
 218	adcl -32(%esi), %eax
 219	adcl -28(%esi), %eax
 220	adcl -24(%esi), %eax
 221	adcl -20(%esi), %eax
 222	adcl -16(%esi), %eax
 223	adcl -12(%esi), %eax
 224	adcl -8(%esi), %eax
 225	adcl -4(%esi), %eax
 22645:
	/*
	 * Step %esi to the end of the next block (lea keeps CF), fold the
	 * pending carry, and loop while full blocks remain.  On exit %esi
	 * is 128 bytes past the first unprocessed byte, which is why the
	 * tail below reads -128(%esi).
	 */
 227	lea 128(%esi), %esi
 228	adcl $0, %eax
 229	dec %ecx
 230	jge 40b
 231	movl %edx, %ecx
 23250:	andl $3, %ecx
 233	jz 80f
 234
 235	# Handle the last 1-3 bytes without jumping
	/*
	 * Builds a byte mask of 0xff, 0xffff or 0xffffff for 1, 2 or 3
	 * trailing bytes and applies it to a whole dword load.  The load
	 * therefore touches up to 3 bytes beyond len within the same
	 * aligned dword.
	 */
 236	notl %ecx		# 1->2, 2->1, 3->0, higher bits are masked
 237	movl $0xffffff,%ebx	# by the shll and shrl instructions
 238	shll $3,%ecx
 239	shrl %cl,%ebx
 240	andl -128(%esi),%ebx	# esi is 4-aligned so should be ok
 241	addl %ebx,%eax
 242	adcl $0,%eax
 24380: 
	/* Undo the rotation from the odd-address prologue, if it ran. */
 244	testl $1, 12(%esp)
 245	jz 90f
 246	roll $8, %eax
 24790: 
 248	popl %ebx
 249	popl %esi
 250	ret
251				
252#endif
253
254/*
255unsigned int csum_partial_copy_generic (const char *src, char *dst,
256				  int len, int sum, int *src_err_ptr, int *dst_err_ptr)
257 */ 
258
259/*
260 * Copy from ds while checksumming, otherwise like csum_partial
261 *
262 * The macros SRC and DST specify the type of access for the instruction.
263 * thus we can call a custom exception handler for all access types.
264 *
265 * FIXME: could someone double-check whether I haven't mixed up some SRC and
266 *	  DST definitions? It's damn hard to trigger all cases.  I hope I got
267 *	  them all but there's no guarantee.
268 */
 269
/*
 * SRC(insn) and DST(insn) each emit "insn" under the local label 9999 and
 * then add one entry to the __ex_table section pairing the address of that
 * instruction with a fixup label: 6001 for SRC (source access) and 6002 for
 * DST (destination access).  The "f" suffix means the nearest following
 * definition of 6001/6002, so every function that uses these macros has to
 * define both labels after its last wrapped instruction.  .previous switches
 * back to the section that was active before the macro.
 */
 270#define SRC(y...)			\
 271	9999: y;			\
 272	.section __ex_table, "a";	\
 273	.long 9999b, 6001f	;	\
 274	.previous
 275
 276#define DST(y...)			\
 277	9999: y;			\
 278	.section __ex_table, "a";	\
 279	.long 9999b, 6002f	;	\
 280	.previous
281
282.align 4
283.globl csum_partial_copy_generic
284				
285#ifndef CONFIG_X86_USE_PPRO_CHECKSUM
286
 287#define ARGBASE 16		
 288#define FP		12
 289		
 290csum_partial_copy_generic:
	/*
	 * 486/Pentium variant: copy len bytes from src to dst while adding
	 * them into the 32-bit sum, which is returned in %eax.
	 *
	 * Frame after the prologue (one scratch dword plus three pushes):
	 *   FP(%esp)         scratch slot holding the remaining length
	 *   ARGBASE+4(%esp)  src            ARGBASE+8(%esp)   dst
	 *   ARGBASE+12(%esp) len            ARGBASE+16(%esp)  sum
	 *   ARGBASE+20(%esp) src_err_ptr    ARGBASE+24(%esp)  dst_err_ptr
	 *
	 * %esi = src cursor, %edi = dst cursor, %ebx/%edx = data temporaries,
	 * %ecx = counter.  Every memory access through src or dst is wrapped
	 * in SRC()/DST() so a fault is redirected to label 6001 or 6002.
	 *
	 * Only bit 1 of dst is examined for alignment; nothing here treats
	 * an odd dst or the alignment of src specially.
	 */
 291	subl  $4,%esp	
 292	pushl %edi
 293	pushl %esi
 294	pushl %ebx
 295	movl ARGBASE+16(%esp),%eax	# sum
 296	movl ARGBASE+12(%esp),%ecx	# len
 297	movl ARGBASE+4(%esp),%esi	# src
 298	movl ARGBASE+8(%esp),%edi	# dst
 299
 300	testl $2, %edi			# Check alignment. 
 301	jz 2f				# Jump if alignment is ok.
 302	subl $2, %ecx			# Alignment uses up two bytes.
 303	jae 1f				# Jump if we had at least two bytes.
 304	addl $2, %ecx			# ecx was < 2.  Deal with it.
 305	jmp 4f
 306SRC(1:	movw (%esi), %bx	)
 307	addl $2, %esi
 308DST(	movw %bx, (%edi)	)
 309	addl $2, %edi
 310	addw %bx, %ax	
 311	adcl $0, %eax
 3122:
	/*
	 * Save the remaining length in the scratch slot, then copy and sum
	 * 32 bytes per iteration.  testl clears CF before the first adcl;
	 * the mov/lea/dec instructions in the loop leave CF untouched.
	 */
 313	movl %ecx, FP(%esp)
 314	shrl $5, %ecx
 315	jz 2f
 316	testl %esi, %esi
 317SRC(1:	movl (%esi), %ebx	)
 318SRC(	movl 4(%esi), %edx	)
 319	adcl %ebx, %eax
 320DST(	movl %ebx, (%edi)	)
 321	adcl %edx, %eax
 322DST(	movl %edx, 4(%edi)	)
 323
 324SRC(	movl 8(%esi), %ebx	)
 325SRC(	movl 12(%esi), %edx	)
 326	adcl %ebx, %eax
 327DST(	movl %ebx, 8(%edi)	)
 328	adcl %edx, %eax
 329DST(	movl %edx, 12(%edi)	)
 330
 331SRC(	movl 16(%esi), %ebx 	)
 332SRC(	movl 20(%esi), %edx	)
 333	adcl %ebx, %eax
 334DST(	movl %ebx, 16(%edi)	)
 335	adcl %edx, %eax
 336DST(	movl %edx, 20(%edi)	)
 337
 338SRC(	movl 24(%esi), %ebx	)
 339SRC(	movl 28(%esi), %edx	)
 340	adcl %ebx, %eax
 341DST(	movl %ebx, 24(%edi)	)
 342	adcl %edx, %eax
 343DST(	movl %edx, 28(%edi)	)
 344
 345	lea 32(%esi), %esi
 346	lea 32(%edi), %edi
 347	dec %ecx
 348	jne 1b
 349	adcl $0, %eax
	/*
	 * Remaining whole dwords: (len & 0x1c) / 4, one per iteration.
	 * %ecx gets the saved length back for the 1-3 byte tail.
	 */
 3502:	movl FP(%esp), %edx
 351	movl %edx, %ecx
 352	andl $0x1c, %edx
 353	je 4f
 354	shrl $2, %edx			# This clears CF
 355SRC(3:	movl (%esi), %ebx	)
 356	adcl %ebx, %eax
 357DST(	movl %ebx, (%edi)	)
 358	lea 4(%esi), %esi
 359	lea 4(%edi), %edi
 360	dec %edx
 361	jne 3b
 362	adcl $0, %eax
	/*
	 * Tail of 1-3 bytes.  The bytes are gathered in %ecx (word in the
	 * upper half and last byte in %cl when there are three) and added
	 * once.  leal preserves the flags from cmpl for the je below.
	 */
 3634:	andl $3, %ecx
 364	jz 7f
 365	cmpl $2, %ecx
 366	jb 5f
 367SRC(	movw (%esi), %cx	)
 368	leal 2(%esi), %esi
 369DST(	movw %cx, (%edi)	)
 370	leal 2(%edi), %edi
 371	je 6f
 372	shll $16,%ecx
 373SRC(5:	movb (%esi), %cl	)
 374DST(	movb %cl, (%edi)	)
 3756:	addl %ecx, %eax
 376	adcl $0, %eax
 3777:
 3785000:
	/*
	 * 5000 is the common exit.  The fixup code that follows is
	 * assembled into the .fixup section, so in .text the normal path
	 * falls from here directly into the pops after .previous.
	 */
 379
 380# Exception handler:
 381.section .fixup, "ax"							
 382
 3836001:
	/*
	 * Fault on a source access: store -EFAULT through src_err_ptr,
	 * zero all len bytes of dst starting from its original address,
	 * and leave with %eax = 0 as the returned sum.
	 */
 384	movl ARGBASE+20(%esp), %ebx	# src_err_ptr
 385	movl $-EFAULT, (%ebx)
 386
 387	# zero the complete destination - computing the rest
 388	# is too much work 
 389	movl ARGBASE+8(%esp), %edi	# dst
 390	movl ARGBASE+12(%esp), %ecx	# len
 391	xorl %eax,%eax
 392	rep ; stosb
 393
 394	jmp 5000b
 395
 3966002:
	/*
	 * Fault on a destination access: store -EFAULT through
	 * dst_err_ptr and leave; %eax keeps whatever had been summed
	 * up to the faulting store.
	 */
 397	movl ARGBASE+24(%esp), %ebx	# dst_err_ptr
 398	movl $-EFAULT,(%ebx)
 399	jmp 5000b
 400
 401.previous
 402
	/* Epilogue: restore callee-saved registers and drop the scratch slot. */
 403	popl %ebx
 404	popl %esi
 405	popl %edi
 406	popl %ecx			# equivalent to addl $4,%esp
 407	ret	
408
409#else
410
411/* Version for PentiumII/PPro */
 412
/*
 * ROUND1(x) / ROUND(x): copy one dword from x(%esi) to x(%edi) through %ebx
 * and add it to the running sum in %eax.  ROUND1 uses addl, so it starts a
 * fresh carry chain; ROUND uses adcl and continues the chain.  The load is
 * wrapped in SRC() and the store in DST() so faults reach the fixup labels.
 *
 * The computed jump in csum_partial_copy_generic scales its offset by 8 per
 * dword, i.e. it assumes each expansion occupies 8 bytes of code; keep the
 * three-instruction shape if these macros are ever touched.
 */
 413#define ROUND1(x) \
 414	SRC(movl x(%esi), %ebx	)	;	\
 415	addl %ebx, %eax			;	\
 416	DST(movl %ebx, x(%edi)	)	; 
 417
 418#define ROUND(x) \
 419	SRC(movl x(%esi), %ebx	)	;	\
 420	adcl %ebx, %eax			;	\
 421	DST(movl %ebx, x(%edi)	)	;
422
 423#define ARGBASE 12
 424		
 425csum_partial_copy_generic:
	/*
	 * PentiumII/PPro variant: copy len bytes from src to dst while
	 * adding them into the 32-bit sum, returned in %eax.
	 *
	 * Frame after the three pushes:
	 *   ARGBASE+4(%esp)  src            ARGBASE+8(%esp)   dst
	 *   ARGBASE+12(%esp) len            ARGBASE+16(%esp)  sum
	 *   ARGBASE+20(%esp) src_err_ptr    ARGBASE+24(%esp)  dst_err_ptr
	 *
	 * No alignment prologue is performed in this variant.  The body is
	 * one 64-byte unrolled block of ROUND macros; a partial block is
	 * handled by jumping into the middle of it.
	 */
 426	pushl %ebx
 427	pushl %edi
 428	pushl %esi
 429	movl ARGBASE+4(%esp),%esi	#src
 430	movl ARGBASE+8(%esp),%edi	#dst	
 431	movl ARGBASE+12(%esp),%ecx	#len
 432	movl ARGBASE+16(%esp),%eax	#sum
 433#	movl %ecx, %edx  
	/*
	 *   %ecx = len >> 6        -> number of full 64-byte blocks
	 *   %ebx = -(len & 0x3c)   -> minus the bytes of the partial block
	 * %esi and %edi are advanced past the partial block so the ROUND
	 * entries with negative offsets cover it.  The jump target is
	 * 3f + 2 * %ebx, i.e. 8 code bytes back per dword to process.
	 * testl clears CF before the first ROUND executed.
	 *
	 * Note: the "movl %esi, %edx" below is overwritten by the later
	 * "lea -1(%esi),%edx" without %edx being read in between.
	 */
 434	movl %ecx, %ebx  
 435	movl %esi, %edx
 436	shrl $6, %ecx     
 437	andl $0x3c, %ebx  
 438	negl %ebx
 439	subl %ebx, %esi  
 440	subl %ebx, %edi  
 441	lea  -1(%esi),%edx
 442	andl $-32,%edx
 443	lea 3f(%ebx,%ebx), %ebx
 444	testl %esi, %esi 
 445	jmp *%ebx
	/*
	 * Full-block loop.  The two movb loads through %edx (a 32-byte
	 * aligned address, stepped by 64 per block) put values in %bl that
	 * ROUND1 immediately overwrites.
	 * NOTE(review): presumably they exist only to touch the source
	 * cache lines ahead of the copy - confirm before changing.
	 */
 4461:	addl $64,%esi
 447	addl $64,%edi 
 448	SRC(movb -32(%edx),%bl)	; SRC(movb (%edx),%bl)
 449	ROUND1(-64) ROUND(-60) ROUND(-56) ROUND(-52)	
 450	ROUND (-48) ROUND(-44) ROUND(-40) ROUND(-36)	
 451	ROUND (-32) ROUND(-28) ROUND(-24) ROUND(-20)	
 452	ROUND (-16) ROUND(-12) ROUND(-8)  ROUND(-4)	
 4533:	adcl $0,%eax
 454	addl $64, %edx
 455	dec %ecx
 456	jge 1b
	/*
	 * Tail of 1-3 bytes, gathered in %edx (word in the upper half and
	 * last byte in %dl when there are three) and added once.  leal
	 * preserves the flags from cmpl for the je below.
	 */
 4574:	movl ARGBASE+12(%esp),%edx	#len
 458	andl $3, %edx
 459	jz 7f
 460	cmpl $2, %edx
 461	jb 5f
 462SRC(	movw (%esi), %dx         )
 463	leal 2(%esi), %esi
 464DST(	movw %dx, (%edi)         )
 465	leal 2(%edi), %edi
 466	je 6f
 467	shll $16,%edx
 4685:
 469SRC(	movb (%esi), %dl         )
 470DST(	movb %dl, (%edi)         )
 4716:	addl %edx, %eax
 472	adcl $0, %eax
 4737:
	/*
	 * 7 is the common exit.  The fixup code below goes into the .fixup
	 * section, so in .text the normal path falls from here into the
	 * pops after .previous.
	 *
	 * 6001 (source fault): store -EFAULT through src_err_ptr, zero all
	 * len bytes of dst from its original address, return with %eax = 0.
	 * 6002 (destination fault): store -EFAULT through dst_err_ptr and
	 * return with %eax as accumulated so far.
	 */
 474.section .fixup, "ax"
 4756001:	movl	ARGBASE+20(%esp), %ebx	# src_err_ptr	
 476	movl $-EFAULT, (%ebx)
 477	# zero the complete destination (computing the rest is too much work)
 478	movl ARGBASE+8(%esp),%edi	# dst
 479	movl ARGBASE+12(%esp),%ecx	# len
 480	xorl %eax,%eax
 481	rep; stosb
 482	jmp 7b
 4836002:	movl ARGBASE+24(%esp), %ebx	# dst_err_ptr
 484	movl $-EFAULT, (%ebx)
 485	jmp  7b			
 486.previous				
 487
	/* Epilogue: restore the callee-saved registers in reverse push order. */
 488	popl %esi
 489	popl %edi
 490	popl %ebx
 491	ret
492				
493#undef ROUND
494#undef ROUND1		
495		
496#endif
Configure Feed

Configure Feed