src/libm/Source/Intel/roundf.S at fixPythonPipStalling

overby.me / darling-nix

fork atom

this repo has no description

fork atom

darling-nix / src / libm / Source / Intel / roundf.S

at fixPythonPipStalling 95 lines 2.2 kB view raw

wrap content

Lubos Dolezel More progress 9y ago

a76db625

/* Single-precision roundf, reimplemented using integer operations
 * for improved performance, especially on in-order machines.
 *
 * Steve Canon, March 2009.
 */
 6
 7#if defined __i386__
 8
 /*
  * float roundf(float x) -- i386 implementation (AT&T syntax).
  *
  * Rounds x to the nearest integral value, halfway cases away from zero,
  * using integer arithmetic on the bit pattern of x.
  *
  *   In:       4(%esp) = x
  *   Out:      st(0)   = rounded value
  *   Clobbers: eax, ecx, edx, xmm0, flags; the argument slot at 4(%esp)
  *             is rewritten in place with the result.
  */
#define ROUNDF_X            4(%esp)      /* argument slot holding x           */
#define ROUNDF_FRAC_BITS    23           /* width of the fraction field       */
#define ROUNDF_BIAS_LESS_1  0x7e         /* exponent bias (127) minus one     */
#define ROUNDF_EXP_FIELD    0x7f800000   /* biased-exponent bits              */
#define ROUNDF_SIGN_BIT     0x80000000   /* sign bit only                     */
#define ROUNDF_EXP_LSB      0x00800000   /* lowest bit of the exponent field  */
#define ROUNDF_SIGN_EXP     0xff800000   /* sign + exponent, fraction cleared */
#define ROUNDF_ALL_BUT_LSB  0xfffffffe   /* seed for the truncation mask      */

.text
.align 4
.globl _roundf
_roundf:
    movss       ROUNDF_X,               %xmm0   // untouched copy of x, used only to raise inexact
    mov         ROUNDF_X,               %eax    // bit pattern of x
    mov         $1,                     %edx
    mov         $ROUNDF_FRAC_BITS,      %ecx
    and         $ROUNDF_EXP_FIELD,      %eax    // keep only the exponent field
    sar         %cl,                    %eax    // eax = biased exponent
    sub         $ROUNDF_BIAS_LESS_1,    %eax    // eax = exponent(x) + 1
    jbe         Lroundf_below_one               // biased exponent <= 126: |x| < 1.0f

    sub         %eax,                   %ecx    // ecx = bit index of the 0.5 place in x
    js          Lroundf_return                  // no 0.5 place: |x| >= 0x1.0p23, inf or NaN -> return x

    mov         $ROUNDF_ALL_BUT_LSB,    %eax
    shl         %cl,                    %edx    // edx = 0.5 expressed in the encoding of x
    shl         %cl,                    %eax    // eax = mask clearing the 0.5 place and everything below
    add         %edx,                   ROUNDF_X    // x + 0.5 (bits below the 0.5 place are now garbage)
    and         %eax,                   ROUNDF_X    // truncate the garbage away
    cvttps2dq   %xmm0,                  %xmm0   // raise inexact if the original x had a fraction
Lroundf_return:
    flds        ROUNDF_X
    ret

.align 4
Lroundf_below_one:
    // Flags are still those of the "sub" above: ZF set means the biased
    // exponent is exactly 126, i.e. 0.5f <= |x| < 1.0f.
    je          Lroundf_half_or_more
    cvttps2dq   %xmm0,                  %xmm0   // raise inexact unless x is a zero
    andl        $ROUNDF_SIGN_BIT,       ROUNDF_X    // copysign(0.0, x)
    flds        ROUNDF_X
    ret

.align 4
Lroundf_half_or_more:
    cvttps2dq   %xmm0,                  %xmm0   // raise inexact (x is never integral here)
    addl        $ROUNDF_EXP_LSB,        ROUNDF_X    // bump biased exponent 126 -> 127
    andl        $ROUNDF_SIGN_EXP,       ROUNDF_X    // drop the fraction: copysign(1.0, x)
    flds        ROUNDF_X
    ret
46	
47#elif defined __x86_64__
48
/*
 * float roundf(float x) -- x86_64 implementation (AT&T syntax).
 *
 * Rounds x to the nearest integral value, halfway cases away from zero,
 * using integer arithmetic on the bit pattern of x.
 *
 *   In:       low 32 bits of xmm0 = x (the upper lanes are not relied upon)
 *   Out:      low 32 bits of xmm0 = rounded value
 *   Clobbers: eax, ecx, edx, xmm2, xmm3, flags
 */
#define ROUNDF_FRAC_BITS    23           /* width of the fraction field       */
#define ROUNDF_BIAS_LESS_1  0x7e         /* exponent bias (127) minus one     */
#define ROUNDF_EXP_FIELD    0x7f800000   /* biased-exponent bits              */
#define ROUNDF_ALL_BUT_LSB  0xfffffffe   /* seed for the truncation mask      */

/*
 * Constant pool. andps/paddd/pand take 128-bit memory operands that must be
 * 16-byte aligned, so every constant is a full, explicitly zero-padded
 * 16-byte vector instead of a lone .long followed by alignment padding.
 */
.const
.p2align 4
mzero:      .long   0x80000000, 0, 0, 0     // sign bit only
expbit:     .long   0x00800000, 0, 0, 0     // lowest bit of the exponent field
expmask:    .long   0xff800000, 0, 0, 0     // sign + exponent, fraction cleared

.text
.p2align 4
.globl _roundf
_roundf:
    movd        %xmm0,                  %eax    // bit pattern of x
    mov         $ROUNDF_FRAC_BITS,      %ecx
    mov         $1,                     %edx
    and         $ROUNDF_EXP_FIELD,      %eax    // keep only the exponent field
    sar         %cl,                    %eax    // eax = biased exponent
    sub         $ROUNDF_BIAS_LESS_1,    %eax    // eax = exponent(x) + 1
    jbe         Lroundf_below_one               // biased exponent <= 126: |x| < 1.0f

    sub         %eax,                   %ecx    // ecx = bit index of the 0.5 place in x
    js          Lroundf_return                  // no 0.5 place: |x| >= 0x1.0p23, inf or NaN -> return x

    shl         %cl,                    %edx    // edx = 0.5 expressed in the encoding of x
    mov         $ROUNDF_ALL_BUT_LSB,    %eax
    shl         %cl,                    %eax    // eax = mask clearing the 0.5 place and everything below
    movd        %edx,                   %xmm2
    movd        %eax,                   %xmm3
    // Scalar conversion on purpose: it inspects only the low lane, so stale
    // data in the upper lanes of xmm0 cannot raise spurious FP flags.
    // eax is dead here, so it serves as the throwaway destination.
    cvttss2si   %xmm0,                  %eax    // raise inexact if x has a fraction
    paddd       %xmm2,                  %xmm0   // x + 0.5 (bits below the 0.5 place are now garbage)
    pand        %xmm3,                  %xmm0   // truncate the garbage away
Lroundf_return:
    ret

.p2align 4
Lroundf_below_one:
    // Flags are still those of the "sub" above: ZF set means the biased
    // exponent is exactly 126, i.e. 0.5f <= |x| < 1.0f.
    je          Lroundf_half_or_more
    cvttss2si   %xmm0,                  %eax    // raise inexact unless x is a zero
    andps       mzero(%rip),            %xmm0   // copysign(0.0, x)
    ret

.p2align 4
Lroundf_half_or_more:
    cvttss2si   %xmm0,                  %eax    // raise inexact (x is never integral here)
    paddd       expbit(%rip),           %xmm0   // bump biased exponent 126 -> 127
    pand        expmask(%rip),          %xmm0   // drop the fraction: copysign(1.0, x)
    ret
92
93#else
94	#error unknown arch
95#endif