src/libm/Source/Intel/truncf.S at fixPythonPipStalling

overby.me / darling-nix

fork atom

this repo has no description

fork atom

darling-nix / src / libm / Source / Intel / truncf.S

at fixPythonPipStalling 81 lines 1.8 kB view raw

wrap content

Lubos Dolezel More progress 9y ago

a76db625

 1/* Single-precision truncf
 2 *
 3 * Reimplemented for improved performance on in-order machines and
 4 * machines that support SSE 4.1
 5 *
 6 * Steve Canon, March 2009.
 7 */
 8 
 9#include <System/i386/cpu_capabilities.h>
10.set cpubits, _COMM_PAGE_CPU_CAPABILITIES
11
12#if defined __i386__
13
/*
 * float truncf(float x)                                   -- i386 variant
 *
 * In:    x in the 4-byte stack slot at 4(%esp)
 * Out:   result on the x87 stack, st(0)
 * Uses:  eax, ecx, edx, xmm0, flags; the argument slot at 4(%esp) is
 *        overwritten and then reloaded with flds to produce the result.
 *
 * When the capability word has kHasSSE4_1 set, roundss does all the work.
 * Otherwise the fraction bits are cleared with integer masks, and a
 * float->int conversion is issued purely for its inexact-flag side effect.
 */
.text
.align 4
.globl _truncf
_truncf:
	movss		4(%esp),		%xmm0	// xmm0 = x, upper lanes zeroed by the load
	testl		$(kHasSSE4_1),	cpubits
	jz			Ltruncf_integer_path	// no SSE 4.1 -> mask-based path

	// SSE 4.1: imm 0x3 selects round-toward-zero.
	roundss		$0x3,	%xmm0,	%xmm0
	movss		%xmm0,			4(%esp)	// bounce through memory ...
	flds		4(%esp)					// ... to return in st(0)
	ret

.align 4
Ltruncf_integer_path:
	movl		$23,			%ecx	// shift count: width of the fraction field
	movl		4(%esp),		%eax	// raw bits of x
	andl		$0x7f800000,	%eax	// isolate the biased exponent field
	subl		$0x3f800000,	%eax	// remove the bias; negative iff |x| < 1.0f
	js			Ltruncf_below_one
	sarl		%cl,			%eax	// eax = unbiased exponent
	movl		$-1,			%edx	// all-ones, becomes the keep-mask below
	subl		%eax,			%ecx	// ecx = 23 - exponent = count of fractional bits
	js			Ltruncf_return			// exponent >= 24: x is returned unchanged
	shll		%cl,			%edx	// zero the low (23 - exponent) mask bits
	cvttps2dq	%xmm0,			%xmm0	// result discarded; sets inexact if x has a fraction
	andl		%edx,			4(%esp)	// clear the fractional bits in place
Ltruncf_return:
	flds		4(%esp)
	ret

.align 4
Ltruncf_below_one:
	// |x| < 1.0f: the result is zero carrying the sign of x.
	cvttps2dq	%xmm0,			%xmm0	// result discarded; sets inexact if x != 0
	andl		$0x80000000,	4(%esp)	// keep only the sign bit
	flds		4(%esp)
	ret
49
50#elif defined __x86_64__
51
/*
 * float truncf(float x)                                   -- x86_64 variant
 *
 * In:    x in the low 32 bits of xmm0 (upper lanes are not inspected)
 * Out:   result in the low 32 bits of xmm0
 * Uses:  eax, ecx, edx, xmm2, flags
 *
 * The fraction bits of x are cleared with a mask derived from the exponent.
 * A scalar float->int conversion is issued only for its side effect of
 * raising the inexact flag when x is not already an integer.
 */
.const
.align 4								// 16-byte boundary (power-of-two argument in the Darwin assembler); andps needs an aligned m128
mzero:	.long	0x80000000, 0, 0, 0		// sign-bit mask in lane 0, padded to 16 bytes so the andps load stays inside the object

.text
.align 4
.globl _truncf
_truncf:
	movd		%xmm0,			%eax	// raw bits of x
	and			$0x7f800000,	%eax	// biased exponent field of x
	mov			$23,			%ecx
	sub			$0x3f800000,	%eax	// remove the bias; if |x| < 1.0f, goto 2
	js			2f
	sar			%cl,			%eax	// exponent(x)
	mov			$0xffffffff,	%edx
	sub			%eax,			%ecx	// 23 - exponent(x) = number of fractional bits
	js			1f						// return x if |x| >= 0x1.0p24 (also inf and NaN)
	shl			%cl,			%edx	// mask with the fractional bits cleared
	movd		%edx,			%xmm2
	// Scalar conversion: only lane 0 is a defined input, so a packed
	// conversion could raise flags from whatever sits in the upper lanes.
	// |x| < 0x1.0p24 here, so the conversion itself cannot overflow.
	cvttss2si	%xmm0,			%eax	// raise inexact (eax is dead at this point)
	andps		%xmm2,			%xmm0	// mask off non-integral bits
1:	ret
.align 4
2:	// Handle |x| < 1.0 here.
	cvttss2si	%xmm0,			%eax	// raise inexact unless x is +/-0
	andps		mzero(%rip),	%xmm0	// copysign(0.0, x)
	ret
78	
79#else
80	#error unknown arch
81#endif