src/libm/Source/Intel/ceilf.S at fixPythonPipStalling

overby.me / darling-nix

fork atom

this repo has no description

fork atom

darling-nix / src / libm / Source / Intel / ceilf.S

at fixPythonPipStalling 93 lines 2.1 kB view raw

wrap content

Lubos Dolezel More progress 9y ago

a76db625

 1/* Single-precision ceilf, reimplemented using integer operations
 2 * for improved performance, especially on in-order machines.
 3 *
 4 * Steve Canon, March 2009.
 5 */
 6 
 7#if defined __i386__
 8
 9#include <System/i386/cpu_capabilities.h>
10.set cpubits, _COMM_PAGE_CPU_CAPABILITIES
11
12.text
13.align 4
14.globl _ceilf
15_ceilf:
16	movss	  4(%esp),			%xmm0	// load argument
17	testl	  $(kHasSSE4_1),	cpubits
18	jz			0f
19	
20	// fast path using SSE 4.1
21	roundss		$0x2,	%xmm0,	%xmm0
22	movss		%xmm0,		  4(%esp)
23	flds	  4(%esp)
24	ret
25
26.align 4
270:	// no SSE 4.1	
28	mov		  4(%esp),			%eax
29	mov			$23,			%cl
30	mov			%eax,			%edx
31	shr			%cl,			%eax	// x >> 23
32	sub			$0x7f,			%al		// unbiased exponent of x
33	jb			2f						// if |x| < 1.0, goto 2
34	
35	sub			%al,			%cl		// 23 - exponent of x
36	mov			$0xffffffff,	%eax
37	jbe			1f						// if |x| >= 0x1.0p23, goto 1
38	
39	dec			%edx					// (x - 1)
40	shl			%cl,			%eax	// m = mask for integral bits of x
41	mov			%edx,			%ecx
42	sar			$31,			%edx	// (x < 0) ? -1 : 0
43	or			%eax,			%edx	// (x < 0) ? -1 : m
44	sub			%edx,			%ecx	// (x < 0) ? x : (x + (1.0 - ulp(x)))
45	and			%ecx,			%eax	// ceil(x)
46	mov			%eax,		  4(%esp)
47	cvttps2dq	%xmm0,			%xmm0	// set inexact
481:	flds	  4(%esp)
49	ret
50.align 4
512:	cvttps2dq	%xmm0,			%xmm0	// set inexact
52	cmp			$1,				%edx	// if x > 0, goto 3
53	jge			3f
54	andl		$0x80000000,  4(%esp)	// copysign(0.0, x)
55	flds	  4(%esp)
56	ret
57.align 4
583:	movl		$0x3f800000,  4(%esp)	// return 1.0
59	flds	  4(%esp)
60	ret
61
62#elif defined __x86_64__
63
64.const
65.align 4
66one:	.long	0x3f800000
67absmask:.long	0x7fffffff
68
69.text
70.align 4
71.globl _ceilf
72_ceilf:
73	movd		%xmm0,			%eax
74	andl		absmask(%rip),	%eax
75	movd		absmask(%rip),	%xmm1
76	cmpl		$0x4b000000,	%eax
77	andnps		%xmm0,			%xmm1
78	jae			1f
79
80	cvttps2dq	%xmm0,			%xmm2
81	movdqa		%xmm0,			%xmm3
82	psrad		$31,			%xmm0	// (x < 0) ? -1 : 0
83	cvtdq2ps	%xmm2,			%xmm2
84	pcmpgtd		%xmm2,			%xmm3	// (x >i trunc(x)) ? -1 : 0
85	andnps		%xmm3,			%xmm0	// (x > trunc(x)) ? -1 : 0
86	andps		one(%rip),		%xmm0	// (x > trunc(x)) ? 1.0 : 0.0
87	addss		%xmm2,			%xmm0	// (x > trunc(x)) ? trunc(x) + 1.0 : trunc(x)
88	orps		%xmm1,			%xmm0	// ceil(x)
891:	ret
90	
91#else
92	#error unknown arch
93#endif