src/libm/Source/Intel/floorf.S at fixPythonPipStalling

overby.me / darling-nix

fork atom

this repo has no description

fork atom

darling-nix / src / libm / Source / Intel / floorf.S

at fixPythonPipStalling 89 lines 1.9 kB view raw

wrap content

Lubos Dolezel More progress 9y ago

a76db625

 1/* Single-precision floorf, reimplemented using integer operations
 2 * for improved performance, especially on in-order machines.
 3 *
 4 * Steve Canon, March 2009.
 5 */
 6
 7#if defined __i386__
 8 
 9#include <System/i386/cpu_capabilities.h>
10.set cpubits, _COMM_PAGE_CPU_CAPABILITIES
11
12.text
13.align 4
14.globl _floorf
// float floorf(float x), i386 version.
//   In:   x as a single-precision value in the stack slot at 4(%esp).
//   Out:  floor(x), pushed onto the x87 stack with flds before ret.
//   Uses: eax, ecx, edx, xmm0, flags. The argument slot at 4(%esp) is
//         overwritten and reused as scratch for the result.
// Dispatches on the kHasSSE4_1 bit of the word at cpubits: roundss when it
// is set, otherwise an integer path that edits the raw IEEE-754 encoding.
// On the integer path cvttps2dq is executed only for its side effect of
// raising the inexact flag when x is not an integer; its result is unused.
15_floorf:
16	movss	  4(%esp),			%xmm0	// load argument
17	testl	  $(kHasSSE4_1),	cpubits	// is the SSE 4.1 capability bit set?
18	jz			0f					// no: take the integer path at 0:
19	
20	// fast path using SSE 4.1
21	roundss		$0x1,	%xmm0,	%xmm0	// immediate 0x1 selects round toward -infinity
22	movss		%xmm0,		  4(%esp)	// spill the result to the argument slot ...
23	flds	  4(%esp)					// ... and push it onto the x87 stack as the return value
24	ret
25
26.align 4
270:	// no SSE 4.1
28	mov		  4(%esp),			%eax	// eax = raw bits of x
29	mov			$23,			%cl	// cl = 23, the number of stored significand bits
30	mov			%eax,			%edx	// edx = raw bits of x
31	shr			%cl,			%eax	// eax = sign:exponent, so al = biased exponent
32	dec			%edx					// edx = bits(x) - 1
33	sub			$0x7f,			%al	// al = unbiased exponent e
34	jb			2f					// borrow: e < 0, i.e. |x| < 1.0, handled at 2:
35	
36	sub			%al,			%cl	// cl = 23 - e = number of fractional bits in x
37	mov			$0xffffffff,	%eax	// (mov leaves the flags of the sub above untouched)
38	jbe			1f					// 23 - e <= 0: no fractional bits (covers Inf/NaN too), return x
39
40	shl			%cl,			%eax	// m = mask for integral bits of x
41	mov			%edx,			%ecx	// ecx = bits(x) - 1
42	not			%edx					// edx = ~(bits(x) - 1) = -bits(x)
43	sar			$31,			%edx	// (x < 0) ? 0 : -1
44	or			%eax,			%edx	// (x < 0) ? m : -1
45	sub			%edx,			%ecx	// (x < 0) ? x - (1.0 - ulp(x)) : x   (bits(x) + ~m for negative x)
46	and			%ecx,			%eax	// floor(x): drop the fractional bits
47	mov			%eax,		  4(%esp)	// store the result in the argument slot
48	cvttps2dq	%xmm0,			%xmm0	// set inexact (xmm0 still holds the original x)
491:	flds	  4(%esp)					// return the value now in the argument slot
50	ret
51.align 4
522:	cvttps2dq	%xmm0,			%xmm0	// set inexact
53	cmp			$0xffffffff,	%edx	// bits(x) - 1 < -1 (signed) only when x < 0.0 and x != -0.0
54	jl			3f					// negative and nonzero: result is -1.0
55	andl		$0x80000000,  4(%esp)	// copysign(0.0, x)
56	flds	  4(%esp)
57	ret
58.align 4
593:	movl		$0xbf800000,  4(%esp)	// return -1.0
60	flds	  4(%esp)
61	ret
62
63#elif defined __x86_64__
64
65.const
66.align 4
// NOTE(review): andps below reads 16 bytes starting at mone and, as a legacy
// SSE memory operand, needs that address 16-byte aligned. This relies on
// `.align 4` meaning 2^4 bytes for the assembler in use -- confirm.
67mone:	.long	0xbf800000	// -1.0f
68absmask:.long	0x7fffffff	// sign-bit-clear mask; not referenced by the code visible here
69
70.text
71.align 4
72.globl _floorf
// float floorf(float x), x86_64 version.
//   In:   x in the low lane of xmm0.
//   Out:  floor(x) in the low lane of xmm0.
//   Uses: ecx, xmm2, flags.
// Values that need no work (+-0, |x| >= 2^23, Inf, NaN) are returned as is.
// Everything else is truncated toward zero and then corrected by -1.0 when
// the truncation landed above x (negative non-integers).
73_floorf:
74    movd		%xmm0,			%ecx	// ecx = raw bits of x
75    andl		$0x7fffffff,	%ecx	// |x|
76    subl		$1,				%ecx	// |x| - 1; wraps +-0 around to 0xffffffff
77    cmpl		$0x4afffffe,	%ecx	// above <=> |x| >= 0x4b000000 (2^23): integer, NaN or Inf	
78    ja			1f						// unsigned compare also catches the wrapped +-0: return x unchanged
79
80    cvttps2dq   %xmm0,			%xmm2	// xmm2 = x converted to int32, truncating toward zero
81    cvtdq2ps    %xmm2,			%xmm2	// xmm2 = trunc(x) as a float
82    cmpltss		%xmm2,			%xmm0	// xmm0 = (x < trunc(x)) ? all ones : 0
83    andps		mone(%rip),		%xmm0	// xmm0 = (x < trunc(x)) ? -1.0 : 0.0
84    addss		%xmm2,			%xmm0	// xmm0 = trunc(x) + correction = floor(x)
851:	ret
86
87#else
88	#error unknown arch
89#endif