src/libm/Source/Intel/round.S at fixPythonPipStalling

overby.me / darling-nix
fork atom
this repo has no description
fork atom
darling-nix / src / libm / Source / Intel / round.S
at fixPythonPipStalling 100 lines 2.2 kB view raw
wrap content
Lubos Dolezel More progress 9y ago
a76db625
  1/* double round( double )
  2 *
  3 * Reimplemented by Steve Canon, based on Ian Ollmann's implementations
  4 * tuned for increased performance on in-order machines (but faster on
  5 * out-of-order machines as well).
  6 *
  7 * Copyright 2009, Apple Inc.
  8 */
  9
 10#ifdef __i386__
 11
 /*
  * double round(double x) -- i386 (x87) implementation.
  *
  * Returns x rounded to the nearest integral value, with halfway cases
  * rounded away from zero.
  *
  * In:    x on the stack, low dword at 4(%esp), high dword at 8(%esp)
  * Out:   st(0) = round(x)
  * Clobb: %ecx, EFLAGS, the x87 stack, %dx when built without SSE3, and
  *        the argument slot itself (TRUNCATE reuses it as scratch memory,
  *        so x must be loaded before TRUNCATE is expanded).
  */

#define ARG_LO		4(%esp)		/* low dword of x, later scratch */
#define ARG_HI		8(%esp)		/* high dword of x: sign, exponent, top of mantissa */
#define HI_2P52		0x43300000	/* high dword of 0x1.0p52 */
#define HI_ABS_MASK	0x7fffffff	/* clears the sign bit of the high dword */
#define X87_RC_CHOP	0x0c00		/* x87 control word, RC field = 11b (round toward zero) */

/*
 * TRUNCATE: replace st(0) with st(0) rounded toward zero.
 * It is only expanded below on values in [0, 2^52).
 */
#if defined(__SSE3__)
/* Store st(0) as a truncated 64-bit integer at ARG_LO, then load it back. */
#define TRUNCATE					\
	fisttpll	ARG_LO;				\
	fildll		ARG_LO
#else
/* Keep the caller control word in %dx, set RC to chop, frndint, restore. */
#define TRUNCATE					\
	fnstcw		ARG_LO;				\
	movw		ARG_LO, %dx;			\
	orw		$X87_RC_CHOP, ARG_LO;		\
	fldcw		ARG_LO;				\
	frndint;					\
	movw		%dx, ARG_LO;			\
	fldcw		ARG_LO
#endif

.text
.p2align 4
.globl _round
_round:
	movl	ARG_HI, %ecx
	cmpl	$HI_2P52, %ecx
	fldl	ARG_LO			// { x }  (x87 load, EFLAGS still hold the compare)
	jae	2f			// unsigned: sign bit set, or |x| >= 2^52, inf, NaN

	// Here 0 <= x < 2^52.
	fld	%st(0)			// { x, x }
	TRUNCATE			// { trunc(x), x }
	fxch				// { x, trunc(x) }
	fsub	%st(1), %st(0)		// { frac(x), trunc(x) }
	fadd	%st(0), %st(0)		// { 2*frac(x), trunc(x) }
	fld1				// { 1.0, 2*frac(x), trunc(x) }
	fucomi	%st(1), %st(0)		// EFLAGS <- compare 1.0 with 2*frac(x)
	fstp	%st(1)			// { 1.0, trunc(x) }
	ja	1f			// 1.0 > 2*frac(x), i.e. frac(x) < 0.5: keep trunc(x)
	fadd	%st(0), %st(1)		// { 1.0, trunc(x) + 1.0 }
1:	fstp	%st(0)			// { round(x) }
	ret

2:	andl	$HI_ABS_MASK, %ecx	// high dword of |x|
	cmpl	$HI_2P52, %ecx
	jae	4f			// |x| >= 2^52, inf or NaN: return x as loaded

	// Here x has its sign bit set and |x| < 2^52: round |x|, then negate.
	fabs				// { |x| }
	fld	%st(0)			// { |x|, |x| }
	TRUNCATE			// { trunc|x|, |x| }
	fxch				// { |x|, trunc|x| }
	fsub	%st(1), %st(0)		// { frac|x|, trunc|x| }
	fadd	%st(0), %st(0)		// { 2*frac|x|, trunc|x| }
	fld1				// { 1.0, 2*frac|x|, trunc|x| }
	fucomi	%st(1), %st(0)		// EFLAGS <- compare 1.0 with 2*frac|x|
	fstp	%st(1)			// { 1.0, trunc|x| }
	ja	3f			// frac|x| < 0.5: keep trunc|x|
	fadd	%st(0), %st(1)		// { 1.0, trunc|x| + 1.0 }
3:	fstp	%st(0)			// { round|x| }
	fchs				// { -round|x| } = round(x), also yields -0.0 for -0.5 < x <= -0.0
4:	ret
 67
 68#else //x86_64
 69
 /*
  * double round(double x) -- x86_64 (SSE2) implementation.
  *
  * Returns x rounded to the nearest integral value, with halfway cases
  * rounded away from zero.
  *
  * In:    xmm0 = x
  * Out:   xmm0 = round(x)
  * Clobb: rax, rcx, xmm1, xmm2, xmm3, EFLAGS
  *
  * Layout note: `one` must stay 16-byte aligned and be followed by at
  * least eight more bytes, because andnpd below reads a full 128-bit
  * memory operand starting at `one`.  .p2align 4 requests 2^4 = 16-byte
  * alignment.  Only the low lane of that load affects the result.
  */
.const
.p2align 4
one:		.quad	0x3ff0000000000000	// 1.0
absmask:	.quad	0x7fffffffffffffff	// every bit except the sign bit
half:		.quad	0x3fe0000000000000	// 0.5
thresh:		.quad	0x4330000000000000	// 0x1.0p52

.text
.p2align 4
.globl _round
_round:
	movd		%xmm0, %rcx		// rcx = bit pattern of x
	andq		absmask(%rip), %rcx	// rcx = bit pattern of |x|
	movsd		absmask(%rip), %xmm2	// xmm2 = { absmask, 0 }
	cmpq		thresh(%rip), %rcx
	jae		1f			// |x| >= 2^52, inf or NaN: return x untouched

	// Here |x| < 2^52, so x fits a signed 64-bit integer.  The instruction
	// order below is kept exactly as originally scheduled.
	cvttsd2si	%xmm0, %rax		// rax = (int64) x, truncated toward zero
	andnpd		%xmm0, %xmm2		// xmm2 = x & ~absmask = signbit(x); high lane = high lane of xmm0
	movsd		half(%rip), %xmm3	// xmm3 = { 0.5, 0 }
	cvtsi2sd	%rax, %xmm1		// xmm1 = trunc(x), sign of zero lost
	subsd		%xmm1, %xmm0		// xmm0 = frac(x), carries the sign of x
	orpd		%xmm2, %xmm1		// xmm1 = trunc(x) with the sign of x restored
	xorpd		%xmm2, %xmm0		// xmm0 = |frac(x)|; high lane becomes 0
	cmpltpd		%xmm3, %xmm0		// all ones if |frac(x)| < 0.5, else zero
	andnpd		one(%rip), %xmm0	// |frac(x)| < 0.5 ? 0.0 : 1.0
	orpd		%xmm2, %xmm0		// give the increment the sign of x
	addsd		%xmm1, %xmm0		// round(x) = trunc(x) + signed increment
1:	ret
 99
100#endif