src/libm/Source/Intel/lroundf.S at fixPythonPipStalling

overby.me / darling-nix
fork atom
this repo has no description
fork atom
darling-nix / src / libm / Source / Intel / lroundf.S
at fixPythonPipStalling 129 lines 3.4 kB view raw
wrap content
Lubos Dolezel Restructured source tree to prepare for merge with the "darling" repo 10y ago
f228ae16
  1
  2/*
  3 *	lroundf.s
  4 *
  5 *		by Ian Ollmann
  6 *
  7 *	Copyright (c) 2007, Apple Inc.  All Rights Reserved.
  8 *
  9 *	C99 lroundf and llroundf for __i386__ and __x86_64__
 10 *
 11 */
 12 
 13#include <machine/asm.h>
 14#include "abi.h"
 15
 16
 17#if defined( __i386__ )
 18
 19
 20	ENTRY( llroundf )
   		// llroundf, i386 version: converts the float argument x to a 64-bit integer,
   		// rounding halfway cases away from zero (adds copysign( 0.5f, x ) and truncates).
   		// The argument is read from FRAME_SIZE(STACKP). FRAME_SIZE, STACKP, SUBP and ADDP
   		// are macros that are not defined in this file (presumably from abi.h; SUBP / ADDP
   		// look like pointer-width subtract / add -- TODO confirm).
   		// The result is returned in %edx:%eax (high 32 bits : low 32 bits).
   		// After setup: %eax = bits of |x|, %edx = sign bit of x, (STACKP) = llrint(x).
   		// Cases:
   		//   |x| <  0x1.0p23f and x is already an integer  -> return llrint(x)
   		//   |x| == 0.5f - 1 ulp                           -> return 0
   		//   other |x| < 0x1.0p23f                         -> (int32_t)( x + copysign( 0.5f, x ) ), sign extended
   		//   0x1.0p23f <= |x| < 0x1.0p63f, or NaN          -> return the stored fistpll result
   		//   |x| >= 0x1.0p63f (including infinity)         -> stored result, bit-flipped when x is positive
 21		movl	FRAME_SIZE(STACKP),		%eax	// raw bits of x
 22		flds	FRAME_SIZE(STACKP)		//  { x }
 23		fld		%st(0)					//	{ x, x }
 24		SUBP	$16-FRAME_SIZE,			STACKP	// reserve scratch space; the fistpll temporary lives at (STACKP) and x is now at 16(STACKP)
 25
 26		fistpll	(STACKP)				// { x }, llrint(x), set invalid / inexact if necessary
 27		movl	%eax,					%edx	// x
 28		andl	$0x7fffffff,			%eax	// |x|
 29		xorl	%eax,					%edx	// signof( x )
 30		cmpl	$0x4b000000,			%eax	// |x| >= 0x1.0p23f or NaN
 31		jae		1f								//		unsigned compare on the bit pattern, so NaN goes to 1: as well
 32		
 33		fildll	(STACKP)						//	{ llrint(x), x }
 34		fucomip	%st(1),					%st(0)	//	{ x } x == llrint(x)
 35		fstp	%st(0)							//	{ }
 36		je		2f								//		return llrint(x); tests the fucomip result, the fstp in between only pops the x87 stack
 37
 38		cmpl	$0x3effffff,			%eax	// |x| == 0.5f - 1 ulp
 39		je		4f								 //		return 0; handled separately because x + 0.5f can round up to 1.0f for this value
 40		
 41		// at this point we know that |x| < 0x1.0p23f, so we do not need a full 64-bit conversion, which is good because we really need trunc now.
 42		orl		$0x3f000000,			%edx	// copysign( 0.5f, x )
 43		movss	16(STACKP),				%xmm0	// x
 44		movd	%edx,					%xmm1	// copysign( 0.5f, x )
 45		addss	%xmm1,					%xmm0	// x + copysign( 0.5f, x )
 46		cvttss2si %xmm0,				%edx	// result = (int32_t) ( x + copysign( 0.5f, x ))
 47		movl	%edx,					%eax	// result
 48		sarl	$31,					%edx	// sign extended result
 49		ADDP	$16-FRAME_SIZE,			STACKP	// undo the SUBP above before returning
 50		ret
 51
 521:		// |x| >= 0x1.0p23f or NaN
 53		fstp	%st(0)							//	{ }
 54		cmpl	$0x5f000000,			%eax	// |x| >= 0x1.0p63f
 55		jae		3f								//		too large for 64 bits, or NaN
 56
 57		// |x| is non-overflowing integer  (NaN ends up here eventually too)
 582:		movl	(STACKP),				%eax	// low 32 bits
 59		movl	4(STACKP),				%edx	// high 32 bits
 60		ADDP	$16-FRAME_SIZE,			STACKP	// undo the SUBP above before returning
 61		ret
 62
 633:		// |x| overflows or is NaN
 64		cmpl	$0x7f800000,			%eax	// |x| above the infinity bit pattern, i.e. x is NaN
 65		ja		2b								//		NaN: return the value fistpll stored, unmodified
 66
 67		// |x| overflows
 68		subl	$1,						%edx	// x < 0 ? 0x7fffffff : -1U
 69		sarl	$31,					%edx	// x < 0 ? 0 : -1U
 70		movl	(STACKP),				%eax	// low 32 bits of the stored fistpll result
 71		xorl	%edx,					%eax	// x < 0 ? low result : low result ^ -1U
 72		xorl	4(STACKP),				%edx	// x < 0 ? high result : high result ^ -1U (presumably turns the most-negative stored value into the most-positive one for x > 0 -- TODO confirm)
 73		ADDP	$16-FRAME_SIZE,			STACKP	// undo the SUBP above before returning
 74		ret
 75
 764:		// |x| == 0.5f - 1 ulp, return 0
 77		xorl	%eax,					%eax	// low 32 bits = 0
 78		xorl	%edx,					%edx	// high 32 bits = 0
 79		ADDP	$16-FRAME_SIZE,			STACKP	// undo the SUBP above before returning
 80		ret
 81		
 82#define LONG_MIN_f		0x4f000000
   		// LONG_MIN_f (i386 value above): float bit pattern of 0x1.0p31f. It is compared against
   		// the bits of |x| below, so it is the first magnitude that does not fit in a 32-bit long.
 83
   		// lroundf (i386 and x86_64) and llroundf (x86_64 only): convert the float x to an
   		// integer of the width of AX_P, rounding halfway cases away from zero.
   		// Each architecture has its own entry sequence that leaves x in %xmm0 and the raw
   		// bits of x in %edx; both then fall into the shared body after the #endif.
   		// AX_P, CX_P, DX_P, MOVP, XORP, SUBP, FRAME_SIZE and STACKP are not defined in this
   		// file (presumably pointer-width register names / operations from abi.h -- TODO confirm).
   		// The result is left in AX_P.
 84	ENTRY( lroundf )
 85		movl	FRAME_SIZE(STACKP),		%edx	// raw bits of x
 86		movss	FRAME_SIZE(STACKP),		%xmm0	// x
 87#elif defined( __x86_64__ )
 88
 89#define LONG_MIN_f		0x5f000000
   		// LONG_MIN_f (x86_64 value above): float bit pattern of 0x1.0p63f, the first magnitude
   		// that does not fit in a 64-bit long.
 90
 91
 92	ENTRY( lroundf )
 93	ENTRY( llroundf )
 94		xorq	%rdx,					%rdx		// zero all 64 bits of %rdx
 95		movd	%xmm0,					%edx		// raw bits of x (the sign is cleared further down)
 96#endif
 97
 98		cvttss2si	%xmm0,				AX_P		// (long) x, set invalid / inexact if necessary
 99		MOVP	DX_P,					CX_P		// x
100		and		$0x7fffffff,			DX_P		// |x|
101		XORP	DX_P,					CX_P		// signof( x )
102		cmpl	$0x4b000000,			%edx		// |x| >= 0x1.0p23f or NaN
103		jae		2f									//		unsigned compare on the bit pattern, so NaN goes to 2: as well
104
105		cvtsi2ss	AX_P,				%xmm1		// trunc(x)
106		ucomiss		%xmm0,				%xmm1		// x == trunc(x)
107		je		1f									//		return (long) x
108
109		orl		$0x3f000000,			%ecx		// copysign( 0.5f, x )
110		movd	%ecx,					%xmm1		// copysign( 0.5f, x )
111		cmpl	$0x3effffff,			%edx		// |x| == 0.5f - 1 ulp
112		je		1f									//		return (long) x, which is 0 here; skips the add because x + 0.5f can round up to 1.0f for this value
113
114		addss	%xmm1,					%xmm0		// x += copysign( 0.5, x )
115		cvttss2si %xmm0,				AX_P		// (int) (x + copysign( 0.5, x ) )
116	
117	1:	ret
118			
119	2:	// |x| >= 0x1.0p23f or NaN
120		cmpl	$0x7f800000,			%edx		// |x| is NaN
121		ja		3f									//		NaN: return the cvttss2si result unmodified
122		cmpl	$LONG_MIN_f,			%edx		// |x| < 2^31 (i386) or 2^63 (x86_64): the conversion did not overflow
123		jb		1b									//		x is already an integer at this magnitude, return (long) x
124		SUBP	$1,						CX_P		// x < 0 ? 0x7fffffff : -1LL
125		sar		$31,					CX_P		// x < 0 ? 0 : -1LL
126		XORP	CX_P,					AX_P		// flip LONG_LONG_MIN to LONG_LONG_MAX if needed
127	3:	ret
128	
129