src/libm/Source/Intel/e_exp.S at fixPythonPipStalling

overby.me / darling-nix
fork atom
this repo has no description
fork atom
darling-nix / src / libm / Source / Intel / e_exp.S
at fixPythonPipStalling 335 lines 11 kB view raw
wrap content
Lubos Dolezel Restructured source tree to prepare for merge with the "darling" repo 10y ago
f228ae16
  1/*
  2 * Original public domain code for expl Written by J.T. Conklin <jtc@netbsd.org>.
  3 *     fixed up to deliver correct edge cases by Ian Ollmann (expl)
  4 *
  5 *		exp2l by Ian Ollmann based on algorithm by J.T. Conklin
  6 *
  7 *		expm1l is new code by Ian Ollmann
  8 */
  9
 10#include <machine/asm.h>
 11
 12#define LOCAL_STACK_SIZE	24
 13#include "abi.h"
 14
 15
 16/* e^x = 2^(x * log2(e)) */
 17/*
 18ENTRY(expl) 
 19	SUBP	$LOCAL_STACK_SIZE, STACKP
 20    movl    $0xbe205c61, 20(STACKP)     // ln2 hi  (double, low 32-bits)
 21    movl    $0x0ca86c39, 16(STACKP)     // ln2 lo  (double, low 32-bits)
 22    movl    $0x3f317218, 8(STACKP)      // ln2 hi   (float)
 23	movl	$0x7f800000, 4(STACKP)		// inf
 24	movl	$0x50000000, (STACKP)		// limit
 25
 26	//test for inf or NaN
 27	fldt	FIRST_ARG_OFFSET(STACKP)	// { f }
 28	fabs								// { |f| }
 29	flds	4(STACKP)					// { inf, |f| }
 30	fucomip	%ST(1), %ST(0)				// { |f| }		if |f| == inf or f is NaN 
 31	fstp	%ST(0)						// {}
 32	fldt	FIRST_ARG_OFFSET(STACKP)	// { f }
 33	flds	(STACKP)					// { limit, f }
 34	je		expl_special
 35
 36	//clip f to be between limit and -limit.
 37	//If we dont do this, when we scale by log2e then it is possible that we will generate spurious overflow
 38	fld		%ST(0)						// { limit, limit, f }
 39	fchs								// { -limit, limit, f }
 40	fucomi  %ST(2), %ST(0)				// { -limit, limit, f }	if( -limit < f )
 41	fcmovb  %ST(2), %ST(0)				// { max( -limit, f ), limit, f }
 42	fucomi  %ST(1), %ST(0)				// { max( -limit, f ), limit, f }	if( max( -limit, f ) > limit )
 43	fcmovnbe  %ST(1), %ST(0)			// { clipped, limit, f }
 44	fstp	%ST(2)						// { limit, clipped }
 45	fstp	%ST(0)                      // { clipped }
 46
 47    //extended precision exp:
 48    //Calculate the integer part of log2(e) * clipped
 49	fldl2e								// { log2e, clipped } 
 50    fmul    %st(1), %st(0)              // { log2e * clipped, clipped }             
 51    frndint                             // { n, clipped }
 52
 53    //figure out the fractional part. Use more than 64-bit precision so we get the exact fraction.
 54    flds    8(STACKP)                   // { ln2 hi, n, clipped }
 55    fmul    %st(1), %st(0)              // { ln2 hi * n, n, clipped }
 56    fsubrp  %st(0), %st(2)              // { n, clipped - n*ln2hi }
 57    fldl    16(STACKP)                  // { ln2 lo, n, clipped - n*ln2hi }
 58    fmul    %st(1), %st(0)              // { ln2lo*n, n, clipped - n*ln2hi }
 59    fsubrp  %st(0), %st(2)              // { n, fract }
 60    fldl2e                              // { log2e, n, fract }
 61    fmulp   %st(0), %st(2)              // { n, log2e * fract }
 62
 63    //Calculate e**fract
 64    fxch                                // { log2e * fract, n }
 65    f2xm1                               // { 2**fract( f*log2e ) -  1, n }
 66    fld1                                // { 1, 2**fract( f*log2e ) -  1, n }
 67    faddp                               // { 2**fract( f*log2e ), n }
 68
 69    //scale according to n
 70    fscale                              // {result, n }
 71    fstp    %st(1)
 72	ADDP	$LOCAL_STACK_SIZE,	STACKP
 73    ret
 74         
 75expl_special:  //handles +-Inf, QNaN, SNaN
 76	fucomip  %ST(1), %ST(0)				// { f }	if( limit > f )
 77	fldz								// { 0, f }
 78	fxch								// { f, 0 }
 79	fcmovnbe %ST(1), %ST(0)				// set the -Inf case to 0
 80	fadd	 %ST(0), %ST(0)				// silence NaNs
 81	fstp	 %ST(1)
 82	ADDP	$LOCAL_STACK_SIZE,	STACKP
 83	ret
 84*/
 85
 86
 87ENTRY(exp2l)    
 88	SUBP	$LOCAL_STACK_SIZE, STACKP
 89	movl	$0x7f800000, 4(STACKP)		// inf
 90	movl	$0x1e000000, (STACKP)		// small (0x1.0p-67f)
 91	
 92	//test for +inf, NaN
 93	fldt	FIRST_ARG_OFFSET(STACKP)	// { f }
 94	flds	4(STACKP)					// { inf, f }
 95	fucomip %ST(1), %ST(0)				// { f }
 96	je		exp2l_exit					//	return f if Inf or NaN
 97
 98	//test for -inf
 99	flds	4(STACKP)					// { inf, f }
100	fchs								// {-inf, f }
101	fucomip	%ST(1), %ST(0)				// { f }
102	jne		exp2l_main
103	fldz								// { 0, f }
104	fstp	%st(1)						// { f }
105	jmp		exp2l_exit
106	
107exp2l_main:
108	flds	(STACKP)					// { small, f }
109	fld		%st(1)						// { f, small, f }
110	fabs								// { |f|, small, f }
111	fucomip	%st(1), %st(0)				// { small, f }
112	fstp	%st(0)						// { f }
113	jb		exp2l_small
114	fld		%st(0)						// { f, f }
115	frndint								// { intf, f }
116	fxch	%st(1)						// { f, intf }
117	fsub	%st(1),%st					// { fract(f), intf } 
118	f2xm1								// { 2**fract(f) - 1, intf }
119	fld1								// { 1, 2**fract(f) - 1, intf }
120	faddp								// { 2**fract(f), intf }
121	fscale								// { result, intf }
122	fstp	%st(1)						// { result }
123	//fall through to exit
124
125exp2l_exit:
126	ADDP	$LOCAL_STACK_SIZE,	STACKP
127    ret
128
129exp2l_small:		//avoid setting the underflow flag for denormal inputs
130	fld1								//  {1, f }
131	faddp								//	{result}
132	ADDP	$LOCAL_STACK_SIZE,	STACKP
133    ret
134	
135
136//do expm1
137	// e**x - 1 = 2**(x*log2(e)) - 1
138	//
139	//		f = fract( x*log2e )		//fractional part of x*log2e
140	//		i =   int( x*log2e )		//integral part of x*log2e
141	//
142	// e**x - 1 = 2**( f + i ) - 1
143	//			= (2**f)(2**i) - 1
144	//			= (2**f)(2**i) - (2**i) + (2**i) - 1 
145	//			= (2**f - 1)(2**i)	+ (2**i) - 1 
146	//				(exact)           (needs to be done in the right order, or not at all for i == 0)
147	//
148	//		The multiplication is exact. The two adds need to be done least to greatest in order of magnitude.
149	//		Since 2**f - 1 is always in the range [ -0.5, 1.0 ], we know ahead of time that (2**f-1)(2**i)
150	//		always has less magnitude than (2**i).  if i < 0 then add the -1 first. Otherwise, add (2**i) first
151	//
152	//	There are a lot of special cases here, which is why this is a bit branchy. I tried it without
153	//	branches and wasn't able to satisfy all the constraints called for by various standards
154
155#define EXPM1L_STACK_SIZE		12
156#undef FIRST_ARG_OFFSET
157#define FIRST_ARG_OFFSET		(FRAME_SIZE + EXPM1L_STACK_SIZE)
158
159ENTRY(expm1l)  
160	SUBP	$EXPM1L_STACK_SIZE, STACKP
161	movl	$0xc6000000, 8(STACKP)					// -8192.0f
162	movl	$0x7f800000, 4(STACKP)					// inf
163	movl	$0x47000000, 0(STACKP)					// 32768
164
165	//test for inf or NaN
166	fldt		FIRST_ARG_OFFSET(STACKP)				// { x }
167	fldz								// { 0, x }
168	fucomip		%st(1), %st(0)			// { x }
169	jnbe			expm1l_neg			//	if 0 > x goto expm1_neg
170	
171	// x >= 0...
172	// get x*log2e
173	fldl2e								// { log2e, x } 
174	fmulp								// { x*log2e }
175	flds		4(STACKP)					// { inf, x*log2e }
176	fucomip		%ST(1), %ST(0)			// { x*log2e }		if x*log2e == inf  or NaN
177	je			expm1l_special
178
179	//next convert to integer and fractional parts using round to zero rounding mode
180	//calculate trunc(x*log2e)
181	flds		(STACKP)				// { 32768.0, x*log2e }
182	fucomi		%st(1), %st(0)			// { x*log2e }
183	fcmovnbe	%st(1), %st(0)			// { clipped, x*log2e }
184	fstp		%st(1)					// { clipped }
185	fld			%st(0)					// { clipped, clipped }
186    
187#if defined( __SSE3__ )
188	fisttpll	(STACKP)				// { clipped }
189	fildll		(STACKP)				// { i, clipped }
190#else
191    //set to round to zero
192    fnstcw      4(STACKP)
193    movw        4(STACKP),      %ax         
194    movw        %ax,            %dx         //save old value for later
195    orw         $0x0c00,        %ax
196    movw        %ax,            4(STACKP)
197    fldcw       4(STACKP)
198    //round to int
199	fistpll	(STACKP)                    // { clipped }
200	fildll		(STACKP)				// { i, clipped }
201    //restore old rounding mode
202    movw        %dx,            4(STACKP)
203    fldcw       4(STACKP)
204#endif
205
206	//test 0 against i
207	fldz								// {0, i, x*log2e}
208	fucomip		%st(1), %st(0)			// {i, x*log2e}
209
210	//get the fractional part of x*log2e with the same sign as x*log2e
211	fsubr		%st(0), %st(1)			// { i, f }
212	
213	//calculate 2**f-1
214	fxch								// { f, i }
215	f2xm1								// { 2**f -  1, i }
216
217	//three way switch for 0>i, i==0, i>0
218	je		exmpm1l_zero
219	
220	//This case is a bit trickier, because 2**i can overflow when the result does not
221	//we do this as:
222	//
223	//	e**x - 1	= (2**f - 1)(2**i)	+ (2**i) - 1
224	//				= (2**i)( (2**f-1) - 2**(-i) + 1)
225	//
226	//	we clamp 2**(-i) so that it can't go lower than 2**(-8192) to prevent underflow
227	//	When this happens the result is always 0x1.fffffffffffffffep-1 * 2**i, so it doesn't
228	//  much matter
229	fld		%st(1)						// { i, 2**f-1, i}
230	fchs								// {-i, 2**f-1, i}
231	flds	8(STACKP)					// {-8192.0f, -i, 2**f-1, i}
232	fucomi  %st(1), %st(0)				// {-8192.0f, -i, 2**f-1, i}
233	fcmovb  %st(1), %st(0)				// { clipped(-i), -i, 2**f-1, i}
234	fstp	%st(1)						// { clipped(-i), 2**f-1, i}
235	fld1								// {1, clipped(-i), 2**f-1, i}
236	fscale								// { 2**-i,clipped(-i), 2**f-1, i}
237	fstp	%st(1)						// { 2**-i, 2**f-1, i}
238	fsubrp								// { 2**f-1 - 2**-i, i}
239	fld1								// { 1, 2**f-1 - 2**-i, i}
240	faddp								// { 1+ 2**f-1 - 2**-i, i}
241	fscale
242	fstp	%st(1)
243
244	ADDP	$EXPM1L_STACK_SIZE,	STACKP
245    ret
246
247
248// x < 0 or x is NaN
249expm1l_neg:								// {x}
250	flds	4(STACKP)					// { inf, x }
251	fchs								// { -inf, x }
252	fucomip	%ST(1), %ST(0)				// { x }		if x == -inf  
253	je		expm1l_special				// NaNs and -Inf go to special case handling code
254
255	// clip large negative values to avoid overflow
256	flds	8(STACKP)					// { -8192, x }
257	fucomi	%st(1), %st(0)				// { -8192, x }
258	fcmovb  %st(1), %st(0)				// { clipped, x }
259	fstp	%st(1)						// { clipped }
260
261	//scale by log2e
262	fldl2e								// { log2e, clipped }	//clipped a.k.a x from here on
263	fmulp								// { x*log2e }
264
265	//next convert to integer and fractional parts using round to zero rounding mode
266	//calculate trunc(x*log2e)
267	fld			%st(0)					// { x*log2e, x*log2e }
268#if defined( __SSE3__ )
269	fisttpll	(STACKP)				// { x*log2e }
270	fildll		(STACKP)				// { i, x*log2e }
271#else
272    //set to round to zero
273    fnstcw      4(STACKP)
274    movw        4(STACKP),      %ax         
275    movw        %ax,            %dx         //save old value for later
276    orw         $0x0c00,        %ax
277    movw        %ax,            4(STACKP)
278    fldcw       4(STACKP)
279    //round to int
280	fistpll     (STACKP)				// { x*log2e }
281	fildll		(STACKP)				// { i, x*log2e }
282    //restore old rounding mode
283    movw        %dx,            4(STACKP)
284    fldcw       4(STACKP)
285#endif
286    
287
288	//test 0 against i
289	fldz								// {0, i, x*log2e}
290	fucomip		%st(1), %st(0)			// {i, x*log2e}
291
292	//get the fractional part of x*log2e with the same sign as x*log2e
293	fsubr		%st(0), %st(1)			// { i, f }
294	
295	//calculate 2**f-1
296	fxch								// { f, i }
297	f2xm1								// { 2**f -  1, i }
298
299	//jump to complex code for handling the i < 0 case
300	je	exmpm1l_zero
301
302	
303	// In the i < 0 case, 2**i has a smaller magnitude than -1, so it is added first
304	//
305	//			= (2**f - 1)(2**i)	+ (2**i) - 1 
306	//
307	// The 2**i scaling is exact, so it can be factored out:
308	//
309	//			= (2**i)(2**f-1+1) -1
310	fld1								// { 1, 2**f - 1, i }
311	faddp								// { 2**f-1+1, i }
312	fscale								// { (2**f-1+1)*(2**i), i }
313	fstp	%st(1)						// { (2**f-1+1)*(2**i) }
314	fld1								// { 1, (2**f-1+1)*(2**i) } 
315	fsubrp								// { result }
316	ADDP	$EXPM1L_STACK_SIZE,	STACKP
317    ret
318
319exmpm1l_zero:	//the i == 0 case
320	fstp	%st(1)						// { result }
321	ADDP	$EXPM1L_STACK_SIZE,	STACKP
322    ret
323
324
325expm1l_special:  //handles +-Inf, QNaN, SNaN
326	fldz								// {0, x}
327	fucomip  %ST(1), %ST(0)				// { x }	if( 0 > x )
328	fadd	 %ST(0), %ST(0)				// silence NaNs
329	fld1								// { 1, x }
330	fchs								// { -1, x }
331	fxch								// { f, -1 }
332	fcmovnbe %ST(1), %ST(0)				// set the -Inf case to -1
333	fstp	 %ST(1)
334	ADDP	$EXPM1L_STACK_SIZE,	STACKP
335	ret