src/libm/Source/Intel/asinf.S at fixPythonPipStalling

overby.me / darling-nix
fork atom
this repo has no description
fork atom
darling-nix / src / libm / Source / Intel / asinf.S
at fixPythonPipStalling 480 lines 14 kB view raw
wrap content
Lubos Dolezel More progress 9y ago
a76db625
  1/*	asinf.s -- asinf for standard math library.
  2
  3	Written by Eric Postpischil, June 2007.
  4*/
  5
  6
  7	.literal8
  8
  9// Define the points where our algorithm changes.
 10nPoint:	.double	-.57
 11pPoint:	.double	+.57
 12
 13// Define miscellaneous constants.
 14nOne:	.double	-1
 15pOne:	.double	+1
 16HalfPi:	.double	 1.5707963267948966192313217
 17
 18// Define a coefficient for center polynomial (used for x in [-.57, +.57]).
 19C2:		.double	 0.5175137818992348748214025e-1
 20
 21// Define coefficients for tail polynomials (used for x outside [-.57, +.57]).
 22pT2:	.double	-0.1651550997063622696638513e-2
 23nT2:	.double	+0.1651550997063622696638513e-2
 24
 25// Define coefficients for reciprocal-square-root refinement.
 26S0:		.double	 4.999999720603234088021257968
 27S1:		.double	-3.333333463718495862077843807
 28
 29
 30	.const
 31	.align	4
 32
 33/*	Define some coefficients for center polynomial (used for x in [-.57,
 34	+.57]).  These are stored in pairs at aligned addresses for use in SIMD
 35	instructions.
 36*/
 37C1:		.double	 1.8337275054369541825679091, -1.4826841272512669686823077
 38C0:		.double	 1.5667812996782509912266781,  2.0555474653288563095606975
 39
 40// Define some coefficients for tail polynomial (used for x in (+.57, 1)).
 41T0:		.double	+14.0375337010427139350610970, +25.3978891245023107522273720
 42T1:		.double	 +2.7174503090559850822332204,  -8.3122365980471962999776635
 43
 44
 45// Rename the general registers (just to make it easier to keep track of them).
 46#if defined __i386__
 47	#define	r0	%eax
 48	#define	r1	%ecx
 49	#define	r2	%edx
 50	#define	r3	%ebx
 51	#define	r4	%esp
 52	#define	r5	%ebp
 53	#define	r6	%esi
 54	#define	r7	%edi
 55#elif defined __x86_64__
 56	#define	r0	%rax
 57	#define	r1	%rcx
 58	#define	r2	%rdx
 59	#define	r3	%rbx
 60	#define	r4	%rsp
 61	#define	r5	%rbp
 62	#define	r6	%rsi
 63	#define	r7	%rdi
 64#else
 65	#error "Unknown architecture."
 66#endif
 67
 68
 69	.text
 70
 71
 72// Define various symbols.
 73
 74#define	BaseP		r0		// Base address for position-independent addressing.
 75
 76#define	p			%xmm0	// Must be in %xmm0 for return on x86_64.
 77#define	x			%xmm1
 78#define	x1			%xmm2
 79#define	w			x1
 80#define	pa			%xmm3
 81#define	e			%xmm4
 82#define	rss			%xmm5
 83
 84#if defined __i386__
 85
 86	// Define location of argument x.
 87	#define	Argx			4(%esp)
 88
 89	// Define how to address data.  BaseP must contain the address of label 0.
 90	#define	Address(label)	label-0b(BaseP)
 91
 92#elif defined __x86_64__
 93
 94	// Define location of argument x.
 95	#define	Argx			%xmm0
 96
 97	// Define how to address data.
 98	#define	Address(label)	label(%rip)
 99
100#endif
101
102
103/*	float asinf(float x).
104
105	Notes:
106
107		Citations in parentheses below indicate the source of a requirement.
108
109		"C" stands for ISO/IEC 9899:TC2.
110
111		The Open Group specification (IEEE Std 1003.1, 2004 edition) adds no
112		requirements since it defers to C and requires errno behavior only if
113		we choose to support it by arranging for "math_errhandling &
114		MATH_ERRNO" to be non-zero, which we do not.
115
116	Return value:
117
118		For arcsine of +/- zero, return zero with same sign (C F.9 12 and
119		F.9.1.2).
120
121		For 1 < |x| (including infinity), return NaN (C F.9.1.2).
122
123		For a NaN, return the same NaN (C F.9 11 and 13).  (If the NaN is a
124		signalling NaN, we return the "same" NaN quieted.)
125
126		Otherwise:
127
128			If the rounding mode is round-to-nearest, return arcsine(x)
129			faithfully rounded.
130
131			Return a value in [-pi/2, +pi/2] (C 7.12.4.2 3).  Note that this
132			prohibits returning correctly rounded values for asinf(-1) and
133			asinf(+1), since pi/2 rounded to a float lies outside that
134			interval.
135		
136			Not implemented:  In other rounding modes, return arcsine(x)
137			possibly with slightly worse error, not necessarily honoring the
138			rounding mode (Ali Sazegari narrowing C F.9 10).
139
140	Exceptions:
141
142		Raise underflow for a denormal result (C F.9 7 and Draft Standard for
143		Floating-Point Arithmetic P754 Draft 1.2.5 9.5).  If the input is the
144		smallest normal, underflow may or may not be raised.  This is stricter
145		than the older 754 standard.
146
147		May or may not raise inexact, even if the result is exact (C F.9 8).
148
149		Raise invalid if the input is a signalling NaN (C 5.2.4.2.2 3, in spite
150		of C 4.2.1)  or 1 < |x| (including infinity) (C F.9.1.2) but not if the
151		input is a quiet NaN (C F.9 11).
152
153		May not raise exceptions otherwise (C F.9 9).
154
155	Properties:
156
157		Monotonic, proven by exhaustive testing.
158
159		Exhaustive testing proved this routine returns faithfully rounded
160		results.  Since the rsqrtss instruction is specified to return a value
161		in an interval, tests were performed using each possible result,
162		showing that a valid result will be obtained regardless of which
163		value rsqrtss provides.
164*/
165	.align	5
166#if !defined DevelopmentInstrumentation
167	// This is the regular name used in the deployed implementation.
168		.globl _asinf
169	_asinf:
170#else
171	// This is the name used for a special test version of the routine.
172		.globl _asinfInstrumented
173	_asinfInstrumented:
174#endif
175
176	cvtss2sd	Argx, x				// Convert x to double precision.
177
178	#if defined __i386__
179
180		// Get address of 0 in BaseP.
181			call	0f				// Push program counter onto stack.
182		0:
183			pop		BaseP			// Get program counter.
184
185	#endif
186
187/*	We use different algorithms for different parts of the domain.  There
188	is a "negative tail" from -1 to nPoint, a center from nPoint to pPoint,
189	and a "positive tail" from pPoint to +1.  Here, we compare and branch
190	to the appropriate code.
191
192	There are also special cases:  NaNs, x < -1, and 1 < x.  These are weeded
193	out in PositiveTail or NegativeTail.
194*/
195
196	ucomisd		Address(pPoint), x
197	ja			PositiveTail
198
199	ucomisd		Address(nPoint), x
200	jb			NegativeTail
201
202
203/*	Here we have nPoint <= x <= pPoint.  This is handled with a simple
204	evaluation of a polynomial that approximates arcsine.
205
206	The polynomial has been arranged into the form:
207
208		x + x *
209				c2   * (x**4 + c01 * x**2 + c00)
210			*   x**2 * (x**4 + c11 * x**2 * c10).
211
212	The coefficients c00 and c10 are stored in a pair at C0, and c01 and c11
213	are stored at C1.  c2 is at C2.
214
215	The two quartic factors are evaluated in SIMD registers.  For brevity, some
216	comments below describe only one element of a register.  The other is
217	analagous.
218*/
219	movsd		x, x1				// Save a copy of x for later.
220	mulsd		x, x				// Form x**2.
221	movapd		Address(C1), p		// Get first coefficient pair.
222	unpcklpd	x, x				// Duplicate in SIMD register.
223	addpd		x, p				// Form x**2 + c1.
224	mulpd		x, p				// Form x**4 + c1 * x**2.
225	movlpd		Address(C2), x		// Put c2 in low element, with x**2 in high.
226	mulsd		x1, x				// Multiply by x.
227	addpd		Address(C0), p		// Form x**4 + c1 * x**2 + c0.
228	mulpd		x, p				// Multiply by c2*x in low and x**2 in high.
229	movhlps		p, pa				// Get high element.
230	mulsd		pa, p				// Multiply two factors.
231	addsd		x1, p				// Add final term.
232
233// Return the double-precision number currently in p.
234ReturnDoubleInp:
235	cvtsd2ss	p, p				// Convert result to single precision.
236
237// Return the single-precision number currently in p.
238ReturnSingleInp:
239
240	#if defined __i386__
241		movss		p, Argx			// Shuttle result through memory.
242			// This uses the argument area for scratch space, which is allowed.
243		flds		Argx			// Return input on floating-point stack.
244	#else
245		// On x86_64, the return value is now in p, which is %xmm0.
246	#endif
247
248	ret
249
250
251// Handle pPoint < x.
252PositiveTail:
253
254	movsd		Address(pOne), w	// Get +1 for math and comparison.
255	ucomisd		w, x				// Compare x to +1.
256	jae			InputIsPositiveSpecial
257
258/*	Here we have pPoint < x < 1.  The algorithm here is inspired by the
259	identity arcsine(x) = pi/2 - 2 * arcsine(sqrt((1-x)/2)).  Replacing arcsine
260	with an approximating polynomial would give an odd polynomial in
261	sqrt((1-x)/2), which is the same as sqrt(1-x) multiplied by some polynomial
262	in x.  So we have:
263
264		arcsine(x) ~= pi/2 + sqrt(1-x) * t(x).
265
266	Unfortunately, the square-root instruction (rsqrtss) takes too long, so we
267	use the faster reciprocal-square-root-estimate instruction instead and
268	refine its estimate.  Given an estimate e from the rsqrtss instruction, the
269	square root of 1-x is very nearly, e * (1-x) * s(e**2 * (1-x)).  Let e2a
270	be e**2 * (1-x), so sqrt(1-x) is nearly e * a * s(e2a).  The leading
271	coefficient of s, cl, has been removed (by dividing s by it) and multiplied
272	into the polynomial p above.  That leaves:
273
274		sqrt(1-x) / cl ~= e * a * s(e2a)/cl.
275
276	s(e2a)/cl is e2a**2 + s1 * e2a + s0, where s1 and s0 have been stored at
277	labels S1 and S0, above.
278
279	t(x) has been arranged into the form:
280
281			t2
282		*	(x**2 + t01 * x + t00)
283		*	(x**2 + t11 * x + t10).
284
285	The two quadratic factors are evaluated in SIMD registers.  For brevity,
286	some comments below describe only one element of a register.  The other is
287	analagous.
288
289	So, our job here is to evaluate:
290
291		a = 1-x.
292
293		e = estimate of 1/sqrt(a).
294
295		e2a = e*e*a.
296
297		asinf(x) ~= pi/2 + e * a
298			*	(e2a**2 + s1 * e2a + s0)
299			*	t2
300			*	(x**2 + t01 * x + t00)
301			*	(x**2 + t11 * x + t10).
302*/
303
304	subsd		x, w				// Form 1-x.
305	cvtsd2ss	w, e				// Convert to single precision for rsqrtss.
306	movapd		Address(T1), p		// Start asinf polynomial.
307	unpcklpd	x, x				// Duplicate x.
308	addpd		x, p				// Form x + t1.
309	#if !defined DevelopmentInstrumentation
310		// This is the regular code used in the deployed implementation.
311		rsqrtss		e, e			// Estimate 1/sqrt(1-x).
312	#else
313		/*	This instruction uses an estimate of 1/sqrt(1-x) passed by the
314			caller rather than the rsqrtss instruction.  This allows us to test
315			the implementation with all values that rsqrtss might return.
316		*/
317		movss	8(%esp), e			// Use caller's estimate of 1/sqrt(1-x).
318	#endif
319	cvtss2sd	e, e				// Convert to double precision.
320	mulpd		x, p				// Form x**2 + t1*x.
321	mulsd		e, w				// Form e    * (1-x).
322	mulsd		w, e				// Form e**2 * (1-x).
323		// "e" in comments refers to the initial estimate from rsqrtss.
324	movhpd		Address(pT2), w		// Copy coefficient into high element.
325	addpd		Address(T0), p		// Form x**2 + t1*x + t0.
326	movsd		Address(S1), rss	// Form s1.
327	mulpd		w, p				// Form e * (1-x) * p(x), split.
328	addsd		e, rss				// Form e**2 * (1-x) + s1.
329	movhlps		p, pa				// Separate high element.
330	mulsd		pa, p				// Finish e * (1-x) * p(x).
331	mulsd		e, rss				// Form e2a**2 + s1 * e2a.
332	addsd		Address(S0), rss	// Form e2a**2 + s1 * e2a + s0.
333	mulsd		rss, p				// Form e * (1-x) / sqrt(1-x) * p(x).
334	addsd		Address(HalfPi), p	// Form pi/2 - 2*asin(sqrt((1-x)/2)).
335
336	jmp			ReturnDoubleInp
337
338
339// Handle x < nPoint.
340NegativeTail:
341
342	movsd		Address(pOne), w	// Get +1 for math.
343	ucomisd		Address(nOne), x	// Compare x to -1.
344	jbe			InputIsNegativeSpecial
345
346/*	Here we have -1 < x < nPoint.  We use the same algorithm as in PositiveTail
347	but adapted for -x.
348
349	For brevity, some comments below describe only one element of a register.
350	The other is analagous.
351
352	Our job here is to evaluate:
353
354		a = 1+x.
355
356		e = estimate of 1/sqrt(a).
357
358		e2a = e*e*a.
359
360		asinf(x) ~= -pi/2 - e * a
361			*	(e2a**2 + s1 * e2a + s0)
362			*	-t2
363			*	(x**2 - t01 * x + t00)
364			*	(x**2 - t11 * x + t10).
365
366	Note that the signs of terms involving x are negated from those in
367	PositiveTail, and the result is negated as well (by changing "pi/2" to
368	"-pi/2" and "t2" to "-t2").
369
370	For convenience, the final two factors are negated:
371
372			*	(-x**2 + t01 * x - t00)
373			*	(-x**2 + t11 * x - t10).
374*/
375
376	addsd		x, w				// Form 1+x.
377	cvtsd2ss	w, e				// Convert to single precision for rsqrtss.
378	movapd		Address(T1), p		// Start asinf polynomial.
379	unpcklpd	x, x				// Duplicate x.
380	subpd		x, p				// Form -x + t1.
381	#if !defined DevelopmentInstrumentation
382		// This is the regular code used in the deployed implementation.
383		rsqrtss		e, e			// Estimate 1/sqrt(1+x).
384	#else
385		/*	This instruction uses an estimate of 1/sqrt(1+x) passed by the
386			caller rather than the rsqrtss instruction.  This allows us to test
387			the implementation with all values that rsqrtss might return.
388		*/
389		movss	8(%esp), e			// Use caller's estimate of 1/sqrt(1+x).
390	#endif
391	cvtss2sd	e, e				// Convert to double precision.
392	mulpd		x, p				// Form -x**2 + t1*x.
393	mulsd		e, w				// Form e    * (1+x).
394	mulsd		w, e				// Form e**2 * (1+x).
395		// "e" in comments refers to the initial estimate from rsqrtss.
396	movhpd		Address(nT2), w		// Copy coefficient into high element.
397	subpd		Address(T0), p		// Form -x**2 + t1*x - t0.
398	movsd		Address(S1), rss	// Form s1.
399	addsd		e, rss				// Form e**2 * (1+x) + s1.
400	mulpd		w, p				// Form e * (1+x) * p(x), split.
401	mulsd		e, rss				// Form e2a**2 + s1 * e2a.
402	movhlps		p, pa				// Separate high element.
403	addsd		Address(S0), rss	// Form e2a**2 + s1 * e2a + s0.
404	mulsd		pa, p				// Finish e * (1+x) * p(x).
405	mulsd		rss, p				// Form e * (1+x) / sqrt(1+x) * p(x).
406	subsd		Address(HalfPi), p	// Form -pi/2 - 2*asin(sqrt((1+x)/2)).
407
408	jmp			ReturnDoubleInp
409
410
411/*	Here we handle inputs greater than or equal to one, including infinity,
412	but not including NaNs.  The condition code must be set to indicate equal
413	(zero flag is one) iff the input is one.
414*/
415InputIsPositiveSpecial:
416	je			InputIsPositiveOne
417	jmp			SignalInvalid
418
419
420/*	Here we handle inputs less than or equal to -1, including -infinity, and
421	NaNs.  The condition code must be set as if by using ucomisd to compare
422	the input in the "source operand" to -1 in the "destination operand".
423*/
424InputIsNegativeSpecial:
425	jp			InputIsNaN
426	je			InputIsNegativeOne
427
428	// Continue into SignalInvalid.
429//	jmp			SignalInvalid
430
431
432/*	Here we handle inputs outside the domain of the function.  We subtract
433	infinity from itself to generate the invalid signal and return a NaN.
434*/
435
436	.literal4
437Infinity:	.long	0x7f800000
438
439	.text
440SignalInvalid:
441	movss		Address(Infinity), p
442	subss		p, p				// Generate invalid signal and NaN value.
443	jmp			ReturnSingleInp
444
445
446/*	Here we handle an input of +1 or -1.  arcsine(+1) is pi/2, which increases
447	when rounded to single precision.  However, we are required to return
448	results in [-pi/2, pi/2], so we return pi/2 rounded toward zero.
449*/
450
451	.literal8
452/*	Define values near +pi/2 and -pi/2 that yield +/- pi/2 rounded toward zero
453	when converted to single precision.  This allows us to generate inexact and
454	return the desired values for asinf(+1) and asinf(-1).
455*/
456AlmostpHalfPi:	.double	+1.5707962
457AlmostnHalfPi:	.double	-1.5707962
458
459	.text
460InputIsPositiveOne:
461	cvtsd2ss	Address(AlmostpHalfPi), p
462	jmp			ReturnSingleInp
463
464InputIsNegativeOne:
465	cvtsd2ss	Address(AlmostnHalfPi), p
466	jmp			ReturnSingleInp
467
468
469InputIsNaN:
470
471	#if defined __i386__
472		flds		Argx			// Return result on floating-point stack.
473	#else
474		cvtsd2ss	x, %xmm0		// Return in %xmm0.
475			/*	We cannot just return the input, because we must quiet a
476				signalling NaN.
477			*/
478	#endif
479
480	ret