src/libm/Source/Intel/acosf.S at fixPythonPipStalling

overby.me / darling-nix
fork atom
this repo has no description
fork atom
darling-nix / src / libm / Source / Intel / acosf.S
at fixPythonPipStalling 474 lines 14 kB view raw
wrap content
Lubos Dolezel More progress 9y ago
a76db625
  1/*	acosf.s -- acosf for standard math library.
  2
  3	Written by Eric Postpischil, July 2007.
  4*/
  5
  6
  7	.literal8
  8
  9// Define the points where our algorithm changes.
 10nPoint:	.double	-.62000000476837158
 11pPoint:	.double	+.62000000476837158
 12
 13// Define miscellaneous constants.
 14nOne:	.double	-1
 15pOne:	.double	+1
 16HalfPi:	.double	 1.5707963267948966192313217
 17Pi:		.double	 3.1415926535897932384626433
 18
 19// Define a coefficient for center polynomial (used for x in [-.62, +.62]).
 20C2:		.double	 -0.7009458946846486978642201e-1
 21
 22// Define coefficients for tail polynomial (used for x outside [-.62, +.62]).
 23pT2:	.double	  0.1475254770791043897245664e-2
 24nT2:	.double	 -0.1475254770791043897245664e-2
 25
 26// Define coefficients for reciprocal-square-root refinement.
 27S0:		.double	 4.999999720603234088021257968
 28S1:		.double	-3.333333463718495862077843807
 29
 30
 31	.const
 32	.align	4
 33
 34/*	Define some coefficients for center polynomial (used for x in [-.62,
 35	+.62]).  These are stored in pairs at aligned addresses for use in SIMD
 36	instructions.
 37*/
 38C1:		.double	+1.5714407739207253912688773, -1.5375796594506042231041545
 39C0:		.double	+1.2521077245762256716713145, +1.8992998261460658570880250
 40
 41/*	Define some coefficients for positive tail polynomial (used for x in (+.62,
 42 	1)).
 43*/
 44T0:		.double	+14.7303701447222424478789710, +27.0927484663960477135094919
 45T1:		.double	 +2.7185038560132749878882294,  -8.6073428141752546318770571
 46
 47
 48// Rename the general registers (just to make it easier to keep track of them).
 49#if defined __i386__
 50	#define	r0	%eax
 51	#define	r1	%ecx
 52	#define	r2	%edx
 53	#define	r3	%ebx
 54	#define	r4	%esp
 55	#define	r5	%ebp
 56	#define	r6	%esi
 57	#define	r7	%edi
 58#elif defined __x86_64__
 59	#define	r0	%rax
 60	#define	r1	%rcx
 61	#define	r2	%rdx
 62	#define	r3	%rbx
 63	#define	r4	%rsp
 64	#define	r5	%rbp
 65	#define	r6	%rsi
 66	#define	r7	%rdi
 67#else
 68	#error "Unknown architecture."
 69#endif
 70
 71
 72	.text
 73
 74
 75// Define various symbols.
 76
 77#define	BaseP		r0		// Base address for position-independent addressing.
 78
 79#define	p			%xmm0	// Must be in %xmm0 for return on x86_64.
 80#define	x			%xmm1
 81#define	x1			%xmm2
 82#define	w			x1
 83#define	pa			%xmm3
 84#define	e			%xmm4
 85#define	rss			%xmm5
 86
 87#if defined __i386__
 88
 89	// Define location of argument x.
 90	#define	Argx			4(%esp)
 91
 92	// Define how to address data.  BaseP must contain the address of label 0.
 93	#define	Address(label)	label-0b(BaseP)
 94
 95#elif defined __x86_64__
 96
 97	// Define location of argument x.
 98	#define	Argx			%xmm0
 99
100	// Define how to address data.
101	#define	Address(label)	label(%rip)
102
103#endif
104
105
106/*	float acosf(float x).
107
108	Notes:
109
110		Citations in parentheses below indicate the source of a requirement.
111
112		"C" stands for ISO/IEC 9899:TC2.
113
114		The Open Group specification (IEEE Std 1003.1, 2004 edition) adds no
115		requirements since it defers to C and requires errno behavior only if
116		we choose to support it by arranging for "math_errhandling &
117		MATH_ERRNO" to be non-zero, which we do not.
118
119	Return value:
120
121		For arcosine of 1, return +0 (C F.9.1.1).
122
123		For 1 < |x| (including infinity), return NaN (C F.9.1.1).
124
125		For a NaN, return the same NaN (C F.9 11 and 13).  (If the NaN is a
126		signalling NaN, we return the "same" NaN quieted.)
127
128		Otherwise:
129
130			If the rounding mode is round-to-nearest, return arccosine(x)
131			faithfully rounded.
132
133			Return a value in [0, pi] (C 7.12.4.1 3).  Note that this prohibits
134			returning a correctly rounded value for acosf(-1), since pi rounded
135			to a float lies outside that interval.
136		
137			Not implemented:  In other rounding modes, return arccosine(x)
138			possibly with slightly worse error, not necessarily honoring the
139			rounding mode (Ali Sazegari narrowing C F.9 10).
140
141	Exceptions:
142
143		May or may not raise inexact, even if the result is exact (C F.9 8).
144
145		Raise invalid if the input is a signalling NaN (C 5.2.4.2.2 3, in spite
146		of C 4.2.1)  or 1 < |x| (including infinity) (C F.9.1.1) but not if the
147		input is a quiet NaN (C F.9 11).
148
149		May not raise exceptions otherwise (C F.9 9).
150
151	Properties:
152
153		Monotonic, proven by exhaustive testing.
154
155		Exhaustive testing proved this routine returns faithfully rounded
156		results.  Since the rsqrtss instruction is specified to return a value
157		in an interval, tests were performed using each possible result,
158		showing that a valid result will be obtained regardless of which
159		value rsqrtss provides.
160*/
161	.align	5
162#if !defined DevelopmentInstrumentation
163	// This is the regular name used in the deployed implementation.
164		.globl _acosf
165	_acosf:
166#else
167	// This is the name used for a special test version of the routine.
168		.globl _acosfInstrumented
169	_acosfInstrumented:
170#endif
171
172	cvtss2sd	Argx, x				// Convert x to double precision.
173
174	#if defined __i386__
175
176		// Get address of 0 in BaseP.
177			call	0f				// Push program counter onto stack.
178		0:
179			pop		BaseP			// Get program counter.
180
181	#endif
182
183/*	We use different algorithms for different parts of the domain.  There
184	is a "negative tail" from -1 to nPoint, a center from nPoint to pPoint,
185	and a "positive tail" from pPoint to +1.  Here, we compare and branch
186	to the appropriate code.
187
188	There are also special cases:  NaNs, x < -1, and 1 < x.  These are weeded
189	out in PositiveTail or NegativeTail.
190*/
191
192	ucomisd		Address(pPoint), x
193	ja			PositiveTail
194
195	ucomisd		Address(nPoint), x
196	jb			NegativeTail
197
198
199/*	Here we have nPoint <= x <= pPoint.  This is handled with a simple
200	evaluation of a polynomial that approximates arccosine.
201
202	The polynomial has been arranged into the form:
203
204		pi/2 - x + x *
205				c2   * (x**4 + c01 * x**2 + c00)
206			*   x**2 * (x**4 + c11 * x**2 * c10)).
207
208	The coefficients c00 and c10 are stored in a pair at C0, and c01 and c11
209	are stored at C1.  c2 is at C2.
210
211	The two quartic factors are evaluated in SIMD registers.  For brevity, some
212	comments below describe only one element of a register.  The other is
213	analagous.
214*/
215	movsd		x, x1				// Save a copy of x for later.
216	mulsd		x, x				// Form x**2.
217	movapd		Address(C1), p		// Get first coefficient pair.
218	unpcklpd	x, x				// Duplicate in SIMD register.
219	addpd		x, p				// Form x**2 + c1.
220	mulpd		x, p				// Form x**4 + c1 * x**2.
221	movlpd		Address(C2), x		// Put c2 in low element, with x**2 in high.
222	mulsd		x1, x				// Multiply by x.
223	addpd		Address(C0), p		// Form x**4 + c1 * x**2 + c0.
224	mulpd		x, p				// Multiply by c2*x in low and x**2 in high.
225	movhlps		p, pa				// Get high element.
226	subsd		Address(HalfPi), x1	// Form x - pi/2.
227	mulsd		pa, p				// Multiply two factors.
228	subsd		x1, p				// Form pi/2 - x + x * c2 * ...
229
230// Return the double-precision number currently in p.
231ReturnDoubleInp:
232	cvtsd2ss	p, p				// Convert result to single precision.
233
234// Return the single-precision number currently in p.
235ReturnSingleInp:
236
237	#if defined __i386__
238		movss		p, Argx			// Shuttle result through memory.
239			// This uses the argument area for scratch space, which is allowed.
240		flds		Argx			// Return input on floating-point stack.
241	#else
242		// On x86_64, the return value is now in p, which is %xmm0.
243	#endif
244
245	ret
246
247
248// Handle pPoint < x.
249PositiveTail:
250
251	movsd		Address(pOne), w	// Get +1 for math and comparison.
252	ucomisd		w, x				// Compare x to +1.
253	jae			InputIsPositiveSpecial
254
255/*	Here we have pPoint < x < 1.  The algorithm here is inspired by the
256	identity arccosine(x) = 2 * arcsine(sqrt((1-x)/2)).  Replacing arcsine with
257	an approximating polynomial would give an odd polynomial in sqrt((1-x)/2),
258	which is the same as sqrt(1-x) multiplied by some polynomial in x.  So we
259	have:
260
261		arccosine(x) ~= sqrt(1-x) * t(x).
262
263	Unfortunately, the square-root instruction (rsqrtss) takes too long, so we
264	use the faster reciprocal-square-root-estimate instruction instead and
265	refine its estimate.  Given an estimate e from the rsqrtss instruction, the
266	square root of 1-x is very nearly, e * (1-x) * s(e**2 * (1-x)).  Let e2a
267	be e**2 * (1-x), so sqrt(1-x) is nearly e * a * s(e2a).  The leading
268	coefficient of s, cl, has been removed (by dividing s by it) and multiplied
269	into the polynomial p above.  That leaves:
270
271		sqrt(1-x) / cl ~= e * a * s(e2a)/cl.
272
273	s(e2a)/cl is e2a**2 + s1 * e2a + s0, where s1 and s0 have been stored at
274	labels S1 and S0, above.
275
276	t(x) has been arranged into the form:
277
278			t2
279		*	(x**2 + t01 * x + t00)
280		*	(x**2 + t11 * x + t10).
281
282	The two quadratic factors are evaluated in SIMD registers.  For brevity,
283	some comments below describe only one element of a register.  The other is
284	analagous.
285
286	So, our job here is to evaluate:
287
288		a = 1-x.
289
290		e = estimate of 1/sqrt(a).
291
292		e2a = e*e*a.
293
294		acosf(x) ~= e * a
295			*	(e2a**2 + s1 * e2a + s0)
296			*	t2
297			*	(x**2 + t01 * x + t00)
298			*	(x**2 + t11 * x + t10).
299*/
300
301	subsd		x, w				// Form 1-x.
302	cvtsd2ss	w, e				// Convert to single precision for rsqrtss.
303	movapd		Address(T1), p		// Start asinf polynomial.
304	unpcklpd	x, x				// Duplicate x.
305	addpd		x, p				// Form x + t1.
306	#if !defined DevelopmentInstrumentation
307		// This is the regular code used in the deployed implementation.
308		rsqrtss		e, e			// Estimate 1/sqrt(1-x).
309	#else
310		/*	This instruction uses an estimate of 1/sqrt(1-x) passed by the
311			caller rather than the rsqrtss instruction.  This allows us to test
312			the implementation with all values that rsqrtss might return.
313		*/
314		movss	8(%esp), e			// Use caller's estimate of 1/sqrt(1-x).
315	#endif
316	cvtss2sd	e, e				// Convert to double precision.
317	mulpd		x, p				// Form x**2 + t1*x.
318	mulsd		e, w				// Form e    * (1-x).
319	mulsd		w, e				// Form e**2 * (1-x).
320		// "e" in comments refers to the initial estimate from rsqrtss.
321	movhpd		Address(pT2), w		// Copy coefficient into high element.
322	addpd		Address(T0), p		// Form x**2 + t1*x + t0.
323	movsd		Address(S1), rss	// Form s1.
324	addsd		e, rss				// Form e**2 * (1-x) + s1.
325	mulpd		w, p				// Form e * (1-x) * p(x), split.
326	mulsd		e, rss				// Form e2a**2 + s1 * e2a.
327	movhlps		p, pa				// Separate high element.
328	addsd		Address(S0), rss	// Form e2a**2 + s1 * e2a + s0.
329	mulsd		pa, p				// Finish e * (1-x) * p(x).
330	mulsd		rss, p				// Form e * (1-x) / sqrt(1-x) * p(x).
331
332	jmp			ReturnDoubleInp
333
334
335// Handle x < nPoint.
336NegativeTail:
337
338	movsd		Address(pOne), w	// Get +1 for math.
339	ucomisd		Address(nOne), x	// Compare x to -1.
340	jbe			InputIsNegativeSpecial
341
342/*	Here we have -1 < x < nPoint.  We use the same algorithm as in PositiveTail
343	but adapted for -x.
344
345	For brevity, some comments below describe only one element of a register.
346	The other is analagous.
347
348	Our job here is to evaluate:
349
350		a = 1+x.
351
352		e = estimate of 1/sqrt(a).
353
354		e2a = e*e*a.
355
356		acosf(x) ~= pi - e * a
357			*	(e2a**2 + s1 * e2a + s0)
358			*	t2
359			*	(x**2 - t01 * x + t00)
360			*	(x**2 - t11 * x + t10).
361
362	Note that the signs of terms involving x are negated from those in
363	PositiveTail.
364
365	For convenience, the final two factors are negated:
366
367			*	(-x**2 + t01 * x - t00)
368			*	(-x**2 + t11 * x - t10).
369*/
370
371	addsd		x, w				// Form 1+x.
372	cvtsd2ss	w, e				// Convert to single precision for rsqrtss.
373	movapd		Address(T1), p		// Start asinf polynomial.
374	unpcklpd	x, x				// Duplicate x.
375	subpd		x, p				// Form -x + t1.
376	#if !defined DevelopmentInstrumentation
377		// This is the regular code used in the deployed implementation.
378		rsqrtss		e, e			// Estimate 1/sqrt(1+x).
379	#else
380		/*	This instruction uses an estimate of 1/sqrt(1+x) passed by the
381			caller rather than the rsqrtss instruction.  This allows us to test
382			the implementation with all values that rsqrtss might return.
383		*/
384		movss	8(%esp), e			// Use caller's estimate of 1/sqrt(1+x).
385	#endif
386	cvtss2sd	e, e				// Convert to double precision.
387	mulpd		x, p				// Form -x**2 + t1*x.
388	mulsd		e, w				// Form e    * (1+x).
389	mulsd		w, e				// Form e**2 * (1+x).
390		// "e" in comments refers to the initial estimate from rsqrtss.
391	movhpd		Address(nT2), w		// Copy coefficient into high element.
392	subpd		Address(T0), p		// Form -x**2 + t1*x - t0.
393	movsd		Address(S1), rss	// Form s1.
394	addsd		e, rss				// Form e**2 * (1+x) + s1.
395	mulpd		w, p				// Form e * (1+x) * p(x), split.
396	mulsd		e, rss				// Form e2a**2 + s1 * e2a.
397	movhlps		p, pa				// Separate high element.
398	addsd		Address(S0), rss	// Form e2a**2 + s1 * e2a + s0.
399	mulsd		pa, p				// Finish e * (1+x) * p(x).
400	mulsd		rss, p				// Form e * (1+x) / sqrt(1+x) * p(x).
401	addsd		Address(Pi), p		// Form pi - 2*asin(sqrt((1+x)/2)).
402
403	jmp			ReturnDoubleInp
404
405
406/*	Here we handle inputs greater than or equal to one, including infinity,
407	but not including NaNs.  The condition code must be set to indicate equal
408	(zero flag is one) iff the input is one.
409*/
410InputIsPositiveSpecial:
411	je			InputIsPositiveOne
412	jmp			SignalInvalid
413
414
415/*	Here we handle inputs less than or equal to -1, including -infinity, and
416	NaNs.  The condition code must be set as if by using ucomisd to compare
417	the input in the "source operand" to -1 in the "destination operand".
418*/
419InputIsNegativeSpecial:
420	jp			InputIsNaN
421	je			InputIsNegativeOne
422
423	// Continue into SignalInvalid.
424//	jmp			SignalInvalid
425
426
427/*	Here we handle inputs outside the domain of the function.  We subtract
428	infinity from itself to generate the invalid signal and return a NaN.
429*/
430
431	.literal4
432Infinity:	.long	0x7f800000
433
434	.text
435SignalInvalid:
436	movss		Address(Infinity), p
437	subss		p, p				// Generate invalid signal and NaN value.
438	jmp			ReturnSingleInp
439
440
441/*	Here we handle an input of +1 or -1.  arccosine(-1) is pi, which increases
442	when rounded to single precision.  However, we are required to return
443	results in [0, pi], so we return pi rounded down.
444*/
445
446	.literal8
447/*	Define a value near pi that yields pi rounded down when converted to
448	single precision.  This allows us to generate inexact and return the
449	desired value for acosf(-1).
450*/
451AlmostPi:	.double	3.1415925
452
453	.text
454InputIsPositiveOne:
455	xorps		p, p					// Return zero.
456	jmp			ReturnSingleInp
457
458InputIsNegativeOne:
459	cvtsd2ss	Address(AlmostPi), p
460	jmp			ReturnSingleInp
461
462
463InputIsNaN:
464
465	#if defined __i386__
466		flds		Argx			// Return result on floating-point stack.
467	#else
468		cvtsd2ss	x, %xmm0		// Return in %xmm0.
469			/*	We cannot just return the input, because we must quiet a
470				signalling NaN.
471			*/
472	#endif
473
474	ret