src/libm/Source/Intel/atanf.S at fixPythonPipStalling

overby.me / darling-nix
fork atom
this repo has no description
fork atom
darling-nix / src / libm / Source / Intel / atanf.S
at fixPythonPipStalling 369 lines 11 kB view raw
wrap content
Lubos Dolezel More progress 9y ago
a76db625
  1/*	atanf.s -- atanf for standard math library.
  2
  3	Written by Eric Postpischil, July 2007.
  4*/
  5
  6
  7	.literal8
  8
  9// Define the points where our algorithm changes.
 10nPoint0:	.double	-1
 11pPoint0:	.double	+1
 12nPoint1:	.double	-16777216
 13pPoint1:	.double	+16777216
 14
 15// Define miscellaneous constants.
 16nHalfPi:	.double	-1.5707963267948966192313217
 17One:		.double	 1
 18pHalfPi:	.double	+1.5707963267948966192313217
 19
 20// Define a coefficient for center polynomial (used for x in [-1, +1]).
 21C2:			.double	 0.0029352921857004596570518
 22
 23
 24	.const
 25	.align	4
 26
 27/*	Define some coefficients for center polynomial (used for x in [-1, +1]).
 28	These are stored in pairs at aligned addresses for use in SIMD
 29	instructions.
 30*/
 31C01:		.double	 2.2971562298314633761676433,  0.0207432003007420961489920
 32C00:		.double	 2.4449692297316409651126041,  3.7888879287802702842997915
 33C11:		.double	-2.9466967515109826289085300, -4.9721072376211623038916292
 34C10:		.double	 5.4728447324456990092824269,  6.7197076223592378022736307
 35
 36// Define coefficients for the tail polynomial (used for x outside [-1, +1]).
 37T01:	.double	 0.9193672354545696501477531, -0.0052035944094405570566862
 38T00:	.double	 0.3992772987220534996563340,  0.2521268658714555740707959
 39T11:	.double	-0.5273186542866779494437308, -0.7201738803584184183894208
 40T10:	.double	 0.1730466268612773143731748,  0.1408679409162453515360961
 41
 42
 43// Rename the general registers (just to make it easier to keep track of them).
 44#if defined __i386__
 45	#define	r0	%eax
 46	#define	r1	%ecx
 47	#define	r2	%edx
 48	#define	r3	%ebx
 49	#define	r4	%esp
 50	#define	r5	%ebp
 51	#define	r6	%esi
 52	#define	r7	%edi
 53#elif defined __x86_64__
 54	#define	r0	%rax
 55	#define	r1	%rcx
 56	#define	r2	%rdx
 57	#define	r3	%rbx
 58	#define	r4	%rsp
 59	#define	r5	%rbp
 60	#define	r6	%rsi
 61	#define	r7	%rdi
 62#else
 63	#error "Unknown architecture."
 64#endif
 65
 66
 67	.text
 68
 69
 70// Define various symbols.
 71
 72#define	BaseP		r0		// Base address for position-independent addressing.
 73
 74#define	p0			%xmm0	// Must be in %xmm0 for return on x86_64.
 75#define	x			%xmm1
 76#define	p1			%xmm2
 77#define	x1			%xmm3
 78#define	v0			%xmm4
 79#define	t0			%xmm5
 80
 81#if defined __i386__
 82
 83	// Define location of argument x.
 84	#define	Argx			4(%esp)
 85
 86	// Define how to address data.  BaseP must contain the address of label 0.
 87	#define	Address(label)	label-0b(BaseP)
 88
 89#elif defined __x86_64__
 90
 91	// Define location of argument x.
 92	#define	Argx			%xmm0
 93
 94	// Define how to address data.
 95	#define	Address(label)	label(%rip)
 96
 97#endif
 98
 99
100/*	float atanf(float x).
101
102	Notes:
103
104		Citations in parentheses below indicate the source of a requirement.
105
106		"C" stands for ISO/IEC 9899:TC2.
107
108		The Open Group specification (IEEE Std 1003.1, 2004 edition) adds no
109		requirements since it defers to C and requires errno behavior only if
110		we choose to support it by arranging for "math_errhandling &
111		MATH_ERRNO" to be non-zero, which we do not.
112
113	Return value:
114
115		For arctangent of +/- zero, return zero with same sign (C F.9 12 and
116		F.9.1.3).
117
118		For arctangent of +/- infinity, return +/- pi/2 (C F.9.1.3).
119
120		For a NaN, return the same NaN (C F.9 11 and 13).  (If the NaN is a
121		signalling NaN, we return the "same" NaN quieted.)
122
123		Otherwise:
124
125			If the rounding mode is round-to-nearest, return arctangent(x)
126			faithfully rounded.
127
128			Return a value in [-pi/2, +pi/2] (C 7.12.4.3 3).  Note that this
129			prohibits returning correctly rounded values for atanf of large
130			positive or negative arguments, since pi/2 rounded to a float lies
131			outside that interval.
132		
133			Not implemented:  In other rounding modes, return arctangent(x)
134			possibly with slightly worse error, not necessarily honoring the
135			rounding mode (Ali Sazegari narrowing C F.9 10).
136
137	Exceptions:
138
139		Raise underflow for a denormal result (C F.9 7 and Draft Standard for
140		Floating-Point Arithmetic P754 Draft 1.2.5 9.5).  If the input is the
141		smallest normal, underflow may or may not be raised.  This is stricter
142		than the older 754 standard.
143
144		May or may not raise inexact, even if the result is exact (C F.9 8).
145
146		Raise invalid if the input is a signalling NaN (C 5.2.4.2.2 3, in spite
147		of C 4.2.1)  but not if the input is a quiet NaN (C F.9 11).
148
149		May not raise exceptions otherwise (C F.9 9).
150
151	Properties:
152
153		Monotonic.
154
155		Exhaustive testing proved this routine returns faithfully rounded
156		results.
157*/
158	.align	5
159#if !defined DevelopmentInstrumentation
160	// This is the regular name used in the deployed implementation.
161		.globl _atanf
162	_atanf:
163#else
164	// This is the name used for a special test version of the routine.
165		.globl _atanfInstrumented
166	_atanfInstrumented:
167#endif
168
169	cvtss2sd	Argx, x				// Convert x to double precision.
170
171	#if defined __i386__
172
173		// Get address of 0 in BaseP.
174			call	0f				// Push program counter onto stack.
175		0:
176			pop		BaseP			// Get program counter.
177
178	#endif
179
180/*	We use different algorithms for different parts of the domain.  There
181	is a "negative tail" from -infinity to nPoint0, a center from nPoint0 to
182	pPoint0, and a "positive tail" from pPoint0 to +infinity.  Here, we compare
183	and branch to the appropriate code.
184
185	NaNs are weeded out in PositiveTail or NegativeTail.
186*/
187
188	ucomisd		Address(pPoint0), x
189	ja			PositiveTail
190
191	ucomisd		Address(nPoint0), x
192	jb			NegativeTail
193
194
195/*	Here we have nPoint0 <= x <= pPoint0.  This is handled with a simple
196	evaluation of a polynomial that approximates arctangent.
197
198	The polynomial has been arranged into the form:
199
200		x * c2
201			* ((x**4 + c01 * x**2 + c00))
202			* ((x**4 + c11 * x**2 + c10))
203			* ((x**4 + c21 * x**2 + c20))
204			* ((x**4 + c31 * x**2 + c30))
205
206	The coefficients are stored in pairs, with c01 and c21 at C01, c00 and c20
207	at C00, c11 and c31 at C11, and c10 and c30 at C10.  c2 is at C2.
208
209	The quartic factors are evaluated in SIMD registers.  For brevity, some
210	comments below describe only one element of a register.  The other is
211	analagous.
212*/
213	movlhps		x, x1				// Save a copy of x for later.
214	mulsd		x, x				// Form x**2.
215#define	x2	x	// Define name describing current register contents.
216	movlhps		x2, x2				// Duplicate x**2.
217	movapd		Address(C11), p1
218	addpd		x2, p1				// Form x**2 + c11.
219	mulpd		x2, p1				// Form x**4 + c11 * x**2.
220	addpd		Address(C10), p1	// Form x**4 + c11 * x**2 + c10.
221	movapd		Address(C01), p0	// Get first coefficients.
222	movlpd		Address(C2), x1		// Put c2 in one element, with x in other.
223	addpd		x2, p0				// Form x**2 + c01.
224	mulpd		x2, p0				// Form x**4 + c01 * x**2.
225	addpd		Address(C00), p0	// Form x**4 + c01 * x**2 + c00.
226	mulpd		p1, p0				// Combine factors.
227	mulpd		x1, p0				// Multiply by x and c2.
228	movhlps		p0, p1				// Get high element.
229	mulsd		p1, p0				// Finish combining factors.
230#undef x2
231
232// Return the double-precision number currently in p0.
233ReturnDoubleInp0:
234	cvtsd2ss	p0, p0				// Convert result to single precision.
235
236// Return the single-precision number currently in p0.
237ReturnSingleInp0:
238
239	#if defined __i386__
240		movss		p0, Argx		// Shuttle result through memory.
241			// This uses the argument area for scratch space, which is allowed.
242		flds		Argx			// Return input on floating-point stack.
243	#else
244		// On x86_64, the return value is now in p0, which is %xmm0.
245	#endif
246
247	ret
248
249
250// Handle pPoint0 < x.
251PositiveTail:
252
253	ucomisd		Address(pPoint1), x		// Compare x to algorithm change point.
254	jae			InputIsPositiveSpecial
255
256	movsd		Address(pHalfPi), p0	// Prepare +pi/2.
257
258CommonTail:
259
260/*	Here we have pPoint0 < x < pPoint1.  The algorithm here is inspired by the
261	identity (for positive x) arctangent(x) = +pi/2 - arctangent(1/x).  This
262	is approximated by evaluating:
263
264		+pi/2 -
265			* ((x**4 + t01 * x**2 + t00))
266			* ((x**4 + t11 * x**2 + t10))
267			* ((x**4 + t21 * x**2 + t20))
268			* ((x**4 + t31 * x**2 + t30))
269			* (1/x) * (1/x)**16.
270
271	The quartic factors are evaluated in SIMD registers.  For brevity, some
272	comments below describe only one element of a register.  The other is
273	analagous.
274*/
275
276	movsd		Address(One), v0	// Get constant.
277	divsd		x, v0				// Form 1/x, which we will call r.
278		/*	I tried rcpss, but using it requires a precision conversion, two
279			adds, and three multiplies.  divsd performance is not bad on Core2, 
280			and I was not able to get the rcpss version to run as fast.
281		*/
282	mulsd		x, x				// Form x**2.
283#define	x2	x	// Define name describing current register contents.
284	movlhps		v0, v0				// Copy finished reciprocal to high element.
285	movlhps		x2, x2				// Duplicate x**2.
286	mulsd		v0, v0				// Form r**2.
287	mulsd		v0, v0				// Form r**4.
288	mulsd		v0, v0				// Form r**8.
289	movapd		Address(T11), p1	// Get first coefficients.
290	mulsd		v0, v0				// Form r**16.
291
292	movapd		Address(T01), t0	// Get first coefficients.
293	addpd		x2, p1				// Form x**2 + t11.
294	mulpd		x2, p1				// Form x**4 + t11 * x**2.
295	addpd		Address(T10), p1	// Form x**4 + t11 * x**2 + t10.
296	addpd		x2, t0				// Form x**2 + t01.
297	mulpd		x2, t0				// Form x**4 + t01 * x**2.
298	addpd		Address(T00), t0	// Form x**4 + t01 * x**2 + t00.
299	mulpd		p1, t0				// Combine factors.
300	mulpd		v0, t0				// Multiply by r**16 and r.
301	movhlps		t0, p1				// Get high element.
302	mulsd		p1, t0				// Finish combining factors.
303	subsd		t0, p0				// Subtract from pi/2.
304#undef x2
305
306	jmp			ReturnDoubleInp0
307
308
309// Handle x < nPoint0.
310NegativeTail:
311
312	ucomisd		Address(nPoint1), x		// Compare x to algorithm change point.
313	jbe			InputIsNegativeSpecial
314
315	movsd		Address(nHalfPi), p0	// Prepare -pi/2.
316
317	jmp			CommonTail
318
319
320	.literal8
321/*	Define values near +/- pi/2 that yield +/- pi/2 rounded toward zero when
322	converted to single precision.  This allows us to generate inexact and
323	return the desired values for atanf of big positive and negative numbers.
324*/
325AlmostpHalfPi:	.double	+1.5707962
326AlmostnHalfPi:	.double	-1.5707962
327
328
329	.text
330
331/*	Here we handle inputs greater or equal to pPoint1, including infinity,
332	but not including NaNs.
333*/
334InputIsPositiveSpecial:
335
336	/*	The input is a big positive number.  Return +pi/2 rounded toward zero
337		and generate an inexact exception.
338	*/
339	cvtsd2ss	Address(AlmostpHalfPi), p0
340	jmp			ReturnSingleInp0
341
342
343/*	Here we handle inputs less than or equal to -1, including -infinity and
344	NaNs.  The condition code must be set as if by using ucomisd to compare the
345	input in the "source operand" to nPoint1 in the "destination operand".
346*/
347InputIsNegativeSpecial:
348
349	jp			InputIsNaN
350
351	/*	The input is a big negative number.  Return -pi/2 rounded toward zero
352		and generate an inexact exception.
353	*/
354	cvtsd2ss	Address(AlmostnHalfPi), p0
355	jmp			ReturnSingleInp0
356
357
358InputIsNaN:
359
360	#if defined __i386__
361		flds		Argx			// Return result on floating-point stack.
362	#else
363		cvtsd2ss	x, %xmm0		// Return in %xmm0.
364			/*	We cannot just return the input, because we must quiet a
365				signalling NaN.
366			*/
367	#endif
368
369	ret