src/libm/Source/Intel/atan2f.S at fixPythonPipStalling

overby.me / darling-nix
fork atom
this repo has no description
fork atom
darling-nix / src / libm / Source / Intel / atan2f.S
at fixPythonPipStalling 433 lines 12 kB view raw
wrap content
Lubos Dolezel More progress 9y ago
a76db625
  1/*	atan2f.s -- atan2f for standard math library.
  2
  3	Written by Eric Postpischil, July 2007.
  4*/
  5
  6
  7	.literal8
  8
  9// Define miscellaneous constants.
 10
 11Threshold:	.double	2.384185791015625e-07		// 2**-22.
 12nZero:		.double	-0
 13
 14pPi1v4:		.double	+.7853981633974483096156608	// 1/4 pi.
 15nPi1v4:		.double	-.7853981633974483096156608
 16pPi1v2:		.double	+1.570796326794896619231322	// 1/2 pi.
 17nPi1v2:		.double	-1.570796326794896619231322
 18pPi3v4:		.double	+2.356194490192344928846982	// 3/4 pi.
 19nPi3v4:		.double	-2.356194490192344928846982
 20
 21/*	Define values near +/- pi that yield +/- pi rounded toward zero when
 22	converted to single precision.  This allows us to generate inexact and
 23	return the desired values for atan2f on and near the negative side of the x
 24	axis.
 25*/
 26AlmostpPi:	.double	+3.1415924
 27
 28// Define a coefficient for center polynomial (used for x in [-1, +1]).
 29C2:			.double	 0.0029352921857004596570518
 30
 31
 32	.const
 33	.align	4
 34
 35/*	Define some coefficients for center polynomial (used for x in [-1, +1]).
 36	These are stored in pairs at aligned addresses for use in SIMD
 37	instructions.
 38*/
 39C01:		.double	 2.2971562298314633761676433,  0.0207432003007420961489920
 40C00:		.double	 2.4449692297316409651126041,  3.7888879287802702842997915
 41C11:		.double	-2.9466967515109826289085300, -4.9721072376211623038916292
 42C10:		.double	 5.4728447324456990092824269,  6.7197076223592378022736307
 43
 44// This needs to be 16-byte aligned because it is used in an orpd instruction.
 45	.align	4
 46pPi:		.double	+3.141592653589793238462643	// pi.
 47
 48
 49// Rename the general registers (just to make it easier to keep track of them).
 50#if defined __i386__
 51	#define	r0	%eax
 52	#define	r1	%ecx
 53	#define	r2	%edx
 54	#define	r3	%ebx
 55	#define	r4	%esp
 56	#define	r5	%ebp
 57	#define	r6	%esi
 58	#define	r7	%edi
 59#elif defined __x86_64__
 60	#define	r0	%rax
 61	#define	r1	%rcx
 62	#define	r2	%rdx
 63	#define	r3	%rbx
 64	#define	r4	%rsp
 65	#define	r5	%rbp
 66	#define	r6	%rsi
 67	#define	r7	%rdi
 68#else
 69	#error "Unknown architecture."
 70#endif
 71
 72
 73	.text
 74
 75
 76// Define various symbols.
 77
 78#define	BaseP		r0		// Base address for position-independent addressing.
 79
 80#define	y			%xmm0	// Must be in %xmm0 for return on x86_64.
 81#define	x			%xmm1
 82#define	p0			%xmm2
 83#define	x1			%xmm3
 84#define	t0			%xmm4
 85#define	Base		%xmm5
 86#define	p1			%xmm6
 87
 88#if defined __i386__
 89
 90	// Define locations of arguments.
 91	#define	Argy			4(%esp)
 92	#define	Argx			8(%esp)
 93
 94	// Define how to address data.  BaseP must contain the address of label 0.
 95	#define	Address(label)	label-0b(BaseP)
 96
 97#elif defined __x86_64__
 98
 99	// Define locations of arguments.
100	#define	Argx			%xmm1
101	#define	Argy			%xmm0
102
103	// Define how to address data.
104	#define	Address(label)	label(%rip)
105
106#endif
107
108
109/*	float atan2f(float x).
110
111	Notes:
112
113		This routine has not been proven to be correct.  See the notes in the
114		accompanying C version regarding potential proof.  The polynomial it
115		uses was proven to provide faithfully rounded results in atanf.  atan2f
116		introduces additional error performing the division and additional
117		points used in the domain of the polynomial.
118
119		Citations in parentheses below indicate the source of a requirement.
120
121		"C" stands for ISO/IEC 9899:TC2.
122
123		The Open Group specification (IEEE Std 1003.1, 2004 edition) adds no
124		requirements since it defers to C and requires errno behavior only if
125		we choose to support it by arranging for "math_errhandling &
126		MATH_ERRNO" to be non-zero, which we do not.
127
128	Return value for atan2f(y, x) (C F.9.1 12 and C F.9.1.4):
129
130		y			x			atan2f(y, x)
131
132		-infinity	-infinity	-3*pi/4.
133					< 0			-2*pi/4.
134					-0			-2*pi/4.
135					+0			-2*pi/4.
136					> 0			-2*pi/4.
137					+infinity	-1*pi/4.
138
139		< 0			-infinity	-4*pi/4.
140					< 0			arctangent(y/x) in [-4*pi/4, -2*pi/4].
141					-0			-2*pi/4.
142					+0			-2*pi/4.
143					> 0			arctangent(y/x) in [-2*pi/4, -0*pi/4].
144					+infinity	-0*pi/4.
145
146		-0			-infinity	-4*pi/4.
147					< 0			-4*pi/4.
148					-0			-4*pi/4.
149					+0			-0*pi/4.
150					> 0			-0*pi/4.
151					+infinity	-0*pi/4.
152
153		+0			-infinity	+4*pi/4.
154					< 0			+4*pi/4.
155					-0			+4*pi/4.
156					+0			+0*pi/4.
157					> 0			+0*pi/4.
158					+infinity	+0*pi/4.
159
160		> 0			-infinity	+4*pi/4.
161					< 0			arctangent(y/x) in [+2*pi/4, +4*pi/4].
162					-0			+2*pi/4.
163					+0			+2*pi/4.
164					> 0			arctangent(y/x) in [+0*pi/4, +2*pi/4].
165					+infinity	+0*pi/4.
166
167		+infinity	-infinity	+3*pi/4.
168					< 0			+2*pi/4.
169					-0			+2*pi/4.
170					+0			+2*pi/4.
171					> 0			+2*pi/4.
172					+infinity	+1*pi/4.
173
174		If either input is a NaN, return one of the NaNs in the input.  (C F.9
175		11 and 13).  (If the NaN is a signalling NaN, we return the "same" NaN
176		quieted.)
177
178		Otherwise:
179
180			If the rounding mode is round-to-nearest, return arctangent(x)
181			faithfully rounded.
182
183			Return a value in [-pi, +pi] (C 7.12.4.4 3).  Note that this
184			prohibits returning correctly rounded values for -pi and +pi, since
185			pi rounded to a float lies outside that interval.
186		
187			Not implemented:  In other rounding modes, return arctangent(x)
188			possibly with slightly worse error, not necessarily honoring the
189			rounding mode (Ali Sazegari narrowing C F.9 10).
190
191	Exceptions:
192
193		Raise underflow for a denormal result (C F.9 7 and Draft Standard for
194		Floating-Point Arithmetic P754 Draft 1.2.5 9.5).  If the input is the
195		smallest normal, underflow may or may not be raised.  This is stricter
196		than the older 754 standard.
197
198		May or may not raise inexact, even if the result is exact (C F.9 8).
199
200		Raise invalid if the input is a signalling NaN (C 5.2.4.2.2 3, in spite
201		of C 4.2.1)  but not if the input is a quiet NaN (C F.9 11).
202
203		May not raise exceptions otherwise (C F.9 9).
204
205	Properties:
206
207		We desire this routine to be monotonic, but that has not
208		been proven.  (For atan2f, monotonicity would mean, if (x0, y0) and
209		(x1, y1) are in the same quadrant, then y0/x0 <= y1/x1 implies
210		atan2f(y0, x0) <= atan2f(y1, x1).)
211*/
212	.align	5
213	.globl _atan2f
214_atan2f:
215
216	cvtss2sd	Argy, y					// Convert to double precision.
217	cvtss2sd	Argx, x
218
219	#if defined __i386__
220
221		// Get address of 0 in BaseP.
222			call	0f					// Push program counter onto stack.
223		0:
224			pop		BaseP				// Get program counter.
225
226	#endif
227
228#define	nx	t0
229	movsd		Address(nZero), nx
230	xorpd		x, nx					// Negate x.
231
232	ucomisd		x, y
233	jae			yGEx					// If we jump, y >= x.
234	je			Unordered				// If we jump, an operand is a NaN.
235										// If we fall through, y < x.
236
237	ucomisd		y, nx
238	jae			yLTx_and_nxGEy			// If we jump, y < x and -x >= y.
239										// If we fall through, -x < y < x.
240
241	// Return atand(y/x).
242	divsd		x, y					// Form y/x.
243	movsd		y, x					// Move to register used by Polynomial.
244	movsd		Address(nZero), Base	// Set Base to -0.
245		// This makes the return value -0 if y is -0 and x > 0.
246	jmp			Polynomial				// Return 0 + arctangent(y/x).
247
248
249yLTx_and_nxGEy:							// Here y < x && y <= -x.
250	je			yLTx_and_nxEQy			// If we jump, y < x && -x == y.
251										// If we fall through, y < x < -y.
252
253	// Return -pi/2 - atand(x/y).
254	movsd		Address(nZero), t0		// Get -0 for sign bit.
255	divsd		y, x					// Form x/y.
256	xorpd		t0, x					// Form -x/y.
257	movsd		Address(nPi1v2), Base	// Set Base to -pi/2.
258	jmp			Polynomial				// Return -pi/2 + arctangent(-x/y).
259
260
261yLTx_and_nxEQy: 						// Here -x == y < x.
262	// Return -1/4*pi with inexact exception.
263	cvtsd2ss	Address(nPi1v4), y
264	jmp			ReturnSingle
265
266
267yGEx:									// Here y >= x.
268	je			yEQx					// If we jump, y == x.
269										// If we fall through, y > x.
270
271	ucomisd		y, nx
272#undef	nx
273	jae			yGTx_and_nxGEy			// If we jump, y > x && -x >= y.
274										// If we fall through, -y < x < y.
275
276	// Return +pi/2 - atand(x/y).
277	movsd		Address(nZero), t0		// Get -0 for sign bit.
278	divsd		y, x					// Form x/y.
279	movsd		Address(pPi1v2), Base	// Set Base to +pi/2.
280	xorpd		t0, x					// Form -x/y.
281	jmp			Polynomial				// Return +pi/2 + arctangent(-x/y).
282
283
284yGTx_and_nxGEy:							// Here y > x && -x >= y.
285	je			yGTx_and_nxEQy			// If we jump, y > x && -x == y.
286										// If we fall through, x < y < -x.
287
288	movsd		Address(nZero), Base	// Get mask for sign bit.
289	movapd		Base, t0				// Copy mask.
290	andpd		y, Base					// Extract sign bit of y.
291	divsd		x, y					// Form y/x.
292	andnpd		y, t0					// Take absolute value of quotient.
293	comisd		Address(Threshold), t0	// Is quotient small?
294	jbe			NearNegativeXAxis
295	orpd		Address(pPi), Base		// Set Base to pi with y's sign.
296	movsd		y, x					// Move to register used by Polynomial.
297	jmp			Polynomial				// Return +/- pi + arctangent(y/x).
298
299
300NearNegativeXAxis:
301	// Return -pi or +pi, matching the sign of y, rounded toward zero.
302	movsd		Address(AlmostpPi), p0
303	xorpd		Base, p0
304	jmp			ReturnDouble
305
306
307yGTx_and_nxEQy:							// Here x < y == -x.
308	// Return +3/4*pi with inexact exception.
309	cvtsd2ss	Address(pPi3v4), y
310	jmp			ReturnSingle
311
312
313yEQx:									// Here y == x.
314
315	ucomisd		Address(nZero), y
316	jae			yEQx_and_yGE0			// If we jump, y == x && y >= 0.
317										// If we fall through, x == y < -x.
318
319	// Return -3/4*pi with inexact exception.
320	cvtsd2ss	Address(nPi3v4), y
321	jmp			ReturnSingle
322
323
324yEQx_and_yGE0:							// Here y == x && y >= 0.
325	je			yEQx_and_yEQ0			// If we jump, y == x && y == 0.
326										// If we fall through, -x < y == x.
327
328	// Return +1/4*pi with inexact exception.
329	cvtsd2ss	Address(pPi1v4), y
330	jmp			ReturnSingle
331
332
333yEQx_and_yEQ0:							// Here y == x == 0.
334
335	/*	Return:
336			x	y	atan2f(y, x)
337			-0	-0	-pi, with inexact exception.
338			-0	+0	+pi, with inexact exception.
339			+0	-0	-0.
340			+0	+0	+0.
341	*/
342
343	/*	We want to know if x is -0 or +0, but there is no direct test for that
344		that puts the results in a vector register.  We do an arithmetic right
345		shift to fill up the exponent bits with copies of the sign bit.  This
346		produces a NaN or +0.  Then we test for "unordered", which yields all
347		one bits if x was -0 and all zero bits if x was +0.
348	*/
349	psraw		$12, x
350	cmpunordsd	x, x
351
352	// Form (almost) pi if x was -0 and 0 if x was +0.
353	movsd		Address(AlmostpPi), p0
354	andpd		x, p0
355
356	orpd		y, p0					// Apply the sign bit from y.
357	jmp			ReturnDouble
358
359
360Unordered:								// Here x or y is a NaN.
361	addsd		x, y					// Return one of the NaNs, quieted.
362	cvtsd2ss	y, y
363	jmp			ReturnSingle
364
365
366/*	This is the principal arctangent evaluation.  Previous code has prepared
367	the Base and y registers, and we need to calculate Base + arctangent(y).
368	The result is then converted to a single-precision number and returned.
369
370	-1 <= y <= +1.  (Actually, -1 < y < +1.  The equalities were sidetracked
371	during all the branching above, and division of two different
372	single-precision numbers converted to double-precision never rounds to
373	one.)
374
375	There are some slight inefficiencies here, in that special cases could omit
376	a few instructions -- sometimes the base is zero or y had to be negated to
377	fit this common code.  So, if speed is all important, this routine might be
378	speeded up a little by replicating this code.
379*/
380Polynomial:
381
382/*	The polynomial that approximates arctangent has been arranged into the
383 	form:
384
385		x * c2
386			* ((x**4 + c01 * x**2 + c00))
387			* ((x**4 + c11 * x**2 + c10))
388			* ((x**4 + c21 * x**2 + c20))
389			* ((x**4 + c31 * x**2 + c30))
390
391	The coefficients are stored in pairs, with c01 and c21 at C01, c00 and c20
392	at C00, c11 and c31 at C11, and c10 and c30 at C10.  c2 is at C2.
393
394	The quartic factors are evaluated in SIMD registers.  For brevity, some
395	comments below describe only one element of a register.  The other is
396	analagous.
397*/
398	movsd		x, x1				// Save a copy of x for later.
399	movapd		Address(C11), p1
400	mulsd		x, x				// Form x**2.
401#define	x2	x	// Define name describing current register contents.
402	movlhps		x2, x2				// Duplicate x**2.
403	addpd		x2, p1				// Form x**2 + c11.
404	mulpd		x2, p1				// Form x**4 + c11 * x**2.
405	addpd		Address(C10), p1	// Form x**4 + c11 * x**2 + c10.
406	movapd		Address(C01), p0	// Get first coefficients.
407	addpd		x2, p0				// Form x**2 + c01.
408	mulpd		x2, p0				// Form x**4 + c01 * x**2.
409	movhpd		Address(C2), x1		// Put c2 in one element, with x in other.
410	addpd		Address(C00), p0	// Form x**4 + c01 * x**2 + c00.
411	mulpd		p1, p0				// Combine factors.
412	mulpd		x1, p0				// Multiply by x and c2.
413	movhlps		p0, p1				// Get high element.
414	mulsd		p1, p0				// Finish combining factors.
415#undef x2
416	addsd		Base, p0
417
418// Return the double-precision number currently in p0.
419ReturnDouble:
420	cvtsd2ss	p0, y				// Convert result to single precision.
421
422// Return the single-precision number currently in p0.
423ReturnSingle:
424
425	#if defined __i386__
426		movss		y, Argx			// Shuttle result through memory.
427			// This uses the argument area for scratch space, which is allowed.
428		flds		Argx			// Return input on floating-point stack.
429	#else
430		// On x86_64, the return value is now in y, which is %xmm0.
431	#endif
432
433	ret