src/libm/Source/Intel/tanf.S at fixPythonPipStalling

overby.me / darling-nix
fork atom
this repo has no description
fork atom
darling-nix / src / libm / Source / Intel / tanf.S
at fixPythonPipStalling 351 lines 10 kB view raw
wrap content
Lubos Dolezel More progress 9y ago
a76db625
  1/*	tanf.s -- tanf for standard math library.
  2
  3	Written by Eric Postpischil, June 2007.
  4*/
  5
  6
  7#define	SignificandBits	24
  8#define	ExponentBits	 8
  9
 10	.literal8
 11
 12	.align	3
 13
 14NegativeOne:
 15	.double	-1
 16
 17
 18	.const
 19
 20	.align	4
 21
 22/*	Coefficients to calculate:
 23
 24		x * (x**4 + c02*x**2 + c00) * (x**4 + c12*x**2 + c10)
 25		  * (x**4 + c22*x**2 + c20) * (       c32*x**2 + c30)
 26
 27	c02 and c12 are stored as a pair at the location below labeled c02,
 28	c00 and c10 are at c00, c22 and c23 at c22, and c20 and c30 at c20.
 29
 30	This polynomial was designed to cover two binades (supporting an
 31	exponent fold of one bit) for tanf with its period (pi) partitioned into
 32	two intervals.  The expanded polynomial was factored to create the form
 33	above convenient for calculating with SIMD.
 34
 35	This was minimized for ULP error.
 36*/
 37c02:	.double	-0.34431297545117363561, -1.53503935362160698123
 38c00:	.double	+0.75337415977492632831, +0.92783629599735333058
 39c22:	.double	+0.91301179133375347365, +5.27860261872543679865
 40c20:	.double	+0.58807104225499649417, +3.82127246397188494330
 41
 42
 43/*	ExponentFold specifies how many bits of the exponent will not be used  to
 44	look up entries in the table.  This must match the ExponentFold used to
 45	generate the table.
 46
 47	The behavior for infinite inputs relies on the table contents -- if the
 48	table entries looked up for an infinite input are m0 and m1, then
 49	m0*infinity + m1*infinity should generate a NaN.  This happens of either m0
 50	or m1 is a zero or they have opposite signs.  If this were not true,
 51	additional arrangements would have to be made for infinite inputs.
 52*/
 53#define	ExponentFold	1
 54
 55// Include the reduction table.  See tanfMakeTable.c for more information.
 56#include "tanfTable.s"
 57
 58
 59// Rename the general registers (just to make it easier to keep track of them).
 60#if defined __i386__
 61	#define	r0	%eax
 62	#define	r1	%ecx
 63	#define	r2	%edx
 64	#define	r3	%ebx
 65	#define	r4	%esp
 66	#define	r5	%ebp
 67	#define	r6	%esi
 68	#define	r7	%edi
 69#elif defined __x86_64__
 70	#define	r0	%rax
 71	#define	r1	%rcx
 72	#define	r2	%rdx
 73	#define	r3	%rbx
 74	#define	r4	%rsp
 75	#define	r5	%rbp
 76	#define	r6	%rsi
 77	#define	r7	%rdi
 78#else
 79	#error "Unknown architecture."
 80#endif
 81
 82
 83	.text
 84
 85
 86// Define symbols for tanf.
 87#define	BaseP		r0		// Base address for position-independent addressing.
 88#define	xInteger	r1		// The input x in an integer register.
 89#define	ki			r1		// x*m0 rounded to an integer.
 90
 91#define	p0			%xmm0	// Must be in %xmm0 for return on x86_64.
 92#define	p2			%xmm1
 93#define	x			%xmm2
 94#define	k			%xmm3
 95#define	m1			%xmm4
 96#define	x1 			%xmm5
 97#define	Numerator	%xmm6
 98
 99#if defined __i386__
100
101	// Define location of argument x.
102	#define	Argx			4(%esp)
103
104	// Define how to address data.  BaseP must contain the address of label 0.
105	#define	Address(label)	label-0b(BaseP)
106
107#elif defined __x86_64__
108
109	// Define location of argument x.
110	#define	Argx			%xmm0
111
112	// Define how to address data.
113	#define	Address(label)	label(%rip)
114
115#endif
116
117
118/*	float tanf(float x).
119
120	Notes:
121
122		Citations in parentheses below indicate the source of a requirement.
123
124		"C" stands for ISO/IEC 9899:TC2.
125
126		The Open Group specification (IEEE Std 1003.1, 2004 edition) adds no
127		requirements since it defers to C and requires errno behavior only if
128		we choose to support it by arranging for "math_errhandling &
129		MATH_ERRNO" to be non-zero, which we do not.
130
131	Return value:
132
133		For +/- zero, return zero with same sign (C F.9 12 and F.9.1.7).
134
135		For +/- infinity, return a NaN (C F.9.1.7).
136
137		For a NaN, return the same NaN (C F.9 11 and 13).
138
139		Otherwise:
140
141			If the rounding mode is round-to-nearest, return tangent(x)
142			faithfully rounded.
143		  
144			Not currently implemented:  In other rounding modes, return
145			tangent(x) possibly with slightly worse error, not necessarily
146			honoring the rounding mode (Ali Sazegari narrowing C F.9 10).
147
148	Exceptions:
149
150		Raise underflow for a denormal result (C F.9 7 and Draft Standard for
151		Floating-Point Arithmetic P754 Draft 1.2.5 9.5).  If the input is the
152		smallest normal, underflow may or may not be raised.  This is stricter
153		than the older 754 standard.
154
155		May or may not raise inexact, even if the result is exact (C F.9 8).
156
157		Raise invalid if the input is a signalling NaN (C 5.2.4.2.2 3, in spite
158		of C F.2.1) or an infinity (C F.9.1.7) but not if the input is a quiet
159		NaN (C F.9 11).
160
161		May not raise exceptions otherwise (C F.9 9).
162
163	Properties:
164
165		Desired to be monotonic.  Not yet proven!
166
167		Exhaustive testing proved this routine returns faithfully rounded
168		results.
169*/
170	.align	5
171	.globl _tanf
172_tanf:
173
174	// Put x into an integer register.
175	#if defined __i386__
176		mov		Argx, xInteger		// Put x into xInteger.
177	#elif defined __x86_64__
178		movd	Argx, xInteger		// Put x into xInteger.
179	#endif
180
181	cvtss2sd	Argx, x				// Convert x to double precision.
182
183	#if defined __i386__
184
185		// Get address of 0 in BaseP.
186			call	0f				// Push program counter onto stack.
187		0:
188			pop		BaseP			// Get program counter.
189
190	#elif defined __x86_64__
191
192		// Get address of reduction table in BaseP.
193			lea		ReductionTable(%rip), BaseP
194
195	#endif
196
197	movsd		Address(NegativeOne), Numerator
198
199	unpcklpd	x, x				// Duplicate x.
200
201	/*	Shift:
202
203			SignificandBits-1 bits right to remove significand.
204
205			ExponentFold bits right to remove unused bits of exponent.
206
207			4 bits left to multiply by size of a table entry.
208	*/
209	shr		$SignificandBits-1+ExponentFold-4, xInteger
210
211	// Clear bits other than those we are using from exponent.
212	and		$(1<<ExponentBits-ExponentFold)-1 << 4, xInteger
213	je		ExponentIsZero			// Branch so we can get -0 right.
214
215	ucomisd		x, x				// Test for Nan.
216	jpe			HandleNaN
217
218	// Get table entry containing m0 and m1 and calculate x*m0 and x*m1.
219	#if defined __i386__
220		mulpd	ReductionTable-0b(BaseP, xInteger), x
221	#elif defined __x86_64__
222		mulpd	(BaseP, xInteger), x
223	#endif
224		/*	Let the exponent of x be E, that is, E is the largest integer such
225			that 2**E <= x.  We use e to index the table, where e is E with the
226			low ExponentFold bits set to zero.  For example, if ExponentFold is
227			1, e = E & ~1.
228
229			The table entry indexed by e represents a number m that is (u/p %
230			1)/u*2**a, where u is the ULP of 2**e, p is the period (pi), and
231			2**a is the number of intervals we divide the period into (we use
232			two, so a is one).
233
234			Observe that when 2**e <= x, x/u is an integer, and x/u <
235			2**(24+ExponentFold).
236
237			Then for some integer k, x*m = x*(u/p % 1)/u*2**a = x*(u/p -
238			k)/u*2**a = x/p*2**a - x/u*k*2**a.  x/u*k*2**a is an integer, so
239			x*m is congruent to x/p*2**a modulo 1.
240
241			So tangent(x*m*p*2**-a) equals tangent(x).
242
243			However, we do not have m exactly.  The table contains two numbers
244			for 2**e, m0 and m1.  m0 has the first 29 bits of m, and m1 has the
245			next 53 bits.
246
247			Since x has 23 bits and m0 has 29 bits, their multiplication in
248			double format is exact.  From that product, we subtract the nearest
249			integer, which changes nothing modulo 1 but reduces the magnitude.
250			Then we add x*m1, yielding a precise but inexact approximation of
251			x*m0 % 1 + x*m1.
252
253			The magnitude of x*m0 % 1 is at most 1/2, and x*m1 may make the sum
254			slightly larger.
255
256			Recall that m is (u/p % 1)/u*2**a, so m's magnitude is at most
257			.5/u*2**a.  m0 contains 29 bits of this, so m1's magnitude is at
258			most 2**-29*.5/u*2**a.  With an exponent fold of 1 bit, x/u is less
259			than 2**25, so |x*m1| <= |x*2**-29*.5/u*2**a| <
260			2**25**2**-29*.5*2**a = 2**(a-5).
261
262			Therefore, |x*m0 % 1 + x*m1| < .5 + 2**(a-5).  The polynomial that
263			approximates tangent(p*x*2**-a) has been engineered to be good in
264			this interval.
265		*/
266
267	movapd		Address(c02), p0	// Get c02 and c12.
268	movapd		Address(c22), p2	// Get c22 and c32.
269
270	cvtsd2si	x, ki				// Round to integer.
271		// This requires round-to-nearest mode in the MXCSR.
272	cvtsi2sd	ki, k				// Convert to floating-point.
273	movhlps		x, m1				// Get x*m1.
274	subsd		k, x				// Subtract integer, leaving fraction.
275	addsd		m1, x				// Get x*m0%1 + x*m1.
276
277/*	Below this point, references in the comments to "x" refer to the
278	result of the reduction, not to the original input argument.
279*/
280	unpcklpd	x, x				// Duplicate x for SIMD work.
281	test		$1, ki				// Is 2**0 bit on?
282	movsd		x, x1				// Save a copy of x for later.
283	mulpd		x, x				// Get x**2.
284#define	x2	x	// Use the name x2 to emphasize we have x**2.
285	addpd		x2, p0				// Get x**2+c02 and x**2+c12.
286	mulpd		x2, p0				// Get x**4+c02*x**2 and x**4+c12*x**2.
287	addpd		Address(c00), p0	// Finish first pair of terms.
288	addsd		x2, p2				// Get x**2+c22 and c32.
289	mulpd		x2, p2				// Get x**4+c22*x**2 and c32*x**2.
290	addpd		Address(c20), p2	// Finish second pair of terms.
291	mulpd		p2, p0				// Multiply parallel pairs.
292	mulsd		x1, p0				// Multiply isolated x term.
293	movhlps		p0, p2				// Get high factor.
294	mulsd		p2, p0				// Combine parts from SIMD work.
295	jne			2f					// Branch if bit was on.
296
297// ki was even, tangent is in p0.
298
299	cvtsd2ss	p0, p0				// Convert result to single precision.
300	#if defined __i386__
301		movss		p0, Argx		// Shuttle result through memory.
302			// This uses the argument area for scratch space, which is allowed.
303		flds		Argx			// Return result on floating-point stack.
304	#else
305		// On x86_64, the return value is now in p0, which is %xmm0.
306	#endif
307
308	ret
309
310// ki was odd, tangent is -1/p0.
3112:
312	divsd		p0, Numerator
313	cvtsd2ss	Numerator, p0		// Convert result to single precision.
314
315ReturnSingle:
316	#if defined __i386__
317		movss		p0, Argx		// Shuttle result through memory.
318			// This uses the argument area for scratch space, which is allowed.
319		flds		Argx			// Return result on floating-point stack.
320	#else
321		// On x86_64, the return value is now in p0, which is %xmm0.
322	#endif
323
324	ret
325
326
327// Handle a NaN input.
328HandleNaN:
329
330	#if defined __i386__
331		flds		Argx			// Return result on floating-point stack.
332	#else
333		cvtsd2ss	x, %xmm0		// Return in %xmm0.
334			/*	We cannot just return the input, because we must quiet a
335				signalling NaN.
336			*/
337	#endif
338
339	ret
340 
341
342/*	The raw exponent field is zero, so the input is a zero or a denormal.  We
343	need to handle this separately to get -0 right, and it is faster.
344*/
345ExponentIsZero:
346	.literal8
347Finagle:	.double	1.00000001
348	.text
349	mulsd		Address(Finagle), x	// Make non-zero results inexact.
350	cvtsd2ss	x, p0				// Return x.
351	jmp			ReturnSingle