src/libm/Source/Intel/sinfcosf.S at fixPythonPipStalling

overby.me / darling-nix
fork atom
this repo has no description
fork atom
darling-nix / src / libm / Source / Intel / sinfcosf.S
at fixPythonPipStalling 490 lines 15 kB view raw
wrap content
Lubos Dolezel More progress 9y ago
a76db625
  1/*	sinfcosf.s -- sinf and cosf for standard math library.
  2
  3	Written by Eric Postpischil, June 2007.
  4*/
  5
  6
  7#define	SignificandBits	24
  8#define	ExponentBits	 8
  9
 10	.literal8
 11
 12	.align	3
 13
 14Half:	.double	.5
 15
 16
 17	.const
 18
 19	.align	4
 20
 21/*	Coefficients to calculate x*(x**4+c0*x**2+c1) * c4*(x**4+c2*x**2+c3).
 22
 23	c0 and c2 are paired for loading into an XMM register, and so are c1 and c3.
 24*/
 25// Two binades, fractions of pi, minimize ULP error, refactored:
 26C02:	.double		-4.1647719829166299, -3.6493923712207774
 27C13:	.double		15.485466333375713, 2.6522034727867632
 28C4:		.double		0.076492480587904088
 29
 30/*	ExponentFold specifies how many bits of the exponent will not be used  to
 31	look up entries in the table.  This must match the ExponentFold used to
 32	generate the table.
 33
 34	The behavior for infinite inputs relies on the table contents -- if the
 35	table entries looked up for an infinite input are m0 and m1, then
 36	m0*infinity + m1*infinity should generate a NaN.  This happens of either m0
 37	or m1 is a zero or they have opposite signs.  If this were not true,
 38	additional arrangements would have to be made for infinite inputs.
 39*/
 40#define	ExponentFold	1
 41
 42// Include the reduction table.  See sinfcosfMakeTable.c for more information.
 43#include "sinfcosfTable.s"
 44
 45
 46// Rename the general registers (just to make it easier to keep track of them).
 47#if defined __i386__
 48	#define	r0	%eax
 49	#define	r1	%ecx
 50	#define	r2	%edx
 51	#define	r3	%ebx
 52	#define	r4	%esp
 53	#define	r5	%ebp
 54	#define	r6	%esi
 55	#define	r7	%edi
 56#elif defined __x86_64__
 57	#define	r0	%rax
 58	#define	r1	%rcx
 59	#define	r2	%rdx
 60	#define	r3	%rbx
 61	#define	r4	%rsp
 62	#define	r5	%rbp
 63	#define	r6	%rsi
 64	#define	r7	%rdi
 65#else
 66	#error "Unknown architecture."
 67#endif
 68
 69
 70	.text
 71
 72
 73// Define symbols common to sinf and cosf.
 74#define	BaseP		r0		// Base address for position-independent addressing.
 75#define	xInteger	r1		// The input x in an integer register.
 76
 77#define	p			%xmm0	// Must be in %xmm0 for return on x86_64.
 78#define	x			%xmm1
 79#define	ki			%xmm2
 80#define	k			%xmm3
 81#define	m1			%xmm4
 82#define	pa			m1
 83
 84#if defined __i386__
 85
 86	// Define location of argument x.
 87	#define	Argx			4(%esp)
 88
 89	// Define how to address data.  BaseP must contain the address of label 0.
 90	#define	Address(label)	label-0b(BaseP)
 91
 92#elif defined __x86_64__
 93
 94	// Define location of argument x.
 95	#define	Argx			%xmm0
 96
 97	// Define how to address data.
 98	#define	Address(label)	label(%rip)
 99
100#endif
101
102
103/*	float sinf(float x).
104
105	Notes:
106
107		Citations in parentheses below indicate the source of a requirement.
108
109		"C" stands for ISO/IEC 9899:TC2.
110
111		The Open Group specification (IEEE Std 1003.1, 2004 edition) adds no
112		requirements since it defers to C and requires errno behavior only if
113		we choose to support it by arranging for "math_errhandling &
114		MATH_ERRNO" to be non-zero, which we do not.
115
116	Return value:
117
118		For +/- zero, return zero with same sign (C F.9 12 and F.9.1.6).
119
120		For +/- infinity, return a NaN (C F.9.1.6).
121
122		For a NaN, return the same NaN (C F.9 11 and 13).
123
124		Otherwise:
125
126			If the rounding mode is round-to-nearest, return sine(x) faithfully
127			rounded.
128		  
129			Not currently implemented:  In other rounding modes, return sine(x)
130			possibly with slightly worse error, not necessarily honoring the
131			rounding mode (Ali Sazegari narrowing C F.9 10).
132
133		All results are in [-1, 1].  This is corollary to faithful rounding.
134
135	Exceptions:
136
137		Raise underflow for a denormal result (C F.9 7 and Draft Standard for
138		Floating-Point Arithmetic P754 Draft 1.2.5 9.5).  If the input is the
139		smallest normal, underflow may or may not be raised.  This is stricter
140		than the older 754 standard.
141
142		May or may not raise inexact, even if the result is exact (C F.9 8).
143
144		Raise invalid if the input is a signalling NaN (C 5.2.4.2.2 3, in spite
145		of C 4.2.1) or an infinity (C F.9.1.6) but not if the input is a quiet
146		NaN (C F.9 11).
147
148		May not raise exceptions otherwise (C F.9 9).
149
150	Properties:
151
152		Desired to be monotonic.  Not yet proven!
153
154		Exhaustive testing proved this routine returns faithfully rounded
155		results.
156*/
157	.align	5
158	.globl _sinf
159_sinf:
160
161	// Put x into an integer register.
162	#if defined __i386__
163		mov		Argx, xInteger		// Put x into xInteger.
164	#elif defined __x86_64__
165		movd	Argx, xInteger		// Put x into xInteger.
166	#endif
167
168	cvtss2sd	Argx, x				// Convert x to double precision.
169
170	#if defined __i386__
171
172		// Get address of 0 in BaseP.
173			call	0f				// Push program counter onto stack.
174		0:
175			pop		BaseP			// Get program counter.
176
177	#elif defined __x86_64__
178
179		// Get address of reduction table in BaseP.
180			lea		ReductionTable(%rip), BaseP
181
182	#endif
183
184	unpcklpd	x, x				// Duplicate x.
185
186	/*	Shift:
187
188			SignificandBits-1 bits right to remove significand.
189
190			ExponentFold bits right to remove unused bits of exponent.
191
192			4 bits left to multiply by size of a table entry.
193	*/
194	shr		$SignificandBits-1+ExponentFold-4, xInteger
195
196	// Clear bits other than those we are using from exponent.
197	and		$(1<<ExponentBits-ExponentFold)-1 << 4, xInteger
198	je		ExponentIsZero			// Branch so we can get -0 right.
199
200	ucomisd		x, x				// Test for NaN.
201	jpe			HandleNaN
202
203	// Get table entry containing m0 and m1 and calculate x*m0 and x*m1.
204	#if defined __i386__
205		mulpd	ReductionTable-0b(BaseP, xInteger), x
206	#elif defined __x86_64__
207		mulpd	(BaseP, xInteger), x
208	#endif
209		/*	Let the exponent of x be E, that is, E is the largest integer such
210			that 2**E <= x.  We use e to index the table, where e is E with the
211			low ExponentFold bits set to zero.  For example, if ExponentFold is
212			1, e = E & ~1.
213
214			The table entry indexed by e represents a number m that is (u/p %
215			1)/u*2**a, where u is the ULP of 2**e, p is the period (2*pi), and
216			2**a is the number of intervals we divide the period into (we use
217			two, so a is one).
218
219			Observe that when 2**e <= x, x/u is an integer, and x/u <
220			2**(24+ExponentFold).
221
222			Then for some integer k, x*m = x*(u/p % 1)/u*2**a = x*(u/p -
223			k)/u*2**a = x/p*2**a - x/u*k*2**a.  x/u*k*2**a is an integer, so
224			x*m is congruent to x/p*2**a modulo 1.
225
226			So sin(x*m*p*2**-a) equals sin(x).
227
228			However, we do not have m exactly.  The table contains two numbers
229			for 2**e, m0 and m1.  m0 has the first 29 bits of m, and m1 has the
230			next 53 bits.
231
232			Since x has 23 bits and m0 has 29 bits, their multiplication in
233			double format is exact.  From that product, we subtract the nearest
234			integer, which changes nothing modulo 1 but reduces the magnitude.
235			Then we add x*m1, yielding a precise but inexact approximation of
236			x*m0 % 1 + x*m1.
237
238			The magnitude of x*m0 % 1 is at most 1/2, and x*m1 may make the sum
239			slightly larger.
240
241			Recall that m is (u/p % 1)/u*2**a, so m's magnitude is at most
242			.5/u*2**a.  m0 contains 29 bits of this, so m1's magnitude is at
243			most 2**-29*.5/u*2**a.  With an exponent fold of 1 bit, x/u is less
244			than 2**25, so |x*m1| <= |x*2**-29*.5/u*2**a| <
245			2**25**2**-29*.5*2**a = 2**(a-5).
246
247			Therefore, |x*m0 % 1 + x*m1| < .5 + 2**(a-5).  The polynomial that
248			approximates sin(p*x*2**-a) has been engineered to be good in this
249			interval.
250		*/
251
252	cvtpd2dq	x, ki				// Round to integer.
253		// This requires round-to-nearest mode in the MXCSR.
254	cvtdq2pd	ki, k				// Convert to floating-point.
255	movhlps		x, m1				// Get x*m1.
256	movapd		Address(C02), p		// Get c0 and c2.
257	psllq		$63, ki				// Move low bit to high bit.
258	subsd		k, x				// Subtract integer, leaving fraction.
259	addsd		m1, x				// Get x*m0%1 + x*m1.
260	xorpd		x, ki				// Negate x if low bit of ki was set.
261	mulsd		x, x				// Square x.
262	unpcklpd	x, x				// Duplicate x**2.
263	movhpd		Address(C4), ki		// Get sign*x and c4.
264	addpd		x, p				// Make x**2+c0 and x**2+c2.
265	mulpd		x, p				// Make x**4+c0*x**2 and x**4+c2*x**2.
266	addpd		Address(C13), p		// Make x**4+c0*x**2+c1 and x**4+c2*x**2+c3.
267	mulpd		ki, p				// Make sign*x*(x**4+c0*x**2+c1) and
268									// c4*(x**4+c2*x**2+c3).
269	movhlps		p, pa				// Get c4*(x**4+c2*x**2+c3).
270	mulsd		pa, p				// Multiply final two factors.
271	cvtsd2ss	p, p				// Convert result to single precision.
272
273ReturnSingle:
274	#if defined __i386__
275		movss		p, Argx			// Shuttle result through memory.
276			// This uses the argument area for scratch space, which is allowed.
277		flds		Argx			// Return input on floating-point stack.
278	#else
279		// On x86_64, the return value is now in p, which is %xmm0.
280	#endif
281
282	ret
283
284
285/*	The raw exponent field is zero, so the input is a zero or a denormal.  We
286	need to handle this separately to get -0 right, and it is faster.
287*/
288ExponentIsZero:
289	.literal8
290Finagle:	.double	1.00000001
291	.text
292	mulsd		Address(Finagle), x	// Make non-zero results inexact.
293	cvtsd2ss	x, p				// Return x.
294	jmp			ReturnSingle
295
296
297// Handle a NaN input.
298HandleNaN:
299
300	#if defined __i386__
301		flds		Argx			// Return result on floating-point stack.
302	#else
303		cvtsd2ss	x, %xmm0		// Return in %xmm0.
304			/*	We cannot just return the input, because we must quiet a
305				signalling NaN.
306			*/
307	#endif
308
309	ret
310
311
312/*	float cosf(float x).
313
314	Notes:
315
316		Citations in parentheses below indicate the source of a requirement.
317
318		"C" stands for ISO/IEC 9899:TC2.
319
320		The Open Group specification (IEEE Std 1003.1, 2004 edition) adds no
321		requirements since it defers to C and requires errno behavior only if
322		we choose to support it by arranging for "math_errhandling &
323		MATH_ERRNO" to be non-zero, which we do not.
324
325	Return value:
326
327		For +/- infinity, return a NaN (C F.9.1.6).
328
329		For a NaN, return the same NaN (C F.9 11 and 13).
330
331		Otherwise:
332
333			If the rounding mode is round-to-nearest, return cosine(x)
334			faithfully rounded.
335		  
336			Not currently implemented:  In other rounding modes, return sine(x)
337			possibly with slightly worse error, not necessarily honoring the
338			rounding mode (Ali Sazegari narrowing C F.9 10).
339
340		All results are in [-1, 1].  This is corollary to faithful rounding.
341
342	Exceptions:
343
344		Raise underflow for a denormal result (C F.9 7 and Draft Standard for
345		Floating-Point Arithmetic P754 Draft 1.2.5 9.5).  If the input is the
346		smallest normal, underflow may or may not be raised.  This is stricter
347		than the older 754 standard.
348
349		May or may not raise inexact, even if the result is exact (C F.9 8).
350
351		Raise invalid if the input is a signalling NaN (C 5.2.4.2.2 3, in spite
352		of C 4.2.1) or an infinity (C F.9.1.5) but not if the input is a quiet
353		NaN (C F.9 11).
354
355		May not raise exceptions otherwise (C F.9 9).
356
357	Properties:
358
359		Desired to be monotonic.  Not yet proven!
360
361		Exhaustive testing proved this routine returns faithfully rounded
362		results.
363*/
364	.align	5
365	.globl _cosf
366_cosf:
367	// This code is identical to _sinf except for the addition of Half.
368
369	// Put x into an integer register.
370	#if defined __i386__
371		mov		Argx, xInteger		// Put x into xInteger.
372	#elif defined __x86_64__
373		movd	Argx, xInteger		// Put x into xInteger.
374	#endif
375
376	cvtss2sd	Argx, x				// Convert x to double precision.
377
378	#if defined __i386__
379
380		// Get address of 0 in BaseP.
381			call	0f				// Push program counter onto stack.
382		0:
383			pop		BaseP			// Get program counter.
384
385	#elif defined __x86_64__
386
387		// Get address of reduction table in BaseP.
388			lea		ReductionTable(%rip), BaseP
389
390	#endif
391
392	unpcklpd	x, x				// Duplicate x.
393
394	/*	Shift:
395
396			SignificandBits-1 bits right to remove significand.
397
398			ExponentFold bits right to remove unused bits of exponent.
399
400			4 bits left to multiply by size of a table entry.
401	*/
402	shr		$SignificandBits-1+ExponentFold-4, xInteger
403
404	// Clear bits other than those we are using from exponent.
405	and		$(1<<ExponentBits-ExponentFold)-1 << 4, xInteger
406
407	ucomisd		x, x				// Test for NaN.
408	jpe			HandleNaN
409
410	// Get table entry containing m0 and m1 and calculate x*m0 and x*m1.
411	#if defined __i386__
412		mulpd	ReductionTable-0b+0*8(BaseP, xInteger), x
413	#elif defined __x86_64__
414		mulpd	(BaseP, xInteger), x
415	#endif
416		/*	Let the exponent of x be E, that is, E is the largest integer such
417			that 2**E <= x.  We use e to index the table, where e is E with the
418			low ExponentFold bits set to zero.  For example, if ExponentFold is
419			1, e = E & ~1.
420
421			The table entry indexed by e represents a number m that is (u/p %
422			1)/u*2**a, where u is the ULP of 2**e, p is the period (2*pi), and
423			2**a is the number of intervals we divide the period into (we use
424			two, so a is one).
425
426			Observe that when 2**e <= x, x/u is an integer, and x/u <
427			2**(24+ExponentFold).
428
429			Then for some integer k, x*m = x*(u/p % 1)/u*2**a = x*(u/p -
430			k)/u*2**a = x/p*2**a - x/u*k*2**a.  x/u*k*2**a is an integer, so
431			x*m is congruent to x/p*2**a modulo 1.
432
433			So cos(x*m*p*2**-a) equals cos(x).
434
435			However, we do not have m exactly.  The table contains two numbers
436			for 2**e, m0 and m1.  m0 has the first 29 bits of m, and m1 has the
437			next 53 bits.
438
439			Since x has 23 bits and m0 has 29 bits, their multiplication in
440			double format is exact.  From that product, we subtract the nearest
441			integer, which changes nothing modulo 1 but reduces the magnitude.
442			Then we add x*m1, yielding a precise but inexact approximation of
443			x*m0 % 1 + x*m1.
444
445			The magnitude of x*m0 % 1 is at most 1/2, and x*m1 may make the sum
446			slightly larger.
447
448			Recall that m is (u/p % 1)/u*2**a, so m's magnitude is at most
449			.5/u*2**a.  m0 contains 29 bits of this, so m1's magnitude is at
450			most 2**-29*.5/u*2**a.  With an exponent fold of 1 bit, x/u is less
451			than 2**25, so |x*m1| <= |x*2**-29*.5/u*2**a| <
452			2**25**2**-29*.5*2**a = 2**(a-5).
453
454			Therefore, |x*m0 % 1 + x*m1| < .5 + 2**(a-5).  The polynomial that
455			approximates sin(p*x*2**-a) has been engineered to be good in this
456			interval.
457		*/
458
459	addsd		Address(Half), x
460		// Add a quarter circle, then calculate sine.
461	cvtpd2dq	x, ki				// Round to integer.
462		// This requires round-to-nearest mode in the MXCSR.
463	cvtdq2pd	ki, k				// Convert to floating-point.
464	movhlps		x, m1				// Get x*m1.
465	movapd		Address(C02), p		// Get c0 and c2.
466	psllq		$63, ki				// Move low bit to high bit.
467	subsd		k, x				// Subtract integer, leaving fraction.
468	addsd		m1, x				// Get x*m0%1 + x*m1.
469	xorpd		x, ki				// Negate x if low bit of ki was set.
470	mulsd		x, x				// Square x.
471	unpcklpd	x, x				// Duplicate x**2.
472	movhpd		Address(C4), ki		// Get sign*x and c4.
473	addpd		x, p				// Make x**2+c0 and x**2+c2.
474	mulpd		x, p				// Make x**4+c0*x**2 and x**4+c2*x**2.
475	addpd		Address(C13), p		// Make x**4+c0*x**2+c1 and x**4+c2*x**2+c3.
476	mulpd		ki, p				// Make sign*x*(x**4+c0*x**2+c1) and
477									// c4*(x**4+c2*x**2+c3).
478	movhlps		p, pa				// Get c4*(x**4+c2*x**2+c3).
479	mulsd		pa, p				// Multiply final two factors.
480	cvtsd2ss	p, p				// Convert result to single precision.
481
482	#if defined __i386__
483		movss		p, Argx			// Shuttle result through memory.
484			// This uses the argument area for scratch space, which is allowed.
485		flds		Argx			// Return result on floating-point stack.
486	#else
487		// On x86_64, the return value is now in p, which is %xmm0.
488	#endif
489
490	ret