this repo has no description
1/* sinfcosf.s -- sinf and cosf for standard math library.
2
3 Written by Eric Postpischil, June 2007.
4*/
5
6
7#define SignificandBits 24
8#define ExponentBits 8
9
10 .literal8
11
12 .align 3
13
14Half: .double .5
15
16
17 .const
18
19 .align 4
20
21/* Coefficients to calculate x*(x**4+c0*x**2+c1) * c4*(x**4+c2*x**2+c3).
22
23 c0 and c2 are paired for loading into an XMM register, and so are c1 and c3.
24*/
25// Two binades, fractions of pi, minimize ULP error, refactored:
26C02: .double -4.1647719829166299, -3.6493923712207774
27C13: .double 15.485466333375713, 2.6522034727867632
28C4: .double 0.076492480587904088
29
30/* ExponentFold specifies how many bits of the exponent will not be used to
31 look up entries in the table. This must match the ExponentFold used to
32 generate the table.
33
34 The behavior for infinite inputs relies on the table contents -- if the
35 table entries looked up for an infinite input are m0 and m1, then
36 m0*infinity + m1*infinity should generate a NaN. This happens of either m0
37 or m1 is a zero or they have opposite signs. If this were not true,
38 additional arrangements would have to be made for infinite inputs.
39*/
40#define ExponentFold 1
41
42// Include the reduction table. See sinfcosfMakeTable.c for more information.
43#include "sinfcosfTable.s"
44
45
46// Rename the general registers (just to make it easier to keep track of them).
47#if defined __i386__
48 #define r0 %eax
49 #define r1 %ecx
50 #define r2 %edx
51 #define r3 %ebx
52 #define r4 %esp
53 #define r5 %ebp
54 #define r6 %esi
55 #define r7 %edi
56#elif defined __x86_64__
57 #define r0 %rax
58 #define r1 %rcx
59 #define r2 %rdx
60 #define r3 %rbx
61 #define r4 %rsp
62 #define r5 %rbp
63 #define r6 %rsi
64 #define r7 %rdi
65#else
66 #error "Unknown architecture."
67#endif
68
69
70 .text
71
72
73// Define symbols common to sinf and cosf.
74#define BaseP r0 // Base address for position-independent addressing.
75#define xInteger r1 // The input x in an integer register.
76
77#define p %xmm0 // Must be in %xmm0 for return on x86_64.
78#define x %xmm1
79#define ki %xmm2
80#define k %xmm3
81#define m1 %xmm4
82#define pa m1
83
84#if defined __i386__
85
86 // Define location of argument x.
87 #define Argx 4(%esp)
88
89 // Define how to address data. BaseP must contain the address of label 0.
90 #define Address(label) label-0b(BaseP)
91
92#elif defined __x86_64__
93
94 // Define location of argument x.
95 #define Argx %xmm0
96
97 // Define how to address data.
98 #define Address(label) label(%rip)
99
100#endif
101
102
103/* float sinf(float x).
104
105 Notes:
106
107 Citations in parentheses below indicate the source of a requirement.
108
109 "C" stands for ISO/IEC 9899:TC2.
110
111 The Open Group specification (IEEE Std 1003.1, 2004 edition) adds no
112 requirements since it defers to C and requires errno behavior only if
113 we choose to support it by arranging for "math_errhandling &
114 MATH_ERRNO" to be non-zero, which we do not.
115
116 Return value:
117
118 For +/- zero, return zero with same sign (C F.9 12 and F.9.1.6).
119
120 For +/- infinity, return a NaN (C F.9.1.6).
121
122 For a NaN, return the same NaN (C F.9 11 and 13).
123
124 Otherwise:
125
126 If the rounding mode is round-to-nearest, return sine(x) faithfully
127 rounded.
128
129 Not currently implemented: In other rounding modes, return sine(x)
130 possibly with slightly worse error, not necessarily honoring the
131 rounding mode (Ali Sazegari narrowing C F.9 10).
132
133 All results are in [-1, 1]. This is corollary to faithful rounding.
134
135 Exceptions:
136
137 Raise underflow for a denormal result (C F.9 7 and Draft Standard for
138 Floating-Point Arithmetic P754 Draft 1.2.5 9.5). If the input is the
139 smallest normal, underflow may or may not be raised. This is stricter
140 than the older 754 standard.
141
142 May or may not raise inexact, even if the result is exact (C F.9 8).
143
144 Raise invalid if the input is a signalling NaN (C 5.2.4.2.2 3, in spite
145 of C 4.2.1) or an infinity (C F.9.1.6) but not if the input is a quiet
146 NaN (C F.9 11).
147
148 May not raise exceptions otherwise (C F.9 9).
149
150 Properties:
151
152 Desired to be monotonic. Not yet proven!
153
154 Exhaustive testing proved this routine returns faithfully rounded
155 results.
156*/
157 .align 5
158 .globl _sinf
159_sinf:
160
161 // Put x into an integer register.
162 #if defined __i386__
163 mov Argx, xInteger // Put x into xInteger.
164 #elif defined __x86_64__
165 movd Argx, xInteger // Put x into xInteger.
166 #endif
167
168 cvtss2sd Argx, x // Convert x to double precision.
169
170 #if defined __i386__
171
172 // Get address of 0 in BaseP.
173 call 0f // Push program counter onto stack.
174 0:
175 pop BaseP // Get program counter.
176
177 #elif defined __x86_64__
178
179 // Get address of reduction table in BaseP.
180 lea ReductionTable(%rip), BaseP
181
182 #endif
183
184 unpcklpd x, x // Duplicate x.
185
186 /* Shift:
187
188 SignificandBits-1 bits right to remove significand.
189
190 ExponentFold bits right to remove unused bits of exponent.
191
192 4 bits left to multiply by size of a table entry.
193 */
194 shr $SignificandBits-1+ExponentFold-4, xInteger
195
196 // Clear bits other than those we are using from exponent.
197 and $(1<<ExponentBits-ExponentFold)-1 << 4, xInteger
198 je ExponentIsZero // Branch so we can get -0 right.
199
200 ucomisd x, x // Test for NaN.
201 jpe HandleNaN
202
203 // Get table entry containing m0 and m1 and calculate x*m0 and x*m1.
204 #if defined __i386__
205 mulpd ReductionTable-0b(BaseP, xInteger), x
206 #elif defined __x86_64__
207 mulpd (BaseP, xInteger), x
208 #endif
209 /* Let the exponent of x be E, that is, E is the largest integer such
210 that 2**E <= x. We use e to index the table, where e is E with the
211 low ExponentFold bits set to zero. For example, if ExponentFold is
212 1, e = E & ~1.
213
214 The table entry indexed by e represents a number m that is (u/p %
215 1)/u*2**a, where u is the ULP of 2**e, p is the period (2*pi), and
216 2**a is the number of intervals we divide the period into (we use
217 two, so a is one).
218
219 Observe that when 2**e <= x, x/u is an integer, and x/u <
220 2**(24+ExponentFold).
221
222 Then for some integer k, x*m = x*(u/p % 1)/u*2**a = x*(u/p -
223 k)/u*2**a = x/p*2**a - x/u*k*2**a. x/u*k*2**a is an integer, so
224 x*m is congruent to x/p*2**a modulo 1.
225
226 So sin(x*m*p*2**-a) equals sin(x).
227
228 However, we do not have m exactly. The table contains two numbers
229 for 2**e, m0 and m1. m0 has the first 29 bits of m, and m1 has the
230 next 53 bits.
231
232 Since x has 23 bits and m0 has 29 bits, their multiplication in
233 double format is exact. From that product, we subtract the nearest
234 integer, which changes nothing modulo 1 but reduces the magnitude.
235 Then we add x*m1, yielding a precise but inexact approximation of
236 x*m0 % 1 + x*m1.
237
238 The magnitude of x*m0 % 1 is at most 1/2, and x*m1 may make the sum
239 slightly larger.
240
241 Recall that m is (u/p % 1)/u*2**a, so m's magnitude is at most
242 .5/u*2**a. m0 contains 29 bits of this, so m1's magnitude is at
243 most 2**-29*.5/u*2**a. With an exponent fold of 1 bit, x/u is less
244 than 2**25, so |x*m1| <= |x*2**-29*.5/u*2**a| <
245 2**25**2**-29*.5*2**a = 2**(a-5).
246
247 Therefore, |x*m0 % 1 + x*m1| < .5 + 2**(a-5). The polynomial that
248 approximates sin(p*x*2**-a) has been engineered to be good in this
249 interval.
250 */
251
252 cvtpd2dq x, ki // Round to integer.
253 // This requires round-to-nearest mode in the MXCSR.
254 cvtdq2pd ki, k // Convert to floating-point.
255 movhlps x, m1 // Get x*m1.
256 movapd Address(C02), p // Get c0 and c2.
257 psllq $63, ki // Move low bit to high bit.
258 subsd k, x // Subtract integer, leaving fraction.
259 addsd m1, x // Get x*m0%1 + x*m1.
260 xorpd x, ki // Negate x if low bit of ki was set.
261 mulsd x, x // Square x.
262 unpcklpd x, x // Duplicate x**2.
263 movhpd Address(C4), ki // Get sign*x and c4.
264 addpd x, p // Make x**2+c0 and x**2+c2.
265 mulpd x, p // Make x**4+c0*x**2 and x**4+c2*x**2.
266 addpd Address(C13), p // Make x**4+c0*x**2+c1 and x**4+c2*x**2+c3.
267 mulpd ki, p // Make sign*x*(x**4+c0*x**2+c1) and
268 // c4*(x**4+c2*x**2+c3).
269 movhlps p, pa // Get c4*(x**4+c2*x**2+c3).
270 mulsd pa, p // Multiply final two factors.
271 cvtsd2ss p, p // Convert result to single precision.
272
273ReturnSingle:
274 #if defined __i386__
275 movss p, Argx // Shuttle result through memory.
276 // This uses the argument area for scratch space, which is allowed.
277 flds Argx // Return input on floating-point stack.
278 #else
279 // On x86_64, the return value is now in p, which is %xmm0.
280 #endif
281
282 ret
283
284
285/* The raw exponent field is zero, so the input is a zero or a denormal. We
286 need to handle this separately to get -0 right, and it is faster.
287*/
288ExponentIsZero:
289 .literal8
290Finagle: .double 1.00000001
291 .text
292 mulsd Address(Finagle), x // Make non-zero results inexact.
293 cvtsd2ss x, p // Return x.
294 jmp ReturnSingle
295
296
297// Handle a NaN input.
298HandleNaN:
299
300 #if defined __i386__
301 flds Argx // Return result on floating-point stack.
302 #else
303 cvtsd2ss x, %xmm0 // Return in %xmm0.
304 /* We cannot just return the input, because we must quiet a
305 signalling NaN.
306 */
307 #endif
308
309 ret
310
311
312/* float cosf(float x).
313
314 Notes:
315
316 Citations in parentheses below indicate the source of a requirement.
317
318 "C" stands for ISO/IEC 9899:TC2.
319
320 The Open Group specification (IEEE Std 1003.1, 2004 edition) adds no
321 requirements since it defers to C and requires errno behavior only if
322 we choose to support it by arranging for "math_errhandling &
323 MATH_ERRNO" to be non-zero, which we do not.
324
325 Return value:
326
327 For +/- infinity, return a NaN (C F.9.1.6).
328
329 For a NaN, return the same NaN (C F.9 11 and 13).
330
331 Otherwise:
332
333 If the rounding mode is round-to-nearest, return cosine(x)
334 faithfully rounded.
335
336 Not currently implemented: In other rounding modes, return sine(x)
337 possibly with slightly worse error, not necessarily honoring the
338 rounding mode (Ali Sazegari narrowing C F.9 10).
339
340 All results are in [-1, 1]. This is corollary to faithful rounding.
341
342 Exceptions:
343
344 Raise underflow for a denormal result (C F.9 7 and Draft Standard for
345 Floating-Point Arithmetic P754 Draft 1.2.5 9.5). If the input is the
346 smallest normal, underflow may or may not be raised. This is stricter
347 than the older 754 standard.
348
349 May or may not raise inexact, even if the result is exact (C F.9 8).
350
351 Raise invalid if the input is a signalling NaN (C 5.2.4.2.2 3, in spite
352 of C 4.2.1) or an infinity (C F.9.1.5) but not if the input is a quiet
353 NaN (C F.9 11).
354
355 May not raise exceptions otherwise (C F.9 9).
356
357 Properties:
358
359 Desired to be monotonic. Not yet proven!
360
361 Exhaustive testing proved this routine returns faithfully rounded
362 results.
363*/
364 .align 5
365 .globl _cosf
366_cosf:
367 // This code is identical to _sinf except for the addition of Half.
368
369 // Put x into an integer register.
370 #if defined __i386__
371 mov Argx, xInteger // Put x into xInteger.
372 #elif defined __x86_64__
373 movd Argx, xInteger // Put x into xInteger.
374 #endif
375
376 cvtss2sd Argx, x // Convert x to double precision.
377
378 #if defined __i386__
379
380 // Get address of 0 in BaseP.
381 call 0f // Push program counter onto stack.
382 0:
383 pop BaseP // Get program counter.
384
385 #elif defined __x86_64__
386
387 // Get address of reduction table in BaseP.
388 lea ReductionTable(%rip), BaseP
389
390 #endif
391
392 unpcklpd x, x // Duplicate x.
393
394 /* Shift:
395
396 SignificandBits-1 bits right to remove significand.
397
398 ExponentFold bits right to remove unused bits of exponent.
399
400 4 bits left to multiply by size of a table entry.
401 */
402 shr $SignificandBits-1+ExponentFold-4, xInteger
403
404 // Clear bits other than those we are using from exponent.
405 and $(1<<ExponentBits-ExponentFold)-1 << 4, xInteger
406
407 ucomisd x, x // Test for NaN.
408 jpe HandleNaN
409
410 // Get table entry containing m0 and m1 and calculate x*m0 and x*m1.
411 #if defined __i386__
412 mulpd ReductionTable-0b+0*8(BaseP, xInteger), x
413 #elif defined __x86_64__
414 mulpd (BaseP, xInteger), x
415 #endif
416 /* Let the exponent of x be E, that is, E is the largest integer such
417 that 2**E <= x. We use e to index the table, where e is E with the
418 low ExponentFold bits set to zero. For example, if ExponentFold is
419 1, e = E & ~1.
420
421 The table entry indexed by e represents a number m that is (u/p %
422 1)/u*2**a, where u is the ULP of 2**e, p is the period (2*pi), and
423 2**a is the number of intervals we divide the period into (we use
424 two, so a is one).
425
426 Observe that when 2**e <= x, x/u is an integer, and x/u <
427 2**(24+ExponentFold).
428
429 Then for some integer k, x*m = x*(u/p % 1)/u*2**a = x*(u/p -
430 k)/u*2**a = x/p*2**a - x/u*k*2**a. x/u*k*2**a is an integer, so
431 x*m is congruent to x/p*2**a modulo 1.
432
433 So cos(x*m*p*2**-a) equals cos(x).
434
435 However, we do not have m exactly. The table contains two numbers
436 for 2**e, m0 and m1. m0 has the first 29 bits of m, and m1 has the
437 next 53 bits.
438
439 Since x has 23 bits and m0 has 29 bits, their multiplication in
440 double format is exact. From that product, we subtract the nearest
441 integer, which changes nothing modulo 1 but reduces the magnitude.
442 Then we add x*m1, yielding a precise but inexact approximation of
443 x*m0 % 1 + x*m1.
444
445 The magnitude of x*m0 % 1 is at most 1/2, and x*m1 may make the sum
446 slightly larger.
447
448 Recall that m is (u/p % 1)/u*2**a, so m's magnitude is at most
449 .5/u*2**a. m0 contains 29 bits of this, so m1's magnitude is at
450 most 2**-29*.5/u*2**a. With an exponent fold of 1 bit, x/u is less
451 than 2**25, so |x*m1| <= |x*2**-29*.5/u*2**a| <
452 2**25**2**-29*.5*2**a = 2**(a-5).
453
454 Therefore, |x*m0 % 1 + x*m1| < .5 + 2**(a-5). The polynomial that
455 approximates sin(p*x*2**-a) has been engineered to be good in this
456 interval.
457 */
458
459 addsd Address(Half), x
460 // Add a quarter circle, then calculate sine.
461 cvtpd2dq x, ki // Round to integer.
462 // This requires round-to-nearest mode in the MXCSR.
463 cvtdq2pd ki, k // Convert to floating-point.
464 movhlps x, m1 // Get x*m1.
465 movapd Address(C02), p // Get c0 and c2.
466 psllq $63, ki // Move low bit to high bit.
467 subsd k, x // Subtract integer, leaving fraction.
468 addsd m1, x // Get x*m0%1 + x*m1.
469 xorpd x, ki // Negate x if low bit of ki was set.
470 mulsd x, x // Square x.
471 unpcklpd x, x // Duplicate x**2.
472 movhpd Address(C4), ki // Get sign*x and c4.
473 addpd x, p // Make x**2+c0 and x**2+c2.
474 mulpd x, p // Make x**4+c0*x**2 and x**4+c2*x**2.
475 addpd Address(C13), p // Make x**4+c0*x**2+c1 and x**4+c2*x**2+c3.
476 mulpd ki, p // Make sign*x*(x**4+c0*x**2+c1) and
477 // c4*(x**4+c2*x**2+c3).
478 movhlps p, pa // Get c4*(x**4+c2*x**2+c3).
479 mulsd pa, p // Multiply final two factors.
480 cvtsd2ss p, p // Convert result to single precision.
481
482 #if defined __i386__
483 movss p, Argx // Shuttle result through memory.
484 // This uses the argument area for scratch space, which is allowed.
485 flds Argx // Return result on floating-point stack.
486 #else
487 // On x86_64, the return value is now in p, which is %xmm0.
488 #endif
489
490 ret