this repo has no description
1/* tanf.s -- tanf for standard math library.
2
3 Written by Eric Postpischil, June 2007.
4*/
5
6
7#define SignificandBits 24
8#define ExponentBits 8
9
10 .literal8
11
12 .align 3
13
14NegativeOne:
15 .double -1
16
17
18 .const
19
20 .align 4
21
22/* Coefficients to calculate:
23
24 x * (x**4 + c02*x**2 + c00) * (x**4 + c12*x**2 + c10)
25 * (x**4 + c22*x**2 + c20) * ( c32*x**2 + c30)
26
27 c02 and c12 are stored as a pair at the location below labeled c02,
28 c00 and c10 are at c00, c22 and c23 at c22, and c20 and c30 at c20.
29
30 This polynomial was designed to cover two binades (supporting an
31 exponent fold of one bit) for tanf with its period (pi) partitioned into
32 two intervals. The expanded polynomial was factored to create the form
33 above convenient for calculating with SIMD.
34
35 This was minimized for ULP error.
36*/
37c02: .double -0.34431297545117363561, -1.53503935362160698123
38c00: .double +0.75337415977492632831, +0.92783629599735333058
39c22: .double +0.91301179133375347365, +5.27860261872543679865
40c20: .double +0.58807104225499649417, +3.82127246397188494330
41
42
43/* ExponentFold specifies how many bits of the exponent will not be used to
44 look up entries in the table. This must match the ExponentFold used to
45 generate the table.
46
47 The behavior for infinite inputs relies on the table contents -- if the
48 table entries looked up for an infinite input are m0 and m1, then
49 m0*infinity + m1*infinity should generate a NaN. This happens of either m0
50 or m1 is a zero or they have opposite signs. If this were not true,
51 additional arrangements would have to be made for infinite inputs.
52*/
53#define ExponentFold 1
54
55// Include the reduction table. See tanfMakeTable.c for more information.
56#include "tanfTable.s"
57
58
59// Rename the general registers (just to make it easier to keep track of them).
60#if defined __i386__
61 #define r0 %eax
62 #define r1 %ecx
63 #define r2 %edx
64 #define r3 %ebx
65 #define r4 %esp
66 #define r5 %ebp
67 #define r6 %esi
68 #define r7 %edi
69#elif defined __x86_64__
70 #define r0 %rax
71 #define r1 %rcx
72 #define r2 %rdx
73 #define r3 %rbx
74 #define r4 %rsp
75 #define r5 %rbp
76 #define r6 %rsi
77 #define r7 %rdi
78#else
79 #error "Unknown architecture."
80#endif
81
82
83 .text
84
85
86// Define symbols for tanf.
87#define BaseP r0 // Base address for position-independent addressing.
88#define xInteger r1 // The input x in an integer register.
89#define ki r1 // x*m0 rounded to an integer.
90
91#define p0 %xmm0 // Must be in %xmm0 for return on x86_64.
92#define p2 %xmm1
93#define x %xmm2
94#define k %xmm3
95#define m1 %xmm4
96#define x1 %xmm5
97#define Numerator %xmm6
98
99#if defined __i386__
100
101 // Define location of argument x.
102 #define Argx 4(%esp)
103
104 // Define how to address data. BaseP must contain the address of label 0.
105 #define Address(label) label-0b(BaseP)
106
107#elif defined __x86_64__
108
109 // Define location of argument x.
110 #define Argx %xmm0
111
112 // Define how to address data.
113 #define Address(label) label(%rip)
114
115#endif
116
117
118/* float tanf(float x).
119
120 Notes:
121
122 Citations in parentheses below indicate the source of a requirement.
123
124 "C" stands for ISO/IEC 9899:TC2.
125
126 The Open Group specification (IEEE Std 1003.1, 2004 edition) adds no
127 requirements since it defers to C and requires errno behavior only if
128 we choose to support it by arranging for "math_errhandling &
129 MATH_ERRNO" to be non-zero, which we do not.
130
131 Return value:
132
133 For +/- zero, return zero with same sign (C F.9 12 and F.9.1.7).
134
135 For +/- infinity, return a NaN (C F.9.1.7).
136
137 For a NaN, return the same NaN (C F.9 11 and 13).
138
139 Otherwise:
140
141 If the rounding mode is round-to-nearest, return tangent(x)
142 faithfully rounded.
143
144 Not currently implemented: In other rounding modes, return
145 tangent(x) possibly with slightly worse error, not necessarily
146 honoring the rounding mode (Ali Sazegari narrowing C F.9 10).
147
148 Exceptions:
149
150 Raise underflow for a denormal result (C F.9 7 and Draft Standard for
151 Floating-Point Arithmetic P754 Draft 1.2.5 9.5). If the input is the
152 smallest normal, underflow may or may not be raised. This is stricter
153 than the older 754 standard.
154
155 May or may not raise inexact, even if the result is exact (C F.9 8).
156
157 Raise invalid if the input is a signalling NaN (C 5.2.4.2.2 3, in spite
158 of C F.2.1) or an infinity (C F.9.1.7) but not if the input is a quiet
159 NaN (C F.9 11).
160
161 May not raise exceptions otherwise (C F.9 9).
162
163 Properties:
164
165 Desired to be monotonic. Not yet proven!
166
167 Exhaustive testing proved this routine returns faithfully rounded
168 results.
169*/
170 .align 5
171 .globl _tanf
172_tanf:
173
174 // Put x into an integer register.
175 #if defined __i386__
176 mov Argx, xInteger // Put x into xInteger.
177 #elif defined __x86_64__
178 movd Argx, xInteger // Put x into xInteger.
179 #endif
180
181 cvtss2sd Argx, x // Convert x to double precision.
182
183 #if defined __i386__
184
185 // Get address of 0 in BaseP.
186 call 0f // Push program counter onto stack.
187 0:
188 pop BaseP // Get program counter.
189
190 #elif defined __x86_64__
191
192 // Get address of reduction table in BaseP.
193 lea ReductionTable(%rip), BaseP
194
195 #endif
196
197 movsd Address(NegativeOne), Numerator
198
199 unpcklpd x, x // Duplicate x.
200
201 /* Shift:
202
203 SignificandBits-1 bits right to remove significand.
204
205 ExponentFold bits right to remove unused bits of exponent.
206
207 4 bits left to multiply by size of a table entry.
208 */
209 shr $SignificandBits-1+ExponentFold-4, xInteger
210
211 // Clear bits other than those we are using from exponent.
212 and $(1<<ExponentBits-ExponentFold)-1 << 4, xInteger
213 je ExponentIsZero // Branch so we can get -0 right.
214
215 ucomisd x, x // Test for Nan.
216 jpe HandleNaN
217
218 // Get table entry containing m0 and m1 and calculate x*m0 and x*m1.
219 #if defined __i386__
220 mulpd ReductionTable-0b(BaseP, xInteger), x
221 #elif defined __x86_64__
222 mulpd (BaseP, xInteger), x
223 #endif
224 /* Let the exponent of x be E, that is, E is the largest integer such
225 that 2**E <= x. We use e to index the table, where e is E with the
226 low ExponentFold bits set to zero. For example, if ExponentFold is
227 1, e = E & ~1.
228
229 The table entry indexed by e represents a number m that is (u/p %
230 1)/u*2**a, where u is the ULP of 2**e, p is the period (pi), and
231 2**a is the number of intervals we divide the period into (we use
232 two, so a is one).
233
234 Observe that when 2**e <= x, x/u is an integer, and x/u <
235 2**(24+ExponentFold).
236
237 Then for some integer k, x*m = x*(u/p % 1)/u*2**a = x*(u/p -
238 k)/u*2**a = x/p*2**a - x/u*k*2**a. x/u*k*2**a is an integer, so
239 x*m is congruent to x/p*2**a modulo 1.
240
241 So tangent(x*m*p*2**-a) equals tangent(x).
242
243 However, we do not have m exactly. The table contains two numbers
244 for 2**e, m0 and m1. m0 has the first 29 bits of m, and m1 has the
245 next 53 bits.
246
247 Since x has 23 bits and m0 has 29 bits, their multiplication in
248 double format is exact. From that product, we subtract the nearest
249 integer, which changes nothing modulo 1 but reduces the magnitude.
250 Then we add x*m1, yielding a precise but inexact approximation of
251 x*m0 % 1 + x*m1.
252
253 The magnitude of x*m0 % 1 is at most 1/2, and x*m1 may make the sum
254 slightly larger.
255
256 Recall that m is (u/p % 1)/u*2**a, so m's magnitude is at most
257 .5/u*2**a. m0 contains 29 bits of this, so m1's magnitude is at
258 most 2**-29*.5/u*2**a. With an exponent fold of 1 bit, x/u is less
259 than 2**25, so |x*m1| <= |x*2**-29*.5/u*2**a| <
260 2**25**2**-29*.5*2**a = 2**(a-5).
261
262 Therefore, |x*m0 % 1 + x*m1| < .5 + 2**(a-5). The polynomial that
263 approximates tangent(p*x*2**-a) has been engineered to be good in
264 this interval.
265 */
266
267 movapd Address(c02), p0 // Get c02 and c12.
268 movapd Address(c22), p2 // Get c22 and c32.
269
270 cvtsd2si x, ki // Round to integer.
271 // This requires round-to-nearest mode in the MXCSR.
272 cvtsi2sd ki, k // Convert to floating-point.
273 movhlps x, m1 // Get x*m1.
274 subsd k, x // Subtract integer, leaving fraction.
275 addsd m1, x // Get x*m0%1 + x*m1.
276
277/* Below this point, references in the comments to "x" refer to the
278 result of the reduction, not to the original input argument.
279*/
280 unpcklpd x, x // Duplicate x for SIMD work.
281 test $1, ki // Is 2**0 bit on?
282 movsd x, x1 // Save a copy of x for later.
283 mulpd x, x // Get x**2.
284#define x2 x // Use the name x2 to emphasize we have x**2.
285 addpd x2, p0 // Get x**2+c02 and x**2+c12.
286 mulpd x2, p0 // Get x**4+c02*x**2 and x**4+c12*x**2.
287 addpd Address(c00), p0 // Finish first pair of terms.
288 addsd x2, p2 // Get x**2+c22 and c32.
289 mulpd x2, p2 // Get x**4+c22*x**2 and c32*x**2.
290 addpd Address(c20), p2 // Finish second pair of terms.
291 mulpd p2, p0 // Multiply parallel pairs.
292 mulsd x1, p0 // Multiply isolated x term.
293 movhlps p0, p2 // Get high factor.
294 mulsd p2, p0 // Combine parts from SIMD work.
295 jne 2f // Branch if bit was on.
296
297// ki was even, tangent is in p0.
298
299 cvtsd2ss p0, p0 // Convert result to single precision.
300 #if defined __i386__
301 movss p0, Argx // Shuttle result through memory.
302 // This uses the argument area for scratch space, which is allowed.
303 flds Argx // Return result on floating-point stack.
304 #else
305 // On x86_64, the return value is now in p0, which is %xmm0.
306 #endif
307
308 ret
309
310// ki was odd, tangent is -1/p0.
3112:
312 divsd p0, Numerator
313 cvtsd2ss Numerator, p0 // Convert result to single precision.
314
315ReturnSingle:
316 #if defined __i386__
317 movss p0, Argx // Shuttle result through memory.
318 // This uses the argument area for scratch space, which is allowed.
319 flds Argx // Return result on floating-point stack.
320 #else
321 // On x86_64, the return value is now in p0, which is %xmm0.
322 #endif
323
324 ret
325
326
327// Handle a NaN input.
328HandleNaN:
329
330 #if defined __i386__
331 flds Argx // Return result on floating-point stack.
332 #else
333 cvtsd2ss x, %xmm0 // Return in %xmm0.
334 /* We cannot just return the input, because we must quiet a
335 signalling NaN.
336 */
337 #endif
338
339 ret
340
341
342/* The raw exponent field is zero, so the input is a zero or a denormal. We
343 need to handle this separately to get -0 right, and it is faster.
344*/
345ExponentIsZero:
346 .literal8
347Finagle: .double 1.00000001
348 .text
349 mulsd Address(Finagle), x // Make non-zero results inexact.
350 cvtsd2ss x, p0 // Return x.
351 jmp ReturnSingle