this repo has no description
1/* atanf.s -- atanf for standard math library.
2
3 Written by Eric Postpischil, July 2007.
4*/
5
6
7 .literal8
8
9// Define the points where our algorithm changes.
10nPoint0: .double -1
11pPoint0: .double +1
12nPoint1: .double -16777216
13pPoint1: .double +16777216
14
15// Define miscellaneous constants.
16nHalfPi: .double -1.5707963267948966192313217
17One: .double 1
18pHalfPi: .double +1.5707963267948966192313217
19
20// Define a coefficient for center polynomial (used for x in [-1, +1]).
21C2: .double 0.0029352921857004596570518
22
23
24 .const
25 .align 4
26
27/* Define some coefficients for center polynomial (used for x in [-1, +1]).
28 These are stored in pairs at aligned addresses for use in SIMD
29 instructions.
30*/
31C01: .double 2.2971562298314633761676433, 0.0207432003007420961489920
32C00: .double 2.4449692297316409651126041, 3.7888879287802702842997915
33C11: .double -2.9466967515109826289085300, -4.9721072376211623038916292
34C10: .double 5.4728447324456990092824269, 6.7197076223592378022736307
35
36// Define coefficients for the tail polynomial (used for x outside [-1, +1]).
37T01: .double 0.9193672354545696501477531, -0.0052035944094405570566862
38T00: .double 0.3992772987220534996563340, 0.2521268658714555740707959
39T11: .double -0.5273186542866779494437308, -0.7201738803584184183894208
40T10: .double 0.1730466268612773143731748, 0.1408679409162453515360961
41
42
43// Rename the general registers (just to make it easier to keep track of them).
44#if defined __i386__
45 #define r0 %eax
46 #define r1 %ecx
47 #define r2 %edx
48 #define r3 %ebx
49 #define r4 %esp
50 #define r5 %ebp
51 #define r6 %esi
52 #define r7 %edi
53#elif defined __x86_64__
54 #define r0 %rax
55 #define r1 %rcx
56 #define r2 %rdx
57 #define r3 %rbx
58 #define r4 %rsp
59 #define r5 %rbp
60 #define r6 %rsi
61 #define r7 %rdi
62#else
63 #error "Unknown architecture."
64#endif
65
66
67 .text
68
69
70// Define various symbols.
71
72#define BaseP r0 // Base address for position-independent addressing.
73
74#define p0 %xmm0 // Must be in %xmm0 for return on x86_64.
75#define x %xmm1
76#define p1 %xmm2
77#define x1 %xmm3
78#define v0 %xmm4
79#define t0 %xmm5
80
81#if defined __i386__
82
83 // Define location of argument x.
84 #define Argx 4(%esp)
85
86 // Define how to address data. BaseP must contain the address of label 0.
87 #define Address(label) label-0b(BaseP)
88
89#elif defined __x86_64__
90
91 // Define location of argument x.
92 #define Argx %xmm0
93
94 // Define how to address data.
95 #define Address(label) label(%rip)
96
97#endif
98
99
100/* float atanf(float x).
101
102 Notes:
103
104 Citations in parentheses below indicate the source of a requirement.
105
106 "C" stands for ISO/IEC 9899:TC2.
107
108 The Open Group specification (IEEE Std 1003.1, 2004 edition) adds no
109 requirements since it defers to C and requires errno behavior only if
110 we choose to support it by arranging for "math_errhandling &
111 MATH_ERRNO" to be non-zero, which we do not.
112
113 Return value:
114
115 For arctangent of +/- zero, return zero with same sign (C F.9 12 and
116 F.9.1.3).
117
118 For arctangent of +/- infinity, return +/- pi/2 (C F.9.1.3).
119
120 For a NaN, return the same NaN (C F.9 11 and 13). (If the NaN is a
121 signalling NaN, we return the "same" NaN quieted.)
122
123 Otherwise:
124
125 If the rounding mode is round-to-nearest, return arctangent(x)
126 faithfully rounded.
127
128 Return a value in [-pi/2, +pi/2] (C 7.12.4.3 3). Note that this
129 prohibits returning correctly rounded values for atanf of large
130 positive or negative arguments, since pi/2 rounded to a float lies
131 outside that interval.
132
133 Not implemented: In other rounding modes, return arctangent(x)
134 possibly with slightly worse error, not necessarily honoring the
135 rounding mode (Ali Sazegari narrowing C F.9 10).
136
137 Exceptions:
138
139 Raise underflow for a denormal result (C F.9 7 and Draft Standard for
140 Floating-Point Arithmetic P754 Draft 1.2.5 9.5). If the input is the
141 smallest normal, underflow may or may not be raised. This is stricter
142 than the older 754 standard.
143
144 May or may not raise inexact, even if the result is exact (C F.9 8).
145
146 Raise invalid if the input is a signalling NaN (C 5.2.4.2.2 3, in spite
147 of C 4.2.1) but not if the input is a quiet NaN (C F.9 11).
148
149 May not raise exceptions otherwise (C F.9 9).
150
151 Properties:
152
153 Monotonic.
154
155 Exhaustive testing proved this routine returns faithfully rounded
156 results.
157*/
158 .align 5
159#if !defined DevelopmentInstrumentation
160 // This is the regular name used in the deployed implementation.
161 .globl _atanf
162 _atanf:
163#else
164 // This is the name used for a special test version of the routine.
165 .globl _atanfInstrumented
166 _atanfInstrumented:
167#endif
168
169 cvtss2sd Argx, x // Convert x to double precision.
170
171 #if defined __i386__
172
173 // Get address of 0 in BaseP.
174 call 0f // Push program counter onto stack.
175 0:
176 pop BaseP // Get program counter.
177
178 #endif
179
180/* We use different algorithms for different parts of the domain. There
181 is a "negative tail" from -infinity to nPoint0, a center from nPoint0 to
182 pPoint0, and a "positive tail" from pPoint0 to +infinity. Here, we compare
183 and branch to the appropriate code.
184
185 NaNs are weeded out in PositiveTail or NegativeTail.
186*/
187
188 ucomisd Address(pPoint0), x
189 ja PositiveTail
190
191 ucomisd Address(nPoint0), x
192 jb NegativeTail
193
194
195/* Here we have nPoint0 <= x <= pPoint0. This is handled with a simple
196 evaluation of a polynomial that approximates arctangent.
197
198 The polynomial has been arranged into the form:
199
200 x * c2
201 * ((x**4 + c01 * x**2 + c00))
202 * ((x**4 + c11 * x**2 + c10))
203 * ((x**4 + c21 * x**2 + c20))
204 * ((x**4 + c31 * x**2 + c30))
205
206 The coefficients are stored in pairs, with c01 and c21 at C01, c00 and c20
207 at C00, c11 and c31 at C11, and c10 and c30 at C10. c2 is at C2.
208
209 The quartic factors are evaluated in SIMD registers. For brevity, some
210 comments below describe only one element of a register. The other is
211 analagous.
212*/
213 movlhps x, x1 // Save a copy of x for later.
214 mulsd x, x // Form x**2.
215#define x2 x // Define name describing current register contents.
216 movlhps x2, x2 // Duplicate x**2.
217 movapd Address(C11), p1
218 addpd x2, p1 // Form x**2 + c11.
219 mulpd x2, p1 // Form x**4 + c11 * x**2.
220 addpd Address(C10), p1 // Form x**4 + c11 * x**2 + c10.
221 movapd Address(C01), p0 // Get first coefficients.
222 movlpd Address(C2), x1 // Put c2 in one element, with x in other.
223 addpd x2, p0 // Form x**2 + c01.
224 mulpd x2, p0 // Form x**4 + c01 * x**2.
225 addpd Address(C00), p0 // Form x**4 + c01 * x**2 + c00.
226 mulpd p1, p0 // Combine factors.
227 mulpd x1, p0 // Multiply by x and c2.
228 movhlps p0, p1 // Get high element.
229 mulsd p1, p0 // Finish combining factors.
230#undef x2
231
232// Return the double-precision number currently in p0.
233ReturnDoubleInp0:
234 cvtsd2ss p0, p0 // Convert result to single precision.
235
236// Return the single-precision number currently in p0.
237ReturnSingleInp0:
238
239 #if defined __i386__
240 movss p0, Argx // Shuttle result through memory.
241 // This uses the argument area for scratch space, which is allowed.
242 flds Argx // Return input on floating-point stack.
243 #else
244 // On x86_64, the return value is now in p0, which is %xmm0.
245 #endif
246
247 ret
248
249
250// Handle pPoint0 < x.
251PositiveTail:
252
253 ucomisd Address(pPoint1), x // Compare x to algorithm change point.
254 jae InputIsPositiveSpecial
255
256 movsd Address(pHalfPi), p0 // Prepare +pi/2.
257
258CommonTail:
259
260/* Here we have pPoint0 < x < pPoint1. The algorithm here is inspired by the
261 identity (for positive x) arctangent(x) = +pi/2 - arctangent(1/x). This
262 is approximated by evaluating:
263
264 +pi/2 -
265 * ((x**4 + t01 * x**2 + t00))
266 * ((x**4 + t11 * x**2 + t10))
267 * ((x**4 + t21 * x**2 + t20))
268 * ((x**4 + t31 * x**2 + t30))
269 * (1/x) * (1/x)**16.
270
271 The quartic factors are evaluated in SIMD registers. For brevity, some
272 comments below describe only one element of a register. The other is
273 analagous.
274*/
275
276 movsd Address(One), v0 // Get constant.
277 divsd x, v0 // Form 1/x, which we will call r.
278 /* I tried rcpss, but using it requires a precision conversion, two
279 adds, and three multiplies. divsd performance is not bad on Core2,
280 and I was not able to get the rcpss version to run as fast.
281 */
282 mulsd x, x // Form x**2.
283#define x2 x // Define name describing current register contents.
284 movlhps v0, v0 // Copy finished reciprocal to high element.
285 movlhps x2, x2 // Duplicate x**2.
286 mulsd v0, v0 // Form r**2.
287 mulsd v0, v0 // Form r**4.
288 mulsd v0, v0 // Form r**8.
289 movapd Address(T11), p1 // Get first coefficients.
290 mulsd v0, v0 // Form r**16.
291
292 movapd Address(T01), t0 // Get first coefficients.
293 addpd x2, p1 // Form x**2 + t11.
294 mulpd x2, p1 // Form x**4 + t11 * x**2.
295 addpd Address(T10), p1 // Form x**4 + t11 * x**2 + t10.
296 addpd x2, t0 // Form x**2 + t01.
297 mulpd x2, t0 // Form x**4 + t01 * x**2.
298 addpd Address(T00), t0 // Form x**4 + t01 * x**2 + t00.
299 mulpd p1, t0 // Combine factors.
300 mulpd v0, t0 // Multiply by r**16 and r.
301 movhlps t0, p1 // Get high element.
302 mulsd p1, t0 // Finish combining factors.
303 subsd t0, p0 // Subtract from pi/2.
304#undef x2
305
306 jmp ReturnDoubleInp0
307
308
309// Handle x < nPoint0.
310NegativeTail:
311
312 ucomisd Address(nPoint1), x // Compare x to algorithm change point.
313 jbe InputIsNegativeSpecial
314
315 movsd Address(nHalfPi), p0 // Prepare -pi/2.
316
317 jmp CommonTail
318
319
320 .literal8
321/* Define values near +/- pi/2 that yield +/- pi/2 rounded toward zero when
322 converted to single precision. This allows us to generate inexact and
323 return the desired values for atanf of big positive and negative numbers.
324*/
325AlmostpHalfPi: .double +1.5707962
326AlmostnHalfPi: .double -1.5707962
327
328
329 .text
330
331/* Here we handle inputs greater or equal to pPoint1, including infinity,
332 but not including NaNs.
333*/
334InputIsPositiveSpecial:
335
336 /* The input is a big positive number. Return +pi/2 rounded toward zero
337 and generate an inexact exception.
338 */
339 cvtsd2ss Address(AlmostpHalfPi), p0
340 jmp ReturnSingleInp0
341
342
343/* Here we handle inputs less than or equal to -1, including -infinity and
344 NaNs. The condition code must be set as if by using ucomisd to compare the
345 input in the "source operand" to nPoint1 in the "destination operand".
346*/
347InputIsNegativeSpecial:
348
349 jp InputIsNaN
350
351 /* The input is a big negative number. Return -pi/2 rounded toward zero
352 and generate an inexact exception.
353 */
354 cvtsd2ss Address(AlmostnHalfPi), p0
355 jmp ReturnSingleInp0
356
357
358InputIsNaN:
359
360 #if defined __i386__
361 flds Argx // Return result on floating-point stack.
362 #else
363 cvtsd2ss x, %xmm0 // Return in %xmm0.
364 /* We cannot just return the input, because we must quiet a
365 signalling NaN.
366 */
367 #endif
368
369 ret