this repo has no description
1/* atan2f.s -- atan2f for standard math library.
2
3 Written by Eric Postpischil, July 2007.
4*/
5
6
7 .literal8
8
9// Define miscellaneous constants.
10
11Threshold: .double 2.384185791015625e-07 // 2**-22.
12nZero: .double -0
13
14pPi1v4: .double +.7853981633974483096156608 // 1/4 pi.
15nPi1v4: .double -.7853981633974483096156608
16pPi1v2: .double +1.570796326794896619231322 // 1/2 pi.
17nPi1v2: .double -1.570796326794896619231322
18pPi3v4: .double +2.356194490192344928846982 // 3/4 pi.
19nPi3v4: .double -2.356194490192344928846982
20
21/* Define values near +/- pi that yield +/- pi rounded toward zero when
22 converted to single precision. This allows us to generate inexact and
23 return the desired values for atan2f on and near the negative side of the x
24 axis.
25*/
26AlmostpPi: .double +3.1415924
27
28// Define a coefficient for center polynomial (used for x in [-1, +1]).
29C2: .double 0.0029352921857004596570518
30
31
32 .const
33 .align 4
34
35/* Define some coefficients for center polynomial (used for x in [-1, +1]).
36 These are stored in pairs at aligned addresses for use in SIMD
37 instructions.
38*/
39C01: .double 2.2971562298314633761676433, 0.0207432003007420961489920
40C00: .double 2.4449692297316409651126041, 3.7888879287802702842997915
41C11: .double -2.9466967515109826289085300, -4.9721072376211623038916292
42C10: .double 5.4728447324456990092824269, 6.7197076223592378022736307
43
44// This needs to be 16-byte aligned because it is used in an orpd instruction.
45 .align 4
46pPi: .double +3.141592653589793238462643 // pi.
47
48
49// Rename the general registers (just to make it easier to keep track of them).
50#if defined __i386__
51 #define r0 %eax
52 #define r1 %ecx
53 #define r2 %edx
54 #define r3 %ebx
55 #define r4 %esp
56 #define r5 %ebp
57 #define r6 %esi
58 #define r7 %edi
59#elif defined __x86_64__
60 #define r0 %rax
61 #define r1 %rcx
62 #define r2 %rdx
63 #define r3 %rbx
64 #define r4 %rsp
65 #define r5 %rbp
66 #define r6 %rsi
67 #define r7 %rdi
68#else
69 #error "Unknown architecture."
70#endif
71
72
73 .text
74
75
76// Define various symbols.
77
78#define BaseP r0 // Base address for position-independent addressing.
79
80#define y %xmm0 // Must be in %xmm0 for return on x86_64.
81#define x %xmm1
82#define p0 %xmm2
83#define x1 %xmm3
84#define t0 %xmm4
85#define Base %xmm5
86#define p1 %xmm6
87
88#if defined __i386__
89
90 // Define locations of arguments.
91 #define Argy 4(%esp)
92 #define Argx 8(%esp)
93
94 // Define how to address data. BaseP must contain the address of label 0.
95 #define Address(label) label-0b(BaseP)
96
97#elif defined __x86_64__
98
99 // Define locations of arguments.
100 #define Argx %xmm1
101 #define Argy %xmm0
102
103 // Define how to address data.
104 #define Address(label) label(%rip)
105
106#endif
107
108
109/* float atan2f(float x).
110
111 Notes:
112
113 This routine has not been proven to be correct. See the notes in the
114 accompanying C version regarding potential proof. The polynomial it
115 uses was proven to provide faithfully rounded results in atanf. atan2f
116 introduces additional error performing the division and additional
117 points used in the domain of the polynomial.
118
119 Citations in parentheses below indicate the source of a requirement.
120
121 "C" stands for ISO/IEC 9899:TC2.
122
123 The Open Group specification (IEEE Std 1003.1, 2004 edition) adds no
124 requirements since it defers to C and requires errno behavior only if
125 we choose to support it by arranging for "math_errhandling &
126 MATH_ERRNO" to be non-zero, which we do not.
127
128 Return value for atan2f(y, x) (C F.9.1 12 and C F.9.1.4):
129
130 y x atan2f(y, x)
131
132 -infinity -infinity -3*pi/4.
133 < 0 -2*pi/4.
134 -0 -2*pi/4.
135 +0 -2*pi/4.
136 > 0 -2*pi/4.
137 +infinity -1*pi/4.
138
139 < 0 -infinity -4*pi/4.
140 < 0 arctangent(y/x) in [-4*pi/4, -2*pi/4].
141 -0 -2*pi/4.
142 +0 -2*pi/4.
143 > 0 arctangent(y/x) in [-2*pi/4, -0*pi/4].
144 +infinity -0*pi/4.
145
146 -0 -infinity -4*pi/4.
147 < 0 -4*pi/4.
148 -0 -4*pi/4.
149 +0 -0*pi/4.
150 > 0 -0*pi/4.
151 +infinity -0*pi/4.
152
153 +0 -infinity +4*pi/4.
154 < 0 +4*pi/4.
155 -0 +4*pi/4.
156 +0 +0*pi/4.
157 > 0 +0*pi/4.
158 +infinity +0*pi/4.
159
160 > 0 -infinity +4*pi/4.
161 < 0 arctangent(y/x) in [+2*pi/4, +4*pi/4].
162 -0 +2*pi/4.
163 +0 +2*pi/4.
164 > 0 arctangent(y/x) in [+0*pi/4, +2*pi/4].
165 +infinity +0*pi/4.
166
167 +infinity -infinity +3*pi/4.
168 < 0 +2*pi/4.
169 -0 +2*pi/4.
170 +0 +2*pi/4.
171 > 0 +2*pi/4.
172 +infinity +1*pi/4.
173
174 If either input is a NaN, return one of the NaNs in the input. (C F.9
175 11 and 13). (If the NaN is a signalling NaN, we return the "same" NaN
176 quieted.)
177
178 Otherwise:
179
180 If the rounding mode is round-to-nearest, return arctangent(x)
181 faithfully rounded.
182
183 Return a value in [-pi, +pi] (C 7.12.4.4 3). Note that this
184 prohibits returning correctly rounded values for -pi and +pi, since
185 pi rounded to a float lies outside that interval.
186
187 Not implemented: In other rounding modes, return arctangent(x)
188 possibly with slightly worse error, not necessarily honoring the
189 rounding mode (Ali Sazegari narrowing C F.9 10).
190
191 Exceptions:
192
193 Raise underflow for a denormal result (C F.9 7 and Draft Standard for
194 Floating-Point Arithmetic P754 Draft 1.2.5 9.5). If the input is the
195 smallest normal, underflow may or may not be raised. This is stricter
196 than the older 754 standard.
197
198 May or may not raise inexact, even if the result is exact (C F.9 8).
199
200 Raise invalid if the input is a signalling NaN (C 5.2.4.2.2 3, in spite
201 of C 4.2.1) but not if the input is a quiet NaN (C F.9 11).
202
203 May not raise exceptions otherwise (C F.9 9).
204
205 Properties:
206
207 We desire this routine to be monotonic, but that has not
208 been proven. (For atan2f, monotonicity would mean, if (x0, y0) and
209 (x1, y1) are in the same quadrant, then y0/x0 <= y1/x1 implies
210 atan2f(y0, x0) <= atan2f(y1, x1).)
211*/
212 .align 5
213 .globl _atan2f
214_atan2f:
215
216 cvtss2sd Argy, y // Convert to double precision.
217 cvtss2sd Argx, x
218
219 #if defined __i386__
220
221 // Get address of 0 in BaseP.
222 call 0f // Push program counter onto stack.
223 0:
224 pop BaseP // Get program counter.
225
226 #endif
227
228#define nx t0
229 movsd Address(nZero), nx
230 xorpd x, nx // Negate x.
231
232 ucomisd x, y
233 jae yGEx // If we jump, y >= x.
234 je Unordered // If we jump, an operand is a NaN.
235 // If we fall through, y < x.
236
237 ucomisd y, nx
238 jae yLTx_and_nxGEy // If we jump, y < x and -x >= y.
239 // If we fall through, -x < y < x.
240
241 // Return atand(y/x).
242 divsd x, y // Form y/x.
243 movsd y, x // Move to register used by Polynomial.
244 movsd Address(nZero), Base // Set Base to -0.
245 // This makes the return value -0 if y is -0 and x > 0.
246 jmp Polynomial // Return 0 + arctangent(y/x).
247
248
249yLTx_and_nxGEy: // Here y < x && y <= -x.
250 je yLTx_and_nxEQy // If we jump, y < x && -x == y.
251 // If we fall through, y < x < -y.
252
253 // Return -pi/2 - atand(x/y).
254 movsd Address(nZero), t0 // Get -0 for sign bit.
255 divsd y, x // Form x/y.
256 xorpd t0, x // Form -x/y.
257 movsd Address(nPi1v2), Base // Set Base to -pi/2.
258 jmp Polynomial // Return -pi/2 + arctangent(-x/y).
259
260
261yLTx_and_nxEQy: // Here -x == y < x.
262 // Return -1/4*pi with inexact exception.
263 cvtsd2ss Address(nPi1v4), y
264 jmp ReturnSingle
265
266
267yGEx: // Here y >= x.
268 je yEQx // If we jump, y == x.
269 // If we fall through, y > x.
270
271 ucomisd y, nx
272#undef nx
273 jae yGTx_and_nxGEy // If we jump, y > x && -x >= y.
274 // If we fall through, -y < x < y.
275
276 // Return +pi/2 - atand(x/y).
277 movsd Address(nZero), t0 // Get -0 for sign bit.
278 divsd y, x // Form x/y.
279 movsd Address(pPi1v2), Base // Set Base to +pi/2.
280 xorpd t0, x // Form -x/y.
281 jmp Polynomial // Return +pi/2 + arctangent(-x/y).
282
283
284yGTx_and_nxGEy: // Here y > x && -x >= y.
285 je yGTx_and_nxEQy // If we jump, y > x && -x == y.
286 // If we fall through, x < y < -x.
287
288 movsd Address(nZero), Base // Get mask for sign bit.
289 movapd Base, t0 // Copy mask.
290 andpd y, Base // Extract sign bit of y.
291 divsd x, y // Form y/x.
292 andnpd y, t0 // Take absolute value of quotient.
293 comisd Address(Threshold), t0 // Is quotient small?
294 jbe NearNegativeXAxis
295 orpd Address(pPi), Base // Set Base to pi with y's sign.
296 movsd y, x // Move to register used by Polynomial.
297 jmp Polynomial // Return +/- pi + arctangent(y/x).
298
299
300NearNegativeXAxis:
301 // Return -pi or +pi, matching the sign of y, rounded toward zero.
302 movsd Address(AlmostpPi), p0
303 xorpd Base, p0
304 jmp ReturnDouble
305
306
307yGTx_and_nxEQy: // Here x < y == -x.
308 // Return +3/4*pi with inexact exception.
309 cvtsd2ss Address(pPi3v4), y
310 jmp ReturnSingle
311
312
313yEQx: // Here y == x.
314
315 ucomisd Address(nZero), y
316 jae yEQx_and_yGE0 // If we jump, y == x && y >= 0.
317 // If we fall through, x == y < -x.
318
319 // Return -3/4*pi with inexact exception.
320 cvtsd2ss Address(nPi3v4), y
321 jmp ReturnSingle
322
323
324yEQx_and_yGE0: // Here y == x && y >= 0.
325 je yEQx_and_yEQ0 // If we jump, y == x && y == 0.
326 // If we fall through, -x < y == x.
327
328 // Return +1/4*pi with inexact exception.
329 cvtsd2ss Address(pPi1v4), y
330 jmp ReturnSingle
331
332
333yEQx_and_yEQ0: // Here y == x == 0.
334
335 /* Return:
336 x y atan2f(y, x)
337 -0 -0 -pi, with inexact exception.
338 -0 +0 +pi, with inexact exception.
339 +0 -0 -0.
340 +0 +0 +0.
341 */
342
343 /* We want to know if x is -0 or +0, but there is no direct test for that
344 that puts the results in a vector register. We do an arithmetic right
345 shift to fill up the exponent bits with copies of the sign bit. This
346 produces a NaN or +0. Then we test for "unordered", which yields all
347 one bits if x was -0 and all zero bits if x was +0.
348 */
349 psraw $12, x
350 cmpunordsd x, x
351
352 // Form (almost) pi if x was -0 and 0 if x was +0.
353 movsd Address(AlmostpPi), p0
354 andpd x, p0
355
356 orpd y, p0 // Apply the sign bit from y.
357 jmp ReturnDouble
358
359
360Unordered: // Here x or y is a NaN.
361 addsd x, y // Return one of the NaNs, quieted.
362 cvtsd2ss y, y
363 jmp ReturnSingle
364
365
366/* This is the principal arctangent evaluation. Previous code has prepared
367 the Base and y registers, and we need to calculate Base + arctangent(y).
368 The result is then converted to a single-precision number and returned.
369
370 -1 <= y <= +1. (Actually, -1 < y < +1. The equalities were sidetracked
371 during all the branching above, and division of two different
372 single-precision numbers converted to double-precision never rounds to
373 one.)
374
375 There are some slight inefficiencies here, in that special cases could omit
376 a few instructions -- sometimes the base is zero or y had to be negated to
377 fit this common code. So, if speed is all important, this routine might be
378 speeded up a little by replicating this code.
379*/
380Polynomial:
381
382/* The polynomial that approximates arctangent has been arranged into the
383 form:
384
385 x * c2
386 * ((x**4 + c01 * x**2 + c00))
387 * ((x**4 + c11 * x**2 + c10))
388 * ((x**4 + c21 * x**2 + c20))
389 * ((x**4 + c31 * x**2 + c30))
390
391 The coefficients are stored in pairs, with c01 and c21 at C01, c00 and c20
392 at C00, c11 and c31 at C11, and c10 and c30 at C10. c2 is at C2.
393
394 The quartic factors are evaluated in SIMD registers. For brevity, some
395 comments below describe only one element of a register. The other is
396 analagous.
397*/
398 movsd x, x1 // Save a copy of x for later.
399 movapd Address(C11), p1
400 mulsd x, x // Form x**2.
401#define x2 x // Define name describing current register contents.
402 movlhps x2, x2 // Duplicate x**2.
403 addpd x2, p1 // Form x**2 + c11.
404 mulpd x2, p1 // Form x**4 + c11 * x**2.
405 addpd Address(C10), p1 // Form x**4 + c11 * x**2 + c10.
406 movapd Address(C01), p0 // Get first coefficients.
407 addpd x2, p0 // Form x**2 + c01.
408 mulpd x2, p0 // Form x**4 + c01 * x**2.
409 movhpd Address(C2), x1 // Put c2 in one element, with x in other.
410 addpd Address(C00), p0 // Form x**4 + c01 * x**2 + c00.
411 mulpd p1, p0 // Combine factors.
412 mulpd x1, p0 // Multiply by x and c2.
413 movhlps p0, p1 // Get high element.
414 mulsd p1, p0 // Finish combining factors.
415#undef x2
416 addsd Base, p0
417
418// Return the double-precision number currently in p0.
419ReturnDouble:
420 cvtsd2ss p0, y // Convert result to single precision.
421
422// Return the single-precision number currently in p0.
423ReturnSingle:
424
425 #if defined __i386__
426 movss y, Argx // Shuttle result through memory.
427 // This uses the argument area for scratch space, which is allowed.
428 flds Argx // Return input on floating-point stack.
429 #else
430 // On x86_64, the return value is now in y, which is %xmm0.
431 #endif
432
433 ret