this repo has no description
at fixPythonPipStalling 433 lines 12 kB view raw
1/* atan2f.s -- atan2f for standard math library. 2 3 Written by Eric Postpischil, July 2007. 4*/ 5 6 7 .literal8 8 9// Define miscellaneous constants. 10 11Threshold: .double 2.384185791015625e-07 // 2**-22. 12nZero: .double -0 13 14pPi1v4: .double +.7853981633974483096156608 // 1/4 pi. 15nPi1v4: .double -.7853981633974483096156608 16pPi1v2: .double +1.570796326794896619231322 // 1/2 pi. 17nPi1v2: .double -1.570796326794896619231322 18pPi3v4: .double +2.356194490192344928846982 // 3/4 pi. 19nPi3v4: .double -2.356194490192344928846982 20 21/* Define values near +/- pi that yield +/- pi rounded toward zero when 22 converted to single precision. This allows us to generate inexact and 23 return the desired values for atan2f on and near the negative side of the x 24 axis. 25*/ 26AlmostpPi: .double +3.1415924 27 28// Define a coefficient for center polynomial (used for x in [-1, +1]). 29C2: .double 0.0029352921857004596570518 30 31 32 .const 33 .align 4 34 35/* Define some coefficients for center polynomial (used for x in [-1, +1]). 36 These are stored in pairs at aligned addresses for use in SIMD 37 instructions. 38*/ 39C01: .double 2.2971562298314633761676433, 0.0207432003007420961489920 40C00: .double 2.4449692297316409651126041, 3.7888879287802702842997915 41C11: .double -2.9466967515109826289085300, -4.9721072376211623038916292 42C10: .double 5.4728447324456990092824269, 6.7197076223592378022736307 43 44// This needs to be 16-byte aligned because it is used in an orpd instruction. 45 .align 4 46pPi: .double +3.141592653589793238462643 // pi. 47 48 49// Rename the general registers (just to make it easier to keep track of them). 50#if defined __i386__ 51 #define r0 %eax 52 #define r1 %ecx 53 #define r2 %edx 54 #define r3 %ebx 55 #define r4 %esp 56 #define r5 %ebp 57 #define r6 %esi 58 #define r7 %edi 59#elif defined __x86_64__ 60 #define r0 %rax 61 #define r1 %rcx 62 #define r2 %rdx 63 #define r3 %rbx 64 #define r4 %rsp 65 #define r5 %rbp 66 #define r6 %rsi 67 #define r7 %rdi 68#else 69 #error "Unknown architecture." 70#endif 71 72 73 .text 74 75 76// Define various symbols. 77 78#define BaseP r0 // Base address for position-independent addressing. 79 80#define y %xmm0 // Must be in %xmm0 for return on x86_64. 81#define x %xmm1 82#define p0 %xmm2 83#define x1 %xmm3 84#define t0 %xmm4 85#define Base %xmm5 86#define p1 %xmm6 87 88#if defined __i386__ 89 90 // Define locations of arguments. 91 #define Argy 4(%esp) 92 #define Argx 8(%esp) 93 94 // Define how to address data. BaseP must contain the address of label 0. 95 #define Address(label) label-0b(BaseP) 96 97#elif defined __x86_64__ 98 99 // Define locations of arguments. 100 #define Argx %xmm1 101 #define Argy %xmm0 102 103 // Define how to address data. 104 #define Address(label) label(%rip) 105 106#endif 107 108 109/* float atan2f(float x). 110 111 Notes: 112 113 This routine has not been proven to be correct. See the notes in the 114 accompanying C version regarding potential proof. The polynomial it 115 uses was proven to provide faithfully rounded results in atanf. atan2f 116 introduces additional error performing the division and additional 117 points used in the domain of the polynomial. 118 119 Citations in parentheses below indicate the source of a requirement. 120 121 "C" stands for ISO/IEC 9899:TC2. 122 123 The Open Group specification (IEEE Std 1003.1, 2004 edition) adds no 124 requirements since it defers to C and requires errno behavior only if 125 we choose to support it by arranging for "math_errhandling & 126 MATH_ERRNO" to be non-zero, which we do not. 127 128 Return value for atan2f(y, x) (C F.9.1 12 and C F.9.1.4): 129 130 y x atan2f(y, x) 131 132 -infinity -infinity -3*pi/4. 133 < 0 -2*pi/4. 134 -0 -2*pi/4. 135 +0 -2*pi/4. 136 > 0 -2*pi/4. 137 +infinity -1*pi/4. 138 139 < 0 -infinity -4*pi/4. 140 < 0 arctangent(y/x) in [-4*pi/4, -2*pi/4]. 141 -0 -2*pi/4. 142 +0 -2*pi/4. 143 > 0 arctangent(y/x) in [-2*pi/4, -0*pi/4]. 144 +infinity -0*pi/4. 145 146 -0 -infinity -4*pi/4. 147 < 0 -4*pi/4. 148 -0 -4*pi/4. 149 +0 -0*pi/4. 150 > 0 -0*pi/4. 151 +infinity -0*pi/4. 152 153 +0 -infinity +4*pi/4. 154 < 0 +4*pi/4. 155 -0 +4*pi/4. 156 +0 +0*pi/4. 157 > 0 +0*pi/4. 158 +infinity +0*pi/4. 159 160 > 0 -infinity +4*pi/4. 161 < 0 arctangent(y/x) in [+2*pi/4, +4*pi/4]. 162 -0 +2*pi/4. 163 +0 +2*pi/4. 164 > 0 arctangent(y/x) in [+0*pi/4, +2*pi/4]. 165 +infinity +0*pi/4. 166 167 +infinity -infinity +3*pi/4. 168 < 0 +2*pi/4. 169 -0 +2*pi/4. 170 +0 +2*pi/4. 171 > 0 +2*pi/4. 172 +infinity +1*pi/4. 173 174 If either input is a NaN, return one of the NaNs in the input. (C F.9 175 11 and 13). (If the NaN is a signalling NaN, we return the "same" NaN 176 quieted.) 177 178 Otherwise: 179 180 If the rounding mode is round-to-nearest, return arctangent(x) 181 faithfully rounded. 182 183 Return a value in [-pi, +pi] (C 7.12.4.4 3). Note that this 184 prohibits returning correctly rounded values for -pi and +pi, since 185 pi rounded to a float lies outside that interval. 186 187 Not implemented: In other rounding modes, return arctangent(x) 188 possibly with slightly worse error, not necessarily honoring the 189 rounding mode (Ali Sazegari narrowing C F.9 10). 190 191 Exceptions: 192 193 Raise underflow for a denormal result (C F.9 7 and Draft Standard for 194 Floating-Point Arithmetic P754 Draft 1.2.5 9.5). If the input is the 195 smallest normal, underflow may or may not be raised. This is stricter 196 than the older 754 standard. 197 198 May or may not raise inexact, even if the result is exact (C F.9 8). 199 200 Raise invalid if the input is a signalling NaN (C 5.2.4.2.2 3, in spite 201 of C 4.2.1) but not if the input is a quiet NaN (C F.9 11). 202 203 May not raise exceptions otherwise (C F.9 9). 204 205 Properties: 206 207 We desire this routine to be monotonic, but that has not 208 been proven. (For atan2f, monotonicity would mean, if (x0, y0) and 209 (x1, y1) are in the same quadrant, then y0/x0 <= y1/x1 implies 210 atan2f(y0, x0) <= atan2f(y1, x1).) 211*/ 212 .align 5 213 .globl _atan2f 214_atan2f: 215 216 cvtss2sd Argy, y // Convert to double precision. 217 cvtss2sd Argx, x 218 219 #if defined __i386__ 220 221 // Get address of 0 in BaseP. 222 call 0f // Push program counter onto stack. 223 0: 224 pop BaseP // Get program counter. 225 226 #endif 227 228#define nx t0 229 movsd Address(nZero), nx 230 xorpd x, nx // Negate x. 231 232 ucomisd x, y 233 jae yGEx // If we jump, y >= x. 234 je Unordered // If we jump, an operand is a NaN. 235 // If we fall through, y < x. 236 237 ucomisd y, nx 238 jae yLTx_and_nxGEy // If we jump, y < x and -x >= y. 239 // If we fall through, -x < y < x. 240 241 // Return atand(y/x). 242 divsd x, y // Form y/x. 243 movsd y, x // Move to register used by Polynomial. 244 movsd Address(nZero), Base // Set Base to -0. 245 // This makes the return value -0 if y is -0 and x > 0. 246 jmp Polynomial // Return 0 + arctangent(y/x). 247 248 249yLTx_and_nxGEy: // Here y < x && y <= -x. 250 je yLTx_and_nxEQy // If we jump, y < x && -x == y. 251 // If we fall through, y < x < -y. 252 253 // Return -pi/2 - atand(x/y). 254 movsd Address(nZero), t0 // Get -0 for sign bit. 255 divsd y, x // Form x/y. 256 xorpd t0, x // Form -x/y. 257 movsd Address(nPi1v2), Base // Set Base to -pi/2. 258 jmp Polynomial // Return -pi/2 + arctangent(-x/y). 259 260 261yLTx_and_nxEQy: // Here -x == y < x. 262 // Return -1/4*pi with inexact exception. 263 cvtsd2ss Address(nPi1v4), y 264 jmp ReturnSingle 265 266 267yGEx: // Here y >= x. 268 je yEQx // If we jump, y == x. 269 // If we fall through, y > x. 270 271 ucomisd y, nx 272#undef nx 273 jae yGTx_and_nxGEy // If we jump, y > x && -x >= y. 274 // If we fall through, -y < x < y. 275 276 // Return +pi/2 - atand(x/y). 277 movsd Address(nZero), t0 // Get -0 for sign bit. 278 divsd y, x // Form x/y. 279 movsd Address(pPi1v2), Base // Set Base to +pi/2. 280 xorpd t0, x // Form -x/y. 281 jmp Polynomial // Return +pi/2 + arctangent(-x/y). 282 283 284yGTx_and_nxGEy: // Here y > x && -x >= y. 285 je yGTx_and_nxEQy // If we jump, y > x && -x == y. 286 // If we fall through, x < y < -x. 287 288 movsd Address(nZero), Base // Get mask for sign bit. 289 movapd Base, t0 // Copy mask. 290 andpd y, Base // Extract sign bit of y. 291 divsd x, y // Form y/x. 292 andnpd y, t0 // Take absolute value of quotient. 293 comisd Address(Threshold), t0 // Is quotient small? 294 jbe NearNegativeXAxis 295 orpd Address(pPi), Base // Set Base to pi with y's sign. 296 movsd y, x // Move to register used by Polynomial. 297 jmp Polynomial // Return +/- pi + arctangent(y/x). 298 299 300NearNegativeXAxis: 301 // Return -pi or +pi, matching the sign of y, rounded toward zero. 302 movsd Address(AlmostpPi), p0 303 xorpd Base, p0 304 jmp ReturnDouble 305 306 307yGTx_and_nxEQy: // Here x < y == -x. 308 // Return +3/4*pi with inexact exception. 309 cvtsd2ss Address(pPi3v4), y 310 jmp ReturnSingle 311 312 313yEQx: // Here y == x. 314 315 ucomisd Address(nZero), y 316 jae yEQx_and_yGE0 // If we jump, y == x && y >= 0. 317 // If we fall through, x == y < -x. 318 319 // Return -3/4*pi with inexact exception. 320 cvtsd2ss Address(nPi3v4), y 321 jmp ReturnSingle 322 323 324yEQx_and_yGE0: // Here y == x && y >= 0. 325 je yEQx_and_yEQ0 // If we jump, y == x && y == 0. 326 // If we fall through, -x < y == x. 327 328 // Return +1/4*pi with inexact exception. 329 cvtsd2ss Address(pPi1v4), y 330 jmp ReturnSingle 331 332 333yEQx_and_yEQ0: // Here y == x == 0. 334 335 /* Return: 336 x y atan2f(y, x) 337 -0 -0 -pi, with inexact exception. 338 -0 +0 +pi, with inexact exception. 339 +0 -0 -0. 340 +0 +0 +0. 341 */ 342 343 /* We want to know if x is -0 or +0, but there is no direct test for that 344 that puts the results in a vector register. We do an arithmetic right 345 shift to fill up the exponent bits with copies of the sign bit. This 346 produces a NaN or +0. Then we test for "unordered", which yields all 347 one bits if x was -0 and all zero bits if x was +0. 348 */ 349 psraw $12, x 350 cmpunordsd x, x 351 352 // Form (almost) pi if x was -0 and 0 if x was +0. 353 movsd Address(AlmostpPi), p0 354 andpd x, p0 355 356 orpd y, p0 // Apply the sign bit from y. 357 jmp ReturnDouble 358 359 360Unordered: // Here x or y is a NaN. 361 addsd x, y // Return one of the NaNs, quieted. 362 cvtsd2ss y, y 363 jmp ReturnSingle 364 365 366/* This is the principal arctangent evaluation. Previous code has prepared 367 the Base and y registers, and we need to calculate Base + arctangent(y). 368 The result is then converted to a single-precision number and returned. 369 370 -1 <= y <= +1. (Actually, -1 < y < +1. The equalities were sidetracked 371 during all the branching above, and division of two different 372 single-precision numbers converted to double-precision never rounds to 373 one.) 374 375 There are some slight inefficiencies here, in that special cases could omit 376 a few instructions -- sometimes the base is zero or y had to be negated to 377 fit this common code. So, if speed is all important, this routine might be 378 speeded up a little by replicating this code. 379*/ 380Polynomial: 381 382/* The polynomial that approximates arctangent has been arranged into the 383 form: 384 385 x * c2 386 * ((x**4 + c01 * x**2 + c00)) 387 * ((x**4 + c11 * x**2 + c10)) 388 * ((x**4 + c21 * x**2 + c20)) 389 * ((x**4 + c31 * x**2 + c30)) 390 391 The coefficients are stored in pairs, with c01 and c21 at C01, c00 and c20 392 at C00, c11 and c31 at C11, and c10 and c30 at C10. c2 is at C2. 393 394 The quartic factors are evaluated in SIMD registers. For brevity, some 395 comments below describe only one element of a register. The other is 396 analagous. 397*/ 398 movsd x, x1 // Save a copy of x for later. 399 movapd Address(C11), p1 400 mulsd x, x // Form x**2. 401#define x2 x // Define name describing current register contents. 402 movlhps x2, x2 // Duplicate x**2. 403 addpd x2, p1 // Form x**2 + c11. 404 mulpd x2, p1 // Form x**4 + c11 * x**2. 405 addpd Address(C10), p1 // Form x**4 + c11 * x**2 + c10. 406 movapd Address(C01), p0 // Get first coefficients. 407 addpd x2, p0 // Form x**2 + c01. 408 mulpd x2, p0 // Form x**4 + c01 * x**2. 409 movhpd Address(C2), x1 // Put c2 in one element, with x in other. 410 addpd Address(C00), p0 // Form x**4 + c01 * x**2 + c00. 411 mulpd p1, p0 // Combine factors. 412 mulpd x1, p0 // Multiply by x and c2. 413 movhlps p0, p1 // Get high element. 414 mulsd p1, p0 // Finish combining factors. 415#undef x2 416 addsd Base, p0 417 418// Return the double-precision number currently in p0. 419ReturnDouble: 420 cvtsd2ss p0, y // Convert result to single precision. 421 422// Return the single-precision number currently in p0. 423ReturnSingle: 424 425 #if defined __i386__ 426 movss y, Argx // Shuttle result through memory. 427 // This uses the argument area for scratch space, which is allowed. 428 flds Argx // Return input on floating-point stack. 429 #else 430 // On x86_64, the return value is now in y, which is %xmm0. 431 #endif 432 433 ret