this repo has no description
at fixPythonPipStalling 369 lines 11 kB view raw
1/* atanf.s -- atanf for standard math library. 2 3 Written by Eric Postpischil, July 2007. 4*/ 5 6 7 .literal8 8 9// Define the points where our algorithm changes. 10nPoint0: .double -1 11pPoint0: .double +1 12nPoint1: .double -16777216 13pPoint1: .double +16777216 14 15// Define miscellaneous constants. 16nHalfPi: .double -1.5707963267948966192313217 17One: .double 1 18pHalfPi: .double +1.5707963267948966192313217 19 20// Define a coefficient for center polynomial (used for x in [-1, +1]). 21C2: .double 0.0029352921857004596570518 22 23 24 .const 25 .align 4 26 27/* Define some coefficients for center polynomial (used for x in [-1, +1]). 28 These are stored in pairs at aligned addresses for use in SIMD 29 instructions. 30*/ 31C01: .double 2.2971562298314633761676433, 0.0207432003007420961489920 32C00: .double 2.4449692297316409651126041, 3.7888879287802702842997915 33C11: .double -2.9466967515109826289085300, -4.9721072376211623038916292 34C10: .double 5.4728447324456990092824269, 6.7197076223592378022736307 35 36// Define coefficients for the tail polynomial (used for x outside [-1, +1]). 37T01: .double 0.9193672354545696501477531, -0.0052035944094405570566862 38T00: .double 0.3992772987220534996563340, 0.2521268658714555740707959 39T11: .double -0.5273186542866779494437308, -0.7201738803584184183894208 40T10: .double 0.1730466268612773143731748, 0.1408679409162453515360961 41 42 43// Rename the general registers (just to make it easier to keep track of them). 44#if defined __i386__ 45 #define r0 %eax 46 #define r1 %ecx 47 #define r2 %edx 48 #define r3 %ebx 49 #define r4 %esp 50 #define r5 %ebp 51 #define r6 %esi 52 #define r7 %edi 53#elif defined __x86_64__ 54 #define r0 %rax 55 #define r1 %rcx 56 #define r2 %rdx 57 #define r3 %rbx 58 #define r4 %rsp 59 #define r5 %rbp 60 #define r6 %rsi 61 #define r7 %rdi 62#else 63 #error "Unknown architecture." 64#endif 65 66 67 .text 68 69 70// Define various symbols. 71 72#define BaseP r0 // Base address for position-independent addressing. 73 74#define p0 %xmm0 // Must be in %xmm0 for return on x86_64. 75#define x %xmm1 76#define p1 %xmm2 77#define x1 %xmm3 78#define v0 %xmm4 79#define t0 %xmm5 80 81#if defined __i386__ 82 83 // Define location of argument x. 84 #define Argx 4(%esp) 85 86 // Define how to address data. BaseP must contain the address of label 0. 87 #define Address(label) label-0b(BaseP) 88 89#elif defined __x86_64__ 90 91 // Define location of argument x. 92 #define Argx %xmm0 93 94 // Define how to address data. 95 #define Address(label) label(%rip) 96 97#endif 98 99 100/* float atanf(float x). 101 102 Notes: 103 104 Citations in parentheses below indicate the source of a requirement. 105 106 "C" stands for ISO/IEC 9899:TC2. 107 108 The Open Group specification (IEEE Std 1003.1, 2004 edition) adds no 109 requirements since it defers to C and requires errno behavior only if 110 we choose to support it by arranging for "math_errhandling & 111 MATH_ERRNO" to be non-zero, which we do not. 112 113 Return value: 114 115 For arctangent of +/- zero, return zero with same sign (C F.9 12 and 116 F.9.1.3). 117 118 For arctangent of +/- infinity, return +/- pi/2 (C F.9.1.3). 119 120 For a NaN, return the same NaN (C F.9 11 and 13). (If the NaN is a 121 signalling NaN, we return the "same" NaN quieted.) 122 123 Otherwise: 124 125 If the rounding mode is round-to-nearest, return arctangent(x) 126 faithfully rounded. 127 128 Return a value in [-pi/2, +pi/2] (C 7.12.4.3 3). Note that this 129 prohibits returning correctly rounded values for atanf of large 130 positive or negative arguments, since pi/2 rounded to a float lies 131 outside that interval. 132 133 Not implemented: In other rounding modes, return arctangent(x) 134 possibly with slightly worse error, not necessarily honoring the 135 rounding mode (Ali Sazegari narrowing C F.9 10). 136 137 Exceptions: 138 139 Raise underflow for a denormal result (C F.9 7 and Draft Standard for 140 Floating-Point Arithmetic P754 Draft 1.2.5 9.5). If the input is the 141 smallest normal, underflow may or may not be raised. This is stricter 142 than the older 754 standard. 143 144 May or may not raise inexact, even if the result is exact (C F.9 8). 145 146 Raise invalid if the input is a signalling NaN (C 5.2.4.2.2 3, in spite 147 of C 4.2.1) but not if the input is a quiet NaN (C F.9 11). 148 149 May not raise exceptions otherwise (C F.9 9). 150 151 Properties: 152 153 Monotonic. 154 155 Exhaustive testing proved this routine returns faithfully rounded 156 results. 157*/ 158 .align 5 159#if !defined DevelopmentInstrumentation 160 // This is the regular name used in the deployed implementation. 161 .globl _atanf 162 _atanf: 163#else 164 // This is the name used for a special test version of the routine. 165 .globl _atanfInstrumented 166 _atanfInstrumented: 167#endif 168 169 cvtss2sd Argx, x // Convert x to double precision. 170 171 #if defined __i386__ 172 173 // Get address of 0 in BaseP. 174 call 0f // Push program counter onto stack. 175 0: 176 pop BaseP // Get program counter. 177 178 #endif 179 180/* We use different algorithms for different parts of the domain. There 181 is a "negative tail" from -infinity to nPoint0, a center from nPoint0 to 182 pPoint0, and a "positive tail" from pPoint0 to +infinity. Here, we compare 183 and branch to the appropriate code. 184 185 NaNs are weeded out in PositiveTail or NegativeTail. 186*/ 187 188 ucomisd Address(pPoint0), x 189 ja PositiveTail 190 191 ucomisd Address(nPoint0), x 192 jb NegativeTail 193 194 195/* Here we have nPoint0 <= x <= pPoint0. This is handled with a simple 196 evaluation of a polynomial that approximates arctangent. 197 198 The polynomial has been arranged into the form: 199 200 x * c2 201 * ((x**4 + c01 * x**2 + c00)) 202 * ((x**4 + c11 * x**2 + c10)) 203 * ((x**4 + c21 * x**2 + c20)) 204 * ((x**4 + c31 * x**2 + c30)) 205 206 The coefficients are stored in pairs, with c01 and c21 at C01, c00 and c20 207 at C00, c11 and c31 at C11, and c10 and c30 at C10. c2 is at C2. 208 209 The quartic factors are evaluated in SIMD registers. For brevity, some 210 comments below describe only one element of a register. The other is 211 analagous. 212*/ 213 movlhps x, x1 // Save a copy of x for later. 214 mulsd x, x // Form x**2. 215#define x2 x // Define name describing current register contents. 216 movlhps x2, x2 // Duplicate x**2. 217 movapd Address(C11), p1 218 addpd x2, p1 // Form x**2 + c11. 219 mulpd x2, p1 // Form x**4 + c11 * x**2. 220 addpd Address(C10), p1 // Form x**4 + c11 * x**2 + c10. 221 movapd Address(C01), p0 // Get first coefficients. 222 movlpd Address(C2), x1 // Put c2 in one element, with x in other. 223 addpd x2, p0 // Form x**2 + c01. 224 mulpd x2, p0 // Form x**4 + c01 * x**2. 225 addpd Address(C00), p0 // Form x**4 + c01 * x**2 + c00. 226 mulpd p1, p0 // Combine factors. 227 mulpd x1, p0 // Multiply by x and c2. 228 movhlps p0, p1 // Get high element. 229 mulsd p1, p0 // Finish combining factors. 230#undef x2 231 232// Return the double-precision number currently in p0. 233ReturnDoubleInp0: 234 cvtsd2ss p0, p0 // Convert result to single precision. 235 236// Return the single-precision number currently in p0. 237ReturnSingleInp0: 238 239 #if defined __i386__ 240 movss p0, Argx // Shuttle result through memory. 241 // This uses the argument area for scratch space, which is allowed. 242 flds Argx // Return input on floating-point stack. 243 #else 244 // On x86_64, the return value is now in p0, which is %xmm0. 245 #endif 246 247 ret 248 249 250// Handle pPoint0 < x. 251PositiveTail: 252 253 ucomisd Address(pPoint1), x // Compare x to algorithm change point. 254 jae InputIsPositiveSpecial 255 256 movsd Address(pHalfPi), p0 // Prepare +pi/2. 257 258CommonTail: 259 260/* Here we have pPoint0 < x < pPoint1. The algorithm here is inspired by the 261 identity (for positive x) arctangent(x) = +pi/2 - arctangent(1/x). This 262 is approximated by evaluating: 263 264 +pi/2 - 265 * ((x**4 + t01 * x**2 + t00)) 266 * ((x**4 + t11 * x**2 + t10)) 267 * ((x**4 + t21 * x**2 + t20)) 268 * ((x**4 + t31 * x**2 + t30)) 269 * (1/x) * (1/x)**16. 270 271 The quartic factors are evaluated in SIMD registers. For brevity, some 272 comments below describe only one element of a register. The other is 273 analagous. 274*/ 275 276 movsd Address(One), v0 // Get constant. 277 divsd x, v0 // Form 1/x, which we will call r. 278 /* I tried rcpss, but using it requires a precision conversion, two 279 adds, and three multiplies. divsd performance is not bad on Core2, 280 and I was not able to get the rcpss version to run as fast. 281 */ 282 mulsd x, x // Form x**2. 283#define x2 x // Define name describing current register contents. 284 movlhps v0, v0 // Copy finished reciprocal to high element. 285 movlhps x2, x2 // Duplicate x**2. 286 mulsd v0, v0 // Form r**2. 287 mulsd v0, v0 // Form r**4. 288 mulsd v0, v0 // Form r**8. 289 movapd Address(T11), p1 // Get first coefficients. 290 mulsd v0, v0 // Form r**16. 291 292 movapd Address(T01), t0 // Get first coefficients. 293 addpd x2, p1 // Form x**2 + t11. 294 mulpd x2, p1 // Form x**4 + t11 * x**2. 295 addpd Address(T10), p1 // Form x**4 + t11 * x**2 + t10. 296 addpd x2, t0 // Form x**2 + t01. 297 mulpd x2, t0 // Form x**4 + t01 * x**2. 298 addpd Address(T00), t0 // Form x**4 + t01 * x**2 + t00. 299 mulpd p1, t0 // Combine factors. 300 mulpd v0, t0 // Multiply by r**16 and r. 301 movhlps t0, p1 // Get high element. 302 mulsd p1, t0 // Finish combining factors. 303 subsd t0, p0 // Subtract from pi/2. 304#undef x2 305 306 jmp ReturnDoubleInp0 307 308 309// Handle x < nPoint0. 310NegativeTail: 311 312 ucomisd Address(nPoint1), x // Compare x to algorithm change point. 313 jbe InputIsNegativeSpecial 314 315 movsd Address(nHalfPi), p0 // Prepare -pi/2. 316 317 jmp CommonTail 318 319 320 .literal8 321/* Define values near +/- pi/2 that yield +/- pi/2 rounded toward zero when 322 converted to single precision. This allows us to generate inexact and 323 return the desired values for atanf of big positive and negative numbers. 324*/ 325AlmostpHalfPi: .double +1.5707962 326AlmostnHalfPi: .double -1.5707962 327 328 329 .text 330 331/* Here we handle inputs greater or equal to pPoint1, including infinity, 332 but not including NaNs. 333*/ 334InputIsPositiveSpecial: 335 336 /* The input is a big positive number. Return +pi/2 rounded toward zero 337 and generate an inexact exception. 338 */ 339 cvtsd2ss Address(AlmostpHalfPi), p0 340 jmp ReturnSingleInp0 341 342 343/* Here we handle inputs less than or equal to -1, including -infinity and 344 NaNs. The condition code must be set as if by using ucomisd to compare the 345 input in the "source operand" to nPoint1 in the "destination operand". 346*/ 347InputIsNegativeSpecial: 348 349 jp InputIsNaN 350 351 /* The input is a big negative number. Return -pi/2 rounded toward zero 352 and generate an inexact exception. 353 */ 354 cvtsd2ss Address(AlmostnHalfPi), p0 355 jmp ReturnSingleInp0 356 357 358InputIsNaN: 359 360 #if defined __i386__ 361 flds Argx // Return result on floating-point stack. 362 #else 363 cvtsd2ss x, %xmm0 // Return in %xmm0. 364 /* We cannot just return the input, because we must quiet a 365 signalling NaN. 366 */ 367 #endif 368 369 ret