this repo has no description
at fixPythonPipStalling 474 lines 14 kB view raw
1/* acosf.s -- acosf for standard math library. 2 3 Written by Eric Postpischil, July 2007. 4*/ 5 6 7 .literal8 8 9// Define the points where our algorithm changes. 10nPoint: .double -.62000000476837158 11pPoint: .double +.62000000476837158 12 13// Define miscellaneous constants. 14nOne: .double -1 15pOne: .double +1 16HalfPi: .double 1.5707963267948966192313217 17Pi: .double 3.1415926535897932384626433 18 19// Define a coefficient for center polynomial (used for x in [-.62, +.62]). 20C2: .double -0.7009458946846486978642201e-1 21 22// Define coefficients for tail polynomial (used for x outside [-.62, +.62]). 23pT2: .double 0.1475254770791043897245664e-2 24nT2: .double -0.1475254770791043897245664e-2 25 26// Define coefficients for reciprocal-square-root refinement. 27S0: .double 4.999999720603234088021257968 28S1: .double -3.333333463718495862077843807 29 30 31 .const 32 .align 4 33 34/* Define some coefficients for center polynomial (used for x in [-.62, 35 +.62]). These are stored in pairs at aligned addresses for use in SIMD 36 instructions. 37*/ 38C1: .double +1.5714407739207253912688773, -1.5375796594506042231041545 39C0: .double +1.2521077245762256716713145, +1.8992998261460658570880250 40 41/* Define some coefficients for positive tail polynomial (used for x in (+.62, 42 1)). 43*/ 44T0: .double +14.7303701447222424478789710, +27.0927484663960477135094919 45T1: .double +2.7185038560132749878882294, -8.6073428141752546318770571 46 47 48// Rename the general registers (just to make it easier to keep track of them). 49#if defined __i386__ 50 #define r0 %eax 51 #define r1 %ecx 52 #define r2 %edx 53 #define r3 %ebx 54 #define r4 %esp 55 #define r5 %ebp 56 #define r6 %esi 57 #define r7 %edi 58#elif defined __x86_64__ 59 #define r0 %rax 60 #define r1 %rcx 61 #define r2 %rdx 62 #define r3 %rbx 63 #define r4 %rsp 64 #define r5 %rbp 65 #define r6 %rsi 66 #define r7 %rdi 67#else 68 #error "Unknown architecture." 69#endif 70 71 72 .text 73 74 75// Define various symbols. 76 77#define BaseP r0 // Base address for position-independent addressing. 78 79#define p %xmm0 // Must be in %xmm0 for return on x86_64. 80#define x %xmm1 81#define x1 %xmm2 82#define w x1 83#define pa %xmm3 84#define e %xmm4 85#define rss %xmm5 86 87#if defined __i386__ 88 89 // Define location of argument x. 90 #define Argx 4(%esp) 91 92 // Define how to address data. BaseP must contain the address of label 0. 93 #define Address(label) label-0b(BaseP) 94 95#elif defined __x86_64__ 96 97 // Define location of argument x. 98 #define Argx %xmm0 99 100 // Define how to address data. 101 #define Address(label) label(%rip) 102 103#endif 104 105 106/* float acosf(float x). 107 108 Notes: 109 110 Citations in parentheses below indicate the source of a requirement. 111 112 "C" stands for ISO/IEC 9899:TC2. 113 114 The Open Group specification (IEEE Std 1003.1, 2004 edition) adds no 115 requirements since it defers to C and requires errno behavior only if 116 we choose to support it by arranging for "math_errhandling & 117 MATH_ERRNO" to be non-zero, which we do not. 118 119 Return value: 120 121 For arcosine of 1, return +0 (C F.9.1.1). 122 123 For 1 < |x| (including infinity), return NaN (C F.9.1.1). 124 125 For a NaN, return the same NaN (C F.9 11 and 13). (If the NaN is a 126 signalling NaN, we return the "same" NaN quieted.) 127 128 Otherwise: 129 130 If the rounding mode is round-to-nearest, return arccosine(x) 131 faithfully rounded. 132 133 Return a value in [0, pi] (C 7.12.4.1 3). Note that this prohibits 134 returning a correctly rounded value for acosf(-1), since pi rounded 135 to a float lies outside that interval. 136 137 Not implemented: In other rounding modes, return arccosine(x) 138 possibly with slightly worse error, not necessarily honoring the 139 rounding mode (Ali Sazegari narrowing C F.9 10). 140 141 Exceptions: 142 143 May or may not raise inexact, even if the result is exact (C F.9 8). 144 145 Raise invalid if the input is a signalling NaN (C 5.2.4.2.2 3, in spite 146 of C 4.2.1) or 1 < |x| (including infinity) (C F.9.1.1) but not if the 147 input is a quiet NaN (C F.9 11). 148 149 May not raise exceptions otherwise (C F.9 9). 150 151 Properties: 152 153 Monotonic, proven by exhaustive testing. 154 155 Exhaustive testing proved this routine returns faithfully rounded 156 results. Since the rsqrtss instruction is specified to return a value 157 in an interval, tests were performed using each possible result, 158 showing that a valid result will be obtained regardless of which 159 value rsqrtss provides. 160*/ 161 .align 5 162#if !defined DevelopmentInstrumentation 163 // This is the regular name used in the deployed implementation. 164 .globl _acosf 165 _acosf: 166#else 167 // This is the name used for a special test version of the routine. 168 .globl _acosfInstrumented 169 _acosfInstrumented: 170#endif 171 172 cvtss2sd Argx, x // Convert x to double precision. 173 174 #if defined __i386__ 175 176 // Get address of 0 in BaseP. 177 call 0f // Push program counter onto stack. 178 0: 179 pop BaseP // Get program counter. 180 181 #endif 182 183/* We use different algorithms for different parts of the domain. There 184 is a "negative tail" from -1 to nPoint, a center from nPoint to pPoint, 185 and a "positive tail" from pPoint to +1. Here, we compare and branch 186 to the appropriate code. 187 188 There are also special cases: NaNs, x < -1, and 1 < x. These are weeded 189 out in PositiveTail or NegativeTail. 190*/ 191 192 ucomisd Address(pPoint), x 193 ja PositiveTail 194 195 ucomisd Address(nPoint), x 196 jb NegativeTail 197 198 199/* Here we have nPoint <= x <= pPoint. This is handled with a simple 200 evaluation of a polynomial that approximates arccosine. 201 202 The polynomial has been arranged into the form: 203 204 pi/2 - x + x * 205 c2 * (x**4 + c01 * x**2 + c00) 206 * x**2 * (x**4 + c11 * x**2 * c10)). 207 208 The coefficients c00 and c10 are stored in a pair at C0, and c01 and c11 209 are stored at C1. c2 is at C2. 210 211 The two quartic factors are evaluated in SIMD registers. For brevity, some 212 comments below describe only one element of a register. The other is 213 analagous. 214*/ 215 movsd x, x1 // Save a copy of x for later. 216 mulsd x, x // Form x**2. 217 movapd Address(C1), p // Get first coefficient pair. 218 unpcklpd x, x // Duplicate in SIMD register. 219 addpd x, p // Form x**2 + c1. 220 mulpd x, p // Form x**4 + c1 * x**2. 221 movlpd Address(C2), x // Put c2 in low element, with x**2 in high. 222 mulsd x1, x // Multiply by x. 223 addpd Address(C0), p // Form x**4 + c1 * x**2 + c0. 224 mulpd x, p // Multiply by c2*x in low and x**2 in high. 225 movhlps p, pa // Get high element. 226 subsd Address(HalfPi), x1 // Form x - pi/2. 227 mulsd pa, p // Multiply two factors. 228 subsd x1, p // Form pi/2 - x + x * c2 * ... 229 230// Return the double-precision number currently in p. 231ReturnDoubleInp: 232 cvtsd2ss p, p // Convert result to single precision. 233 234// Return the single-precision number currently in p. 235ReturnSingleInp: 236 237 #if defined __i386__ 238 movss p, Argx // Shuttle result through memory. 239 // This uses the argument area for scratch space, which is allowed. 240 flds Argx // Return input on floating-point stack. 241 #else 242 // On x86_64, the return value is now in p, which is %xmm0. 243 #endif 244 245 ret 246 247 248// Handle pPoint < x. 249PositiveTail: 250 251 movsd Address(pOne), w // Get +1 for math and comparison. 252 ucomisd w, x // Compare x to +1. 253 jae InputIsPositiveSpecial 254 255/* Here we have pPoint < x < 1. The algorithm here is inspired by the 256 identity arccosine(x) = 2 * arcsine(sqrt((1-x)/2)). Replacing arcsine with 257 an approximating polynomial would give an odd polynomial in sqrt((1-x)/2), 258 which is the same as sqrt(1-x) multiplied by some polynomial in x. So we 259 have: 260 261 arccosine(x) ~= sqrt(1-x) * t(x). 262 263 Unfortunately, the square-root instruction (rsqrtss) takes too long, so we 264 use the faster reciprocal-square-root-estimate instruction instead and 265 refine its estimate. Given an estimate e from the rsqrtss instruction, the 266 square root of 1-x is very nearly, e * (1-x) * s(e**2 * (1-x)). Let e2a 267 be e**2 * (1-x), so sqrt(1-x) is nearly e * a * s(e2a). The leading 268 coefficient of s, cl, has been removed (by dividing s by it) and multiplied 269 into the polynomial p above. That leaves: 270 271 sqrt(1-x) / cl ~= e * a * s(e2a)/cl. 272 273 s(e2a)/cl is e2a**2 + s1 * e2a + s0, where s1 and s0 have been stored at 274 labels S1 and S0, above. 275 276 t(x) has been arranged into the form: 277 278 t2 279 * (x**2 + t01 * x + t00) 280 * (x**2 + t11 * x + t10). 281 282 The two quadratic factors are evaluated in SIMD registers. For brevity, 283 some comments below describe only one element of a register. The other is 284 analagous. 285 286 So, our job here is to evaluate: 287 288 a = 1-x. 289 290 e = estimate of 1/sqrt(a). 291 292 e2a = e*e*a. 293 294 acosf(x) ~= e * a 295 * (e2a**2 + s1 * e2a + s0) 296 * t2 297 * (x**2 + t01 * x + t00) 298 * (x**2 + t11 * x + t10). 299*/ 300 301 subsd x, w // Form 1-x. 302 cvtsd2ss w, e // Convert to single precision for rsqrtss. 303 movapd Address(T1), p // Start asinf polynomial. 304 unpcklpd x, x // Duplicate x. 305 addpd x, p // Form x + t1. 306 #if !defined DevelopmentInstrumentation 307 // This is the regular code used in the deployed implementation. 308 rsqrtss e, e // Estimate 1/sqrt(1-x). 309 #else 310 /* This instruction uses an estimate of 1/sqrt(1-x) passed by the 311 caller rather than the rsqrtss instruction. This allows us to test 312 the implementation with all values that rsqrtss might return. 313 */ 314 movss 8(%esp), e // Use caller's estimate of 1/sqrt(1-x). 315 #endif 316 cvtss2sd e, e // Convert to double precision. 317 mulpd x, p // Form x**2 + t1*x. 318 mulsd e, w // Form e * (1-x). 319 mulsd w, e // Form e**2 * (1-x). 320 // "e" in comments refers to the initial estimate from rsqrtss. 321 movhpd Address(pT2), w // Copy coefficient into high element. 322 addpd Address(T0), p // Form x**2 + t1*x + t0. 323 movsd Address(S1), rss // Form s1. 324 addsd e, rss // Form e**2 * (1-x) + s1. 325 mulpd w, p // Form e * (1-x) * p(x), split. 326 mulsd e, rss // Form e2a**2 + s1 * e2a. 327 movhlps p, pa // Separate high element. 328 addsd Address(S0), rss // Form e2a**2 + s1 * e2a + s0. 329 mulsd pa, p // Finish e * (1-x) * p(x). 330 mulsd rss, p // Form e * (1-x) / sqrt(1-x) * p(x). 331 332 jmp ReturnDoubleInp 333 334 335// Handle x < nPoint. 336NegativeTail: 337 338 movsd Address(pOne), w // Get +1 for math. 339 ucomisd Address(nOne), x // Compare x to -1. 340 jbe InputIsNegativeSpecial 341 342/* Here we have -1 < x < nPoint. We use the same algorithm as in PositiveTail 343 but adapted for -x. 344 345 For brevity, some comments below describe only one element of a register. 346 The other is analagous. 347 348 Our job here is to evaluate: 349 350 a = 1+x. 351 352 e = estimate of 1/sqrt(a). 353 354 e2a = e*e*a. 355 356 acosf(x) ~= pi - e * a 357 * (e2a**2 + s1 * e2a + s0) 358 * t2 359 * (x**2 - t01 * x + t00) 360 * (x**2 - t11 * x + t10). 361 362 Note that the signs of terms involving x are negated from those in 363 PositiveTail. 364 365 For convenience, the final two factors are negated: 366 367 * (-x**2 + t01 * x - t00) 368 * (-x**2 + t11 * x - t10). 369*/ 370 371 addsd x, w // Form 1+x. 372 cvtsd2ss w, e // Convert to single precision for rsqrtss. 373 movapd Address(T1), p // Start asinf polynomial. 374 unpcklpd x, x // Duplicate x. 375 subpd x, p // Form -x + t1. 376 #if !defined DevelopmentInstrumentation 377 // This is the regular code used in the deployed implementation. 378 rsqrtss e, e // Estimate 1/sqrt(1+x). 379 #else 380 /* This instruction uses an estimate of 1/sqrt(1+x) passed by the 381 caller rather than the rsqrtss instruction. This allows us to test 382 the implementation with all values that rsqrtss might return. 383 */ 384 movss 8(%esp), e // Use caller's estimate of 1/sqrt(1+x). 385 #endif 386 cvtss2sd e, e // Convert to double precision. 387 mulpd x, p // Form -x**2 + t1*x. 388 mulsd e, w // Form e * (1+x). 389 mulsd w, e // Form e**2 * (1+x). 390 // "e" in comments refers to the initial estimate from rsqrtss. 391 movhpd Address(nT2), w // Copy coefficient into high element. 392 subpd Address(T0), p // Form -x**2 + t1*x - t0. 393 movsd Address(S1), rss // Form s1. 394 addsd e, rss // Form e**2 * (1+x) + s1. 395 mulpd w, p // Form e * (1+x) * p(x), split. 396 mulsd e, rss // Form e2a**2 + s1 * e2a. 397 movhlps p, pa // Separate high element. 398 addsd Address(S0), rss // Form e2a**2 + s1 * e2a + s0. 399 mulsd pa, p // Finish e * (1+x) * p(x). 400 mulsd rss, p // Form e * (1+x) / sqrt(1+x) * p(x). 401 addsd Address(Pi), p // Form pi - 2*asin(sqrt((1+x)/2)). 402 403 jmp ReturnDoubleInp 404 405 406/* Here we handle inputs greater than or equal to one, including infinity, 407 but not including NaNs. The condition code must be set to indicate equal 408 (zero flag is one) iff the input is one. 409*/ 410InputIsPositiveSpecial: 411 je InputIsPositiveOne 412 jmp SignalInvalid 413 414 415/* Here we handle inputs less than or equal to -1, including -infinity, and 416 NaNs. The condition code must be set as if by using ucomisd to compare 417 the input in the "source operand" to -1 in the "destination operand". 418*/ 419InputIsNegativeSpecial: 420 jp InputIsNaN 421 je InputIsNegativeOne 422 423 // Continue into SignalInvalid. 424// jmp SignalInvalid 425 426 427/* Here we handle inputs outside the domain of the function. We subtract 428 infinity from itself to generate the invalid signal and return a NaN. 429*/ 430 431 .literal4 432Infinity: .long 0x7f800000 433 434 .text 435SignalInvalid: 436 movss Address(Infinity), p 437 subss p, p // Generate invalid signal and NaN value. 438 jmp ReturnSingleInp 439 440 441/* Here we handle an input of +1 or -1. arccosine(-1) is pi, which increases 442 when rounded to single precision. However, we are required to return 443 results in [0, pi], so we return pi rounded down. 444*/ 445 446 .literal8 447/* Define a value near pi that yields pi rounded down when converted to 448 single precision. This allows us to generate inexact and return the 449 desired value for acosf(-1). 450*/ 451AlmostPi: .double 3.1415925 452 453 .text 454InputIsPositiveOne: 455 xorps p, p // Return zero. 456 jmp ReturnSingleInp 457 458InputIsNegativeOne: 459 cvtsd2ss Address(AlmostPi), p 460 jmp ReturnSingleInp 461 462 463InputIsNaN: 464 465 #if defined __i386__ 466 flds Argx // Return result on floating-point stack. 467 #else 468 cvtsd2ss x, %xmm0 // Return in %xmm0. 469 /* We cannot just return the input, because we must quiet a 470 signalling NaN. 471 */ 472 #endif 473 474 ret