this repo has no description
at fixPythonPipStalling 480 lines 14 kB view raw
1/* asinf.s -- asinf for standard math library. 2 3 Written by Eric Postpischil, June 2007. 4*/ 5 6 7 .literal8 8 9// Define the points where our algorithm changes. 10nPoint: .double -.57 11pPoint: .double +.57 12 13// Define miscellaneous constants. 14nOne: .double -1 15pOne: .double +1 16HalfPi: .double 1.5707963267948966192313217 17 18// Define a coefficient for center polynomial (used for x in [-.57, +.57]). 19C2: .double 0.5175137818992348748214025e-1 20 21// Define coefficients for tail polynomials (used for x outside [-.57, +.57]). 22pT2: .double -0.1651550997063622696638513e-2 23nT2: .double +0.1651550997063622696638513e-2 24 25// Define coefficients for reciprocal-square-root refinement. 26S0: .double 4.999999720603234088021257968 27S1: .double -3.333333463718495862077843807 28 29 30 .const 31 .align 4 32 33/* Define some coefficients for center polynomial (used for x in [-.57, 34 +.57]). These are stored in pairs at aligned addresses for use in SIMD 35 instructions. 36*/ 37C1: .double 1.8337275054369541825679091, -1.4826841272512669686823077 38C0: .double 1.5667812996782509912266781, 2.0555474653288563095606975 39 40// Define some coefficients for tail polynomial (used for x in (+.57, 1)). 41T0: .double +14.0375337010427139350610970, +25.3978891245023107522273720 42T1: .double +2.7174503090559850822332204, -8.3122365980471962999776635 43 44 45// Rename the general registers (just to make it easier to keep track of them). 46#if defined __i386__ 47 #define r0 %eax 48 #define r1 %ecx 49 #define r2 %edx 50 #define r3 %ebx 51 #define r4 %esp 52 #define r5 %ebp 53 #define r6 %esi 54 #define r7 %edi 55#elif defined __x86_64__ 56 #define r0 %rax 57 #define r1 %rcx 58 #define r2 %rdx 59 #define r3 %rbx 60 #define r4 %rsp 61 #define r5 %rbp 62 #define r6 %rsi 63 #define r7 %rdi 64#else 65 #error "Unknown architecture." 66#endif 67 68 69 .text 70 71 72// Define various symbols. 73 74#define BaseP r0 // Base address for position-independent addressing. 75 76#define p %xmm0 // Must be in %xmm0 for return on x86_64. 77#define x %xmm1 78#define x1 %xmm2 79#define w x1 80#define pa %xmm3 81#define e %xmm4 82#define rss %xmm5 83 84#if defined __i386__ 85 86 // Define location of argument x. 87 #define Argx 4(%esp) 88 89 // Define how to address data. BaseP must contain the address of label 0. 90 #define Address(label) label-0b(BaseP) 91 92#elif defined __x86_64__ 93 94 // Define location of argument x. 95 #define Argx %xmm0 96 97 // Define how to address data. 98 #define Address(label) label(%rip) 99 100#endif 101 102 103/* float asinf(float x). 104 105 Notes: 106 107 Citations in parentheses below indicate the source of a requirement. 108 109 "C" stands for ISO/IEC 9899:TC2. 110 111 The Open Group specification (IEEE Std 1003.1, 2004 edition) adds no 112 requirements since it defers to C and requires errno behavior only if 113 we choose to support it by arranging for "math_errhandling & 114 MATH_ERRNO" to be non-zero, which we do not. 115 116 Return value: 117 118 For arcsine of +/- zero, return zero with same sign (C F.9 12 and 119 F.9.1.2). 120 121 For 1 < |x| (including infinity), return NaN (C F.9.1.2). 122 123 For a NaN, return the same NaN (C F.9 11 and 13). (If the NaN is a 124 signalling NaN, we return the "same" NaN quieted.) 125 126 Otherwise: 127 128 If the rounding mode is round-to-nearest, return arcsine(x) 129 faithfully rounded. 130 131 Return a value in [-pi/2, +pi/2] (C 7.12.4.2 3). Note that this 132 prohibits returning correctly rounded values for asinf(-1) and 133 asinf(+1), since pi/2 rounded to a float lies outside that 134 interval. 135 136 Not implemented: In other rounding modes, return arcsine(x) 137 possibly with slightly worse error, not necessarily honoring the 138 rounding mode (Ali Sazegari narrowing C F.9 10). 139 140 Exceptions: 141 142 Raise underflow for a denormal result (C F.9 7 and Draft Standard for 143 Floating-Point Arithmetic P754 Draft 1.2.5 9.5). If the input is the 144 smallest normal, underflow may or may not be raised. This is stricter 145 than the older 754 standard. 146 147 May or may not raise inexact, even if the result is exact (C F.9 8). 148 149 Raise invalid if the input is a signalling NaN (C 5.2.4.2.2 3, in spite 150 of C 4.2.1) or 1 < |x| (including infinity) (C F.9.1.2) but not if the 151 input is a quiet NaN (C F.9 11). 152 153 May not raise exceptions otherwise (C F.9 9). 154 155 Properties: 156 157 Monotonic, proven by exhaustive testing. 158 159 Exhaustive testing proved this routine returns faithfully rounded 160 results. Since the rsqrtss instruction is specified to return a value 161 in an interval, tests were performed using each possible result, 162 showing that a valid result will be obtained regardless of which 163 value rsqrtss provides. 164*/ 165 .align 5 166#if !defined DevelopmentInstrumentation 167 // This is the regular name used in the deployed implementation. 168 .globl _asinf 169 _asinf: 170#else 171 // This is the name used for a special test version of the routine. 172 .globl _asinfInstrumented 173 _asinfInstrumented: 174#endif 175 176 cvtss2sd Argx, x // Convert x to double precision. 177 178 #if defined __i386__ 179 180 // Get address of 0 in BaseP. 181 call 0f // Push program counter onto stack. 182 0: 183 pop BaseP // Get program counter. 184 185 #endif 186 187/* We use different algorithms for different parts of the domain. There 188 is a "negative tail" from -1 to nPoint, a center from nPoint to pPoint, 189 and a "positive tail" from pPoint to +1. Here, we compare and branch 190 to the appropriate code. 191 192 There are also special cases: NaNs, x < -1, and 1 < x. These are weeded 193 out in PositiveTail or NegativeTail. 194*/ 195 196 ucomisd Address(pPoint), x 197 ja PositiveTail 198 199 ucomisd Address(nPoint), x 200 jb NegativeTail 201 202 203/* Here we have nPoint <= x <= pPoint. This is handled with a simple 204 evaluation of a polynomial that approximates arcsine. 205 206 The polynomial has been arranged into the form: 207 208 x + x * 209 c2 * (x**4 + c01 * x**2 + c00) 210 * x**2 * (x**4 + c11 * x**2 * c10). 211 212 The coefficients c00 and c10 are stored in a pair at C0, and c01 and c11 213 are stored at C1. c2 is at C2. 214 215 The two quartic factors are evaluated in SIMD registers. For brevity, some 216 comments below describe only one element of a register. The other is 217 analagous. 218*/ 219 movsd x, x1 // Save a copy of x for later. 220 mulsd x, x // Form x**2. 221 movapd Address(C1), p // Get first coefficient pair. 222 unpcklpd x, x // Duplicate in SIMD register. 223 addpd x, p // Form x**2 + c1. 224 mulpd x, p // Form x**4 + c1 * x**2. 225 movlpd Address(C2), x // Put c2 in low element, with x**2 in high. 226 mulsd x1, x // Multiply by x. 227 addpd Address(C0), p // Form x**4 + c1 * x**2 + c0. 228 mulpd x, p // Multiply by c2*x in low and x**2 in high. 229 movhlps p, pa // Get high element. 230 mulsd pa, p // Multiply two factors. 231 addsd x1, p // Add final term. 232 233// Return the double-precision number currently in p. 234ReturnDoubleInp: 235 cvtsd2ss p, p // Convert result to single precision. 236 237// Return the single-precision number currently in p. 238ReturnSingleInp: 239 240 #if defined __i386__ 241 movss p, Argx // Shuttle result through memory. 242 // This uses the argument area for scratch space, which is allowed. 243 flds Argx // Return input on floating-point stack. 244 #else 245 // On x86_64, the return value is now in p, which is %xmm0. 246 #endif 247 248 ret 249 250 251// Handle pPoint < x. 252PositiveTail: 253 254 movsd Address(pOne), w // Get +1 for math and comparison. 255 ucomisd w, x // Compare x to +1. 256 jae InputIsPositiveSpecial 257 258/* Here we have pPoint < x < 1. The algorithm here is inspired by the 259 identity arcsine(x) = pi/2 - 2 * arcsine(sqrt((1-x)/2)). Replacing arcsine 260 with an approximating polynomial would give an odd polynomial in 261 sqrt((1-x)/2), which is the same as sqrt(1-x) multiplied by some polynomial 262 in x. So we have: 263 264 arcsine(x) ~= pi/2 + sqrt(1-x) * t(x). 265 266 Unfortunately, the square-root instruction (rsqrtss) takes too long, so we 267 use the faster reciprocal-square-root-estimate instruction instead and 268 refine its estimate. Given an estimate e from the rsqrtss instruction, the 269 square root of 1-x is very nearly, e * (1-x) * s(e**2 * (1-x)). Let e2a 270 be e**2 * (1-x), so sqrt(1-x) is nearly e * a * s(e2a). The leading 271 coefficient of s, cl, has been removed (by dividing s by it) and multiplied 272 into the polynomial p above. That leaves: 273 274 sqrt(1-x) / cl ~= e * a * s(e2a)/cl. 275 276 s(e2a)/cl is e2a**2 + s1 * e2a + s0, where s1 and s0 have been stored at 277 labels S1 and S0, above. 278 279 t(x) has been arranged into the form: 280 281 t2 282 * (x**2 + t01 * x + t00) 283 * (x**2 + t11 * x + t10). 284 285 The two quadratic factors are evaluated in SIMD registers. For brevity, 286 some comments below describe only one element of a register. The other is 287 analagous. 288 289 So, our job here is to evaluate: 290 291 a = 1-x. 292 293 e = estimate of 1/sqrt(a). 294 295 e2a = e*e*a. 296 297 asinf(x) ~= pi/2 + e * a 298 * (e2a**2 + s1 * e2a + s0) 299 * t2 300 * (x**2 + t01 * x + t00) 301 * (x**2 + t11 * x + t10). 302*/ 303 304 subsd x, w // Form 1-x. 305 cvtsd2ss w, e // Convert to single precision for rsqrtss. 306 movapd Address(T1), p // Start asinf polynomial. 307 unpcklpd x, x // Duplicate x. 308 addpd x, p // Form x + t1. 309 #if !defined DevelopmentInstrumentation 310 // This is the regular code used in the deployed implementation. 311 rsqrtss e, e // Estimate 1/sqrt(1-x). 312 #else 313 /* This instruction uses an estimate of 1/sqrt(1-x) passed by the 314 caller rather than the rsqrtss instruction. This allows us to test 315 the implementation with all values that rsqrtss might return. 316 */ 317 movss 8(%esp), e // Use caller's estimate of 1/sqrt(1-x). 318 #endif 319 cvtss2sd e, e // Convert to double precision. 320 mulpd x, p // Form x**2 + t1*x. 321 mulsd e, w // Form e * (1-x). 322 mulsd w, e // Form e**2 * (1-x). 323 // "e" in comments refers to the initial estimate from rsqrtss. 324 movhpd Address(pT2), w // Copy coefficient into high element. 325 addpd Address(T0), p // Form x**2 + t1*x + t0. 326 movsd Address(S1), rss // Form s1. 327 mulpd w, p // Form e * (1-x) * p(x), split. 328 addsd e, rss // Form e**2 * (1-x) + s1. 329 movhlps p, pa // Separate high element. 330 mulsd pa, p // Finish e * (1-x) * p(x). 331 mulsd e, rss // Form e2a**2 + s1 * e2a. 332 addsd Address(S0), rss // Form e2a**2 + s1 * e2a + s0. 333 mulsd rss, p // Form e * (1-x) / sqrt(1-x) * p(x). 334 addsd Address(HalfPi), p // Form pi/2 - 2*asin(sqrt((1-x)/2)). 335 336 jmp ReturnDoubleInp 337 338 339// Handle x < nPoint. 340NegativeTail: 341 342 movsd Address(pOne), w // Get +1 for math. 343 ucomisd Address(nOne), x // Compare x to -1. 344 jbe InputIsNegativeSpecial 345 346/* Here we have -1 < x < nPoint. We use the same algorithm as in PositiveTail 347 but adapted for -x. 348 349 For brevity, some comments below describe only one element of a register. 350 The other is analagous. 351 352 Our job here is to evaluate: 353 354 a = 1+x. 355 356 e = estimate of 1/sqrt(a). 357 358 e2a = e*e*a. 359 360 asinf(x) ~= -pi/2 - e * a 361 * (e2a**2 + s1 * e2a + s0) 362 * -t2 363 * (x**2 - t01 * x + t00) 364 * (x**2 - t11 * x + t10). 365 366 Note that the signs of terms involving x are negated from those in 367 PositiveTail, and the result is negated as well (by changing "pi/2" to 368 "-pi/2" and "t2" to "-t2"). 369 370 For convenience, the final two factors are negated: 371 372 * (-x**2 + t01 * x - t00) 373 * (-x**2 + t11 * x - t10). 374*/ 375 376 addsd x, w // Form 1+x. 377 cvtsd2ss w, e // Convert to single precision for rsqrtss. 378 movapd Address(T1), p // Start asinf polynomial. 379 unpcklpd x, x // Duplicate x. 380 subpd x, p // Form -x + t1. 381 #if !defined DevelopmentInstrumentation 382 // This is the regular code used in the deployed implementation. 383 rsqrtss e, e // Estimate 1/sqrt(1+x). 384 #else 385 /* This instruction uses an estimate of 1/sqrt(1+x) passed by the 386 caller rather than the rsqrtss instruction. This allows us to test 387 the implementation with all values that rsqrtss might return. 388 */ 389 movss 8(%esp), e // Use caller's estimate of 1/sqrt(1+x). 390 #endif 391 cvtss2sd e, e // Convert to double precision. 392 mulpd x, p // Form -x**2 + t1*x. 393 mulsd e, w // Form e * (1+x). 394 mulsd w, e // Form e**2 * (1+x). 395 // "e" in comments refers to the initial estimate from rsqrtss. 396 movhpd Address(nT2), w // Copy coefficient into high element. 397 subpd Address(T0), p // Form -x**2 + t1*x - t0. 398 movsd Address(S1), rss // Form s1. 399 addsd e, rss // Form e**2 * (1+x) + s1. 400 mulpd w, p // Form e * (1+x) * p(x), split. 401 mulsd e, rss // Form e2a**2 + s1 * e2a. 402 movhlps p, pa // Separate high element. 403 addsd Address(S0), rss // Form e2a**2 + s1 * e2a + s0. 404 mulsd pa, p // Finish e * (1+x) * p(x). 405 mulsd rss, p // Form e * (1+x) / sqrt(1+x) * p(x). 406 subsd Address(HalfPi), p // Form -pi/2 - 2*asin(sqrt((1+x)/2)). 407 408 jmp ReturnDoubleInp 409 410 411/* Here we handle inputs greater than or equal to one, including infinity, 412 but not including NaNs. The condition code must be set to indicate equal 413 (zero flag is one) iff the input is one. 414*/ 415InputIsPositiveSpecial: 416 je InputIsPositiveOne 417 jmp SignalInvalid 418 419 420/* Here we handle inputs less than or equal to -1, including -infinity, and 421 NaNs. The condition code must be set as if by using ucomisd to compare 422 the input in the "source operand" to -1 in the "destination operand". 423*/ 424InputIsNegativeSpecial: 425 jp InputIsNaN 426 je InputIsNegativeOne 427 428 // Continue into SignalInvalid. 429// jmp SignalInvalid 430 431 432/* Here we handle inputs outside the domain of the function. We subtract 433 infinity from itself to generate the invalid signal and return a NaN. 434*/ 435 436 .literal4 437Infinity: .long 0x7f800000 438 439 .text 440SignalInvalid: 441 movss Address(Infinity), p 442 subss p, p // Generate invalid signal and NaN value. 443 jmp ReturnSingleInp 444 445 446/* Here we handle an input of +1 or -1. arcsine(+1) is pi/2, which increases 447 when rounded to single precision. However, we are required to return 448 results in [-pi/2, pi/2], so we return pi/2 rounded toward zero. 449*/ 450 451 .literal8 452/* Define values near +pi/2 and -pi/2 that yield +/- pi/2 rounded toward zero 453 when converted to single precision. This allows us to generate inexact and 454 return the desired values for asinf(+1) and asinf(-1). 455*/ 456AlmostpHalfPi: .double +1.5707962 457AlmostnHalfPi: .double -1.5707962 458 459 .text 460InputIsPositiveOne: 461 cvtsd2ss Address(AlmostpHalfPi), p 462 jmp ReturnSingleInp 463 464InputIsNegativeOne: 465 cvtsd2ss Address(AlmostnHalfPi), p 466 jmp ReturnSingleInp 467 468 469InputIsNaN: 470 471 #if defined __i386__ 472 flds Argx // Return result on floating-point stack. 473 #else 474 cvtsd2ss x, %xmm0 // Return in %xmm0. 475 /* We cannot just return the input, because we must quiet a 476 signalling NaN. 477 */ 478 #endif 479 480 ret