this repo has no description
at fixPythonPipStalling 436 lines 16 kB view raw
1// Note that while this file has a .s extension it is not intended to be assmbled directly 2// but rather included with appropriate #defines in place. 3 4// This begins the base-universal logarithm code. 5#ifndef BASE2 6#define BASE2 0 7#endif 8#ifndef BASEE 9#define BASEE 0 10#endif 11#ifndef BASE10 12#define BASE10 0 13#endif 14#ifndef LOGUP 15#define LOGUP 0 16#endif 17 18// Assume we need to respect the flags unless directed otherwise 19#ifndef FENVON 20#define FENVON 1 21#endif 22 23#if BASE2 24// double log2asm( double x ) 25ENTRY( log2 ) 26#elif BASEE 27#if LOGUP 28// private double logup( double x ) // log(x+ULP(x)/2) 29ENTRY( _logup ) 30#else 31// double log( double x ) 32ENTRY( log ) 33#endif 34#elif BASE10 35// double log10( double x ) 36ENTRY( log10 ) 37#else 38#error "One of BASE2, BASEE, BASE10 needs to be set to 1" 39#endif 40 // Early out if x is NaN, Inf, 0, or <0 41 // I.e., if the encoding of x is not in (0, 7ff0000000000000) 42#if defined( __x86_64__ ) 43 movd %xmm0, %rax 44 movd %xmm0, %rdx 45 subq REL_ADDR(small_cut), %rax 46 cmpq REL_ADDR(large_cut), %rax 47#else 48 movl 4+FRAME_SIZE(STACKP), %eax 49 movl 4+FRAME_SIZE(STACKP), %edx 50 movsd FRAME_SIZE(STACKP), %xmm0 51 call 0f // PIC code 520: pop %ecx // PIC base 53 sub $0x00100000, %eax 54 cmp $0x7fe00000, %eax 55#endif 56 jae 3f // Jump if NaN, Inf, Denorm, 0, or negative 57 58 SUBP $LOCAL_STACK_SIZE, STACKP 59 60#if LOGUP 61 // Need to adjust h with a small correction for log1p(x) when 1+x is not representable 62 //u = (x < 1.0) ? -0x1p-53 : 0x1p-54 = ...? 0xbca0000000000000 : 0x3c90000000000000; 63 // Add in the half ulp now that we know we have room. 64 //h = (f - a) + u; // h = R(f-a)+ulp(x) 65 movapd %xmm0, %xmm6 // x 66 cmpltsd REL_ADDR(one), %xmm6 // (x < 1.0) 67 movsd REL_ADDR(logup_ulp_mask), %xmm5 // xor(-0x1p-53, 0x1p-54) = 0x8030000000000000 68 andpd %xmm5, %xmm6 // m = (x < 1.0) ? 0x8030000000000000 : 0 69 movsd REL_ADDR(_1pm54), %xmm5 // 0x1p-54 70 xorpd %xmm5, %xmm6 // m ^ 0x1p-54 = (x < 1.0) ? 0x8030000000000000 ^ 0x1p-54 : 0x1p-54 71 movsd %xmm6, 24(STACKP) // (x < 1.0) ? -0x1p-53 : 0x1p-54 72#endif 73 74 // Break x down into exp, f with -0.25 <= f < 0.5 and x = 2^exp(1+f) 75#if defined( __x86_64__ ) 76 movd %xmm0, %r8 // u.u = encoding of x 77 shrq $(52), %rdx // exponent >>= 52 78 andq REL_ADDR(frexp_mant_mask), %r8 // mantissa = u.u & 0x800fffffffffffffULL; 79 80 movd %xmm0, %rcx // encoding of xadjust 81 82 // or in -1 + bias to the new exponent for the mantissa 83 84 addq $-1023, %rdx // exponent + bias (= -1023 for normal data) 851: //This is where denormal inputs re-enter with bias compensated for denormal exponent and appropriate mantissa 86 //Note: for denormal xmm0 and rcx inputs are no longer the original x 87 88 shrq $51, %rcx // (encoding) >> 51 89 andq $0x1, %rcx // upperhalf = (encoding & 0x00080000ULL) >> 51 = (upperhalf != 0ULL); 90 addq %rcx, %rdx // exp = bias + (int) ( exponent >> 52 ) + (upperhalf != 0ULL); 91 movl %edx, 8(STACKP) // Save exp for later. Free dx and cx. 92 93 //Note: for denormal inputs this is no longer the original x 94 movd %xmm0, %rax // u.u 95 // grab before adjusting for which half we are in 96 andq REL_ADDR(frexp_head_mask), %rax // head = u.u & 0x000ff00000000000ULL; 97 shrq $((52-8)-5), %rax // k<<5 = (int)(head>>(52-8))<<5; 98 movd %xmm0, %rcx // u.u = encoding of xadjust 99 andq REL_ADDR(frexp_half_mask), %rcx // upperhalf = u.u & 0x0008000000000000ULL; 100 shlq $1, %rcx // (upperhalf << 1) 101 orq REL_ADDR(one), %r8 // (mantissa | 0x3ff0000000000000ULL) 102 // Divide by two if significand >= 3/2 103 xorq %rcx, %r8 // mantissa = (mantissa | 0x3ff0000000000000ULL) ^ (upperhalf << 1); 104 movd %r8, %xmm1 // fp1 105 // Now: 106 // k<<5 = %eax 107 // exp = %rdx 108 // fp1 = %xmm1 = (fp1 > 1.5) ? (0.5 * fp1 - 1.0) : (fp1 - 1.0); 109#else 110 movapd %xmm0, %xmm1 // u.u = encoding of x 111 andpd REL_ADDR(frexp_mant_mask), %xmm1 // mantissa = u.u & 0x800fffffffffffffULL; 112 113 movapd %xmm0, %xmm2 // u.u = encoding of adjusted x 114 psrlq $32, %xmm0 // top half of encoding of adjusted x 115 116 // or in -1 + bias to the new exponent for the mantissa 117 andpd REL_ADDR(frexp_half_mask), %xmm2 // upperhalf = u.u & 0x0008000000000000ULL; 118 shrl $(52-32), %edx // exponent >>= 52-32 119 120 addl $-1023, %edx // exponent + bias; bias = -1023 for normal data 121 1221: //This is where denormal inputs re-enter with bias compensated for denormal exponent and appropriate mantissa 123 124 movapd %xmm2, %xmm7 // upperhalf 125 psrlq $51, %xmm7 // (upperhalf != 0ULL) = upperhalf >> 51 == 1 or 0 126 movd %xmm7, 8(STACKP) // (int) ( exponent >> 52 ) 127 add 8(STACKP), %edx // exp = bias + (int) ( exponent >> 52 ) + (upperhalf != 0ULL); 128 movl %edx, 8(STACKP) // exp 129 movd %xmm0, %eax // Encoding of top half of x 130 // grab before adjusting for which half we are in 131 andl $0x000ff000, %eax // head = u.u & 0x000ff00000000000ULL; 132 shrl $(12-5), %eax // k<<5 = (int)(head>>(52-8-32))<<5; 133 psllq $1, %xmm2 // (upperhalf << 1) 134 orpd REL_ADDR(xone), %xmm1 // (mantissa | 0x3ff0000000000000ULL) 135 // Divide by two if significand >= 3/2 136 xorpd %xmm2, %xmm1 // mantissa = (mantissa | 0x3ff0000000000000ULL) ^ (upperhalf << 1); 137 138 // Now: 139 // k<<5 = %eax 140 // exp = 8(STACKP) 141 // fp1 = %xmm1 = (fp1 > 1.5) ? (0.5 * fp1) : (fp1); 142#endif 143 144// We look up a+1 rather than a, so we can work with fp1 - ap1 rather than f - a 145 146#if BASE2 147 // if fp1 == 1.0 return nd // base 2 only 148 movsd REL_ADDR(one), %xmm2 // 1.0 149 ucomisd %xmm2, %xmm1 // f == 1.0 set cc 150 je 7f 151#endif 152 153 movsd %xmm1, (STACKP) 154 155// The lookup table is in a funny format since it has 2 long double and a single. 156 // {10-byte va ; 2-byte pad ; 4-byte single a ; 10-byte lg1pa ; 2-byte pad, 4-byte a+1} 157#if defined( __x86_64__ ) 158 // in 64-bit need lea to get address 159 lea REL_ADDR(LOOKUP), %rdx 160 fldt (%rdx,AX_P,1) // [va] 161 fldl (STACKP) // [va ; f+1] 162 163 fsubs 28(%rdx,AX_P,1) // [va ; h = f+1 - a+1] 164#if LOGUP 165 //halfulp stuff goes here... 166 //h = (f - a) + u; // h = R(f-a)+ulp(x) 167 faddl 24(STACKP) // [va ; h = (f - a) + u] 168#endif //LOGUP 169 fmulp %st, %st(1) // [c = h * va] 170 fstl (STACKP) // [c] -- with double version stored on the stack 171 172 fldt 16(%rdx,AX_P,1) // [c ; lg1pa ] 173#if BASE2 174 // logbxhi = nd+lg1pa; // base 2 175 fiaddl 8(STACKP) // [c ; lghi = nd + lg1pa] 176#elif BASEE 177 // base e stuff goes here... 178 fiaddl 8(STACKP) // [c ; lghi = nd + lg1pa] 179 fldt REL_ADDR(ln2l) // [c ; lghi = nd + lg1pa ; ln2l] 180 fmulp %st, %st(1) // [ c ; logbxhi = ln2l*(nd+lg1pa)] 181#elif BASE10 182 // base 10 stuff goes here... 183 // free(%rax) 184 // free(%rdx) 185#if FENVON 186 movd %xmm0, %rdx // Original x encoding 187 shrq $(53-4), %rdx // 16*k = k << 4. k = key >> 53 = the top 6 of bottom 7 bits of exponent of x 188 andq $0x3f, %rdx // key = _mm_and_pd(vx,log10_key_mask); 189 movd %xmm0, %rax // Original x encoding 190 lea REL_ADDR(isPowerOf10), %rcx // Get base of table 191 cmp (%rcx,%rdx,1), %rax // if(isPowerOf10[k][0] == x) {return isPowerOf10[k][1];} 192 je 8f 193#endif //FENVON base 10 64-bit 194 fiaddl 8(STACKP) // [c ; lghi = nd + lg1pa] 195 fldt REL_ADDR(log102l) // [c ; lghi = nd + lg1pa ; log102l] 196 fmulp %st, %st(1) // [ c ; logbxhi = log102l*(nd+lg1pa)] 197#endif 198 199#else // i386 200 fldt (LOOKUP-0b)(%ecx,%eax,1) // [va] 201 fldl (STACKP) // [va ; f+1] 202 //halfulp stuff goes here... 203 fsubs (LOOKUP-0b+28)(%ecx,%eax,1) // [va ; h = f+1 - a+1] 204#if LOGUP 205 //h = (f - a) + u; // h = R(f-a)+ulp(x) 206 faddl 24(STACKP) // [va ; h = (f - a) + u] 207#endif //LOGUP 208 fmulp %st, %st(1) // [c = h * va] 209 fstl (STACKP) // [c] -- with double version stored on the stack 210 fldt (LOOKUP-0b+16)(%ecx,%eax,1) // [c ; lg1pa ] 211 212#if BASE2 213 // logbxhi = nd+lg1pa; // base 2 214 fiaddl 8(STACKP) // [c ; lghi = nd + lg1pa] 215#elif BASEE 216 // base e stuff goes here... 217 fiaddl 8(STACKP) // [c ; lghi = nd + lg1pa] 218 fldt REL_ADDR(ln2l) // [c ; lghi = nd + lg1pa ; ln2l] 219 fmulp %st, %st(1) // [ c ; logbxhi = ln2l*(nd+lg1pa)] 220#elif BASE10 221 // base 10 stuff goes here... 222 // free(%edx) 223#if FENVON 224 movl (4+LOCAL_STACK_SIZE)+FRAME_SIZE(STACKP), %edx 225 andl $0x07e00000, %edx // key = _mm_and_pd(vx,log10_key_mask); 226 shrl $((53-32)-4), %edx // 16*k = k << 4. k = key >> 53 = the top 6 of bottom 7 bits of exponent of x 227 movsd (LOCAL_STACK_SIZE)+FRAME_SIZE(STACKP), %xmm5 228 comisd (isPowerOf10-0b)(%ecx,%edx,1), %xmm5 // if(isPowerOf10[k][0] == x) {return isPowerOf10[k][1];} 229 je 8f 230#endif //FENVON base 10 i386 231 fiaddl 8(STACKP) // [c ; lghi = nd + lg1pa] 232 fldt REL_ADDR(log102l) // [c ; lghi = nd + lg1pa ; log102l] 233 fmulp %st, %st(1) // [ c ; logbxhi = log102l*(nd+lg1pa)] 234#endif //base 10 i386 235 236#endif // i386 237 238 239#ifdef __SSE3__ 240 movddup (STACKP), %xmm0 // get cd, double version of c off stack in both registers 241#else 242 movsd (STACKP), %xmm0 243 movhpd (STACKP), %xmm0 244#endif 245 movapd %xmm0, %xmm1 // [cd,cd] 246 addpd REL_ADDR(a01), %xmm0 // [a0,a1] + [cd,cd] 247 mulpd %xmm1, %xmm0 // [a0+cd,a1+cd] * [cd,cd] 248 addpd REL_ADDR(b01), %xmm0 // vpoly = [b0,b1] + [(a0+cd)*cd,(a1+cd)*cd] 249 250//long double logbec is log_b(e) * c 251//double c5b is log_b(e) * c5 252 fld %st(1) // [c ; lghi ; c] 253#if BASE2 254 // in base 2 logbec = lgel * c; c5b = c5_2; 255 fldt REL_ADDR(lgel) // [c ; lghi ; c ; lgel] 256 fmulp %st, %st(1) // [c ; lghi ; logbec = lgel * c] 257#define c5b c5_2 258#elif BASEE 259 // in base e logbec = c; c5b = c5_e; 260 // [c ; lghi ; logbec = lnel * c = c] 261#define c5b c5_e 262#elif BASE10 263 // in base 10 logbec = log10el * c; c5b = c5_10; 264 fldt REL_ADDR(log10el) // [c ; lghi ; log10el ; c] 265 fmulp %st, %st(1) // [c ; lghi ; logbec = log10el * c] 266#define c5b c5_10 267#endif 268 fxch %st(2) // [logbec ; lghi ; c] 269 fldt REL_ADDR(c0) // [logbec ; lghi ; c ; c0] 270 fmulp %st, %st(1) // [logbec ; lghi ; c*c0] 271 fld1 // [logbec ; lghi ; c*c0 ; 1.0] 272 faddp %st, %st(1) // [logbec ; lghi ; 1.0 + c*c0] 273 fmulp %st, %st(2) // [linear = logbec*(1.0 + c*c0) ; lghi] 274 fxch %st(1) // [lghi ; linear] 275 276 movapd %xmm1, %xmm2 // [cd, cd] 277 movhpd REL_ADDR(c5b), %xmm1 // [c5b, cd] 278 mulpd %xmm2, %xmm1 // vccc5 = [c5b*cd, cd*cd] 279 mulpd %xmm1, %xmm0 // vp = vccc5 * vpoly = [c5b*cd*(b0+(a0+cd)*cd), cd*cd*(b1+(a1+cd)*cd)] 280 //Do a horizontal multiply. 281 movapd %xmm0, %xmm1 // [ *, vplo] 282 shufpd $3, %xmm0, %xmm0 // [ *, vphi] 283 mulsd %xmm1, %xmm0 // [ *, cccp = c5b*cd*(b0+(a0+cd)*cd) * cd*cd*(b1+(a1+cd)*cd)] 284 movsd %xmm0, (STACKP) // Store so we can load into x87 285 faddl (STACKP) // [lghi ; logbxlo = linear + cccp] 286 faddp %st, %st(1) // [longResult = lghi + logbxlo] 287 // Round to double 288 fstpl (STACKP) // result [<empty stack>] 289#if defined( __x86_64__ ) 290 movq (STACKP), %xmm0 // result 291#else 292 fldl (STACKP) 293#endif 294#undef c5b 295 2962: // Clean up stack and return 297 ADDP $LOCAL_STACK_SIZE, STACKP 298 ret 299 300//////////////////////////////////////////////////////////////// 301 302 3033: // Special operand handling: NaN, Inf, Denorm, 0, or negative 304 // When we get here AX has as encoding(x) - 0x00100000[00000000 in 64-bit] 305 // +denormal -> denormal handling 306 // +zero -> -inf #ZERO_DIVIDE 307 // -stuff -> sqrt(x) (pass quiet NaNs or raise invalid) 308 // -zero -> -inf #ZERO_DIVIDE 309 // +NaN -> x+x 310 // +inf -> x+x 311 xorpd %xmm2, %xmm2 // 0.0 312 ucomisd %xmm2, %xmm0 // f == 0 set cc 313 jp 9f // NaN: compares unequal and signals invalid only for signaling NaN 314 je 4f // Zero: x == 0.0 -> -1 / zero 315 316 // We now need to distinguish between Denormal, Negative, and NaN/Inf. 317 // Adding the small_cut back in sets the EFLAGS for conditional branches 318#if defined( __x86_64__ ) 319 addq REL_ADDR(small_cut), %rax 320#else 321 addl $0x00100000, %eax 322#endif 323 jb 5f // Denormal (i.e., Inf > x > 0 and we have ruled out other cases) 324 js 6f // Negative sign (may be "-NaN", -Inf, -Normal, or -Denormal) 325 326 // If we fall through to here, we have "+NaN" or +Inf 3279: 328#if defined( __i386__ ) // Light cleanup required 329 movsd %xmm0, FRAME_SIZE(STACKP) 330 fldl FRAME_SIZE(STACKP) 331#endif 332 ret 333 3344: // Zero: x == 0.0 -> -1 / zero 335 //We need to return -Inf for zero and set div/0 flag 336 xorpd %xmm1, %xmm1 337 pcmpeqb %xmm0, %xmm0 // -1U 338 cvtdq2pd %xmm0, %xmm0 // -1.0f 339 divsd %xmm1, %xmm0 // -1.0f / 0 = -Inf + div/0 flag 340#if defined( __i386__ ) // Light cleanup required 341 movsd %xmm0, FRAME_SIZE(STACKP) 342 fldl FRAME_SIZE(STACKP) 343#endif 344 ret 345 3465: // Denormal 347//Denormal inputs are frexped with extra care and then we branch back into the mainline code. 348 349 SUBP $LOCAL_STACK_SIZE, STACKP 350 // Break x down into exp, f with -0.25 <= f < 0.5 and x = 2^exp(1+f) 351 // knowing that we are starting with denormal x 352#if defined( __x86_64__ ) 353 movd %xmm0, %r8 // u.u = encoding of x 354 andq REL_ADDR(frexp_mant_mask), %r8 // mantissa = u.u & 0x800fffffffffffffULL; 355 // or in bias to the new exponent for the mantissa 356 orq REL_ADDR(one), %r8 // mantissa |= 0x3ff0000000000000ULL; 357 movd %r8, %xmm0 // mantissa 358 subsd REL_ADDR(one), %xmm0 359 movd %xmm0, %rdx // encoding rescaled input 360 movd %xmm0, %r8 // encoding rescaled input 361 shrq $(52), %rdx // exponent >>= 52 362 andq REL_ADDR(frexp_mant_mask), %r8 // mantissa = u.u & 0x800fffffffffffffULL; 363 movd %xmm0, %rcx // encoding of xadjust 364 // or in -1 + bias to the new exponent for the mantissa 365 addq $(-1023-1022), %rdx // exponent + bias (= -1023-1022 for subnormal data) 366#else 367 movapd %xmm0, %xmm1 // u.u = encoding of x 368 // or in bias to the new exponent for the mantissa 369 movq REL_ADDR(one), %xmm2 // 1.0 370 orpd %xmm2, %xmm0 // mantissa = _mm_or_pd(x, xOne); 371 subsd %xmm2, %xmm0 // u = mantissa - xOne; 372 movapd %xmm0, %xmm1 // u.u = encoding of adjusted x 373 movapd %xmm0, %xmm7 // u.u = encoding of adjusted x (but we only want the high half) 374 psrlq $32, %xmm7 // top half of encoding of adjusted x 375 movd %xmm7, %edx // top half of encoding of adjusted x 376 andpd REL_ADDR(frexp_mant_mask), %xmm1 // mantissa = u.u & 0x800fffffffffffffULL; 377 378 movapd %xmm0, %xmm2 // u.u = encoding of adjusted x 379 psrlq $32, %xmm0 // top half of encoding of adjusted x 380 381 // or in -1 + bias to the new exponent for the mantissa 382 andpd REL_ADDR(frexp_half_mask), %xmm2 // upperhalf = u.u & 0x0008000000000000ULL; 383 shrl $(52-32), %edx // exponent >>= 52-32 384 385 addl $(-1023-1022), %edx // exponent + bias; bias = -1023-1022 for subnormal data 386 387#endif 388 jmp 1b // Re-enter main line with bias compensated for denormal exponent and appropriate mantissa 389 390 3916: // Negative sign (may be "-NaN", -Inf, -Normal, or -Denormal) 392 // In any case sqrt(x) will do the right thing. 393 // -QNaN will pass through 394 // -SNaN will raise invalid and be quieted 395 // -N will raise invalid and return a quiet NaN 396 sqrtsd %xmm0, %xmm0 397#if defined( __i386__ ) // Light cleanup required 398 movsd %xmm0, FRAME_SIZE(STACKP) 399 fldl FRAME_SIZE(STACKP) 400#endif 401 ret 402 403#if BASE2 4047: //early out for power of 2 405 ADDP $LOCAL_STACK_SIZE, STACKP 406#if defined( __i386__ ) 407 movl %edx, FRAME_SIZE( STACKP ) 408 fildl FRAME_SIZE(STACKP) 409#else 410 cvtsi2sd %rdx, %xmm0 411#endif 412 ret 413#endif 414 415// This base10 branch only needed if FENVON 416#if FENVON 4178: // If(isPowerOf10[k][0] == x) {return isPowerOf10[k][1];} 418 // Come in with two things on the x87 stack // [c ; lg1pa ] 419 fstp %st // [c ] 420 fstp %st // [<empty>] 421#if defined( __i386__ ) 422 movsd (isPowerOf10-0b+8)(%ecx,%edx,1), %xmm0 // isPowerOf10[k][1] 423#else 424 movsd 8(%rcx,%rdx,1), %xmm0 // isPowerOf10[k][1] 425#endif 426 // Clean up stack and return 427 ADDP $LOCAL_STACK_SIZE, STACKP 428 429#if defined( __i386__ ) // Light cleanup required 430 movsd %xmm0, FRAME_SIZE(STACKP) 431 fldl FRAME_SIZE(STACKP) 432#endif 433 ret 434#endif 435 436////////////////////////////////////////////////////////////////