this repo has no description
at fixPythonPipStalling 228 lines 8.0 kB view raw
// Note that while this file has a .s extension it is not intended to be assembled directly
// but rather included with appropriate #defines in place.
// NOTE(review): ENTRY, REL_ADDR, STACKP, FRAME_SIZE, AX_P, SUBP/ADDP, LOCAL_STACK_SIZE,
// LOOKUP and the constant labels (mzero, notmzero, mone, one, half, threehalves, _1pm14,
// _1pm54, mquarter, third, a01, b01, c0, c5_e, ln2l, log1p_not_ulp_mask) are all supplied
// by the including file — their exact definitions are assumed, not visible here.

// Tail-calls into log/logup go through the PLT.
#define _log log@PLT
#define __logup _logup@PLT

// double log1p( double x )
//
// On entry (x86_64): x in %xmm0. On entry (i386): x at FRAME_SIZE(STACKP), also
// preloaded into %xmm0/%xmm1 below; %ecx becomes the PIC base after the call/pop pair.
// Result: %xmm0 (x86_64) or the x87 stack top (i386).
ENTRY( log1p )
// Cases
// Main case: -0.25 <= x && x < 0.5
// [0000000000000000, 3fe0000000000000)
// [8000000000000000, bfd0000000000000]
// Log1p_core: compute k and jump into log with f = x, n = 0.
// Early out:
// x is small: |x| in [0, 3f10000000000000)

// x < -0.25 || x >= 0.5:
// Call log(x+1) if x == (x+1)-1, otherwise
// Call logup(x+1) = log(x+1+ulp(x)/2)
// Note, the Special operands could be handled by the log(x+1) call:
// x is NaN or inf: fff*, 7ff*
// x+1 <= 0: [bff0000000000000, fff0000000000000)

// Range test trick: t = (x < 0 ? -2*x : x) as a bit pattern; the main-case window
// -0.25 <= x < 0.5 maps exactly to t < 0.5's encoding, so one unsigned compare suffices.
#if defined( __x86_64__ )
	movd	%xmm0, %rax
	movd	%xmm0, %rdx
	andq	REL_ADDR(mzero), %rdx		// Sign bit
	andq	REL_ADDR(notmzero), %rax	// absx
	shrq	$11, %rdx			// sign >> 11
	addq	%rdx, %rax			// t = x < 0 ? -2*x : x   (bit-pattern arithmetic)
	movsd	REL_ADDR(mone), %xmm2		// -1.0
	movapd	%xmm0, %xmm1			// x
	cmpq	REL_ADDR(half), %rax		// t < 0.5
#else
	movl	4+FRAME_SIZE(STACKP), %eax	// high word of x (i386 passes x on the stack)
	movl	4+FRAME_SIZE(STACKP), %edx
	andl	$0x80000000, %edx		// Sign bit
	andl	$0x7fffffff, %eax		// absx (high word only — sufficient for the range test)
	call	0f				// PIC code
0:	pop	%ecx				// PIC base
	movsd	REL_ADDR(mone), %xmm2		// -1.0
	shrl	$11, %edx			// sign >> 11
	addl	%edx, %eax			// t = x < 0 ? -2*x : x
	movsd	FRAME_SIZE(STACKP), %xmm0
	movsd	FRAME_SIZE(STACKP), %xmm1
	cmpl	$0x3fe00000, %eax		// t < 0.5
#endif
	jb	7f				// Jump for main case: -0.25 <= x && x < 0.5
	// Here: x < -0.25, or x >= 0.5, or x is NaN or Inf.
	// This path does not lose accuracy the way a naive log(x+1) would for x in [0.5-, 1.0),
	// where adding one drops a bit: when x+1 is inexact we call logup instead.
	// TBD: It might be faster to do this in parallel with the frexp in log
	// and compute a correction that is always added.

	ucomisd	%xmm2, %xmm0			// x < -1.0 ? (log(1+x) would be invalid)
	jb	5f				// (unordered/NaN falls through; log(x+1) handles NaN)
	movapd	%xmm2, %xmm3			// -1.0 — NOTE(review): %xmm3 is not read again below; looks vestigial
	subsd	%xmm2, %xmm0			// xp = x + 1.0 = x - -1.0
	addsd	%xmm0, %xmm2			// xpm = xp - 1.0 = xp + -1.0
	ucomisd	%xmm1, %xmm2			// x == xpm?  i.e. was x+1 exact?
#if defined(__i386__)
	movsd	%xmm0, FRAME_SIZE(STACKP)	// overwrite x with xp so the tail-called log sees x+1
#endif
	je	_log				// exact: tail-call log(x+1.0)
	// Inexact: strip the half-ulp that rounding absorbed and call logup, which
	// compensates: logup(1+xm) = log(x+1+ulp(x)/2).
	andpd	REL_ADDR(log1p_not_ulp_mask), %xmm1	// xm = x with low bit(s) masked
	movsd	REL_ADDR(one), %xmm0		// 1.0
	addsd	%xmm1, %xmm0			// xp = 1.0 + xm
#if defined(__i386__)
	movsd	%xmm0, FRAME_SIZE(STACKP)
#endif
	jmp	__logup				// log((x-ulp)+1+ulp)

7:
	// Main case entry.
	// %xmm0 = x
	// %xmm1 = x

////////////////////////////////////////////////////////////////
// log1p core early out for small values
	movsd	REL_ADDR(mzero), %xmm2		// 0x8000000000000000
	andnpd	%xmm0, %xmm2			// absx = fabs(x) // TBD: already have computed absx in 64-bit mode???
	comisd	REL_ADDR(_1pm14), %xmm2		// (absx <= 0x1p-14) ? use the small polynomial at 3:
	jbe	3f

//get_k: compute the table lookup index, k.
	SUBP	$LOCAL_STACK_SIZE, STACKP

////////////////////////////////////////////////////////////////
	// f = x
	addsd	REL_ADDR(one), %xmm0		// fp1 = f+1, in [0.75, 1.5)
	movsd	REL_ADDR(threehalves), %xmm2	// 1.5
	cmpeqsd	%xmm0, %xmm2			// (fp1 == 1.5) ? -1 : 0  (all-ones mask = integer -1)
	psrlq	$(52-8), %xmm0			// top 8 bits of significand encoding of adjusted x
	paddq	%xmm2, %xmm0			// (int)(fp1>>(52-8)) - (fp1 == 1.5);
	movd	%xmm0, AX_P
	// grab before adjusting for which half we are in
	and	$0x000000ff, AX_P		// k = (int)(head>>(52-8)) - (fp1 == 1.5);

	// Now:
	// k = %eax
	// exp = 0
	// fp1 = (fp1 > 1.5) ? (0.5 * fp1) : (fp1);
	// %xmm1 = f = fp1 - 1 = x

	movsd	%xmm1, (STACKP)			// park f for the x87 loads below
	shl	$5, AX_P			// k *= 32 = table row stride in bytes

// The lookup table is in a funny format since it has 2 long double and a single.
	// {10-byte va ; 2-byte pad ; 4-byte single a ; 10-byte lg1pa ; 6-byte pad}
	// va = 1/a (reciprocal), lg1pa = log2(1+a) — TODO confirm against LOOKUP's definition.
#if defined( __x86_64__ )
	// in 64-bit need lea to get address (RIP-relative addressing cannot take an index register)
	lea	REL_ADDR(LOOKUP), %rdx
	fldt	(%rdx,AX_P,1)			// [va]
	fldl	(STACKP)			// [va ; f]
	//halfulp stuff goes here...
	fsubs	12(%rdx,AX_P,1)			// [va ; h = f - a] // should we do f-a in x87 or SSE??? In x87 it is easy to write the load and convert from single
	fmulp	%st, %st(1)			// [c = h * va]
	fstl	(STACKP)			// [c] -- with double version stored on the stack

	fldt	16(%rdx,AX_P,1)			// [c ; lg1pa ]
#else // i386
	fldt	(LOOKUP-0b)(%ecx,%eax,1)	// [va]  (%ecx = PIC base from label 0 above)
	fldl	(STACKP)			// [va ; f]

	fsubs	(LOOKUP-0b+12)(%ecx,%eax,1)	// [va ; h = f - a]
	fmulp	%st, %st(1)			// [c = h * va]
	fstl	(STACKP)			// [c] -- with double version stored on the stack
	fldt	(LOOKUP-0b+16)(%ecx,%eax,1)	// [c ; lg1pa ]
#endif // i386

	// base e specific stuff here...
	// [c ; lghi = 0 + lg1pa] since nd = 0;
	fldt	REL_ADDR(ln2l)			// [c ; lghi = nd + lg1pa ; ln2l]
	fmulp	%st, %st(1)			// [ c ; logbxhi = ln2l*(nd+lg1pa)]

	// Evaluate the polynomial tail in SSE, two coefficients per register.
#ifdef __SSE3__
	movddup	(STACKP), %xmm0			// get cd, double version of c off stack in both register halves
#else
	movsd	(STACKP), %xmm0
	movhpd	(STACKP), %xmm0
#endif
	movapd	%xmm0, %xmm1			// [cd,cd]
	addpd	REL_ADDR(a01), %xmm0		// [a0,a1] + [cd,cd]
	mulpd	%xmm1, %xmm0			// [a0+cd,a1+cd] * [cd,cd]
	addpd	REL_ADDR(b01), %xmm0		// vpoly = [b0,b1] + [(a0+cd)*cd,(a1+cd)*cd]

//long double logbec is log_b(e) * c
//double c5b is log_b(e) * c5
	fld	%st(1)				// [c ; lghi ; c]
	// in base e logbec = c; c5b = c5_e;
	// [c ; lghi ; logbec = lnel * c = c]
#define c5b c5_e
	fxch	%st(2)				// [logbec ; lghi ; c]
	fldt	REL_ADDR(c0)			// [logbec ; lghi ; c ; c0]
	fmulp	%st, %st(1)			// [logbec ; lghi ; c*c0]
	fld1					// [logbec ; lghi ; c*c0 ; 1.0]
	faddp	%st, %st(1)			// [logbec ; lghi ; 1.0 + c*c0]
	fmulp	%st, %st(2)			// [linear = logbec*(1.0 + c*c0) ; lghi]
	fxch	%st(1)				// [lghi ; linear]

	movapd	%xmm1, %xmm2			// [cd, cd]
	movhpd	REL_ADDR(c5b), %xmm1		// [c5b, cd]
	mulpd	%xmm2, %xmm1			// vccc5 = [c5b*cd, cd*cd]
	mulpd	%xmm1, %xmm0			// vp = vccc5 * vpoly = [c5b*cd*(b0+(a0+cd)*cd), cd*cd*(b1+(a1+cd)*cd)]
	//Need to do a horizontal multiply
	movapd	%xmm0, %xmm1			// [ *, vplo]
	shufpd	$3, %xmm0, %xmm0		// [ *, vphi]
	mulsd	%xmm1, %xmm0			// [ *, cccp = c5b*cd*(b0+(a0+cd)*cd) * cd*cd*(b1+(a1+cd)*cd)]
	movsd	%xmm0, (STACKP)			// Store so we can load into x87
	faddl	(STACKP)			// [lghi ; logbxlo = linear + cccp]
	faddp	%st, %st(1)			// [longResult = lghi + logbxlo]
	// Convert to double (the extended-precision sum rounds once here)
	fstpl	(STACKP)			// result [<empty stack>]
#if defined( __x86_64__ )
	movq	(STACKP), %xmm0			// result returned in %xmm0
#else
	fldl	(STACKP)			// result returned on the x87 stack
#endif
#undef c5b

	// Clean up stack and return
	ADDP	$LOCAL_STACK_SIZE, STACKP
	ret

////////////////////////////////////////////////////////////////
// Here we know that |x| <= 0x1p-14
// %xmm0 x
// %xmm1 x
// %xmm2 absx

// TODO could have a better poly for less error, or wider interval
3:
	comisd	REL_ADDR(_1pm54), %xmm2		// (absx <= 0x1p-54)
	jbe	4f
	// So 0x1p-54 < |x| < 0x1p-14
	// We compute a small poly (truncated log1p series -x^2/2 + x^3/3 - x^4/4):
	// p = ( (-0.25 * x + 0x0.55555555555555p0) * x + -0.5) * (x*x) + x
	mulsd	REL_ADDR(mquarter), %xmm1	// -0.25 * x
	mulsd	%xmm2, %xmm2			// x*x = absx * absx
	addsd	REL_ADDR(third), %xmm1		// (-0.25 * x + 0x0.55555555555555p0)
	mulsd	%xmm0, %xmm1			// (-0.25 * x + 0x0.55555555555555p0) * x
	subsd	REL_ADDR(half), %xmm1		// ( (-0.25 * x + 0x0.55555555555555p0) * x + -0.5)
	mulsd	%xmm2, %xmm1			// p = ( (-0.25 * x + 0x0.55555555555555p0) * x + -0.5) * (x*x)
	addsd	%xmm1, %xmm0			// return p + x
#if defined( __i386__ )			// Light cleanup required: i386 returns on the x87 stack
	movsd	%xmm0, FRAME_SIZE(STACKP)
	fldl	FRAME_SIZE(STACKP)
#endif
	ret


4:	// if(absx <= 0x1p-54): log1p(x) rounds to x itself
	// Note: this handles denormals, 0, and -0 also
#if FENVON
	addsd	REL_ADDR(one), %xmm1		// x + 1.0 to raise inexact since x is small (except when x == 0); result discarded
#endif
	// return x (already in %xmm0)
#if defined( __i386__ )			// Light cleanup required
	movsd	%xmm0, FRAME_SIZE(STACKP)
	fldl	FRAME_SIZE(STACKP)
#endif
	ret

5:	// x < -1.0: domain error.
	// sqrt of a negative raises the invalid flag and yields NaN — the required
	// behavior for log1p outside its domain — without a branch to an error stub.
	sqrtsd	%xmm0, %xmm0
#if defined( __i386__ )			// Light cleanup required
	movsd	%xmm0, FRAME_SIZE(STACKP)
	fldl	FRAME_SIZE(STACKP)
#endif
	ret