this repo has no description
at fixPythonPipStalling 335 lines 11 kB view raw
1/* 2 * Original public domain code for expl Written by J.T. Conklin <jtc@netbsd.org>. 3 * fixed up to deliver correct edge cases by Ian Ollmann (expl) 4 * 5 * exp2l by Ian Ollmann based on algorithm by J.T. Conklin 6 * 7 * expm1l is new code by Ian Ollmann 8 */ 9 10#include <machine/asm.h> 11 12#define LOCAL_STACK_SIZE 24 13#include "abi.h" 14 15 16/* e^x = 2^(x * log2(e)) */ 17/* 18ENTRY(expl) 19 SUBP $LOCAL_STACK_SIZE, STACKP 20 movl $0xbe205c61, 20(STACKP) // ln2 hi (double, low 32-bits) 21 movl $0x0ca86c39, 16(STACKP) // ln2 lo (double, low 32-bits) 22 movl $0x3f317218, 8(STACKP) // ln2 hi (float) 23 movl $0x7f800000, 4(STACKP) // inf 24 movl $0x50000000, (STACKP) // limit 25 26 //test for inf or NaN 27 fldt FIRST_ARG_OFFSET(STACKP) // { f } 28 fabs // { |f| } 29 flds 4(STACKP) // { inf, |f| } 30 fucomip %ST(1), %ST(0) // { |f| } if |f| == inf or f is NaN 31 fstp %ST(0) // {} 32 fldt FIRST_ARG_OFFSET(STACKP) // { f } 33 flds (STACKP) // { limit, f } 34 je expl_special 35 36 //clip f to be between limit and -limit. 37 //If we dont do this, when we scale by log2e then it is possible that we will generate spurious overflow 38 fld %ST(0) // { limit, limit, f } 39 fchs // { -limit, limit, f } 40 fucomi %ST(2), %ST(0) // { -limit, limit, f } if( -limit < f ) 41 fcmovb %ST(2), %ST(0) // { max( -limit, f ), limit, f } 42 fucomi %ST(1), %ST(0) // { max( -limit, f ), limit, f } if( max( -limit, f ) > limit ) 43 fcmovnbe %ST(1), %ST(0) // { clipped, limit, f } 44 fstp %ST(2) // { limit, clipped } 45 fstp %ST(0) // { clipped } 46 47 //extended precision exp: 48 //Calculate the integer part of log2(e) * clipped 49 fldl2e // { log2e, clipped } 50 fmul %st(1), %st(0) // { log2e * clipped, clipped } 51 frndint // { n, clipped } 52 53 //figure out the fractional part. Use more than 64-bit precision so we get the exact fraction. 54 flds 8(STACKP) // { ln2 hi, n, clipped } 55 fmul %st(1), %st(0) // { ln2 hi * n, n, clipped } 56 fsubrp %st(0), %st(2) // { n, clipped - n*ln2hi } 57 fldl 16(STACKP) // { ln2 lo, n, clipped - n*ln2hi } 58 fmul %st(1), %st(0) // { ln2lo*n, n, clipped - n*ln2hi } 59 fsubrp %st(0), %st(2) // { n, fract } 60 fldl2e // { log2e, n, fract } 61 fmulp %st(0), %st(2) // { n, log2e * fract } 62 63 //Calculate e**fract 64 fxch // { log2e * fract, n } 65 f2xm1 // { 2**fract( f*log2e ) - 1, n } 66 fld1 // { 1, 2**fract( f*log2e ) - 1, n } 67 faddp // { 2**fract( f*log2e ), n } 68 69 //scale according to n 70 fscale // {result, n } 71 fstp %st(1) 72 ADDP $LOCAL_STACK_SIZE, STACKP 73 ret 74 75expl_special: //handles +-Inf, QNaN, SNaN 76 fucomip %ST(1), %ST(0) // { f } if( limit > f ) 77 fldz // { 0, f } 78 fxch // { f, 0 } 79 fcmovnbe %ST(1), %ST(0) // set the -Inf case to 0 80 fadd %ST(0), %ST(0) // silence NaNs 81 fstp %ST(1) 82 ADDP $LOCAL_STACK_SIZE, STACKP 83 ret 84*/ 85 86 87ENTRY(exp2l) 88 SUBP $LOCAL_STACK_SIZE, STACKP 89 movl $0x7f800000, 4(STACKP) // inf 90 movl $0x1e000000, (STACKP) // small (0x1.0p-67f) 91 92 //test for +inf, NaN 93 fldt FIRST_ARG_OFFSET(STACKP) // { f } 94 flds 4(STACKP) // { inf, f } 95 fucomip %ST(1), %ST(0) // { f } 96 je exp2l_exit // return f if Inf or NaN 97 98 //test for -inf 99 flds 4(STACKP) // { inf, f } 100 fchs // {-inf, f } 101 fucomip %ST(1), %ST(0) // { f } 102 jne exp2l_main 103 fldz // { 0, f } 104 fstp %st(1) // { f } 105 jmp exp2l_exit 106 107exp2l_main: 108 flds (STACKP) // { small, f } 109 fld %st(1) // { f, small, f } 110 fabs // { |f|, small, f } 111 fucomip %st(1), %st(0) // { small, f } 112 fstp %st(0) // { f } 113 jb exp2l_small 114 fld %st(0) // { f, f } 115 frndint // { intf, f } 116 fxch %st(1) // { f, intf } 117 fsub %st(1),%st // { fract(f), intf } 118 f2xm1 // { 2**fract(f) - 1, intf } 119 fld1 // { 1, 2**fract(f) - 1, intf } 120 faddp // { 2**fract(f), intf } 121 fscale // { result, intf } 122 fstp %st(1) // { result } 123 //fall through to exit 124 125exp2l_exit: 126 ADDP $LOCAL_STACK_SIZE, STACKP 127 ret 128 129exp2l_small: //avoid setting the underflow flag for denormal inputs 130 fld1 // {1, f } 131 faddp // {result} 132 ADDP $LOCAL_STACK_SIZE, STACKP 133 ret 134 135 136//do expm1 137 // e**x - 1 = 2**(x*log2(e)) - 1 138 // 139 // f = fract( x*log2e ) //fractional part of x*log2e 140 // i = int( x*log2e ) //integral part of x*log2e 141 // 142 // e**x - 1 = 2**( f + i ) - 1 143 // = (2**f)(2**i) - 1 144 // = (2**f)(2**i) - (2**i) + (2**i) - 1 145 // = (2**f - 1)(2**i) + (2**i) - 1 146 // (exact) (needs to be done in the right order, or not at all for i == 0) 147 // 148 // The multiplication is exact. The two adds need to be done least to greatest in order of magnitude. 149 // Since 2**f - 1 is always in the range [ -0.5, 1.0 ], we know ahead of time that (2**f-1)(2**i) 150 // always has less magnitude than (2**i). if i < 0 then add the -1 first. Otherwise, add (2**i) first 151 // 152 // There are a lot of special cases here, which is why this is a bit branchy. I tried it without 153 // branches and wasn't able to satisfy all the constraints called for by various standards 154 155#define EXPM1L_STACK_SIZE 12 156#undef FIRST_ARG_OFFSET 157#define FIRST_ARG_OFFSET (FRAME_SIZE + EXPM1L_STACK_SIZE) 158 159ENTRY(expm1l) 160 SUBP $EXPM1L_STACK_SIZE, STACKP 161 movl $0xc6000000, 8(STACKP) // -8192.0f 162 movl $0x7f800000, 4(STACKP) // inf 163 movl $0x47000000, 0(STACKP) // 32768 164 165 //test for inf or NaN 166 fldt FIRST_ARG_OFFSET(STACKP) // { x } 167 fldz // { 0, x } 168 fucomip %st(1), %st(0) // { x } 169 jnbe expm1l_neg // if 0 > x goto expm1_neg 170 171 // x >= 0... 172 // get x*log2e 173 fldl2e // { log2e, x } 174 fmulp // { x*log2e } 175 flds 4(STACKP) // { inf, x*log2e } 176 fucomip %ST(1), %ST(0) // { x*log2e } if x*log2e == inf or NaN 177 je expm1l_special 178 179 //next convert to integer and fractional parts using round to zero rounding mode 180 //calculate trunc(x*log2e) 181 flds (STACKP) // { 32768.0, x*log2e } 182 fucomi %st(1), %st(0) // { x*log2e } 183 fcmovnbe %st(1), %st(0) // { clipped, x*log2e } 184 fstp %st(1) // { clipped } 185 fld %st(0) // { clipped, clipped } 186 187#if defined( __SSE3__ ) 188 fisttpll (STACKP) // { clipped } 189 fildll (STACKP) // { i, clipped } 190#else 191 //set to round to zero 192 fnstcw 4(STACKP) 193 movw 4(STACKP), %ax 194 movw %ax, %dx //save old value for later 195 orw $0x0c00, %ax 196 movw %ax, 4(STACKP) 197 fldcw 4(STACKP) 198 //round to int 199 fistpll (STACKP) // { clipped } 200 fildll (STACKP) // { i, clipped } 201 //restore old rounding mode 202 movw %dx, 4(STACKP) 203 fldcw 4(STACKP) 204#endif 205 206 //test 0 against i 207 fldz // {0, i, x*log2e} 208 fucomip %st(1), %st(0) // {i, x*log2e} 209 210 //get the fractional part of x*log2e with the same sign as x*log2e 211 fsubr %st(0), %st(1) // { i, f } 212 213 //calculate 2**f-1 214 fxch // { f, i } 215 f2xm1 // { 2**f - 1, i } 216 217 //three way switch for 0>i, i==0, i>0 218 je exmpm1l_zero 219 220 //This case is a bit trickier, because 2**i can overflow when the result does not 221 //we do this as: 222 // 223 // e**x - 1 = (2**f - 1)(2**i) + (2**i) - 1 224 // = (2**i)( (2**f-1) - 2**(-i) + 1) 225 // 226 // we clamp 2**(-i) so that it can't go lower than 2**(-8192) to prevent underflow 227 // When this happens the result is always 0x1.fffffffffffffffep-1 * 2**i, so it doesn't 228 // much matter 229 fld %st(1) // { i, 2**f-1, i} 230 fchs // {-i, 2**f-1, i} 231 flds 8(STACKP) // {-8192.0f, -i, 2**f-1, i} 232 fucomi %st(1), %st(0) // {-8192.0f, -i, 2**f-1, i} 233 fcmovb %st(1), %st(0) // { clipped(-i), -i, 2**f-1, i} 234 fstp %st(1) // { clipped(-i), 2**f-1, i} 235 fld1 // {1, clipped(-i), 2**f-1, i} 236 fscale // { 2**-i,clipped(-i), 2**f-1, i} 237 fstp %st(1) // { 2**-i, 2**f-1, i} 238 fsubrp // { 2**f-1 - 2**-i, i} 239 fld1 // { 1, 2**f-1 - 2**-i, i} 240 faddp // { 1+ 2**f-1 - 2**-i, i} 241 fscale 242 fstp %st(1) 243 244 ADDP $EXPM1L_STACK_SIZE, STACKP 245 ret 246 247 248// x < 0 or x is NaN 249expm1l_neg: // {x} 250 flds 4(STACKP) // { inf, x } 251 fchs // { -inf, x } 252 fucomip %ST(1), %ST(0) // { x } if x == -inf 253 je expm1l_special // NaNs and -Inf go to special case handling code 254 255 // clip large negative values to avoid overflow 256 flds 8(STACKP) // { -8192, x } 257 fucomi %st(1), %st(0) // { -8192, x } 258 fcmovb %st(1), %st(0) // { clipped, x } 259 fstp %st(1) // { clipped } 260 261 //scale by log2e 262 fldl2e // { log2e, clipped } //clipped a.k.a x from here on 263 fmulp // { x*log2e } 264 265 //next convert to integer and fractional parts using round to zero rounding mode 266 //calculate trunc(x*log2e) 267 fld %st(0) // { x*log2e, x*log2e } 268#if defined( __SSE3__ ) 269 fisttpll (STACKP) // { x*log2e } 270 fildll (STACKP) // { i, x*log2e } 271#else 272 //set to round to zero 273 fnstcw 4(STACKP) 274 movw 4(STACKP), %ax 275 movw %ax, %dx //save old value for later 276 orw $0x0c00, %ax 277 movw %ax, 4(STACKP) 278 fldcw 4(STACKP) 279 //round to int 280 fistpll (STACKP) // { x*log2e } 281 fildll (STACKP) // { i, x*log2e } 282 //restore old rounding mode 283 movw %dx, 4(STACKP) 284 fldcw 4(STACKP) 285#endif 286 287 288 //test 0 against i 289 fldz // {0, i, x*log2e} 290 fucomip %st(1), %st(0) // {i, x*log2e} 291 292 //get the fractional part of x*log2e with the same sign as x*log2e 293 fsubr %st(0), %st(1) // { i, f } 294 295 //calculate 2**f-1 296 fxch // { f, i } 297 f2xm1 // { 2**f - 1, i } 298 299 //jump to complex code for handling the i < 0 case 300 je exmpm1l_zero 301 302 303 // In the i < 0 case, 2**i has a smaller magnitude than -1, so it is added first 304 // 305 // = (2**f - 1)(2**i) + (2**i) - 1 306 // 307 // The 2**i scaling is exact, so it can be factored out: 308 // 309 // = (2**i)(2**f-1+1) -1 310 fld1 // { 1, 2**f - 1, i } 311 faddp // { 2**f-1+1, i } 312 fscale // { (2**f-1+1)*(2**i), i } 313 fstp %st(1) // { (2**f-1+1)*(2**i) } 314 fld1 // { 1, (2**f-1+1)*(2**i) } 315 fsubrp // { result } 316 ADDP $EXPM1L_STACK_SIZE, STACKP 317 ret 318 319exmpm1l_zero: //the i == 0 case 320 fstp %st(1) // { result } 321 ADDP $EXPM1L_STACK_SIZE, STACKP 322 ret 323 324 325expm1l_special: //handles +-Inf, QNaN, SNaN 326 fldz // {0, x} 327 fucomip %ST(1), %ST(0) // { x } if( 0 > x ) 328 fadd %ST(0), %ST(0) // silence NaNs 329 fld1 // { 1, x } 330 fchs // { -1, x } 331 fxch // { f, -1 } 332 fcmovnbe %ST(1), %ST(0) // set the -Inf case to -1 333 fstp %ST(1) 334 ADDP $EXPM1L_STACK_SIZE, STACKP 335 ret