this repo has no description
at fixPythonPipStalling 392 lines 17 kB view raw
1/* 2 * expf.s 3 * 4 * by Ian Ollmann 5 * 6 * Copyright (c) 2007, Apple Inc. All Rights Reserved. 7 * 8 * This file implements the C99 expf function for the MacOS X __i386__ and __x86_64__ abis. 9 */ 10 11#include <machine/asm.h> 12#include "abi.h" 13 14// 15// Overall approach 16// 17// We calculate e**(y) as follows: 18// 19// e**y = 2**x x = y / ln(2) 20// 21// ...and proceed largely as we do in exp2f(): 22// 23// i = trunc(x) 24// f = x - i 25// 26// 2**(x) = 2**(f+i) = 2**f * 2**i 27// 28// We choose trunc here, so that 2**f always has the property of pushing 2**i that much closer to the 29// extrema of finite, non-zero floating point computation. That is: 30// 31// x < 1 --> 2**f <= 1 32// x == 1 --> 2**f = 1 33// x > 1 --> 2**f >= 1 34// 35// This means that our 2**i calculation will not underflow/overflow prematurely before 2**f has had its say. 36// Round to nearest even would produce somewhat more accurate results, but accomplishing something like that 37// rounding without knowledge of the current rounding mode is expensive. 38// 39// In principle, the 2**f part can further be broken up as: 40// 41// 2**f = 2**(c+r) = 2**c * 2**r 42// 43// Where c is chosen so as to make r as close to 0 as possible and c can be determined fairly exactly by 44// consulting a lookup table. Our choice of trunc above means we cant sure of the sign of f, 45// so our table ends up being twice as large as it would have if we had chosen floor or ceil. On the bright 46// side, we never have to worry about f = x - trunc(f) being inexact in single precision. 47// 48// Unfortunately, in practice this reduction seemed to be bad for throughput, so this implementation went with 49// a large polynomial instead that covers the range -1,1 or -0.5,0.5 depending on whether we know the rounding 50// mode is nearest or not. 51// 52// Most of the arithmetic will be done in double precision. I investigated calculating the fractional term as 2**r-1, so as 53// to preserve precision -- we expect a lot of zeros between the 1 and the first non-zero bit. Normally, we dont want 54// throw that away until the final operation so that non-default rounding modes arrive at something close to the 55// right answer. Doing float in double precision means that this is unnecessary however. 56// 57// This code is probably worth revisiting once more to see if we can come up with a cheap reduction. These huge 58// polynomials just arent good for latency. 59// 60// 61 62.align 4 63// 8th order minimax fit of exp2 on [-1.0,1.0]. |error| < 0.402865722354948566583852e-9: 64expf_c: .quad 0x40bc03f30399c376, 0x3ff000000001ea2a // c4/c8 = 0.961813690023115610862381719985771e-2 / 0.134107709538786543922336536865157e-5, c0 = 1.0 + 0.278626872016317130037181614004e-10 65 .quad 0x408f10e7f73e6d8f, 0x3fe62e42fd0933ee // c5/c8 = 0.133318252930790403741964203236548e-2 / 0.134107709538786543922336536865157e-5, c1 = .693147176943623740308984004029708 66 .quad 0x405cb616a9384e69, 0x3fcebfbdfd0f0afa // c6/c8 = 0.154016177542147239746127455226575e-3 / 0.134107709538786543922336536865157e-5, c2 = .240226505817268621584559118975830 67 .quad 0x4027173ebd288ba1, 0x3fac6b0a74f15403 // c7/c8 = 0.154832722143258821052933667742417e-4 / 0.134107709538786543922336536865157e-5, c3 = 0.555041568519883074165425891257052e-1 68 .quad 0x3eb67fe1dc3105ba // c8 = 0.134107709538786543922336536865157e-5 69 70 71.align 3 72expf_nofenv_c: .quad 0x3ff0000000058fca // 1.0 + -8.09329727503262700660348520172e-11 73 .double 0.693147206709644041218074094717934 74 .double 0.240226515050550309232521176082490 75 .double 0.0555032721577134480296332498187050 76 .double 0.00961799451418197772157647729048837 77 .double 0.00134004316655903280893023589328151 78 .double 0.000154780222739739895295319810010147 79 80.literal8 81reciprocalLn2: .quad 0x3ff71547652b82fe // 1.0 / ln(2) 82 83.text 84 85#if defined( __x86_64__ ) 86 #define RELATIVE_ADDR( _a) (_a)( %rip ) 87 #define RELATIVE_ADDR_B( _a) (_a)( %rip ) 88#elif defined( __i386__ ) 89 #define RELATIVE_ADDR( _a) (_a)-expf_body( CX_P ) 90 #define RELATIVE_ADDR_B( _a) (_a)-expf_no_fenv_body( CX_P ) 91 92//a short routine to get the local address 93.align 4 94expf_pic: movl (%esp), %ecx //copy address of local_addr to %ecx 95 ret 96#else 97 #error arch not supported 98#endif 99 100//Accurate to within 0.5068 ulp in round to nearest. 101ENTRY( expf ) 102#if defined( __i386__ ) 103 movl FRAME_SIZE(STACKP), %eax 104 movss FRAME_SIZE(STACKP), %xmm0 105#else 106 movd %xmm0, %eax 107#endif 108 109 cvtss2sd %xmm0, %xmm2 // (double) x 110 movl %eax, %ecx // x 111 andl $0x7fffffff, %eax // |x|vl 112 movl $1023, %edx // double precision bias 113 subl $0x33b8aa3b, %eax // subtract 0x1.0p-24f / ln(2) 114 cmpl $0x0f7d1d57, %eax // if( |x| > 126/ln(2) || isnan(x) || |x| < 0x1.0p-26f /ln(2) ) 115 ja 3f // goto 3 116 117 //The test above was a little too aggressive. 118 //values 126 < x < 128, and -150 < x < -126 will come back here: 119 //For x < -126, we change %edx to hold the exponent for 2**-126, 120 // and subtract -126 from x 121 1221: 123//PIC 124#if defined( __i386__ ) 125 calll expf_pic // set %ecx to point to local_addr 126expf_body: 127#endif 128 129 mulsd RELATIVE_ADDR( reciprocalLn2 ), %xmm2 130 lea RELATIVE_ADDR( expf_c), CX_P 131 132 133 //extract fractional part and exit early if it is zero 134 cvttsd2si %xmm2, %eax // trunc(x) 135 addl %eax, %edx // calculate biased exponent of result 136 cvtsi2sd %eax, %xmm1 // trunc(x) 137 movd %edx, %xmm7 // exponent part of result 138 psllq $52, %xmm7 // shift result exponent into place 139 subsd %xmm1, %xmm2 // get the fractional part 140 141 142 143 // c0 + c1x1 + c2x2 + c3x3 + c4x4 + c5x5 + c6x6 + c7x7 + c8x8 144#if defined( __SSE3__ ) 145 movddup %xmm2, %xmm1 // { x, x } 146#else 147 movapd %xmm2, %xmm1 // x 148 unpcklpd %xmm1, %xmm1 // { x, x } 149#endif 150 mulsd %xmm2, %xmm2 // x*x 151 movapd %xmm1, %xmm3 152 mulpd 48(CX_P), %xmm1 // { c3x, (c7/c8)x } 153 mulpd 16(CX_P), %xmm3 // { c1x, (c5/c8)x } 154#if defined( __SSE3__ ) 155 movddup %xmm2, %xmm4 // { xx, xx } 156#else 157 movapd %xmm2, %xmm4 // xx 158 unpcklpd %xmm4, %xmm4 // { xx, xx } 159#endif 160 mulsd %xmm2, %xmm2 // xx*xx 161 addpd 32(CX_P), %xmm1 // { c2 + c3x, (c6/c8) + (c7/c8)x } 162 addpd (CX_P), %xmm3 // { c0 + c1x, (c4/c8) + (c5/c8)x } 163 mulpd %xmm4, %xmm1 // { c2xx + c3xxx, (c6/c8)xx + (c7/c8)xxx } 164 addsd %xmm2, %xmm3 // { c0 + c1x, (c4/c8) + (c5/c8)x + xxxx } 165 mulsd 64(CX_P), %xmm2 // c8 * xxxx 166 addpd %xmm1, %xmm3 // { c0 + c1x + c2xx + c3xxx, (c4/c8) + (c5/c8)x + (c6/c8)xx + (c7/c8)xxx + xxxx } 167 movhlps %xmm3, %xmm6 // { ?, c0 + c1x + c2xx + c3xxx } 168 mulsd %xmm2, %xmm3 // { ..., c8xxxx* ((c4/c8) + (c5/c8)x + (c6/c8)xx + (c7/c8)xxx + xxxx) } 169 addsd %xmm6, %xmm3 // c0 + c1x + c2xx + c3xxx + c4xxxx + c5xxxxx + c6xxxxxx + c7xxxxxxx + c8xxxxxxxxx 170 mulsd %xmm7, %xmm3 // 2**i * {c0 + c1x + c2xx + c3xxx + c4xxxx + c5xxxxx + c6xxxxxx + c7xxxxxxx + c8xxxxxxxxx} 171 172// convert to single precision and return 173 cvtsd2ss %xmm3, %xmm0 174#if defined( __i386__ ) 175 movss %xmm0, FRAME_SIZE(STACKP) 176 flds FRAME_SIZE(STACKP) 177#endif 178 ret 179 180 1813: // |x| > 126 || isnan(x) || |x| < 0x1.0p-26f 182 jge 4f // if( |x| > 126 || isnan(x) ) goto 4 183 184 //fall through for common case of |x| < 0x1.0p-26f 185 movl $0x3f800000, %edx // 1.0f 186 movd %edx, %xmm1 // 1.0f 187 addss %xmm1, %xmm0 // round away from 1.0 as appropriate for current rounding mode 188#if defined( __i386__ ) 189 movss %xmm0, FRAME_SIZE(STACKP) 190 flds FRAME_SIZE(STACKP) 191#endif 192 ret 193 194 // |x| > 126 || isnan(x) 1954: addl $0x33b8aa3b, %eax // restore |x| 196 cmpl $0x7f800000, %eax // 197 ja 5f // if isnan(x) goto 5 198 je 6f // if isinf(x) goto 6 199 200 cmpl $0xc358677d, %ecx // if( x <= -150 /ln(2) ) 201 jae 7f // goto 7 202 203 cmpl $0x4338aa3b, %ecx // if( x >= 128 /ln(2) ) 204 jge 8f // goto 8 205 206 jmp 1b 207 208 2095: //x is nan 210 addss %xmm0, %xmm0 //Quiet the NaN 211#if defined( __i386__ ) 212 movss %xmm0, FRAME_SIZE(STACKP) 213 flds FRAME_SIZE(STACKP) 214#endif 215 ret 216 2176: // |x| is inf 218 movss %xmm0, %xmm1 // x 219 psrad $31, %xmm0 // x == -Inf ? -1U : 0 220 andnps %xmm1, %xmm0 // x == -Inf ? 0 : Inf 221#if defined( __i386__ ) 222 movss %xmm0, FRAME_SIZE(STACKP) 223 flds FRAME_SIZE(STACKP) 224#endif 225 ret 226 2277: // x <= -150 228 movl $0x00800001, %eax 229 movd %eax, %xmm0 230 mulss %xmm0, %xmm0 //Create correctly rounded result, set inexact/underflow 231#if defined( __i386__ ) 232 movss %xmm0, FRAME_SIZE(STACKP) 233 flds FRAME_SIZE(STACKP) 234#endif 235 ret 236 2378: // x >= 128.0 238 movl $0x7f7fffff, %eax // FLT_MAX 239 movd %eax, %xmm0 // FLT_MAX 240 addss %xmm0, %xmm0 // Inf, set overflow 241#if defined( __i386__ ) 242 movss %xmm0, FRAME_SIZE(STACKP) 243 flds FRAME_SIZE(STACKP) 244#endif 245 ret 246 247//Uses round to nearest to deliver a reduced value between [ -0.5, 0.5 ] 248//This allows us to use a smaller polynomial 249//In addition, we dont bother to check for x is integer. This saves a few cycles in the general case. 250//Accurate to within 0.534 ulp 251ENTRY( expf$fenv_access_off ) 252#if defined( __i386__ ) 253 movl FRAME_SIZE(STACKP), %eax 254 movss FRAME_SIZE(STACKP), %xmm0 255#else 256 movd %xmm0, %eax 257#endif 258 259 cvtss2sd %xmm0, %xmm2 // (double) x 260 movl %eax, %ecx // x 261 andl $0x7fffffff, %eax // |x|vl 262 movl $1023, %edx // double precision bias 263 subl $0x33b8aa3b, %eax // subtract 0x1.0p-24f / ln(2) 264 cmpl $0x0f7d1d57, %eax // if( |x| > 126/ln(2) || isnan(x) || |x| < 0x1.0p-26f /ln(2) ) 265 ja 3f // goto 3 266 267 //The test above was a little too aggressive. 268 //values 126 < x < 128, and -150 < x < -126 will come back here: 269 //For x < -126, we change %edx to hold the exponent for 2**-126, 270 // and subtract -126 from x 271 2721: 273//PIC 274#if defined( __i386__ ) 275 calll expf_pic // set %ecx to point to local_addr 276expf_no_fenv_body: 277#endif 278 279 mulsd RELATIVE_ADDR( reciprocalLn2 ), %xmm2 280 lea RELATIVE_ADDR_B( expf_nofenv_c), CX_P 281 282 //extract fractional part and exit early if it is zero 283 andl $0x80000000, %ecx // signof(x) 284 orl $0x43300000, %ecx // copysign( 0x1.0p52, x ) >> 32 285 movd %ecx, %xmm6 // copysign( 0x1.0p52, x ) >> 32 286 movaps %xmm2, %xmm1 // x 287 psllq $32, %xmm6 // copysign( 0x1.0p52, x ) 288 addsd %xmm6, %xmm2 // x + copysign( 0x1.0p23f, x ) 289 subsd %xmm6, %xmm2 // rint(x) 290 cvttsd2si %xmm2, %eax // rint(x) 291 addl %eax, %edx // rint(x) + bias 292 movd %edx, %xmm7 // result exponent >> 32 293 subsd %xmm2, %xmm1 // fractional part 294 psllq $52, %xmm7 // shift result exponent into place 295 296 movsd (3*8)(CX_P), %xmm3 // c3 297 movsd (5*8)(CX_P), %xmm5 // c5 298 movsd (6*8)(CX_P), %xmm6 // c6 299 movapd %xmm1, %xmm2 // x 300 mulsd %xmm1, %xmm1 // x*x 301 mulsd %xmm2, %xmm3 // c3*x 302 mulsd %xmm2, %xmm5 // c5*x 303 mulsd (1*8)(CX_P), %xmm2 // c1*x 304 mulsd %xmm1, %xmm6 // c6*xx 305 addsd (2*8)(CX_P), %xmm3 // c2+c3x 306 movapd %xmm1, %xmm4 // xx 307 mulsd %xmm1, %xmm1 // xx*xx 308 addsd (4*8)(CX_P), %xmm5 // c4+c5x 309 addsd (CX_P), %xmm2 // c0+c1x 310 mulsd %xmm4, %xmm3 // c2xx+c3xxx 311 addsd %xmm6, %xmm5 // c4+c5x+c6xx 312 mulsd %xmm1, %xmm5 // c4xxxx+c5xxxxx+c6xxxxxx 313 addsd %xmm2, %xmm3 // c0+c1x+c2xx+c3xxx 314 addsd %xmm5, %xmm3 // c0+c1x+c2xx+c3xxx+c4xxxx+c5xxxxx+c6xxxxxx 315 316 mulsd %xmm7, %xmm3 // scale by 2**i 317 318// convert to single precision and return 319 cvtsd2ss %xmm3, %xmm0 320#if defined( __i386__ ) 321 movss %xmm0, FRAME_SIZE(STACKP) 322 flds FRAME_SIZE(STACKP) 323#endif 324 ret 325 326 3273: // |x| > 126 || isnan(x) || |x| < 0x1.0p-26f 328 jge 4f // if( |x| > 126 || isnan(x) ) goto 4 329 330 //fall through for common case of |x| < 0x1.0p-26f 331 movl $0x3f800000, %edx // 1.0f 332 movd %edx, %xmm1 // 1.0f 333 addss %xmm1, %xmm0 // round away from 1.0 as appropriate for current rounding mode 334#if defined( __i386__ ) 335 movss %xmm0, FRAME_SIZE(STACKP) 336 flds FRAME_SIZE(STACKP) 337#endif 338 ret 339 340 // |x| > 126 || isnan(x) 3414: addl $0x33b8aa3b, %eax // restore |x| 342 cmpl $0x7f800000, %eax // 343 ja 5f // if isnan(x) goto 5 344 je 6f // if isinf(x) goto 6 345 346 cmpl $0xc358677d, %ecx // if( x <= -150 /ln(2) ) 347 jae 7f // goto 7 348 349 cmpl $0x4338aa3b, %ecx // if( x >= 128 /ln(2) ) 350 jge 8f // goto 8 351 352 jmp 1b 353 354 3555: //x is nan 356 addss %xmm0, %xmm0 //Quiet the NaN 357#if defined( __i386__ ) 358 movss %xmm0, FRAME_SIZE(STACKP) 359 flds FRAME_SIZE(STACKP) 360#endif 361 ret 362 3636: // |x| is inf 364 movss %xmm0, %xmm1 // x 365 psrad $31, %xmm0 // x == -Inf ? -1U : 0 366 andnps %xmm1, %xmm0 // x == -Inf ? 0 : Inf 367#if defined( __i386__ ) 368 movss %xmm0, FRAME_SIZE(STACKP) 369 flds FRAME_SIZE(STACKP) 370#endif 371 ret 372 3737: // x <= -150 374 movl $0x00800001, %eax 375 movd %eax, %xmm0 376 mulss %xmm0, %xmm0 //Create correctly rounded result, set inexact/underflow 377#if defined( __i386__ ) 378 movss %xmm0, FRAME_SIZE(STACKP) 379 flds FRAME_SIZE(STACKP) 380#endif 381 ret 382 3838: // x >= 128.0 384 movl $0x7f800000, %eax // Inf 385#if defined( __i386__ ) 386 movl %eax, FRAME_SIZE(STACKP) 387 flds FRAME_SIZE(STACKP) 388#else 389 movd %eax, %xmm0 390#endif 391 ret 392