src/libm/Source/Intel/expf.S at fixPythonPipStalling

overby.me / darling-nix
fork atom
this repo has no description
fork atom
darling-nix / src / libm / Source / Intel / expf.S
at fixPythonPipStalling 392 lines 17 kB view raw
wrap content
Lubos Dolezel More progress 9y ago
a76db625
  1/*
  2 *  expf.s
  3 *
  4 *      by Ian Ollmann
  5 *
  6 *  Copyright (c) 2007, Apple Inc.  All Rights Reserved.
  7 *
  8 *  This file implements the C99 expf function for the MacOS X __i386__ and __x86_64__ abis. 
  9 */
 10
 11#include <machine/asm.h>
 12#include "abi.h"
 13
 14//
 15//  Overall approach
 16//
 17//      We calculate e**(y) as follows:
 18//
 19//          e**y = 2**x         x = y / ln(2)
 20//
 21//      ...and proceed largely as we do in exp2f():
 22//
 23//          i = trunc(x)
 24//          f = x - i
 25//          
 26//          2**(x) = 2**(f+i) = 2**f * 2**i
 27//
 28//      We choose trunc here, so that 2**f always has the property of pushing 2**i that much closer to the 
 29//      extrema of finite, non-zero floating point computation. That is:
 30//
 31//              x < 1  --> 2**f <= 1
 32//              x == 1 --> 2**f = 1
 33//              x > 1  --> 2**f >= 1
 34//
 35//      This means that our 2**i calculation will not underflow/overflow prematurely before 2**f has had its say.
 36//      Round to nearest even would produce somewhat more accurate results, but accomplishing something like that 
 37//      rounding without knowledge of the current rounding mode is expensive.
 38//      
 39//      In principle, the 2**f part can further be broken up as:
 40//
 41//          2**f = 2**(c+r) = 2**c * 2**r
 42//
 43//      Where c is chosen so as to make r as close to 0 as possible and c can be determined fairly exactly by 
 44//      consulting a lookup table. Our choice of trunc above means we cant sure of the sign of f,
 45//      so our table ends up being twice as large as it would have if we had chosen floor or ceil. On the bright 
 46//      side, we never have to worry about f = x - trunc(f) being inexact in single precision.
 47//
 48//      Unfortunately, in practice this reduction seemed to be bad for throughput, so this implementation went with
 49//      a large polynomial instead that covers the range -1,1 or -0.5,0.5 depending on whether we know the rounding
 50//      mode is nearest or not.
 51//
 52//      Most of the arithmetic will be done in double precision. I investigated calculating the fractional term as 2**r-1, so as 
 53//      to preserve precision -- we expect a lot of zeros between the 1 and the first non-zero bit. Normally, we dont want
 54//      throw that away until the final operation so that non-default rounding modes arrive at something close to the 
 55//      right answer. Doing float in double precision means that this is unnecessary however. 
 56//
 57//      This code is probably worth revisiting once more to see if we can come up with a cheap reduction. These huge
 58//      polynomials just arent good for latency.
 59//
 60//
 61
 62.align 4
 63// 8th order minimax fit of exp2 on [-1.0,1.0].  |error| < 0.402865722354948566583852e-9:
 64expf_c:         .quad 0x40bc03f30399c376,   0x3ff000000001ea2a          // c4/c8 = 0.961813690023115610862381719985771e-2 / 0.134107709538786543922336536865157e-5, c0 = 1.0 + 0.278626872016317130037181614004e-10
 65                .quad 0x408f10e7f73e6d8f,   0x3fe62e42fd0933ee	        // c5/c8 = 0.133318252930790403741964203236548e-2 / 0.134107709538786543922336536865157e-5, c1 = .693147176943623740308984004029708   
 66                .quad 0x405cb616a9384e69,   0x3fcebfbdfd0f0afa	        // c6/c8 = 0.154016177542147239746127455226575e-3 / 0.134107709538786543922336536865157e-5, c2 = .240226505817268621584559118975830   
 67                .quad 0x4027173ebd288ba1,   0x3fac6b0a74f15403	        // c7/c8 = 0.154832722143258821052933667742417e-4 / 0.134107709538786543922336536865157e-5, c3 = 0.555041568519883074165425891257052e-1 
 68                .quad 0x3eb67fe1dc3105ba                                // c8 = 0.134107709538786543922336536865157e-5
 69
 70
 71.align 3
 72expf_nofenv_c:  .quad   0x3ff0000000058fca                          // 1.0 + -8.09329727503262700660348520172e-11
 73                .double 0.693147206709644041218074094717934
 74                .double 0.240226515050550309232521176082490
 75                .double 0.0555032721577134480296332498187050
 76                .double 0.00961799451418197772157647729048837
 77                .double 0.00134004316655903280893023589328151
 78                .double 0.000154780222739739895295319810010147
 79
 80.literal8
 81reciprocalLn2:  .quad   0x3ff71547652b82fe                          // 1.0 / ln(2)
 82
 83.text
 84
 85#if defined( __x86_64__ )
 86	#define RELATIVE_ADDR( _a)								(_a)( %rip )
 87	#define RELATIVE_ADDR_B( _a)							(_a)( %rip )
 88#elif defined( __i386__ )
 89	#define RELATIVE_ADDR( _a)								(_a)-expf_body( CX_P )
 90	#define RELATIVE_ADDR_B( _a)							(_a)-expf_no_fenv_body( CX_P )
 91
 92//a short routine to get the local address
 93.align 4
 94expf_pic:       movl    (%esp),     %ecx                    //copy address of local_addr to %ecx
 95                ret
 96#else
 97	#error arch not supported
 98#endif
 99
100//Accurate to within 0.5068 ulp in round to nearest. 
101ENTRY( expf )
102#if defined( __i386__ )
103    movl        FRAME_SIZE(STACKP),     %eax
104    movss       FRAME_SIZE(STACKP),     %xmm0
105#else
106    movd        %xmm0,                  %eax
107#endif
108    
109    cvtss2sd    %xmm0,                  %xmm2       // (double) x
110    movl        %eax,                   %ecx        // x
111    andl        $0x7fffffff,            %eax        // |x|vl
112    movl        $1023,                  %edx        // double precision bias
113    subl        $0x33b8aa3b,            %eax        // subtract 0x1.0p-24f / ln(2)
114    cmpl        $0x0f7d1d57,            %eax        // if( |x| > 126/ln(2) || isnan(x) || |x| < 0x1.0p-26f /ln(2) )
115    ja          3f                                  //      goto 3
116
117    //The test above was a little too aggressive. 
118    //values 126 < x < 128, and -150 < x < -126 will come back here:
119    //For x < -126, we change %edx to hold the exponent for 2**-126, 
120    //      and subtract -126 from x
121
1221:
123//PIC
124#if defined( __i386__ )
125    calll       expf_pic                                    // set %ecx to point to local_addr
126expf_body:   
127#endif
128
129    mulsd       RELATIVE_ADDR( reciprocalLn2 ), %xmm2
130    lea         RELATIVE_ADDR( expf_c), CX_P
131
132
133    //extract fractional part and exit early if it is zero
134    cvttsd2si   %xmm2,                  %eax        // trunc(x)
135    addl        %eax,                   %edx        // calculate biased exponent of result
136    cvtsi2sd    %eax,                   %xmm1       // trunc(x)
137    movd        %edx,                   %xmm7       // exponent part of result
138    psllq       $52,                    %xmm7       // shift result exponent into place
139    subsd       %xmm1,                  %xmm2       // get the fractional part
140
141
142    
143    // c0 + c1x1 + c2x2 + c3x3 + c4x4 + c5x5 + c6x6 + c7x7 + c8x8 
144#if defined( __SSE3__ )
145    movddup     %xmm2,                  %xmm1       // { x, x }
146#else
147    movapd      %xmm2,                  %xmm1       // x
148    unpcklpd    %xmm1,                  %xmm1       // { x, x }
149#endif    
150    mulsd       %xmm2,                  %xmm2       // x*x
151    movapd      %xmm1,                  %xmm3
152    mulpd       48(CX_P),               %xmm1       // { c3x, (c7/c8)x }
153    mulpd       16(CX_P),               %xmm3       // { c1x, (c5/c8)x }
154#if defined( __SSE3__ )
155    movddup     %xmm2,                  %xmm4       // { xx, xx }
156#else
157    movapd      %xmm2,                  %xmm4       // xx
158    unpcklpd    %xmm4,                  %xmm4       // { xx, xx }
159#endif
160    mulsd       %xmm2,                  %xmm2       // xx*xx
161    addpd       32(CX_P),               %xmm1       // { c2 + c3x, (c6/c8) + (c7/c8)x }
162    addpd       (CX_P),                 %xmm3       // { c0 + c1x, (c4/c8) + (c5/c8)x }
163    mulpd       %xmm4,                  %xmm1       // { c2xx + c3xxx, (c6/c8)xx + (c7/c8)xxx }
164    addsd       %xmm2,                  %xmm3       // { c0 + c1x, (c4/c8) + (c5/c8)x + xxxx }
165    mulsd       64(CX_P),               %xmm2       // c8 * xxxx
166    addpd       %xmm1,                  %xmm3       // { c0 + c1x + c2xx + c3xxx, (c4/c8) + (c5/c8)x + (c6/c8)xx + (c7/c8)xxx + xxxx }
167    movhlps     %xmm3,                  %xmm6       // { ?, c0 + c1x + c2xx + c3xxx }
168    mulsd       %xmm2,                  %xmm3       // { ..., c8xxxx* ((c4/c8) + (c5/c8)x + (c6/c8)xx + (c7/c8)xxx + xxxx) }
169    addsd       %xmm6,                  %xmm3       // c0 + c1x + c2xx + c3xxx + c4xxxx + c5xxxxx + c6xxxxxx + c7xxxxxxx + c8xxxxxxxxx
170    mulsd       %xmm7,                  %xmm3       // 2**i * {c0 + c1x + c2xx + c3xxx + c4xxxx + c5xxxxx + c6xxxxxx + c7xxxxxxx + c8xxxxxxxxx}
171    
172//  convert to single precision and return
173    cvtsd2ss    %xmm3,                  %xmm0
174#if defined( __i386__ )
175    movss       %xmm0,                  FRAME_SIZE(STACKP)
176    flds        FRAME_SIZE(STACKP)
177#endif
178    ret
179        
180
1813:  // |x| > 126 || isnan(x) || |x| < 0x1.0p-26f
182    jge          4f                                  // if( |x| > 126 || isnan(x) )   goto 4
183
184    //fall through for common case of |x| < 0x1.0p-26f
185    movl        $0x3f800000,            %edx        // 1.0f
186    movd        %edx,                   %xmm1       // 1.0f
187    addss       %xmm1,                  %xmm0       // round away from 1.0 as appropriate for current rounding mode
188#if defined( __i386__ )
189    movss       %xmm0,                  FRAME_SIZE(STACKP)
190    flds        FRAME_SIZE(STACKP)
191#endif
192    ret
193
194    // |x| > 126 || isnan(x)
1954:  addl        $0x33b8aa3b,            %eax        // restore |x|
196    cmpl        $0x7f800000,            %eax        // 
197    ja          5f                                  // if isnan(x) goto 5
198    je          6f                                  // if isinf(x) goto 6
199    
200    cmpl        $0xc358677d,            %ecx        // if( x <= -150 /ln(2) )
201    jae         7f                                  //      goto 7
202
203    cmpl        $0x4338aa3b,            %ecx        // if( x >= 128 /ln(2) )
204    jge         8f                                  //      goto 8
205
206    jmp         1b
207    
208
2095:  //x is nan
210    addss       %xmm0,                  %xmm0       //Quiet the NaN
211#if defined( __i386__ )
212    movss       %xmm0,                  FRAME_SIZE(STACKP)
213    flds        FRAME_SIZE(STACKP)
214#endif
215    ret
216    
2176: // |x| is inf
218    movss       %xmm0,                  %xmm1       // x
219    psrad       $31,                    %xmm0       // x == -Inf ? -1U : 0
220    andnps       %xmm1,                 %xmm0       // x == -Inf ? 0 : Inf
221#if defined( __i386__ )
222    movss       %xmm0,                  FRAME_SIZE(STACKP)
223    flds        FRAME_SIZE(STACKP)
224#endif
225    ret
226    
2277:  // x <= -150
228    movl        $0x00800001,            %eax
229    movd        %eax,                   %xmm0
230    mulss       %xmm0,                  %xmm0       //Create correctly rounded result, set inexact/underflow
231#if defined( __i386__ )
232    movss       %xmm0,                  FRAME_SIZE(STACKP)
233    flds        FRAME_SIZE(STACKP)
234#endif
235    ret
236    
2378:  // x >= 128.0
238    movl        $0x7f7fffff,            %eax        // FLT_MAX
239    movd        %eax,                   %xmm0       // FLT_MAX
240    addss       %xmm0,                  %xmm0       // Inf, set overflow
241#if defined( __i386__ )
242    movss       %xmm0,                  FRAME_SIZE(STACKP)
243    flds        FRAME_SIZE(STACKP)
244#endif
245    ret
246
247//Uses round to nearest to deliver a reduced value between [ -0.5, 0.5 ]
248//This allows us to use a smaller polynomial
249//In addition, we dont bother to check for x is integer. This saves a few cycles in the general case.
250//Accurate to within 0.534 ulp
251ENTRY( expf$fenv_access_off )
252#if defined( __i386__ )
253    movl        FRAME_SIZE(STACKP),     %eax
254    movss       FRAME_SIZE(STACKP),     %xmm0
255#else
256    movd        %xmm0,                  %eax
257#endif
258    
259    cvtss2sd    %xmm0,                  %xmm2       // (double) x
260    movl        %eax,                   %ecx        // x
261    andl        $0x7fffffff,            %eax        // |x|vl
262    movl        $1023,                  %edx        // double precision bias
263    subl        $0x33b8aa3b,            %eax        // subtract 0x1.0p-24f / ln(2)
264    cmpl        $0x0f7d1d57,            %eax        // if( |x| > 126/ln(2) || isnan(x) || |x| < 0x1.0p-26f /ln(2) )
265    ja          3f                                  //      goto 3
266
267    //The test above was a little too aggressive. 
268    //values 126 < x < 128, and -150 < x < -126 will come back here:
269    //For x < -126, we change %edx to hold the exponent for 2**-126, 
270    //      and subtract -126 from x
271
2721:
273//PIC
274#if defined( __i386__ )
275    calll       expf_pic                                    // set %ecx to point to local_addr
276expf_no_fenv_body:   
277#endif
278
279    mulsd       RELATIVE_ADDR( reciprocalLn2 ), %xmm2
280    lea         RELATIVE_ADDR_B( expf_nofenv_c), CX_P
281
282    //extract fractional part and exit early if it is zero
283    andl        $0x80000000,            %ecx        // signof(x)
284    orl         $0x43300000,            %ecx        // copysign( 0x1.0p52, x ) >> 32
285    movd        %ecx,                   %xmm6       // copysign( 0x1.0p52, x ) >> 32
286    movaps      %xmm2,                  %xmm1       // x
287    psllq       $32,                    %xmm6       // copysign( 0x1.0p52, x )
288    addsd       %xmm6,                  %xmm2       // x + copysign( 0x1.0p23f, x )
289    subsd       %xmm6,                  %xmm2       // rint(x)
290    cvttsd2si   %xmm2,                  %eax        // rint(x)
291    addl        %eax,                   %edx        // rint(x) + bias
292    movd        %edx,                   %xmm7       // result exponent >> 32
293    subsd       %xmm2,                  %xmm1       // fractional part
294    psllq       $52,                    %xmm7       // shift result exponent into place
295
296    movsd       (3*8)(CX_P),            %xmm3       // c3
297    movsd       (5*8)(CX_P),            %xmm5       // c5
298    movsd       (6*8)(CX_P),            %xmm6       // c6
299    movapd      %xmm1,                  %xmm2       // x
300    mulsd       %xmm1,                  %xmm1       // x*x
301    mulsd       %xmm2,                  %xmm3       // c3*x
302    mulsd       %xmm2,                  %xmm5       // c5*x
303    mulsd       (1*8)(CX_P),            %xmm2       // c1*x
304    mulsd       %xmm1,                  %xmm6       // c6*xx
305    addsd       (2*8)(CX_P),            %xmm3       // c2+c3x
306    movapd      %xmm1,                  %xmm4       // xx
307    mulsd       %xmm1,                  %xmm1       // xx*xx
308    addsd       (4*8)(CX_P),            %xmm5       // c4+c5x
309    addsd       (CX_P),                 %xmm2       // c0+c1x
310    mulsd       %xmm4,                  %xmm3       // c2xx+c3xxx
311    addsd       %xmm6,                  %xmm5       // c4+c5x+c6xx
312    mulsd       %xmm1,                  %xmm5       // c4xxxx+c5xxxxx+c6xxxxxx
313    addsd       %xmm2,                  %xmm3       // c0+c1x+c2xx+c3xxx
314    addsd       %xmm5,                  %xmm3       // c0+c1x+c2xx+c3xxx+c4xxxx+c5xxxxx+c6xxxxxx
315
316    mulsd       %xmm7,                  %xmm3       // scale by 2**i
317    
318//  convert to single precision and return
319    cvtsd2ss    %xmm3,                  %xmm0
320#if defined( __i386__ )
321    movss       %xmm0,                  FRAME_SIZE(STACKP)
322    flds        FRAME_SIZE(STACKP)
323#endif
324    ret
325        
326
3273:  // |x| > 126 || isnan(x) || |x| < 0x1.0p-26f
328    jge          4f                                  // if( |x| > 126 || isnan(x) )   goto 4
329
330    //fall through for common case of |x| < 0x1.0p-26f
331    movl        $0x3f800000,            %edx        // 1.0f
332    movd        %edx,                   %xmm1       // 1.0f
333    addss       %xmm1,                  %xmm0       // round away from 1.0 as appropriate for current rounding mode
334#if defined( __i386__ )
335    movss       %xmm0,                  FRAME_SIZE(STACKP)
336    flds        FRAME_SIZE(STACKP)
337#endif
338    ret
339
340    // |x| > 126 || isnan(x)
3414:  addl        $0x33b8aa3b,            %eax        // restore |x|
342    cmpl        $0x7f800000,            %eax        // 
343    ja          5f                                  // if isnan(x) goto 5
344    je          6f                                  // if isinf(x) goto 6
345    
346    cmpl        $0xc358677d,            %ecx        // if( x <= -150 /ln(2) )
347    jae         7f                                  //      goto 7
348
349    cmpl        $0x4338aa3b,            %ecx        // if( x >= 128 /ln(2) )
350    jge         8f                                  //      goto 8
351
352    jmp         1b
353    
354
3555:  //x is nan
356    addss       %xmm0,                  %xmm0       //Quiet the NaN
357#if defined( __i386__ )
358    movss       %xmm0,                  FRAME_SIZE(STACKP)
359    flds        FRAME_SIZE(STACKP)
360#endif
361    ret
362    
3636: // |x| is inf
364    movss       %xmm0,                  %xmm1       // x
365    psrad       $31,                    %xmm0       // x == -Inf ? -1U : 0
366    andnps       %xmm1,                 %xmm0       // x == -Inf ? 0 : Inf
367#if defined( __i386__ )
368    movss       %xmm0,                  FRAME_SIZE(STACKP)
369    flds        FRAME_SIZE(STACKP)
370#endif
371    ret
372    
3737:  // x <= -150
374    movl        $0x00800001,            %eax
375    movd        %eax,                   %xmm0
376    mulss       %xmm0,                  %xmm0       //Create correctly rounded result, set inexact/underflow
377#if defined( __i386__ )
378    movss       %xmm0,                  FRAME_SIZE(STACKP)
379    flds        FRAME_SIZE(STACKP)
380#endif
381    ret
382    
3838:  // x >= 128.0
384    movl        $0x7f800000,            %eax        // Inf
385#if defined( __i386__ )
386    movl        %eax,                   FRAME_SIZE(STACKP)
387    flds        FRAME_SIZE(STACKP)
388#else
389    movd        %eax,                   %xmm0
390#endif
391    ret
392