src/libm/Source/Intel/exp2f.S at fixPythonPipStalling

overby.me / darling-nix
fork atom
this repo has no description
fork atom
darling-nix / src / libm / Source / Intel / exp2f.S
at fixPythonPipStalling 406 lines 18 kB view raw
wrap content
Lubos Dolezel More progress 9y ago
a76db625
  1/*
  2 *  exp2f.s
  3 *
  4 *      by Ian Ollmann
  5 *
  6 *  Copyright (c) 2007, Apple Inc.  All Rights Reserved.
  7 *
  8 *  This file implements the C99 exp2f function for the MacOS X __i386__ and __x86_64__ abis. 
  9 */
 10
 11#include <machine/asm.h>
 12#include "abi.h"
 13
 14//
 15//  Overall approach
 16//
 17//      We break up 2**(x) as follows:
 18//
 19//          i = trunc(x)
 20//          f = x - i
 21//          
 22//          2**(x) = 2**(f+i) = 2**f * 2**i
 23//
 24//      We choose trunc here, so that 2**f always has the property of pushing 2**i that much closer to the 
 25//      extrema of finite, non-zero floating point computation. That is:
 26//
 27//              x < 1  --> 2**f <= 1
 28//              x == 1 --> 2**f = 1
 29//              x > 1  --> 2**f >= 1
 30//
 31//      This means that our 2**i calculation will not underflow/overflow prematurely before 2**f has had its say.
 32//      Round to nearest even would produce somewhat more accurate results, but accomplishing something like that 
 33//      rounding without knowledge of the current rounding mode is expensive.
 34//      
 35//      In principle, the 2**f part can further be broken up as:
 36//
 37//          2**f = 2**(c+r) = 2**c * 2**r
 38//
 39//      Where c is chosen so as to make r as close to 0 as possible and c can be determined fairly exactly by 
 40//      consulting a lookup table. Our choice of trunc above means we cant sure of the sign of f,
 41//      so our table ends up being twice as large as it would have if we had chosen floor or ceil. On the bright 
 42//      side, we never have to worry about f = x - trunc(f) being inexact in single precision.
 43//
 44//      Unfortunately, in practice this reduction seemed to be bad for throughput, so this implementation went with
 45//      a large polynomial instead that covers the range -1,1 or -0.5,0.5 depending on whether we know the rounding
 46//      mode is nearest or not.
 47//
 48//      Most of the arithmetic will be done in double precision. I investigated calculating the fractional term as 2**r-1, so as 
 49//      to preserve precision -- we expect a lot of zeros between the 1 and the first non-zero bit. Normally, we dont want
 50//      throw that away until the final operation so that non-default rounding modes arrive at something close to the 
 51//      right answer. Doing float in double precision means that this is unnecessary however. 
 52//
 53//      This code is probably worth revisiting once more to see if we can come up with a cheap reduction. These huge
 54//      polynomials just arent good for latency.
 55//
 56//
 57
 58.align 4
 59// 8th order minimax fit of exp2 on [-1.0,1.0].  |error| < 0.402865722354948566583852e-9:
 60exp2f_c:        .quad 0x40bc03f30399c376,   0x3ff000000001ea2a          // c4/c8 = 0.961813690023115610862381719985771e-2 / 0.134107709538786543922336536865157e-5, c0 = 1.0 + 0.278626872016317130037181614004e-10
 61                .quad 0x408f10e7f73e6d8f,   0x3fe62e42fd0933ee	        // c5/c8 = 0.133318252930790403741964203236548e-2 / 0.134107709538786543922336536865157e-5, c1 = .693147176943623740308984004029708   
 62                .quad 0x405cb616a9384e69,   0x3fcebfbdfd0f0afa	        // c6/c8 = 0.154016177542147239746127455226575e-3 / 0.134107709538786543922336536865157e-5, c2 = .240226505817268621584559118975830   
 63                .quad 0x4027173ebd288ba1,   0x3fac6b0a74f15403	        // c7/c8 = 0.154832722143258821052933667742417e-4 / 0.134107709538786543922336536865157e-5, c3 = 0.555041568519883074165425891257052e-1 
 64                .quad 0x3eb67fe1dc3105ba                                // c8 = 0.134107709538786543922336536865157e-5
 65
 66
 67.align 3
 68exp2f_nofenv_c: .quad   0x3ff0000000058fca                          // 1.0 + -8.09329727503262700660348520172e-11
 69                .double 0.693147206709644041218074094717934
 70                .double 0.240226515050550309232521176082490
 71                .double 0.0555032721577134480296332498187050
 72                .double 0.00961799451418197772157647729048837
 73                .double 0.00134004316655903280893023589328151
 74                .double 0.000154780222739739895295319810010147
 75
 76
 77
 78.text
 79
 80#if defined( __x86_64__ )
 81	#define RELATIVE_ADDR( _a)								(_a)( %rip )
 82	#define RELATIVE_ADDR_B( _a)							(_a)( %rip )
 83#elif defined( __i386__ )
 84	#define RELATIVE_ADDR( _a)								(_a)-exp2f_body( CX_P )
 85	#define RELATIVE_ADDR_B( _a)							(_a)-exp2f_no_fenv_body( CX_P )
 86
 87//a short routine to get the local address
 88.align 4
 89exp2f_pic:      movl    (%esp),     %ecx                    //copy address of local_addr to %ecx
 90                ret
 91#else
 92	#error arch not supported
 93#endif
 94
 95//Accurate to within 0.5034 ulp in round to nearest. 
 96ENTRY( exp2f )
 97#if defined( __i386__ )
 98    movl        FRAME_SIZE(STACKP),     %eax
 99    movss       FRAME_SIZE(STACKP),     %xmm0
100#else
101    movd        %xmm0,                  %eax
102#endif
103    
104    movl        %eax,                   %ecx        // x
105    andl        $0x7fffffff,            %eax        // |x|vl
106    movl        $1023,                  %edx        // double precision bias
107    subl        $0x32800000,            %eax        // subtract 0x1.0p-26f
108    cmpl        $0x107c0000,            %eax        // if( |x| > 126 || isnan(x) || |x| < 0x1.0p-26f )
109    ja          3f                                  //      goto 3
110
111    //The test above was a little too aggressive. 
112    //values 126 < x < 128, and -150 < x < -126 will come back here:
113    //For x < -126, we change %edx to hold the exponent for 2**-126, 
114    //      and subtract -126 from x
115
1161:  //extract fractional part and exit early if it is zero
117    cvttss2si   %xmm0,                  %eax        // trunc(x)
118    cvtss2sd    %xmm0,                  %xmm2       // x
119    addl        %eax,                   %edx        // calculate biased exponent of result
120    cvtsi2sd    %eax,                   %xmm1       // trunc(x)
121    movd        %edx,                   %xmm7       // exponent part of result
122    ucomisd     %xmm2,                  %xmm1       // check to see if x was an integer
123    psllq       $52,                    %xmm7       // shift result exponent into place
124    je          2f                                  // if x was an integer, goto 2
125
126//PIC
127#if defined( __i386__ )
128    calll       exp2f_pic                                    // set %ecx to point to local_addr
129exp2f_body:   
130#endif
131
132    subsd       %xmm1,                  %xmm2       // get the fractional part
133    lea         RELATIVE_ADDR( exp2f_c), CX_P
134    
135    // c0 + c1x1 + c2x2 + c3x3 + c4x4 + c5x5 + c6x6 + c7x7 + c8x8 
136#if defined( __SSE3__ )
137    movddup     %xmm2,                  %xmm1       // { x, x }
138#else
139    movapd      %xmm2,                  %xmm1       // x
140    unpcklpd    %xmm1,                  %xmm1       // { x, x }
141#endif    
142    mulsd       %xmm2,                  %xmm2       // x*x
143    movapd      %xmm1,                  %xmm3
144    mulpd       48(CX_P),               %xmm1       // { c3x, (c7/c8)x }
145    mulpd       16(CX_P),               %xmm3       // { c1x, (c5/c8)x }
146#if defined( __SSE3__ )
147    movddup     %xmm2,                  %xmm4       // { xx, xx }
148#else
149    movapd      %xmm2,                  %xmm4       // xx
150    unpcklpd    %xmm4,                  %xmm4       // { xx, xx }
151#endif
152    mulsd       %xmm2,                  %xmm2       // xx*xx
153    addpd       32(CX_P),               %xmm1       // { c2 + c3x, (c6/c8) + (c7/c8)x }
154    addpd       (CX_P),                 %xmm3       // { c0 + c1x, (c4/c8) + (c5/c8)x }
155    mulpd       %xmm4,                  %xmm1       // { c2xx + c3xxx, (c6/c8)xx + (c7/c8)xxx }
156    addsd       %xmm2,                  %xmm3       // { c0 + c1x, (c4/c8) + (c5/c8)x + xxxx }
157    mulsd       64(CX_P),               %xmm2       // c8 * xxxx
158    addpd       %xmm1,                  %xmm3       // { c0 + c1x + c2xx + c3xxx, (c4/c8) + (c5/c8)x + (c6/c8)xx + (c7/c8)xxx + xxxx }
159    movhlps     %xmm3,                  %xmm6       // { ?, c0 + c1x + c2xx + c3xxx }
160    mulsd       %xmm2,                  %xmm3       // { ..., c8xxxx* ((c4/c8) + (c5/c8)x + (c6/c8)xx + (c7/c8)xxx + xxxx) }
161    addsd       %xmm6,                  %xmm3       // c0 + c1x + c2xx + c3xxx + c4xxxx + c5xxxxx + c6xxxxxx + c7xxxxxxx + c8xxxxxxxxx
162    mulsd       %xmm7,                  %xmm3       // 2**i * {c0 + c1x + c2xx + c3xxx + c4xxxx + c5xxxxx + c6xxxxxx + c7xxxxxxx + c8xxxxxxxxx}
163    
164//  convert to single precision and return
165    cvtsd2ss    %xmm3,                  %xmm0
166#if defined( __i386__ )
167    movss       %xmm0,                  FRAME_SIZE(STACKP)
168    flds        FRAME_SIZE(STACKP)
169#endif
170    ret
171    
172//x is an integer
1732:  // find 2**x
174    xorps       %xmm0,                  %xmm0
175    cvtsd2ss    %xmm7,                  %xmm0
176#if defined( __i386__ )
177    movss       %xmm0,                  FRAME_SIZE(STACKP)
178    flds        FRAME_SIZE(STACKP)
179#endif
180    ret
181    
182
1833:  // |x| > 126 || isnan(x) || |x| < 0x1.0p-26f
184    jge          4f                                  // if( |x| > 126 || isnan(x) )   goto 4
185
186    //fall through for common case of |x| < 0x1.0p-26f
187    movl        $0x3f800000,            %edx        // 1.0f
188    movd        %edx,                   %xmm1       // 1.0f
189    addss       %xmm1,                  %xmm0       // round away from 1.0 as appropriate for current rounding mode
190#if defined( __i386__ )
191    movss       %xmm0,                  FRAME_SIZE(STACKP)
192    flds        FRAME_SIZE(STACKP)
193#endif
194    ret
195
196    // |x| > 126 || isnan(x)
1974:  addl        $0x32800000,            %eax        // restore |x|
198    cmpl        $0x7f800000,            %eax        // 
199    ja          5f                                  // if isnan(x) goto 5
200    je          6f                                  // if isinf(x) goto 6
201    
202    cmpl        $0xc3160000,            %ecx        // if( x <= -150 )
203    jae         7f                                  //      goto 7
204
205    cmpl        $0x43000000,            %ecx        // if( x >= 128 )
206    jge         8f                                  //      goto 8
207
208    cmpl        $0,                     %ecx        // result is large finite
209    jge         1b                                  //   back to the main body
210    
211    // result is between -150 < x < -126,  denormal result
212    addl        $-126,                  %edx        // subtract 126 from exponent bias
213    movl        $0x42fc0000,            %eax        // 126.0f
214    movd        %eax,                   %xmm1       // 126.0f
215    addss       %xmm1,                  %xmm0       // x += 126.0f
216    jmp         1b
217    
218
2195:  //x is nan
220    addss       %xmm0,                  %xmm0       //Quiet the NaN
221#if defined( __i386__ )
222    movss       %xmm0,                  FRAME_SIZE(STACKP)
223    flds        FRAME_SIZE(STACKP)
224#endif
225    ret
226    
2276: // |x| is inf
228    movss       %xmm0,                  %xmm1       // x
229    psrad       $31,                    %xmm0       // x == -Inf ? -1U : 0
230    andnps       %xmm1,                 %xmm0       // x == -Inf ? 0 : Inf
231#if defined( __i386__ )
232    movss       %xmm0,                  FRAME_SIZE(STACKP)
233    flds        FRAME_SIZE(STACKP)
234#endif
235    ret
236    
2377:  // x <= -150
238    movl        $0x00800001,            %eax
239    movd        %eax,                   %xmm0
240    mulss       %xmm0,                  %xmm0       //Create correctly rounded result, set inexact/underflow
241#if defined( __i386__ )
242    movss       %xmm0,                  FRAME_SIZE(STACKP)
243    flds        FRAME_SIZE(STACKP)
244#endif
245    ret
246    
2478:  // x >= 128.0
248    movl        $0x7f7fffff,            %eax        // FLT_MAX
249    movd        %eax,                   %xmm0       // FLT_MAX
250    addss       %xmm0,                  %xmm0       // Inf, set overflow
251#if defined( __i386__ )
252    movss       %xmm0,                  FRAME_SIZE(STACKP)
253    flds        FRAME_SIZE(STACKP)
254#endif
255    ret
256
257//Uses round to nearest to deliver a reduced value between [ -0.5, 0.5 ]
258//This allows us to use a smaller polynomial
259//In addition, we dont bother to check for x is integer. This saves a few cycles in the general case.
260//Accurate to within 0.534 ulp
261ENTRY( exp2f$fenv_access_off )
262#if defined( __i386__ )
263    movl        FRAME_SIZE(STACKP),     %eax
264    movss       FRAME_SIZE(STACKP),     %xmm0
265#else
266    movd        %xmm0,                  %eax
267#endif
268    
269    movl        %eax,                   %ecx        // x
270    andl        $0x7fffffff,            %eax        // |x|vl
271    movl        $1023,                  %edx        // double precision bias
272    subl        $0x32800000,            %eax        // subtract 0x1.0p-26f
273    cmpl        $0x107c0000,            %eax        // if( |x| > 126 || isnan(x) || |x| < 0x1.0p-26f )
274    ja          3f                                  //      goto 3
275
276    //The test above was a little too aggressive. 
277    //values 126 < x < 128, and -150 < x < -126 will come back here:
278    //For x < -126, we change %edx to hold the exponent for 2**-126, 
279    //      and subtract -126 from x
280
2811:  //extract fractional part and exit early if it is zero
282    andl        $0x80000000,            %ecx        // signof(x)
283    orl         $0x4b000000,            %ecx        // copysign( 0x1.0p23f, x )
284    movd        %ecx,                   %xmm2       // copysign( 0x1.0p23f, x )
285    movaps      %xmm0,                  %xmm1       // x
286    addss       %xmm2,                  %xmm0       // x + copysign( 0x1.0p23f, x )
287    subss       %xmm2,                  %xmm0       // rint(x)
288    cvttss2si   %xmm0,                  %eax        // rint(x)
289    addl        %eax,                   %edx        // rint(x) + bias
290    movd        %edx,                   %xmm7       // result exponent >> 32
291    subss       %xmm0,                  %xmm1       // fractional part
292    psllq       $52,                    %xmm7       // shift result exponent into place
293    cvtss2sd    %xmm1,                  %xmm1       // fractional part as a double
294
295//PIC
296#if defined( __i386__ )
297    calll       exp2f_pic                                    // set %ecx to point to local_addr
298exp2f_no_fenv_body:   
299#endif
300
301    lea         RELATIVE_ADDR_B( exp2f_nofenv_c), CX_P
302    movsd       (3*8)(CX_P),            %xmm3       // c3
303    movsd       (5*8)(CX_P),            %xmm5       // c5
304    movsd       (6*8)(CX_P),            %xmm6       // c6
305    movapd      %xmm1,                  %xmm2       // x
306    mulsd       %xmm1,                  %xmm1       // x*x
307    mulsd       %xmm2,                  %xmm3       // c3*x
308    mulsd       %xmm2,                  %xmm5       // c5*x
309    mulsd       (1*8)(CX_P),            %xmm2       // c1*x
310    mulsd       %xmm1,                  %xmm6       // c6*xx
311    addsd       (2*8)(CX_P),            %xmm3       // c2+c3x
312    movapd      %xmm1,                  %xmm4       // xx
313    mulsd       %xmm1,                  %xmm1       // xx*xx
314    addsd       (4*8)(CX_P),            %xmm5       // c4+c5x
315    addsd       (CX_P),                 %xmm2       // c0+c1x
316    mulsd       %xmm4,                  %xmm3       // c2xx+c3xxx
317    addsd       %xmm6,                  %xmm5       // c4+c5x+c6xx
318    mulsd       %xmm1,                  %xmm5       // c4xxxx+c5xxxxx+c6xxxxxx
319    addsd       %xmm2,                  %xmm3       // c0+c1x+c2xx+c3xxx
320    addsd       %xmm5,                  %xmm3       // c0+c1x+c2xx+c3xxx+c4xxxx+c5xxxxx+c6xxxxxx
321
322    mulsd       %xmm7,                  %xmm3       // scale by 2**i
323    
324//  convert to single precision and return
325    cvtsd2ss    %xmm3,                  %xmm0
326#if defined( __i386__ )
327    movss       %xmm0,                  FRAME_SIZE(STACKP)
328    flds        FRAME_SIZE(STACKP)
329#endif
330    ret
331        
332
3333:  // |x| > 126 || isnan(x) || |x| < 0x1.0p-26f
334    jge          4f                                  // if( |x| > 126 || isnan(x) )   goto 4
335
336    //fall through for common case of |x| < 0x1.0p-26f
337    movl        $0x3f800000,            %edx        // 1.0f
338    movd        %edx,                   %xmm1       // 1.0f
339    addss       %xmm1,                  %xmm0       // round away from 1.0 as appropriate for current rounding mode
340#if defined( __i386__ )
341    movss       %xmm0,                  FRAME_SIZE(STACKP)
342    flds        FRAME_SIZE(STACKP)
343#endif
344    ret
345
346    // |x| > 126 || isnan(x)
3474:  addl        $0x32800000,            %eax        // restore |x|
348    cmpl        $0x7f800000,            %eax        // 
349    ja          5f                                  // if isnan(x) goto 5
350    je          6f                                  // if isinf(x) goto 6
351    
352    cmpl        $0xc3160000,            %ecx        // if( x <= -150 )
353    jae         7f                                  //      goto 7
354
355    cmpl        $0x43000000,            %ecx        // if( x >= 128 )
356    jge         8f                                  //      goto 8
357
358    cmpl        $0,                     %ecx        // result is large finite
359    jge         1b                                  //   back to the main body
360    
361    // result is between -150 < x < -126,  denormal result
362    addl        $-126,                  %edx        // subtract 126 from exponent bias
363    movl        $0x42fc0000,            %eax        // 126.0f
364    movd        %eax,                   %xmm1       // 126.0f
365    addss       %xmm1,                  %xmm0       // x += 126.0f
366    jmp         1b
367    
368
3695:  //x is nan
370    addss       %xmm0,                  %xmm0       //Quiet the NaN
371#if defined( __i386__ )
372    movss       %xmm0,                  FRAME_SIZE(STACKP)
373    flds        FRAME_SIZE(STACKP)
374#endif
375    ret
376    
3776: // |x| is inf
378    movss       %xmm0,                  %xmm1       // x
379    psrad       $31,                    %xmm0       // x == -Inf ? -1U : 0
380    andnps       %xmm1,                 %xmm0       // x == -Inf ? 0 : Inf
381#if defined( __i386__ )
382    movss       %xmm0,                  FRAME_SIZE(STACKP)
383    flds        FRAME_SIZE(STACKP)
384#endif
385    ret
386    
3877:  // x <= -150
388    movl        $0x00800001,            %eax
389    movd        %eax,                   %xmm0
390    mulss       %xmm0,                  %xmm0       //Create correctly rounded result, set inexact/underflow
391#if defined( __i386__ )
392    movss       %xmm0,                  FRAME_SIZE(STACKP)
393    flds        FRAME_SIZE(STACKP)
394#endif
395    ret
396    
3978:  // x >= 128.0
398    movl        $0x7f800000,            %eax        // Inf
399#if defined( __i386__ )
400    movl        %eax,                   FRAME_SIZE(STACKP)
401    flds        FRAME_SIZE(STACKP)
402#else
403    movd        %eax,                   %xmm0
404#endif
405    ret
406