src/libm/Source/Intel/expf_logf_powf.c at fixPythonPipStalling

overby.me / darling-nix
fork atom
this repo has no description
fork atom
darling-nix / src / libm / Source / Intel / expf_logf_powf.c
at fixPythonPipStalling 664 lines 36 kB view raw
wrap content
Andrew Hyatt Resolve filename case issue with libm (fixes #379) 8y ago
c45e7204
  1/*
  2 *  expf_logf_powf.c
  3 *  Libm
  4 *
  5 *  Written by Ian Ollmann
  6 *  Polynomials by Ali Sazegari.
  7 *
  8 *  Copyright 2007 Apple Inc. All rights reserved.
  9 *
 10 */
 11
 12#include "math.h"
 13#include <emmintrin.h>
 14#include <stdint.h>
 15#include "xmmLibm_prefix.h"
 16#include <float.h>
 17
 18#if defined( __SSE3__ )
 19    #include <pmmintrin.h>
 20#endif
 21
 22                                                         
 23//11th order minimax polynomial approximation of y = log2(x)
 24//Valid over the range [1, 2]. Error is 0.160490514e-9.
 25//Packaged as vector doubles, in order from c0 to c11
 26static const __m128d log2polynomialCoefficients[6] = {   
 27                                                        { -3.86011461177189878, 11.2166692423937703 },
 28                                                        {-19.7108996153552616,  27.5580934192221441 },
 29                                                        {-28.7390040495347408,  22.2562416071123559 },
 30                                                        {-12.7549704819157692,   5.34170406305677394 },
 31                                                        {-1.59009195684350404,  0.318880468456163145 },
 32                                                        {-0.0386477049377158355, 0.00213962027817084459 }
 33                                                    };
 34
 35
 36//Second order minimax polynomial approximation of  y = 2**x
 37//Valid over the range [0.0,0.4e-2]. (We need [0, 2**-8] )
 38//  error is +- 0.11116223e-9,  We need less than 2**-28 here.
 39static const double exp2polynomialCoefficients[3] = {   1.00000000011116223,
 40                                                        0.693146680445468386,
 41                                                        0.240559810709772152    };
 42
 43
 44typedef uint32_t           vUInt32         __attribute__ ((__vector_size__ (16)));
 45
 46static const vUInt32 v1023 = { 1023, 0, 0, 0 };
 47static const __m128d exponentMask = { __builtin_inf(), __builtin_inf() };
 48static const __m128d half_sd = { 0.5, 0.0 };
 49static const __m128d one_sd = { 1.0, 0.0 };
 50static const __m128d two_pd = { 2.0, 2.0 };
 51
 52/*
 53float log2f( float x )
 54{
 55//Edge cases for C99 TC2:
 56
 57    //Handle NaN
 58    if( x != x )
 59        return x;
 60
 61// log2(+-0)returns -Inf and raises the divide-by-zero floating-point exception. 
 62// log2(x)returns a NaN and raises the  invalid floating-point exception for x < 0. 
 63    if( x <= 0.0f )
 64    {
 65        if( x == 0.0f )
 66            return XFLOAT_2_FLOAT( REQUIRED_DIVIDE_ss( FLOAT_2_XFLOAT(-1.0f), _mm_setzero_ps() ));
 67        else
 68            return XFLOAT_2_FLOAT( REQUIRED_MULTIPLY_ss( FLOAT_2_XFLOAT(__builtin_inff()), _mm_setzero_ps() ));
 69    }
 70    
 71// log2(+Inf)returns +Inf. 
 72    if( x == __builtin_inff() )
 73        return x;
 74
 75// We are going to try to get the last edge case to just happen in arithmetic
 76// log2(1)returns +0. 
 77
 78
 79//Main body
 80
 81    //Convert to double  
 82    __m128d X = FLOAT_2_XDOUBLE(x);      //{ 0, x }
 83
 84    //extract exponent -- denormals can't happen here
 85    __m128i iexponent = _mm_sub_epi32( _mm_srli_epi64( (__m128i) _mm_and_pd( X, exponentMask), 52 ), v1023 );
 86    __m128d exponent = _mm_cvtepi32_pd( iexponent );
 87
 88    //get mantissa of X as value in the range [ 1.0, 2.0 )
 89    __m128d mantissa = _mm_or_pd( _mm_andnot_pd( exponentMask, X ), one_sd );
 90
 91    //avoid setting inexact flag for exact powers of two
 92    if( 1.0 == XDOUBLE_2_DOUBLE( mantissa ) )
 93        return XDOUBLE_2_FLOAT( exponent );
 94
 95    __m128d m1, m2, m4, v1, v;
 96
 97#if 1
 98    if( 0.98f < x && x < 1.02f )
 99    {
100        float temp;
101
102        //use fyl2x to get y*log2(x)
103        asm volatile( "fld1 \n flds (%1)\n fyl2x \n fstps %0" : "=m" (*(&temp)) : "r" (&x) );
104            
105        return temp;
106    }
107#else
108    //inputs between 0.98 and 1.02 need special care
109    if( 0.98f < x && x < 1.02f )
110    {
111        __m128d XX = _mm_unpacklo_pd( X, X );
112        m1 = _mm_unpacklo_pd( one_sd, X );                          // { m, 1 }
113        m2 = _mm_mul_pd( XX, XX );                                   //{ m**2, m**2 }
114        m4 = _mm_mul_pd( m2, m2 );                                  //{ m**4, m**4 }
115
116        v1 = _mm_mul_pd( m4, log2near1polynomialCoefficients[5] );               // { c11 m**4, c10 m**4 } 
117        v  = _mm_mul_pd( m4, log2near1polynomialCoefficients[4] );               // { c9 m**4, c8 m**4 } 
118        m2 = _mm_mul_pd( m2, m1 );
119        v1 = _mm_add_pd( v1, log2near1polynomialCoefficients[3] );                       // { c7 + c11 m**4, c6 + c10 m**4 }
120        v  = _mm_add_pd( v , log2near1polynomialCoefficients[2] );                       // { c5 + c9 m**4, c4 + c8 m**4 }
121        v1 = _mm_mul_pd( v1, m4 );                                                  // { c7 m**4 + c11 m**8, c6 m**4 + c10 m**8 }
122        v  = _mm_mul_pd( v , m4 );                                                  // { c5 m**4 + c9 m**8, c4 m**4 + c8 m**8 }
123        v1 = _mm_add_pd( v1, log2near1polynomialCoefficients[1] );                       // { c3 + c7 m**4 + c11 m**8, c2 + c6 m**4 + c10 m**8 }
124        v  = _mm_add_pd( v , log2near1polynomialCoefficients[0] );                       // { c1 + c5 m**4 + c9 m**8, c0 + c4 m**4 + c8 m**8 }
125        v1 = _mm_mul_pd( v1, m2 );                                                  // { c3 m**3 + c7 m**7 + c11 m**11, c2 m**2 + c6 m**4 + c10 m**8 }
126        v  = _mm_mul_pd( v , m1 );                                                  // { c1 m + c5 m**5 + c9 m**9, c0 + c4 m**4 + c8 m**8 }
127        v  = _mm_add_pd( v , v1 );
128
129        //Add the two sub terms horizontally
130    #if defined( __SSE3__ )
131        v = _mm_hadd_pd(  v, _mm_setzero_pd() );
132    #else
133        v = _mm_add_sd( v, _mm_unpackhi_pd( v, v ) );
134    #endif
135    
136        return XDOUBLE_2_FLOAT( v );
137    }
138#endif
139
140    //First make a vector out of the mantissa
141    __m128d vMantissa = _mm_unpacklo_pd( mantissa, mantissa );                  //{ mantissa, mantissa }
142    m1 = _mm_unpacklo_pd( one_sd, mantissa );                           // { m, 1 }
143    m2 = _mm_mul_pd( vMantissa, vMantissa );                            //{ m**2, m**2 }
144    m4 = _mm_mul_pd( m2, m2 );                                          //{ m**4, m**4 }
145    v1 = _mm_mul_pd( m4, log2polynomialCoefficients[5] );               // { c11 m**4, c10 m**4 } 
146    v  = _mm_mul_pd( m4, log2polynomialCoefficients[4] );               // { c9 m**4, c8 m**4 } 
147    m2 = _mm_mul_pd( m2, m1 );
148    v1 = _mm_add_pd( v1, log2polynomialCoefficients[3] );                       // { c7 + c11 m**4, c6 + c10 m**4 }
149    v  = _mm_add_pd( v , log2polynomialCoefficients[2] );                       // { c5 + c9 m**4, c4 + c8 m**4 }
150    v1 = _mm_mul_pd( v1, m4 );                                                  // { c7 m**4 + c11 m**8, c6 m**4 + c10 m**8 }
151    v  = _mm_mul_pd( v , m4 );                                                  // { c5 m**4 + c9 m**8, c4 m**4 + c8 m**8 }
152    v1 = _mm_add_pd( v1, log2polynomialCoefficients[1] );                       // { c3 + c7 m**4 + c11 m**8, c2 + c6 m**4 + c10 m**8 }
153    v  = _mm_add_pd( v , log2polynomialCoefficients[0] );                       // { c1 + c5 m**4 + c9 m**8, c0 + c4 m**4 + c8 m**8 }
154    v1 = _mm_mul_pd( v1, m2 );                                                  // { c3 m**3 + c7 m**7 + c11 m**11, c2 m**2 + c6 m**4 + c10 m**8 }
155    v  = _mm_mul_pd( v , m1 );                                                  // { c1 m + c5 m**5 + c9 m**9, c0 + c4 m**4 + c8 m**8 }
156    v  = _mm_add_pd( v , v1 );
157
158    //Add the two sub terms horizontally
159#if defined( __SSE3__ )
160    v = _mm_hadd_pd(  v, _mm_setzero_pd() );
161#else
162    v = _mm_add_sd( v, _mm_unpackhi_pd( v, v ) );
163#endif
164
165
166    //add in the exponent
167    v = _mm_add_sd( v, exponent );
168    
169    return XDOUBLE_2_FLOAT( v );
170}
171*/
172
173static const __m128i mantissaHiBits = { 0x000FF00000000000ULL, 0ULL };
174
175/* 
176    exp2_table produced as follows:
177    
178    #include <emmintrin.h>
179    #include <stdio.h>
180    #include <stdint.h>
181    #include <math.h>
182
183
184    int main( void )
185    {
186            int i;
187
188            for( i = 0; i < 256; i++ )
189            {
190                    union
191                    {
192                            uint64_t        u;
193                            double          d;
194                    }u = { 0x3ff0000000000000ULL | ((uint64_t) i << 44 ) };
195
196                    if( 0 == i % 4 )
197                            printf( "\n" );
198
199                    printf( " {  %17.21f, 0x0.%2.2xp0 },", exp2( u.d - 1.0 ), i  );
200
201            }
202            printf( "\n" );
203
204            return 0;
205    }
206*/
207
208static const __m128d exp2_table[256] = { 
209                                            { 1.000000000000000000000, 0x0.00p0 }, { 1.002711275050202521797, 0x0.01p0 }, { 1.005429901112802726360, 0x0.02p0 }, { 1.008155898118417548304, 0x0.03p0 },
210                                            { 1.010889286051700475255, 0x0.04p0 }, { 1.013630084951489429557, 0x0.05p0 }, { 1.016378314910953095662, 0x0.06p0 }, { 1.019133996077737913666, 0x0.07p0 },
211                                            { 1.021897148654116627142, 0x0.08p0 }, { 1.024667792897135720764, 0x0.09p0 }, { 1.027445949118763746100, 0x0.0ap0 }, { 1.030231637686040979673, 0x0.0bp0 },
212                                            { 1.033024879021228414899, 0x0.0cp0 }, { 1.035825693601957198098, 0x0.0dp0 }, { 1.038634101961378730650, 0x0.0ep0 }, { 1.041450124688316103416, 0x0.0fp0 },
213                                            { 1.044273782427413754803, 0x0.10p0 }, { 1.047105095879289793359, 0x0.11p0 }, { 1.049944085800687210153, 0x0.12p0 }, { 1.052790773004626423415, 0x0.13p0 },
214                                            { 1.055645178360557157049, 0x0.14p0 }, { 1.058507322794512761632, 0x0.15p0 }, { 1.061377227289262092924, 0x0.16p0 }, { 1.064254912884464499001, 0x0.17p0 },
215                                            { 1.067140400676823697168, 0x0.18p0 }, { 1.070033711820241872914, 0x0.19p0 }, { 1.072934867525975555225, 0x0.1ap0 }, { 1.075843889062791047806, 0x0.1bp0 },
216                                            { 1.078760797757119860307, 0x0.1cp0 }, { 1.081685614993215249768, 0x0.1dp0 }, { 1.084618362213309206155, 0x0.1ep0 }, { 1.087559060917769659937, 0x0.1fp0 },
217                                            { 1.090507732665257689675, 0x0.20p0 }, { 1.093464399072885839814, 0x0.21p0 }, { 1.096429081816376882585, 0x0.22p0 }, { 1.099401802630221913759, 0x0.23p0 },
218                                            { 1.102382583307840890896, 0x0.24p0 }, { 1.105371445701741173195, 0x0.25p0 }, { 1.108368411723678725878, 0x0.26p0 }, { 1.111373503344817548211, 0x0.27p0 },
219                                            { 1.114386742595892432206, 0x0.28p0 }, { 1.117408151567369278823, 0x0.29p0 }, { 1.120437752409606746440, 0x0.2ap0 }, { 1.123475567333019897731, 0x0.2bp0 },
220                                            { 1.126521618608241848136, 0x0.2cp0 }, { 1.129575928566288078869, 0x0.2dp0 }, { 1.132638519598719195614, 0x0.2ep0 }, { 1.135709414157805463574, 0x0.2fp0 },
221                                            { 1.138788634756691564576, 0x0.30p0 }, { 1.141876203969561576201, 0x0.31p0 }, { 1.144972144431804172982, 0x0.32p0 }, { 1.148076478840178937801, 0x0.33p0 },
222                                            { 1.151189229952982673311, 0x0.34p0 }, { 1.154310420590215935377, 0x0.35p0 }, { 1.157440073633751120852, 0x0.36p0 }, { 1.160578212027498778980, 0x0.37p0 },
223                                            { 1.163724858777577475522, 0x0.38p0 }, { 1.166880036952481658474, 0x0.39p0 }, { 1.170043769683250189928, 0x0.3ap0 }, { 1.173216080163637320410, 0x0.3bp0 },
224                                            { 1.176396991650281220743, 0x0.3cp0 }, { 1.179586527462875844563, 0x0.3dp0 }, { 1.182784710984341014495, 0x0.3ep0 }, { 1.185991565660993840581, 0x0.3fp0 },
225                                            { 1.189207115002721026897, 0x0.40p0 }, { 1.192431382583151178167, 0x0.41p0 }, { 1.195664392039827328418, 0x0.42p0 }, { 1.198906167074380357818, 0x0.43p0 },
226                                            { 1.202156731452703075647, 0x0.44p0 }, { 1.205416109005123859177, 0x0.45p0 }, { 1.208684323626581624822, 0x0.46p0 }, { 1.211961399276801243374, 0x0.47p0 },
227                                            { 1.215247359980468955243, 0x0.48p0 }, { 1.218542229827408451825, 0x0.49p0 }, { 1.221846032972757400969, 0x0.4ap0 }, { 1.225158793637145526745, 0x0.4bp0 },
228                                            { 1.228480536106870024682, 0x0.4cp0 }, { 1.231811284734075861991, 0x0.4dp0 }, { 1.235151063936933191201, 0x0.4ep0 }, { 1.238499898199816540156, 0x0.4fp0 },
229                                            { 1.241857812073484002013, 0x0.50p0 }, { 1.245224830175257979548, 0x0.51p0 }, { 1.248600977189204819240, 0x0.52p0 }, { 1.251986277866316221719, 0x0.53p0 },
230                                            { 1.255380757024691096291, 0x0.54p0 }, { 1.258784439549716527296, 0x0.55p0 }, { 1.262197350394250738859, 0x0.56p0 }, { 1.265619514578806281691, 0x0.57p0 },
231                                            { 1.269050957191733219886, 0x0.58p0 }, { 1.272491703389402761815, 0x0.59p0 }, { 1.275941778396392001227, 0x0.5ap0 }, { 1.279401207505669324505, 0x0.5bp0 },
232                                            { 1.282870016078778263591, 0x0.5cp0 }, { 1.286348229546025567771, 0x0.5dp0 }, { 1.289835873406665944785, 0x0.5ep0 }, { 1.293332973229089466471, 0x0.5fp0 },
233                                            { 1.296839554651009640551, 0x0.60p0 }, { 1.300355643379650594227, 0x0.61p0 }, { 1.303881265191935812098, 0x0.62p0 }, { 1.307416445934677318164, 0x0.63p0 },
234                                            { 1.310961211524764413738, 0x0.64p0 }, { 1.314515587949354635811, 0x0.65p0 }, { 1.318079601266064049270, 0x0.66p0 }, { 1.321653277603157539133, 0x0.67p0 },
235                                            { 1.325236643159741323217, 0x0.68p0 }, { 1.328829724205954354588, 0x0.69p0 }, { 1.332432547083161500368, 0x0.6ap0 }, { 1.336045138204145832361, 0x0.6bp0 },
236                                            { 1.339667524053302916087, 0x0.6cp0 }, { 1.343299731186835321850, 0x0.6dp0 }, { 1.346941786232945803548, 0x0.6ep0 }, { 1.350593715892034252235, 0x0.6fp0 },
237                                            { 1.354255546936892651289, 0x0.70p0 }, { 1.357927306212901141791, 0x0.71p0 }, { 1.361609020638224754052, 0x0.72p0 }, { 1.365300717204011915484, 0x0.73p0 },
238                                            { 1.369002422974590738036, 0x0.74p0 }, { 1.372714165087668414245, 0x0.75p0 }, { 1.376435970754530169202, 0x0.76p0 }, { 1.380167867260237990479, 0x0.77p0 },
239                                            { 1.383909881963832022578, 0x0.78p0 }, { 1.387662042298529074813, 0x0.79p0 }, { 1.391424375771926236212, 0x0.7ap0 }, { 1.395196909966200271569, 0x0.7bp0 },
240                                            { 1.398979672538311236352, 0x0.7cp0 }, { 1.402772691220204759333, 0x0.7dp0 }, { 1.406575993819015435449, 0x0.7ep0 }, { 1.410389608217270662749, 0x0.7fp0 },
241                                            { 1.414213562373094923430, 0x0.80p0 }, { 1.418047884320415175097, 0x0.81p0 }, { 1.421892602169165575887, 0x0.82p0 }, { 1.425747744105494430045, 0x0.83p0 },
242                                            { 1.429613338391970023267, 0x0.84p0 }, { 1.433489413367788900544, 0x0.85p0 }, { 1.437375997448982367644, 0x0.86p0 }, { 1.441273119128625657126, 0x0.87p0 },
243                                            { 1.445180806977046650275, 0x0.88p0 }, { 1.449099089642035043113, 0x0.89p0 }, { 1.453027995849052622646, 0x0.8ap0 }, { 1.456967554401443765144, 0x0.8bp0 },
244                                            { 1.460917794180647044655, 0x0.8cp0 }, { 1.464878744146405731286, 0x0.8dp0 }, { 1.468850433336981842203, 0x0.8ep0 }, { 1.472832890869367528097, 0x0.8fp0 },
245                                            { 1.476826145939499346227, 0x0.90p0 }, { 1.480830227822471867327, 0x0.91p0 }, { 1.484845165872752614789, 0x0.92p0 }, { 1.488870989524397003834, 0x0.93p0 },
246                                            { 1.492907728291264835008, 0x0.94p0 }, { 1.496955411767235455400, 0x0.95p0 }, { 1.501014069626425584403, 0x0.96p0 }, { 1.505083731623406473332, 0x0.97p0 },
247                                            { 1.509164427593422841412, 0x0.98p0 }, { 1.513256187452609813349, 0x0.99p0 }, { 1.517359041198214741897, 0x0.9ap0 }, { 1.521473018908814589523, 0x0.9bp0 },
248                                            { 1.525598150744538195056, 0x0.9cp0 }, { 1.529734466947286986027, 0x0.9dp0 }, { 1.533881997840955913048, 0x0.9ep0 }, { 1.538040773831656826687, 0x0.9fp0 },
249                                            { 1.542210825407940744114, 0x0.a0p0 }, { 1.546392183141021670068, 0x0.a1p0 }, { 1.550584877684999973724, 0x0.a2p0 }, { 1.554788939777088430105, 0x0.a3p0 },
250                                            { 1.559004400237836929222, 0x0.a4p0 }, { 1.563231289971357629298, 0x0.a5p0 }, { 1.567469639965552774541, 0x0.a6p0 }, { 1.571719481292341402678, 0x0.a7p0 },
251                                            { 1.575980845107886496592, 0x0.a8p0 }, { 1.580253762652824578439, 0x0.a9p0 }, { 1.584538265252493749458, 0x0.aap0 }, { 1.588834384317163950229, 0x0.abp0 },
252                                            { 1.593142151342266776837, 0x0.acp0 }, { 1.597461597908627073394, 0x0.adp0 }, { 1.601792755682693414343, 0x0.aep0 }, { 1.606135656416771029242, 0x0.afp0 },
253                                            { 1.610490331949254283472, 0x0.b0p0 }, { 1.614856814204860491202, 0x0.b1p0 }, { 1.619235135194863728358, 0x0.b2p0 }, { 1.623625327017328867640, 0x0.b3p0 },
254                                            { 1.628027421857347833978, 0x0.b4p0 }, { 1.632441451987274971813, 0x0.b5p0 }, { 1.636867449766964410784, 0x0.b6p0 }, { 1.641305447644006321184, 0x0.b7p0 },
255                                            { 1.645755478153964945776, 0x0.b8p0 }, { 1.650217573920617741834, 0x0.b9p0 }, { 1.654691767656194523184, 0x0.bap0 }, { 1.659178092161616158151, 0x0.bbp0 },
256                                            { 1.663676580326736598181, 0x0.bcp0 }, { 1.668187265130582463968, 0x0.bdp0 }, { 1.672710179641596406341, 0x0.bep0 }, { 1.677245357017878468753, 0x0.bfp0 },
257                                            { 1.681792830507429004072, 0x0.c0p0 }, { 1.686352633448393145699, 0x0.c1p0 }, { 1.690924799269305056626, 0x0.c2p0 }, { 1.695509361489332622597, 0x0.c3p0 },
258                                            { 1.700106353718523477525, 0x0.c4p0 }, { 1.704715809658051250963, 0x0.c5p0 }, { 1.709337763100462925792, 0x0.c6p0 }, { 1.713972247929925973864, 0x0.c7p0 },
259                                            { 1.718619298122477934143, 0x0.c8p0 }, { 1.723278947746273992436, 0x0.c9p0 }, { 1.727951230961837669753, 0x0.cap0 }, { 1.732636182022311066575, 0x0.cbp0 },
260                                            { 1.737333835273706217350, 0x0.ccp0 }, { 1.742044225155156444984, 0x0.cdp0 }, { 1.746767386199168825556, 0x0.cep0 }, { 1.751503353031877985302, 0x0.cfp0 },
261                                            { 1.756252160373299453511, 0x0.d0p0 }, { 1.761013843037583681550, 0x0.d1p0 }, { 1.765788435933272726430, 0x0.d2p0 }, { 1.770575974063554713922, 0x0.d3p0 },
262                                            { 1.775376492526521188253, 0x0.d4p0 }, { 1.780190026515424461806, 0x0.d5p0 }, { 1.785016611318934964814, 0x0.d6p0 }, { 1.789856282321401037549, 0x0.d7p0 },
263                                            { 1.794709075003107168200, 0x0.d8p0 }, { 1.799575024940535117324, 0x0.d9p0 }, { 1.804454167806623932080, 0x0.dap0 }, { 1.809346539371031736820, 0x0.dbp0 },
264                                            { 1.814252175500398633901, 0x0.dcp0 }, { 1.819171112158608494269, 0x0.ddp0 }, { 1.824103385407053190548, 0x0.dep0 }, { 1.829049031404897274200, 0x0.dfp0 },
265                                            { 1.834008086409342430656, 0x0.e0p0 }, { 1.838980586775893710794, 0x0.e1p0 }, { 1.843966568958625984465, 0x0.e2p0 }, { 1.848966069510450838109, 0x0.e3p0 },
266                                            { 1.853979125083385470774, 0x0.e4p0 }, { 1.859005772428820479902, 0x0.e5p0 }, { 1.864046048397788979401, 0x0.e6p0 }, { 1.869099989941238604274, 0x0.e7p0 },
267                                            { 1.874167634110299962558, 0x0.e8p0 }, { 1.879249018056560194267, 0x0.e9p0 }, { 1.884344179032334309909, 0x0.eap0 }, { 1.889453154390938971474, 0x0.ebp0 },
268                                            { 1.894575981586965607306, 0x0.ecp0 }, { 1.899712698176555081275, 0x0.edp0 }, { 1.904863341817674138312, 0x0.eep0 }, { 1.910027950270389629495, 0x0.efp0 },
269                                            { 1.915206561397147178027, 0x0.f0p0 }, { 1.920399213163047402730, 0x0.f1p0 }, { 1.925605943636124806062, 0x0.f2p0 }, { 1.930826790987627106233, 0x0.f3p0 },
270                                            { 1.936061793492294347274, 0x0.f4p0 }, { 1.941310989528640451596, 0x0.f5p0 }, { 1.946574417579233218234, 0x0.f6p0 }, { 1.951852116230978317901, 0x0.f7p0 },
271                                            { 1.957144124175400401455, 0x0.f8p0 }, { 1.962450480208927316994, 0x0.f9p0 }, { 1.967771223233175659217, 0x0.fap0 }, { 1.973106392255234098343, 0x0.fbp0 },
272                                            { 1.978456026387950927869, 0x0.fcp0 }, { 1.983820164850219391894, 0x0.fdp0 }, { 1.989198846967266343100, 0x0.fep0 }, { 1.994592112170940234606, 0x0.ffp0 }
273                                             };     
274                                             
275// Minimax polynomial approximation for y = 2**x, valid over the range [ 0, 2**-8 ]. Accurate to within 0.11116223e-9. We need to within 2**(-28) here.                                          
276static const double exp2_polynomial[3] = { 1.00000000011116223, 0.693146680445468386, 0.240559810709772152 };
277static const xFloat huge = { 0x1.0p127f, 0, 0, 0 };
278static const xFloat tiny = { 0x1.00001p-127f, 0, 0, 0 };
279static const vUInt32 bias = { 1023, 0, 0, 0 };
280static const __m128d almost1 = { 1.0 - DBL_EPSILON, 0.0 };
281
282/*
283float exp2f( float x )
284{
285    //take care of NaN
286    if( x != x )
287        return x;
288
289    //deal with overflow
290    if( x >= 128.0f )
291    {
292        if( __builtin_inff() == x )
293            return x;
294
295        return XFLOAT_2_FLOAT( REQUIRED_ADD_ss( huge, huge ) );
296    }
297
298    //deal with underflow
299    if( x <= -150.0f )
300    {
301        if( -__builtin_inff() == x )
302            return 0;
303
304        return XFLOAT_2_FLOAT( REQUIRED_MULTIPLY_ss( tiny, tiny ) );
305    }
306
307
308
309    // exp2(+-0)returns 1. 
310    // Which will be taken care of through normal arithmetic
311
312    //Split x into fractional and integer parts
313    double xd = x;
314    // i = floorf( x )
315    double magic = x < 0 ? -0x1.0p52f : 0x1.0p52f;
316    double i = (xd + magic) - magic;
317    if( i == xd )
318        return  XDOUBLE_2_FLOAT( (xDouble) _mm_slli_epi64( _mm_add_epi32( _mm_cvtpd_epi32( DOUBLE_2_XDOUBLE( i ) ), bias ), 52 ) );
319    else if( i > xd )
320        i -= 1.0f;
321    double f = xd - i;
322
323    //We do exp2(x) as:
324    //
325    //      x = i + f               i = floor( x )     f = x - i
326    //
327    //      exp2(x) = exp2( i ) * exp2( f )
328    //
329    //  We further break down f into:
330    //
331    //      f = n * (2**-8) + ff
332    //
333    //      exp2(x) = exp2 (i ) * exp2( n * (2**-8) ) * exp2(ff)
334    //
335    //          exp2( i )           calculated by simple slamming of i + 1023 into exponent of a double (with suitable saturation)
336    //          exp2( n*(2**-8) )   calculated by table lookup into exp2_table
337    //          exp2( ff )          calculated by minimax polynomial
338
339    //reduce the fraction to the range [ 0, 2**-8 )
340    xDouble d = DOUBLE_2_XDOUBLE(f);   
341    
342    //For very small negative x, we can now have d = 1.0, which causes trouble. 
343    //In order to keep things sane we clamp vf to be 1.0 - DBL_EPSILON
344    d = _mm_min_sd( d, almost1 );
345
346    //extract the high 8 bits of the mantissa by adding one to d, and using integer operations to read out those bits
347    int n = _mm_cvtsi128_si32( _mm_srli_epi64( _mm_and_si128( (__m128i) _mm_add_sd( d, one_sd), mantissaHiBits ), 44 ) );
348
349    //subtract out a representation of the high 8 bits we removed from d
350    d = _mm_sub_sdm( d, (double*)&exp2_table[ n ] + 1 );     //reduces d to [ 0, 2**-8 )
351
352    
353    //d = exp2(d), accurate to within 0.11116223e-9. We need to within 2**(-28) here 
354    xDouble e = _mm_mul_sdm( d, exp2_polynomial + 2 );
355    e = _mm_add_sdm( e, exp2_polynomial + 1 );
356    e = _mm_mul_sd( e, d );
357    e = _mm_add_sdm( e, exp2_polynomial );
358
359    //Calculate exp2(i). Denormals are not possible because we use double precision here.
360    __m128i ei = _mm_slli_epi64( _mm_cvtpd_epi32( DOUBLE_2_XDOUBLE( i ) ), 52 );     
361    
362    ei = _mm_add_epi64( ei, (__m128i) exp2_table[ n ] );    
363
364    //scale by ei
365    e = _mm_mul_sd( e, (__m128d) ei );
366    
367    return XDOUBLE_2_FLOAT( e );
368}
369*/
370
371static const __m128d minusZero_sd = { -0.0, 0.0 };
372
373float powf( float x, float y )
374{
375// The pow functions compute x raised to the power y. A domain error occurs if x is finite 
376// and negative and y is finite and not an integer value. A range error may occur. A domain 
377// error may occur if x is zero and y is zero. A domain error or range error may occur if x 
378// is zero and y is less than zero. 
379
380    // pow(+1, y) returns 1 for any y, even a NaN. 
381    // pow(x, +-0) returns 1 for any x, even a NaN. 
382    if( x == 1.0f || y == 0.0f )
383        return 1.0f;
384
385    // deal with NaNs before they cause trouble
386    if( x != x || y != y )
387        return x + y;
388
389    float fabsx = __builtin_fabsf( x );
390    float fabsy = __builtin_fabsf( y );
391    
392    //Find out if Y is an integer and if so if it is even or odd, without setting the inexact flag
393    uint32_t yFracBits = 0;
394    uint32_t yOneBit = 0;
395    int32_t xExp, yExp;
396    
397    union
398    {
399        float       f;
400        uint32_t    u;
401    }uy, ux;
402    
403    uy.f = fabsy;
404    ux.f = fabsx;
405
406    yExp = (uy.u >> 23) - 127;
407    xExp = (ux.u >> 23) - 127;
408    
409    if( fabsy < 1.0f )
410        yFracBits = 0x007FFFFF;
411    else if( fabsy < 0x1.0p24f )   // y can only be an odd integer if 1.0f <= |y| < 2**24
412    {
413        yOneBit = (0x00800000 >> yExp) & uy.u;
414        yFracBits = (0x007FFFFF >> yExp) & uy.u;
415    }
416
417    if( x == 0.0f  )
418    {
419        if( y < 0.0f )
420        {
421            // pow(+-0, y) returns +-Inf and raises the ‘‘divide-by-zero’’ floating-point exception for y an odd integer < 0. 
422            // pow(+-0, y) returns +Inf and raises the ‘‘divide-by-zero’’ floating-point exception for y < 0 and not an odd integer. 
423            x = 1.0f / x;   //set div/0 flag and x to infinity of right sign
424
425            if( 0 != yFracBits || 0 == yOneBit )    //if y is not an odd integer
426                x = __builtin_fabsf( x );
427
428            return x;
429        }
430        else  
431        {   //y > 0. y == 0 case taken care of at the top of the function.
432
433            // pow(+-0, y) returns +-0 for y an odd integer > 0. 
434            // pow(+-0, y) returns +0 for y > 0 and not an odd integer. 
435            if( 0 != yFracBits || 0 == yOneBit )    //if y is not an odd integer
436                return 0.0f;
437        
438            return x;
439        }
440    }
441
442    if( fabsx == __builtin_inff() )
443    {
444        // pow(+Inf, y) returns +0 for y < 0. 
445        // pow(+Inf, y) returns +Inf for y > 0. 
446        if( x == __builtin_inff() )
447        {
448            if( y < 0.0f )
449                return 0.0f;
450        
451            return x;
452        }
453        else
454        { //x = -Inf
455            if( y < 0.0f )
456            {
457                // pow(-Inf, y) returns −0 for y an odd integer < 0. 
458                // pow(-Inf, y) returns +0 for y < 0 and not an odd integer. 
459                if( 0 != yFracBits || 0 == yOneBit )    //if y is not an odd integer
460                    return 0.0f;
461                    
462                return -0.0f;
463            }
464            else
465            { //y > 0. y == 0 case taken care of at the top of the function.
466                // pow(−Inf, y) returns -Inf for y an odd integer > 0. 
467                // pow(−Inf, y) returns +Inf for y > 0 and not an odd integer. 
468                if( 0 != yFracBits || 0 == yOneBit )    //if y is not an odd integer
469                    return __builtin_inff();
470            
471                return x;
472            }
473        }
474    }
475    
476    if( fabsy == __builtin_inff() )
477    {
478        // pow(−1, +-Inf) returns 1. 
479        if( x == -1.0f)
480            return 1.0f;
481
482        if( y == __builtin_inff() )
483        {
484            // pow(x, +Inf) returns +0 for |x| < 1. 
485            if( fabsx < 1.0f )
486                return 0.0f;
487                
488            // pow(x, +inf) returns +Inf for |x| > 1. 
489            return __builtin_inff();
490        }
491
492        // pow(x, -Inf) returns +Inf for |x| < 1. 
493        if( fabsx < 1.0f )
494            return __builtin_inff();
495        // pow(x, -Inf) returns +0 for |x| > 1. 
496        return 0.0f;
497    }
498
499    // pow(x, y) returns a NaN and raises the ‘‘invalid’’ floating-point exception for finite x < 0 and finite non-integer y. 
500    if( x < 0.0f && 0 != yFracBits )
501    {
502        SET_INVALID_FLAG();
503        return __builtin_nanf("37");
504    }
505
506    //if y is an integer, less than 2**16, do iPow
507    if( 0 == yFracBits && fabsy <= 0x1.0p16f )
508    {
509        int32_t iy = y; //should be exact
510        int32_t yIsNeg = iy >> 31;
511        iy = abs( iy );
512        
513        double dx = x;
514        double result = iy & 1 ? dx : 1.0;
515        
516        while( iy >>= 1 )
517        {
518            dx *= dx;
519            if( iy & 1 )
520                result *= dx;
521        }
522    
523        //We are using double precision here, so we don't need to worry about range differences between tiny vs huge numbers for negative Y
524        if( yIsNeg )
525            return (float) (1.0 / result);
526            
527        return (float) result;
528    }
529    
530    if( 0.5f == fabsy )
531    {
532        double d = sqrt( (double) x );
533    
534        if( y < 0.0f )
535            return (float) ( 1.0 / d );
536    
537        return (float) d;
538    }
539    
540    //deal with 
541        
542    //Do pow the hard way
543
544    //Convert x and y to double to do the arithmetic
545    __m128d ylog2X;
546
547    //We have two fallover cases for the log2 estimate above. 
548    //
549    //The first happens for values very near 1. This causes results near 0,
550    //which means that our minimax error estimate is way off, because the results
551    //are in a much smaller binade than normal, so the minimax error seems that much 
552    //larger.
553    //
554    //The second case is for large y. This is a simple matter of minimax error * y
555    //eventually exceeds our error threshold.
556    //
557    //In each case, we just fall back on the 80-bit log2 instruction, which costs us
558    //an extra 40 cycles or so.
559    //
560    if( fabsy > 52.0f || ( 0.98f < fabsx && fabsx < 1.02f ) )
561    {
562        double temp;
563
564        //use fyl2x to get y*log2(x)
565        asm volatile( "flds (%1)\n flds (%2)\n fyl2x \n fstpl %0" : "=m" (*(&temp)) : "r" (&y),  "r" (&fabsx) );
566            
567        ylog2X = _mm_load_sd( &temp );
568    }
569    else
570    {
571        __m128d X = FLOAT_2_XDOUBLE( fabsx );
572        __m128d Y = FLOAT_2_XDOUBLE( y );
573
574        //Extract the exponent and mantissa. We are essentially doing frexp here, except that the mantissa is [1, 2)
575        __m128d log2X = _mm_cvtepi32_pd( _mm_sub_epi32( _mm_srli_epi64( (__m128i) _mm_and_pd( X, exponentMask), 52 ), v1023 ) );     //always exact
576        __m128d mantissa = _mm_or_pd( _mm_andnot_pd( exponentMask, X ), one_sd );
577
578        //avoid setting inexact flag for exact powers of two
579        if( 1.0 != XDOUBLE_2_DOUBLE( mantissa ) )
580        { //take log2( mantissa )
581            __m128d vMantissa = _mm_unpacklo_pd( mantissa, mantissa );                  //{ mantissa, mantissa }
582            __m128d m1 = _mm_unpacklo_pd( one_sd, mantissa );                           // { m, 1 }
583            __m128d m2 = _mm_mul_pd( vMantissa, vMantissa );                            //{ m**2, m**2 }
584            __m128d m4 = _mm_mul_pd( m2, m2 );                                          //{ m**4, m**4 }
585            __m128d v1 = _mm_mul_pd( m4, log2polynomialCoefficients[5] );               // { c11 m**4, c10 m**4 } 
586            __m128d v  = _mm_mul_pd( m4, log2polynomialCoefficients[4] );               // { c9 m**4, c8 m**4 } 
587            m2 = _mm_mul_pd( m2, m1 );
588            v1 = _mm_add_pd( v1, log2polynomialCoefficients[3] );                       // { c7 + c11 m**4, c6 + c10 m**4 }
589            v  = _mm_add_pd( v , log2polynomialCoefficients[2] );                       // { c5 + c9 m**4, c4 + c8 m**4 }
590            v1 = _mm_mul_pd( v1, m4 );                                                  // { c7 m**4 + c11 m**8, c6 m**4 + c10 m**8 }
591            v  = _mm_mul_pd( v , m4 );                                                  // { c5 m**4 + c9 m**8, c4 m**4 + c8 m**8 }
592            v1 = _mm_add_pd( v1, log2polynomialCoefficients[1] );                       // { c3 + c7 m**4 + c11 m**8, c2 + c6 m**4 + c10 m**8 }
593            v  = _mm_add_pd( v , log2polynomialCoefficients[0] );                       // { c1 + c5 m**4 + c9 m**8, c0 + c4 m**4 + c8 m**8 }
594            v1 = _mm_mul_pd( v1, m2 );                                                  // { c3 m**3 + c7 m**7 + c11 m**11, c2 m**2 + c6 m**4 + c10 m**8 }
595            v  = _mm_mul_pd( v , m1 );                                                  // { c1 m + c5 m**5 + c9 m**9, c0 + c4 m**4 + c8 m**8 }
596            v  = _mm_add_pd( v , v1 );
597
598            //Add the two sub terms horizontally
599        #if defined( __SSE3__ )
600            v = _mm_hadd_pd(  v, _mm_setzero_pd() );
601        #else
602            v = _mm_add_sd( v, _mm_unpackhi_pd( v, v ) );
603        #endif
604        
605            log2X = _mm_add_sd( log2X, v );
606        }
607        
608        ylog2X = _mm_mul_sd( log2X, Y );
609    }
610
611    // ylog2X = y * log2( x )
612    double q = XDOUBLE_2_DOUBLE( ylog2X );
613    
614    //deal with overflow
615    if( q >= 128.0 )
616        return XFLOAT_2_FLOAT( REQUIRED_ADD_ss( huge, huge ) );
617
618    //deal with underflow to zero
619    if( q <= -150.0 )
620        return XFLOAT_2_FLOAT( REQUIRED_MULTIPLY_ss( tiny, tiny ) );
621
622    static const __m128d magic = { 0x1.0p52, 0.0 };
623    __m128d smagic = _mm_or_pd( magic, _mm_and_pd( ylog2X, minusZero_sd ) );    //smagic = copysign( 0x1.0p52, ylog2X )
624
625    // separate log2X into fractional and integer components
626    __m128d vi = _mm_sub_sd( _mm_add_sd( ylog2X, smagic ), smagic );            // vi = rint( ylog2X )
627    vi = _mm_sub_sd( vi, _mm_and_pd( _mm_cmpgt_sd( vi, ylog2X ), one_sd ) );     // vi = floor( ylog2X )
628    
629    //Avoid setting inexact if ylog2X is an integer
630    if( _mm_ucomieq_sd( vi, ylog2X ) )
631        return XDOUBLE_2_FLOAT( (xDouble) _mm_slli_epi64( _mm_add_epi32( _mm_cvtpd_epi32( vi ), bias ), 52 ) );
632
633    __m128d vf = _mm_sub_sd( ylog2X, vi );                                       // vf = fractional part ( 0 <= vf < 1.0 )
634    
635    //For very small negative ylog2X, we can now have vf = 1.0, which causes trouble. 
636    //In order to keep things sane we clamp vf to be 1.0 - EPSILON
637    vf = _mm_min_sd( vf, almost1 );
638
639    //reduce the fraction to the range [ 0, 2**-8 )
640    int n = _mm_cvtsi128_si32( _mm_srli_epi64( _mm_and_si128( (__m128i) _mm_add_sd( vf, one_sd), mantissaHiBits ), 44 ) );
641    vf = _mm_sub_sdm( vf, (double*)&exp2_table[ n ] + 1 );     //reduces d to [ 0, 2**-8 )
642    
643    //d = exp2(d), accurate to within 0.11116223e-9. We need to within 2**(-28) here 
644    xDouble e = _mm_mul_sdm( vf, exp2_polynomial + 2 );
645    e = _mm_add_sdm( e, exp2_polynomial + 1 );
646    e = _mm_mul_sd( e, vf );
647    e = _mm_add_sdm( e, exp2_polynomial );
648
649    //Calculate exp2(i). Denormals are not possible because we use double precision here.
650    __m128i ei = _mm_slli_epi64( _mm_cvtpd_epi32( vi ), 52 );     
651    ei = _mm_add_epi64( ei, (__m128i) exp2_table[ n ] );    
652
653    //scale by ei
654    e = _mm_mul_sd( e, (__m128d) ei );
655    
656    float result =  XDOUBLE_2_FLOAT( e );
657
658    if( x < 0.0f && yOneBit )
659        return -result;
660
661    return result;
662}
663
664