src/libm/Source/Intel/powf.S at fixPythonPipStalling

overby.me / darling-nix
fork atom
this repo has no description
fork atom
darling-nix / src / libm / Source / Intel / powf.S
at fixPythonPipStalling 717 lines 40 kB view raw
wrap content
Lubos Dolezel More progress 9y ago
a76db625
  1
  2/*
  3 *    powf.s
  4 *
  5 *        by Ian Ollmann
  6 *
  7 *    Copyright (c) 2007, Apple Inc. All Rights Reserved.
  8 *
  9 *    Implementation of C99 powf function for MacOS X __i386__ and __x86_64__ architectures.
 10 *
 11 */
 12
 13
 14#define LOCAL_STACK_SIZE 3*FRAME_SIZE
 15 
 16#include "machine/asm.h"
 17#include "abi.h"
 18
 19.const
 20gMaskShift:    .byte    0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,    \
 21                        0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,    \
 22                        0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,    \
 23                        0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,    \
 24                        0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,    \
 25                        0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,    \
 26                        0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,    \
 27                        0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7,    \
 28                        8, 9,10,11, 12,13,14,15,16,17,18,19,20,21,22,23,    \
 29                       24,25,26,27, 28,29,30,31,31,31,31,31,31,31,31,31,    \
 30                       31,31,31,31, 31,31,31,31,31,31,31,31,31,31,31,31,    \
 31                       31,31,31,31, 31,31,31,31,31,31,31,31,31,31,31,31,    \
 32                       31,31,31,31, 31,31,31,31,31,31,31,31,31,31,31,31,    \
 33                       31,31,31,31, 31,31,31,31,31,31,31,31,31,31,31,31,    \
 34                       31,31,31,31, 31,31,31,31,31,31,31,31,31,31,31,31,    \
 35                       31,31,31,31, 31,31,31,31,31,31,31,31,31,31,31,31
 36
 37.align 4
 38// 8th order minimax fit of exp2 on [-1.0,1.0].  |error| < 0.402865722354948566583852e-9:
 39powf_exp_c:     .quad 0x40bc03f30399c376,   0x3ff000000001ea2a          // c4/c8 = 0.961813690023115610862381719985771e-2 / 0.134107709538786543922336536865157e-5, c0 = 1.0 + 0.278626872016317130037181614004e-10
 40                .quad 0x408f10e7f73e6d8f,   0x3fe62e42fd0933ee            // c5/c8 = 0.133318252930790403741964203236548e-2 / 0.134107709538786543922336536865157e-5, c1 = .693147176943623740308984004029708   
 41                .quad 0x405cb616a9384e69,   0x3fcebfbdfd0f0afa            // c6/c8 = 0.154016177542147239746127455226575e-3 / 0.134107709538786543922336536865157e-5, c2 = .240226505817268621584559118975830   
 42                .quad 0x4027173ebd288ba1,   0x3fac6b0a74f15403            // c7/c8 = 0.154832722143258821052933667742417e-4 / 0.134107709538786543922336536865157e-5, c3 = 0.555041568519883074165425891257052e-1 
 43                .quad 0x3eb67fe1dc3105ba                                // c8 = 0.134107709538786543922336536865157e-5 
 44
 45//    The reduction for the log2 stage is done as:
 46//
 47//        for log2(x):
 48//            x = 2**i * 1.m                                    1.0 <= 1.m < 2.0
 49//            index = top 7 bits of m
 50//            reduced = 1.m * powf_log2_c[2*index]            1-2**-8 < reduced <= 1+2**-7
 51//            log2x = exp2(i) + powf_log2_c[2*index+1] + log1p(reduced - 1) / ln(2);
 52//
 53//        exp2(i) is exact
 54//        powf_log2_c[2*index] has 53 bits of precision, and is exact for the first and last entries
 55//        powf_log2_c[2*index+1] has 53 bits of precision, and is exact for the first and last entries
 56//        log1p( -2**-8 < x <= 2**-7 )  is done as a 5 term Taylor series. Error should be proportional
 57//            to the missing 6th order term or < 2**(6*-7)/6 ~ 2**-44, in the worst case. For smaller x, 
 58//            it will obviously be better. 
 59//        
 60//        Error in powf is rougly porportional to error_in_exp2_stage + y * error_in_log2_stage.
 61//        y is bounded by the point that powf overflows (underflow loses precision a bit sooner) 
 62//        For our 2**-7 worst case, y is about 10000 at overflow.  10000 * 2**-44 = ~6e-10, ~2**-31
 63//        which should give us a good margin of safety.  For cases that are in 1.0-2**-8 < x < 1.0+2**-7, 
 64//        we expect the precision of the Taylor series to improve faster than y's ability to disrupt 
 65//        the precision in ylog2(x) up to the point that we run out of precision in the double.  The 
 66//        worst case in this regard is probably 0x1.ffffep-1f, the number closest to 1. This overflows 
 67//        somewhere in the range y = [ -2**30, -2**31 ]. We predict we'll need 53-55 bits of precision
 68//        here, which may slightly exceed the precision of the double. While this number may come out 
 69//        wrong by a few ulps, all the other ones should be within tolerance since the next closest 
 70//        number is twice as far from 1.0. For these cases, the values in powf_log2_c are exact, so 
 71//        the only source of error is the Taylor series for log1p, and post-scaling by 1/ln(2).
 72//        For cases not in 1.0-2**-8 < x < 1.0+2**-7 the point at which y overflows is much smaller, so 
 73//        we don't need so much precision.
 74//
 75// Reduction table for log2 stage of power prepared as:
 76//        #include <stdio.h>
 77//        #include <stdint.h>
 78//        #include <math.h>
 79//
 80//        int main( void )
 81//        {
 82//                int i;
 83//
 84//                for( i = 0; i < 128; i++ )
 85//                {
 86//                        long double a = 1.0L /( 1.0L + (long double) i / (long double) 127 );
 87//                        union{ double d; uint64_t u;}u, v;
 88//                        u.d = a;
 89//                        v.d = -log2l(a);
 90//
 91//                        printf( ".quad 0x%llx,\t0x%llx\t// %Lg, -log2l(%Lg)\n", u.u, v.u, a, a );
 92//                }
 93//
 94//                return 0;
 95//        }
 96//
 97//    We use /127 here rather than /128 to allow the cases where the most precision is needed to be reduced by
 98//    exact powers of two. (These are 1.0 + 1 ulp and 1.0 - 1ulp.)  The other values will land somewhere in the
 99//  range [ 1.0 - 2**-8,  1.0 + 2**-7 ]. (Experimentally verified for all floats 1.0 <= x < 2.0.)
100//
101.align 3
102powf_log2_c:    .quad 0x3ff0000000000000,    0x8000000000000000    // 1, -log2l(1)
103                .quad 0x3fefc00000000000,    0x3f872c7ba20f7327    // 0.992188, -log2l(0.992188)
104                .quad 0x3fef80fe03f80fe0,    0x3f9715662c7f3dbc    // 0.984496, -log2l(0.984496)
105                .quad 0x3fef42f42f42f42f,    0x3fa13eea2b6545df    // 0.976923, -log2l(0.976923)
106                .quad 0x3fef05dcd30dadec,    0x3fa6e7f0bd9710dd    // 0.969466, -log2l(0.969466)
107                .quad 0x3feec9b26c9b26ca,    0x3fac85f25e12da51    // 0.962121, -log2l(0.962121)
108                .quad 0x3fee8e6fa39be8e7,    0x3fb10c8cd0c74414    // 0.954887, -log2l(0.954887)
109                .quad 0x3fee540f4898d5f8,    0x3fb3d0c813e48e00    // 0.947761, -log2l(0.947761)
110                .quad 0x3fee1a8c536fe1a9,    0x3fb68fbf5169e028    // 0.940741, -log2l(0.940741)
111                .quad 0x3fede1e1e1e1e1e2,    0x3fb949866f0b017b    // 0.933824, -log2l(0.933824)
112                .quad 0x3fedaa0b3630957d,    0x3fbbfe30e28821c0    // 0.927007, -log2l(0.927007)
113                .quad 0x3fed7303b5cc0ed7,    0x3fbeadd1b4ef9a1f    // 0.92029, -log2l(0.92029)
114                .quad 0x3fed3cc6e80ebbdb,    0x3fc0ac3dc2e0ca0c    // 0.913669, -log2l(0.913669)
115                .quad 0x3fed075075075075,    0x3fc1ff2046fb7116    // 0.907143, -log2l(0.907143)
116                .quad 0x3fecd29c244fe2f3,    0x3fc34f99517622ae    // 0.900709, -log2l(0.900709)
117                .quad 0x3fec9ea5dbf193d5,    0x3fc49db19c99a54d    // 0.894366, -log2l(0.894366)
118                .quad 0x3fec6b699f5423ce,    0x3fc5e971b3a4ee80    // 0.888112, -log2l(0.888112)
119                .quad 0x3fec38e38e38e38e,    0x3fc732e1f41ccdba    // 0.881944, -log2l(0.881944)
120                .quad 0x3fec070fe3c070fe,    0x3fc87a0a8f0ff9b2    // 0.875862, -log2l(0.875862)
121                .quad 0x3febd5eaf57abd5f,    0x3fc9bef38a4ffae5    // 0.869863, -log2l(0.869863)
122                .quad 0x3feba5713280dee9,    0x3fcb01a4c19f6811    // 0.863946, -log2l(0.863946)
123                .quad 0x3feb759f2298375a,    0x3fcc4225e7d5e3c6    // 0.858108, -log2l(0.858108)
124                .quad 0x3feb4671655e7f24,    0x3fcd807e87fa4521    // 0.852349, -log2l(0.852349)
125                .quad 0x3feb17e4b17e4b18,    0x3fcebcb6065350a2    // 0.846667, -log2l(0.846667)
126                .quad 0x3feae9f5d3eba7d7,    0x3fcff6d3a16f617f    // 0.84106, -log2l(0.84106)
127                .quad 0x3feabca1af286bca,    0x3fd0976f3991af9e    // 0.835526, -log2l(0.835526)
128                .quad 0x3fea8fe53a8fe53b,    0x3fd1326eb8c0aba3    // 0.830065, -log2l(0.830065)
129                .quad 0x3fea63bd81a98ef6,    0x3fd1cc6bb7e3870f    // 0.824675, -log2l(0.824675)
130                .quad 0x3fea3827a3827a38,    0x3fd265698fa26c0a    // 0.819355, -log2l(0.819355)
131                .quad 0x3fea0d20d20d20d2,    0x3fd2fd6b881e82d3    // 0.814103, -log2l(0.814103)
132                .quad 0x3fe9e2a65187566c,    0x3fd39474d95e1649    // 0.808917, -log2l(0.808917)
133                .quad 0x3fe9b8b577e61371,    0x3fd42a88abb54986    // 0.803797, -log2l(0.803797)
134                .quad 0x3fe98f4bac46d7c0,    0x3fd4bfaa182b7fe3    // 0.798742, -log2l(0.798742)
135                .quad 0x3fe9666666666666,    0x3fd553dc28dd9724    // 0.79375, -log2l(0.79375)
136                .quad 0x3fe93e032e1c9f02,    0x3fd5e721d95d124d    // 0.78882, -log2l(0.78882)
137                .quad 0x3fe9161f9add3c0d,    0x3fd6797e170c5221    // 0.783951, -log2l(0.783951)
138                .quad 0x3fe8eeb9533d4065,    0x3fd70af3c177f740    // 0.779141, -log2l(0.779141)
139                .quad 0x3fe8c7ce0c7ce0c8,    0x3fd79b85aaad8878    // 0.77439, -log2l(0.77439)
140                .quad 0x3fe8a15b8a15b8a1,    0x3fd82b36978f76d5    // 0.769697, -log2l(0.769697)
141                .quad 0x3fe87b5f9d4d1bc2,    0x3fd8ba09402697ed    // 0.76506, -log2l(0.76506)
142                .quad 0x3fe855d824ca58e9,    0x3fd948004ff12dbf    // 0.760479, -log2l(0.760479)
143                .quad 0x3fe830c30c30c30c,    0x3fd9d51e662f92a2    // 0.755952, -log2l(0.755952)
144                .quad 0x3fe80c1e4bbd595f,    0x3fda6166162e9ec8    // 0.751479, -log2l(0.751479)
145                .quad 0x3fe7e7e7e7e7e7e8,    0x3fdaecd9e78fdbea    // 0.747059, -log2l(0.747059)
146                .quad 0x3fe7c41df1077c42,    0x3fdb777c568f9ae2    // 0.74269, -log2l(0.74269)
147                .quad 0x3fe7a0be82fa0be8,    0x3fdc014fd448fe3a    // 0.738372, -log2l(0.738372)
148                .quad 0x3fe77dc7c4cf2aea,    0x3fdc8a56c6f80bca    // 0.734104, -log2l(0.734104)
149                .quad 0x3fe75b37e875b37f,    0x3fdd12938a39d6f0    // 0.729885, -log2l(0.729885)
150                .quad 0x3fe7390d2a6c405e,    0x3fdd9a086f4ad416    // 0.725714, -log2l(0.725714)
151                .quad 0x3fe71745d1745d17,    0x3fde20b7bd4365a8    // 0.721591, -log2l(0.721591)
152                .quad 0x3fe6f5e02e4850ff,    0x3fdea6a3b152b1e6    // 0.717514, -log2l(0.717514)
153                .quad 0x3fe6d4da9b536a6d,    0x3fdf2bce7ef7d06b    // 0.713483, -log2l(0.713483)
154                .quad 0x3fe6b4337c6cb157,    0x3fdfb03a50395dba    // 0.709497, -log2l(0.709497)
155                .quad 0x3fe693e93e93e93f,    0x3fe019f4a2edc134    // 0.705556, -log2l(0.705556)
156                .quad 0x3fe673fa57b0cbab,    0x3fe05b6ebbca3d9a    // 0.701657, -log2l(0.701657)
157                .quad 0x3fe6546546546546,    0x3fe09c8c7a1fd74c    // 0.697802, -log2l(0.697802)
158                .quad 0x3fe63528917c80b3,    0x3fe0dd4ee107ae0a    // 0.693989, -log2l(0.693989)
159                .quad 0x3fe61642c8590b21,    0x3fe11db6ef5e7873    // 0.690217, -log2l(0.690217)
160                .quad 0x3fe5f7b282135f7b,    0x3fe15dc59fdc06b7    // 0.686486, -log2l(0.686486)
161                .quad 0x3fe5d9765d9765d9,    0x3fe19d7be92a2310    // 0.682796, -log2l(0.682796)
162                .quad 0x3fe5bb8d015e75bc,    0x3fe1dcdabdfad537    // 0.679144, -log2l(0.679144)
163                .quad 0x3fe59df51b3bea36,    0x3fe21be30d1e0ddb    // 0.675532, -log2l(0.675532)
164                .quad 0x3fe580ad602b580b,    0x3fe25a95c196bef3    // 0.671958, -log2l(0.671958)
165                .quad 0x3fe563b48c20563b,    0x3fe298f3c2af6595    // 0.668421, -log2l(0.668421)
166                .quad 0x3fe5470961d7ca63,    0x3fe2d6fdf40e09c5    // 0.664921, -log2l(0.664921)
167                .quad 0x3fe52aaaaaaaaaab,    0x3fe314b535c7b89e    // 0.661458, -log2l(0.661458)
168                .quad 0x3fe50e97366227cb,    0x3fe3521a64737cf3    // 0.658031, -log2l(0.658031)
169                .quad 0x3fe4f2cddb0d3225,    0x3fe38f2e593cda73    // 0.654639, -log2l(0.654639)
170                .quad 0x3fe4d74d74d74d75,    0x3fe3cbf1e9f5cf2f    // 0.651282, -log2l(0.651282)
171                .quad 0x3fe4bc14e5e0a72f,    0x3fe40865e9285f33    // 0.647959, -log2l(0.647959)
172                .quad 0x3fe4a12316176410,    0x3fe4448b2627ade3    // 0.64467, -log2l(0.64467)
173                .quad 0x3fe48676f31219dc,    0x3fe480626d20a876    // 0.641414, -log2l(0.641414)
174                .quad 0x3fe46c0f6feb6ac6,    0x3fe4bbec872a4505    // 0.638191, -log2l(0.638191)
175                .quad 0x3fe451eb851eb852,    0x3fe4f72a3a555958    // 0.635, -log2l(0.635)
176                .quad 0x3fe4380a3065e3fb,    0x3fe5321c49bc0c91    // 0.631841, -log2l(0.631841)
177                .quad 0x3fe41e6a74981447,    0x3fe56cc37590e6c5    // 0.628713, -log2l(0.628713)
178                .quad 0x3fe4050b59897548,    0x3fe5a7207b2d815a    // 0.625616, -log2l(0.625616)
179                .quad 0x3fe3ebebebebebec,    0x3fe5e1341520db00    // 0.622549, -log2l(0.622549)
180                .quad 0x3fe3d30b3d30b3d3,    0x3fe61afefb3d5201    // 0.619512, -log2l(0.619512)
181                .quad 0x3fe3ba68636adfb0,    0x3fe65481e2a6477b    // 0.616505, -log2l(0.616505)
182                .quad 0x3fe3a2027932b48f,    0x3fe68dbd7ddd6e15    // 0.613527, -log2l(0.613527)
183                .quad 0x3fe389d89d89d89e,    0x3fe6c6b27ccfc698    // 0.610577, -log2l(0.610577)
184                .quad 0x3fe371e9f3c04e64,    0x3fe6ff618ce24cd7    // 0.607656, -log2l(0.607656)
185                .quad 0x3fe35a35a35a35a3,    0x3fe737cb58fe5716    // 0.604762, -log2l(0.604762)
186                .quad 0x3fe342bad7f64b39,    0x3fe76ff0899daa49    // 0.601896, -log2l(0.601896)
187                .quad 0x3fe32b78c13521d0,    0x3fe7a7d1c4d64520    // 0.599057, -log2l(0.599057)
188                .quad 0x3fe3146e92a10d38,    0x3fe7df6fae65e424    // 0.596244, -log2l(0.596244)
189                .quad 0x3fe2fd9b8396ba9e,    0x3fe816cae7bd40b1    // 0.593458, -log2l(0.593458)
190                .quad 0x3fe2e6fecf2e6fed,    0x3fe84de4100b0ce2    // 0.590698, -log2l(0.590698)
191                .quad 0x3fe2d097b425ed09,    0x3fe884bbc446ae3f    // 0.587963, -log2l(0.587963)
192                .quad 0x3fe2ba6574cae996,    0x3fe8bb529f3ab8f3    // 0.585253, -log2l(0.585253)
193                .quad 0x3fe2a46756e62a46,    0x3fe8f1a9398f2d58    // 0.582569, -log2l(0.582569)
194                .quad 0x3fe28e9ca3a728ea,    0x3fe927c029d3798a    // 0.579909, -log2l(0.579909)
195                .quad 0x3fe27904a7904a79,    0x3fe95d980488409a    // 0.577273, -log2l(0.577273)
196                .quad 0x3fe2639eb2639eb2,    0x3fe993315c28e8fb    // 0.574661, -log2l(0.574661)
197                .quad 0x3fe24e6a171024e7,    0x3fe9c88cc134f3c3    // 0.572072, -log2l(0.572072)
198                .quad 0x3fe239662b9f91cb,    0x3fe9fdaac2391e1c    // 0.569507, -log2l(0.569507)
199                .quad 0x3fe2249249249249,    0x3fea328bebd84e80    // 0.566964, -log2l(0.566964)
200                .quad 0x3fe20fedcba98765,    0x3fea6730c8d44efa    // 0.564444, -log2l(0.564444)
201                .quad 0x3fe1fb78121fb781,    0x3fea9b99e21655eb    // 0.561947, -log2l(0.561947)
202                .quad 0x3fe1e7307e4ef157,    0x3feacfc7beb75e94    // 0.559471, -log2l(0.559471)
203                .quad 0x3fe1d31674c59d31,    0x3feb03bae40852a0    // 0.557018, -log2l(0.557018)
204                .quad 0x3fe1bf295cc93903,    0x3feb3773d59a05ff    // 0.554585, -log2l(0.554585)
205                .quad 0x3fe1ab68a0473c1b,    0x3feb6af315450638    // 0.552174, -log2l(0.552174)
206                .quad 0x3fe197d3abc65f4f,    0x3feb9e3923313e58    // 0.549784, -log2l(0.549784)
207                .quad 0x3fe18469ee58469f,    0x3febd1467ddd70a7    // 0.547414, -log2l(0.547414)
208                .quad 0x3fe1712ad98b8957,    0x3fec041ba2268731    // 0.545064, -log2l(0.545064)
209                .quad 0x3fe15e15e15e15e1,    0x3fec36b90b4ebc3a    // 0.542735, -log2l(0.542735)
210                .quad 0x3fe14b2a7c2fee92,    0x3fec691f33049ba0    // 0.540426, -log2l(0.540426)
211                .quad 0x3fe1386822b63cbf,    0x3fec9b4e9169de22    // 0.538136, -log2l(0.538136)
212                .quad 0x3fe125ce4feeb7a1,    0x3feccd479d1a1f94    // 0.535865, -log2l(0.535865)
213                .quad 0x3fe1135c81135c81,    0x3fecff0acb3170e3    // 0.533613, -log2l(0.533613)
214                .quad 0x3fe10112358e75d3,    0x3fed30988f52c6d3    // 0.531381, -log2l(0.531381)
215                .quad 0x3fe0eeeeeeeeeeef,    0x3fed61f15bae4663    // 0.529167, -log2l(0.529167)
216                .quad 0x3fe0dcf230dcf231,    0x3fed9315a1076fa2    // 0.526971, -log2l(0.526971)
217                .quad 0x3fe0cb1b810ecf57,    0x3fedc405cebb27dc    // 0.524793, -log2l(0.524793)
218                .quad 0x3fe0b96a673e2808,    0x3fedf4c252c5a3e1    // 0.522634, -log2l(0.522634)
219                .quad 0x3fe0a7de6d1d6086,    0x3fee254b99c83339    // 0.520492, -log2l(0.520492)
220                .quad 0x3fe096771e4d528c,    0x3fee55a20f0eecf9    // 0.518367, -log2l(0.518367)
221                .quad 0x3fe0853408534085,    0x3fee85c61c963f0d    // 0.51626, -log2l(0.51626)
222                .quad 0x3fe07414ba8f0741,    0x3feeb5b82b10609b    // 0.51417, -log2l(0.51417)
223                .quad 0x3fe06318c6318c63,    0x3feee578a1eaa83f    // 0.512097, -log2l(0.512097)
224                .quad 0x3fe0523fbe3367d7,    0x3fef1507e752c6c8    // 0.51004, -log2l(0.51004)
225                .quad 0x3fe04189374bc6a8,    0x3fef4466603be71d    // 0.508, -log2l(0.508)
226                .quad 0x3fe030f4c7e7859c,    0x3fef73947063b3fd    // 0.505976, -log2l(0.505976)
227                .quad 0x3fe0208208208208,    0x3fefa2927a574422    // 0.503968, -log2l(0.503968)
228                .quad 0x3fe0103091b51f5e,    0x3fefd160df77ed7a    // 0.501976, -log2l(0.501976)
229                .quad 0x3fe0000000000000,    0x3ff0000000000000    // 0.5, -log2l(0.5)
230
231// Taylor series coefficients for log2 stage
232powf_logTaylor:    .double        -0.5, 0.33333333333333333333333333333333, -0.25, 0.2
233
234
235.literal8
236oneD:        .double        1.0
237d128:        .double        128.0
238dm150:       .double        -150.0
239recip_ln2:   .quad          0x3ff71547652b82fe      // 1.0 / ln(2)
240
241.literal4
242infF:        .long        0x7f800000        // inf
243minfF:       .long        0xff800000        // -inf
244oneF:        .long        0x3f800000        // 1.0f
245moneF:       .long        0xbf800000        // -1.0f
246maxy:        .long        0x4effffff        // 0x1.0p31f - 1 ulp
247miny:        .long        0xCeffffff        // -0x1.0p31f + 1 ulp
248mantissaMask:        .long        0x007fffff
249
250.text
251
252#if defined( __x86_64__ )
253    #define SI_P        %rsi
254    #define DI_P        %rdi
255    #define RELATIVE_ADDR( _a)                                (_a)( %rip )
256#else
257    #define SI_P        %esi
258    #define DI_P        %edi
259    #define RELATIVE_ADDR( _a)                                (_a)-0b( BX_P )
260#endif
261
262ENTRY( powf )
263#if defined( __i386__ )
264    movl    FRAME_SIZE( STACKP ),         %eax
265    movl    4+FRAME_SIZE( STACKP ),       %edx
266    movss   FRAME_SIZE( STACKP ),         %xmm0
267    movss   4+FRAME_SIZE( STACKP ),       %xmm1
268#else
269    movd    %xmm0,                        %eax
270    movd    %xmm1,                        %edx
271#endif
272
273    //early out for x == 1.0
274    cmpl    $0x3f800000,                  %eax        //if( x == 1.0 )
275    je      6f                                        //        goto 6
276
277    //early out for y == 1.0  (costs 1 cycle for x86_64, free for i386) 
278    cmpl    $0x3f800000,                  %edx        //if( y == 1.0 )
279    je        6f                                      //        goto 6
280
281    andl    $0x7fffffff,                  %edx        // |y|
282    
283    
284    // Find out if y is an integer without raising inexact
285    // Note tested over entire range. Fails for Inf/NaN, but we don't care about that here.
286    push    BX_P
287    push    SI_P
288    push    DI_P
289
290#if defined( __i386__ )    
291    call    0f
2920:  pop        BX_P
293#else
294    xorq    %rdi,                         %rdi
295#endif
296
297    // check to see if we fell into an edge case
298    subl    $1,                           %eax
299    subl    $1,                           %edx
300    cmpl    $0x7f7fffff,                  %eax        // if( x < 0 || x == inf || isnan(x) )
301    jae        7f                                     //        goto 7
302    cmpl    $0x4affffff,                  %edx        // if( |y| >= 0x1.0p23 || 0 == y || isnan(y)  )
303    jae        7f                                     //        goto 7
304
305    cmpl    $0x3effffff,                  %edx        // if( |y| == 0.5f )
306    je        8f                                      //        goto 8
307    
308// The main part of pow:
309//        0 < x < inf,    |y| < 0x1.0p31,        x != 1,        y != 0
310
311    addl    $1,                          %eax
312    andl    $0x7fffffff,                 %eax        // |x|
313
314#if 0
315    // if y is integer, call ipowf instead
316    addl    $1,                          %edx
317    movl    %edx,                        %edi        // |y|
318    lea     RELATIVE_ADDR(gMaskShift),   CX_P        // gMaskShift ptr
319    shrl    $23,                         %edi        // |y| >> 23
320    movzbl  (CX_P, DI_P, 1),             %ecx        // gMaskShift[ |y| >> 23 ]
321    mov     $0x3fffffff,                 DI_P        // 0x3fffffff
322    shrl    %cl,                         %edi        // 0x3fffffff >> gMaskShift[ |y| >> 23 ]
323    andl    %edx,                        %edi        // fractional part of y
324    cmpl    $0,                          %edi
325    je      ___ipowf
326#endif
327
328    //separate |x| into 2**i * 1.m
329    movss   RELATIVE_ADDR( mantissaMask), %xmm3
330    movss   RELATIVE_ADDR( oneF),         %xmm2
331    andps   %xmm3,                        %xmm0       // m
332    orps    %xmm2,                        %xmm0       // 1.m
333    shrl    $23,                          %eax        // exponent + bias
334    cmpl    $0,                           %eax
335    jne     1f
336    
337    // normalize denormal x
338    subss   %xmm2,                        %xmm0       //    1.m - 1.0
339    movd    %xmm0,                        %eax
340    shrl    $23,                          %eax        // exponent + bias
341    andps   %xmm3,                        %xmm0       // m
342    orps    %xmm2,                        %xmm0       // 1.m
343    subl    $126,                         %eax
344
3451:  subl    $127,                         %eax        // i = exponent - bias
346    cvtsi2sd    %eax,                     %xmm3       // log2x = (double) i
347     
348    //check for unit mantissa
349    ucomiss %xmm2,                        %xmm0       // if( 1.m == 1.0 )
350    je      2f                                        //        skip to 2
351
352    //handle non-unit mantissa here
353    movd    %xmm0,                        %eax        // set aside 1.m
354#if defined( __x86_64__ )
355    cdqe
356#endif
357    cvtss2sd    %xmm0,                    %xmm0        // r = (double) 1.m
358    lea      RELATIVE_ADDR( powf_log2_c ), CX_P
359    
360    // use the top 7 bits of the mantissa to index the powf_log2_c table
361    shr      $(23-7-4),                   AX_P
362    and      $0x7f0,                      AX_P
363    
364    // reduce r to 1-2**7 < r < 1+2**-7
365    mulsd    (CX_P, AX_P, 1),            %xmm0        // r *= powf_log2_c[ 2 * index ]
366
367    // compensate in log2x by adding powf_log2_c[ 2 * index + 1]
368    //     do this early so that we force -1.0 + 1.0 to avoid (-1.0 + tiny) + 1.0 later. 
369    //    Precision loss from this is at most 7 bits, which is acceptable    
370    addsd    8(CX_P, AX_P, 1),          %xmm3        // log2x + powf_log2_c[ 2 * index + 1]
371    
372    // we calculate log2(r) as log1p( r-1 ) / ln(2)
373    subsd    RELATIVE_ADDR(oneD),       %xmm0        // r -= 1.0
374
375    
376    // log(1+r) = r - rr/2 + rrr/3 - rrrr/4 + rrrrr/5        
377    //        with -2**-7 < r < 2**-7, should be good to (5+1)*7 +2 = 44 bits of accuracy or so
378    //            (5+1) because the error term is roughly equal to the missing r**6/6 term 
379    lea          RELATIVE_ADDR( powf_logTaylor ), CX_P
380    movsd        8(CX_P),                  %xmm4
381    movsd        24(CX_P),                 %xmm5
382    movapd       %xmm0,                    %xmm2        // r
383    mulsd        %xmm0,                    %xmm0        // rr
384    mulsd        %xmm2,                    %xmm4        // 0.333333333333333333333r
385    mulsd        %xmm2,                    %xmm5        // 0.2r
386    addsd        (CX_P),                   %xmm4        // -0.5 + 0.333333333333333333333r
387    addsd        16(CX_P),                 %xmm5        // -0.25 + 0.2r
388    mulsd        %xmm0,                    %xmm4        // -0.5rr + 0.333333333333333333333rrr
389    mulsd        %xmm0,                    %xmm0        // rrrr
390    addsd        %xmm2,                    %xmm4        // r - 0.5rr + 0.333333333333333333333rrr
391    mulsd        %xmm0,                    %xmm5        // -0.25rrrr + 0.2rrrrr
392    addsd        %xmm5,                    %xmm4        // r - 0.5rr + 0.333333333333333333333rrr - 0.25rrrr + 0.2rrrrr
393    mulsd        RELATIVE_ADDR( recip_ln2), %xmm4       // ( r - 0.5rr + 0.333333333333333333333rrr - 0.25rrrr + 0.2rrrrr ) * (1/ln(2))
394    addsd        %xmm4,                    %xmm3        // log2x + powf_log2_c[ 2 * index + 1] + ( r - 0.5rr + 0.333333333333333333333rrr - 0.25rrrr + 0.2rrrrr ) * (1/ln(2))
395
396    // multiply by y
3972:  cvtss2sd     %xmm1,                    %xmm0
398    mulsd        %xmm3,                    %xmm0        // y * log2(x) 
399
400    ucomisd      RELATIVE_ADDR( d128),     %xmm0        // if( ylog2(x) >= 128 )
401    jae            4f                                   //        goto 4
402
403    ucomisd      RELATIVE_ADDR( dm150),    %xmm0        // if( ylog2(x) <= -150
404    jbe            4f                                   //        goto 4
405
406    // separate ylog2(x) into i + f
407    cvttsd2si    %xmm0,                    %eax         // i = (int) ylog2(x)
408    cvtsi2sd     %eax,                     %xmm1        // trunc( ylog2(x) )
409    subsd        %xmm1,                    %xmm0        // f
410
411    // calculate 2**i
412    addl        $1023,                     %eax         // exponent + bias
413    movd        %eax,                      %xmm7        // move to vector register
414    psllq       $52,                       %xmm7        // shift exponent + bias into place
415
416    // early out for power of 2
417    xorpd        %xmm6,                    %xmm6
418    ucomisd      %xmm0,                    %xmm6
419    movsd        RELATIVE_ADDR( oneD),     %xmm1
420    je            3f
421    
422    //f = exp2(f)
423#if defined( __SSE3__ )
424    movddup      %xmm0,                    %xmm1        // { f, f }
425#else
426    movapd       %xmm0,                    %xmm1
427    unpcklpd     %xmm1,                    %xmm1        //    { f, f }
428#endif
429    mulsd        %xmm0,                    %xmm0        // ff = f*f
430    movapd       %xmm1,                    %xmm3        // { f, f }
431    lea          RELATIVE_ADDR( powf_exp_c ), CX_P
432    mulpd        48(CX_P),                 %xmm1        // { c3f, (c7/c8)f }
433    mulpd        16(CX_P),                 %xmm3        // { c1f, (c5/c8)f }
434#if defined( __SSE3__ )
435    movddup      %xmm0,                    %xmm4        // { ff, ff }
436#else
437    movapd       %xmm0,                    %xmm4
438    unpcklpd     %xmm4,                    %xmm4        //    { ff, ff }
439#endif
440    mulsd        %xmm0,                    %xmm0        // ffff = ff * ff
441    addpd        32(CX_P),                 %xmm1        // { c2 + c3f, (c6/c8) + (c7/c8)f }
442    addpd        (CX_P),                   %xmm3        // { c0 + c1f, (c4/c8) + (c5/c8)f }
443    mulpd        %xmm4,                    %xmm1        // { c2ff + c3fff, (c6/c8)ff + (c7/c8)fff }
444    addsd        %xmm0,                    %xmm3        // { c0 + c1x, (c4/c8) + (c5/c8)f + ffff }
445    mulsd        64(CX_P),                 %xmm0        // c8ffff
446    addpd        %xmm1,                    %xmm3        // { c0 + c1f + c2ff + c3fff, (c4/c8) + (c5/c8)f + (c6/c8)ff + (c7/c8)fff + ffff }
447    movhlps      %xmm3,                    %xmm1        // { ?, c0 + c1f + c2ff + c3fff }
448    mulsd        %xmm0,                    %xmm3        // { ..., c8ffff* ((c4/c8) + (c5/c8)f + (c6/c8)ff + (c7/c8)fff + ffff) } 
449    addsd        %xmm3,                    %xmm1        // c0 + c1f + c2ff + c3fff + c4ffff + c5fffff + c6ffffff + c7fffffff + c8fffffffff
450
451    // scale by 2**i, and convert to float
4523:    mulsd      %xmm1,                    %xmm7
453    xorps        %xmm0,                    %xmm0
454    cvtsd2ss     %xmm7,                    %xmm0
455
456    pop        DI_P
457    pop        SI_P
458    pop        BX_P
459#if defined( __i386__ )
460    movss    %xmm0,    FRAME_SIZE( STACKP )
461    flds    FRAME_SIZE( STACKP )
462#endif
463    ret
464
465    // overflow / underflow
4664:  xorpd    %xmm1,                        %xmm1       // 0
467    cmpltsd  %xmm0,                        %xmm1       // 0 < ylog2(x) ? -1LL : 0
468    movd     %xmm1,                        %eax        // 0 < ylog2(x) ? -1U : 0
469    andl     $0xfff,                       %eax        // 0 < ylog2(x) ?  0xfff : 0
470    xorl     $0x801,                       %eax        // 0 < ylog2(x) ?  0x7fe : 0x801 
471    movd     %eax,                         %xmm2       // 0 < ylog2(x) ?  0x7fe : 0x801
472    psllq    $52,                          %xmm2       // 0 < ylog2(x) ? 0x1.0p+1023 : -0x1.0p-1022
473    mulsd    %xmm0,                        %xmm2       // result = ylog2(x) * (0 < ylog2(x) ? 0x1.0p+1023 : -0x1.0p-1022)
474    xorps    %xmm0,                        %xmm0       // 0
475    cvtsd2ss %xmm2,                        %xmm0       // convert result to float
476    jmp      9f
477
478
479    // ( x < 0 && isfinite(x) && |y| is not in { 0, inf, NaN } )  or x is unknown, but |y| >= 0x1.0p23
4805:  cmpl    $0,                            %edi        // if( y is not an integer )
481    jne     8f
482    
483    // since we know y is an integer, we can just call ipowf
484    jmp    ___ipowf
485    
4866:    // x == 1.0f return x
487#if    defined( __i386__ )
488    flds    FRAME_SIZE( STACKP )
489#endif
490    ret
491    
492    //    A whole basket of special cases lands here
493    // (x <= 0 || x == Inf || isnan(x)) or ( |y| >= 0x1.0p23f || y == 0 || isnan(y) )
494    //        all we have to do is figure out which one!
4957:
496    addl    $1,                            %eax        // |y|
497    addl    $1,                            %edx        // |y|
498    andl    $0x7fffffff,                   %eax        // |x|
499    cmpl    $0,                            %edx        // if( |y| == 0 )
500    je      4f                                         //        goto 4
501
502// (x <= 0 || x == Inf || isnan(x)) or ( |y| >= 0x1.0p23 || isnan(y) )
503
504    //check for NaNs
505    ucomiss  %xmm0,                        %xmm1
506    jp       7f
507
508// (x <= 0 || x == Inf ) or |y| >= 0x1.0p23f
509    
510    // calculate fractional part of y and ones bit of y
511    movl    %edx,                         %edi        // |y|
512    lea     RELATIVE_ADDR(gMaskShift),    CX_P        // gMaskShift ptr
513    shrl    $23,                          %edi        // |y| >> 23
514    movzbl  (CX_P, DI_P, 1),              %ecx        // gMaskShift[ |y| >> 23 ]
515    mov     $0x3fffffff,                  DI_P        // 0x3fffffff
516    mov     $0x40000000,                  SI_P        // 0x40000000
517    shrl    %cl,                          %edi        // 0x3fffffff >> gMaskShift[ |y| >> 23 ]
518    shrl    %cl,                          %esi        // 0x40000000 >> gMaskShift[ |y| >> 23 ]
519    andl    %edx,                         %edi        // fractional part of y
520    andl    %edx,                         %esi        // ones bit of y
521
522    // if( x == 0 )        goto 2
523    xorps   %xmm2,                        %xmm2
524    ucomiss %xmm0,                        %xmm2
525    je      2f
526    
527// (x < 0 || x == Inf ) or |y| >= 0x1.0p23f
528
529    // if( |y| == inf )    goto 3
530    cmpl    $0x7f800000,                  %edx
531    je      3f
532
533// (x < 0 || x == Inf)  or ( 0x1.0p23f <= |y| < inf )
534
535    // if( x == inf )    goto 5
536    ucomiss   RELATIVE_ADDR( infF ),      %xmm0
537    je        5f
538
539// x < 0 or ( 0x1.0p23f <= |y| < inf )
540
541    // negative finite x or large y go off to be considered for ipowf
542    ucomiss   RELATIVE_ADDR( minfF ),     %xmm0        // if( x != -inf )      
543    ja        5b                                       //        goto the other 5
544
545// x == -inf
546
547    // At this point, we know that x is -Inf and |y| is not in { 0, Inf, NaN }.
548    // Deal with y is odd integer cases
549    // if( 0 == fractionalBits && 0 != onesBit )
550    movl    %edi,                         %ecx        // fractional Bits
551    subl    $1,                           %ecx        // fractionalBits == 0 ? -1 : some non-negative number
552    sarl    $31,                          %ecx        // fractionalBits == 0 ? -1 : 0
553    andl    %esi,                         %ecx        // fractionalBits == 0 ? onesBit : 0
554    cmpl    $0,                           %ecx        // if( 0 == fractionalBits && 0 != onesBit )
555    jne      6f                                       //        goto 6
556    
557//    x = -inf, |y| is not in { 0, Inf, NaN }, and y is not an odd integer
558
559    // if( 0.0f < y )    return -x;        else return 0
560    cmpltss  %xmm1,                       %xmm2      //  0.0f < y ? -1 : 0 
561    andps    %xmm2,                       %xmm0      //  0.0f < y ?  x : 0
562    pslld    $31,                         %xmm2      //  0.0f < y ? 0x80000000 : 0
563    xorps    %xmm2,                       %xmm0      //  0.0f < y ?  -x : 0
564    jmp      9f        //return 0
565
566    // x < 0 && y is not an integer, or |y| == 0.5f 
5678:  xorps    %xmm2,                       %xmm2
568    cmpless  %xmm0,                       %xmm2        //  0 <= x ? -1 : 0
569    andps    %xmm1,                       %xmm2        //    0 <= x ?  y : 0
570    xorps    %xmm3,                       %xmm3
571    ucomiss  %xmm2,                       %xmm3        //  if( x >= 0 && 0 > y )
572    ja       1f                                        //        goto 1
573
574    sqrtss   %xmm0,                      %xmm0
575    jmp      9f    //return
576    
577//    y == -0.5f && x > 0
5781:  cvtss2sd   %xmm0,                    %xmm0
579    movsd      RELATIVE_ADDR( oneD ),    %xmm1
580    divsd      %xmm0,                    %xmm1
581    sqrtsd     %xmm1,                    %xmm1
582    xorps      %xmm0,                    %xmm0
583    cvtsd2ss   %xmm1,                    %xmm0
584    jmp        9f    // return
585
586
587    // x == 0
5882:    // if( y is an odd integer )        goto 8
589    movl    %edi,                        %ecx        // fractional Bits
590    subl    $1,                          %ecx        // fractionalBits == 0 ? -1 : some non-negative number
591    sarl    $31,                         %ecx        // fractionalBits == 0 ? -1 : 0
592    andl    %esi,                        %ecx        // fractionalBits == 0 ? onesBit : 0
593    cmpl    $0,                          %ecx        // if( fractionalBits == 0 && 0 != onesBit )
594    jne     8f                                       //        y is an odd integer, goto 8
595
596    xorps    %xmm0,                      %xmm0       // x = fabsf(x)
597    ucomiss  %xmm1,                      %xmm0       // if( 0 < y )
598    jb       9f                                      //        return x
599
600    //return 1.0 / f
601    movss    RELATIVE_ADDR( oneF ),      %xmm1
602    divss    %xmm0,                      %xmm1        // return inf and set div/0
603    movaps   %xmm1,                      %xmm0
604    jmp      9f
605
606    // |y| == inf
6073:  ucomiss  RELATIVE_ADDR( moneF ),     %xmm0        // if( -1.0f == x )
608    je       4f                                       //        return 1.0f
609
610    cmpl     $0x3f7fffff,                %eax         // if( |x| > 1.0f )
611    ja       1f                                       //        goto 1f
612    
613    xorps    %xmm0,                      %xmm0        // 0.0f
614    cmpnless %xmm1,                      %xmm0        // y == inf ? 0 : -1 
615    psrld    $1,                         %xmm0        // y == inf ? 0 : 0x7fffffff
616    andps    %xmm1,                      %xmm0        // y == inf ? 0 : inf
617    jmp      9f                                       // return
618    
619    // return 1.0f
6204:  movl     $1,                        %ecx
621    xorps    %xmm0,                     %xmm0
622    cvtsi2ss %ecx,                      %xmm0
623    jmp      9f
624
625    // x == inf
6265:  xorps    %xmm2,                     %xmm2
627    cmpltss  %xmm1,                     %xmm2        // 0 < y ? -1 : 0
628    andps    %xmm2,                     %xmm0        // 0 < y ?  x : 0
629    jmp      9f
630
631    // 0 == fractionalBits && 0 != onesBit 
6326:  xorps    %xmm2,                     %xmm2
633    ucomiss  %xmm1,                     %xmm2        // if( 0 < y )
634    jb       9f                                        //return x                        
635
636    movl    $0x80000000,                %ecx        // -0.0f
637    movd    %ecx,                       %xmm0        // copy to xmm, zero high part of register
638    jmp     9f        //return -0.0
639
640    
6417:    // NaN
642    addss   %xmm1,                      %xmm0
643    jmp     9f
644
645    // x == 0, y is an odd integer
6468:  ucomiss   %xmm1,                    %xmm2    // if( 0 < y )
647    jb        9f                                    //        return x
648    
649    //return 1.0 / f
650    movss    RELATIVE_ADDR( oneF ),     %xmm1
651    divss    %xmm0,                     %xmm1        // return inf and set div/0
652    movaps   %xmm1,                     %xmm0
653    jmp      9f
654
655    //|y| == inf, |x| > 1.0f
6561:  xorps    %xmm0,                        %xmm0
657    cmpltss  %xmm1,                        %xmm0
658    andps    %xmm1,                        %xmm0
659    jmp      9f
660    
661
662.align 4
663    // return value in %xmm0
6649:
665    pop        DI_P
666    pop        SI_P
667    pop        BX_P
668#if defined( __i386__ )
669    movss      %xmm0,    FRAME_SIZE( STACKP )
670    flds       FRAME_SIZE( STACKP )
671#endif
672    ret
673    
674// x and y passed in in xmm0 and xmm1
675//  result returned in xmm0 
676//    BX_P already points to label 0 above
677___ipowf:
678    // clamp INT_MIN <= y < INT_MAX.  Values outside this range can't be odd numbers.
679    maxss        RELATIVE_ADDR( miny ),    %xmm1
680    minss        RELATIVE_ADDR( maxy ),    %xmm1
681    cvttss2si    %xmm1,                    %edx        // (int) y
682    
683    cvtss2sd    %xmm0,                     %xmm0        //  x
684    movsd       RELATIVE_ADDR( oneD ),     %xmm2        //    r = 1.0
685
686    cmpl        $0,                        %edx         //    if( y >= 0 )
687    jge         1f                                     //        goto 4
688    
689    // y < 0
690    movapd      %xmm0,                     %xmm1        // x
691    movapd      %xmm2,                     %xmm0        // 1.0
692    divsd       %xmm1,                     %xmm0        // 1.0 / x
693    negl        %edx
694    
6951:  test        $1,                        %edx
696    jz          3f                                    // if( |y| is odd )
697    movapd      %xmm0,                     %xmm2        //        r = x
698    jmp         3f
699
700.align 4
701// do{
7022:  mulsd       %xmm0,                     %xmm0        //    x *= x
703    test        $1,                        %edx
704    jz          3f                                    //    if( |y| is odd )    continue
705    mulsd       %xmm0,                     %xmm2        //        r *= x
706
7073:  shrl        $1,                        %edx        //    |y| >>= 1
708    test        $-1,                       %edx
709    jnz         2b                                    //    if( y )        continue
710//    }while( |y| )
711
712    // round to float
713    xorps       %xmm0,                     %xmm0        // 0
714    cvtsd2ss    %xmm2,                     %xmm0        // (float) r
715
716    //exit
717    jmp         9b