this repo has no description
at fixPythonPipStalling 717 lines 40 kB view raw
1 2/* 3 * powf.s 4 * 5 * by Ian Ollmann 6 * 7 * Copyright (c) 2007, Apple Inc. All Rights Reserved. 8 * 9 * Implementation of C99 powf function for MacOS X __i386__ and __x86_64__ architectures. 10 * 11 */ 12 13 14#define LOCAL_STACK_SIZE 3*FRAME_SIZE 15 16#include "machine/asm.h" 17#include "abi.h" 18 19.const 20gMaskShift: .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ 21 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ 22 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ 23 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ 24 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ 25 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ 26 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ 27 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, \ 28 8, 9,10,11, 12,13,14,15,16,17,18,19,20,21,22,23, \ 29 24,25,26,27, 28,29,30,31,31,31,31,31,31,31,31,31, \ 30 31,31,31,31, 31,31,31,31,31,31,31,31,31,31,31,31, \ 31 31,31,31,31, 31,31,31,31,31,31,31,31,31,31,31,31, \ 32 31,31,31,31, 31,31,31,31,31,31,31,31,31,31,31,31, \ 33 31,31,31,31, 31,31,31,31,31,31,31,31,31,31,31,31, \ 34 31,31,31,31, 31,31,31,31,31,31,31,31,31,31,31,31, \ 35 31,31,31,31, 31,31,31,31,31,31,31,31,31,31,31,31 36 37.align 4 38// 8th order minimax fit of exp2 on [-1.0,1.0]. |error| < 0.402865722354948566583852e-9: 39powf_exp_c: .quad 0x40bc03f30399c376, 0x3ff000000001ea2a // c4/c8 = 0.961813690023115610862381719985771e-2 / 0.134107709538786543922336536865157e-5, c0 = 1.0 + 0.278626872016317130037181614004e-10 40 .quad 0x408f10e7f73e6d8f, 0x3fe62e42fd0933ee // c5/c8 = 0.133318252930790403741964203236548e-2 / 0.134107709538786543922336536865157e-5, c1 = .693147176943623740308984004029708 41 .quad 0x405cb616a9384e69, 0x3fcebfbdfd0f0afa // c6/c8 = 0.154016177542147239746127455226575e-3 / 0.134107709538786543922336536865157e-5, c2 = .240226505817268621584559118975830 42 .quad 0x4027173ebd288ba1, 0x3fac6b0a74f15403 // c7/c8 = 0.154832722143258821052933667742417e-4 / 0.134107709538786543922336536865157e-5, c3 = 0.555041568519883074165425891257052e-1 43 .quad 0x3eb67fe1dc3105ba // c8 = 0.134107709538786543922336536865157e-5 44 45// The reduction for the log2 stage is done as: 46// 47// for log2(x): 48// x = 2**i * 1.m 1.0 <= 1.m < 2.0 49// index = top 7 bits of m 50// reduced = 1.m * powf_log2_c[2*index] 1-2**-8 < reduced <= 1+2**-7 51// log2x = exp2(i) + powf_log2_c[2*index+1] + log1p(reduced - 1) / ln(2); 52// 53// exp2(i) is exact 54// powf_log2_c[2*index] has 53 bits of precision, and is exact for the first and last entries 55// powf_log2_c[2*index+1] has 53 bits of precision, and is exact for the first and last entries 56// log1p( -2**-8 < x <= 2**-7 ) is done as a 5 term Taylor series. Error should be proportional 57// to the missing 6th order term or < 2**(6*-7)/6 ~ 2**-44, in the worst case. For smaller x, 58// it will obviously be better. 59// 60// Error in powf is rougly porportional to error_in_exp2_stage + y * error_in_log2_stage. 61// y is bounded by the point that powf overflows (underflow loses precision a bit sooner) 62// For our 2**-7 worst case, y is about 10000 at overflow. 10000 * 2**-44 = ~6e-10, ~2**-31 63// which should give us a good margin of safety. For cases that are in 1.0-2**-8 < x < 1.0+2**-7, 64// we expect the precision of the Taylor series to improve faster than y's ability to disrupt 65// the precision in ylog2(x) up to the point that we run out of precision in the double. The 66// worst case in this regard is probably 0x1.ffffep-1f, the number closest to 1. This overflows 67// somewhere in the range y = [ -2**30, -2**31 ]. We predict we'll need 53-55 bits of precision 68// here, which may slightly exceed the precision of the double. While this number may come out 69// wrong by a few ulps, all the other ones should be within tolerance since the next closest 70// number is twice as far from 1.0. For these cases, the values in powf_log2_c are exact, so 71// the only source of error is the Taylor series for log1p, and post-scaling by 1/ln(2). 72// For cases not in 1.0-2**-8 < x < 1.0+2**-7 the point at which y overflows is much smaller, so 73// we don't need so much precision. 74// 75// Reduction table for log2 stage of power prepared as: 76// #include <stdio.h> 77// #include <stdint.h> 78// #include <math.h> 79// 80// int main( void ) 81// { 82// int i; 83// 84// for( i = 0; i < 128; i++ ) 85// { 86// long double a = 1.0L /( 1.0L + (long double) i / (long double) 127 ); 87// union{ double d; uint64_t u;}u, v; 88// u.d = a; 89// v.d = -log2l(a); 90// 91// printf( ".quad 0x%llx,\t0x%llx\t// %Lg, -log2l(%Lg)\n", u.u, v.u, a, a ); 92// } 93// 94// return 0; 95// } 96// 97// We use /127 here rather than /128 to allow the cases where the most precision is needed to be reduced by 98// exact powers of two. (These are 1.0 + 1 ulp and 1.0 - 1ulp.) The other values will land somewhere in the 99// range [ 1.0 - 2**-8, 1.0 + 2**-7 ]. (Experimentally verified for all floats 1.0 <= x < 2.0.) 100// 101.align 3 102powf_log2_c: .quad 0x3ff0000000000000, 0x8000000000000000 // 1, -log2l(1) 103 .quad 0x3fefc00000000000, 0x3f872c7ba20f7327 // 0.992188, -log2l(0.992188) 104 .quad 0x3fef80fe03f80fe0, 0x3f9715662c7f3dbc // 0.984496, -log2l(0.984496) 105 .quad 0x3fef42f42f42f42f, 0x3fa13eea2b6545df // 0.976923, -log2l(0.976923) 106 .quad 0x3fef05dcd30dadec, 0x3fa6e7f0bd9710dd // 0.969466, -log2l(0.969466) 107 .quad 0x3feec9b26c9b26ca, 0x3fac85f25e12da51 // 0.962121, -log2l(0.962121) 108 .quad 0x3fee8e6fa39be8e7, 0x3fb10c8cd0c74414 // 0.954887, -log2l(0.954887) 109 .quad 0x3fee540f4898d5f8, 0x3fb3d0c813e48e00 // 0.947761, -log2l(0.947761) 110 .quad 0x3fee1a8c536fe1a9, 0x3fb68fbf5169e028 // 0.940741, -log2l(0.940741) 111 .quad 0x3fede1e1e1e1e1e2, 0x3fb949866f0b017b // 0.933824, -log2l(0.933824) 112 .quad 0x3fedaa0b3630957d, 0x3fbbfe30e28821c0 // 0.927007, -log2l(0.927007) 113 .quad 0x3fed7303b5cc0ed7, 0x3fbeadd1b4ef9a1f // 0.92029, -log2l(0.92029) 114 .quad 0x3fed3cc6e80ebbdb, 0x3fc0ac3dc2e0ca0c // 0.913669, -log2l(0.913669) 115 .quad 0x3fed075075075075, 0x3fc1ff2046fb7116 // 0.907143, -log2l(0.907143) 116 .quad 0x3fecd29c244fe2f3, 0x3fc34f99517622ae // 0.900709, -log2l(0.900709) 117 .quad 0x3fec9ea5dbf193d5, 0x3fc49db19c99a54d // 0.894366, -log2l(0.894366) 118 .quad 0x3fec6b699f5423ce, 0x3fc5e971b3a4ee80 // 0.888112, -log2l(0.888112) 119 .quad 0x3fec38e38e38e38e, 0x3fc732e1f41ccdba // 0.881944, -log2l(0.881944) 120 .quad 0x3fec070fe3c070fe, 0x3fc87a0a8f0ff9b2 // 0.875862, -log2l(0.875862) 121 .quad 0x3febd5eaf57abd5f, 0x3fc9bef38a4ffae5 // 0.869863, -log2l(0.869863) 122 .quad 0x3feba5713280dee9, 0x3fcb01a4c19f6811 // 0.863946, -log2l(0.863946) 123 .quad 0x3feb759f2298375a, 0x3fcc4225e7d5e3c6 // 0.858108, -log2l(0.858108) 124 .quad 0x3feb4671655e7f24, 0x3fcd807e87fa4521 // 0.852349, -log2l(0.852349) 125 .quad 0x3feb17e4b17e4b18, 0x3fcebcb6065350a2 // 0.846667, -log2l(0.846667) 126 .quad 0x3feae9f5d3eba7d7, 0x3fcff6d3a16f617f // 0.84106, -log2l(0.84106) 127 .quad 0x3feabca1af286bca, 0x3fd0976f3991af9e // 0.835526, -log2l(0.835526) 128 .quad 0x3fea8fe53a8fe53b, 0x3fd1326eb8c0aba3 // 0.830065, -log2l(0.830065) 129 .quad 0x3fea63bd81a98ef6, 0x3fd1cc6bb7e3870f // 0.824675, -log2l(0.824675) 130 .quad 0x3fea3827a3827a38, 0x3fd265698fa26c0a // 0.819355, -log2l(0.819355) 131 .quad 0x3fea0d20d20d20d2, 0x3fd2fd6b881e82d3 // 0.814103, -log2l(0.814103) 132 .quad 0x3fe9e2a65187566c, 0x3fd39474d95e1649 // 0.808917, -log2l(0.808917) 133 .quad 0x3fe9b8b577e61371, 0x3fd42a88abb54986 // 0.803797, -log2l(0.803797) 134 .quad 0x3fe98f4bac46d7c0, 0x3fd4bfaa182b7fe3 // 0.798742, -log2l(0.798742) 135 .quad 0x3fe9666666666666, 0x3fd553dc28dd9724 // 0.79375, -log2l(0.79375) 136 .quad 0x3fe93e032e1c9f02, 0x3fd5e721d95d124d // 0.78882, -log2l(0.78882) 137 .quad 0x3fe9161f9add3c0d, 0x3fd6797e170c5221 // 0.783951, -log2l(0.783951) 138 .quad 0x3fe8eeb9533d4065, 0x3fd70af3c177f740 // 0.779141, -log2l(0.779141) 139 .quad 0x3fe8c7ce0c7ce0c8, 0x3fd79b85aaad8878 // 0.77439, -log2l(0.77439) 140 .quad 0x3fe8a15b8a15b8a1, 0x3fd82b36978f76d5 // 0.769697, -log2l(0.769697) 141 .quad 0x3fe87b5f9d4d1bc2, 0x3fd8ba09402697ed // 0.76506, -log2l(0.76506) 142 .quad 0x3fe855d824ca58e9, 0x3fd948004ff12dbf // 0.760479, -log2l(0.760479) 143 .quad 0x3fe830c30c30c30c, 0x3fd9d51e662f92a2 // 0.755952, -log2l(0.755952) 144 .quad 0x3fe80c1e4bbd595f, 0x3fda6166162e9ec8 // 0.751479, -log2l(0.751479) 145 .quad 0x3fe7e7e7e7e7e7e8, 0x3fdaecd9e78fdbea // 0.747059, -log2l(0.747059) 146 .quad 0x3fe7c41df1077c42, 0x3fdb777c568f9ae2 // 0.74269, -log2l(0.74269) 147 .quad 0x3fe7a0be82fa0be8, 0x3fdc014fd448fe3a // 0.738372, -log2l(0.738372) 148 .quad 0x3fe77dc7c4cf2aea, 0x3fdc8a56c6f80bca // 0.734104, -log2l(0.734104) 149 .quad 0x3fe75b37e875b37f, 0x3fdd12938a39d6f0 // 0.729885, -log2l(0.729885) 150 .quad 0x3fe7390d2a6c405e, 0x3fdd9a086f4ad416 // 0.725714, -log2l(0.725714) 151 .quad 0x3fe71745d1745d17, 0x3fde20b7bd4365a8 // 0.721591, -log2l(0.721591) 152 .quad 0x3fe6f5e02e4850ff, 0x3fdea6a3b152b1e6 // 0.717514, -log2l(0.717514) 153 .quad 0x3fe6d4da9b536a6d, 0x3fdf2bce7ef7d06b // 0.713483, -log2l(0.713483) 154 .quad 0x3fe6b4337c6cb157, 0x3fdfb03a50395dba // 0.709497, -log2l(0.709497) 155 .quad 0x3fe693e93e93e93f, 0x3fe019f4a2edc134 // 0.705556, -log2l(0.705556) 156 .quad 0x3fe673fa57b0cbab, 0x3fe05b6ebbca3d9a // 0.701657, -log2l(0.701657) 157 .quad 0x3fe6546546546546, 0x3fe09c8c7a1fd74c // 0.697802, -log2l(0.697802) 158 .quad 0x3fe63528917c80b3, 0x3fe0dd4ee107ae0a // 0.693989, -log2l(0.693989) 159 .quad 0x3fe61642c8590b21, 0x3fe11db6ef5e7873 // 0.690217, -log2l(0.690217) 160 .quad 0x3fe5f7b282135f7b, 0x3fe15dc59fdc06b7 // 0.686486, -log2l(0.686486) 161 .quad 0x3fe5d9765d9765d9, 0x3fe19d7be92a2310 // 0.682796, -log2l(0.682796) 162 .quad 0x3fe5bb8d015e75bc, 0x3fe1dcdabdfad537 // 0.679144, -log2l(0.679144) 163 .quad 0x3fe59df51b3bea36, 0x3fe21be30d1e0ddb // 0.675532, -log2l(0.675532) 164 .quad 0x3fe580ad602b580b, 0x3fe25a95c196bef3 // 0.671958, -log2l(0.671958) 165 .quad 0x3fe563b48c20563b, 0x3fe298f3c2af6595 // 0.668421, -log2l(0.668421) 166 .quad 0x3fe5470961d7ca63, 0x3fe2d6fdf40e09c5 // 0.664921, -log2l(0.664921) 167 .quad 0x3fe52aaaaaaaaaab, 0x3fe314b535c7b89e // 0.661458, -log2l(0.661458) 168 .quad 0x3fe50e97366227cb, 0x3fe3521a64737cf3 // 0.658031, -log2l(0.658031) 169 .quad 0x3fe4f2cddb0d3225, 0x3fe38f2e593cda73 // 0.654639, -log2l(0.654639) 170 .quad 0x3fe4d74d74d74d75, 0x3fe3cbf1e9f5cf2f // 0.651282, -log2l(0.651282) 171 .quad 0x3fe4bc14e5e0a72f, 0x3fe40865e9285f33 // 0.647959, -log2l(0.647959) 172 .quad 0x3fe4a12316176410, 0x3fe4448b2627ade3 // 0.64467, -log2l(0.64467) 173 .quad 0x3fe48676f31219dc, 0x3fe480626d20a876 // 0.641414, -log2l(0.641414) 174 .quad 0x3fe46c0f6feb6ac6, 0x3fe4bbec872a4505 // 0.638191, -log2l(0.638191) 175 .quad 0x3fe451eb851eb852, 0x3fe4f72a3a555958 // 0.635, -log2l(0.635) 176 .quad 0x3fe4380a3065e3fb, 0x3fe5321c49bc0c91 // 0.631841, -log2l(0.631841) 177 .quad 0x3fe41e6a74981447, 0x3fe56cc37590e6c5 // 0.628713, -log2l(0.628713) 178 .quad 0x3fe4050b59897548, 0x3fe5a7207b2d815a // 0.625616, -log2l(0.625616) 179 .quad 0x3fe3ebebebebebec, 0x3fe5e1341520db00 // 0.622549, -log2l(0.622549) 180 .quad 0x3fe3d30b3d30b3d3, 0x3fe61afefb3d5201 // 0.619512, -log2l(0.619512) 181 .quad 0x3fe3ba68636adfb0, 0x3fe65481e2a6477b // 0.616505, -log2l(0.616505) 182 .quad 0x3fe3a2027932b48f, 0x3fe68dbd7ddd6e15 // 0.613527, -log2l(0.613527) 183 .quad 0x3fe389d89d89d89e, 0x3fe6c6b27ccfc698 // 0.610577, -log2l(0.610577) 184 .quad 0x3fe371e9f3c04e64, 0x3fe6ff618ce24cd7 // 0.607656, -log2l(0.607656) 185 .quad 0x3fe35a35a35a35a3, 0x3fe737cb58fe5716 // 0.604762, -log2l(0.604762) 186 .quad 0x3fe342bad7f64b39, 0x3fe76ff0899daa49 // 0.601896, -log2l(0.601896) 187 .quad 0x3fe32b78c13521d0, 0x3fe7a7d1c4d64520 // 0.599057, -log2l(0.599057) 188 .quad 0x3fe3146e92a10d38, 0x3fe7df6fae65e424 // 0.596244, -log2l(0.596244) 189 .quad 0x3fe2fd9b8396ba9e, 0x3fe816cae7bd40b1 // 0.593458, -log2l(0.593458) 190 .quad 0x3fe2e6fecf2e6fed, 0x3fe84de4100b0ce2 // 0.590698, -log2l(0.590698) 191 .quad 0x3fe2d097b425ed09, 0x3fe884bbc446ae3f // 0.587963, -log2l(0.587963) 192 .quad 0x3fe2ba6574cae996, 0x3fe8bb529f3ab8f3 // 0.585253, -log2l(0.585253) 193 .quad 0x3fe2a46756e62a46, 0x3fe8f1a9398f2d58 // 0.582569, -log2l(0.582569) 194 .quad 0x3fe28e9ca3a728ea, 0x3fe927c029d3798a // 0.579909, -log2l(0.579909) 195 .quad 0x3fe27904a7904a79, 0x3fe95d980488409a // 0.577273, -log2l(0.577273) 196 .quad 0x3fe2639eb2639eb2, 0x3fe993315c28e8fb // 0.574661, -log2l(0.574661) 197 .quad 0x3fe24e6a171024e7, 0x3fe9c88cc134f3c3 // 0.572072, -log2l(0.572072) 198 .quad 0x3fe239662b9f91cb, 0x3fe9fdaac2391e1c // 0.569507, -log2l(0.569507) 199 .quad 0x3fe2249249249249, 0x3fea328bebd84e80 // 0.566964, -log2l(0.566964) 200 .quad 0x3fe20fedcba98765, 0x3fea6730c8d44efa // 0.564444, -log2l(0.564444) 201 .quad 0x3fe1fb78121fb781, 0x3fea9b99e21655eb // 0.561947, -log2l(0.561947) 202 .quad 0x3fe1e7307e4ef157, 0x3feacfc7beb75e94 // 0.559471, -log2l(0.559471) 203 .quad 0x3fe1d31674c59d31, 0x3feb03bae40852a0 // 0.557018, -log2l(0.557018) 204 .quad 0x3fe1bf295cc93903, 0x3feb3773d59a05ff // 0.554585, -log2l(0.554585) 205 .quad 0x3fe1ab68a0473c1b, 0x3feb6af315450638 // 0.552174, -log2l(0.552174) 206 .quad 0x3fe197d3abc65f4f, 0x3feb9e3923313e58 // 0.549784, -log2l(0.549784) 207 .quad 0x3fe18469ee58469f, 0x3febd1467ddd70a7 // 0.547414, -log2l(0.547414) 208 .quad 0x3fe1712ad98b8957, 0x3fec041ba2268731 // 0.545064, -log2l(0.545064) 209 .quad 0x3fe15e15e15e15e1, 0x3fec36b90b4ebc3a // 0.542735, -log2l(0.542735) 210 .quad 0x3fe14b2a7c2fee92, 0x3fec691f33049ba0 // 0.540426, -log2l(0.540426) 211 .quad 0x3fe1386822b63cbf, 0x3fec9b4e9169de22 // 0.538136, -log2l(0.538136) 212 .quad 0x3fe125ce4feeb7a1, 0x3feccd479d1a1f94 // 0.535865, -log2l(0.535865) 213 .quad 0x3fe1135c81135c81, 0x3fecff0acb3170e3 // 0.533613, -log2l(0.533613) 214 .quad 0x3fe10112358e75d3, 0x3fed30988f52c6d3 // 0.531381, -log2l(0.531381) 215 .quad 0x3fe0eeeeeeeeeeef, 0x3fed61f15bae4663 // 0.529167, -log2l(0.529167) 216 .quad 0x3fe0dcf230dcf231, 0x3fed9315a1076fa2 // 0.526971, -log2l(0.526971) 217 .quad 0x3fe0cb1b810ecf57, 0x3fedc405cebb27dc // 0.524793, -log2l(0.524793) 218 .quad 0x3fe0b96a673e2808, 0x3fedf4c252c5a3e1 // 0.522634, -log2l(0.522634) 219 .quad 0x3fe0a7de6d1d6086, 0x3fee254b99c83339 // 0.520492, -log2l(0.520492) 220 .quad 0x3fe096771e4d528c, 0x3fee55a20f0eecf9 // 0.518367, -log2l(0.518367) 221 .quad 0x3fe0853408534085, 0x3fee85c61c963f0d // 0.51626, -log2l(0.51626) 222 .quad 0x3fe07414ba8f0741, 0x3feeb5b82b10609b // 0.51417, -log2l(0.51417) 223 .quad 0x3fe06318c6318c63, 0x3feee578a1eaa83f // 0.512097, -log2l(0.512097) 224 .quad 0x3fe0523fbe3367d7, 0x3fef1507e752c6c8 // 0.51004, -log2l(0.51004) 225 .quad 0x3fe04189374bc6a8, 0x3fef4466603be71d // 0.508, -log2l(0.508) 226 .quad 0x3fe030f4c7e7859c, 0x3fef73947063b3fd // 0.505976, -log2l(0.505976) 227 .quad 0x3fe0208208208208, 0x3fefa2927a574422 // 0.503968, -log2l(0.503968) 228 .quad 0x3fe0103091b51f5e, 0x3fefd160df77ed7a // 0.501976, -log2l(0.501976) 229 .quad 0x3fe0000000000000, 0x3ff0000000000000 // 0.5, -log2l(0.5) 230 231// Taylor series coefficients for log2 stage 232powf_logTaylor: .double -0.5, 0.33333333333333333333333333333333, -0.25, 0.2 233 234 235.literal8 236oneD: .double 1.0 237d128: .double 128.0 238dm150: .double -150.0 239recip_ln2: .quad 0x3ff71547652b82fe // 1.0 / ln(2) 240 241.literal4 242infF: .long 0x7f800000 // inf 243minfF: .long 0xff800000 // -inf 244oneF: .long 0x3f800000 // 1.0f 245moneF: .long 0xbf800000 // -1.0f 246maxy: .long 0x4effffff // 0x1.0p31f - 1 ulp 247miny: .long 0xCeffffff // -0x1.0p31f + 1 ulp 248mantissaMask: .long 0x007fffff 249 250.text 251 252#if defined( __x86_64__ ) 253 #define SI_P %rsi 254 #define DI_P %rdi 255 #define RELATIVE_ADDR( _a) (_a)( %rip ) 256#else 257 #define SI_P %esi 258 #define DI_P %edi 259 #define RELATIVE_ADDR( _a) (_a)-0b( BX_P ) 260#endif 261 262ENTRY( powf ) 263#if defined( __i386__ ) 264 movl FRAME_SIZE( STACKP ), %eax 265 movl 4+FRAME_SIZE( STACKP ), %edx 266 movss FRAME_SIZE( STACKP ), %xmm0 267 movss 4+FRAME_SIZE( STACKP ), %xmm1 268#else 269 movd %xmm0, %eax 270 movd %xmm1, %edx 271#endif 272 273 //early out for x == 1.0 274 cmpl $0x3f800000, %eax //if( x == 1.0 ) 275 je 6f // goto 6 276 277 //early out for y == 1.0 (costs 1 cycle for x86_64, free for i386) 278 cmpl $0x3f800000, %edx //if( y == 1.0 ) 279 je 6f // goto 6 280 281 andl $0x7fffffff, %edx // |y| 282 283 284 // Find out if y is an integer without raising inexact 285 // Note tested over entire range. Fails for Inf/NaN, but we don't care about that here. 286 push BX_P 287 push SI_P 288 push DI_P 289 290#if defined( __i386__ ) 291 call 0f 2920: pop BX_P 293#else 294 xorq %rdi, %rdi 295#endif 296 297 // check to see if we fell into an edge case 298 subl $1, %eax 299 subl $1, %edx 300 cmpl $0x7f7fffff, %eax // if( x < 0 || x == inf || isnan(x) ) 301 jae 7f // goto 7 302 cmpl $0x4affffff, %edx // if( |y| >= 0x1.0p23 || 0 == y || isnan(y) ) 303 jae 7f // goto 7 304 305 cmpl $0x3effffff, %edx // if( |y| == 0.5f ) 306 je 8f // goto 8 307 308// The main part of pow: 309// 0 < x < inf, |y| < 0x1.0p31, x != 1, y != 0 310 311 addl $1, %eax 312 andl $0x7fffffff, %eax // |x| 313 314#if 0 315 // if y is integer, call ipowf instead 316 addl $1, %edx 317 movl %edx, %edi // |y| 318 lea RELATIVE_ADDR(gMaskShift), CX_P // gMaskShift ptr 319 shrl $23, %edi // |y| >> 23 320 movzbl (CX_P, DI_P, 1), %ecx // gMaskShift[ |y| >> 23 ] 321 mov $0x3fffffff, DI_P // 0x3fffffff 322 shrl %cl, %edi // 0x3fffffff >> gMaskShift[ |y| >> 23 ] 323 andl %edx, %edi // fractional part of y 324 cmpl $0, %edi 325 je ___ipowf 326#endif 327 328 //separate |x| into 2**i * 1.m 329 movss RELATIVE_ADDR( mantissaMask), %xmm3 330 movss RELATIVE_ADDR( oneF), %xmm2 331 andps %xmm3, %xmm0 // m 332 orps %xmm2, %xmm0 // 1.m 333 shrl $23, %eax // exponent + bias 334 cmpl $0, %eax 335 jne 1f 336 337 // normalize denormal x 338 subss %xmm2, %xmm0 // 1.m - 1.0 339 movd %xmm0, %eax 340 shrl $23, %eax // exponent + bias 341 andps %xmm3, %xmm0 // m 342 orps %xmm2, %xmm0 // 1.m 343 subl $126, %eax 344 3451: subl $127, %eax // i = exponent - bias 346 cvtsi2sd %eax, %xmm3 // log2x = (double) i 347 348 //check for unit mantissa 349 ucomiss %xmm2, %xmm0 // if( 1.m == 1.0 ) 350 je 2f // skip to 2 351 352 //handle non-unit mantissa here 353 movd %xmm0, %eax // set aside 1.m 354#if defined( __x86_64__ ) 355 cdqe 356#endif 357 cvtss2sd %xmm0, %xmm0 // r = (double) 1.m 358 lea RELATIVE_ADDR( powf_log2_c ), CX_P 359 360 // use the top 7 bits of the mantissa to index the powf_log2_c table 361 shr $(23-7-4), AX_P 362 and $0x7f0, AX_P 363 364 // reduce r to 1-2**7 < r < 1+2**-7 365 mulsd (CX_P, AX_P, 1), %xmm0 // r *= powf_log2_c[ 2 * index ] 366 367 // compensate in log2x by adding powf_log2_c[ 2 * index + 1] 368 // do this early so that we force -1.0 + 1.0 to avoid (-1.0 + tiny) + 1.0 later. 369 // Precision loss from this is at most 7 bits, which is acceptable 370 addsd 8(CX_P, AX_P, 1), %xmm3 // log2x + powf_log2_c[ 2 * index + 1] 371 372 // we calculate log2(r) as log1p( r-1 ) / ln(2) 373 subsd RELATIVE_ADDR(oneD), %xmm0 // r -= 1.0 374 375 376 // log(1+r) = r - rr/2 + rrr/3 - rrrr/4 + rrrrr/5 377 // with -2**-7 < r < 2**-7, should be good to (5+1)*7 +2 = 44 bits of accuracy or so 378 // (5+1) because the error term is roughly equal to the missing r**6/6 term 379 lea RELATIVE_ADDR( powf_logTaylor ), CX_P 380 movsd 8(CX_P), %xmm4 381 movsd 24(CX_P), %xmm5 382 movapd %xmm0, %xmm2 // r 383 mulsd %xmm0, %xmm0 // rr 384 mulsd %xmm2, %xmm4 // 0.333333333333333333333r 385 mulsd %xmm2, %xmm5 // 0.2r 386 addsd (CX_P), %xmm4 // -0.5 + 0.333333333333333333333r 387 addsd 16(CX_P), %xmm5 // -0.25 + 0.2r 388 mulsd %xmm0, %xmm4 // -0.5rr + 0.333333333333333333333rrr 389 mulsd %xmm0, %xmm0 // rrrr 390 addsd %xmm2, %xmm4 // r - 0.5rr + 0.333333333333333333333rrr 391 mulsd %xmm0, %xmm5 // -0.25rrrr + 0.2rrrrr 392 addsd %xmm5, %xmm4 // r - 0.5rr + 0.333333333333333333333rrr - 0.25rrrr + 0.2rrrrr 393 mulsd RELATIVE_ADDR( recip_ln2), %xmm4 // ( r - 0.5rr + 0.333333333333333333333rrr - 0.25rrrr + 0.2rrrrr ) * (1/ln(2)) 394 addsd %xmm4, %xmm3 // log2x + powf_log2_c[ 2 * index + 1] + ( r - 0.5rr + 0.333333333333333333333rrr - 0.25rrrr + 0.2rrrrr ) * (1/ln(2)) 395 396 // multiply by y 3972: cvtss2sd %xmm1, %xmm0 398 mulsd %xmm3, %xmm0 // y * log2(x) 399 400 ucomisd RELATIVE_ADDR( d128), %xmm0 // if( ylog2(x) >= 128 ) 401 jae 4f // goto 4 402 403 ucomisd RELATIVE_ADDR( dm150), %xmm0 // if( ylog2(x) <= -150 404 jbe 4f // goto 4 405 406 // separate ylog2(x) into i + f 407 cvttsd2si %xmm0, %eax // i = (int) ylog2(x) 408 cvtsi2sd %eax, %xmm1 // trunc( ylog2(x) ) 409 subsd %xmm1, %xmm0 // f 410 411 // calculate 2**i 412 addl $1023, %eax // exponent + bias 413 movd %eax, %xmm7 // move to vector register 414 psllq $52, %xmm7 // shift exponent + bias into place 415 416 // early out for power of 2 417 xorpd %xmm6, %xmm6 418 ucomisd %xmm0, %xmm6 419 movsd RELATIVE_ADDR( oneD), %xmm1 420 je 3f 421 422 //f = exp2(f) 423#if defined( __SSE3__ ) 424 movddup %xmm0, %xmm1 // { f, f } 425#else 426 movapd %xmm0, %xmm1 427 unpcklpd %xmm1, %xmm1 // { f, f } 428#endif 429 mulsd %xmm0, %xmm0 // ff = f*f 430 movapd %xmm1, %xmm3 // { f, f } 431 lea RELATIVE_ADDR( powf_exp_c ), CX_P 432 mulpd 48(CX_P), %xmm1 // { c3f, (c7/c8)f } 433 mulpd 16(CX_P), %xmm3 // { c1f, (c5/c8)f } 434#if defined( __SSE3__ ) 435 movddup %xmm0, %xmm4 // { ff, ff } 436#else 437 movapd %xmm0, %xmm4 438 unpcklpd %xmm4, %xmm4 // { ff, ff } 439#endif 440 mulsd %xmm0, %xmm0 // ffff = ff * ff 441 addpd 32(CX_P), %xmm1 // { c2 + c3f, (c6/c8) + (c7/c8)f } 442 addpd (CX_P), %xmm3 // { c0 + c1f, (c4/c8) + (c5/c8)f } 443 mulpd %xmm4, %xmm1 // { c2ff + c3fff, (c6/c8)ff + (c7/c8)fff } 444 addsd %xmm0, %xmm3 // { c0 + c1x, (c4/c8) + (c5/c8)f + ffff } 445 mulsd 64(CX_P), %xmm0 // c8ffff 446 addpd %xmm1, %xmm3 // { c0 + c1f + c2ff + c3fff, (c4/c8) + (c5/c8)f + (c6/c8)ff + (c7/c8)fff + ffff } 447 movhlps %xmm3, %xmm1 // { ?, c0 + c1f + c2ff + c3fff } 448 mulsd %xmm0, %xmm3 // { ..., c8ffff* ((c4/c8) + (c5/c8)f + (c6/c8)ff + (c7/c8)fff + ffff) } 449 addsd %xmm3, %xmm1 // c0 + c1f + c2ff + c3fff + c4ffff + c5fffff + c6ffffff + c7fffffff + c8fffffffff 450 451 // scale by 2**i, and convert to float 4523: mulsd %xmm1, %xmm7 453 xorps %xmm0, %xmm0 454 cvtsd2ss %xmm7, %xmm0 455 456 pop DI_P 457 pop SI_P 458 pop BX_P 459#if defined( __i386__ ) 460 movss %xmm0, FRAME_SIZE( STACKP ) 461 flds FRAME_SIZE( STACKP ) 462#endif 463 ret 464 465 // overflow / underflow 4664: xorpd %xmm1, %xmm1 // 0 467 cmpltsd %xmm0, %xmm1 // 0 < ylog2(x) ? -1LL : 0 468 movd %xmm1, %eax // 0 < ylog2(x) ? -1U : 0 469 andl $0xfff, %eax // 0 < ylog2(x) ? 0xfff : 0 470 xorl $0x801, %eax // 0 < ylog2(x) ? 0x7fe : 0x801 471 movd %eax, %xmm2 // 0 < ylog2(x) ? 0x7fe : 0x801 472 psllq $52, %xmm2 // 0 < ylog2(x) ? 0x1.0p+1023 : -0x1.0p-1022 473 mulsd %xmm0, %xmm2 // result = ylog2(x) * (0 < ylog2(x) ? 0x1.0p+1023 : -0x1.0p-1022) 474 xorps %xmm0, %xmm0 // 0 475 cvtsd2ss %xmm2, %xmm0 // convert result to float 476 jmp 9f 477 478 479 // ( x < 0 && isfinite(x) && |y| is not in { 0, inf, NaN } ) or x is unknown, but |y| >= 0x1.0p23 4805: cmpl $0, %edi // if( y is not an integer ) 481 jne 8f 482 483 // since we know y is an integer, we can just call ipowf 484 jmp ___ipowf 485 4866: // x == 1.0f return x 487#if defined( __i386__ ) 488 flds FRAME_SIZE( STACKP ) 489#endif 490 ret 491 492 // A whole basket of special cases lands here 493 // (x <= 0 || x == Inf || isnan(x)) or ( |y| >= 0x1.0p23f || y == 0 || isnan(y) ) 494 // all we have to do is figure out which one! 4957: 496 addl $1, %eax // |y| 497 addl $1, %edx // |y| 498 andl $0x7fffffff, %eax // |x| 499 cmpl $0, %edx // if( |y| == 0 ) 500 je 4f // goto 4 501 502// (x <= 0 || x == Inf || isnan(x)) or ( |y| >= 0x1.0p23 || isnan(y) ) 503 504 //check for NaNs 505 ucomiss %xmm0, %xmm1 506 jp 7f 507 508// (x <= 0 || x == Inf ) or |y| >= 0x1.0p23f 509 510 // calculate fractional part of y and ones bit of y 511 movl %edx, %edi // |y| 512 lea RELATIVE_ADDR(gMaskShift), CX_P // gMaskShift ptr 513 shrl $23, %edi // |y| >> 23 514 movzbl (CX_P, DI_P, 1), %ecx // gMaskShift[ |y| >> 23 ] 515 mov $0x3fffffff, DI_P // 0x3fffffff 516 mov $0x40000000, SI_P // 0x40000000 517 shrl %cl, %edi // 0x3fffffff >> gMaskShift[ |y| >> 23 ] 518 shrl %cl, %esi // 0x40000000 >> gMaskShift[ |y| >> 23 ] 519 andl %edx, %edi // fractional part of y 520 andl %edx, %esi // ones bit of y 521 522 // if( x == 0 ) goto 2 523 xorps %xmm2, %xmm2 524 ucomiss %xmm0, %xmm2 525 je 2f 526 527// (x < 0 || x == Inf ) or |y| >= 0x1.0p23f 528 529 // if( |y| == inf ) goto 3 530 cmpl $0x7f800000, %edx 531 je 3f 532 533// (x < 0 || x == Inf) or ( 0x1.0p23f <= |y| < inf ) 534 535 // if( x == inf ) goto 5 536 ucomiss RELATIVE_ADDR( infF ), %xmm0 537 je 5f 538 539// x < 0 or ( 0x1.0p23f <= |y| < inf ) 540 541 // negative finite x or large y go off to be considered for ipowf 542 ucomiss RELATIVE_ADDR( minfF ), %xmm0 // if( x != -inf ) 543 ja 5b // goto the other 5 544 545// x == -inf 546 547 // At this point, we know that x is -Inf and |y| is not in { 0, Inf, NaN }. 548 // Deal with y is odd integer cases 549 // if( 0 == fractionalBits && 0 != onesBit ) 550 movl %edi, %ecx // fractional Bits 551 subl $1, %ecx // fractionalBits == 0 ? -1 : some non-negative number 552 sarl $31, %ecx // fractionalBits == 0 ? -1 : 0 553 andl %esi, %ecx // fractionalBits == 0 ? onesBit : 0 554 cmpl $0, %ecx // if( 0 == fractionalBits && 0 != onesBit ) 555 jne 6f // goto 6 556 557// x = -inf, |y| is not in { 0, Inf, NaN }, and y is not an odd integer 558 559 // if( 0.0f < y ) return -x; else return 0 560 cmpltss %xmm1, %xmm2 // 0.0f < y ? -1 : 0 561 andps %xmm2, %xmm0 // 0.0f < y ? x : 0 562 pslld $31, %xmm2 // 0.0f < y ? 0x80000000 : 0 563 xorps %xmm2, %xmm0 // 0.0f < y ? -x : 0 564 jmp 9f //return 0 565 566 // x < 0 && y is not an integer, or |y| == 0.5f 5678: xorps %xmm2, %xmm2 568 cmpless %xmm0, %xmm2 // 0 <= x ? -1 : 0 569 andps %xmm1, %xmm2 // 0 <= x ? y : 0 570 xorps %xmm3, %xmm3 571 ucomiss %xmm2, %xmm3 // if( x >= 0 && 0 > y ) 572 ja 1f // goto 1 573 574 sqrtss %xmm0, %xmm0 575 jmp 9f //return 576 577// y == -0.5f && x > 0 5781: cvtss2sd %xmm0, %xmm0 579 movsd RELATIVE_ADDR( oneD ), %xmm1 580 divsd %xmm0, %xmm1 581 sqrtsd %xmm1, %xmm1 582 xorps %xmm0, %xmm0 583 cvtsd2ss %xmm1, %xmm0 584 jmp 9f // return 585 586 587 // x == 0 5882: // if( y is an odd integer ) goto 8 589 movl %edi, %ecx // fractional Bits 590 subl $1, %ecx // fractionalBits == 0 ? -1 : some non-negative number 591 sarl $31, %ecx // fractionalBits == 0 ? -1 : 0 592 andl %esi, %ecx // fractionalBits == 0 ? onesBit : 0 593 cmpl $0, %ecx // if( fractionalBits == 0 && 0 != onesBit ) 594 jne 8f // y is an odd integer, goto 8 595 596 xorps %xmm0, %xmm0 // x = fabsf(x) 597 ucomiss %xmm1, %xmm0 // if( 0 < y ) 598 jb 9f // return x 599 600 //return 1.0 / f 601 movss RELATIVE_ADDR( oneF ), %xmm1 602 divss %xmm0, %xmm1 // return inf and set div/0 603 movaps %xmm1, %xmm0 604 jmp 9f 605 606 // |y| == inf 6073: ucomiss RELATIVE_ADDR( moneF ), %xmm0 // if( -1.0f == x ) 608 je 4f // return 1.0f 609 610 cmpl $0x3f7fffff, %eax // if( |x| > 1.0f ) 611 ja 1f // goto 1f 612 613 xorps %xmm0, %xmm0 // 0.0f 614 cmpnless %xmm1, %xmm0 // y == inf ? 0 : -1 615 psrld $1, %xmm0 // y == inf ? 0 : 0x7fffffff 616 andps %xmm1, %xmm0 // y == inf ? 0 : inf 617 jmp 9f // return 618 619 // return 1.0f 6204: movl $1, %ecx 621 xorps %xmm0, %xmm0 622 cvtsi2ss %ecx, %xmm0 623 jmp 9f 624 625 // x == inf 6265: xorps %xmm2, %xmm2 627 cmpltss %xmm1, %xmm2 // 0 < y ? -1 : 0 628 andps %xmm2, %xmm0 // 0 < y ? x : 0 629 jmp 9f 630 631 // 0 == fractionalBits && 0 != onesBit 6326: xorps %xmm2, %xmm2 633 ucomiss %xmm1, %xmm2 // if( 0 < y ) 634 jb 9f //return x 635 636 movl $0x80000000, %ecx // -0.0f 637 movd %ecx, %xmm0 // copy to xmm, zero high part of register 638 jmp 9f //return -0.0 639 640 6417: // NaN 642 addss %xmm1, %xmm0 643 jmp 9f 644 645 // x == 0, y is an odd integer 6468: ucomiss %xmm1, %xmm2 // if( 0 < y ) 647 jb 9f // return x 648 649 //return 1.0 / f 650 movss RELATIVE_ADDR( oneF ), %xmm1 651 divss %xmm0, %xmm1 // return inf and set div/0 652 movaps %xmm1, %xmm0 653 jmp 9f 654 655 //|y| == inf, |x| > 1.0f 6561: xorps %xmm0, %xmm0 657 cmpltss %xmm1, %xmm0 658 andps %xmm1, %xmm0 659 jmp 9f 660 661 662.align 4 663 // return value in %xmm0 6649: 665 pop DI_P 666 pop SI_P 667 pop BX_P 668#if defined( __i386__ ) 669 movss %xmm0, FRAME_SIZE( STACKP ) 670 flds FRAME_SIZE( STACKP ) 671#endif 672 ret 673 674// x and y passed in in xmm0 and xmm1 675// result returned in xmm0 676// BX_P already points to label 0 above 677___ipowf: 678 // clamp INT_MIN <= y < INT_MAX. Values outside this range can't be odd numbers. 679 maxss RELATIVE_ADDR( miny ), %xmm1 680 minss RELATIVE_ADDR( maxy ), %xmm1 681 cvttss2si %xmm1, %edx // (int) y 682 683 cvtss2sd %xmm0, %xmm0 // x 684 movsd RELATIVE_ADDR( oneD ), %xmm2 // r = 1.0 685 686 cmpl $0, %edx // if( y >= 0 ) 687 jge 1f // goto 4 688 689 // y < 0 690 movapd %xmm0, %xmm1 // x 691 movapd %xmm2, %xmm0 // 1.0 692 divsd %xmm1, %xmm0 // 1.0 / x 693 negl %edx 694 6951: test $1, %edx 696 jz 3f // if( |y| is odd ) 697 movapd %xmm0, %xmm2 // r = x 698 jmp 3f 699 700.align 4 701// do{ 7022: mulsd %xmm0, %xmm0 // x *= x 703 test $1, %edx 704 jz 3f // if( |y| is odd ) continue 705 mulsd %xmm0, %xmm2 // r *= x 706 7073: shrl $1, %edx // |y| >>= 1 708 test $-1, %edx 709 jnz 2b // if( y ) continue 710// }while( |y| ) 711 712 // round to float 713 xorps %xmm0, %xmm0 // 0 714 cvtsd2ss %xmm2, %xmm0 // (float) r 715 716 //exit 717 jmp 9b