this repo has no description
at fixPythonPipStalling 386 lines 20 kB view raw
1/* 2 * Copyright (c) 2002 Apple Computer, Inc. All rights reserved. 3 * 4 * @APPLE_LICENSE_HEADER_START@ 5 * 6 * The contents of this file constitute Original Code as defined in and 7 * are subject to the Apple Public Source License Version 1.1 (the 8 * "License"). You may not use this file except in compliance with the 9 * License. Please obtain a copy of the License at 10 * http://www.apple.com/publicsource and read it before using this file. 11 * 12 * This Original Code and all software distributed under the License are 13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER 14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the 17 * License for the specific language governing rights and limitations 18 * under the License. 19 * 20 * @APPLE_LICENSE_HEADER_END@ 21 */ 22/******************************************************************************* 23* * 24* File sqrt.c, * 25* Function square root (IEEE-754). * 26* * 27* Implementation of square root based on the approximation provided by * 28* Taligent, Inc. who received it from IBM. * 29* * 30* Copyright � 1996-2001 Apple Computer, Inc. All rights reserved. * 31* * 32* Modified and ported by A. Sazegari, started on June 1996. * 33* Modified and ported by Robert A. Murley (ram) for Mac OS X. * 34* * 35* A MathLib v4 file. * 36* * 37* This file was received from Taligent which modified the original IBM * 38* sources. It returns the square root of its double argument, using * 39* Markstein algorithm and requires MAF code generation. * 40* * 41* The algorithm uses Newton�s method to calculate the IEEE-754 correctly * 42* rounded double square root, starting with 8 bit estimate for: * 43* g � �x and y � 1/2�x. Using MAF instructions, each iteration refines * 44* the original approximations g and y with rounding mode set to nearest. * 45* The final step calculates g with the caller�s rounding restored. This * 46* in turn guarantees proper IEEE rounding and exceptions. INEXACT is the * 47* only possible exception raised in this calculation. Initial guesses * 48* for g and y are determined from argument x via table lookup into the * 49* array SqrtTable[256]. * 50* * 51* Note: NaN is set computationally. It will be changed to the proper * 52* NaN code later. * 53* * 54* April 14 1997: added this file to mathlib v3 and included mathlib�s * 55* nan code. * 56* July 23 2001: replaced __setflm with FEGETENVD/FESETENVD. * 57* August 28 2001: added #ifdef __ppc__. * 58* replaced DblInHex typedef with hexdouble. * 59* used standard exception symbols from fenv.h. * 60* September 09 2001: added more comments. * 61* September 10 2001: added macros to detect PowerPC and correct compiler. * 62* September 18 2001: added <CoreServices/CoreServices.h> to get <fp.h>. * 63* October 08 2001: removed <CoreServices/CoreServices.h>. * 64* changed SqrtTable to const private_extern * 65* changed compiler errors to warnings. * 66* November 06 2001: commented out warning about Intel architectures. * 67* November 19 2001: <scp> Added single precision implementation. * 68* * 69* W A R N I N G: * 70* These routines require a 64-bit double precision IEEE-754 model. * 71* They are written for PowerPC only and are expecting the compiler * 72* to generate the correct sequence of multiply-add fused instructions. * 73* * 74* These routines are not intended for 32-bit Intel architectures. * 75* * 76* A version of gcc higher than 932 is required. * 77* * 78* GCC compiler options: * 79* optimization level 3 (-O3) * 80* -fschedule-insns -finline-functions -funroll-all-loops * 81* * 82*******************************************************************************/ 83 84#include "fenv_private.h" 85#include "fp_private.h" 86#include "math.h" 87#include <System/ppc/cpu_capabilities.h> 88 89inline static uint32_t __attribute__((always_inline)) 90_get_cpu_capabilities(void) 91{ 92 uint32_t caps; 93 asm("lwz %0, %1(0)" : "=r" (caps) : "I" (_COMM_PAGE_CPU_CAPABILITIES)); 94 return caps; 95} 96 97#define twoTo512 1.34078079299425971e154 // 2**512 98#define twoToMinus256 8.636168555094444625e-78 // 2**-256 99#define upHalfOfAnULP 0.500000000000000111 // 0x1.0000000000001p-1 100 101#define SQRT_NAN "1" 102 103/******************************************************************************* 104* SqrtTable[256] contains initial estimates for the high significand bits * 105* of sqrt and correction factor. This table is also used by the IBM * 106* arccos and arcsin functions. * 107*******************************************************************************/ 108 109__private_extern__ 110const unsigned short SqrtTable[256] = 111 { 112 27497, 27752, 28262, 28517, 28772, 29282, 29537, 29792, 30302, 30557, 31067, 31322, 113 31577, 32088, 32343, 32598, 33108, 33363, 33618, 34128, 34384, 34639, 35149, 35404, 114 35659, 35914, 36425, 36680, 36935, 37446, 37701, 37956, 38211, 38722, 38977, 39232, 115 39487, 39998, 40253, 40508, 40763, 41274, 41529, 41784, 42040, 42550, 42805, 43061, 116 43316, 43571, 44082, 44337, 44592, 44848, 45103, 45358, 45869, 46124, 46379, 46635, 117 46890, 47401, 47656, 47911, 48167, 48422, 48677, 48933, 49443, 49699, 49954, 50209, 118 50465, 50720, 50976, 51231, 51742, 51997, 52252, 52508, 52763, 53019, 53274, 53529, 119 53785, 54296, 54551, 54806, 55062, 55317, 55573, 55828, 56083, 56339, 56594, 56850, 120 57105, 57616, 57871, 58127, 58382, 58638, 58893, 59149, 59404, 59660, 59915, 60170, 121 60426, 60681, 60937, 61192, 61448, 61703, 61959, 62214, 62470, 62725, 62981, 63236, 122 63492, 63747, 64003, 64258, 64514, 64769, 65025, 65280, 511, 510, 764, 1018, 123 1272, 1526, 1780, 2034, 2288, 2542, 2796, 3050, 3305, 3559, 3813, 4067, 124 4321, 4576, 4830, 5084, 5338, 5593, 5847, 6101, 6101, 6356, 6610, 6864, 125 7119, 7373, 7627, 7882, 8136, 8391, 8391, 8645, 8899, 9154, 9408, 9663, 126 9917, 10172, 10172, 10426, 10681, 10935, 11190, 11444, 11699, 11699, 11954, 12208, 127 12463, 12717, 12972, 13226, 13226, 13481, 13736, 13990, 14245, 14245, 14500, 14754, 128 15009, 15264, 15518, 15518, 15773, 16028, 16282, 16537, 16537, 16792, 17047, 17301, 129 17556, 17556, 17811, 18066, 18320, 18575, 18575, 18830, 19085, 19339, 19339, 19594, 130 19849, 20104, 20104, 20359, 20614, 20868, 21123, 21123, 21378, 21633, 21888, 21888, 131 22143, 22398, 22653, 22653, 22907, 23162, 23417, 23417, 23672, 23927, 23927, 24182, 132 24437, 24692, 24692, 24947, 25202, 25457, 25457, 25712, 25967, 25967, 26222, 26477, 133 26732, 26732, 26987, 27242 }; 134 135// There are flag and rounding errors being generated in this file 136 137static const double Two911 = 0x1.0p-911; // HEXDOUBLE( 0x07000000, 0x00000000 ); 138static const hexdouble infinity = HEXDOUBLE( 0x7ff00000, 0x00000000 ); 139static const double twoTo128 = 0x1.0p+128; // 0.340282366920938463463e39; 140static const double twoToM128 = 0x1.0p-128; // 0.293873587705571876992e-38; 141 142double __sqrt ( double x ); 143double __sqrt ( double x ) 144{ 145 register int index; 146 hexdouble xInHex, yInHex, gInHex; 147 register double OldEnvironment, g, y, y2, d, e; 148 register uint32_t xhead, ghead, yhead; 149 150 register double FPR_z, FPR_Two911, FPR_TwoM128, FPR_Two128, FPR_inf, FPR_HalfULP; 151 152 xInHex.d = x; FPR_z = 0.0; 153 FPR_inf = infinity.d; FPR_Two911 = Two911; 154 FPR_TwoM128 = twoToM128; FPR_Two128 = twoTo128; 155 gInHex.i.lo = 0u; yInHex.i.lo = 0u; 156 FPR_HalfULP = upHalfOfAnULP; 157 158 FEGETENVD( OldEnvironment ); // save environment, set default 159 160 __NOOP; 161 __ENSURE( FPR_TwoM128, FPR_Two128, FPR_z ); 162 163 FESETENVD( FPR_z ); 164 __ENSURE( FPR_HalfULP, FPR_inf, FPR_Two911 ); 165 166/******************************************************************************* 167* Special case for GPUL: storing and loading floats avoids L/S hazard 168*******************************************************************************/ 169 __NOOP; 170 if (likely( FPR_TwoM128 < x && x < FPR_Two128 )) // Can float hold initial guess? 171 { 172 hexsingle GInHex, YInHex; 173 register uint32_t GPR_t, GPR_foo; 174 175/******************************************************************************* 176* Calculate initial estimates for g and y from x and SqrtTable[]. * 177*******************************************************************************/ 178 179 xhead = xInHex.i.hi; // high 32 bits of x 180 index = ( xhead >> 13 ) & 0xffu; // table index 181 GPR_t = SqrtTable[index]; 182 183 // Form *single* precision exponent estimate from double argument: 184 // (((((xhead - bias1024) >> 1) + bias128) << 3) & 0x7f800000) = 185 // (((((xhead - bias1024 + bias128 + bias128) >> 1) << 3) & 0x7f800000)) 186 187 ghead = ( ( xhead + 0x07f00000 + 0x07f00000 - 0x3ff00000 ) << 2 ) & 0x7f800000; 188 GInHex.lval = ghead + ( ( 0xff00u & GPR_t ) << 7 ); 189 190 // Force GInHex.lval to memory. Then load it into g in the FPU register file 191 // on the *following* cycle. This avoids a Store/Load hazard. 192 asm volatile ( "add %0, %1, %2" : "=r" (GPR_foo) : "b" (&GInHex), "b" (&YInHex) : "memory" ); /* NOOP */ 193 g = GInHex.fval; 194 195 yhead = 0x7e000000u - ghead; 196 YInHex.lval = yhead + ( ( 0xffu & GPR_t ) << 15 ); 197 198 // Force YInHex.lval to memory. Then load it into y in the FPU register file 199 // on the *following* cycle. This avoids a Store/Load hazard. 200 asm volatile ( "add %0, %1, %2" : "=r" (GPR_foo) : "b" (&GInHex), "b" (&YInHex) : "memory" ); /* NOOP */ 201 __NOOP; 202 __NOOP; 203 y = YInHex.fval; 204 205/******************************************************************************* 206* Iterate to refine both g and y. * 207*******************************************************************************/ 208 209 d = __FNMSUB( g, g, x ); y2 = y + y; 210 211 g = __FMADD(y, d, g ); // 16-bit g 212 213 e = __FNMSUB( y, g, FPR_HalfULP ); d = __FNMSUB( g, g, x ); 214 215 y = __FMADD( e, y2, y ); // 16-bit y 216 217 y2 = y + y; g = __FMADD(y, d, g); // 32-bit g 218 219 e = __FNMSUB( y, g, FPR_HalfULP ); d = __FNMSUB( g, g, x ); 220 221 y = __FMADD( e, y2, y ); // 32-bit y 222 223 y2 = y + y; g = __FMADD(y, d, g ); // 64-bit g before rounding 224 225 e = __FNMSUB( y, g, FPR_HalfULP ); d = __FNMSUB( g, g, x ); 226 227 y = __FMADD( e, y2, y ); // 64-bit y 228 229 FESETENVD( OldEnvironment ); // restore caller's environment 230 231 __NOOP; 232 return ( __FMADD( y, d, g ) ); // final step 233 } 234 235 if (likely( FPR_z < x && x < FPR_inf )) 236 { 237 238/******************************************************************************* 239* First and most common section: argument > 2.0^(-911), about 5.77662E-275.* 240*******************************************************************************/ 241 242 if (likely( FPR_Two911 < x )) 243 { 244 register uint32_t GPR_t; 245 246/******************************************************************************* 247* Calculate initial estimates for g and y from x and SqrtTable[]. * 248*******************************************************************************/ 249 xhead = xInHex.i.hi; // high 32 bits of x 250 index = ( xhead >> 13 ) & 0xffu; // table index 251 GPR_t = SqrtTable[index]; 252 253 ghead = ( ( xhead + 0x3ff00000 ) >> 1 ) & 0x7ff00000; 254 yhead = 0x7fc00000u - ghead; 255 256 gInHex.i.hi = ghead + ( ( 0xff00u & GPR_t ) << 4 ); 257 g = gInHex.d; 258 259 yInHex.i.hi = yhead + ( ( 0xffu & GPR_t ) << 12 ); 260 y = yInHex.d; 261/******************************************************************************* 262* Iterate to refine both g and y. * 263*******************************************************************************/ 264 265 d = __FNMSUB( g, g, x ); y2 = y + y; 266 267 g = __FMADD(y, d, g ); // 16-bit g 268 269 e = __FNMSUB( y, g, FPR_HalfULP ); d = __FNMSUB( g, g, x ); 270 271 y = __FMADD( e, y2, y ); // 16-bit y 272 273 y2 = y + y; g = __FMADD(y, d, g); // 32-bit g 274 275 e = __FNMSUB( y, g, FPR_HalfULP ); d = __FNMSUB( g, g, x ); 276 277 y = __FMADD( e, y2, y ); // 32-bit y 278 279 y2 = y + y; g = __FMADD(y, d, g ); // 64-bit g before rounding 280 281 e = __FNMSUB( y, g, FPR_HalfULP ); d = __FNMSUB( g, g, x ); 282 283 y = __FMADD( e, y2, y ); // 64-bit y 284 285 FESETENVD( OldEnvironment ); // restore caller's environment 286 return ( __FMADD( y, d, g ) ); // final step 287 } 288 289/******************************************************************************* 290* Second section: 0.0 < argument < 2.0^(-911) which is about 5.77662E-275. * 291* Identical to the previous segment aside from 2^512 scale factor. * 292*******************************************************************************/ 293 else 294 { 295 xInHex.d = x * twoTo512; // scale up by 2^512 296 __NOOP; 297 __NOOP; 298 __NOOP; 299 xhead = xInHex.i.hi; 300 301/******************************************************************************* 302* Calculate initial estimates for g and y from x and SqrtTable[]. * 303*******************************************************************************/ 304 305 ghead = ( ( xhead + 0x3ff00000 ) >> 1) & 0x7ff00000; 306 index = ( xhead >> 13) & 0xffu; // table index 307 yhead = 0x7fc00000u - ghead; 308 gInHex.i.hi = ghead + ( ( 0xff00u & SqrtTable[index] ) << 4 ); 309 yInHex.i.hi = yhead + ( ( 0xffu & SqrtTable[index] ) << 12 ); 310 x = xInHex.d; 311 g = gInHex.d; 312 y = yInHex.d; 313 314/******************************************************************************* 315* Iterate to refine both g and y. * 316*******************************************************************************/ 317 318 d = __FNMSUB( g, g, x ); y2 = y + y; 319 320 g = __FMADD(y, d, g ); // 16-bit g 321 322 e = __FNMSUB( y, g, FPR_HalfULP ); d = __FNMSUB( g, g, x ); 323 324 y = __FMADD( e, y2, y ); // 16-bit y 325 326 y2 = y + y; g = __FMADD(y, d, g); // 32-bit g 327 328 e = __FNMSUB( y, g, FPR_HalfULP ); d = __FNMSUB( g, g, x ); 329 330 y = __FMADD( e, y2, y ); // 32-bit y 331 332 y2 = y + y; g = __FMADD(y, d, g ); // 64-bit g before rounding 333 334 e = __FNMSUB( y, g, FPR_HalfULP ); d = __FNMSUB( g, g, x ); 335 336 d *= twoToMinus256; g *= twoToMinus256; // undo scaling 337 338 y = __FMADD( e, y2, y ); // 64-bit y 339 340 FESETENVD( OldEnvironment ); // restore caller's environment 341 return ( __FMADD( y, d, g ) ); // final step 342 } 343 } 344 345/******************************************************************************* 346* Fourth section: special cases: argument is +INF, NaN, +/-0.0, or <0. * 347*******************************************************************************/ 348 349 FESETENVD( OldEnvironment ); // restore caller's environment 350 351 if ( x < FPR_z && x == x ) // < 0.0 and not NAN 352 { 353 hexdouble env; 354 x = nan ( SQRT_NAN ); 355 env.d = OldEnvironment; 356 __NOOP; 357 __NOOP; 358 __NOOP; 359 360 env.i.lo |= SET_INVALID; 361 FESETENVD_GRP( env.d ); // restore caller's environment 362 return ( x ); // return NAN 363 } 364 else return ( x ); // NANs and +/-0.0 365} 366 367/* 368By pushing the software sqrt implementation into its own routine, we avoid the PIC setup and simply have: 369_sqrt: 37000000340 lwz r0,0x8020(0) 37100000344 andis. r2,r0,0x2000 37200000348 bne 0x350 3730000034c b ___sqrt 37400000350 fsqrt f1,f1 37500000354 blr 376Perhaps gcc can someday be coaxed to eliminate the extra branch? 377Matt suggests "use -freorder-blocks", we'll pick that up for the next release. 378*/ 379double sqrt(double x) 380{ 381 if ((_get_cpu_capabilities() & kHasFsqrt)) 382 return __fsqrt( x ); 383 else return __sqrt( x ); 384} 385 386