src/libm/Source/PowerPC/sqrt.c at fixPythonPipStalling

overby.me / darling-nix
fork atom
this repo has no description
fork atom
darling-nix / src / libm / Source / PowerPC / sqrt.c
at fixPythonPipStalling 386 lines 20 kB view raw
wrap content
Lubos Dolezel Restructured source tree to prepare for merge with the "darling" repo 10y ago
f228ae16
  1/*
  2 * Copyright (c) 2002 Apple Computer, Inc. All rights reserved.
  3 *
  4 * @APPLE_LICENSE_HEADER_START@
  5 * 
  6 * The contents of this file constitute Original Code as defined in and
  7 * are subject to the Apple Public Source License Version 1.1 (the
  8 * "License").  You may not use this file except in compliance with the
  9 * License.  Please obtain a copy of the License at
 10 * http://www.apple.com/publicsource and read it before using this file.
 11 * 
 12 * This Original Code and all software distributed under the License are
 13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
 17 * License for the specific language governing rights and limitations
 18 * under the License.
 19 * 
 20 * @APPLE_LICENSE_HEADER_END@
 21 */
 22/*******************************************************************************
 23*                                                                              *
 24*     File sqrt.c,                                                             *
 25*     Function square root (IEEE-754).                                         *
 26*                                                                              *
 27*     Implementation of square root based on the approximation provided by     *
 28*     Taligent, Inc. who received it from IBM.                                 *
 29*                                                                              *
 30*     Copyright � 1996-2001 Apple Computer, Inc.  All rights reserved.         *
 31*                                                                              *
 32*     Modified and ported by A. Sazegari, started on June 1996.                *
 33*     Modified and ported by Robert A. Murley (ram) for Mac OS X.              *
 34*                                                                              *
 35*     A MathLib v4 file.                                                       *
 36*                                                                              *
 37*     This file was received from Taligent which modified the original IBM     *
 38*     sources.  It returns the square root of its double argument, using       *
 39*     Markstein algorithm and requires MAF code generation.                    *
 40*                                                                              *
 41*     The algorithm uses Newton�s method to calculate the IEEE-754 correctly   *
 42*     rounded double square root, starting with 8 bit estimate for:            *
 43*     g � �x and y � 1/2�x.  Using MAF instructions, each iteration refines    *
 44*     the original approximations g and y with rounding mode set to nearest.   *
 45*     The final step calculates g with the caller�s rounding restored.  This   *
 46*     in turn guarantees proper IEEE rounding and exceptions. INEXACT is the   *
 47*     only possible exception raised in this calculation.  Initial guesses     *
 48*     for g and y are determined from argument x via table lookup into the     *
 49*     array SqrtTable[256].                                                    *
 50*                                                                              *
 51*     Note: NaN is set computationally.  It will be changed to the proper      *
 52*     NaN code later.                                                          *
 53*                                                                              *
 54*     April     14 1997: added this file to mathlib v3 and included mathlib�s  *
 55*                        nan code.                                             *
 56*     July      23 2001: replaced __setflm with FEGETENVD/FESETENVD.           *
 57*     August    28 2001: added #ifdef __ppc__.                                 *
 58*                        replaced DblInHex typedef with hexdouble.             *
 59*                        used standard exception symbols from fenv.h.          *
 60*     September 09 2001: added more comments.                                  *
 61*     September 10 2001: added macros to detect PowerPC and correct compiler.  *
 62*     September 18 2001: added <CoreServices/CoreServices.h> to get <fp.h>.    *
 63*     October   08 2001: removed <CoreServices/CoreServices.h>.                *
 64*                        changed SqrtTable to const private_extern             *
 65*                        changed compiler errors to warnings.                  *
 66*     November  06 2001: commented out warning about Intel architectures.      *
 67*     November  19 2001: <scp> Added single precision implementation.          *
 68*                                                                              *
 69*     W A R N I N G:                                                           *
 70*     These routines require a 64-bit double precision IEEE-754 model.         *
 71*     They are written for PowerPC only and are expecting the compiler         *
 72*     to generate the correct sequence of multiply-add fused instructions.     *
 73*                                                                              *
 74*     These routines are not intended for 32-bit Intel architectures.          *
 75*                                                                              *
 76*     A version of gcc higher than 932 is required.                            *
 77*                                                                              *
 78*     GCC compiler options:                                                    *
 79*            optimization level 3 (-O3)                                        *
 80*            -fschedule-insns -finline-functions -funroll-all-loops            *
 81*                                                                              *
 82*******************************************************************************/
 83
 84#include     "fenv_private.h"
 85#include     "fp_private.h"
 86#include     "math.h"
 87#include <System/ppc/cpu_capabilities.h>
 88
 89inline static uint32_t __attribute__((always_inline))
 90_get_cpu_capabilities(void) 
 91{
 92        uint32_t caps;
 93        asm("lwz %0, %1(0)" : "=r" (caps) : "I" (_COMM_PAGE_CPU_CAPABILITIES));
 94        return caps;
 95}
 96
 97#define      twoTo512           1.34078079299425971e154    // 2**512    
 98#define      twoToMinus256      8.636168555094444625e-78   // 2**-256
 99#define      upHalfOfAnULP      0.500000000000000111       // 0x1.0000000000001p-1
100
101#define      SQRT_NAN           "1"
102
103/*******************************************************************************
104*     SqrtTable[256] contains initial estimates for the high significand bits  *
105*     of sqrt and correction factor.  This table is  also used by the IBM      *
106*     arccos and arcsin functions.                                             *
107*******************************************************************************/
108
109__private_extern__
110const unsigned short SqrtTable[256] =
111      {
112      27497, 27752, 28262, 28517, 28772, 29282, 29537, 29792, 30302, 30557, 31067, 31322,
113      31577, 32088, 32343, 32598, 33108, 33363, 33618, 34128, 34384, 34639, 35149, 35404,
114      35659, 35914, 36425, 36680, 36935, 37446, 37701, 37956, 38211, 38722, 38977, 39232,
115      39487, 39998, 40253, 40508, 40763, 41274, 41529, 41784, 42040, 42550, 42805, 43061,
116      43316, 43571, 44082, 44337, 44592, 44848, 45103, 45358, 45869, 46124, 46379, 46635,
117      46890, 47401, 47656, 47911, 48167, 48422, 48677, 48933, 49443, 49699, 49954, 50209,
118      50465, 50720, 50976, 51231, 51742, 51997, 52252, 52508, 52763, 53019, 53274, 53529,
119      53785, 54296, 54551, 54806, 55062, 55317, 55573, 55828, 56083, 56339, 56594, 56850,
120      57105, 57616, 57871, 58127, 58382, 58638, 58893, 59149, 59404, 59660, 59915, 60170,
121      60426, 60681, 60937, 61192, 61448, 61703, 61959, 62214, 62470, 62725, 62981, 63236,
122      63492, 63747, 64003, 64258, 64514, 64769, 65025, 65280, 511,   510,   764,   1018,
123      1272,  1526,  1780,  2034,  2288,  2542,  2796,  3050,  3305,  3559,  3813,  4067,
124      4321,  4576,  4830,  5084,  5338,  5593,  5847,  6101,  6101,  6356,  6610,  6864,
125      7119,  7373,  7627,  7882,  8136,  8391,  8391,  8645,  8899,  9154,  9408,  9663,
126      9917,  10172, 10172, 10426, 10681, 10935, 11190, 11444, 11699, 11699, 11954, 12208,
127      12463, 12717, 12972, 13226, 13226, 13481, 13736, 13990, 14245, 14245, 14500, 14754,
128      15009, 15264, 15518, 15518, 15773, 16028, 16282, 16537, 16537, 16792, 17047, 17301,
129      17556, 17556, 17811, 18066, 18320, 18575, 18575, 18830, 19085, 19339, 19339, 19594,
130      19849, 20104, 20104, 20359, 20614, 20868, 21123, 21123, 21378, 21633, 21888, 21888,
131      22143, 22398, 22653, 22653, 22907, 23162, 23417, 23417, 23672, 23927, 23927, 24182,
132      24437, 24692, 24692, 24947, 25202, 25457, 25457, 25712, 25967, 25967, 26222, 26477,
133      26732, 26732, 26987, 27242 };
134
135//    There are flag and rounding errors being generated in this file
136
137static const double Two911      = 0x1.0p-911; // HEXDOUBLE( 0x07000000, 0x00000000 );
138static const hexdouble infinity = HEXDOUBLE( 0x7ff00000, 0x00000000 );
139static const double twoTo128    = 0x1.0p+128; // 0.340282366920938463463e39;
140static const double twoToM128   = 0x1.0p-128; // 0.293873587705571876992e-38;
141
142double __sqrt ( double x );
143double __sqrt ( double x )
144{
145      register int index;
146      hexdouble xInHex, yInHex, gInHex;
147      register double OldEnvironment, g, y, y2, d, e;
148      register uint32_t xhead, ghead, yhead;
149      
150      register double FPR_z, FPR_Two911, FPR_TwoM128, FPR_Two128, FPR_inf, FPR_HalfULP;
151      
152      xInHex.d = x;					FPR_z = 0.0;
153      FPR_inf = infinity.d;				FPR_Two911 = Two911;
154      FPR_TwoM128 = twoToM128;				FPR_Two128 = twoTo128;
155      gInHex.i.lo = 0u;				yInHex.i.lo = 0u;
156      FPR_HalfULP = upHalfOfAnULP;
157      
158      FEGETENVD( OldEnvironment );               	// save environment, set default
159      
160      __NOOP;
161      __ENSURE( FPR_TwoM128, FPR_Two128, FPR_z );
162      
163       FESETENVD( FPR_z );
164      __ENSURE( FPR_HalfULP, FPR_inf, FPR_Two911 );
165      
166/*******************************************************************************
167*     Special case for GPUL: storing and loading floats avoids L/S hazard  
168*******************************************************************************/
169      __NOOP;
170      if (likely(  FPR_TwoM128 < x &&  x < FPR_Two128 )) // Can float hold initial guess?
171      {
172            hexsingle GInHex, YInHex;
173            register uint32_t GPR_t, GPR_foo;
174
175/*******************************************************************************
176*     Calculate initial estimates for g and y from x and SqrtTable[].          *
177*******************************************************************************/
178            
179            xhead = xInHex.i.hi;			// high 32 bits of x
180            index = ( xhead >> 13 ) & 0xffu; 		// table index
181            GPR_t = SqrtTable[index];
182  
183            // Form *single* precision exponent estimate from double argument:
184            // (((((xhead - bias1024) >> 1) + bias128) << 3) & 0x7f800000) =
185            // (((((xhead - bias1024 + bias128 + bias128) >> 1) << 3) & 0x7f800000))
186            
187            ghead = ( ( xhead + 0x07f00000 + 0x07f00000 - 0x3ff00000 ) << 2 ) & 0x7f800000;
188            GInHex.lval = ghead + ( ( 0xff00u & GPR_t ) << 7 );
189
190            // Force GInHex.lval to memory. Then load it into g in the FPU register file
191            // on the *following* cycle. This avoids a Store/Load hazard.
192            asm volatile ( "add %0, %1, %2" : "=r" (GPR_foo) : "b" (&GInHex), "b" (&YInHex) : "memory" ); /* NOOP */
193            g = GInHex.fval;
194
195            yhead = 0x7e000000u - ghead;
196            YInHex.lval = yhead + ( ( 0xffu & GPR_t ) << 15 ); 
197            
198            // Force YInHex.lval to memory. Then load it into y in the FPU register file
199            // on the *following* cycle. This avoids a Store/Load hazard.
200            asm volatile ( "add %0, %1, %2" : "=r" (GPR_foo) : "b" (&GInHex), "b" (&YInHex) : "memory" ); /* NOOP */
201            __NOOP;
202            __NOOP;
203            y = YInHex.fval;
204            
205/*******************************************************************************
206*     Iterate to refine both g and y.                                          *
207*******************************************************************************/
208
209            d = __FNMSUB( g, g, x );			y2 = y + y;
210            
211            g = __FMADD(y, d, g );                 				//   16-bit g
212            
213            e = __FNMSUB( y, g, FPR_HalfULP );		d = __FNMSUB( g, g, x );
214            
215            y = __FMADD( e, y2, y );             				//   16-bit y
216            
217            y2 = y + y;					g = __FMADD(y, d, g);   //   32-bit g
218            
219            e = __FNMSUB( y, g, FPR_HalfULP );		d = __FNMSUB( g, g, x );
220            
221            y = __FMADD( e, y2, y );                  				//   32-bit y
222            
223            y2 = y + y;                  		g = __FMADD(y, d, g );  //   64-bit g before rounding
224            
225            e = __FNMSUB( y, g, FPR_HalfULP );		d = __FNMSUB( g, g, x );
226                                
227            y = __FMADD( e, y2, y );                  				//   64-bit y
228            
229             FESETENVD( OldEnvironment );   					//   restore caller's environment
230            
231            __NOOP;
232            return (  __FMADD( y, d, g ) );            				//   final step
233      }
234        
235      if (likely( FPR_z < x && x < FPR_inf )) 
236      {
237
238/*******************************************************************************
239*     First and most common section: argument > 2.0^(-911), about 5.77662E-275.*
240*******************************************************************************/
241            
242            if (likely( FPR_Two911 < x )) 
243            {
244                  register uint32_t GPR_t;
245
246/*******************************************************************************
247*     Calculate initial estimates for g and y from x and SqrtTable[].          *
248*******************************************************************************/
249                  xhead = xInHex.i.hi;                         	// high 32 bits of x
250                  index = ( xhead >> 13 ) & 0xffu; 		// table index
251                  GPR_t = SqrtTable[index];
252                  
253                  ghead = ( ( xhead + 0x3ff00000 ) >> 1 ) & 0x7ff00000;
254                  yhead = 0x7fc00000u - ghead;
255                
256                  gInHex.i.hi = ghead + ( ( 0xff00u & GPR_t ) << 4 );
257                  g = gInHex.d;
258                
259                  yInHex.i.hi = yhead + ( ( 0xffu & GPR_t ) << 12 ); 
260                  y = yInHex.d;
261/*******************************************************************************
262*     Iterate to refine both g and y.                                          *
263*******************************************************************************/
264
265                  d = __FNMSUB( g, g, x );		y2 = y + y;
266                  
267                  g = __FMADD(y, d, g );                 			//   16-bit g
268                  
269                  e = __FNMSUB( y, g, FPR_HalfULP );	d = __FNMSUB( g, g, x );
270                  
271                  y = __FMADD( e, y2, y );             				//   16-bit y
272                  
273                  y2 = y + y;				g = __FMADD(y, d, g);   //   32-bit g
274                  
275                  e = __FNMSUB( y, g, FPR_HalfULP );	d = __FNMSUB( g, g, x );
276                  
277                  y = __FMADD( e, y2, y );                  			//   32-bit y
278                  
279                  y2 = y + y;                  		g = __FMADD(y, d, g );  //   64-bit g before rounding
280                  
281                  e = __FNMSUB( y, g, FPR_HalfULP );	d = __FNMSUB( g, g, x );
282                  			
283                  y = __FMADD( e, y2, y );                  			//   64-bit y
284                  
285                  FESETENVD( OldEnvironment );   				//   restore caller's environment
286                  return ( __FMADD( y, d, g ) );            			//   final step
287            }
288
289/*******************************************************************************
290*     Second section: 0.0 < argument < 2.0^(-911) which is about 5.77662E-275. *
291*     Identical to the previous segment aside from 2^512 scale factor.         *
292*******************************************************************************/
293            else
294            { 
295                  xInHex.d = x * twoTo512;         //   scale up by 2^512
296		  __NOOP;
297		  __NOOP;
298		  __NOOP;
299                  xhead = xInHex.i.hi;
300
301/*******************************************************************************
302*     Calculate initial estimates for g and y from x and SqrtTable[].          *
303*******************************************************************************/
304
305                  ghead = ( ( xhead + 0x3ff00000 ) >> 1) & 0x7ff00000;
306                  index = ( xhead >> 13) & 0xffu; // table index
307                  yhead = 0x7fc00000u - ghead;
308                  gInHex.i.hi = ghead + ( ( 0xff00u & SqrtTable[index] ) << 4 );
309                  yInHex.i.hi = yhead + ( ( 0xffu & SqrtTable[index] ) << 12 );
310                  x = xInHex.d;
311                  g = gInHex.d;
312                  y = yInHex.d;
313
314/*******************************************************************************
315*     Iterate to refine both g and y.                                          *
316*******************************************************************************/
317            
318                  d = __FNMSUB( g, g, x );		y2 = y + y;
319                  
320                  g = __FMADD(y, d, g );                 			 //   16-bit g
321                  
322                  e = __FNMSUB( y, g, FPR_HalfULP );	d = __FNMSUB( g, g, x );
323                  
324                  y = __FMADD( e, y2, y );             				//   16-bit y
325                  
326                  y2 = y + y;				g = __FMADD(y, d, g);   //   32-bit g
327                  
328                  e = __FNMSUB( y, g, FPR_HalfULP );	d = __FNMSUB( g, g, x );
329                  
330                  y = __FMADD( e, y2, y );                  			//   32-bit y
331                  
332                  y2 = y + y;                  		g = __FMADD(y, d, g );  //   64-bit g before rounding
333                  
334                  e = __FNMSUB( y, g, FPR_HalfULP );	d = __FNMSUB( g, g, x );
335                  			
336                  d *= twoToMinus256;			g *= twoToMinus256;     //   undo scaling
337                  
338                  y = __FMADD( e, y2, y );                  			//   64-bit y
339                  
340                  FESETENVD( OldEnvironment );   				//   restore caller's environment
341                  return ( __FMADD( y, d, g ) );            			//   final step
342            }
343      }
344
345/*******************************************************************************
346*     Fourth section: special cases: argument is +INF, NaN, +/-0.0, or <0.     *
347*******************************************************************************/
348   
349      FESETENVD( OldEnvironment );       		//   restore caller's environment
350
351      if ( x < FPR_z && x == x  )			// < 0.0 and not NAN 
352      {
353            hexdouble env;
354            x = nan ( SQRT_NAN );
355            env.d = OldEnvironment;
356	    __NOOP;
357	    __NOOP;
358	    __NOOP;
359      
360            env.i.lo |= SET_INVALID;
361            FESETENVD_GRP( env.d );         		//   restore caller's environment
362            return ( x );               		// return NAN
363      }
364      else return ( x );				// NANs and +/-0.0
365}
366
367/*
368By pushing the software sqrt implementation into its own routine, we avoid the PIC setup and simply have:
369_sqrt:
37000000340        lwz     r0,0x8020(0)
37100000344        andis.  r2,r0,0x2000
37200000348        bne     0x350
3730000034c        b       ___sqrt
37400000350        fsqrt   f1,f1
37500000354        blr
376Perhaps gcc can someday be coaxed to eliminate the extra branch?
377Matt suggests "use -freorder-blocks", we'll pick that up for the next release.
378*/
379double sqrt(double x)
380{
381      if ((_get_cpu_capabilities() & kHasFsqrt))
382	return __fsqrt( x );
383      else return __sqrt( x );
384}
385	    
386