src/libm/Source/PowerPC/FastSinCos.c at fixPythonPipStalling

overby.me / darling-nix
fork atom
this repo has no description
fork atom
darling-nix / src / libm / Source / PowerPC / FastSinCos.c
at fixPythonPipStalling 511 lines 24 kB view raw
wrap content
Lubos Dolezel Restructured source tree to prepare for merge with the "darling" repo 10y ago
f228ae16
  1/*
  2 * Copyright (c) 2002 Apple Computer, Inc. All rights reserved.
  3 *
  4 * @APPLE_LICENSE_HEADER_START@
  5 * 
  6 * The contents of this file constitute Original Code as defined in and
  7 * are subject to the Apple Public Source License Version 1.1 (the
  8 * "License").  You may not use this file except in compliance with the
  9 * License.  Please obtain a copy of the License at
 10 * http://www.apple.com/publicsource and read it before using this file.
 11 * 
 12 * This Original Code and all software distributed under the License are
 13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
 17 * License for the specific language governing rights and limitations
 18 * under the License.
 19 * 
 20 * @APPLE_LICENSE_HEADER_END@
 21 */
 22/*******************************************************************************
 23*                                                                              *
 24*    FastSinCos.c                                                              *
 25*                                                                              *
 26*    Double precision Sine and Cosine.                                         *
 27*                                                                              *
 28*    Copyright � 1997-2001 by Apple Computer, Inc. All rights reserved.        *
 29*                                                                              *
 30*    Written by A. Sazegari, started on June 1997.                             *
 31*    Modified and ported by Robert A. Murley (ram) for Mac OS X.               *
 32*                                                                              *
 33*    A MathLib v4 file.                                                        *
 34*                                                                              *
 35*    Based on the trigonometric functions from IBM/Taligent.                   *
 36*                                                                              *
 37*    November  06 2001: commented out warning about Intel architectures.       *
 38*    July      20 2001: replaced __setflm with FEGETENVD/FESETENVD.            *
 39*                       replaced DblInHex typedef with hexdouble.              *
 40*    September 07 2001: added #ifdef __ppc__.                                  *
 41*    September 09 2001: added more comments.                                   *
 42*    September 10 2001: added macros to detect PowerPC and correct compiler.   *
 43*    September 18 2001: added <CoreServices/CoreServices.h> to get to <fp.h>   *
 44*                       and <fenv.h>, removed denormal comments.               *
 45*    October   08 2001: removed <CoreServices/CoreServices.h>.                 *
 46*                       changed compiler errors to warnings.                   *
 47*                                                                              *
 48*    These routines have a long double (107-bits) argument reduction to        *
 49*    better match their long double counterpart.                               *
 50*                                                                              *
 51*    W A R N I N G:                                                            *
 52*    These routines require a 64-bit double precision IEEE-754 model.          *
 53*    They are written for PowerPC only and are expecting the compiler          *
 54*    to generate the correct sequence of multiply-add fused instructions.      *
 55*                                                                              *
 56*    These routines are not intended for 32-bit Intel architectures.           *
 57*                                                                              *
 58*    A version of gcc higher than 932 is required.                             *
 59*                                                                              *
 60*    GCC compiler options:                                                     *
 61*          optimization level 3 (-O3)                                          *
 62*          -fschedule-insns -finline-functions -funroll-all-loops              *
 63*                                                                              *
 64*******************************************************************************/
 65
 66#include    "fenv_private.h"
 67#include    "fp_private.h"
 68#include    "math.h"
 69
 70#define     TRIG_NAN          "33"
 71
 72/*******************************************************************************
 73*      Floating-point constants.                                               *
 74*******************************************************************************/
 75
 76static const double kPiScale42 = 1.38168706094305449e13;   // 0x1.921fb54442d17p43
 77static const double kPiScale53 = 2.829695100811376e16;     // 0x1.921fb54442d18p54
 78static const double piOver4 = 0.785398163397448390;        // 0x1.921fb54442d19p-1
 79static const double piOver2 = 1.570796326794896619231322;  // 0x1.921fb54442d18p0
 80static const double piOver2Tail = 6.1232339957367660e-17;  // 0x1.1a62633145c07p-54
 81static const double twoOverPi = 0.636619772367581382;      // 0x1.45f306dc9c883p-1
 82//static const double k2ToM26 = 1.490116119384765625e-8;     // 0x1.0p-26;
 83static const double kMinNormal = 0x1.0p-1022;			   // 2.2250738585072014e-308;
 84static const double kRintBig = 2.7021597764222976e16;      // 0x1.8p54
 85static const double kRint = 6.755399441055744e15;          // 0x1.8p52
 86static const hexdouble infinity = HEXDOUBLE(0x7ff00000, 0x00000000);
 87
 88/*******************************************************************************
 89*      Approximation coefficients.                                             *
 90*******************************************************************************/
 91
 92static const double s13 = 1.5868926979889205164e-10;       // 1.0/13!
 93static const double s11 = -2.5050225177523807003e-8;       // -1.0/11!
 94static const double s9 =  2.7557309793219876880e-6;        // 1.0/9!
 95static const double s7 = -1.9841269816180999116e-4;        // -1.0/7!
 96static const double s5 =  8.3333333332992771264e-3;        // 1.0/5!
 97static const double s3 = -0.16666666666666463126;          // 1.0/3!
 98static const double c14 = -1.138218794258068723867e-11;    // -1.0/14!
 99static const double c12 = 2.087614008917893178252e-9;      // 1.0/12!
100static const double c10 = -2.755731724204127572108e-7;     // -1.0/10!
101static const double c8 =  2.480158729870839541888e-5;      // 1.0/8!
102static const double c6 = -1.388888888888735934799e-3;      // -1.0/6!
103static const double c4 =  4.166666666666666534980e-2;      // 1.0/4!
104static const double c2 = -.5;                              // -1.0/2!
105
106double sin ( double x ) 
107{
108      register double absOfX, intquo, arg, argtail, xSquared, xThird, xFourth, temp1, temp2, result;
109      register uint32_t ltable;
110      hexdouble      z, OldEnvironment;
111      
112      register double FPR_env, FPR_z, FPR_Min, FPR_pi4, FPR_piScale;
113      register double FPR_t, FPR_inf, FPR_pi53, FPR_2divPi, FPR_PiDiv2, FPR_PiDiv2Tail, FPR_kRintBig, FPR_kRint;
114      register uint32_t GPR_f;
115
116      FEGETENVD( FPR_env );                     	// save env, set default
117      FPR_z = 0.0;				
118      
119      FPR_pi4 = piOver4;
120      absOfX = __FABS ( x ); 		 		__ENSURE( FPR_z, FPR_z, FPR_pi4 ); // ensure z and pi4 are in registers
121      
122      FESETENVD( FPR_z );
123           
124      if ( absOfX < FPR_pi4 ) 				// |x| < �/4
125      {
126            if (likely( absOfX != FPR_z ))
127            {
128                register double FPR_s3, FPR_s5, FPR_s7, FPR_s9, FPR_s11, FPR_s13;
129                    
130/*******************************************************************************
131*      at this point, x is normal with magnitude between 0 and �/4.            *
132*******************************************************************************/
133                // sin polynomial approximation
134                
135                xSquared = __FMUL( x, x );  		
136                FPR_Min = kMinNormal;
137                __ENSURE( FPR_z, FPR_z, FPR_Min ); 	// ensure FPR_Min is loaded into register
138                
139                FPR_s9 = s9; 			 	FPR_s13 = s13;			
140                
141                FPR_s7 = s7; 			 	FPR_s11 = s11;
142                
143                xFourth = __FMUL( xSquared, xSquared ); xThird = __FMUL( x, xSquared );
144                FPR_s5 = s5; 	                
145
146                FPR_s3 = s3;
147                temp1 = __FMADD( s13, xFourth, s9 );  	temp2 = __FMADD( s11, xFourth, s7 );
148
149                temp1 = __FMADD( temp1, xFourth, s5 );  temp2 = __FMADD( temp2, xFourth, s3 );
150    
151                temp1 = __FMADD( temp1, xSquared, temp2 ); result = __FMADD( temp1, xThird, x );
152                
153                FESETENVD( FPR_env );        		// restore caller's mode
154
155                if (unlikely( __FABS( result ) < FPR_Min ))
156                    __PROG_UF_INEXACT( FPR_Min );
157                else
158                    __PROG_INEXACT( FPR_pi4 );
159    
160                return ( result );
161            } 
162            else 
163            {
164                FESETENVD( FPR_env );       		// restore caller's mode
165                return x; 				// +0 -0 preserved
166            }
167        }
168
169      
170      if (unlikely( x != x ))                           // x is a NaN
171      {
172            FESETENVD( FPR_env );  // restore caller's mode
173            return ( x );
174      }
175
176/*******************************************************************************
177*      x has magnitude > �/4.                                                  *
178*******************************************************************************/
179      FPR_piScale = kPiScale42;
180      FPR_piScale = __FMADD( FPR_z, FPR_z, FPR_piScale );
181      FPR_inf = infinity.d; 		 		FPR_pi53 = kPiScale53;
182      FPR_2divPi = twoOverPi; 	 			FPR_PiDiv2 = piOver2;
183      FPR_PiDiv2Tail = piOver2Tail;	 		FPR_kRintBig = kRintBig;
184      FPR_kRint = kRint;
185      
186      if (unlikely( absOfX > FPR_piScale )) 
187      {            
188/*******************************************************************************
189*      |x| is huge or infinite.                                                *
190*******************************************************************************/
191            if ( absOfX == FPR_inf )  
192            {                               	  	// infinite case is invalid
193                  OldEnvironment.d = FPR_env;
194				  __NOOP;
195				  __NOOP;
196				  __NOOP;
197                  OldEnvironment.i.lo |= SET_INVALID;
198                  FESETENVD_GRP( OldEnvironment.d );  	// restore caller's mode
199                  return ( nan ( TRIG_NAN ) );    	// return NaN
200            }
201            
202            while ( absOfX > FPR_pi53 )  
203            {                               	 	// loop to reduce x below
204                  intquo = __FMUL( x, FPR_2divPi );	// �*2^53 in magnitude
205                  FPR_t = __FNMSUB( intquo, FPR_PiDiv2, x );
206                  x = __FNMSUB( intquo, FPR_PiDiv2Tail, FPR_t );
207                  absOfX = __FABS ( x ) ;
208            }
209
210/*******************************************************************************
211*     final reduction of x to magnitude between 0 and 2*�.                     *
212*******************************************************************************/
213            FPR_t = __FMADD( x, FPR_2divPi, FPR_kRintBig );
214            intquo = FPR_t - FPR_kRintBig;
215            FPR_t = __FNMSUB( intquo, FPR_PiDiv2, x );
216            x = __FNMSUB( intquo, FPR_PiDiv2Tail, FPR_t );
217            absOfX = __FABS( x );
218        }
219      
220/*******************************************************************************
221*     |x| < pi*2^42: further reduction is probably necessary.  A double-double *
222*     reduced argument is determined ( arg:argtail ) .  It is possible that x  *
223*     has been reduced below pi/4 in magnitude, but x is definitely nonzero    *
224*     and safely in the normal range.                                          *
225*******************************************************************************/
226      // find integer quotient of x/(�/2)
227      FPR_t = __FMADD( x, FPR_2divPi, FPR_kRint ); 	intquo = FPR_t - FPR_kRint; 
228      z.d = FPR_t;
229      
230      FPR_t = __FNMSUB( intquo, FPR_PiDiv2, x );  	arg = __FNMSUB( intquo, FPR_PiDiv2Tail, FPR_t );
231      xSquared = __FMUL( arg, arg );		  	FPR_t -= arg;
232      
233      GPR_f = z.i.lo;
234      xFourth = __FMUL( xSquared, xSquared );  		argtail = __FNMSUB( intquo, FPR_PiDiv2Tail, FPR_t );
235      
236/*******************************************************************************
237*     multiple of �/2 ( mod 4)  determines approx used and sign of result.     *
238*******************************************************************************/
239      
240      ltable = GPR_f & FE_ALL_RND;
241      
242      if ( ltable & 0x1u )  
243      {                                    		// argument closest to ��/2
244/*******************************************************************************
245*     use cosine approximation.                                                *
246*******************************************************************************/
247            register double FPR_c2, FPR_c4, FPR_c6, FPR_c8, FPR_c10, FPR_c12, FPR_c14;
248            
249            FPR_c10 = c10; 				FPR_c14 = c14;
250            FPR_c8 = c8; 			 	FPR_c12 = c12;
251            
252            temp1 = __FMADD( FPR_c14, xFourth, FPR_c10 ); temp2 = __FMADD( FPR_c12, xFourth, FPR_c8);
253            FPR_c6 = c6; 			
254            
255            FPR_c4 = c4;
256            temp1 = __FMADD( temp1, xFourth, FPR_c6 ); 	temp2 = __FMADD( temp2, xFourth, FPR_c4);
257            
258            FPR_c2 = c2;
259            temp1 = __FMADD( temp1, xFourth, FPR_c2 );
260            
261            temp1 = __FMADD( xSquared, temp2, temp1 );
262            temp1 = __FMSUB( arg, temp1, argtail );	// second-order correction
263            
264            if ( ltable < 2 )                   	// adjust sign of result
265                  result = __FMADD( arg, temp1, 1.0 ); 	// positive
266            else 
267            {
268                  arg = - arg;
269                  result =__FMSUB( arg, temp1, 1.0 ); 	// negative
270            }
271            FESETENVD( FPR_env );        		// restore caller's mode
272            __PROG_INEXACT( FPR_PiDiv2 );
273      } 
274      else 
275      {
276/*******************************************************************************
277*     use sine approximation.                                                  *
278*******************************************************************************/
279            register double FPR_s3, FPR_s5, FPR_s7, FPR_s9, FPR_s11, FPR_s13;
280            
281            FPR_s9 = s9; 			 	FPR_s13 = s13;
282            FPR_s7 = s7; 			 	FPR_s11 = s11;
283            
284            temp1 = __FMADD( FPR_s13, xFourth, FPR_s9 );temp2 = __FMADD( FPR_s11, xFourth, FPR_s7 );
285            FPR_s5 = s5; 			
286            
287            FPR_s3 = s3;
288            temp1 = __FMADD( temp1, xFourth, FPR_s5 );	temp2 = __FMADD( temp2, xFourth, FPR_s3 );
289            
290            temp1 = __FMADD( temp1, xSquared, temp2 );  xThird = __FMUL( arg, xSquared );
291            
292            temp1 = __FMADD( temp1, xThird, argtail ); 	// second-order correction
293
294            if ( ltable < 2 )                    // adjust sign of final result
295                  result = arg + temp1 ;         // positive
296            else 
297			{
298                  arg = - arg;
299                  result = arg - temp1;          // negative
300			}
301			
302            FESETENVD( FPR_env );        		// restore caller's mode
303            __PROG_INEXACT( FPR_PiDiv2 );
304      }
305      
306      return ( result ) ;
307}
308
309/*******************************************************************************
310*     Cosine section.                                                          *
311*******************************************************************************/
312double cos ( double x ) 
313{
314      register double absOfX, intquo, arg, argtail, xSquared, xThird, xFourth,
315                      temp1, temp2, result;
316      register uint32_t iquad;
317      hexdouble z, OldEnvironment;
318      
319      register double FPR_env, FPR_z, FPR_pi4, FPR_piScale;
320      register double FPR_t, FPR_inf, FPR_pi53, FPR_2divPi, FPR_PiDiv2, FPR_PiDiv2Tail, FPR_kRintBig, FPR_kRint;
321      register uint32_t GPR_f;
322
323      FEGETENVD( FPR_env );                     	// save env, set default
324      FPR_z = 0.0;				
325      
326      FPR_pi4 = piOver4;
327      absOfX = __FABS ( x ); 		 		__ENSURE( FPR_z, FPR_z, FPR_pi4 ); // ensure z and pi4 are in registers
328      
329      FESETENVD( FPR_z );
330           
331      if ( absOfX < FPR_pi4 )				// |x| < �/4
332      {
333            if (likely( absOfX != FPR_z )) 
334            {
335                // cos polynomial approximation
336                register double FPR_c2, FPR_c4, FPR_c6, FPR_c8, FPR_c10, FPR_c12, FPR_c14, FPR_One;
337                
338                xSquared = __FMUL( x, x );                     
339                FPR_One = 1.0;
340                __ENSURE( FPR_z, FPR_z, FPR_One );
341
342                FPR_c10 = c10; 				FPR_c14 = c14;
343                
344                FPR_c8 = c8; 			 	FPR_c12 = c12;
345                
346                FPR_c6 = c6; 			 	FPR_c4 = c4;
347
348                xFourth = __FMUL( xSquared, xSquared );
349                __NOOP;
350                FPR_c2 = c2;
351
352                temp1 = __FMADD( FPR_c14, xFourth, FPR_c10 ); 	temp2 = __FMADD( FPR_c12, xFourth, FPR_c8);
353                temp1 = __FMADD( temp1, xFourth, FPR_c6 );	temp2 = __FMADD( temp2, xFourth, FPR_c4);
354                
355                temp1 = __FMADD( temp1, xFourth, FPR_c2 ); 	temp1 = __FMADD( xSquared, temp2, temp1 );
356                result = __FMADD( xSquared, temp1, FPR_One );
357                
358                FESETENVD( FPR_env );        		// restore caller's mode
359                __PROG_INEXACT( FPR_pi4 );
360                        
361                return ( result );
362            } 
363            else 
364            {
365                FESETENVD( FPR_env );       		// restore caller's mode
366                return 1.0; 				
367            }
368      }
369
370      if (unlikely( x != x ))                           // x is a NaN
371      {
372            FESETENVD( FPR_env );        		// restore caller's mode
373            return ( x );
374      }
375
376/*******************************************************************************
377*      x has magnitude > �/4.                                                  *
378*******************************************************************************/
379       FPR_piScale = kPiScale42;
380       FPR_piScale = __FMADD( FPR_z, FPR_z, FPR_piScale );
381       FPR_inf = infinity.d; 		 		FPR_pi53 = kPiScale53;
382       FPR_2divPi = twoOverPi; 	 			FPR_PiDiv2 = piOver2;
383       FPR_PiDiv2Tail = piOver2Tail;	 		FPR_kRintBig = kRintBig;
384       FPR_kRint = kRint;
385
386      if (unlikely( absOfX > FPR_piScale ))  
387
388/*******************************************************************************
389*      |x| is huge or infinite.                                                *
390*******************************************************************************/
391      {
392            if ( absOfX == infinity.d )  
393            {                               		// infinite case is invalid
394                  OldEnvironment.d = FPR_env;
395				  __NOOP;
396				  __NOOP;
397				  __NOOP;
398                  OldEnvironment.i.lo |= SET_INVALID;
399                  FESETENVD_GRP( OldEnvironment.d );  	// restore caller's mode
400                  return ( nan ( TRIG_NAN ) );    	// return NaN
401            }
402            
403            while ( absOfX > FPR_pi53 )  
404            {                               	  	// loop to reduce x below
405                  intquo = __FMUL( x, FPR_2divPi );	// �*2^53 in magnitude
406                  FPR_t = __FNMSUB( intquo, FPR_PiDiv2, x );
407                  x = __FNMSUB( intquo, FPR_PiDiv2Tail, FPR_t );
408                  absOfX = __FABS ( x ) ;
409            }
410            
411/*******************************************************************************
412*     final reduction of x to magnitude between 0 and 2*�.                     *
413*******************************************************************************/
414            FPR_t = __FMADD( x, FPR_2divPi, FPR_kRintBig );
415            intquo = FPR_t - FPR_kRintBig;
416            FPR_t = __FNMSUB( intquo, FPR_PiDiv2, x );
417            x = __FNMSUB( intquo, FPR_PiDiv2Tail, FPR_t );
418            absOfX = __FABS( x );
419	}           
420      
421/*******************************************************************************
422*     |x| < pi*2^42: further reduction is probably necessary.  A double-double *
423*     reduced argument is determined ( arg:argtail ) .  It is possible that x  *
424*     has been reduced below pi/4 in magnitude, but x is definitely nonzero    *
425*     and safely in the normal range.                                          *
426*******************************************************************************/
427
428        // find integer quotient of x/(�/2)
429        FPR_t = __FMADD( x, FPR_2divPi, FPR_kRint );
430            
431        intquo = FPR_t - FPR_kRint; 
432        z.d = FPR_t;
433      
434        FPR_t = __FNMSUB( intquo, FPR_PiDiv2, x );
435      
436        arg = __FNMSUB( intquo, FPR_PiDiv2Tail, FPR_t );
437      
438        xSquared = __FMUL( arg, arg );		  	FPR_t -= arg;
439        GPR_f = z.i.lo;
440      
441        xFourth = __FMUL( xSquared, xSquared );  	argtail = __FNMSUB( intquo, FPR_PiDiv2Tail, FPR_t );
442      
443        iquad = (GPR_f + 1 ) & FE_ALL_RND;        	// iquad = int multiple mod 4
444      
445/*******************************************************************************
446*     multiple of �/2 ( mod 4)  determines approx used and sign of result.     *
447*******************************************************************************/
448
449      if ( iquad & 0x1u)  
450      {                                      		// arg closest to 0 or �
451/*******************************************************************************
452*     use cosine approximation.                                                *
453*******************************************************************************/
454            register double FPR_c2, FPR_c4, FPR_c6, FPR_c8, FPR_c10, FPR_c12, FPR_c14;
455            
456            FPR_c10 = c10; 			 	FPR_c14 = c14;
457            FPR_c8 = c8; 			 	FPR_c12 = c12;
458            
459            temp1 = __FMADD( FPR_c14, xFourth, FPR_c10 ); temp2 = __FMADD( FPR_c12, xFourth, FPR_c8);
460            FPR_c6 = c6; 			
461
462            FPR_c4 = c4;
463            temp1 = __FMADD( temp1, xFourth, FPR_c6 );  temp2 = __FMADD( temp2, xFourth, FPR_c4);
464            
465            FPR_c2 = c2;
466            temp1 = __FMADD( temp1, xFourth, FPR_c2 );  temp1 = __FMADD( xSquared, temp2, temp1 );
467            
468            temp1 = __FMSUB( arg, temp1, argtail );    	// second-order correction
469            if ( iquad < 2 )                       	// adjust sign of result
470                  result = __FMADD( arg, temp1, 1.0 );  // positive
471            else 
472            {
473                  arg = - arg;
474                  result =__FMSUB( arg, temp1, 1.0 );	// negative
475            }
476      }
477      else 
478      {
479/*******************************************************************************
480*     use sine approximation.                                                  *
481*******************************************************************************/
482            register double FPR_s3, FPR_s5, FPR_s7, FPR_s9, FPR_s11, FPR_s13;
483            
484            FPR_s9 = s9; 			 	FPR_s13 = s13;
485            FPR_s7 = s7; 			 	FPR_s11 = s11;
486            
487            temp1 = __FMADD( FPR_s13, xFourth, FPR_s9 ); temp2 = __FMADD( FPR_s11, xFourth, FPR_s7 );
488            FPR_s5 = s5; 			
489            
490            FPR_s3 = s3;
491            temp1 = __FMADD( temp1, xFourth, FPR_s5 ); 	temp2 = __FMADD( temp2, xFourth, FPR_s3 );
492            
493            temp1 = __FMADD( temp1, xSquared, temp2 );  xThird = __FMUL( arg, xSquared );
494            
495            temp1 = __FMADD( temp1, xThird, argtail ); 	// second-order correction
496
497            if ( iquad < 2 )                      // adjust sign of result
498                  result = temp1 + arg;
499            else 
500			{
501                  arg = - arg;
502                  result = arg - temp1;
503			}
504      }
505
506      FESETENVD( FPR_env );        				// restore caller's mode
507      __PROG_INEXACT( FPR_PiDiv2 );
508          
509      return ( result ) ;
510}
511