src/libm/Source/PowerPC/asinacos.c at fixPythonPipStalling

overby.me / darling-nix
fork atom
this repo has no description
fork atom
darling-nix / src / libm / Source / PowerPC / asinacos.c
at fixPythonPipStalling 581 lines 25 kB view raw
wrap content
Lubos Dolezel Restructured source tree to prepare for merge with the "darling" repo 10y ago
f228ae16
  1/*
  2 * Copyright (c) 2002 Apple Computer, Inc. All rights reserved.
  3 *
  4 * @APPLE_LICENSE_HEADER_START@
  5 * 
  6 * The contents of this file constitute Original Code as defined in and
  7 * are subject to the Apple Public Source License Version 1.1 (the
  8 * "License").  You may not use this file except in compliance with the
  9 * License.  Please obtain a copy of the License at
 10 * http://www.apple.com/publicsource and read it before using this file.
 11 * 
 12 * This Original Code and all software distributed under the License are
 13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
 17 * License for the specific language governing rights and limitations
 18 * under the License.
 19 * 
 20 * @APPLE_LICENSE_HEADER_END@
 21 */
 22/*******************************************************************************
 23*                                                                              *
 24*     File asinacos.c,                                                         *
 25*     Implementation of asin and acos for the PowerPC.                         *
 26*                                                                              *
 27*     Copyright � 1991-2001 Apple Computer, Inc.  All rights reserved.         *
 28*                                                                              *
 29*     Written and ported by Robert A. Murley (ram) for Mac OS X.               *
 30*     Modified by A. Sazegari, September 2001.                                 *
 31*                                                                              *
 32*     A MathLib v4 file.                                                       *
 33*                                                                              *
 34*     Based on the trigonometric functions from IBM/Taligent.                  *
 35*                                                                              *
 36*     July	20 2001: replaced __setflm with FEGETENVD/FESETENVD.           *
 37*                        replaced DblInHex typedef with hexdouble.             *
 38*     September 07 2001: added #ifdef __ppc__.                                 *
 39*     September 09 2001: added more comments.                                  *
 40*     September 10 2001: added macros to detect PowerPC and correct compiler.  *
 41*     September 21 2001: added <CoreServices/CoreServices.h> to get to <fp.h>  *
 42*                        and <fenv.h>.                                         *
 43*     September 24 2001: used standard symbols for environment flags.          *
 44*     September 25 2001: added FESETENVD(0.0) at start of acos.                *
 45*     October   08 2001: removed <CoreServices/CoreServices.h>.                *
 46*                        changed compiler errors to warnings.                  *
 47*     November  06 2001: commented out warning about Intel architectures.      *
 48*                                                                              *
 49*     W A R N I N G:                                                           *
 50*     These routines require a 64-bit double precision IEEE-754 model.         *
 51*     They are written for PowerPC only and are expecting the compiler         *
 52*     to generate the correct sequence of multiply-add fused instructions.     *
 53*                                                                              *
 54*     These routines are not intended for 32-bit Intel architectures.          *
 55*                                                                              *
 56*     A version of gcc higher than 932 is required.                            *
 57*                                                                              *
 58*      GCC compiler options:                                                   *
 59*            optimization level 3 (-O3)                                        *
 60*            -fschedule-insns -finline-functions -funroll-all-loops            *
 61*                                                                              *
 62*******************************************************************************/
 63
 64#include    "fp_private.h"
 65#include    "fenv_private.h"
 66#include    "math.h"
 67
 68extern const unsigned short SqrtTable[];
 69
 70// Floating-point constants
 71static const double kPiBy2Tail = 6.1232339957367660e-17;      // 0x1.1a62633145c07p-54
 72static const hexdouble kPiu       = HEXDOUBLE(0x400921fb, 0x54442d18);
 73
 74#define      INVERSE_TRIGONOMETRIC_NAN      "34"
 75
 76/****************************************************************************
 77
 78FUNCTION:  double asin (double x)
 79    
 80DESCRIPTION:  returns the inverse sine of its argument in the range of
 81              -pi/2 to pi/2 (radians).
 82    
 83CALLS:  __FABS
 84    
 85EXCEPTIONS raised:  INEXACT, UNDERFLOW/INEXACT or INVALID.
 86    
 87ACCURACY:  Error is less than an ulp and usually less than half an ulp.
 88           Caller's rounding direction is honored.
 89
 90****************************************************************************/
 91static struct kBlock {
 92    const double k2ToM26    /* = 1.490116119384765625e-8 */ ;     // 0x1.0p-26 */ ;
 93    const double kPiBy2     /* = 1.570796326794896619231322 */ ;  // 0x1.921fb54442d18p0
 94    const double kPiBy2Tail /* = 6.1232339957367660e-17 */ ;      // 0x1.1a62633145c07p-54
 95    const double kMinNormal /* = 2.2250738585072014e-308 */ ;     // 0x1.0p-1022
 96    
 97    const  double a14  /* = 0.03085091303188211304259 */ ;
 98    const  double a13  /* =-0.02425125216179617744614 */ ;
 99    const  double a12  /* = 0.02273334623573271023373 */ ;
100    const  double a11  /* = 0.0002983797813053653120360 */ ;
101    const  double a10  /* = 0.008819738782817955152102 */ ;
102    const  double a9   /* = 0.008130738583790024803650 */ ;
103    const  double a8   /* = 0.009793486386035222577675 */ ;
104    const  double a7   /* = 0.01154901189590083099584 */ ;
105    const  double a6   /* = 0.01396501522140471126730 */ ;
106    const  double a5   /* = 0.01735275722383019335276 */ ;
107    const  double a4   /* = 0.02237215928757573539490 */ ;
108    const  double a3   /* = 0.03038194444121688698408 */ ;
109    const  double a2   /* = 0.04464285714288482921087 */ ;
110    const  double a1   /* = 0.07499999999999990640378 */ ;
111    const  double a0   /* = 0.1666666666666666667191 */ ;
112    const  double aa11 /* = 0.0002983797813053653120360 */ ;
113    const  double aa10 /* = 0.008819738782817955152102 */ ;
114    const  double aa9  /* = 0.008130738583790024803650 */ ;
115    const  double aa8  /* = 0.009793486386035222577675 */ ;
116    const  double aa7  /* = 0.01154901189590083099584 */ ;
117    const  double aa6  /* = 0.01396501522140471126730 */ ;
118    const  double aa5  /* = 0.01735275722383019335276 */ ;
119    const  double aa4  /* = 0.02237215928757573539490 */ ;
120    const  double aa3  /* = 0.03038194444121688698408 */ ;
121    const  double aa2  /* = 0.04464285714288482921087 */ ;
122    const  double aa1  /* = 0.07499999999999990640378 */ ;
123    const  double aa0  /* = 0.1666666666666666667191 */ ;
124} kBlock =
125{
126/* static const double k2ToM26    = */ 0x1.0p-26,					// 1.490116119384765625e-8
127/* static const double kPiBy2     = */ 1.570796326794896619231322,  // 0x1.921fb54442d18p0
128/* static const double kPiBy2Tail = */ 6.1232339957367660e-17,      // 0x1.1a62633145c07p-54
129/* static const double kMinNormal = */ 0x1.0p-1022,					// 2.2250738585072014e-308
130
131/* const static double a14  = */ 0.03085091303188211304259,
132/* const static double a13  = */-0.02425125216179617744614,
133/* const static double a12  = */ 0.02273334623573271023373,
134/* const static double a11  = */ 0.0002983797813053653120360,
135/* const static double a10  = */ 0.008819738782817955152102,
136/* const static double a9   = */ 0.008130738583790024803650,
137/* const static double a8   = */ 0.009793486386035222577675,
138/* const static double a7   = */ 0.01154901189590083099584,
139/* const static double a6   = */ 0.01396501522140471126730,
140/* const static double a5   = */ 0.01735275722383019335276,
141/* const static double a4   = */ 0.02237215928757573539490,
142/* const static double a3   = */ 0.03038194444121688698408,
143/* const static double a2   = */ 0.04464285714288482921087,
144/* const static double a1   = */ 0.07499999999999990640378,
145/* const static double a0   = */ 0.1666666666666666667191,
146/* const static double aa11 = */ 0.0002983797813053653120360,
147/* const static double aa10 = */ 0.008819738782817955152102,
148/* const static double aa9  = */ 0.008130738583790024803650,
149/* const static double aa8  = */ 0.009793486386035222577675,
150/* const static double aa7  = */ 0.01154901189590083099584,
151/* const static double aa6  = */ 0.01396501522140471126730,
152/* const static double aa5  = */ 0.01735275722383019335276,
153/* const static double aa4  = */ 0.02237215928757573539490,
154/* const static double aa3  = */ 0.03038194444121688698408,
155/* const static double aa2  = */ 0.04464285714288482921087,
156/* const static double aa1  = */ 0.07499999999999990640378,
157/* const static double aa0  = */ 0.1666666666666666667191
158};
159
160double asin (double x)
161{
162    hexdouble     cD, yD ,gD , fpenv;
163    double        absx, x2, x3, x4, temp1, temp2, u, v;
164    double        g, y, d, y2, e;
165    register int           index;
166    register uint32_t ghead, yhead, rnddir;
167    
168    register double FPR_env, FPR_z, FPR_half, FPR_one, FPR_k2ToM26, FPR_cD, FPR_yD, FPR_kMinNormal, FPR_kPiBy2, FPR_kPiBy2Tail;
169    register struct kBlock *pK;
170    
171    pK = &kBlock;
172    
173    FPR_z = 0.0;                            FPR_half = 0.5;
174    FPR_k2ToM26 = pK->k2ToM26;				FPR_kMinNormal = pK->kMinNormal;
175    FPR_kPiBy2 = pK->kPiBy2;				FPR_one = 1.0;
176    
177    if (unlikely(x != x))                           		// NaN argument
178        return ( copysign ( nan ( INVERSE_TRIGONOMETRIC_NAN ), x ) );
179        
180    absx = __FABS( x );
181    
182    FEGETENVD ( FPR_env );                   		// save env, set default
183    __ENSURE( FPR_z, FPR_half, FPR_k2ToM26 ); 		__ENSURE( FPR_one, FPR_kMinNormal, FPR_kPiBy2 ); // ensure register loads
184    FPR_cD = __FNMSUB( FPR_half, absx, FPR_half ); 
185    FESETENVD ( FPR_z );
186    
187    x2 = __FMUL( x, x );				// under/overflows are masked
188    cD.d = FPR_cD;
189    					        
190    if ( absx <= FPR_half )
191    {   // |x| <= 0.5
192        if (unlikely( absx < FPR_k2ToM26 ))
193        {   // |x| < 2^(-26)
194            if ( absx == FPR_z )
195            {                              		// x == 0.0 case is exact
196                FESETENVD ( FPR_env );
197                return ( x );
198            }
199            
200            fpenv.d = FPR_env;
201			__NOOP;
202			__NOOP;
203			__NOOP;
204            rnddir = fpenv.i.lo & FE_ALL_RND;   		// caller's rounding direction
205            FPR_yD = x;
206    
207            // Rounding away from zero:  return NextDouble(x,x+x)
208            if (((rnddir == FE_DOWNWARD) && (x < FPR_z)) || ((rnddir == FE_UPWARD) && (x > FPR_z)))
209            {
210                yD.d = FPR_yD;
211                __NOOP;
212                __NOOP;
213                __NOOP;
214                yD.i.lo += 1u;
215                if (yD.i.lo == 0u)
216                    yD.i.hi += 1u;
217                __NOOP;
218                __NOOP;
219                __NOOP;
220                FPR_yD = yD.d;
221                absx = __FABS( FPR_yD );
222            }
223
224            FESETENVD ( FPR_env );               	// restore caller environment
225            __PROG_INEXACT( FPR_kPiBy2 );
226            
227            // Set UNDERFLOW if result is subnormal        
228            if (absx < FPR_kMinNormal)             	// subnormal case
229                __PROG_UF_INEXACT( FPR_kMinNormal );   	//   set UNDERFLOW
230                
231            return ( FPR_yD );                     	// return small result
232        }   // [|x| < 2^(-26)]
233        
234        {
235        register double r0, r1, r2, r3, r4, r5, r6, r7, r8, r9;
236        register double r10, r11, r12, r13, r14;
237        
238        r14 = pK->a14;					r13 = pK->a13;
239        temp1 = __FMADD( r14, x2, r13 );		x4 = __FMUL( x2, x2 );			
240        
241        x3 = __FMUL( x, x2 );				__NOOP;
242        r11 = pK->a11;					r12 = pK->a12;
243        
244        r9 = pK->a9;					r10 = pK->a10;
245        temp1 = __FMADD( temp1, x4, r11 );		__NOOP;
246        
247        temp1 = __FMADD( temp1, x4, r9 );		temp2 = __FMADD( r12, x4, r10 );
248        r7 = pK->a7;					r8 = pK->a8;
249                
250        r5 = pK->a5;					r6 = pK->a6;
251        temp1 = __FMADD( temp1, x4, r7 );		temp2 = __FMADD( temp2, x4, r8 );
252            
253        temp1 = __FMADD( temp1, x4, r5 );		temp2 = __FMADD( temp2, x4, r6 );
254        r3 = pK->a3;					r4 = pK->a4;
255        
256        r1 = pK->a1;					r2 = pK->a2;
257        temp1 = __FMADD( temp1, x4, r3 );		temp2 = __FMADD( temp2, x4, r4 );
258        
259        temp1 = __FMADD( temp1, x4, r1 );		temp2 = __FMADD( temp2, x4, r2 );
260        r0 = pK->a0;					__NOOP;
261        
262        temp2 = __FMADD( temp2, x2, temp1 );	
263        temp1 = __FMADD( x2, temp2, r0 );
264        }
265        
266        FESETENVD ( FPR_env );                   	// restore caller's mode
267        __PROG_INEXACT( FPR_kPiBy2 );
268        
269        return (__FMADD( x3, temp1, x ));
270    }   // [|x| <= 0.5]
271    
272/****************************************************************************
273    For 0.5 < |x| < 1.0, a polynomial approximation is used to estimate
274    the power series resulting from the identity:
275       ArcSin(x) = pi/2 - 2*ArcSin(Sqrt(0.5(1.0 - x))).
276    The square root is evaluated in-line and concurrently with the
277    evaluation of the ArcSin.
278 ***************************************************************************/
279    
280    if (absx < FPR_one)
281    {   // |x| < 1.0
282        register double FPR_gD, FPR_two;
283        register double r0, r1, r2, r3, r4, r5, r6, r7, r8, r9;
284        register double r10, r11, r12, r13, r14;
285        register uint32_t GPR_p, GPR_q, GPR_r, GPR_s, GPR_t;
286        
287        x2 = FPR_cD;	/* fmr */			x4 = __FMUL( FPR_cD, FPR_cD); 	// Was x2*x2;
288        GPR_t = cD.i.hi;				r0 = pK->aa0;
289
290        GPR_s = GPR_t + 0x3ff00000u;			__NOOP;
291        r14 = pK->a14;					r13 = pK->a13;
292
293        r12 = pK->a12;					r11 = pK->a11;
294        gD.i.lo = 0u;        
295
296        index = (GPR_t >> 13) & 0xffu;
297        GPR_p = SqrtTable[index]; // 3 instrs issue
298        
299        temp1 = __FMADD( r13, x4, r11 ); 		temp2 = __FMADD( r14, x4, r12 );
300        r10 = pK->a10;					r9 = pK->a9;
301        
302        GPR_r = ((0xff00u & GPR_p) << 4);		
303        ghead = (GPR_s >> 1) & 0x7ff00000u;
304        gD.i.hi = ghead + GPR_r;
305        
306        r8 = pK->a8;					r7 = pK->aa7;
307        yhead = 0x7fc00000u - ghead;
308
309        yD.i.lo = 0u;					GPR_q = ((0xffu & GPR_p) << 12);
310        yD.i.hi = yhead + GPR_q;			
311        
312        r6 = pK->aa6;					r5 = pK->aa5;
313        temp1 = __FMADD( temp1, x4, r9 ); 		temp2 = __FMADD( temp2, x4, r10 );
314                
315        temp1 = __FMADD( temp1, x4, r7 ); 		temp2 = __FMADD( temp2, x4, r8 );
316        r4 = pK->aa4;					r3 = pK->aa3;
317                                        			
318        r2 = pK->aa2;					r1 = pK->aa1;        
319        temp1 = __FMADD( temp1, x4, r5 );		temp2 = __FMADD( temp2, x4, r6 );
320        
321        temp1 = __FMADD( temp1, x4, r3 ); 		temp2 = __FMADD( temp2, x4, r4 );
322        FPR_two = 2.0;
323        
324        FPR_gD = gD.d;					y = yD.d;
325        temp1 = __FMADD( temp1, x4, r1 );		temp2 = __FMADD( temp2, x4, r2 );
326        
327        d = __FNMSUB( FPR_gD, FPR_gD, FPR_cD );		FPR_kPiBy2Tail = pK->kPiBy2Tail;
328        g = __FMADD( y, d, FPR_gD );			y2 = y + y;                 		// 16-bit g
329        e = __FNMSUB( y, g, FPR_half );			d = __FNMSUB( g, g, FPR_cD );
330        y = __FMADD( e, y2, y);               		temp1 = __FMADD( x2, temp2, temp1 );	// 16-bit y
331        g = __FMADD( y, d, g );                		y2 = y + y;				// 32-bit g
332            
333        e = __FNMSUB( y, g, FPR_half );			d = __FNMSUB( g, g, FPR_cD );
334        y = __FMADD( e, y2, y );               		temp1 = __FMADD( x2, temp1, r0 );	// 32-bit y
335        g = __FMADD( y, d, g);                							// 64-bit Sqrt
336        d = __FNMSUB( g, g, FPR_cD );			u = __FNMSUB( FPR_two, g, FPR_kPiBy2 );	// pi/2 - 2.0*Sqrt (head)
337        d = __FMUL( d, y );         			v = __FNMSUB( FPR_two, g, (FPR_kPiBy2 - u) );// tail of 0.5*Sqrt (32 bits)
338        g = __FMUL( g, x2 );				d = __FNMSUB( FPR_half, FPR_kPiBy2Tail, d );
339        
340        temp2 = __FNMSUB( FPR_half, v, d );
341        temp2 = __FMADD( g, temp1, temp2 );
342    
343        FESETENVD ( FPR_env );        // restore caller's mode
344        __PROG_INEXACT( FPR_kPiBy2 );
345        
346        if (x > FPR_z)
347        {                           // take care of sign of result
348            return __FNMSUB( FPR_two, temp2, u );
349        }
350        else
351            return __FMSUB( FPR_two, temp2, u );
352    }   // [|x| < 1.0]
353    
354    
355    // If |x| == 1.0, return properly signed pi/2.
356    if (absx == FPR_one)
357    {
358        FPR_kPiBy2Tail = pK->kPiBy2Tail;
359        
360        FESETENVD (FPR_env);            // restore caller's mode
361        if (x > FPR_z)
362            return __FMADD( x, FPR_kPiBy2Tail, FPR_kPiBy2 );
363        else
364            return __FMSUB( x, FPR_kPiBy2Tail, FPR_kPiBy2 );
365    }
366    
367    // |x| > 1.0 signals INVALID and returns a NaN
368    fpenv.d = FPR_env;
369	__NOOP;
370	__NOOP;
371	__NOOP;
372    fpenv.i.lo |= SET_INVALID;
373    FESETENVD_GRP (fpenv.d);
374    return ( nan ( INVERSE_TRIGONOMETRIC_NAN ) );
375}   // ArcSin(x)
376
377/****************************************************************************
378
379FUNCTION:  double ArcCos(double x)
380    
381DESCRIPTION:  returns the inverse cosine of its argument in the range of
382              0 to pi (radians).
383    
384CALLS:  __FABS
385    
386EXCEPTIONS raised:  INEXACT or INVALID.
387    
388ACCURACY:  Error is less than an ulp and usually less than half an ulp.
389           Caller's rounding direction is honored.
390
391****************************************************************************/
392
393double acos(double x)
394{
395    hexdouble     cD, yD, gD, fpenv;
396    double        absx, x2, x3, x4, temp1, temp2, s, u, v, w;
397    double        g, y, d, y2, e;
398    int           index;
399    uint32_t	  ghead, yhead;
400
401    register double FPR_env, FPR_z, FPR_half, FPR_one, FPR_k2ToM26, FPR_cD, FPR_kMinNormal, FPR_kPiBy2, FPR_kPiBy2Tail;
402    register struct kBlock *pK;
403    
404    pK = &kBlock;
405    
406    FPR_z = 0.0;					FPR_half = 0.5;
407    FPR_k2ToM26 = pK->k2ToM26;				FPR_kMinNormal = pK->kMinNormal;
408    FPR_kPiBy2 = pK->kPiBy2;				FPR_one = 1.0;
409
410    if (unlikely(x != x))                           // NaN
411        return ( copysign ( nan ( INVERSE_TRIGONOMETRIC_NAN ), x ) );
412        
413    absx = __FABS(x);
414    
415    FEGETENVD ( FPR_env );                   		// save env, set default
416    __ENSURE( FPR_z, FPR_half, FPR_k2ToM26 ); 		__ENSURE( FPR_one, FPR_kMinNormal, FPR_kPiBy2 ); // ensure register loads
417    FPR_cD = __FNMSUB( FPR_half, absx, FPR_half ); 
418    FESETENVD ( FPR_z );
419    
420    x2 = __FMUL( x, x );				// under/overflows are masked
421    cD.d = FPR_cD;
422
423    if ( absx <= FPR_half )
424    {   						// |x| <= 0.5
425        register double r0, r1, r2, r3, r4, r5, r6, r7, r8, r9;
426        register double r10, r11, r12, r13, r14;
427        
428        r14 = pK->a14;					r13 = pK->a13;
429        temp1 = __FMADD( r14, x2, r13 );		x4 = __FMUL( x2, x2 );			
430        
431        x3 = __FMUL( x, x2 );				u = FPR_kPiBy2 - x; // pi/2 - x (high-order term)
432        r11 = pK->a11;					r12 = pK->a12;
433        
434        r9 = pK->a9;					r10 = pK->a10;
435        temp1 = __FMADD( temp1, x4, r11 );		__NOOP;
436        
437        temp1 = __FMADD( temp1, x4, r9 );		temp2 = __FMADD( r12, x4, r10 );
438        r7 = pK->a7;					r8 = pK->a8;
439                
440        r5 = pK->a5;					r6 = pK->a6;
441        temp1 = __FMADD( temp1, x4, r7 );		temp2 = __FMADD( temp2, x4, r8 );
442        
443        temp1 = __FMADD( temp1, x4, r5 );		temp2 = __FMADD( temp2, x4, r6 );
444        r3 = pK->a3;					r4 = pK->a4;
445        
446        r1 = pK->a1;					r2 = pK->a2;
447        temp1 = __FMADD( temp1, x4, r3 );		temp2 = __FMADD( temp2, x4, r4 );
448        
449        temp1 = __FMADD( temp1, x4, r1 );		temp2 = __FMADD( temp2, x4, r2 );
450        r0 = pK->a0;					FPR_kPiBy2Tail = pK->kPiBy2Tail;
451        
452        // finish up ArcCos(x) = pi/2 - ArcSin(x) 
453        
454        temp1 = __FMADD( x2, temp2, temp1 );
455        v = FPR_kPiBy2 - u - x;                 	// pi/2 - x (tail)
456        temp1 = __FMADD( x2, temp1, pK->a0 );
457        w = __FNMSUB( x3, temp1, (v + FPR_kPiBy2Tail) );// combine low order terms
458        
459        FESETENVD ( FPR_env );                   	// restore caller's mode
460        __PROG_INEXACT( FPR_kPiBy2 );
461        
462        return ( u + w );
463    }   // [|x| <= 0.5]
464    
465/****************************************************************************
466    For 0.5 < |x| < 1.0, a polynomial approximation is used to estimate
467    the power series resulting from the identity:
468      ArcSin(x) = pi/2 - 2*ArcSin(Sqrt(0.5(1.0 - x))).
469    The square root is evaluated in-line and concurrently with the
470    evaluation of the ArcSin.
471 ****************************************************************************/
472    if ( absx < FPR_one ) 
473    {   // 0.5 < |x| < 1.0
474        register double FPR_gD, FPR_two;
475        register double r0, r1, r2, r3, r4, r5, r6, r7, r8, r9;
476        register double r10, r11, r12, r13, r14;
477        register uint32_t GPR_p, GPR_q, GPR_r, GPR_s, GPR_t;
478        
479        x2 = FPR_cD;	/* fmr */			x4 = __FMUL( x2, x2);
480        GPR_t = cD.i.hi;				r0 = pK->aa0;
481
482        GPR_s = GPR_t + 0x3ff00000u;			__NOOP;
483        r14 = pK->a14;					r13 = pK->a13;
484
485        r12 = pK->a12;					r11 = pK->aa11;
486        gD.i.lo = 0u;        
487
488        index = (GPR_t >> 13) & 0xffu;
489        GPR_p = SqrtTable[index]; // 3 instrs issue
490        
491        temp1 = __FMADD( r13, x4, r11 ); 		temp2 = __FMADD( r14, x4, r12 );
492        r10 = pK->aa10;					r9 = pK->aa9;
493        
494        GPR_r = ((0xff00u & GPR_p) << 4);		
495        ghead = (GPR_s >> 1) & 0x7ff00000u;
496        gD.i.hi = ghead + GPR_r;
497        
498        r8 = pK->aa8;					r7 = pK->aa7;
499        yhead = 0x7fc00000u - ghead;
500
501        yD.i.lo = 0u;					GPR_q = ((0xffu & GPR_p) << 12);
502        yD.i.hi = yhead + GPR_q;			
503        
504        r6 = pK->aa6;					r5 = pK->aa5;
505        temp1 = __FMADD( temp1, x4, r9 ); 		temp2 = __FMADD( temp2, x4, r10 );
506                
507        temp1 = __FMADD( temp1, x4, r7 ); 		temp2 = __FMADD( temp2, x4, r8 );
508        r4 = pK->aa4;					r3 = pK->aa3;
509                                        			
510        r2 = pK->aa2;					r1 = pK->aa1;        
511        temp1 = __FMADD( temp1, x4, r5 );		temp2 = __FMADD( temp2, x4, r6 );
512        
513        temp1 = __FMADD( temp1, x4, r3 ); 		temp2 = __FMADD( temp2, x4, r4 );
514        FPR_two = 2.0;
515        
516        FPR_gD = gD.d;					y = yD.d;
517        temp1 = __FMADD( temp1, x4, r1 );		temp2 = __FMADD( temp2, x4, r2 );
518        
519        d = __FNMSUB( FPR_gD, FPR_gD, x2 );
520        g = __FMADD( y, d, FPR_gD );			y2 = y + y;                 	// 16-bit g
521        e = __FNMSUB( y, g, FPR_half );			d = __FNMSUB( g, g, x2 );
522        y = __FMADD( e, y2, y);                       	temp1 = __FMADD( x2, temp2, temp1 );// 16-bit y
523        g = __FMADD( y, d, g );                		y2 = y + y;			// 32-bit g
524        e = __FNMSUB( y, g, FPR_half );			d = __FNMSUB( g, g, x2 );
525        y = __FMADD( e, y2, y );                       					// 32-bit y
526        g = __FMADD( y, d, g);                						// 64-bit Sqrt
527        d = __FNMSUB( g, g, x2 );                   					// tail of 0.5*Sqrt (32 bits)
528        y2 = g + g;                         						// 2*Sqrt
529        
530        if (x > FPR_z)
531        {   // positive x
532            s = __FMADD( __FMUL( g, x2 ), __FMADD( x2, temp1, pK->aa0), d );// low order terms
533            
534            FESETENVD ( FPR_env );                   			// restore caller's mode
535            __PROG_INEXACT( FPR_kPiBy2 );
536            
537            return __FMADD( FPR_two, s, y2);                		// combine terms
538        }   // [positive x]
539        else
540        {   // negative x
541            register double q,r,ss,t;
542            
543            u = __FMSUB( FPR_two, FPR_kPiBy2, y2 );            		// high-order term
544            r = d - kPiBy2Tail;
545            ss = __FMADD( x2, temp1, pK->aa0 );
546            t = __FMUL( g, x2 );
547            v = __FMSUB( FPR_two, FPR_kPiBy2, u ) - y2;       		// tail correction
548            q = __FNMSUB( FPR_half, v, r );
549            s = __FMADD( ss, t, q );					// collect low-order terms
550            
551            FESETENVD ( FPR_env );                   			// restore caller's mode
552            __PROG_INEXACT( FPR_kPiBy2 );
553            
554            return __FNMSUB( FPR_two, s, u);                		// combine terms
555        }   // [negative x]
556    }   // [0.5 < |x| < 1.0]
557
558
559    // |x| == 1.0:  return exact +0.0 or inexact pi
560    if ( x == FPR_one )
561    {
562        FESETENVD ( FPR_env );            				// restore caller's mode
563        return ( FPR_z );
564    }
565    
566    if ( x == -1.0 )
567    {
568        FESETENVD ( FPR_env );                   			// restore caller's mode
569        __PROG_INEXACT( FPR_kPiBy2 );
570        return (kPiu.d + 2.0*kPiBy2Tail);
571    }
572    
573    // |x| > 1.0 signals INVALID and returns a NaN
574    fpenv.d = FPR_env;
575	__NOOP;
576	__NOOP;
577	__NOOP;
578    fpenv.i.lo |= SET_INVALID;
579    FESETENVD_GRP ( fpenv.d );
580    return ( nan ( INVERSE_TRIGONOMETRIC_NAN ) );
581}   // ArcCos(x)