/* * Copyright (c) 2002 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * * The contents of this file constitute Original Code as defined in and * are subject to the Apple Public Source License Version 1.1 (the * "License"). You may not use this file except in compliance with the * License. Please obtain a copy of the License at * http://www.apple.com/publicsource and read it before using this file. * * This Original Code and all software distributed under the License are * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the * License for the specific language governing rights and limitations * under the License. * * @APPLE_LICENSE_HEADER_END@ */ /******************************************************************************* * * * File asinacos.c, * * Implementation of asin and acos for the PowerPC. * * * * Copyright © 1991-2001 Apple Computer, Inc. All rights reserved. * * * * Written and ported by Robert A. Murley (ram) for Mac OS X. * * Modified by A. Sazegari, September 2001. * * * * A MathLib v4 file. * * * * Based on the trigonometric functions from IBM/Taligent. * * * * July 20 2001: replaced __setflm with FEGETENVD/FESETENVD. * * replaced DblInHex typedef with hexdouble. * * September 07 2001: added #ifdef __ppc__. * * September 09 2001: added more comments. * * September 10 2001: added macros to detect PowerPC and correct compiler. * * September 21 2001: added to get to * * and . * * September 24 2001: used standard symbols for environment flags. * * September 25 2001: added FESETENVD(0.0) at start of acos. * * October 08 2001: removed . * * changed compiler errors to warnings. * * November 06 2001: commented out warning about Intel architectures. * * * * W A R N I N G: * * These routines require a 64-bit double precision IEEE-754 model. * * They are written for PowerPC only and are expecting the compiler * * to generate the correct sequence of multiply-add fused instructions. * * * * These routines are not intended for 32-bit Intel architectures. * * * * A version of gcc higher than 932 is required. * * * * GCC compiler options: * * optimization level 3 (-O3) * * -fschedule-insns -finline-functions -funroll-all-loops * * * *******************************************************************************/ #include "fp_private.h" #include "fenv_private.h" #include "math.h" extern const unsigned short SqrtTable[]; // Floating-point constants static const double kPiBy2Tail = 6.1232339957367660e-17; // 0x1.1a62633145c07p-54 static const hexdouble kPiu = HEXDOUBLE(0x400921fb, 0x54442d18); #define INVERSE_TRIGONOMETRIC_NAN "34" /**************************************************************************** FUNCTION: double asin (double x) DESCRIPTION: returns the inverse sine of its argument in the range of -pi/2 to pi/2 (radians). CALLS: __FABS EXCEPTIONS raised: INEXACT, UNDERFLOW/INEXACT or INVALID. ACCURACY: Error is less than an ulp and usually less than half an ulp. Caller's rounding direction is honored. ****************************************************************************/ static struct kBlock { const double k2ToM26 /* = 1.490116119384765625e-8 */ ; // 0x1.0p-26 */ ; const double kPiBy2 /* = 1.570796326794896619231322 */ ; // 0x1.921fb54442d18p0 const double kPiBy2Tail /* = 6.1232339957367660e-17 */ ; // 0x1.1a62633145c07p-54 const double kMinNormal /* = 2.2250738585072014e-308 */ ; // 0x1.0p-1022 const double a14 /* = 0.03085091303188211304259 */ ; const double a13 /* =-0.02425125216179617744614 */ ; const double a12 /* = 0.02273334623573271023373 */ ; const double a11 /* = 0.0002983797813053653120360 */ ; const double a10 /* = 0.008819738782817955152102 */ ; const double a9 /* = 0.008130738583790024803650 */ ; const double a8 /* = 0.009793486386035222577675 */ ; const double a7 /* = 0.01154901189590083099584 */ ; const double a6 /* = 0.01396501522140471126730 */ ; const double a5 /* = 0.01735275722383019335276 */ ; const double a4 /* = 0.02237215928757573539490 */ ; const double a3 /* = 0.03038194444121688698408 */ ; const double a2 /* = 0.04464285714288482921087 */ ; const double a1 /* = 0.07499999999999990640378 */ ; const double a0 /* = 0.1666666666666666667191 */ ; const double aa11 /* = 0.0002983797813053653120360 */ ; const double aa10 /* = 0.008819738782817955152102 */ ; const double aa9 /* = 0.008130738583790024803650 */ ; const double aa8 /* = 0.009793486386035222577675 */ ; const double aa7 /* = 0.01154901189590083099584 */ ; const double aa6 /* = 0.01396501522140471126730 */ ; const double aa5 /* = 0.01735275722383019335276 */ ; const double aa4 /* = 0.02237215928757573539490 */ ; const double aa3 /* = 0.03038194444121688698408 */ ; const double aa2 /* = 0.04464285714288482921087 */ ; const double aa1 /* = 0.07499999999999990640378 */ ; const double aa0 /* = 0.1666666666666666667191 */ ; } kBlock = { /* static const double k2ToM26 = */ 0x1.0p-26, // 1.490116119384765625e-8 /* static const double kPiBy2 = */ 1.570796326794896619231322, // 0x1.921fb54442d18p0 /* static const double kPiBy2Tail = */ 6.1232339957367660e-17, // 0x1.1a62633145c07p-54 /* static const double kMinNormal = */ 0x1.0p-1022, // 2.2250738585072014e-308 /* const static double a14 = */ 0.03085091303188211304259, /* const static double a13 = */-0.02425125216179617744614, /* const static double a12 = */ 0.02273334623573271023373, /* const static double a11 = */ 0.0002983797813053653120360, /* const static double a10 = */ 0.008819738782817955152102, /* const static double a9 = */ 0.008130738583790024803650, /* const static double a8 = */ 0.009793486386035222577675, /* const static double a7 = */ 0.01154901189590083099584, /* const static double a6 = */ 0.01396501522140471126730, /* const static double a5 = */ 0.01735275722383019335276, /* const static double a4 = */ 0.02237215928757573539490, /* const static double a3 = */ 0.03038194444121688698408, /* const static double a2 = */ 0.04464285714288482921087, /* const static double a1 = */ 0.07499999999999990640378, /* const static double a0 = */ 0.1666666666666666667191, /* const static double aa11 = */ 0.0002983797813053653120360, /* const static double aa10 = */ 0.008819738782817955152102, /* const static double aa9 = */ 0.008130738583790024803650, /* const static double aa8 = */ 0.009793486386035222577675, /* const static double aa7 = */ 0.01154901189590083099584, /* const static double aa6 = */ 0.01396501522140471126730, /* const static double aa5 = */ 0.01735275722383019335276, /* const static double aa4 = */ 0.02237215928757573539490, /* const static double aa3 = */ 0.03038194444121688698408, /* const static double aa2 = */ 0.04464285714288482921087, /* const static double aa1 = */ 0.07499999999999990640378, /* const static double aa0 = */ 0.1666666666666666667191 }; double asin (double x) { hexdouble cD, yD ,gD , fpenv; double absx, x2, x3, x4, temp1, temp2, u, v; double g, y, d, y2, e; register int index; register uint32_t ghead, yhead, rnddir; register double FPR_env, FPR_z, FPR_half, FPR_one, FPR_k2ToM26, FPR_cD, FPR_yD, FPR_kMinNormal, FPR_kPiBy2, FPR_kPiBy2Tail; register struct kBlock *pK; pK = &kBlock; FPR_z = 0.0; FPR_half = 0.5; FPR_k2ToM26 = pK->k2ToM26; FPR_kMinNormal = pK->kMinNormal; FPR_kPiBy2 = pK->kPiBy2; FPR_one = 1.0; if (unlikely(x != x)) // NaN argument return ( copysign ( nan ( INVERSE_TRIGONOMETRIC_NAN ), x ) ); absx = __FABS( x ); FEGETENVD ( FPR_env ); // save env, set default __ENSURE( FPR_z, FPR_half, FPR_k2ToM26 ); __ENSURE( FPR_one, FPR_kMinNormal, FPR_kPiBy2 ); // ensure register loads FPR_cD = __FNMSUB( FPR_half, absx, FPR_half ); FESETENVD ( FPR_z ); x2 = __FMUL( x, x ); // under/overflows are masked cD.d = FPR_cD; if ( absx <= FPR_half ) { // |x| <= 0.5 if (unlikely( absx < FPR_k2ToM26 )) { // |x| < 2^(-26) if ( absx == FPR_z ) { // x == 0.0 case is exact FESETENVD ( FPR_env ); return ( x ); } fpenv.d = FPR_env; __NOOP; __NOOP; __NOOP; rnddir = fpenv.i.lo & FE_ALL_RND; // caller's rounding direction FPR_yD = x; // Rounding away from zero: return NextDouble(x,x+x) if (((rnddir == FE_DOWNWARD) && (x < FPR_z)) || ((rnddir == FE_UPWARD) && (x > FPR_z))) { yD.d = FPR_yD; __NOOP; __NOOP; __NOOP; yD.i.lo += 1u; if (yD.i.lo == 0u) yD.i.hi += 1u; __NOOP; __NOOP; __NOOP; FPR_yD = yD.d; absx = __FABS( FPR_yD ); } FESETENVD ( FPR_env ); // restore caller environment __PROG_INEXACT( FPR_kPiBy2 ); // Set UNDERFLOW if result is subnormal if (absx < FPR_kMinNormal) // subnormal case __PROG_UF_INEXACT( FPR_kMinNormal ); // set UNDERFLOW return ( FPR_yD ); // return small result } // [|x| < 2^(-26)] { register double r0, r1, r2, r3, r4, r5, r6, r7, r8, r9; register double r10, r11, r12, r13, r14; r14 = pK->a14; r13 = pK->a13; temp1 = __FMADD( r14, x2, r13 ); x4 = __FMUL( x2, x2 ); x3 = __FMUL( x, x2 ); __NOOP; r11 = pK->a11; r12 = pK->a12; r9 = pK->a9; r10 = pK->a10; temp1 = __FMADD( temp1, x4, r11 ); __NOOP; temp1 = __FMADD( temp1, x4, r9 ); temp2 = __FMADD( r12, x4, r10 ); r7 = pK->a7; r8 = pK->a8; r5 = pK->a5; r6 = pK->a6; temp1 = __FMADD( temp1, x4, r7 ); temp2 = __FMADD( temp2, x4, r8 ); temp1 = __FMADD( temp1, x4, r5 ); temp2 = __FMADD( temp2, x4, r6 ); r3 = pK->a3; r4 = pK->a4; r1 = pK->a1; r2 = pK->a2; temp1 = __FMADD( temp1, x4, r3 ); temp2 = __FMADD( temp2, x4, r4 ); temp1 = __FMADD( temp1, x4, r1 ); temp2 = __FMADD( temp2, x4, r2 ); r0 = pK->a0; __NOOP; temp2 = __FMADD( temp2, x2, temp1 ); temp1 = __FMADD( x2, temp2, r0 ); } FESETENVD ( FPR_env ); // restore caller's mode __PROG_INEXACT( FPR_kPiBy2 ); return (__FMADD( x3, temp1, x )); } // [|x| <= 0.5] /**************************************************************************** For 0.5 < |x| < 1.0, a polynomial approximation is used to estimate the power series resulting from the identity: ArcSin(x) = pi/2 - 2*ArcSin(Sqrt(0.5(1.0 - x))). The square root is evaluated in-line and concurrently with the evaluation of the ArcSin. ***************************************************************************/ if (absx < FPR_one) { // |x| < 1.0 register double FPR_gD, FPR_two; register double r0, r1, r2, r3, r4, r5, r6, r7, r8, r9; register double r10, r11, r12, r13, r14; register uint32_t GPR_p, GPR_q, GPR_r, GPR_s, GPR_t; x2 = FPR_cD; /* fmr */ x4 = __FMUL( FPR_cD, FPR_cD); // Was x2*x2; GPR_t = cD.i.hi; r0 = pK->aa0; GPR_s = GPR_t + 0x3ff00000u; __NOOP; r14 = pK->a14; r13 = pK->a13; r12 = pK->a12; r11 = pK->a11; gD.i.lo = 0u; index = (GPR_t >> 13) & 0xffu; GPR_p = SqrtTable[index]; // 3 instrs issue temp1 = __FMADD( r13, x4, r11 ); temp2 = __FMADD( r14, x4, r12 ); r10 = pK->a10; r9 = pK->a9; GPR_r = ((0xff00u & GPR_p) << 4); ghead = (GPR_s >> 1) & 0x7ff00000u; gD.i.hi = ghead + GPR_r; r8 = pK->a8; r7 = pK->aa7; yhead = 0x7fc00000u - ghead; yD.i.lo = 0u; GPR_q = ((0xffu & GPR_p) << 12); yD.i.hi = yhead + GPR_q; r6 = pK->aa6; r5 = pK->aa5; temp1 = __FMADD( temp1, x4, r9 ); temp2 = __FMADD( temp2, x4, r10 ); temp1 = __FMADD( temp1, x4, r7 ); temp2 = __FMADD( temp2, x4, r8 ); r4 = pK->aa4; r3 = pK->aa3; r2 = pK->aa2; r1 = pK->aa1; temp1 = __FMADD( temp1, x4, r5 ); temp2 = __FMADD( temp2, x4, r6 ); temp1 = __FMADD( temp1, x4, r3 ); temp2 = __FMADD( temp2, x4, r4 ); FPR_two = 2.0; FPR_gD = gD.d; y = yD.d; temp1 = __FMADD( temp1, x4, r1 ); temp2 = __FMADD( temp2, x4, r2 ); d = __FNMSUB( FPR_gD, FPR_gD, FPR_cD ); FPR_kPiBy2Tail = pK->kPiBy2Tail; g = __FMADD( y, d, FPR_gD ); y2 = y + y; // 16-bit g e = __FNMSUB( y, g, FPR_half ); d = __FNMSUB( g, g, FPR_cD ); y = __FMADD( e, y2, y); temp1 = __FMADD( x2, temp2, temp1 ); // 16-bit y g = __FMADD( y, d, g ); y2 = y + y; // 32-bit g e = __FNMSUB( y, g, FPR_half ); d = __FNMSUB( g, g, FPR_cD ); y = __FMADD( e, y2, y ); temp1 = __FMADD( x2, temp1, r0 ); // 32-bit y g = __FMADD( y, d, g); // 64-bit Sqrt d = __FNMSUB( g, g, FPR_cD ); u = __FNMSUB( FPR_two, g, FPR_kPiBy2 ); // pi/2 - 2.0*Sqrt (head) d = __FMUL( d, y ); v = __FNMSUB( FPR_two, g, (FPR_kPiBy2 - u) );// tail of 0.5*Sqrt (32 bits) g = __FMUL( g, x2 ); d = __FNMSUB( FPR_half, FPR_kPiBy2Tail, d ); temp2 = __FNMSUB( FPR_half, v, d ); temp2 = __FMADD( g, temp1, temp2 ); FESETENVD ( FPR_env ); // restore caller's mode __PROG_INEXACT( FPR_kPiBy2 ); if (x > FPR_z) { // take care of sign of result return __FNMSUB( FPR_two, temp2, u ); } else return __FMSUB( FPR_two, temp2, u ); } // [|x| < 1.0] // If |x| == 1.0, return properly signed pi/2. if (absx == FPR_one) { FPR_kPiBy2Tail = pK->kPiBy2Tail; FESETENVD (FPR_env); // restore caller's mode if (x > FPR_z) return __FMADD( x, FPR_kPiBy2Tail, FPR_kPiBy2 ); else return __FMSUB( x, FPR_kPiBy2Tail, FPR_kPiBy2 ); } // |x| > 1.0 signals INVALID and returns a NaN fpenv.d = FPR_env; __NOOP; __NOOP; __NOOP; fpenv.i.lo |= SET_INVALID; FESETENVD_GRP (fpenv.d); return ( nan ( INVERSE_TRIGONOMETRIC_NAN ) ); } // ArcSin(x) /**************************************************************************** FUNCTION: double ArcCos(double x) DESCRIPTION: returns the inverse cosine of its argument in the range of 0 to pi (radians). CALLS: __FABS EXCEPTIONS raised: INEXACT or INVALID. ACCURACY: Error is less than an ulp and usually less than half an ulp. Caller's rounding direction is honored. ****************************************************************************/ double acos(double x) { hexdouble cD, yD, gD, fpenv; double absx, x2, x3, x4, temp1, temp2, s, u, v, w; double g, y, d, y2, e; int index; uint32_t ghead, yhead; register double FPR_env, FPR_z, FPR_half, FPR_one, FPR_k2ToM26, FPR_cD, FPR_kMinNormal, FPR_kPiBy2, FPR_kPiBy2Tail; register struct kBlock *pK; pK = &kBlock; FPR_z = 0.0; FPR_half = 0.5; FPR_k2ToM26 = pK->k2ToM26; FPR_kMinNormal = pK->kMinNormal; FPR_kPiBy2 = pK->kPiBy2; FPR_one = 1.0; if (unlikely(x != x)) // NaN return ( copysign ( nan ( INVERSE_TRIGONOMETRIC_NAN ), x ) ); absx = __FABS(x); FEGETENVD ( FPR_env ); // save env, set default __ENSURE( FPR_z, FPR_half, FPR_k2ToM26 ); __ENSURE( FPR_one, FPR_kMinNormal, FPR_kPiBy2 ); // ensure register loads FPR_cD = __FNMSUB( FPR_half, absx, FPR_half ); FESETENVD ( FPR_z ); x2 = __FMUL( x, x ); // under/overflows are masked cD.d = FPR_cD; if ( absx <= FPR_half ) { // |x| <= 0.5 register double r0, r1, r2, r3, r4, r5, r6, r7, r8, r9; register double r10, r11, r12, r13, r14; r14 = pK->a14; r13 = pK->a13; temp1 = __FMADD( r14, x2, r13 ); x4 = __FMUL( x2, x2 ); x3 = __FMUL( x, x2 ); u = FPR_kPiBy2 - x; // pi/2 - x (high-order term) r11 = pK->a11; r12 = pK->a12; r9 = pK->a9; r10 = pK->a10; temp1 = __FMADD( temp1, x4, r11 ); __NOOP; temp1 = __FMADD( temp1, x4, r9 ); temp2 = __FMADD( r12, x4, r10 ); r7 = pK->a7; r8 = pK->a8; r5 = pK->a5; r6 = pK->a6; temp1 = __FMADD( temp1, x4, r7 ); temp2 = __FMADD( temp2, x4, r8 ); temp1 = __FMADD( temp1, x4, r5 ); temp2 = __FMADD( temp2, x4, r6 ); r3 = pK->a3; r4 = pK->a4; r1 = pK->a1; r2 = pK->a2; temp1 = __FMADD( temp1, x4, r3 ); temp2 = __FMADD( temp2, x4, r4 ); temp1 = __FMADD( temp1, x4, r1 ); temp2 = __FMADD( temp2, x4, r2 ); r0 = pK->a0; FPR_kPiBy2Tail = pK->kPiBy2Tail; // finish up ArcCos(x) = pi/2 - ArcSin(x) temp1 = __FMADD( x2, temp2, temp1 ); v = FPR_kPiBy2 - u - x; // pi/2 - x (tail) temp1 = __FMADD( x2, temp1, pK->a0 ); w = __FNMSUB( x3, temp1, (v + FPR_kPiBy2Tail) );// combine low order terms FESETENVD ( FPR_env ); // restore caller's mode __PROG_INEXACT( FPR_kPiBy2 ); return ( u + w ); } // [|x| <= 0.5] /**************************************************************************** For 0.5 < |x| < 1.0, a polynomial approximation is used to estimate the power series resulting from the identity: ArcSin(x) = pi/2 - 2*ArcSin(Sqrt(0.5(1.0 - x))). The square root is evaluated in-line and concurrently with the evaluation of the ArcSin. ****************************************************************************/ if ( absx < FPR_one ) { // 0.5 < |x| < 1.0 register double FPR_gD, FPR_two; register double r0, r1, r2, r3, r4, r5, r6, r7, r8, r9; register double r10, r11, r12, r13, r14; register uint32_t GPR_p, GPR_q, GPR_r, GPR_s, GPR_t; x2 = FPR_cD; /* fmr */ x4 = __FMUL( x2, x2); GPR_t = cD.i.hi; r0 = pK->aa0; GPR_s = GPR_t + 0x3ff00000u; __NOOP; r14 = pK->a14; r13 = pK->a13; r12 = pK->a12; r11 = pK->aa11; gD.i.lo = 0u; index = (GPR_t >> 13) & 0xffu; GPR_p = SqrtTable[index]; // 3 instrs issue temp1 = __FMADD( r13, x4, r11 ); temp2 = __FMADD( r14, x4, r12 ); r10 = pK->aa10; r9 = pK->aa9; GPR_r = ((0xff00u & GPR_p) << 4); ghead = (GPR_s >> 1) & 0x7ff00000u; gD.i.hi = ghead + GPR_r; r8 = pK->aa8; r7 = pK->aa7; yhead = 0x7fc00000u - ghead; yD.i.lo = 0u; GPR_q = ((0xffu & GPR_p) << 12); yD.i.hi = yhead + GPR_q; r6 = pK->aa6; r5 = pK->aa5; temp1 = __FMADD( temp1, x4, r9 ); temp2 = __FMADD( temp2, x4, r10 ); temp1 = __FMADD( temp1, x4, r7 ); temp2 = __FMADD( temp2, x4, r8 ); r4 = pK->aa4; r3 = pK->aa3; r2 = pK->aa2; r1 = pK->aa1; temp1 = __FMADD( temp1, x4, r5 ); temp2 = __FMADD( temp2, x4, r6 ); temp1 = __FMADD( temp1, x4, r3 ); temp2 = __FMADD( temp2, x4, r4 ); FPR_two = 2.0; FPR_gD = gD.d; y = yD.d; temp1 = __FMADD( temp1, x4, r1 ); temp2 = __FMADD( temp2, x4, r2 ); d = __FNMSUB( FPR_gD, FPR_gD, x2 ); g = __FMADD( y, d, FPR_gD ); y2 = y + y; // 16-bit g e = __FNMSUB( y, g, FPR_half ); d = __FNMSUB( g, g, x2 ); y = __FMADD( e, y2, y); temp1 = __FMADD( x2, temp2, temp1 );// 16-bit y g = __FMADD( y, d, g ); y2 = y + y; // 32-bit g e = __FNMSUB( y, g, FPR_half ); d = __FNMSUB( g, g, x2 ); y = __FMADD( e, y2, y ); // 32-bit y g = __FMADD( y, d, g); // 64-bit Sqrt d = __FNMSUB( g, g, x2 ); // tail of 0.5*Sqrt (32 bits) y2 = g + g; // 2*Sqrt if (x > FPR_z) { // positive x s = __FMADD( __FMUL( g, x2 ), __FMADD( x2, temp1, pK->aa0), d );// low order terms FESETENVD ( FPR_env ); // restore caller's mode __PROG_INEXACT( FPR_kPiBy2 ); return __FMADD( FPR_two, s, y2); // combine terms } // [positive x] else { // negative x register double q,r,ss,t; u = __FMSUB( FPR_two, FPR_kPiBy2, y2 ); // high-order term r = d - kPiBy2Tail; ss = __FMADD( x2, temp1, pK->aa0 ); t = __FMUL( g, x2 ); v = __FMSUB( FPR_two, FPR_kPiBy2, u ) - y2; // tail correction q = __FNMSUB( FPR_half, v, r ); s = __FMADD( ss, t, q ); // collect low-order terms FESETENVD ( FPR_env ); // restore caller's mode __PROG_INEXACT( FPR_kPiBy2 ); return __FNMSUB( FPR_two, s, u); // combine terms } // [negative x] } // [0.5 < |x| < 1.0] // |x| == 1.0: return exact +0.0 or inexact pi if ( x == FPR_one ) { FESETENVD ( FPR_env ); // restore caller's mode return ( FPR_z ); } if ( x == -1.0 ) { FESETENVD ( FPR_env ); // restore caller's mode __PROG_INEXACT( FPR_kPiBy2 ); return (kPiu.d + 2.0*kPiBy2Tail); } // |x| > 1.0 signals INVALID and returns a NaN fpenv.d = FPR_env; __NOOP; __NOOP; __NOOP; fpenv.i.lo |= SET_INVALID; FESETENVD_GRP ( fpenv.d ); return ( nan ( INVERSE_TRIGONOMETRIC_NAN ) ); } // ArcCos(x)