src/libm/Source/PowerPC/HyperbolicDD.c at fixPythonPipStalling

overby.me / darling-nix
fork atom
this repo has no description
fork atom
darling-nix / src / libm / Source / PowerPC / HyperbolicDD.c
at fixPythonPipStalling 670 lines 28 kB view raw
wrap content
Lubos Dolezel Restructured source tree to prepare for merge with the "darling" repo 10y ago
f228ae16
  1/*
  2 * Copyright (c) 2002 Apple Computer, Inc. All rights reserved.
  3 *
  4 * @APPLE_LICENSE_HEADER_START@
  5 * 
  6 * The contents of this file constitute Original Code as defined in and
  7 * are subject to the Apple Public Source License Version 1.1 (the
  8 * "License").  You may not use this file except in compliance with the
  9 * License.  Please obtain a copy of the License at
 10 * http://www.apple.com/publicsource and read it before using this file.
 11 * 
 12 * This Original Code and all software distributed under the License are
 13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
 17 * License for the specific language governing rights and limitations
 18 * under the License.
 19 * 
 20 * @APPLE_LICENSE_HEADER_END@
 21 */
 22//
 23//	HyperbolicDD.c
 24//	
 25//	Double-double Function Library
 26//	Copyright: � 1995-96 by Apple Computer, Inc., all rights reserved
 27//	
 28//	long double coshl(long double x);
 29//	long double sinhl(long double x);
 30//	long double tanhl(long double x);
 31//
 32//	Change history:
 33//		9/1/96	- PAF removed unnecessary tests (uncovered by MrC warning)
 34//
 35
 36#include "math.h"
 37#include "fp_private.h"
 38#if (__WANT_LONG_DOUBLE_FORMAT__ - 0L == 128L)
 39#include "DD.h"
 40
 41static const double recip = 0.1957294106339126123084757437350e-19;
 42static const double kBig  = 4.504699138998272e+15 ; // 2.0**52 + 2.0**40 = 0x43300100 00000000 
 43
 44uint32_t TanhTbl[] = {
 45	0x00000000,	0x00000000,		0x00000000,	0x00000000,		0x00000000,	0x00000000, 
 46	0x3FB00000,	0x00030807,  	0x3FAFF559,	0x97E63AD9,  	0xBB6CF9C0,	0x2437818A, 
 47	0x3FC00000,	0x0000284A,  	0x3FBFD599,	0x2BC5078A,  	0xBB77F9D4,	0x704D912A, 
 48	0x3FC80000,	0x00005880,  	0x3FC7B8FF,	0x903C4CEC,  	0x3B47716F,	0xB54B60B5, 
 49	0x3FD00000,	0x00000CDC,  	0x3FCF597E,	0xA69A34B3,  	0xBB794C01,	0xC101435A, 
 50	0x3FD40000,	0x0000313C,  	0x3FD35F98,	0xA0EA91C7,  	0xBB904F63,	0x26BD63B6, 
 51	0x3FD80000,	0x000021EA,  	0x3FD6EF53,	0xDE8CAD3F,  	0xBB91172E,	0x8FAFB7A2, 
 52	0x3FDC0000,	0x00005214,  	0x3FDA5729,	0xEE48C464,  	0x3B9543D8,	0xC4943292, 
 53	0x3FE00000,	0x00001E86,  	0x3FDD9353,	0xD756BAF6,  	0xBB8F03A0,	0x34F7B469, 
 54	0x3FE20000,	0x00007115,  	0x3FE05086,	0xF2F72867,  	0xBB95D918,	0x3E3269EE, 
 55	0x3FE40000,	0x00005720,  	0x3FE1BF47,	0xEABBCBE9,  	0x3B918CCE,	0xA6F1FA3A, 
 56	0x3FE60000,	0x000021CF,  	0x3FE3157D,	0xFE9F8724,  	0xBBA1AC2C,	0x2B395168, 
 57	0x3FE80000,	0x00016908,  	0x3FE45323,	0xE553C98B,  	0xBBA14AF9,	0xB571AFA7, 
 58	0x3FEA0000,	0x00005202,  	0x3FE5788F,	0xF10D56AF,  	0xBBA3C229,	0xC063AD80, 
 59	0x3FEC0000,	0x00005742,  	0x3FE68665,	0x0B8C4C1B,  	0x3B8BE9FC,	0xF1B4428B, 
 60	0x3FEE0000,	0x00008582,  	0x3FE77D83,	0x8E34430D,  	0x3B829FA7,	0xF1623176, 
 61	0x3FF00000,	0x00005F76,  	0x3FE85EFA,	0xB51543C3,  	0xBB4B3592,	0x2498C785, 
 62	0x3FF10000,	0x00001F8D,  	0x3FE92BFB,	0x370DB380,  	0x3BA29188,	0x377121C3, 
 63	0x3FF20000,	0x0000130F,  	0x3FE9E5CB,	0x5BA45A90,  	0x3B7DFB4E,	0xF67C44E2, 
 64	0x3FF30000,	0x00004F18,  	0x3FEA8DBC,	0xBC31BABE,  	0x3B99C45D,	0xAE5D7827, 
 65	0x3FF40000,	0x00005131,  	0x3FEB2523,	0xBB6B5B77,  	0x3B85A025,	0x161129AB, 
 66	0x3FF50000,	0x000029D4,  	0x3FEBAD50,	0xA4A6A0D5,  	0xBB983007,	0x72D2D609, 
 67	0x3FF60000,	0x000007E1,  	0x3FEC278A,	0x52A4E807,  	0x3BA451F7,	0x31DE764B, 
 68	0x3FF70000,	0x0000185B,  	0x3FEC950A,	0x3340D299,  	0x3BAABF14,	0xE874463C, 
 69	0x3FF80000,	0x000024C4,  	0x3FECF6F9,	0x786E02C1,  	0x3B6B3F8E,	0x2446EA6B, 
 70	0x3FF90000,	0x0000BF0C,  	0x3FED4E6F,	0x4642C44F,  	0x3BA657E4,	0xDE96E741, 
 71	0x3FFA0000,	0x00006986,  	0x3FED9C6F,	0xAFE63ACE,  	0x3BA71F1B,	0xC2E3ED65, 
 72	0x3FFB0000,	0x00003690,  	0x3FEDE1EB,	0x59375F86,  	0x3B94D798,	0x08425B54, 
 73	0x3FFC0000,	0x000068CA,  	0x3FEE1FBF,	0x97E34D01,  	0x3BAD07C7,	0xA6BF9B47, 
 74	0x3FFD0000,	0x00008A24,  	0x3FEE56B6,	0xF3EFC7EE,  	0xBBA26658,	0xA2EB55F0, 
 75	0x3FFE0000,	0x0003C04A,  	0x3FEE8789,	0xECECBA51,  	0xBBA8B27F,	0x9026BBD4, 
 76	0x3FFF0000,	0x00000D68,  	0x3FEEB2DF,	0xEDD5EEB6,  	0x3B9EE74B,	0x12149464, 
 77	0x40000000,	0x00004DFD,  	0x3FEED950,	0x5E1BD9DE,  	0x3B971596,	0x795B2F55
 78};
 79
 80uint32_t TanhCoeff[] = {
 81	0x3FF00000,	0x00000000,		0x00000000,	0x00000000, 
 82	0xBFD55555,	0x55555555,  	0xBC755555,	0x55555554, 
 83	0x3FC11111,	0x11111111,  	0x3C411111,	0x10FF0F6D, 
 84	0xBFABA1BA,	0x1BA1BA1C,  	0x3C47917A,	0xA287E6B6, 
 85	0x3F9664F4,	0x882C10FA,  	0xBC0A8F5F,	0x684BD9FF, 
 86	0xBF8226E3,	0x55E6C238,  	0x3C01097B,	0x425ED098, 
 87	0x3F6D6D3D,	0x0E154787,  	0xBC0BA781,	0x948B0FCF, 
 88	0xBF57DA36,	0x446C8BDA,  	0xBBD2F684,	0xBDA12F6B, 
 89	0x3F435580,	0xBCDA12F7,  	0xBBEED097,	0xB425ED0A, 
 90	0xBF2F545A,	0x781948B1,  	0x3B7948B0,	0xFCD6E991, 
 91	0x3F17B291,	0x61F9ADD4,  	0xBBAF9ADD,	0x3C0CA458
 92};
 93
 94struct TanhTblEntry { 
 95   double One;
 96   double Two;
 97   double Three;
 98} TanhTblEntry;
 99
100struct TanhCoeffEntry { 
101   double One;				
102   double Two;
103} TanhCoeffEntry;
104
105
106extern long double  _ExpInnerLD(double alpha, double beta, double gamma,
107								double *pextra, int normflag);
108
109long double sinhl(long double x)
110{
111	double xHead, xTail, xHeadx, xTailx;
112 	double p, q, r1, r2, r3, r4, r5, t1, t2;
113	double residual, residual1, residual2, residual3, residual4, residual5, residual6;
114	double res, resnew, reshi, reslow, restiny, resbot, resmid; 
115	double remainder, partial, extraword;
116	double argsq, argsq2;
117	double prod, prodlow, sum, suml, temp, temp2,  templow;
118	double bottom;
119	uint32_t hibits;
120	int i;
121	double fpenv;
122	DblDbl ddx;
123	double coeff[16] = {
124		0.62132974171578525315635255289e-14, 0.577836659795680285435407874e-11,
125		0.469203367754092391773551193841e-8, 0.329380764163372859025032938e-5,
126		0.1976284584980237154150197628458498e-2, 1.0, 420.0, 143640.0, 39070080.0,
127		8204716800.0, 1279935820800.0, 140792940288000.0, 10137091700736000.0,
128		425757851430912000.0, 8515157028618240000.0, 51090942171709440000.0
129	};
130
131	ddx.f = x;
132	xHead = ddx.d[0];
133	xTail = ddx.d[1];
134	hibits = ddx.i[0] & 0x7FFFFFFFu;	// highest 32 bits as uint32_t
135	
136	// NaNs and infinities
137	if (hibits >= 0x7ff00000u)			// special cases: NaN and infinity
138		return x;
139
140	// x = zero hence xHead is zero
141	if ((hibits | ddx.i[1]) == 0)  return x;      
142
143		
144	// x is not zero, infinity, or NaN
145    
146	// finite  non-zero x 
147        FEGETENVD(fpenv);
148        FESETENVD(0.0);
149
150
151	// for small x, |x| <= 0.693147 use power series
152
153	if (fabs(xHead) <= 0.693147 ) {
154		temp=2.0*xHead*xTail;																// small argument
155	 	argsq=__FMADD( xHead, xHead, temp );				// xHead*xHead+temp;			// use power series
156		argsq2=__FMADD( xTail, xTail, __FMSUB( xHead, xHead, argsq ) + temp ); // xHead*xHead-argsq+temp+xTail*xTail;
157	 	sum=coeff[0];
158	 	suml=0.0;
159	 	sum=__FMADD( argsq, sum, coeff[1] );				// coeff[1]+argsq*sum;
160	 	sum=__FMADD( argsq, sum, coeff[2] );				// coeff[2]+argsq*sum;
161	 	sum=__FMADD( argsq, sum, coeff[3] );				// coeff[3]+argsq*sum;
162		for ( i=4; i<15; i++) {	
163			templow=__FMADD( suml, argsq, sum*argsq2 );		// suml*argsq+sum*argsq2;
164			temp=__FMADD( sum, argsq, templow );			// sum*argsq+templow;
165			bottom=__FMSUB( sum, argsq, temp) + templow;	// sum*argsq-temp+templow;
166			sum=__FADD( coeff[i], temp );
167			residual=coeff[i]-sum+temp;
168			suml=bottom+residual;
169		}
170	 	prodlow=__FMADD( suml, argsq, sum*argsq2 );			// suml*argsq+sum*argsq2;		// mult. by arg^2
171	 	prod=__FMADD( sum, argsq, prodlow );				// sum*argsq+prodlow;
172	 	prodlow=__FMSUB( sum, argsq, prod) + prodlow;		// sum*argsq-prod+prodlow;
173	 	temp2=__FMADD( prodlow, xHead, prod*xTail );		// prodlow*xHead+prod*xTail;	// mult. by xHead
174	 	temp=__FMADD( prod, xHead, temp2 );					// prod*xHead+temp2;
175	 	temp2=__FMSUB(prod, xHead, temp ) + temp2;			// prod*xHead-temp+temp2;
176	 	sum=temp*recip;																		// sum of series ---
177	 	remainder=__FNMSUB( sum, coeff[15], temp );			// (temp-sum*coeff[15]);
178	 	partial=__FADD( remainder, temp2 );
179	 	residual=remainder-partial+temp2;
180	 	suml=__FMADD( partial, recip, residual*recip );		// partial*recip+(residual*recip);
181	 	res=__FADD( xHead, sum );															// except for argument
182	 	reslow=(xHead-res+sum);																// exact
183	 	resmid=__FADD( xTail, suml );
184	 	restiny=xTail-resmid+suml;
185	 	p=__FADD( reslow, resmid );															// sum of middle terms
186	 	q=reslow-p+resmid;																	// possible residual
187	 	reshi=__FADD( res, p );
188	 	resbot=(res-reshi+p)+(q+restiny);
189		ddx.d[0] = __FADD( reshi, resbot );													// build a cannonical result
190		ddx.d[1] = (reshi - ddx.d[0]) + resbot; 
191                FESETENVD(fpenv);
192		return (ddx.f);
193	}
194
195	// |x| > 0.693147 
196	// else if (fabs(xHead) > 0.693147 ) {  <-- Unnecessary test
197	else {
198  		if (xHead < 0) {
199	 		xHeadx = -xHead;
200	 		xTailx = -xTail;
201  		} 
202		else {
203	 		xHeadx = xHead;
204	 		xTailx = xTail;
205  		}
206
207		ddx.f = _ExpInnerLD(xHeadx, xTailx, 0.0 , &extraword , 1); 	//get .5*e^x  
208  		
209		if (fabs(xHead) > 39.0) {
210		
211            if (xHead < 0) ddx.f = - ddx.f;
212
213			reshi = ddx.d[0];
214			ddx.d[0] =  __FADD( ddx.d[0],  ddx.d[1] );
215			if((ddx.i[0] & 0x7FFFFFFFu)>= 0x7ff00000u) {
216				// special cases : NaN and infinity
217				ddx.d[1] = 0.0;
218			}
219			else {
220				ddx.d[1] = (reshi -  ddx.d[0]) + ddx.d[1];
221			}
222                        FESETENVD(fpenv);
223			return (ddx.f);
224		} 
225		else {
226	 		t1 = ddx.d[0];																// zres = .5*e^x
227	 		t2 = ddx.d[1];																// Def. of sinh: (e^x - e^-x)/2
228	 		r1=0.25/t1;
229	 		residual=__FNMSUB( t2, r1, __FNMSUB( t1, r1, 0.25 ) );	// (0.25-t1*r1)-t2*r1;
230	 		r2=residual*(4.0*r1);														// (r1,r2)=.5*e^-x
231	 		residual1 =__FNMSUB( t2, r1, __FNMSUB( t1, r1, 0.25 ) - residual ); // (0.25-t1*r1)-residual-t2*r1;
232	 		r3=__FMSUB(t1, r2, residual );		//(t1*r2-residual);						// (reversed sign)
233	 		r4=__FMADD( extraword, r1, t2*r2 );	//(extraword*r1+(t2*r2));				// (reversed sign)
234	 		reshi=__FSUB( t1, r1 );
235	 		reslow=__FSUB( t2, r2 );
236	 		residual=(t1-reshi)-r1;														// exact
237	 		if (fabs(t2) > fabs(r2)) 
238				residual2=(t2-reslow)-r2;												// exact
239	 		else 
240				residual2=t2-(reslow+r2);												// exact
241	 		r5=(r3+r4-residual1)*(4.0*r1);
242	 		resnew=__FADD( reshi, reslow );
243	 		residual3=reshi-resnew+reslow;												// exact
244	 		residual4=__FADD( residual, residual3 );
245	 		reshi=__FADD( resnew, residual4 );
246	 		residual5=residual-residual4+residual3;
247	 		residual6=resnew -reshi+residual4;
248	 		reslow=(residual2+(extraword+r5))+residual5+residual6;
249	 		if (xHead <	0.0) {															// Reverse signs for
250				reshi = -reshi;															// negative arguments
251				reslow = -reslow;
252			}
253			ddx.d[0] = __FADD( reshi, reslow );											// build a cannonical result
254			ddx.d[1] = (reshi - ddx.d[0]) + reslow; 
255                        FESETENVD(fpenv);
256			return (ddx.f);
257		}
258	}       // end of abs(xHead) > 0.693147
259
260}
261
262
263long double coshl(long double x)
264{
265	double xHead, xTail, xHeadx, xTailx;
266 	double r1, r2, r3, r4, r5, t1, t2;
267	double residual, residual1, residual2, residual3, residual4, residual5, residual6;
268	double  res, resnew, reshi, reslow;
269	double remainder, partial, extraword;
270	double argsq, argsq2;
271	double prod, prodlow, sum, suml, temp, templow;
272	double bottom;
273	uint32_t hibits, signx;
274	int i;
275    double fpenv;
276	DblDbl ddx;
277	double coeff[9] = {
278		306.0,  73440.0,  13366080.0,  1764322560.0,  158789030400.0,  8892185702400.0,  
279		266765571072000.0,  3201186852864000.0, 6402373705728000.0
280	};
281	double recip = 1.561920696858622646221636435005e-16;
282
283	ddx.f = x;
284	xHead = ddx.d[0];
285	xTail = ddx.d[1];
286	hibits = ddx.i[0] & 0x7FFFFFFFu;		// highest 32 bits as uint32_t
287    signx = (ddx.i[0] >> 31) & 1u;			// get sign of head
288	
289	// NaNs and infinities
290	
291	if (hibits >= 0x7ff00000u){			// special cases: NaN and infinity
292		if (xHead != xHead) {				// x is a NaN
293			return x;
294		}
295		else {								// For +/- infinity return +infinity
296			if (signx)
297				return (-x);
298			else
299				return x;
300		}
301	}
302
303
304	// x = zero hence xHead is zero
305
306	if ((hibits | ddx.i[1]) == 0)
307		return (long double) 1.0;      
308
309	// x is not zero, infinity nor a NaN
310	
311        FEGETENVD(fpenv);
312        FESETENVD(0.0);
313
314	//for small x, |x| <= 0.125 use power series
315	if (hibits <= 0x3fc00000u) {
316		temp=2.0*xHead*xTail;        
317		argsq=__FMADD( xHead, xHead, temp );				// xHead*xHead+temp;         
318		argsq2=__FMADD( xTail, xTail, __FMSUB( xHead, xHead, argsq ) + temp ); // xHead*xHead-argsq+temp+xTail*xTail;
319		sum=1.0;
320		suml=0.0;
321	 	sum=__FMADD( argsq, sum, coeff[0] );				// coeff[0]+argsq*sum;
322	 	sum=__FMADD( argsq, sum, coeff[1] );				// coeff[1]+argsq*sum;
323	 	sum=__FMADD( argsq, sum, coeff[2] );				// coeff[2]+argsq*sum;
324		for (i=3; i< 8; i++) {
325			templow=__FMADD( suml, argsq, sum*argsq2 );		// suml*argsq+sum*argsq2;
326			temp=__FMADD( sum, argsq, templow );			// sum*argsq+templow;
327			bottom=__FMSUB( sum, argsq, temp) + templow;	// sum*argsq-temp+templow;
328			sum=__FADD( coeff[i], temp );
329			residual=coeff[i]-sum+temp;
330			suml=bottom+residual;
331		}
332	 	prodlow=__FMADD( suml, argsq, sum*argsq2 );			// suml*argsq+sum*argsq2;		// mult. by arg^2
333	 	prod=__FMADD( sum, argsq, prodlow );				// sum*argsq+prodlow;
334	 	prodlow=__FMSUB( sum, argsq, prod) + prodlow;		// sum*argsq-prod+prodlow;
335	 	sum=prod*recip;																		// sum of series ---
336	 	remainder=__FNMSUB( sum, coeff[8], prod );			// (prod-sum*coeff[8]);
337	 	partial=__FADD( remainder, prodlow );
338		residual=remainder-partial+prodlow;
339	 	suml=__FMADD( partial, recip, residual*recip );		// partial*recip+(residual*recip);
340		res=__FADD(1.0, sum );																// except for 1.0 term.
341	 	reslow=1.0 - res+sum+suml;
342		ddx.d[0] = __FADD( res, reslow );													// build a cannonical result
343		ddx.d[1] = (res - ddx.d[0]) + reslow; 
344                FESETENVD(fpenv);
345		return (ddx.f);
346	}
347
348	// |x| > 0.125 
349	// else if (hibits > 0x3fc00000u) {   	// abs(xHead) > 0.125  <-- Unnecessary test
350	else {   								// abs(xHead) > 0.125
351		if (signx) {
352			xHeadx = -xHead;
353			xTailx = -xTail;
354		}
355		else{
356			xHeadx = xHead;
357			xTailx = xTail;
358		} 
359		ddx.f = _ExpInnerLD (xHeadx, xTailx, 0.0 , &extraword , 1); 	// get .5*e^x 
360
361		// |x| > 40.0		
362		if (hibits > 0x40440000u) { 		// abs(xHead) > 40.0
363
364			reshi = ddx.d[0];
365			ddx.d[0] =  __FADD( ddx.d[0],  ddx.d[1] );
366			if((ddx.i[0] & 0x7FFFFFFFu)>= 0x7ff00000u) {  // special cases : NaN and infinity
367				ddx.d[1] = 0.0;
368			}
369			else {
370				ddx.d[1] = (reshi -  ddx.d[0]) + ddx.d[1];
371			}
372                        FESETENVD(fpenv);
373			return (ddx.f);
374		}
375		// 0.125 <= |x| < 40.0	
376		else{
377	 		t1 = ddx.d[0];																// zres=.5*e^x
378	 		t2 = ddx.d[1];																// def. of cosh: (e^x + e^-x)/2
379	 		r1=0.25/t1;
380	 		residual=__FNMSUB( t2, r1, __FNMSUB( t1, r1, 0.25 ) );	// (0.25-t1*r1)-t2*r1;
381	 		r2=residual*(4.0*r1);														// (r1,r2)=.5*e^-x + errors< 4 ulps
382	 		residual1 =__FNMSUB( t2, r1, __FNMSUB( t1, r1, 0.25 ) - residual ); // (0.25-t1*r1)-residual-t2*r1;
383											// rest of rem. from HO divisor only
384	 		r3=__FMSUB(t1, r2, residual );		//(t1*r2-residual);						// (reversed sign)...LO quotient
385			r4=__FMADD( extraword, r1, t2*r2 );	//(extraword*r1+(t2*r2));				// (reversed sign)...LO quotient
386	 		reshi=__FADD( t1, r1 );
387	 		reslow=__FADD( t2, r2 );
388	 		residual=(t1-reshi)+r1;														// exact
389	 		if (fabs(t2) >  fabs(r2)) 
390	   			residual2=(t2-reslow)+r2;												// exact
391	 		else
392	  			 residual2=r2-reslow+t2;												// exact
393	 		r5=(r3+r4-residual1)*(4.0*r1);												// reversed sign
394	 		resnew=__FADD( reshi, reslow );
395	 		residual3=reshi-resnew+reslow;												// exact
396	 		residual4=__FADD( residual, residual3 );
397	 		reshi=__FADD( resnew, residual4 );
398	 		residual5=residual-residual4+residual3;
399	 		residual6=resnew-reshi+residual4;
400	 		reslow=(residual2+(extraword-r5))+residual5+residual6;
401			ddx.d[0] = __FADD( reshi, reslow );											// build a cannonical result
402			ddx.d[1] = (reshi - ddx.d[0]) + reslow; 
403                        FESETENVD(fpenv);
404			return (ddx.f);
405		}
406	}       // end of abs(xHead) > 0.125
407}
408
409
410long double tanhl(long double x)
411{
412
413	double r, res, res2, res3, res4,res5, resfin, reslow;
414	double rem, rem1, rem2;
415	uint32_t signx, hibits, ndx ;
416	double q, qq, qa, qb, qc, q1, quot, quot2, quot3;
417	double arg, arg1, arg2, argsq, argsq2;
418	double sum, sum2, suml, suml2;
419	double frac, frac2, fac, fac2;
420	double ra, rb, rc, rd, re, rf;
421	double ans, ansx, ansl, ansmid, anstiny, anslow, almost;
422	double prod, prodlow, prodx, prodlowx, proderr, pl;
423	double top, top2, den, den1, denf, denf2;
424	double rg, t2, t3, tsq, tsql;
425	double az, bz, temp, temp1, templow, carry, carry1, extra;
426	double bump, residual, tiny, recip;
427	Double DeN;
428	double fpenv;
429	double xHead, xTail;
430	DblDbl ddx;
431	int i;
432
433	struct TanhTblEntry *TanhTblPtr = (struct TanhTblEntry *)TanhTbl;
434	struct TanhCoeffEntry *TanhCoeffPtr = (struct TanhCoeffEntry *) TanhCoeff;
435
436	ddx.f = x;
437	xHead = ddx.d[0];
438	xTail = ddx.d[1];
439	hibits = ddx.i[0] & 0x7FFFFFFFu;			// highest 32 bits as uint32_t
440    signx = (ddx.i[0] >> 31) & 1u;				// get sign of head
441	
442	// NaN or zero
443	
444	if ((xHead != xHead)||((hibits | ddx.i[1]) == 0))
445		return x;
446
447        FEGETENVD(fpenv);
448        FESETENVD(0.0);
449		
450	//for infinity or large x , |xHead| > 40.0 
451
452	if (hibits > 0x40440000u) {				// results = 1, for big args.
453        FESETENVD(fpenv);
454		if (signx)
455			return (long double) (-1.0);		// sign of result= sign of arg
456		else
457			return (long double) (1.0);
458    }
459
460	// x is not zero, infinity nor a NaN
461	
462	if (signx) {
463		xHead = -xHead;
464		xTail = -xTail;
465	}
466
467
468	// values of |xHead| with 2.0<= |xHead| <= 40.0
469
470	if (hibits > 0x40000000u) {	  
471		//****************************************************************************
472		//                                                                           *
473		//                         1.0                                               *
474		//      tanh(x)= 1 -  -------------           |x| > 2.0                      *
475		//                    .5 +.5*e^(2x)                                          *
476		//                                                                           *
477		//****************************************************************************
478		xHead=2.0*xHead;						// double arg
479		xTail=2.0*xTail;
480       
481		ddx.f = _ExpInnerLD (xHead, xTail, 0.0 , &extra, 1 );							// compute e^(2*arg)
482		az = ddx.d[0];
483       	bz = ddx.d[1];
484       	temp=__FADD( az, 0.5 );															// az >= 0.5
485       	carry=az-temp+0.5;																// exact
486       	temp1=__FADD( carry, bz );
487       	carry1=carry-temp1+bz;
488       	den=__FADD( temp, temp1 );
489       	den1=temp-den+temp1+carry1;
490       	r=1.0/den;																		// guess recip. of denominator
491       	rem=__FNMSUB( den, r, 1.0 );			// 1.0-den*r;							// first remainder
492       	rem1=__FNMSUB( r, den1, rem );			// rem-r*den1;							// (rem1,rem2) is better rem.
493       	if (fabs(rem) > fabs(r*den1)) 
494        	rem2=__FNMSUB( r, den1, rem-rem1 ); // rem-rem1-r*den1;
495       	else {
496          	rem1=__FNMSUB( r, den1, rem ); // rem-(r*den1);
497          	/* rem2=-(r*den1)-rem1+rem -(r*den1-(r*den1)); */
498			rem2 = rem - __FMADD( r, den1, rem1 ) - __FMSUB( r, den1, (r*den1) );
499       	} 
500       	qq=__FMADD( r, rem1, r ); // r+r*rem1; 							// going for the full quotient
501       	qa=__FMADD( r, rem1, r-qq ); // r-qq+r*rem1;
502       	qb=__FMADD( r, rem2, qa ); // qa+r*rem2;
503       	q=__FADD( qq, qb );
504       	// qc=qa-qb+r*rem2-(extra*r)*r;
505		qc = __FNMSUB( extra*r, r, __FMADD( r, rem2, qa - qb ) );
506       	q1=qq-q+qb+qc;
507       	res=__FSUB( 1.0, q );															// high order result
508       	res2=1.0-res-q;																	// exact low order
509       	res3=__FSUB( res2, q1 );
510       	resfin=__FADD( res, res3 );
511       	res4=res2-res3-q1;
512       	res5=res-resfin+res3;
513       	reslow=res4+res5;
514       	if (signx) {
515          	resfin=-resfin;
516          	reslow=-reslow;
517       	} 
518		ddx.d[0] = __FADD( resfin, reslow );											// build a cannonical result
519		ddx.d[1] = (resfin - ddx.d[0]) + reslow; 
520                FESETENVD(fpenv);
521		return (ddx.f);
522	}
523
524	// else if (hibits <=  0x40000000u) {		// |xHead| < 2.0  <-- Unnecessary test
525	else {										// |xHead| < 2.0
526	
527		//***************************************************************************
528		//                                                                          *
529		//     Accurate table is used to reduce small arguments such that the       *
530		//     range of the |reduced argument| < 1/32. The tanh addition formula    *
531		//     is used to piece together three tanhs                                *
532		//                                                                          *
533		//***************************************************************************
534
535		DeN.f = __FMADD( xHead, 16.0, kBig ); // kBig+xHead*16.0;				// compute table lookup index
536 	 	ndx = DeN.i[1];
537       	if (hibits <  0x3fb00000u) ndx=0;		// |xHead| < 0.0625
538       	arg1=xHead-TanhTblPtr[(int32_t)ndx].One;			// reduce argument by table value
539												// which is close to ndx/16 (exact)
540       	arg=__FADD( arg1, xTail );							// full 53 bit argument
541       	arg2=arg1-arg+xTail;					// low order argument
542		//************************************************************************************
543		//     The argument has been broken up as follows:                             		 *
544		//                   Zarg=TanhTblPtr[ndx].One +arg+arg2                              *
545		//                                                                             		 *
546		//     tanh(TanhTblPtr[ndx].One) is read from tantbl(2,ndx),tantbl(3,ndx)            *
547		//         with at least 122 bits of precision                                 		 *
548		//                   tanh(arg2)=arg2+o(2^-172)                                 		 *
549		//                                                                             		 *
550		//     Compute tanh(arg) by economized power series.                           		 *
551		//     the task is to piece together the three parts                     			 *
552		//                                                                             		 *
553		//                                      (tanh(xTail)+tanh(c))(1-tanh^2(xHead))    	 *
554		// tanh(xHead+xTail+c)=tanh(xHead) +   ------------------------------------------    *
555		//                         1+tanh(xHead) tanh(xTail)+tanh(c)(tanh(xHead)+tanh(xTail))*
556		//                                                                             		 *
557		//************************************************************************************
558     	sum=TanhCoeffPtr[10].One;
559      	suml=TanhCoeffPtr[10].Two;
560      	argsq=arg*arg;
561      	argsq2=__FMSUB( arg, arg, argsq ); //	arg*arg-argsq;					// arg^2 for series
562     	for (i = 9; i > 0; i--) { 
563    		pl=__FMADD( suml, argsq, sum*argsq2 ); // suml*argsq+sum*argsq2;
564         	prod=__FMADD( sum, argsq, pl ); // sum*argsq+pl;
565         	prodlow=__FMSUB( sum, argsq, prod ) + pl; // sum*argsq-prod+pl;
566         	sum=__FADD( TanhCoeffPtr[i].One, prod );		// add in the next coefficient
567         	sum2=TanhCoeffPtr[i].Two+prodlow;
568         	suml=TanhCoeffPtr[i].One-sum+prod+sum2;
569		}
570      	pl=__FMADD( suml, argsq, sum*argsq2 ); // suml*argsq+sum*argsq2;
571      	temp=__FMADD( sum, argsq, pl ); // sum*argsq+pl;
572      	templow=__FMSUB( sum, argsq, temp ) + pl; // sum*argsq-temp+pl;
573      	pl=templow*arg;							// last multiplication by arg
574      	prodx=__FMADD( temp, arg, pl );			// temp*arg+pl;
575      	prodlowx=__FMSUB( temp, arg, prodx ) + pl; // temp*arg-prodx+pl;				// tanh(arg)-1.0  is done.
576      	prod=__FADD( arg, prodx );
577      	prodlow=arg-prod+prodx+prodlowx;		// tanh(arg) is done.
578      
579	  	if (hibits <  0x3fb00000u) {			// |xHead| < 0.0625, trivial: tanh(xHead)=0 in formula
580         	proderr=(arg-prod+prodx)-prodlow+prodlowx;
581         	bump=__FNMSUB( prod, prod*arg2, arg2 ); // arg2-prod*prod*arg2;
582         	reslow=__FADD( prodlow, bump );
583         	residual=prodlow-reslow+bump+proderr;
584         	res=__FADD( reslow, prod );
585         	reslow=prod-res+reslow+residual;
586         	if (signx) {
587            	res=-res;
588            	reslow=-reslow;
589         	} 
590
591			ddx.d[0] = __FADD( res, reslow );			// build a cannonical result
592			ddx.d[1] = (res - ddx.d[0]) + reslow; 
593                        FESETENVD(fpenv);
594			return (ddx.f);
595      	} 
596      
597		tiny=arg2*(TanhTblPtr[(int32_t)ndx].Two+prod);	// The last addend in denominator
598		
599      	// pl=TanhTblPtr[(int32_t)ndx].Two*prodlow+TanhTblPtr[(int32_t)ndx].Three*prod;  // tanh(xHead)*tanh(xTail)
600		pl = __FMADD( TanhTblPtr[(int32_t)ndx].Two, prodlow, TanhTblPtr[(int32_t)ndx].Three*prod );
601		
602      	// den=TanhTblPtr[(int32_t)ndx].Two*prod+pl;		// denominator completed except
603		den = __FMADD( TanhTblPtr[(int32_t)ndx].Two, prod, pl );
604		
605      	// den1=TanhTblPtr[(int32_t)ndx].Two*prod-den+pl+tiny;			// for the +1 term.
606		den1 = __FMSUB( TanhTblPtr[(int32_t)ndx].Two, prod, den ) + pl + tiny;
607		
608      	suml2=prodlow+arg2;						// start on numerator
609      	t2=2.0*TanhTblPtr[(int32_t)ndx].Two*TanhTblPtr[(int32_t)ndx].Three;	// table look up value
610		
611      	// tsq=TanhTblPtr[(int32_t)ndx].Two*TanhTblPtr[(int32_t)ndx].Two+t2;		// squared.
612		tsq = __FMADD( TanhTblPtr[(int32_t)ndx].Two, TanhTblPtr[(int32_t)ndx].Two, t2 );
613		
614      	// tsql=TanhTblPtr[(int32_t)ndx].Two*TanhTblPtr[(int32_t)ndx].Two-tsq+t2;
615		tsql = __FMSUB( TanhTblPtr[(int32_t)ndx].Two, TanhTblPtr[(int32_t)ndx].Two, tsq ) + t2;
616		
617      	/* t3=2.0*TanhTblPtr[(int32_t)ndx].Two*TanhTblPtr[(int32_t)ndx].Three-t2 +
618    		TanhTblPtr[(int32_t)ndx].Three*TanhTblPtr[(int32_t)ndx].Three; */
619		t3 = __FMADD( TanhTblPtr[(int32_t)ndx].Three, TanhTblPtr[(int32_t)ndx].Three, 
620				__FMSUB( 2.0*TanhTblPtr[(int32_t)ndx].Two, TanhTblPtr[(int32_t)ndx].Three, t2 ) );
621				
622      	denf=__FADD( 1.0, den );							// denominator is really done now
623      	denf2=1.0-denf+den+den1;
624      	fac=__FSUB( 1.0, tsq );							// compute 1-tanhtble(ndx)^2 becomes
625      	residual=1.0-fac-tsq;					// (fac, fac2)
626      	fac2=residual-tsql-t3;
627      	pl=__FMADD( fac, suml2, fac2*prod ); // fac*suml2+fac2*prod;					// (xTail+c)(1-xHead*xHead) ... above formula
628      	top=__FMADD( fac, prod, pl ); // fac*prod+pl;
629      	top2=__FMSUB( fac, prod, top ) + pl; // fac*prod-top+pl;					// doing division
630      	recip=1.0/denf;
631      	quot=top*recip;
632      	ra=__FNMSUB( quot, denf, top ); // top-quot*denf;						// 3 part remainder:
633      	rb=-(quot*denf2);						// rem=ra+rb+top2
634      	rc= - __FMADD(quot, denf2, rb ); // -(quot*denf2+rb);					// 3rd order remainder
635      	rd=__FADD( top2, rb );								// sum 2nd order rems. into rf
636      	if (fabs(top2) > fabs(rb))    
637			re=top2-rd+rb;
638      	else 
639			re=rb-rd+top2;
640      	rf=__FADD( ra, rd );
641      	if (fabs(ra) > fabs(rd)) 
642         	rg=ra-rf+rd;						// more 3rd order rems.
643      	else									// rc+re+rg
644         	rg=rd-rf+ra;
645      	quot2=rf*recip;
646      	quot3=__FMSUB( rf, recip, quot2 ); // rf*recip-quot2;
647      	frac=__FADD( quot, quot2 );
648      	// frac2=quot-frac+quot2+(quot3+recip *(rc+re+rg));
649		frac2 = quot-frac+quot2 + __FMADD( recip, (rc+re+rg), quot3 );
650
651
652      	ansx=__FADD( TanhTblPtr[(int32_t)ndx].Two, frac );			// paste together the result
653      	ansl=__FADD( TanhTblPtr[(int32_t)ndx].Three, frac2 );
654      	ansmid=TanhTblPtr[(int32_t)ndx].Two-ansx+frac;
655      	anstiny=TanhTblPtr[(int32_t)ndx].Three-ansl+frac2;
656      	almost=__FADD( ansmid, ansl );
657      	ans=__FADD( ansx, almost );
658      	anslow=(ansx-ans+almost)+((ansmid-almost+ansl) +anstiny);
659      	if (signx) {
660         	ans=-ans;
661         	anslow=-anslow;
662      	} 
663 
664 		ddx.d[0] = __FADD( ans, anslow );				// build a cannonical result
665		ddx.d[1] = (ans - ddx.d[0]) + anslow; 
666                FESETENVD(fpenv);
667		return (ddx.f);
668	}
669}
670#endif