src/libm/Source/PowerPC/SqrtDD.c at fixPythonPipStalling

overby.me / darling-nix
fork atom
this repo has no description
fork atom
darling-nix / src / libm / Source / PowerPC / SqrtDD.c
at fixPythonPipStalling 254 lines 10 kB view raw
wrap content
Lubos Dolezel Restructured source tree to prepare for merge with the "darling" repo 10y ago
f228ae16
  1/*
  2 * Copyright (c) 2002 Apple Computer, Inc. All rights reserved.
  3 *
  4 * @APPLE_LICENSE_HEADER_START@
  5 * 
  6 * The contents of this file constitute Original Code as defined in and
  7 * are subject to the Apple Public Source License Version 1.1 (the
  8 * "License").  You may not use this file except in compliance with the
  9 * License.  Please obtain a copy of the License at
 10 * http://www.apple.com/publicsource and read it before using this file.
 11 * 
 12 * This Original Code and all software distributed under the License are
 13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
 17 * License for the specific language governing rights and limitations
 18 * under the License.
 19 * 
 20 * @APPLE_LICENSE_HEADER_END@
 21 */
 22//
 23//	SqrtDD.c
 24//
 25//	Double-double Function Library
 26//	Copyright: � 1995-96 by Apple Computer, Inc., all rights reserved
 27//	
 28//	long double sqrtl( long double x );
 29//
 30
 31#include "math.h"
 32#include "fp_private.h"
 33#if (__WANT_LONG_DOUBLE_FORMAT__ - 0L == 128L)
 34#include "DD.h"
 35
 36// sqrttable[256] contains initial estimates for the high eight significand 
 37// bits for Sqrt(x) and 0.5 times its reciprocal.
 38
 39static const unsigned short sqrttable[256] = {
 40       0x6b69, 0x6c68, 0x6e67, 0x6f65, 0x7064, 0x7263, 0x7361, 0x7460,
 41       0x765f, 0x775d, 0x795c, 0x7a5b, 0x7b5a, 0x7d58, 0x7e57, 0x7f56,
 42       0x8155, 0x8254, 0x8352, 0x8551, 0x8650, 0x874f, 0x894e, 0x8a4d,
 43       0x8b4c, 0x8c4b, 0x8e4a, 0x8f48, 0x9047, 0x9246, 0x9345, 0x9444,
 44       0x9543, 0x9742, 0x9841, 0x9940, 0x9a3f, 0x9c3e, 0x9d3d, 0x9e3c,
 45       0x9f3c, 0xa13b, 0xa23a, 0xa339, 0xa438, 0xa637, 0xa736, 0xa835,
 46       0xa934, 0xaa33, 0xac33, 0xad32, 0xae31, 0xaf30, 0xb02f, 0xb12e,
 47       0xb32e, 0xb42d, 0xb52c, 0xb62b, 0xb72a, 0xb92a, 0xba29, 0xbb28,
 48       0xbc27, 0xbd26, 0xbe26, 0xbf25, 0xc124, 0xc223, 0xc323, 0xc422,
 49       0xc521, 0xc621, 0xc720, 0xc81f, 0xca1e, 0xcb1e, 0xcc1d, 0xcd1c,
 50       0xce1c, 0xcf1b, 0xd01a, 0xd11a, 0xd219, 0xd418, 0xd518, 0xd617,
 51       0xd716, 0xd816, 0xd915, 0xda14, 0xdb14, 0xdc13, 0xdd13, 0xde12,
 52       0xdf11, 0xe111, 0xe210, 0xe310, 0xe40f, 0xe50e, 0xe60e, 0xe70d,
 53       0xe80d, 0xe90c, 0xea0b, 0xeb0b, 0xec0a, 0xed0a, 0xee09, 0xef09,
 54       0xf008, 0xf108, 0xf207, 0xf306, 0xf406, 0xf505, 0xf605, 0xf704,
 55       0xf804, 0xf903, 0xfa03, 0xfb02, 0xfc02, 0xfd01, 0xfe01, 0xff00,
 56       0x00ff, 0x01fd, 0x02fb, 0x03f9, 0x04f7, 0x05f5, 0x06f3, 0x07f2,
 57       0x08f0, 0x09ee, 0x0aec, 0x0bea, 0x0ce9, 0x0de7, 0x0ee5, 0x0fe4,
 58       0x10e2, 0x11e0, 0x12df, 0x13dd, 0x14db, 0x15da, 0x16d8, 0x17d7,
 59       0x17d5, 0x18d4, 0x19d2, 0x1ad1, 0x1bcf, 0x1cce, 0x1dcc, 0x1ecb,
 60       0x1fc9, 0x20c8, 0x20c6, 0x21c5, 0x22c4, 0x23c2, 0x24c1, 0x25c0,
 61       0x26be, 0x27bd, 0x27bc, 0x28ba, 0x29b9, 0x2ab8, 0x2bb7, 0x2cb5,
 62       0x2db4, 0x2db3, 0x2eb2, 0x2fb0, 0x30af, 0x31ae, 0x32ad, 0x33ac,
 63       0x33aa, 0x34a9, 0x35a8, 0x36a7, 0x37a6, 0x37a5, 0x38a4, 0x39a3,
 64       0x3aa2, 0x3ba0, 0x3c9f, 0x3c9e, 0x3d9d, 0x3e9c, 0x3f9b, 0x409a,
 65       0x4099, 0x4198, 0x4297, 0x4396, 0x4495, 0x4494, 0x4593, 0x4692,
 66       0x4791, 0x4890, 0x488f, 0x498e, 0x4a8d, 0x4b8c, 0x4b8c, 0x4c8b,
 67       0x4d8a, 0x4e89, 0x4e88, 0x4f87, 0x5086, 0x5185, 0x5284, 0x5283,
 68       0x5383, 0x5482, 0x5581, 0x5580, 0x567f, 0x577e, 0x587e, 0x587d,
 69       0x597c, 0x5a7b, 0x5b7a, 0x5b79, 0x5c79, 0x5d78, 0x5d77, 0x5e76,
 70       0x5f76, 0x6075, 0x6074, 0x6173, 0x6272, 0x6372, 0x6371, 0x6470,
 71       0x656f, 0x656f, 0x666e, 0x676d, 0x686d, 0x686c, 0x696b, 0x6a6a
 72      };
 73
 74
 75
 76//****************************************************************************
 77// FUNCTION:  long double __sqrtldextra( double x, double y, double *pextra )
 78//
 79// DESCRIPTION:  treats x and y as the head and tail, respectively of a long
 80//               double argument X.  Returns the square root of X rounded to
 81//               long double format and sets *pextra to a double value which
 82//               contains a correction factor which contains about 8 extra
 83//               bits of precision for the square root.  This extra precision
 84//               is used by the long double ArcCos implementation.
 85//	      
 86//              The main square root refinement algorithm is that of P.
 87//              Markstein, IBM J. R&D, January, 1990, p.117.
 88//
 89// ASSUMPTIONS: x and y represent a finite positive long double in canonical
 90//              form and rounding direction is currently set to the default
 91//              (IEEE round-to-nearest) mode.
 92//
 93//***************************************************************************
 94
 95long double __sqrtldextra( double x, double y, double *pextra )
 96{
 97	DblDbl zDD;
 98	Double xD, guessD, recipD;
 99	double scale, guess, recip, diff, recip2, epsilon, guessnew, gsq, guesslow;
100	double gsqlow, gg, gmid, gmidlow, recipnew, diff1, diff3, diff3x, diff3a;
101	double diff3ax, diff4, diff5, glow2, diff6, diff7, firstfix, fixup;
102	uint32_t ghead, reciphead, ival;
103	int index;
104	
105	const Double scaleupD = {{0x5ff00000,0x0}};
106	const Double scaledownD = {{0x1ff00000,0x0}};
107	const Double adjustupD = {{0x4ff00000,0x0}};
108	const Double adjustdownD = {{0x2ff00000,0x0}};
109	
110	xD.f = x;
111	scale = 1.0;
112	
113	if (xD.i[0] < 0x07000000u) {			// tiny argument
114		x *= scaleupD.f;					//   scale up
115		y *= scaleupD.f;
116		xD.f = x;
117		scale = adjustdownD.f;				// final scale factor
118	}
119	else if (xD.i[0] > 0x7fdfffffu) {		// huge argument
120		x *= scaledownD.f;					//   scale down away from
121		y *= scaledownD.f;					//   overflow threshold
122		xD.f = x;
123		scale = adjustupD.f;				//   final scale factor
124	}
125	
126	// estimate exponents of square root and 0.5*reciprocal square root
127	ghead = ((xD.i[0] + 0x3ff00000u) >> 1) & 0x7ff00000u;
128	reciphead = 0x7fc00000u - ghead;
129	
130	// initialize initial significand estimates for square root and 0.5*reciprocal
131	// from sqrttable entry indexed based upon significand of x and low exponent
132	// bit
133	index = (int)((xD.i[0] >> 13) & 0xffu);
134	ival = (uint32_t) sqrttable[index];
135	guessD.i[1] = 0u;
136	guessD.i[0] = ghead + ((ival & 0xff00u) << 4);
137	recipD.i[0] = reciphead + ((ival & 0xffu) << 12);
138	recipD.i[1] = 0u;
139	
140	guess = guessD.f;
141	recip = recipD.f;
142	
143	// iterate to refine square root and its reciprocal (MAF required)
144	diff = __FNMSUB( guess, guess, x );	    // x - guess*guess;
145	recip2 = recip + recip;			    // recip2 is approx 1.0/guess
146	guess = __FMADD( diff, recip, guess );	    //guess + diff*recip;		// 16-bit square root
147	epsilon = __FNMSUB( recip, guess, 0.5 );    // 0.5 - recip*guess;		// begin Newton-Raphson iteration
148	diff = __FNMSUB( guess, guess, x );	    // x - guess*guess;
149	recip = __FMADD( epsilon, recip2, recip );  // recip + epsilon*recip2;		// 16-bit reciprocal
150	guess = __FMADD( recip, diff, guess );	    // guess + recip*diff;		// 32-bit square root
151	recip2 = recip + recip;
152	epsilon = __FNMSUB( recip, guess, 0.5 );    // 0.5 - recip*guess;
153	diff = __FNMSUB( guess, guess, x ) + y;	    // (x - guess*guess) + y;
154	recip = __FMADD( epsilon, recip2, recip );					// 32-bit reciprocal
155	guessnew = __FMADD( recip, diff, guess );   // guess + recip*diff;		// 53-bit square root
156	recip2 = recip + recip;
157	gsq = __FMUL( guessnew, guessnew );	    // NOTE: Departure from P. Markstein
158						    //   for greater accuracy.
159	guesslow = __FMADD( recip, diff, (guess - guessnew) );	    // (guess - guessnew) + recip*diff;
160	gsqlow = __FMSUB( guessnew, guessnew, gsq );// guessnew*guessnew - gsq;
161	gg = guessnew + guessnew;		    // (guessnew,guesslow)^2 must be
162						    //   computed to sextuple precision
163	gmid = __FMUL( gg, guesslow );		    //   in order to control errors to
164	gmidlow = __FMSUB( gg, guesslow, gmid );    // gg*guesslow - gmid;		//   less than 1/2 ulp
165	epsilon = __FNMSUB( recip, guessnew, 0.5 ); // 0.5 - recip*guessnew;
166	recip2 = recip + recip;
167	recipnew = __FMADD( epsilon, recip2, recip );
168	diff1 = x - gsq;			    // exact
169	diff3 = __FSUB( diff1, gmid );		    // not necessarily exact
170	diff3x = diff1 - diff3 - gmid;		    // usually inexact
171	
172	diff3a = __FSUB( diff3, gsqlow );	    // not necessarily exact
173	diff3ax = diff3 - (diff3a + gsqlow);	    // exact
174	diff4 = diff3a + y;			    // usually inexact
175	
176	diff5 = diff4 - gmidlow;		    // error < ulp/1024
177	glow2 = guesslow*guesslow;		    // error < ulp/(2^50)
178	diff6 = __FSUB( diff5, glow2 );		    // diff5 - glow2;			// error < ulp/1024
179	diff7 = diff6 + (diff3x + diff3ax);	    // error < ulp/1024
180	firstfix = recipnew*diff7;							// total error < ulp/256
181	fixup = __FADD( guesslow, firstfix );	    // guesslow + firstfix;
182	guess = __FADD( guessnew, fixup );	    // put in canonical form
183	fixup = guessnew - guess + fixup;
184	
185	zDD.d[0] = guess*scale;			    // scale result and extra bits;
186	zDD.d[1] = fixup*scale;
187	*pextra = ((guesslow - fixup) + firstfix)*scale;
188	return (zDD.f);				    // return long double square root
189}
190 
191long double sqrtl( long double x )
192{
193	DblDbl xDD;
194	long double y;
195	double envD;
196	double extra;
197	uint32_t hibits;
198	
199	xDD.f = x;
200	hibits = xDD.i[0];				// high 32 bits of head
201	
202	// Non-trivial case (positive finite x) call __sqrtldextra with default
203	//   rounding in effect and discard extra precision bits.
204	
205	if ((xDD.d[0] != 0.0) && (hibits < 0x7ff00000u)) {
206                FEGETENVD(envD);
207                FESETENVD(0.0);
208		y = __sqrtldextra(xDD.d[0],xDD.d[1],&extra);
209		FESETENVD(envD);			// restore caller's env
210		return (y);
211	}
212	
213	// Separate edge cases into trivial and exceptional cases
214	
215	if ((hibits < 0x80000000u) || (xDD.d[0] == 0.0) || (xDD.d[0] != xDD.d[0]))
216		return (x);					// zero, +INF, or NaN
217	
218	// Negative ordered arguments result in NaN;
219	xDD.i[0] = 0x7ff80000u;		// return value is NaN
220	xDD.i[1] = 0u;
221	xDD.i[2] = 0x7ff80000u;
222	xDD.i[3] = 0u;
223	return (xDD.f);					// return NaN
224}
225
226#include "math.h"
227long double cbrtl( long double x )
228{
229	DblDbl xDD;
230	double arg, guess;
231	long double ldguess, ldguess2, sqldguess;
232	const long double oneThird = 0.3333333333333333333333333333333L;
233	
234	xDD.f = x;
235	
236	arg = xDD.d[0];
237	
238	if (arg != arg || arg == 0.0 || arg == INFINITY || -arg == INFINITY)
239	    return x;
240	    
241	guess = cbrt(arg);
242	
243	xDD.d[0] = guess;
244	xDD.d[1] = 0.0;
245	
246	ldguess = xDD.f;
247	
248	sqldguess = ldguess * ldguess;
249	ldguess2 = ldguess + ldguess;
250	
251	return (ldguess2 + x/sqldguess)*oneThird;
252}
253    
254#endif