this repo has no description
1/*
2 * Copyright (c) 2002 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
11 *
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
19 *
20 * @APPLE_LICENSE_HEADER_END@
21 */
22/*******************************************************************************
23* *
24* File sqrt.c, *
25* Function square root (IEEE-754). *
26* *
27* Implementation of square root based on the approximation provided by *
28* Taligent, Inc. who received it from IBM. *
29* *
30* Copyright � 1996-2001 Apple Computer, Inc. All rights reserved. *
31* *
32* Modified and ported by A. Sazegari, started on June 1996. *
33* Modified and ported by Robert A. Murley (ram) for Mac OS X. *
34* *
35* A MathLib v4 file. *
36* *
37* This file was received from Taligent which modified the original IBM *
38* sources. It returns the square root of its double argument, using *
39* Markstein algorithm and requires MAF code generation. *
40* *
41* The algorithm uses Newton�s method to calculate the IEEE-754 correctly *
42* rounded double square root, starting with 8 bit estimate for: *
43* g � �x and y � 1/2�x. Using MAF instructions, each iteration refines *
44* the original approximations g and y with rounding mode set to nearest. *
45* The final step calculates g with the caller�s rounding restored. This *
46* in turn guarantees proper IEEE rounding and exceptions. INEXACT is the *
47* only possible exception raised in this calculation. Initial guesses *
48* for g and y are determined from argument x via table lookup into the *
49* array SqrtTable[256]. *
50* *
51* Note: NaN is set computationally. It will be changed to the proper *
52* NaN code later. *
53* *
54* April 14 1997: added this file to mathlib v3 and included mathlib�s *
55* nan code. *
56* July 23 2001: replaced __setflm with FEGETENVD/FESETENVD. *
57* August 28 2001: added #ifdef __ppc__. *
58* replaced DblInHex typedef with hexdouble. *
59* used standard exception symbols from fenv.h. *
60* September 09 2001: added more comments. *
61* September 10 2001: added macros to detect PowerPC and correct compiler. *
62* September 18 2001: added <CoreServices/CoreServices.h> to get <fp.h>. *
63* October 08 2001: removed <CoreServices/CoreServices.h>. *
64* changed SqrtTable to const private_extern *
65* changed compiler errors to warnings. *
66* November 06 2001: commented out warning about Intel architectures. *
67* November 19 2001: <scp> Added single precision implementation. *
68* *
69* W A R N I N G: *
70* These routines require a 64-bit double precision IEEE-754 model. *
71* They are written for PowerPC only and are expecting the compiler *
72* to generate the correct sequence of multiply-add fused instructions. *
73* *
74* These routines are not intended for 32-bit Intel architectures. *
75* *
76* A version of gcc higher than 932 is required. *
77* *
78* GCC compiler options: *
79* optimization level 3 (-O3) *
80* -fschedule-insns -finline-functions -funroll-all-loops *
81* *
82*******************************************************************************/
83
84#include "fenv_private.h"
85#include "fp_private.h"
86#include "math.h"
87#include <System/ppc/cpu_capabilities.h>
88
89inline static uint32_t __attribute__((always_inline))
90_get_cpu_capabilities(void)
91{
92 uint32_t caps;
93 asm("lwz %0, %1(0)" : "=r" (caps) : "I" (_COMM_PAGE_CPU_CAPABILITIES));
94 return caps;
95}
96
97#define twoTo512 1.34078079299425971e154 // 2**512
98#define twoToMinus256 8.636168555094444625e-78 // 2**-256
99#define upHalfOfAnULP 0.500000000000000111 // 0x1.0000000000001p-1
100
101#define SQRT_NAN "1"
102
103/*******************************************************************************
104* SqrtTable[256] contains initial estimates for the high significand bits *
105* of sqrt and correction factor. This table is also used by the IBM *
106* arccos and arcsin functions. *
107*******************************************************************************/
108
109__private_extern__
110const unsigned short SqrtTable[256] =
111 {
112 27497, 27752, 28262, 28517, 28772, 29282, 29537, 29792, 30302, 30557, 31067, 31322,
113 31577, 32088, 32343, 32598, 33108, 33363, 33618, 34128, 34384, 34639, 35149, 35404,
114 35659, 35914, 36425, 36680, 36935, 37446, 37701, 37956, 38211, 38722, 38977, 39232,
115 39487, 39998, 40253, 40508, 40763, 41274, 41529, 41784, 42040, 42550, 42805, 43061,
116 43316, 43571, 44082, 44337, 44592, 44848, 45103, 45358, 45869, 46124, 46379, 46635,
117 46890, 47401, 47656, 47911, 48167, 48422, 48677, 48933, 49443, 49699, 49954, 50209,
118 50465, 50720, 50976, 51231, 51742, 51997, 52252, 52508, 52763, 53019, 53274, 53529,
119 53785, 54296, 54551, 54806, 55062, 55317, 55573, 55828, 56083, 56339, 56594, 56850,
120 57105, 57616, 57871, 58127, 58382, 58638, 58893, 59149, 59404, 59660, 59915, 60170,
121 60426, 60681, 60937, 61192, 61448, 61703, 61959, 62214, 62470, 62725, 62981, 63236,
122 63492, 63747, 64003, 64258, 64514, 64769, 65025, 65280, 511, 510, 764, 1018,
123 1272, 1526, 1780, 2034, 2288, 2542, 2796, 3050, 3305, 3559, 3813, 4067,
124 4321, 4576, 4830, 5084, 5338, 5593, 5847, 6101, 6101, 6356, 6610, 6864,
125 7119, 7373, 7627, 7882, 8136, 8391, 8391, 8645, 8899, 9154, 9408, 9663,
126 9917, 10172, 10172, 10426, 10681, 10935, 11190, 11444, 11699, 11699, 11954, 12208,
127 12463, 12717, 12972, 13226, 13226, 13481, 13736, 13990, 14245, 14245, 14500, 14754,
128 15009, 15264, 15518, 15518, 15773, 16028, 16282, 16537, 16537, 16792, 17047, 17301,
129 17556, 17556, 17811, 18066, 18320, 18575, 18575, 18830, 19085, 19339, 19339, 19594,
130 19849, 20104, 20104, 20359, 20614, 20868, 21123, 21123, 21378, 21633, 21888, 21888,
131 22143, 22398, 22653, 22653, 22907, 23162, 23417, 23417, 23672, 23927, 23927, 24182,
132 24437, 24692, 24692, 24947, 25202, 25457, 25457, 25712, 25967, 25967, 26222, 26477,
133 26732, 26732, 26987, 27242 };
134
135// There are flag and rounding errors being generated in this file
136
137static const double Two911 = 0x1.0p-911; // HEXDOUBLE( 0x07000000, 0x00000000 );
138static const hexdouble infinity = HEXDOUBLE( 0x7ff00000, 0x00000000 );
139static const double twoTo128 = 0x1.0p+128; // 0.340282366920938463463e39;
140static const double twoToM128 = 0x1.0p-128; // 0.293873587705571876992e-38;
141
142double __sqrt ( double x );
143double __sqrt ( double x )
144{
145 register int index;
146 hexdouble xInHex, yInHex, gInHex;
147 register double OldEnvironment, g, y, y2, d, e;
148 register uint32_t xhead, ghead, yhead;
149
150 register double FPR_z, FPR_Two911, FPR_TwoM128, FPR_Two128, FPR_inf, FPR_HalfULP;
151
152 xInHex.d = x; FPR_z = 0.0;
153 FPR_inf = infinity.d; FPR_Two911 = Two911;
154 FPR_TwoM128 = twoToM128; FPR_Two128 = twoTo128;
155 gInHex.i.lo = 0u; yInHex.i.lo = 0u;
156 FPR_HalfULP = upHalfOfAnULP;
157
158 FEGETENVD( OldEnvironment ); // save environment, set default
159
160 __NOOP;
161 __ENSURE( FPR_TwoM128, FPR_Two128, FPR_z );
162
163 FESETENVD( FPR_z );
164 __ENSURE( FPR_HalfULP, FPR_inf, FPR_Two911 );
165
166/*******************************************************************************
167* Special case for GPUL: storing and loading floats avoids L/S hazard
168*******************************************************************************/
169 __NOOP;
170 if (likely( FPR_TwoM128 < x && x < FPR_Two128 )) // Can float hold initial guess?
171 {
172 hexsingle GInHex, YInHex;
173 register uint32_t GPR_t, GPR_foo;
174
175/*******************************************************************************
176* Calculate initial estimates for g and y from x and SqrtTable[]. *
177*******************************************************************************/
178
179 xhead = xInHex.i.hi; // high 32 bits of x
180 index = ( xhead >> 13 ) & 0xffu; // table index
181 GPR_t = SqrtTable[index];
182
183 // Form *single* precision exponent estimate from double argument:
184 // (((((xhead - bias1024) >> 1) + bias128) << 3) & 0x7f800000) =
185 // (((((xhead - bias1024 + bias128 + bias128) >> 1) << 3) & 0x7f800000))
186
187 ghead = ( ( xhead + 0x07f00000 + 0x07f00000 - 0x3ff00000 ) << 2 ) & 0x7f800000;
188 GInHex.lval = ghead + ( ( 0xff00u & GPR_t ) << 7 );
189
190 // Force GInHex.lval to memory. Then load it into g in the FPU register file
191 // on the *following* cycle. This avoids a Store/Load hazard.
192 asm volatile ( "add %0, %1, %2" : "=r" (GPR_foo) : "b" (&GInHex), "b" (&YInHex) : "memory" ); /* NOOP */
193 g = GInHex.fval;
194
195 yhead = 0x7e000000u - ghead;
196 YInHex.lval = yhead + ( ( 0xffu & GPR_t ) << 15 );
197
198 // Force YInHex.lval to memory. Then load it into y in the FPU register file
199 // on the *following* cycle. This avoids a Store/Load hazard.
200 asm volatile ( "add %0, %1, %2" : "=r" (GPR_foo) : "b" (&GInHex), "b" (&YInHex) : "memory" ); /* NOOP */
201 __NOOP;
202 __NOOP;
203 y = YInHex.fval;
204
205/*******************************************************************************
206* Iterate to refine both g and y. *
207*******************************************************************************/
208
209 d = __FNMSUB( g, g, x ); y2 = y + y;
210
211 g = __FMADD(y, d, g ); // 16-bit g
212
213 e = __FNMSUB( y, g, FPR_HalfULP ); d = __FNMSUB( g, g, x );
214
215 y = __FMADD( e, y2, y ); // 16-bit y
216
217 y2 = y + y; g = __FMADD(y, d, g); // 32-bit g
218
219 e = __FNMSUB( y, g, FPR_HalfULP ); d = __FNMSUB( g, g, x );
220
221 y = __FMADD( e, y2, y ); // 32-bit y
222
223 y2 = y + y; g = __FMADD(y, d, g ); // 64-bit g before rounding
224
225 e = __FNMSUB( y, g, FPR_HalfULP ); d = __FNMSUB( g, g, x );
226
227 y = __FMADD( e, y2, y ); // 64-bit y
228
229 FESETENVD( OldEnvironment ); // restore caller's environment
230
231 __NOOP;
232 return ( __FMADD( y, d, g ) ); // final step
233 }
234
235 if (likely( FPR_z < x && x < FPR_inf ))
236 {
237
238/*******************************************************************************
239* First and most common section: argument > 2.0^(-911), about 5.77662E-275.*
240*******************************************************************************/
241
242 if (likely( FPR_Two911 < x ))
243 {
244 register uint32_t GPR_t;
245
246/*******************************************************************************
247* Calculate initial estimates for g and y from x and SqrtTable[]. *
248*******************************************************************************/
249 xhead = xInHex.i.hi; // high 32 bits of x
250 index = ( xhead >> 13 ) & 0xffu; // table index
251 GPR_t = SqrtTable[index];
252
253 ghead = ( ( xhead + 0x3ff00000 ) >> 1 ) & 0x7ff00000;
254 yhead = 0x7fc00000u - ghead;
255
256 gInHex.i.hi = ghead + ( ( 0xff00u & GPR_t ) << 4 );
257 g = gInHex.d;
258
259 yInHex.i.hi = yhead + ( ( 0xffu & GPR_t ) << 12 );
260 y = yInHex.d;
261/*******************************************************************************
262* Iterate to refine both g and y. *
263*******************************************************************************/
264
265 d = __FNMSUB( g, g, x ); y2 = y + y;
266
267 g = __FMADD(y, d, g ); // 16-bit g
268
269 e = __FNMSUB( y, g, FPR_HalfULP ); d = __FNMSUB( g, g, x );
270
271 y = __FMADD( e, y2, y ); // 16-bit y
272
273 y2 = y + y; g = __FMADD(y, d, g); // 32-bit g
274
275 e = __FNMSUB( y, g, FPR_HalfULP ); d = __FNMSUB( g, g, x );
276
277 y = __FMADD( e, y2, y ); // 32-bit y
278
279 y2 = y + y; g = __FMADD(y, d, g ); // 64-bit g before rounding
280
281 e = __FNMSUB( y, g, FPR_HalfULP ); d = __FNMSUB( g, g, x );
282
283 y = __FMADD( e, y2, y ); // 64-bit y
284
285 FESETENVD( OldEnvironment ); // restore caller's environment
286 return ( __FMADD( y, d, g ) ); // final step
287 }
288
289/*******************************************************************************
290* Second section: 0.0 < argument < 2.0^(-911) which is about 5.77662E-275. *
291* Identical to the previous segment aside from 2^512 scale factor. *
292*******************************************************************************/
293 else
294 {
295 xInHex.d = x * twoTo512; // scale up by 2^512
296 __NOOP;
297 __NOOP;
298 __NOOP;
299 xhead = xInHex.i.hi;
300
301/*******************************************************************************
302* Calculate initial estimates for g and y from x and SqrtTable[]. *
303*******************************************************************************/
304
305 ghead = ( ( xhead + 0x3ff00000 ) >> 1) & 0x7ff00000;
306 index = ( xhead >> 13) & 0xffu; // table index
307 yhead = 0x7fc00000u - ghead;
308 gInHex.i.hi = ghead + ( ( 0xff00u & SqrtTable[index] ) << 4 );
309 yInHex.i.hi = yhead + ( ( 0xffu & SqrtTable[index] ) << 12 );
310 x = xInHex.d;
311 g = gInHex.d;
312 y = yInHex.d;
313
314/*******************************************************************************
315* Iterate to refine both g and y. *
316*******************************************************************************/
317
318 d = __FNMSUB( g, g, x ); y2 = y + y;
319
320 g = __FMADD(y, d, g ); // 16-bit g
321
322 e = __FNMSUB( y, g, FPR_HalfULP ); d = __FNMSUB( g, g, x );
323
324 y = __FMADD( e, y2, y ); // 16-bit y
325
326 y2 = y + y; g = __FMADD(y, d, g); // 32-bit g
327
328 e = __FNMSUB( y, g, FPR_HalfULP ); d = __FNMSUB( g, g, x );
329
330 y = __FMADD( e, y2, y ); // 32-bit y
331
332 y2 = y + y; g = __FMADD(y, d, g ); // 64-bit g before rounding
333
334 e = __FNMSUB( y, g, FPR_HalfULP ); d = __FNMSUB( g, g, x );
335
336 d *= twoToMinus256; g *= twoToMinus256; // undo scaling
337
338 y = __FMADD( e, y2, y ); // 64-bit y
339
340 FESETENVD( OldEnvironment ); // restore caller's environment
341 return ( __FMADD( y, d, g ) ); // final step
342 }
343 }
344
345/*******************************************************************************
346* Fourth section: special cases: argument is +INF, NaN, +/-0.0, or <0. *
347*******************************************************************************/
348
349 FESETENVD( OldEnvironment ); // restore caller's environment
350
351 if ( x < FPR_z && x == x ) // < 0.0 and not NAN
352 {
353 hexdouble env;
354 x = nan ( SQRT_NAN );
355 env.d = OldEnvironment;
356 __NOOP;
357 __NOOP;
358 __NOOP;
359
360 env.i.lo |= SET_INVALID;
361 FESETENVD_GRP( env.d ); // restore caller's environment
362 return ( x ); // return NAN
363 }
364 else return ( x ); // NANs and +/-0.0
365}
366
367/*
368By pushing the software sqrt implementation into its own routine, we avoid the PIC setup and simply have:
369_sqrt:
37000000340 lwz r0,0x8020(0)
37100000344 andis. r2,r0,0x2000
37200000348 bne 0x350
3730000034c b ___sqrt
37400000350 fsqrt f1,f1
37500000354 blr
376Perhaps gcc can someday be coaxed to eliminate the extra branch?
377Matt suggests "use -freorder-blocks", we'll pick that up for the next release.
378*/
379double sqrt(double x)
380{
381 if ((_get_cpu_capabilities() & kHasFsqrt))
382 return __fsqrt( x );
383 else return __sqrt( x );
384}
385
386