src/libm/Source/ARM/sqrt.c at fixPythonPipStalling

overby.me / darling-nix
fork atom
this repo has no description
fork atom
darling-nix / src / libm / Source / ARM / sqrt.c
at fixPythonPipStalling 158 lines 6.1 kB view raw
wrap content
Lubos Dolezel Restructured source tree to prepare for merge with the "darling" repo 10y ago
f228ae16
  1// double sqrt(double)
  2//
  3// Portable implementation of sqrt( ) with correct IEEE-754 default rounding.
  4//
  5// Assumes that integer and float have the same endianness on the target
  6// platform.
  7//
  8// Stephen Canon, July 2010
  9
 10#include <math.h>
 11
 12#if defined __SOFTFP__
 13#include <stdint.h>
 14#include <limits.h>
 15
 16typedef double fp_t;
 17typedef uint64_t rep_t;
 18static const int significandBits = 52;
 19#define REP_C UINT64_C
 20
 21static inline int rep_clz(rep_t a) {
 22    if (a & REP_C(0xffffffff00000000))
 23        return 32 + __builtin_clz(a >> 32);
 24    else 
 25        return __builtin_clz(a & REP_C(0xffffffff));
 26}
 27
 28static inline rep_t toRep(fp_t x) {
 29    const union { rep_t i; fp_t f; } rep = { .f = x };
 30    return rep.i;
 31}
 32
 33static inline fp_t fromRep(rep_t x) {
 34    const union { rep_t i; fp_t f; } rep = { .i = x };
 35    return rep.f;
 36}
 37
 38static inline uint32_t mulhi(uint32_t a, uint32_t b) {
 39    return (uint64_t)a*b >> 32;
 40}
 41
 42#define loWord(a) (a & 0xffffffffU)
 43#define hiWord(a) (a >> 32)
 44static inline uint64_t mulhi64(uint64_t a, uint64_t b) {
 45    // Each of the component 32x32 -> 64 products
 46    const uint64_t plolo = loWord(a) * loWord(b);
 47    const uint64_t plohi = loWord(a) * hiWord(b);
 48    const uint64_t philo = hiWord(a) * loWord(b);
 49    const uint64_t phihi = hiWord(a) * hiWord(b);
 50    // Sum terms that contribute to lo in a way that allows us to get the carry
 51    const uint64_t r0 = loWord(plolo);
 52    const uint64_t r1 = hiWord(plolo) + loWord(plohi) + loWord(philo);
 53    // Sum terms contributing to hi with the carry from lo
 54    return phihi + hiWord(plohi) + hiWord(philo) + hiWord(r1);
 55}
 56
 57fp_t sqrt(fp_t x) {
 58    
 59    // Various constants parametrized by the type of x:
 60    static const int typeWidth = sizeof(rep_t) * CHAR_BIT;
 61    static const int exponentBits = typeWidth - significandBits - 1;
 62    static const int exponentBias = (1 << (exponentBits - 1)) - 1;
 63    static const rep_t minNormal = REP_C(1) << significandBits;
 64    static const rep_t significandMask = minNormal - 1;
 65    static const rep_t signBit = REP_C(1) << (typeWidth - 1);
 66    static const rep_t absMask = signBit - 1;
 67    static const rep_t infRep = absMask ^ significandMask;
 68    static const rep_t qnan = infRep | REP_C(1) << (significandBits - 1);
 69    
 70    // Extract the various important bits of x
 71    const rep_t xRep = toRep(x);
 72    rep_t significand = xRep & significandMask;
 73    int exponent = (xRep >> significandBits) - exponentBias;
 74    
 75    // Using an unsigned integer compare, we can detect all of the special
 76    // cases with a single branch: zero, denormal, negative, infinity, or NaN.
 77    if (xRep - minNormal >= infRep - minNormal) {
 78        const rep_t xAbs = xRep & absMask;
 79        // sqrt(+/- 0) = +/- 0
 80        if (xAbs == 0) return x;
 81        // sqrt(NaN) = qNaN
 82        if (xAbs > infRep) return fromRep(qnan | xRep);
 83        // sqrt(negative) = qNaN
 84        if (xRep > signBit) return fromRep(qnan);
 85        // sqrt(infinity) = infinity
 86        if (xRep == infRep) return x;
 87        
 88        // normalize denormals and fall back into the mainline
 89        const int shift = rep_clz(significand) - rep_clz(minNormal);
 90        significand <<= shift;
 91        exponent += 1 - shift;
 92    }
 93    
 94    // Insert the implicit bit of the significand.  If x was denormal, then
 95    // this bit was already set by the normalization process, but it won't hurt
 96    // to set it twice.
 97    significand |= minNormal;
 98    
 99    // Halve the exponent to get the exponent of the result, and transform the
100    // significand into a Q30 fixed-point xQ30 in the range [1,4) -- if the
101    // exponent of x is odd, then xQ30 is in [2,4); if it is even, then xQ30
102    // is in [1,2).
103    const int resultExponent = exponent >> 1;
104    const uint64_t xQ62 = significand << (10 + (exponent & 1));
105    const uint32_t xQ30 = xQ62 >> 32;
106    
107    // Q32 linear approximation to the reciprocal square root of xQ30.  This
108    // approximation is good to a bit more than 3.5 bits:
109    //
110    //     1/sqrt(a) ~ 1.1033542890963095 - a/6
111    const uint32_t oneSixthQ34 = UINT32_C(0xaaaaaaaa);
112    uint32_t recipQ32 = UINT32_C(0x1a756d3b) - mulhi(oneSixthQ34, xQ30);
113    
114    // Newton-Raphson iterations to improve our reciprocal:
115    const uint32_t threeQ30 = UINT32_C(0xc0000000);
116    uint32_t residualQ30 = mulhi(xQ30, mulhi(recipQ32, recipQ32));
117    recipQ32 = mulhi(recipQ32, threeQ30 - residualQ30) << 1;
118    residualQ30 = mulhi(xQ30, mulhi(recipQ32, recipQ32));
119    recipQ32 = mulhi(recipQ32, threeQ30 - residualQ30) << 1;
120    residualQ30 = mulhi(xQ30, mulhi(recipQ32, recipQ32));
121    recipQ32 = mulhi(recipQ32, threeQ30 - residualQ30) << 1;
122    residualQ30 = mulhi(xQ30, mulhi(recipQ32, recipQ32));
123    recipQ32 = mulhi(recipQ32, threeQ30 - residualQ30) << 1;
124    
125    // We need to compute the final Newton-Raphson step with 64-bit words:
126    const uint64_t threeQ62 = UINT64_C(0xc000000000000000);
127    const uint64_t residualQ62 = mulhi64(xQ62,(uint64_t)recipQ32*recipQ32);
128    const uint64_t stepQ62 = threeQ62 - residualQ62;
129    const uint64_t recipQ63hi = recipQ32 * hiWord(stepQ62);
130    const uint64_t recipQ63lo = recipQ32 * loWord(stepQ62) >> 32;
131    const uint64_t recipQ64 = (recipQ63hi + recipQ63lo) << 1;
132    
133    // recipQ64 now holds an approximate 1/sqrt(x).  Multiply by x to get an
134    // initial sqrt(x) in Q52.  From the construction of this estimate, we know
135    // that it is either the correctly rounded significand of the result or one
136    // less than the correctly rounded significand (the -2 guarantees that we
137    // fall on the correct side of the actual square root).
138    rep_t result = (mulhi64(recipQ64, xQ62) - 2) >> 10;
139    
140    // Compute the residual x - result*result to decide if the result needs to
141    // be rounded up.
142    rep_t residual = (xQ62 << 42) - result*result;
143    result += residual > result;
144    
145    // Clear the implicit bit of result:
146    result &= significandMask;
147    // Insert the exponent:
148    result |= (rep_t)(resultExponent + exponentBias) << significandBits;
149    return fromRep(result);    
150}
151
152#else // __SOFTFP__
153
154double sqrt(double x) {
155	return __builtin_sqrt(x);
156}
157
158#endif // __SOFTFP__