/* this repo has no description */
1/*
2 * xmm_fma.c
3 * xmmLibm
4 *
5 * Created by Ian Ollmann on 8/8/05.
6 * Copyright 2005 Apple Computer Inc. All rights reserved.
7 *
8 */
9
10
11#include "xmmLibm_prefix.h"
12
13#include <math.h>
14
15//For any rounding mode, we can calculate A + B exactly as a head to tail result as:
16//
17// Rhi = A + B (A has a larger magnitude than B)
18// Rlo = B - (Rhi - A);
19//
20// Rhi is rounded to the current rounding mode
21// Rlo represents the next 53+ bits of precision
22
23//returns carry bits that don't fit into A
24static inline long double extended_accum( long double *A, long double B ) ALWAYS_INLINE;
25static inline long double extended_accum( long double *A, long double B )
26{
27 // assume B >~ *A, B needs to be larger or in the same binade.
28 long double r = *A + B;
29 long double carry = *A - ( r - B );
30 *A = r;
31 return carry;
32}
33
// Set the LSB of a long double encoding. Turns +/- inf to nan, and +/- 0 to Dmin.
// Purpose: "jam" a sticky bit into an already-rounded head value so that the
// final long double -> double conversion still sees that nonzero bits were
// discarded below it.
static inline long double jam_stickyl(long double x) ALWAYS_INLINE;
static inline long double jam_stickyl(long double x) {
    union{
        long double ld;
        xUInt64 xu;      // the 80-bit encoding viewed as a 128-bit vector;
                         // low 64-bit lane holds the explicit significand
        struct{ uint64_t mantissa; int16_t sexp; } parts;
    } ux = {x}, uy, mask = {0.};
    mask.parts.mantissa = 0x1ULL;             // bit 0 of the significand only
    uy.xu = _mm_or_si128(ux.xu, mask.xu);     // OR the sticky bit into the encoding
    return uy.ld;
}
46
47
// Compute next down unless value is already odd.
// This is useful for reflecting the contribution of "negative sticky":
// when the discarded tail opposes the head's sign, stepping an even
// significand down one ulp lets the final rounding account for it.
static inline long double jam_nstickyl(long double x) ALWAYS_INLINE;
static inline long double jam_nstickyl(long double x) {
    union{
        long double ld;
        xUInt64 xu;      // 80-bit encoding as a vector: low lane = significand,
                         // high lane = sign/exponent halfword
        struct{ uint64_t mantissa; int16_t sexp; } parts;
    } ux = {x}, uy, mask = {0.}, exp_borrow = {0x1p-16382L};
    // exp_borrow.parts = {0x8000000000000000ULL, 0x0001}
    mask.parts.mantissa = 0x1ULL;
    const xUInt64 is_odd = _mm_and_si128(ux.xu, mask.xu);
    // Subtract 1 from the significand iff it is even: (is_odd XOR 1) is 1 for
    // even inputs and 0 for odd ones, so odd values pass through unchanged.
    xUInt64 new_sig = _mm_sub_epi64(ux.xu, _mm_xor_si128(is_odd, mask.xu));

    //Deal with exact powers of 2.
    // _mm_movemask_pd bit 0 samples bit 63 of the low lane, i.e. the explicit
    // integer bit of the 80-bit significand after the decrement.
    if(0x1 & _mm_movemask_pd((xDouble)new_sig)) {
        /* Integer bit still set: the fraction field was nonzero, so the
           decrement stayed within the significand and no exponent borrow
           is needed.
         */
        ;
    } else {
        /* The significand was exactly 0x8000000000000000 (a power of two):
           the decrement cleared the explicit integer bit, so we must borrow
           one from the exponent and restore a full significand.
           Subtracting the encoding of 0x1p-16382 ({0x8000000000000000, 0x0001})
           lane-wise turns the low lane into 0xFFFFFFFFFFFFFFFF (integer bit
           and all fraction bits set) and decrements the exponent halfword.
         */
        new_sig = _mm_sub_epi64(new_sig, exp_borrow.xu);
    }
    uy.xu = new_sig;

    return uy.ld;
}
80
/*
 * fma( a, b, c ) -- compute a*b + c with a single rounding.
 *
 * Strategy: after screening NaN/inf/zero, split a and b into 26-bit-high /
 * 27-bit-low halves, form the four partial products exactly in 80-bit
 * extended precision, accumulate them (plus c) into a head/tail pair
 * (a0, a1) of roughly 128 bits, then fold the tail into the head's LSB
 * ("sticky jamming") so the one conversion back to double rounds correctly
 * in the current rounding mode.
 */
double fma( double a, double b, double c )
{
    // Bit images of the operands for sign/exponent inspection.
    union{ double ld; uint64_t u;}ua = {a};
    union{ double ld; uint64_t u;}ub = {b};
    union{ double ld; uint64_t u;}uc = {c};

    int32_t sign = (ua.u ^ ub.u ^ uc.u)>>63; // 0 for true addition, 1 for true subtraction
    int32_t aexp = (ua.u>>52) & 0x7ff;       // biased exponent fields
    int32_t bexp = (ub.u>>52) & 0x7ff;
    int32_t cexp = (uc.u>>52) & 0x7ff;
    int32_t pexp = aexp + bexp - 0x3ff;      // biased exponent estimate of a*b
                                             // NOTE(review): underestimates when a or b is
                                             // denormal (exponent field 0 but value nonzero);
                                             // confirm the cexp+120 test below tolerates that.

    if((aexp == 0x7ff)||(bexp == 0x7ff)||(cexp == 0x7ff)) {
        //deal with NaN
        if( a != a ) return a;
        if( b != b ) return b;
        if( c != c ) return c;
        //Have at least one inf and no NaNs at this point.
        //Need to deal with invalid cases for INF - INF.
        //Also, should avoid inadvertent overflow for big*big - INF.
        if(cexp < 0x7ff) { // If c is finite, then the product is the result.
            return a*b; // One of a or b is INF, so this will produce correct exceptions exactly
        }
        // c is infinite. Replace any finite normal a or b by copysign(2.0, .):
        // only the sign and infiniteness of the product matter now, and this
        // prevents a spurious overflow in the multiply below.
        if((aexp < 0x7ff) && (0 < aexp)) {
            ua.u = (ua.u & 0x8000000000000000ULL) | 0x4000000000000000ULL; // copysign(2.0,a)
        }
        if((bexp < 0x7ff) && (0 < bexp)) {
            ub.u = (ub.u & 0x8000000000000000ULL) | 0x4000000000000000ULL; // copysign(2.0,b)
        }
        return (ua.ld*ub.ld)+c;
    }
    // Having ruled out INF and NaN, do a cheap test to see if any input might be zero
    if((aexp == 0x000)||(bexp == 0x000)||(cexp == 0x000)) {
        if(a==0.0 || b==0.0) {
            // If at least one input is +-0 we can use native arithmetic to compute a*b+c.
            // If a or b is zero, then we have a signed zero product that we add to c.
            // Note that in this case, c may also be zero, but the addition will produce the correctly signed zero.
            return ((a*b)+c);
        }
        if(c==0.0) {
            // If c is zero and the infinitely precise product is non-zero,
            // we do not add c.
            // The floating point multiply will produce the correct result including overflow and underflow.
            // Note that if we tried to do (a*b)+c for this case relying on c being zero
            // we would get the wrong sign for some underflow cases, e.g., tiny * -tiny + 0.0 should be -0.0.
            return (a*b);
        }
    }

    static const xUInt64 mask26 = { 0xFFFFFFFFFC000000ULL, 0 };
    double ahi, bhi;

    //break a and b into high and low components
    //The high components have 26 bits of precision
    //The low components have 27 bits of precision
    xDouble xa = DOUBLE_2_XDOUBLE(a);
    xDouble xb = DOUBLE_2_XDOUBLE(b);

    xa = _mm_and_pd( xa, (xDouble) mask26 );
    xb = _mm_and_pd( xb, (xDouble) mask26 );

    ahi = XDOUBLE_2_DOUBLE( xa );
    bhi = XDOUBLE_2_DOUBLE( xb );

    //double precision doesn't have enough precision for the next part,
    //so we abandon it and go to extended.
    //
    //The problem is that the intermediate multiplication product needs to be infinitely
    //precise. While we can fix the mantissa part of the infinite precision problem,
    //we can't deal with the case where the product is outside the range of the
    //representable double precision values. Extended precision allows us to solve
    //that problem, since all double values and squares of double values are normalized
    //numbers in extended precision
    long double Ahi = ahi;
    long double Bhi = bhi;
    long double Alo = (long double) a - Ahi;   // exact: a == Ahi + Alo
    long double Blo = (long double) b - Bhi;   // exact: b == Bhi + Blo
    long double C = c;

    //The result of A * B is now exactly:
    //
    //      Ahi*Bhi + Alo*Bhi + Ahi*Blo + Alo*Blo
    //
    // all of these intermediate terms have either 52 or 53 bits of precision and are exact
    long double AhiBhi = Ahi * Bhi; //52 bits
    long double AloBhi = Alo * Bhi; //53 bits
    long double AhiBlo = Ahi * Blo; //53 bits
    long double AloBlo = Alo * Blo; //54 bits

    //accumulate the results into two head/tail buffers. This is infinitely precise.
    //no effort is taken to make sure that a0 and a1 are actually head to tail
    long double a0, a1;
    a0 = AloBlo;
    a1 = extended_accum( &a0, (AhiBlo + AloBhi) );
    a1 += extended_accum( &a0, AhiBhi );

    //Add C. If C has greater magnitude than a0, we need to swap them
    //(extended_accum requires its second argument to be the larger one).
    if( __builtin_fabsl( C ) > __builtin_fabsl( a0 ) )
    {
        long double temp = C;
        C = a0;
        a0 = temp;
    }

    if(cexp + 120 < pexp) {
        // Check to see if c < ulp(a*b).
        // If it is then c only contributes as a sticky bit.

        /* Since we know that the product A*B fits in 106 bits
           and a0+a1 can accommodate 128 bits,
           we just need to add something of the correct sign and
           less than ULP(A*B) but not too small.
           We construct this by multiplying a0 by +/- 2^-120.
         */
        long double ulp_prod_est = a0 * (sign? -0x1p-120L: 0x1p-120L);

        a1 += ulp_prod_est;
    } else {

        //this will probably overflow, but we have 128 bits of precision here, which should mean we are covered.
        long double a2 = C;
        if(__builtin_fabsl(C) < __builtin_fabsl(a0)) {
            a2 = a0;
            a0 = C;
        }
        a1 += extended_accum( &a0, a2 );
    }

    //push overflow in a1 back into a0. This should give us the correctly rounded result:
    //after this, a1 is the rounded head and a0 the residual tail.
    a0 = extended_accum( &a1, a0 );
    // Jam a nonzero tail into the head: same sign -> set the sticky LSB,
    // opposite sign -> step an even head down one ulp, so the final
    // long double -> double rounding sees the discarded bits.
    if(a0 > 0.) a1 = (a1>0.)?jam_stickyl(a1):jam_nstickyl(a1);
    if(a0 < 0.) a1 = (a1<0.)?jam_stickyl(a1):jam_nstickyl(a1);
    return a1;
}
215
/*
 * fmal( a, b, c ) -- compute a*b + c with a single rounding for 80-bit
 * extended long double.
 *
 * NOTE(review): the core computation assumes round-to-nearest (stated in
 * the comment block below); directed rounding modes are not handled.
 */
long double fmal( long double a, long double b, long double c )
{
    /*****************
     Edge cases, from Ian's code.
     *****************/

    // View of the 80-bit encoding: 64-bit significand (with explicit integer
    // bit) plus a sign/exponent halfword.
    union{ long double ld; struct{ uint64_t mantissa; int16_t sexp; }parts; }ua = {a};
    union{ long double ld; struct{ uint64_t mantissa; int16_t sexp; }parts; }ub = {b};

    int16_t sign = (ua.parts.sexp ^ ub.parts.sexp) & 0x8000;  // sign of the product a*b
    int32_t aexp = (ua.parts.sexp & 0x7fff);
    int32_t bexp = (ub.parts.sexp & 0x7fff);
    int32_t exp = aexp + bexp - 16383;   // biased exponent estimate of a*b

    //deal with NaN
    if( a != a ) return a;
    if( b != b ) return b;
    if( c != c ) return c;

    // a = +-inf  |  b = +-inf
    if ((aexp == 0x7fff) || (bexp == 0x7fff)) { // We've already dealt with NaN, so this is only true
                                                // if one of a and b is an inf.
        if (( b == 0.0L ) || ( a == 0.0L)) return a*b; // Return NaN if a = +-inf, b = 0 (or a = 0, b = +-inf)
        if ( __builtin_fabsl(c) == __builtin_infl() ) {
            if ( sign & 0x8000 ) {  // product is -inf
                if ( c > 0 )
                    return c - __builtin_infl(); // Return NaN (invalid): -inf + +inf
                else
                    return c; // Return -inf: -inf + -inf
            } else {                // product is +inf
                if ( c < 0 )
                    return c + __builtin_infl(); // Return NaN (invalid): +inf + -inf
                else
                    return c; // Return +inf: +inf + +inf
            }
            // NOTE(review): the two returns below are unreachable -- both
            // branches above return unconditionally.
            if ( sign & 0x8000 ) return -__builtin_infl();
            else return __builtin_infl();
        }
    }

    // c = +-inf with a,b finite at this point: c is the result.
    // NOTE(review): uses __builtin_inf() (double) instead of __builtin_infl();
    // harmless, since it promotes to the same long double infinity.
    if ( __builtin_fabsl(c) == __builtin_inf() ) return c; // a,b at this point are finite, c is infinite.


    if( exp < -16382 - 63 ) //product is below even the sub-denormal range; only c survives
        return c;

    /*****************
     Computation of a*b + c. scanon, Jan 2007

     The whole game we're playing here only works in round-to-nearest.
     *****************/

    long double ahi, alo, bhi, blo, phi, plo, xhi, xlo, yhi, ylo, tmp;

    // split a,b into high and low parts (top 32 significand bits / the rest);
    // each split is exact: a == ahi + alo, b == bhi + blo.
    static const uint64_t split32_mask = 0xffffffff00000000ULL;
    ua.parts.mantissa = ua.parts.mantissa & split32_mask;
    ahi = ua.ld; alo = a - ahi;
    ub.parts.mantissa = ub.parts.mantissa & split32_mask;
    bhi = ub.ld; blo = b - bhi;

    // compute the product of a and b as a sum phi + plo. This is exact.
    phi = a * b;

    // In case of overflow, stop here and return phi. This will need to be changed
    // in order to have a fully C99 fmal.
    if (__builtin_fabsl(phi) == __builtin_infl()) {
        return phi; // We know that c != inf or nan, so phi is the correct result.
    }

    // plo = the exact multiplication error a*b - phi, assembled from the four
    // exact partial products (Dekker-style two-product).
    plo = (((ahi * bhi - phi) + ahi*blo) + alo*bhi) + alo*blo;

    // compute plo + c = xhi + xlo where (xhi,xlo) is head-tail.
    xhi = plo + c;
    if (__builtin_fabsl(plo) > __builtin_fabsl(c)) {
        tmp = xhi - plo;
        xlo = c - tmp;
    } else {
        tmp = xhi - c;
        xlo = plo - tmp;
    }

    // Special case: xlo == 0, hence return phi + xhi:
    if (xlo == 0.0L) return phi + xhi;

    // At this point we know that the meaningful bits of phi and xhi are entirely to the
    // left (larger) side of the meaningful bits of xlo, and that our result is
    // round(phi + xhi + xlo).
    yhi = phi + xhi;
    if (__builtin_fabsl(phi) > __builtin_fabsl(xhi)) {
        tmp = yhi - phi;
        ylo = xhi - tmp;
    } else {
        tmp = yhi - xhi;
        ylo = phi - tmp;
    }

    // Handle the special case that one of yhi or ylo is zero.
    // If yhi == 0, then ylo is also zero, so yhi + xlo = xlo is the appropriate result.
    // If ylo == 0, then yhi + xlo is the appropriate result.
    if ((yhi == 0.0L) || (ylo == 0.0L)) return yhi + xlo;

    // Now we have that (in terms of meaningful bits)
    // yhi is strictly bigger than ylo is strictly bigger than xlo.
    // Additionally, our desired result is round(yhi + ylo + xlo).

    // The only way for xlo to affect rounding (in round-to-nearest) is for ylo to
    // be exactly half an ulp of yhi. Test for the value of the mantissa of ylo;
    // this is not the same condition, but getting this wrong can't affect the rounding.
    union{ long double ld; struct{ uint64_t mantissa; int16_t sexp; }parts; } uylo = {ylo};
    if (uylo.parts.mantissa == 0x8000000000000000ULL) {
        // Perturb the half-ulp ylo by a quarter-ulp in the direction of xlo,
        // breaking the tie the way the full-precision sum would.
        return yhi + (ylo + copysignl(0.5L*ylo, xlo));
    }
    // In the other case, xlo has no effect on the final result, so just return yhi + ylo
    else {
        return yhi + ylo;
    }

    /*
     * NOTE(review): a large, unfinished, commented-out integer-based
     * reimplementation ("Code that Ian wrote to do this in integer, never
     * finished") previously sat here as dead text; it has been condensed to
     * this note -- consult revision history if it is ever needed.
     */
}
383