this repo has no description
at fixPythonPipStalling 383 lines 14 kB view raw
/*
 *  xmm_fma.c
 *  xmmLibm
 *
 *  Created by Ian Ollmann on 8/8/05.
 *  Copyright 2005 Apple Computer Inc. All rights reserved.
 *
 */


#include "xmmLibm_prefix.h"

#include <math.h>

//For any rounding mode, we can calculate A + B exactly as a head to tail result as:
//
//      Rhi = A + B   (A has a larger magnitude than B)
//      Rlo = B - (Rhi - A);
//
//      Rhi is rounded to the current rounding mode
//      Rlo represents the next 53+ bits of precision

//returns carry bits that don't fit into A
//(Dekker-style "fast two-sum": on return *A holds the rounded sum and the return
// value is the exact rounding error.  Exactness requires the magnitude ordering
// described in the body comment below.)
static inline long double extended_accum( long double *A, long double B ) ALWAYS_INLINE;
static inline long double extended_accum( long double *A, long double B )
{
    // assume B >~ *A, B needs to be larger or in the same binade.
    long double r = *A + B;
    long double carry = *A - ( r - B );
    *A = r;
    return carry;
}

// Set the LSB of a long double encoding.  Turns +/- inf to nan, and +/- 0 to Dmin.
// Used to force a "sticky" bit into a result whose low-order residue was nonzero
// but of the same sign, so the final rounding to double breaks ties correctly.
// NOTE: operates on the raw 80-bit encoding via an SSE register OR.
static inline long double jam_stickyl(long double x) ALWAYS_INLINE;
static inline long double jam_stickyl(long double x) {
    union{
        long double ld;
        xUInt64 xu;
        struct{ uint64_t mantissa; int16_t sexp; } parts;
    } ux = {x}, uy, mask = {0.};
    mask.parts.mantissa = 0x1ULL;
    uy.xu = _mm_or_si128(ux.xu, mask.xu);
    return uy.ld;
}


// Compute next down unless value is already odd.
// This is useful for reflecting the contribution of "negative sticky"
// (a residue whose sign opposes the head, so the head must be nudged toward zero).
static inline long double jam_nstickyl(long double x) ALWAYS_INLINE;
static inline long double jam_nstickyl(long double x) {
    union{
        long double ld;
        xUInt64 xu;
        struct{ uint64_t mantissa; int16_t sexp; } parts;
    } ux = {x}, uy, mask = {0.}, exp_borrow = {0x1p-16382L};
    // exp_borrow.parts = {0x8000000000000000ULL, 0x0001}
    mask.parts.mantissa = 0x1ULL;
    // is_odd: the significand LSB of x; XOR with mask yields 1 only when x is even,
    // so the subtraction below decrements even significands and leaves odd ones alone.
    const xUInt64 is_odd = _mm_and_si128(ux.xu, mask.xu);
    xUInt64 new_sig = _mm_sub_epi64(ux.xu, _mm_xor_si128(is_odd, mask.xu));

    //Deal with exact powers of 2.
    // _mm_movemask_pd bit 0 reflects bit 63 of the low quadword, i.e. the 80-bit
    // format's explicit integer bit after the subtraction.
    if(0x1 & _mm_movemask_pd((xDouble)new_sig)) {
        /* If the fraction was nonzero, i.e., the significand was not 0x8000000000000000,
           (recall that 80-bit long double has an explicit integer bit)
           we could safely subtract one without needing to borrow from the exponent.
        */
        ;
    } else {
        /* If we are here, we subtracted one from an all zero fraction.
           This means we needed to borrow one from the exponent.
           We also need to restore the explicit integer bit in the 80-bit format.
        */
        new_sig = _mm_sub_epi64(new_sig, exp_borrow.xu);
    }
    uy.xu = new_sig;

    return uy.ld;
}

// Correctly rounded fused multiply-add: round(a*b + c) with a single rounding.
// Strategy: split a and b into 26/27-bit halves so all partial products are exact
// in double, then accumulate everything in 80-bit extended precision head/tail
// pairs, finally jamming sticky bits so the last conversion to double rounds right.
double fma( double a, double b, double c )
{
    union{ double ld; uint64_t u;}ua = {a};
    union{ double ld; uint64_t u;}ub = {b};
    union{ double ld; uint64_t u;}uc = {c};

    int32_t sign = (ua.u ^ ub.u ^ uc.u)>>63;    // 0 for true addition, 1 for true subtraction
    int32_t aexp = (ua.u>>52) & 0x7ff;          // biased exponent fields
    int32_t bexp = (ub.u>>52) & 0x7ff;
    int32_t cexp = (uc.u>>52) & 0x7ff;
    int32_t pexp = aexp + bexp - 0x3ff;         // biased exponent estimate of the product a*b

    if((aexp == 0x7ff)||(bexp == 0x7ff)||(cexp == 0x7ff)) {
        //deal with NaN
        if( a != a ) return a;
        if( b != b ) return b;
        if( c != c ) return c;
        //Have at least one inf and no NaNs at this point.
        //Need to deal with invalid cases for INF - INF.
        //Also, should avoid inadvertent overflow for big*big - INF.
        if(cexp < 0x7ff) { // If c is finite, then the product is the result.
            return a*b;    // One of a or b is INF, so this will produce correct exceptions exactly
        }
        // c is infinite.  Replace any finite normal operand with +/-2.0 so the
        // product keeps its sign but cannot overflow before the addition with c;
        // inf + (-inf) then raises invalid exactly as required.
        if((aexp < 0x7ff) && (0 < aexp)) {
            ua.u = (ua.u & 0x8000000000000000ULL) | 0x4000000000000000ULL; // copysign(2.0,a)
        }
        if((bexp < 0x7ff) && (0 < bexp)) {
            ub.u = (ub.u & 0x8000000000000000ULL) | 0x4000000000000000ULL; // copysign(2.0,b)
        }
        return (ua.ld*ub.ld)+c;
    }
    // Having ruled out INF and NaN, do a cheap test to see if any input might be zero
    if((aexp == 0x000)||(bexp == 0x000)||(cexp == 0x000)) {
        if(a==0.0 || b==0.0) {
            // If at least one input is +-0 we can use native arithmetic to compute a*b+c.
            // If a or b is zero, then we have a signed zero product that we add to c.
            // Note that in this case, c may also be zero, but the addition will produce the correctly signed zero.
            return ((a*b)+c);
        }
        if(c==0.0) {
            // If c is zero and the infinitely precise product is non-zero,
            // we do not add c.
            // The floating point multiply will produce the correct result including overflow and underflow.
            // Note that if we tried to do (a*b)+c for this case relying on c being zero
            // we would get the wrong sign for some underflow cases, e.g., tiny * -tiny + 0.0 should be -0.0.
            return (a*b);
        }
    }

    static const xUInt64 mask26 = { 0xFFFFFFFFFC000000ULL, 0 };
    double ahi, bhi;

    //break a, b and c into high and low components
    //The high components have 26 bits of precision
    //The low components have 27 bits of precision
    xDouble xa = DOUBLE_2_XDOUBLE(a);
    xDouble xb = DOUBLE_2_XDOUBLE(b);

    xa = _mm_and_pd( xa, (xDouble) mask26 );
    xb = _mm_and_pd( xb, (xDouble) mask26 );

    ahi = XDOUBLE_2_DOUBLE( xa );
    bhi = XDOUBLE_2_DOUBLE( xb );

    //double precision doesn't have enough precision for the next part,
    //so we abandon it and go to extended.
    //
    //The problem is that the intermediate multiplication product needs to be infinitely
    //precise.  While we can fix the mantissa part of the infinite precision problem,
    //we can't deal with the case where the product is outside the range of the
    //representable double precision values.  Extended precision allows us to solve
    //that problem, since all double values and squares of double values are normalized
    //numbers in extended precision
    long double Ahi = ahi;
    long double Bhi = bhi;
    long double Alo = (long double) a - Ahi;
    long double Blo = (long double) b - Bhi;
    long double C = c;

    //The result of A * B is now exactly:
    //
    //      Ahi*Bhi + Alo*Bhi + Ahi*Blo + Alo*Blo
    //
    //all of these intermediate terms have either 52 or 53 bits of precision and are exact
    long double AhiBhi = Ahi * Bhi;     //52 bits
    long double AloBhi = Alo * Bhi;     //53 bits
    long double AhiBlo = Ahi * Blo;     //53 bits
    long double AloBlo = Alo * Blo;     //54 bits

    //accumulate the results into two head/tail buffers.  This is infinitely precise.
    //no effort is taken to make sure that a0 and a1 are actually head to tail
    long double a0, a1;
    a0 = AloBlo;
    a1 = extended_accum( &a0, (AhiBlo + AloBhi) );
    a1 += extended_accum( &a0, AhiBhi );

    //Add C. If C has greater magnitude than a0, we need to swap them
    if( __builtin_fabsl( C ) > __builtin_fabsl( a0 ) )
    {
        long double temp = C;
        C = a0;
        a0 = temp;
    }

    if(cexp + 120 < pexp) {
        // Check to see if c < ulp(a*b).
        // If it is then c only contributes as a sticky bit.

        /* Since we know that the product A*B fits in 106 bits
           and a0+a1 can accommodate 128 bits,
           we just need to add something of the correct sign and
           less than ULP(A*B) but not too small.
           We construct this by multiplying a0 by +/- 2^-120.
        */
        long double ulp_prod_est = a0 * (sign? -0x1p-120L: 0x1p-120L);

        a1 += ulp_prod_est;
    } else {

        //this will probably overflow, but we have 128 bits of precision here, which should mean we are covered.
        // (re-order so extended_accum's larger operand is passed as B)
        long double a2 = C;
        if(__builtin_fabsl(C) < __builtin_fabsl(a0)) {
            a2 = a0;
            a0 = C;
        }
        a1 += extended_accum( &a0, a2 );
    }

    //push overflow in a1 back into a0.  This should give us the correctly rounded result.
    //After this call a1 is the rounded extended-precision head and a0 is the residue;
    //if the residue is nonzero, jam a sticky bit into a1 (toward the residue's sign)
    //so the final implicit conversion to double rounds the tie correctly.
    a0 = extended_accum( &a1, a0 );
    if(a0 > 0.) a1 = (a1>0.)?jam_stickyl(a1):jam_nstickyl(a1);
    if(a0 < 0.) a1 = (a1<0.)?jam_stickyl(a1):jam_nstickyl(a1);
    return a1;
}

// Fused multiply-add for 80-bit long double: round(a*b + c) with one rounding.
// Edge cases are handled up front; the main path uses exact splitting plus
// head/tail (two-sum) accumulation.  Correct only in round-to-nearest (see below).
long double fmal( long double a, long double b, long double c )
{
    /*****************
     Edge cases, from Ian's code.
     *****************/

    union{ long double ld; struct{ uint64_t mantissa; int16_t sexp; }parts; }ua = {a};
    union{ long double ld; struct{ uint64_t mantissa; int16_t sexp; }parts; }ub = {b};

    int16_t sign = (ua.parts.sexp ^ ub.parts.sexp) & 0x8000;  // sign of the product a*b
    int32_t aexp = (ua.parts.sexp & 0x7fff);
    int32_t bexp = (ub.parts.sexp & 0x7fff);
    int32_t exp = aexp + bexp - 16383;                        // biased exponent estimate of a*b

    //deal with NaN
    if( a != a ) return a;
    if( b != b ) return b;
    if( c != c ) return c;

    // a = +-inf  |  b = +-inf
    if ((aexp == 0x7fff) || (bexp == 0x7fff)) {  // We've already dealt with NaN, so this is only true
                                                 // if one of a and b is an inf.
        if (( b == 0.0L ) || ( a == 0.0L)) return a*b;  // Return NaN if a = +-inf, b = 0, c != NaN (or a = 0, b = +-inf)
        if ( __builtin_fabsl(c) == __builtin_infl() ) {
            if ( sign & 0x8000 ) {
                if ( c > 0 )
                    return c - __builtin_infl();    // Return NaN if product = -inf, c = +inf
                else
                    return c;                       // Return -inf if product = c = -inf.
            } else {
                if ( c < 0 )
                    return c + __builtin_infl();    // Return NaN if product = +inf, c = -inf
                else
                    return c;                       // Return +inf if product = c = +inf.
            }
            // NOTE(review): both branches above return, so the two lines below are
            // unreachable.  They look like they were meant to sit after the
            // fabsl(c)==inf block (finite c, infinite product); as written, that
            // case instead falls through and is resolved by the phi overflow
            // check below.  Verify against original Apple sources before moving.
            if ( sign & 0x8000 ) return -__builtin_infl();
            else return __builtin_infl();
        }
    }

    // c = +-inf
    // NOTE(review): uses __builtin_inf() (double) rather than __builtin_infl();
    // the value promotes to long double inf so the comparison behaves the same.
    if ( __builtin_fabsl(c) == __builtin_inf() ) return c;  // a,b at this point are finite, c is infinite.


    if( exp < -16382 - 63 ) //sub denormal: product underflows below every representable value
        return c;

    /*****************
     Computation of a*b + c.  scanon, Jan 2007

     The whole game we're playing here only works in round-to-nearest.
     *****************/

    long double ahi, alo, bhi, blo, phi, plo, xhi, xlo, yhi, ylo, tmp;

    // split a,b into high and low parts (32 high bits of each significand).
    static const uint64_t split32_mask = 0xffffffff00000000ULL;
    ua.parts.mantissa = ua.parts.mantissa & split32_mask;
    ahi = ua.ld;  alo = a - ahi;
    ub.parts.mantissa = ub.parts.mantissa & split32_mask;
    bhi = ub.ld;  blo = b - bhi;

    // compute the product of a and b as a sum phi + plo.  This is exact.
    phi = a * b;

    // In case of overflow, stop here and return phi.  This will need to be changed
    // in order to have a fully C99 fmal.
    if (__builtin_fabsl(phi) == __builtin_infl()) {
        return phi;  // We know that c != inf or nan, so phi is the correct result.
    }

    plo = (((ahi * bhi - phi) + ahi*blo) + alo*bhi) + alo*blo;

    // compute plo + c = xhi + xlo where (xhi,xlo) is head-tail.
    xhi = plo + c;
    if (__builtin_fabsl(plo) > __builtin_fabsl(c)) {
        tmp = xhi - plo;
        xlo = c - tmp;
    } else {
        tmp = xhi - c;
        xlo = plo - tmp;
    }

    // Special case: xlo == 0, hence return phi + xhi:
    if (xlo == 0.0L) return phi + xhi;

    // At this point we know that the meaningful bits of phi and xhi are entirely to the
    // left (larger) side of the meaningful bits of xlo, and that our result is
    // round(phi + xhi + xlo).
    yhi = phi + xhi;
    if (__builtin_fabsl(phi) > __builtin_fabsl(xhi)) {
        tmp = yhi - phi;
        ylo = xhi - tmp;
    } else {
        tmp = yhi - xhi;
        ylo = phi - tmp;
    }

    // Handle the special case that one of yhi or ylo is zero.
    // If yhi == 0, then ylo is also zero, so yhi + xlo = xlo is the appropriate result.
    // If ylo == 0, then yhi + xlo is the appropriate result.
    if ((yhi == 0.0L) || (ylo == 0.0L)) return yhi + xlo;

    // Now we have that (in terms of meaningful bits)
    // yhi is strictly bigger than ylo is strictly bigger than xlo.
    // Additionally, our desired result is round(yhi + ylo + xlo).

    // The only way for xlo to affect rounding (in round-to-nearest) is for ylo to
    // be exactly half an ulp of yhi.  Test for the value of the mantissa of ylo;
    // this is not the same condition, but getting this wrong can't affect the rounding.
    union{ long double ld; struct{ uint64_t mantissa; int16_t sexp; }parts; } uylo = {ylo};
    if (uylo.parts.mantissa == 0x8000000000000000ULL) {
        // Nudge ylo off the exact halfway point in the direction of xlo before adding.
        return yhi + (ylo + copysignl(0.5L*ylo, xlo));
    }
    // In the other case, xlo has no effect on the final result, so just return yhi + ylo
    else {
        return yhi + ylo;
    }

    /*
     Code that Ian wrote to do this in integer, never finished.

     //mantissa product
     //                  hi(a.hi*b.hi)   lo(a.hi*b.hi)
     //                  hi(a.hi*b.lo)   lo(a.hi*b.lo)
     //                  hi(a.lo*b.hi)   lo(a.lo*b.hi)
     // + hi(a.lo*b.lo)  lo(a.lo*b.lo)
     // --------------------------------------------------------------

     uint32_t ahi = ua.parts.mantissa >> 32;
     uint32_t bhi = ub.parts.mantissa >> 32;
     uint32_t alo = ua.parts.mantissa & 0xFFFFFFFFULL;
     uint32_t blo = ub.parts.mantissa & 0xFFFFFFFFULL;

     uint64_t templl, temphl, templh, temphh;
     xUInt64 l, h, a;

     templl = (uint64_t) alo * (uint64_t) blo;
     temphl = (uint64_t) ahi * (uint64_t) blo;
     templh = (uint64_t) alo * (uint64_t) bhi;
     temphh = (uint64_t) ahi * (uint64_t) bhi;

     l = _mm_cvtsi32_si128( (int32_t) templl );  templl >>= 32;

     temphl += templl;
     temphl += templh & 0xFFFFFFFFULL;
     h = _mm_cvtsi32_si128( (int32_t) temphl);   temphl >>= 32;
     a = _mm_unpacklo_epi32( l, h );


     temphh += templh >> 32;
     temphh += temphl;

     a = _mm_cvtsi32_si128( (int32_t) temphh );  temphh >>= 32;
     h = _mm_cvtsi32_si128( (int32_t) temphh);
     a = _mm_unpacklo_epi32( a, h );
     l = _mm_unpacklo_epi64( l, a );

     union
     {
         xUInt64 v;
         uint64_t u[2];
     }z = { l };

     long double lo = (long double) temphh.
     */
}