this repo has no description
at fixPythonPipStalling 124 lines 5.4 kB view raw
1// float fmaf(float a, float b, float c) 2// 3// The fmaf function returns a*b + c computed without intermediate rounding, and 4// sets the flags appropriately. IEEE-754 leaves undefined whether or not an 5// implementation should set the invalid flag for fmaf(0,inf,NaN). We choose 6// not to do so in this implementation. 7// 8// This implementation is intended to be fully IEEE-754 compliant. It is 9// designed to produce the correctly rounded result in all rounding modes, 10// and sets all flags appropriately. It is intended to be relatively easily 11// ported across architectures and compilers; the one issue to be aware of 12// is the use of a union for type-punning double and uint64_t. Some compilers 13// will produce incorrect codegen for this due to overambitious type-based 14// aliasing optimizations. Make sure to check the generated code with your 15// compiler. 16// 17// Stephen Canon, January 2010 18 19#include <stdint.h> 20#include "math.h" 21 22float fmaf(float a, float b, float c) { 23 24 // Detect if c is NaN to prevent fmaf(0, inf, NaN) from setting invalid. 25 // If a or b is NaN, falling through the default path generates the correct 26 // qNaN result. 27 28 if (isnan(c)) return c + c; 29 30 // Naïve computation of a*b + c in double. Rounding this back to single 31 // gives the correct result in all but one-in-a-billion cases. 32 33 double product = (double)a*b; 34 double resultRoundedToDouble = product + c; 35 36 // Interlude -- possible incorrect double-rounding scenarios when an 37 // infinitely precise result is rounded first to double, then to single: 38 // 39 // In round to nearest: 40 // -------------------- 41 // bits 0 - 23 bits 24 - 52 bits 53 - 42 // result .....0 1000...00 ...1... 43 // (double)result .....0 1000...00 44 // (float)(double)result .....0 45 // (float)result .....1 46 // 47 // bits 0 - 23 bits 24 - 52 bits 53 - 48 // result .....1 0111...11 1...1.. 49 // (double)result .....1 1000...00 50 // (float)(double)result .....0 51 // (float)result .....1 52 // 53 // In the directed rounding modes, double rounding does not occur. 54 // 55 // In order to protect against double rounding, we check if 56 // resultRoundedToDouble is an exact halfway value for rounding to single. 57 // If it isn't, then rounding a second time (to single) gives the correct 58 // result. 59 60 union {double d; uint64_t x;} bitPattern = {.d = resultRoundedToDouble}; 61 uint32_t roundingBits = (uint32_t)bitPattern.x & UINT32_C(0x1fffffff); 62 63 if (roundingBits != UINT32_C(0x10000000)) 64 return (float)resultRoundedToDouble; 65 66 // Here we know that we were in an exact halfway case, so we need to find 67 // the low part of the sum and massage its information into the low part 68 // of the double precision result. 69 // 70 // We calculate the low part of the sum as follows: let "big" be the larger 71 // (in magnitude) of "product" and "c", and let "small" be the smaller. 72 // We already know resultRoundedToDouble, big, and small in the following 73 // equation: 74 // 75 // resultRoundedToDouble + lowPart = big + small 76 // 77 // so we can compute "lowPart" as: 78 // 79 // lowPart = (big - resultRoundedToDouble) + small 80 // 81 // This cannot generate spurious NaN/invalid because if resultRoundedToDouble 82 // were inf, we wouldn't have matched as a halfway case, and wouldn't be on 83 // this codepath. 84 // 85 // Note also that "lowPart" will not be exact in some circumstances in 86 // directed rounding modes. This is OK, because we are only going to use 87 // it to twiddle the low bit of an exact-halfway case for round-to-nearest; 88 // if we are in a directed rounding mode, this does not effect the final 89 // result. It may be an inexact operation, but the inexact flag is already 90 // raised at this point if that is the case. 91 92 double lowPart; 93 if (__builtin_fabs(product) >= __builtin_fabs(c)) 94 lowPart = (product - resultRoundedToDouble) + c; 95 else 96 lowPart = (c - resultRoundedToDouble) + product; 97 98 // If the lowPart is non-zero, and has the same sign as the result, 99 // add one to the bit representation of the double-precision result 100 // to act as sticky. This cannot spuriously raise inexact, because 101 // inexact is already (correctly) raised if lowPart is non-zero. The 102 // product cannot over/underflow because the inputs originally came from 103 // single precision, so the exponents are well controlled. 104 105 if (lowPart*resultRoundedToDouble > 0.0) { 106 bitPattern.x += 1; 107 resultRoundedToDouble = bitPattern.d; 108 } 109 110 // If instead the signs are opposite, subtract one. 111 112 if (lowPart*resultRoundedToDouble < 0.0) { 113 bitPattern.x -= 1; 114 resultRoundedToDouble = bitPattern.d; 115 } 116 117 // If the low part is exactly zero, then this really is an exact halfway 118 // case for rounding to single, and no action is necessary. 119 120 // Now that we have massaged in a sticky bit for the low part, we can round 121 // to single-precision and get the correct final result. 122 123 return (float)resultRoundedToDouble; 124}