this repo has no description
1// float fmaf(float a, float b, float c)
2//
3// The fmaf function returns a*b + c computed without intermediate rounding, and
4// sets the flags appropriately. IEEE-754 leaves undefined whether or not an
5// implementation should set the invalid flag for fmaf(0,inf,NaN). We choose
6// not to do so in this implementation.
7//
8// This implementation is intended to be fully IEEE-754 compliant. It is
9// designed to produce the correctly rounded result in all rounding modes,
10// and sets all flags appropriately. It is intended to be relatively easily
11// ported across architectures and compilers; the one issue to be aware of
12// is the use of a union for type-punning double and uint64_t. Some compilers
13// will produce incorrect codegen for this due to overambitious type-based
14// aliasing optimizations. Make sure to check the generated code with your
15// compiler.
16//
17// Stephen Canon, January 2010
18
19#include <stdint.h>
20#include "math.h"
21
22float fmaf(float a, float b, float c) {
23
24 // Detect if c is NaN to prevent fmaf(0, inf, NaN) from setting invalid.
25 // If a or b is NaN, falling through the default path generates the correct
26 // qNaN result.
27
28 if (isnan(c)) return c + c;
29
30 // Naïve computation of a*b + c in double. Rounding this back to single
31 // gives the correct result in all but one-in-a-billion cases.
32
33 double product = (double)a*b;
34 double resultRoundedToDouble = product + c;
35
36 // Interlude -- possible incorrect double-rounding scenarios when an
37 // infinitely precise result is rounded first to double, then to single:
38 //
39 // In round to nearest:
40 // --------------------
41 // bits 0 - 23 bits 24 - 52 bits 53 -
42 // result .....0 1000...00 ...1...
43 // (double)result .....0 1000...00
44 // (float)(double)result .....0
45 // (float)result .....1
46 //
47 // bits 0 - 23 bits 24 - 52 bits 53 -
48 // result .....1 0111...11 1...1..
49 // (double)result .....1 1000...00
50 // (float)(double)result .....0
51 // (float)result .....1
52 //
53 // In the directed rounding modes, double rounding does not occur.
54 //
55 // In order to protect against double rounding, we check if
56 // resultRoundedToDouble is an exact halfway value for rounding to single.
57 // If it isn't, then rounding a second time (to single) gives the correct
58 // result.
59
60 union {double d; uint64_t x;} bitPattern = {.d = resultRoundedToDouble};
61 uint32_t roundingBits = (uint32_t)bitPattern.x & UINT32_C(0x1fffffff);
62
63 if (roundingBits != UINT32_C(0x10000000))
64 return (float)resultRoundedToDouble;
65
66 // Here we know that we were in an exact halfway case, so we need to find
67 // the low part of the sum and massage its information into the low part
68 // of the double precision result.
69 //
70 // We calculate the low part of the sum as follows: let "big" be the larger
71 // (in magnitude) of "product" and "c", and let "small" be the smaller.
72 // We already know resultRoundedToDouble, big, and small in the following
73 // equation:
74 //
75 // resultRoundedToDouble + lowPart = big + small
76 //
77 // so we can compute "lowPart" as:
78 //
79 // lowPart = (big - resultRoundedToDouble) + small
80 //
81 // This cannot generate spurious NaN/invalid because if resultRoundedToDouble
82 // were inf, we wouldn't have matched as a halfway case, and wouldn't be on
83 // this codepath.
84 //
85 // Note also that "lowPart" will not be exact in some circumstances in
86 // directed rounding modes. This is OK, because we are only going to use
87 // it to twiddle the low bit of an exact-halfway case for round-to-nearest;
88 // if we are in a directed rounding mode, this does not effect the final
89 // result. It may be an inexact operation, but the inexact flag is already
90 // raised at this point if that is the case.
91
92 double lowPart;
93 if (__builtin_fabs(product) >= __builtin_fabs(c))
94 lowPart = (product - resultRoundedToDouble) + c;
95 else
96 lowPart = (c - resultRoundedToDouble) + product;
97
98 // If the lowPart is non-zero, and has the same sign as the result,
99 // add one to the bit representation of the double-precision result
100 // to act as sticky. This cannot spuriously raise inexact, because
101 // inexact is already (correctly) raised if lowPart is non-zero. The
102 // product cannot over/underflow because the inputs originally came from
103 // single precision, so the exponents are well controlled.
104
105 if (lowPart*resultRoundedToDouble > 0.0) {
106 bitPattern.x += 1;
107 resultRoundedToDouble = bitPattern.d;
108 }
109
110 // If instead the signs are opposite, subtract one.
111
112 if (lowPart*resultRoundedToDouble < 0.0) {
113 bitPattern.x -= 1;
114 resultRoundedToDouble = bitPattern.d;
115 }
116
117 // If the low part is exactly zero, then this really is an exact halfway
118 // case for rounding to single, and no action is necessary.
119
120 // Now that we have massaged in a sticky bit for the low part, we can round
121 // to single-precision and get the correct final result.
122
123 return (float)resultRoundedToDouble;
124}