this repo has no description
1// long double hypotl(long double x, long double y)
2// long double cabsl(long double complex z)
3//
4// Returns sqrt(x*x + y*y), computed without spurious overflow or underflow.
5// This implementation does not attempt to deliver sub-ulp accurate results.
6//
7// -- Stephen Canon, August 2009
8
9
10// Check that we are being built for Intel -----------------------------------
11
12#if !defined __i386__ && !defined __x86_64__
13#error This implementation is for i386 and x86_64 only
14#endif
15
16// Macros to unify x86_64 and i386 source ------------------------------------
17
18#ifdef __i386__
19 #define FIRSTARG 4(%esp)
20 #define SECONDARG 20(%esp)
21#else
22 #define FIRSTARG 8(%rsp)
23 #define SECONDARG 24(%rsp)
24#endif
25
26// Entry point ---------------------------------------------------------------
27
28.text
29.globl _hypotl
30.globl _cabsl
31.align 4
32_hypotl:
33_cabsl:
34
35// Check argument scaling ----------------------------------------------------
36//
37// Do a fast integer check on the exponent fields of both x and y. If the
38// biased exponent of either is outside the range [0x2000,0x5ffe], jump to
39// L_beCareful, where we will do more careful checks for special cases and
40// guard against undue overflow or underflow.
41//
42// The biased exponent range [0x2000,0x5ffe] corresponds to unbiased exponents
43// from -0x1fff to 0x1fff; numbers with exponents outside of this range will
44// underflow or overflow when they are squared, preventing use of the naive
45// algorithm.
46
47 movl 8+FIRSTARG, %ecx
48 andl $0x7fff, %ecx
49 movl %ecx, %eax
50 movl 8+SECONDARG, %edx
51 subl $0x2000, %ecx
52 andl $0x7fff, %edx
53 cmpl $0x3ffe, %ecx
54 ja L_beCareful
55
56 movl %edx, %ecx
57 subl $0x2000, %ecx
58 cmpl $0x3ffe, %ecx
59 ja L_beCareful
60
61// Naive computation ---------------------------------------------------------
62//
63// At this point we know that both x and y are squareable without risk of
64// overflow or underflow, so we do the naive computation:
65//
66// return sqrtl(x*x + y*y)
67
68 fldt FIRSTARG // x
69 fmul %st(0), %st(0) // x^2
70 fldt SECONDARG // y x^2
71 fmul %st(0), %st(0) // y^2 x^2
72 faddp // x^2+y^2
73 fsqrt // result
74 ret
75
76// Edge case returns ---------------------------------------------------------
77//
78// These are called from L_beCareful (below) to generate the correct return
79// value when one of the arguments is zero, infinity, or NaN. The column
80// of comments to the right contains the state of the x87 stack after the
81// instruction that is on the same line, or at the time of the jump in the
82// case of labels.
83//
84// hypot and cabs are a little bit special in that if one argument is infinity
85// and the other NaN, the C99 return value is infinity, not NaN. Put simply,
86// if either argument is infinity, we return infinity. Otherwise if either
87// argument is zero or NaN, we return |x|+|y|.
88
89L_yIsInfOrNaN: // inf |y| |x|
90 jp L_yIsNaN // inf |y| |x|
91 fstp %st(1) // inf |x|
92 fstp %st(1) // inf
93 ret
94L_xIsInfOrNaN: // |y| |x|
95 jp L_xIsNaN // |y| |x|
96 fstp %st(0) // |x|
97 ret
98L_yIsNaN: // inf |y| |x|
99 fucomip %st(2), %st(0) // |y| |x|
100 jz L_xIsInfOrNaN // |y| |x|
101L_xIsNaN: // |y| |x|
102 faddp // |y|+|x|
103 ret
104L_yIsZero: // 0.0 |y| |x|
105 fstp %st(0) // |y| |x|
106L_xIsZero: // |y| |x|
107 faddp // |y|+|x|
108 ret
109
110// Detect special cases ------------------------------------------------------
111//
112// Here we know that one or both arguments are either edge cases or are
113// outside the range of numbers that can be squared without overflow or
114// underflow. Begin by loading both arguments on the x87 stack, and checking
115// to see if either is zero, infinity, or NaN:
116
117#ifdef __i386__
118 #define PIC call 0f; 0: pop %ecx;
119 #define INFINITY L_infinity-0b(%ecx)
120 #define TINY L_tiny-0b(%ecx)
121#else
122 #define PIC // nothing
123 #define INFINITY L_infinity(%rip)
124 #define TINY L_tiny(%rip)
125#endif
126
127L_beCareful:
128 PIC // x87 stack
129 fldt FIRSTARG // x
130 fabs // |x|
131 fldt SECONDARG // y |x|
132 fabs // |y| |x|
133
134 fldt INFINITY // inf |y| |x|
135 fucomi %st(1), %st(0) // inf |y| |x|
136 jz L_yIsInfOrNaN // inf |y| |x|
137 fucomip %st(2), %st(0) // |y| |x|
138 jz L_xIsInfOrNaN // |y| |x|
139
140 fldz // 0.0 |y| |x|
141 fcomi %st(1), %st(0) // 0.0 |y| |x|
142 jz L_yIsZero // 0.0 |y| |x|
143 fcomip %st(2), %st(0) // |y| |x|
144 jz L_xIsZero // |y| |x|
145
146// Detect argument scaling ---------------------------------------------------
147
148 sub %edx, %eax
149 jl L_yIsLarger
150
151 // We want to have the argument with larger magnitude on the top of the x87
152 // stack; if x is larger in magnitude than y, we exchange them on the stack
153 fxch
154
155 // If exponent(x) - exponent(y) <= 32, we need to do a careful rescaling
156 // computation. Otherwise, we y only contributes rounding to the final
157 // result, so we can use a simpler approach.
158 cmp $32, %eax
159 jle L_scaledHypot
160
161// Scaled computation of |hi| + eps ------------------------------------------
162//
163// At this point, we know that x and y are vastly different in scale, and the
164// smaller of the two contributes only to the rounding of the last bit of the
165// larger, which we call "hi".
166//
167// Instead of doing a normal hypot computation, just add a small term to the
168// significand to force rounding.
169//
170// We do this addition in the significand space; begin by splitting hi into
171// hisig and hiexp that satisfy:
172//
173// hi = hisig * 2^hiexp and 1.0 <= hisig < 2.0
174//
175// Compute hisigRounded = hisig + tiny.
176//
177// Rescale: result = hisigRounded * 2^hiexp
178//
179// Note that if hi is denormal, this will cause a double rounding. We don't
180// protect against this because (a) we're not trying for sub-ulp accuracy,
181// and (b) long double denormal are exceptionally rare. If one wanted to do
182// the "really really right thing", a simple exponent check would allow one
183// to protect against this.
184
185L_forceRounding:
186 fstp %st(1) // pop the smaller argument
187 fxtract // extract hisig, hiexp
188 fldt TINY // load tiny
189 faddp // hisig + tiny
190 fscale // compute the final result and pop
191 fstp %st(1) // hiexp off the stack.
192 ret
193
194// y >~ x --------------------------------------------------------------------
195
196L_yIsLarger:
197 // If exponent(x) - exponent(y) < -32, we don't need to be careful, because
198 // x only affects the rounding of the last bit of |y|. Otherwise, do a
199 // careful scaled computation.
200 cmp $-32, %eax
201 jl L_forceRounding
202
203// Scaled computation of sqrt(x^2 + y^2) -------------------------------------
204//
205// At this point, we know that x and y are approximately equal in scale
206// (within 2^32 of eachother), that squaring them will result in overflow or
207// underflow, and that they are on the top of the floating-point stack -- in
208// particular, the larger of the two is in st(0), and the smaller is in st(1).
209// We'll call the larger one "hi" and the smaller one "lo".
210//
211// In order to compute sqrt(x^2 + y^2) without undue overflow or underflow,
212// begin by splitting hi and lo into exponents and significands that satisfy:
213//
214// hi = hisig * 2^hiexp and 1.0 <= hisig < 2.0
215// lo = losig * 2^loexp and 1.0 <= losig < 2.0
216//
217// Because of the checks we have already done on the exponents, we know that:
218//
219// -32 <= (loexp - hiexp) <= 0
220//
221// Let scaledlo be losig * 2^(loexp - hiexp), and note that the following hold:
222//
223// 0x1.0p-33 < scaledlo < 2.0 and lo = scaledlo * 2^(hiexp)
224//
225// Now compute a scaled result using hisig and scaledlo:
226//
227// scaledResult = sqrt(hisig^2 + scaledlo^2)
228//
229// Note that because of the bounds we have on hisig and scaledlo, this
230// computation cannot cause overflow or underflow. Next, apply the proper
231// scaling to get the final result, which may overflow, but such overflow
232// is justified:
233//
234// result = scaledResult * 2^hiexp
235
236L_scaledHypot:
237 fxtract // Extract hisig and hiexp.
238
239 fxch %st(2) // Move lo to the top of the stack and
240 fxtract // extract losig and loexp.
241
242 fxch // Move loexp to the top of the stack and
243 fsub %st(2), %st(0) // compute (loexp - hiexp).
244
245 fxch // Move losig to the top of the stack and
246 fscale // scale by (loexp - hiexp) to compute
247 fstp %st(1) // scaledlo.
248
249 fmul %st(0), %st(0) // scaledlo^2.
250
251 fxch %st(2) // move hisig to the top of the stack and
252 fmul %st(0), %st(0) // compute hisig^2.
253
254 faddp %st(0), %st(2) // compute hisig^2 + scaledlo^2, and move
255 fxch // it to the top of the stack. Then take
256 fsqrt // the square root to get scaledResult.
257
258 fscale // apply the scale hiexp to get the final
259 fstp %st(1) // result and return.
260 ret
261
262// Useful constants ----------------------------------------------------------
263
264.literal16
265.align 4
266L_infinity:
267.quad 0x8000000000000000, 0x0000000000007fff
268L_tiny:
269.quad 0x8000000000000000, 0x0000000000003f00