src/libm/Source/Intel/hypotl.S at fixPythonPipStalling

overby.me / darling-nix
fork atom
this repo has no description
fork atom
darling-nix / src / libm / Source / Intel / hypotl.S
at fixPythonPipStalling 269 lines 11 kB view raw
wrap content
Lubos Dolezel More progress 9y ago
a76db625
  1//  long double hypotl(long double x, long double y)
  2//  long double cabsl(long double complex z)
  3//
  4//  Returns sqrt(x*x + y*y), computed without spurious overflow or underflow.
  5//  This implementation does not attempt to deliver sub-ulp accurate results.
  6//
  7//  -- Stephen Canon, August 2009
  8
  9
 10//  Check that we are being built for Intel -----------------------------------
 11
 12#if !defined __i386__ && !defined __x86_64__
 13#error This implementation is for i386 and x86_64 only
 14#endif
 15
 16//  Macros to unify x86_64 and i386 source ------------------------------------
 17
 18#ifdef __i386__
 19    #define FIRSTARG     4(%esp)
 20    #define SECONDARG   20(%esp)
 21#else
 22    #define FIRSTARG     8(%rsp)
 23    #define SECONDARG   24(%rsp)
 24#endif
 25
 26//  Entry point ---------------------------------------------------------------
 27
 28.text
 29.globl _hypotl
 30.globl _cabsl
 31.align 4
 32_hypotl:
 33_cabsl:
 34    
 35//  Check argument scaling ----------------------------------------------------
 36//    
 37//  Do a fast integer check on the exponent fields of both x and y.  If the
 38//  biased exponent of either is outside the range [0x2000,0x5ffe], jump to
 39//  L_beCareful, where we will do more careful checks for special cases and
 40//  guard against undue overflow or underflow.
 41//
 42//  The biased exponent range [0x2000,0x5ffe] corresponds to unbiased exponents
 43//  from -0x1fff to 0x1fff; numbers with exponents outside of this range will
 44//  underflow or overflow when they are squared, preventing use of the naive
 45//  algorithm.
 46
 47    movl      8+FIRSTARG,       %ecx
 48    andl        $0x7fff,        %ecx
 49    movl        %ecx,           %eax
 50    movl      8+SECONDARG,      %edx  
 51    subl        $0x2000,        %ecx
 52    andl        $0x7fff,        %edx
 53    cmpl        $0x3ffe,        %ecx  
 54    ja          L_beCareful
 55    
 56    movl        %edx,           %ecx
 57    subl        $0x2000,        %ecx
 58    cmpl        $0x3ffe,        %ecx
 59    ja          L_beCareful
 60    
 61//  Naive computation ---------------------------------------------------------
 62//
 63//  At this point we know that both x and y are squareable without risk of
 64//  overflow or underflow, so we do the naive computation:
 65//
 66//    return sqrtl(x*x + y*y)
 67                                        
 68    fldt        FIRSTARG                //  x
 69    fmul        %st(0),         %st(0)  // x^2
 70    fldt        SECONDARG               //  y  x^2
 71    fmul        %st(0),         %st(0)  // y^2 x^2
 72    faddp                               // x^2+y^2
 73    fsqrt                               // result
 74    ret
 75    
 76//  Edge case returns ---------------------------------------------------------
 77//
 78//  These are called from L_beCareful (below) to generate the correct return
 79//  value when one of the arguments is zero, infinity, or NaN.  The column
 80//  of comments to the right contains the state of the x87 stack after the
 81//  instruction that is on the same line, or at the time of the jump in the
 82//  case of labels.
 83//
 84//  hypot and cabs are a little bit special in that if one argument is infinity
 85//  and the other NaN, the C99 return value is infinity, not NaN.  Put simply,
 86//  if either argument is infinity, we return infinity.  Otherwise if either
 87//  argument is zero or NaN, we return |x|+|y|.
 88
 89L_yIsInfOrNaN:                          // inf |y| |x|
 90    jp          L_yIsNaN                // inf |y| |x|
 91    fstp        %st(1)                  // inf |x|
 92    fstp        %st(1)                  // inf
 93    ret
 94L_xIsInfOrNaN:                          // |y| |x|
 95    jp          L_xIsNaN                // |y| |x|
 96    fstp        %st(0)                  // |x|
 97    ret
 98L_yIsNaN:                               // inf |y| |x|
 99    fucomip     %st(2),         %st(0)  // |y| |x|
100    jz          L_xIsInfOrNaN           // |y| |x|
101L_xIsNaN:                               // |y| |x|
102    faddp                               // |y|+|x|
103    ret
104L_yIsZero:                              // 0.0 |y| |x|
105    fstp        %st(0)                  // |y| |x|
106L_xIsZero:                              // |y| |x|
107    faddp                               // |y|+|x|
108    ret
109
110//  Detect special cases ------------------------------------------------------
111//
112//  Here we know that one or both arguments are either edge cases or are
113//  outside the range of numbers that can be squared without overflow or
114//  underflow.  Begin by loading both arguments on the x87 stack, and checking
115//  to see if either is zero, infinity, or NaN:
116
117#ifdef __i386__
118    #define PIC         call 0f; 0: pop %ecx;
119    #define INFINITY    L_infinity-0b(%ecx)
120    #define TINY        L_tiny-0b(%ecx)
121#else
122    #define PIC         // nothing
123    #define INFINITY    L_infinity(%rip)
124    #define TINY        L_tiny(%rip)
125#endif
126
127L_beCareful:
128    PIC                                 // x87 stack
129    fldt        FIRSTARG                //  x
130    fabs                                // |x|
131    fldt        SECONDARG               //  y  |x|
132    fabs                                // |y| |x|
133    
134    fldt        INFINITY                // inf |y| |x|
135    fucomi      %st(1),         %st(0)  // inf |y| |x|
136    jz          L_yIsInfOrNaN           // inf |y| |x|
137    fucomip     %st(2),         %st(0)  // |y| |x|
138    jz          L_xIsInfOrNaN           // |y| |x|
139    
140    fldz                                // 0.0 |y| |x|
141    fcomi       %st(1),         %st(0)  // 0.0 |y| |x|
142    jz          L_yIsZero               // 0.0 |y| |x|
143    fcomip      %st(2),         %st(0)  // |y| |x|
144    jz          L_xIsZero               // |y| |x|
145       
146//  Detect argument scaling ---------------------------------------------------
147
148    sub         %edx,           %eax
149    jl          L_yIsLarger
150    
151    // We want to have the argument with larger magnitude on the top of the x87
152    // stack; if x is larger in magnitude than y, we exchange them on the stack
153    fxch
154    
155    // If exponent(x) - exponent(y) <= 32, we need to do a careful rescaling
156    // computation.  Otherwise, we y only contributes rounding to the final
157    // result, so we can use a simpler approach.
158    cmp         $32,            %eax
159    jle          L_scaledHypot
160    
161//  Scaled computation of |hi| + eps ------------------------------------------
162//
163//  At this point, we know that x and y are vastly different in scale, and the
164//  smaller of the two contributes only to the rounding of the last bit of the
165//  larger, which we call "hi".
166//
167//  Instead of doing a normal hypot computation, just add a small term to the
168//  significand to force rounding.
169//
170//  We do this addition in the significand space; begin by splitting hi into
171//  hisig and hiexp that satisfy:
172//
173//          hi = hisig * 2^hiexp   and   1.0 <= hisig < 2.0
174//
175//  Compute hisigRounded = hisig + tiny.
176//
177//  Rescale: result = hisigRounded * 2^hiexp
178//
179//  Note that if hi is denormal, this will cause a double rounding.  We don't
180//  protect against this because (a) we're not trying for sub-ulp accuracy,
181//  and (b) long double denormal are exceptionally rare.  If one wanted to do
182//  the "really really right thing", a simple exponent check would allow one
183//  to protect against this.
184
185L_forceRounding:
186    fstp        %st(1)                  // pop the smaller argument
187    fxtract                             // extract hisig, hiexp
188    fldt        TINY                    // load tiny
189    faddp                               // hisig + tiny
190    fscale                              // compute the final result and pop
191    fstp        %st(1)                  // hiexp off the stack.
192    ret
193          
194//  y >~ x --------------------------------------------------------------------
195
196L_yIsLarger:
197    // If exponent(x) - exponent(y) < -32, we don't need to be careful, because
198    // x only affects the rounding of the last bit of |y|.  Otherwise, do a
199    // careful scaled computation.
200    cmp         $-32,           %eax
201    jl          L_forceRounding
202    
203//  Scaled computation of sqrt(x^2 + y^2) -------------------------------------
204//
205//  At this point, we know that x and y are approximately equal in scale
206//  (within 2^32 of eachother), that squaring them will result in overflow or
207//  underflow, and that they are on the top of the floating-point stack -- in
208//  particular, the larger of the two is in st(0), and the smaller is in st(1).
209//  We'll call the larger one "hi" and the smaller one "lo".
210//
211//  In order to compute sqrt(x^2 + y^2) without undue overflow or underflow,
212//  begin by splitting hi and lo into exponents and significands that satisfy:
213//
214//          hi = hisig * 2^hiexp   and   1.0 <= hisig < 2.0
215//          lo = losig * 2^loexp   and   1.0 <= losig < 2.0
216//
217//  Because of the checks we have already done on the exponents, we know that:
218// 
219//          -32 <= (loexp - hiexp) <= 0
220//
221//  Let scaledlo be losig * 2^(loexp - hiexp), and note that the following hold:
222//
223//          0x1.0p-33 < scaledlo < 2.0   and   lo = scaledlo * 2^(hiexp)
224//          
225//  Now compute a scaled result using hisig and scaledlo:
226//
227//          scaledResult = sqrt(hisig^2 + scaledlo^2)
228//
229//  Note that because of the bounds we have on hisig and scaledlo, this
230//  computation cannot cause overflow or underflow.  Next, apply the proper
231//  scaling to get the final result, which may overflow, but such overflow
232//  is justified:
233//
234//          result = scaledResult * 2^hiexp
235
236L_scaledHypot:
237    fxtract                         // Extract hisig and hiexp.
238    
239    fxch        %st(2)              // Move lo to the top of the stack and
240    fxtract                         // extract losig and loexp.
241    
242    fxch                            // Move loexp to the top of the stack and
243    fsub        %st(2),     %st(0)  // compute (loexp - hiexp).
244    
245    fxch                            // Move losig to the top of the stack and
246    fscale                          // scale by (loexp - hiexp) to compute
247    fstp        %st(1)              // scaledlo.
248    
249    fmul        %st(0),     %st(0)  // scaledlo^2.
250    
251    fxch        %st(2)              // move hisig to the top of the stack and
252    fmul        %st(0),     %st(0)  // compute hisig^2.
253    
254    faddp       %st(0),     %st(2)  // compute hisig^2 + scaledlo^2, and move
255    fxch                            // it to the top of the stack.  Then take
256    fsqrt                           // the square root to get scaledResult.
257    
258    fscale                          // apply the scale hiexp to get the final
259    fstp        %st(1)              // result and return.
260    ret
261
262//  Useful constants ----------------------------------------------------------
263
264.literal16
265.align 4
266L_infinity:
267.quad 0x8000000000000000, 0x0000000000007fff
268L_tiny:
269.quad 0x8000000000000000, 0x0000000000003f00