// long double hypotl(long double x, long double y) // long double cabsl(long double complex z) // // Returns sqrt(x*x + y*y), computed without spurious overflow or underflow. // This implementation does not attempt to deliver sub-ulp accurate results. // // -- Stephen Canon, August 2009 // Check that we are being built for Intel ----------------------------------- #if !defined __i386__ && !defined __x86_64__ #error This implementation is for i386 and x86_64 only #endif // Macros to unify x86_64 and i386 source ------------------------------------ #ifdef __i386__ #define FIRSTARG 4(%esp) #define SECONDARG 20(%esp) #else #define FIRSTARG 8(%rsp) #define SECONDARG 24(%rsp) #endif // Entry point --------------------------------------------------------------- .text .globl _hypotl .globl _cabsl .align 4 _hypotl: _cabsl: // Check argument scaling ---------------------------------------------------- // // Do a fast integer check on the exponent fields of both x and y. If the // biased exponent of either is outside the range [0x2000,0x5ffe], jump to // L_beCareful, where we will do more careful checks for special cases and // guard against undue overflow or underflow. // // The biased exponent range [0x2000,0x5ffe] corresponds to unbiased exponents // from -0x1fff to 0x1fff; numbers with exponents outside of this range will // underflow or overflow when they are squared, preventing use of the naive // algorithm. movl 8+FIRSTARG, %ecx andl $0x7fff, %ecx movl %ecx, %eax movl 8+SECONDARG, %edx subl $0x2000, %ecx andl $0x7fff, %edx cmpl $0x3ffe, %ecx ja L_beCareful movl %edx, %ecx subl $0x2000, %ecx cmpl $0x3ffe, %ecx ja L_beCareful // Naive computation --------------------------------------------------------- // // At this point we know that both x and y are squareable without risk of // overflow or underflow, so we do the naive computation: // // return sqrtl(x*x + y*y) fldt FIRSTARG // x fmul %st(0), %st(0) // x^2 fldt SECONDARG // y x^2 fmul %st(0), %st(0) // y^2 x^2 faddp // x^2+y^2 fsqrt // result ret // Edge case returns --------------------------------------------------------- // // These are called from L_beCareful (below) to generate the correct return // value when one of the arguments is zero, infinity, or NaN. The column // of comments to the right contains the state of the x87 stack after the // instruction that is on the same line, or at the time of the jump in the // case of labels. // // hypot and cabs are a little bit special in that if one argument is infinity // and the other NaN, the C99 return value is infinity, not NaN. Put simply, // if either argument is infinity, we return infinity. Otherwise if either // argument is zero or NaN, we return |x|+|y|. L_yIsInfOrNaN: // inf |y| |x| jp L_yIsNaN // inf |y| |x| fstp %st(1) // inf |x| fstp %st(1) // inf ret L_xIsInfOrNaN: // |y| |x| jp L_xIsNaN // |y| |x| fstp %st(0) // |x| ret L_yIsNaN: // inf |y| |x| fucomip %st(2), %st(0) // |y| |x| jz L_xIsInfOrNaN // |y| |x| L_xIsNaN: // |y| |x| faddp // |y|+|x| ret L_yIsZero: // 0.0 |y| |x| fstp %st(0) // |y| |x| L_xIsZero: // |y| |x| faddp // |y|+|x| ret // Detect special cases ------------------------------------------------------ // // Here we know that one or both arguments are either edge cases or are // outside the range of numbers that can be squared without overflow or // underflow. Begin by loading both arguments on the x87 stack, and checking // to see if either is zero, infinity, or NaN: #ifdef __i386__ #define PIC call 0f; 0: pop %ecx; #define INFINITY L_infinity-0b(%ecx) #define TINY L_tiny-0b(%ecx) #else #define PIC // nothing #define INFINITY L_infinity(%rip) #define TINY L_tiny(%rip) #endif L_beCareful: PIC // x87 stack fldt FIRSTARG // x fabs // |x| fldt SECONDARG // y |x| fabs // |y| |x| fldt INFINITY // inf |y| |x| fucomi %st(1), %st(0) // inf |y| |x| jz L_yIsInfOrNaN // inf |y| |x| fucomip %st(2), %st(0) // |y| |x| jz L_xIsInfOrNaN // |y| |x| fldz // 0.0 |y| |x| fcomi %st(1), %st(0) // 0.0 |y| |x| jz L_yIsZero // 0.0 |y| |x| fcomip %st(2), %st(0) // |y| |x| jz L_xIsZero // |y| |x| // Detect argument scaling --------------------------------------------------- sub %edx, %eax jl L_yIsLarger // We want to have the argument with larger magnitude on the top of the x87 // stack; if x is larger in magnitude than y, we exchange them on the stack fxch // If exponent(x) - exponent(y) <= 32, we need to do a careful rescaling // computation. Otherwise, we y only contributes rounding to the final // result, so we can use a simpler approach. cmp $32, %eax jle L_scaledHypot // Scaled computation of |hi| + eps ------------------------------------------ // // At this point, we know that x and y are vastly different in scale, and the // smaller of the two contributes only to the rounding of the last bit of the // larger, which we call "hi". // // Instead of doing a normal hypot computation, just add a small term to the // significand to force rounding. // // We do this addition in the significand space; begin by splitting hi into // hisig and hiexp that satisfy: // // hi = hisig * 2^hiexp and 1.0 <= hisig < 2.0 // // Compute hisigRounded = hisig + tiny. // // Rescale: result = hisigRounded * 2^hiexp // // Note that if hi is denormal, this will cause a double rounding. We don't // protect against this because (a) we're not trying for sub-ulp accuracy, // and (b) long double denormal are exceptionally rare. If one wanted to do // the "really really right thing", a simple exponent check would allow one // to protect against this. L_forceRounding: fstp %st(1) // pop the smaller argument fxtract // extract hisig, hiexp fldt TINY // load tiny faddp // hisig + tiny fscale // compute the final result and pop fstp %st(1) // hiexp off the stack. ret // y >~ x -------------------------------------------------------------------- L_yIsLarger: // If exponent(x) - exponent(y) < -32, we don't need to be careful, because // x only affects the rounding of the last bit of |y|. Otherwise, do a // careful scaled computation. cmp $-32, %eax jl L_forceRounding // Scaled computation of sqrt(x^2 + y^2) ------------------------------------- // // At this point, we know that x and y are approximately equal in scale // (within 2^32 of eachother), that squaring them will result in overflow or // underflow, and that they are on the top of the floating-point stack -- in // particular, the larger of the two is in st(0), and the smaller is in st(1). // We'll call the larger one "hi" and the smaller one "lo". // // In order to compute sqrt(x^2 + y^2) without undue overflow or underflow, // begin by splitting hi and lo into exponents and significands that satisfy: // // hi = hisig * 2^hiexp and 1.0 <= hisig < 2.0 // lo = losig * 2^loexp and 1.0 <= losig < 2.0 // // Because of the checks we have already done on the exponents, we know that: // // -32 <= (loexp - hiexp) <= 0 // // Let scaledlo be losig * 2^(loexp - hiexp), and note that the following hold: // // 0x1.0p-33 < scaledlo < 2.0 and lo = scaledlo * 2^(hiexp) // // Now compute a scaled result using hisig and scaledlo: // // scaledResult = sqrt(hisig^2 + scaledlo^2) // // Note that because of the bounds we have on hisig and scaledlo, this // computation cannot cause overflow or underflow. Next, apply the proper // scaling to get the final result, which may overflow, but such overflow // is justified: // // result = scaledResult * 2^hiexp L_scaledHypot: fxtract // Extract hisig and hiexp. fxch %st(2) // Move lo to the top of the stack and fxtract // extract losig and loexp. fxch // Move loexp to the top of the stack and fsub %st(2), %st(0) // compute (loexp - hiexp). fxch // Move losig to the top of the stack and fscale // scale by (loexp - hiexp) to compute fstp %st(1) // scaledlo. fmul %st(0), %st(0) // scaledlo^2. fxch %st(2) // move hisig to the top of the stack and fmul %st(0), %st(0) // compute hisig^2. faddp %st(0), %st(2) // compute hisig^2 + scaledlo^2, and move fxch // it to the top of the stack. Then take fsqrt // the square root to get scaledResult. fscale // apply the scale hiexp to get the final fstp %st(1) // result and return. ret // Useful constants ---------------------------------------------------------- .literal16 .align 4 L_infinity: .quad 0x8000000000000000, 0x0000000000007fff L_tiny: .quad 0x8000000000000000, 0x0000000000003f00