this repo has no description
at fixPythonPipStalling 269 lines 11 kB view raw
1// long double hypotl(long double x, long double y) 2// long double cabsl(long double complex z) 3// 4// Returns sqrt(x*x + y*y), computed without spurious overflow or underflow. 5// This implementation does not attempt to deliver sub-ulp accurate results. 6// 7// -- Stephen Canon, August 2009 8 9 10// Check that we are being built for Intel ----------------------------------- 11 12#if !defined __i386__ && !defined __x86_64__ 13#error This implementation is for i386 and x86_64 only 14#endif 15 16// Macros to unify x86_64 and i386 source ------------------------------------ 17 18#ifdef __i386__ 19 #define FIRSTARG 4(%esp) 20 #define SECONDARG 20(%esp) 21#else 22 #define FIRSTARG 8(%rsp) 23 #define SECONDARG 24(%rsp) 24#endif 25 26// Entry point --------------------------------------------------------------- 27 28.text 29.globl _hypotl 30.globl _cabsl 31.align 4 32_hypotl: 33_cabsl: 34 35// Check argument scaling ---------------------------------------------------- 36// 37// Do a fast integer check on the exponent fields of both x and y. If the 38// biased exponent of either is outside the range [0x2000,0x5ffe], jump to 39// L_beCareful, where we will do more careful checks for special cases and 40// guard against undue overflow or underflow. 41// 42// The biased exponent range [0x2000,0x5ffe] corresponds to unbiased exponents 43// from -0x1fff to 0x1fff; numbers with exponents outside of this range will 44// underflow or overflow when they are squared, preventing use of the naive 45// algorithm. 46 47 movl 8+FIRSTARG, %ecx 48 andl $0x7fff, %ecx 49 movl %ecx, %eax 50 movl 8+SECONDARG, %edx 51 subl $0x2000, %ecx 52 andl $0x7fff, %edx 53 cmpl $0x3ffe, %ecx 54 ja L_beCareful 55 56 movl %edx, %ecx 57 subl $0x2000, %ecx 58 cmpl $0x3ffe, %ecx 59 ja L_beCareful 60 61// Naive computation --------------------------------------------------------- 62// 63// At this point we know that both x and y are squareable without risk of 64// overflow or underflow, so we do the naive computation: 65// 66// return sqrtl(x*x + y*y) 67 68 fldt FIRSTARG // x 69 fmul %st(0), %st(0) // x^2 70 fldt SECONDARG // y x^2 71 fmul %st(0), %st(0) // y^2 x^2 72 faddp // x^2+y^2 73 fsqrt // result 74 ret 75 76// Edge case returns --------------------------------------------------------- 77// 78// These are called from L_beCareful (below) to generate the correct return 79// value when one of the arguments is zero, infinity, or NaN. The column 80// of comments to the right contains the state of the x87 stack after the 81// instruction that is on the same line, or at the time of the jump in the 82// case of labels. 83// 84// hypot and cabs are a little bit special in that if one argument is infinity 85// and the other NaN, the C99 return value is infinity, not NaN. Put simply, 86// if either argument is infinity, we return infinity. Otherwise if either 87// argument is zero or NaN, we return |x|+|y|. 88 89L_yIsInfOrNaN: // inf |y| |x| 90 jp L_yIsNaN // inf |y| |x| 91 fstp %st(1) // inf |x| 92 fstp %st(1) // inf 93 ret 94L_xIsInfOrNaN: // |y| |x| 95 jp L_xIsNaN // |y| |x| 96 fstp %st(0) // |x| 97 ret 98L_yIsNaN: // inf |y| |x| 99 fucomip %st(2), %st(0) // |y| |x| 100 jz L_xIsInfOrNaN // |y| |x| 101L_xIsNaN: // |y| |x| 102 faddp // |y|+|x| 103 ret 104L_yIsZero: // 0.0 |y| |x| 105 fstp %st(0) // |y| |x| 106L_xIsZero: // |y| |x| 107 faddp // |y|+|x| 108 ret 109 110// Detect special cases ------------------------------------------------------ 111// 112// Here we know that one or both arguments are either edge cases or are 113// outside the range of numbers that can be squared without overflow or 114// underflow. Begin by loading both arguments on the x87 stack, and checking 115// to see if either is zero, infinity, or NaN: 116 117#ifdef __i386__ 118 #define PIC call 0f; 0: pop %ecx; 119 #define INFINITY L_infinity-0b(%ecx) 120 #define TINY L_tiny-0b(%ecx) 121#else 122 #define PIC // nothing 123 #define INFINITY L_infinity(%rip) 124 #define TINY L_tiny(%rip) 125#endif 126 127L_beCareful: 128 PIC // x87 stack 129 fldt FIRSTARG // x 130 fabs // |x| 131 fldt SECONDARG // y |x| 132 fabs // |y| |x| 133 134 fldt INFINITY // inf |y| |x| 135 fucomi %st(1), %st(0) // inf |y| |x| 136 jz L_yIsInfOrNaN // inf |y| |x| 137 fucomip %st(2), %st(0) // |y| |x| 138 jz L_xIsInfOrNaN // |y| |x| 139 140 fldz // 0.0 |y| |x| 141 fcomi %st(1), %st(0) // 0.0 |y| |x| 142 jz L_yIsZero // 0.0 |y| |x| 143 fcomip %st(2), %st(0) // |y| |x| 144 jz L_xIsZero // |y| |x| 145 146// Detect argument scaling --------------------------------------------------- 147 148 sub %edx, %eax 149 jl L_yIsLarger 150 151 // We want to have the argument with larger magnitude on the top of the x87 152 // stack; if x is larger in magnitude than y, we exchange them on the stack 153 fxch 154 155 // If exponent(x) - exponent(y) <= 32, we need to do a careful rescaling 156 // computation. Otherwise, we y only contributes rounding to the final 157 // result, so we can use a simpler approach. 158 cmp $32, %eax 159 jle L_scaledHypot 160 161// Scaled computation of |hi| + eps ------------------------------------------ 162// 163// At this point, we know that x and y are vastly different in scale, and the 164// smaller of the two contributes only to the rounding of the last bit of the 165// larger, which we call "hi". 166// 167// Instead of doing a normal hypot computation, just add a small term to the 168// significand to force rounding. 169// 170// We do this addition in the significand space; begin by splitting hi into 171// hisig and hiexp that satisfy: 172// 173// hi = hisig * 2^hiexp and 1.0 <= hisig < 2.0 174// 175// Compute hisigRounded = hisig + tiny. 176// 177// Rescale: result = hisigRounded * 2^hiexp 178// 179// Note that if hi is denormal, this will cause a double rounding. We don't 180// protect against this because (a) we're not trying for sub-ulp accuracy, 181// and (b) long double denormal are exceptionally rare. If one wanted to do 182// the "really really right thing", a simple exponent check would allow one 183// to protect against this. 184 185L_forceRounding: 186 fstp %st(1) // pop the smaller argument 187 fxtract // extract hisig, hiexp 188 fldt TINY // load tiny 189 faddp // hisig + tiny 190 fscale // compute the final result and pop 191 fstp %st(1) // hiexp off the stack. 192 ret 193 194// y >~ x -------------------------------------------------------------------- 195 196L_yIsLarger: 197 // If exponent(x) - exponent(y) < -32, we don't need to be careful, because 198 // x only affects the rounding of the last bit of |y|. Otherwise, do a 199 // careful scaled computation. 200 cmp $-32, %eax 201 jl L_forceRounding 202 203// Scaled computation of sqrt(x^2 + y^2) ------------------------------------- 204// 205// At this point, we know that x and y are approximately equal in scale 206// (within 2^32 of eachother), that squaring them will result in overflow or 207// underflow, and that they are on the top of the floating-point stack -- in 208// particular, the larger of the two is in st(0), and the smaller is in st(1). 209// We'll call the larger one "hi" and the smaller one "lo". 210// 211// In order to compute sqrt(x^2 + y^2) without undue overflow or underflow, 212// begin by splitting hi and lo into exponents and significands that satisfy: 213// 214// hi = hisig * 2^hiexp and 1.0 <= hisig < 2.0 215// lo = losig * 2^loexp and 1.0 <= losig < 2.0 216// 217// Because of the checks we have already done on the exponents, we know that: 218// 219// -32 <= (loexp - hiexp) <= 0 220// 221// Let scaledlo be losig * 2^(loexp - hiexp), and note that the following hold: 222// 223// 0x1.0p-33 < scaledlo < 2.0 and lo = scaledlo * 2^(hiexp) 224// 225// Now compute a scaled result using hisig and scaledlo: 226// 227// scaledResult = sqrt(hisig^2 + scaledlo^2) 228// 229// Note that because of the bounds we have on hisig and scaledlo, this 230// computation cannot cause overflow or underflow. Next, apply the proper 231// scaling to get the final result, which may overflow, but such overflow 232// is justified: 233// 234// result = scaledResult * 2^hiexp 235 236L_scaledHypot: 237 fxtract // Extract hisig and hiexp. 238 239 fxch %st(2) // Move lo to the top of the stack and 240 fxtract // extract losig and loexp. 241 242 fxch // Move loexp to the top of the stack and 243 fsub %st(2), %st(0) // compute (loexp - hiexp). 244 245 fxch // Move losig to the top of the stack and 246 fscale // scale by (loexp - hiexp) to compute 247 fstp %st(1) // scaledlo. 248 249 fmul %st(0), %st(0) // scaledlo^2. 250 251 fxch %st(2) // move hisig to the top of the stack and 252 fmul %st(0), %st(0) // compute hisig^2. 253 254 faddp %st(0), %st(2) // compute hisig^2 + scaledlo^2, and move 255 fxch // it to the top of the stack. Then take 256 fsqrt // the square root to get scaledResult. 257 258 fscale // apply the scale hiexp to get the final 259 fstp %st(1) // result and return. 260 ret 261 262// Useful constants ---------------------------------------------------------- 263 264.literal16 265.align 4 266L_infinity: 267.quad 0x8000000000000000, 0x0000000000007fff 268L_tiny: 269.quad 0x8000000000000000, 0x0000000000003f00