this repo has no description
at fixPythonPipStalling 351 lines 10 kB view raw
1/* tanf.s -- tanf for standard math library. 2 3 Written by Eric Postpischil, June 2007. 4*/ 5 6 7#define SignificandBits 24 8#define ExponentBits 8 9 10 .literal8 11 12 .align 3 13 14NegativeOne: 15 .double -1 16 17 18 .const 19 20 .align 4 21 22/* Coefficients to calculate: 23 24 x * (x**4 + c02*x**2 + c00) * (x**4 + c12*x**2 + c10) 25 * (x**4 + c22*x**2 + c20) * ( c32*x**2 + c30) 26 27 c02 and c12 are stored as a pair at the location below labeled c02, 28 c00 and c10 are at c00, c22 and c23 at c22, and c20 and c30 at c20. 29 30 This polynomial was designed to cover two binades (supporting an 31 exponent fold of one bit) for tanf with its period (pi) partitioned into 32 two intervals. The expanded polynomial was factored to create the form 33 above convenient for calculating with SIMD. 34 35 This was minimized for ULP error. 36*/ 37c02: .double -0.34431297545117363561, -1.53503935362160698123 38c00: .double +0.75337415977492632831, +0.92783629599735333058 39c22: .double +0.91301179133375347365, +5.27860261872543679865 40c20: .double +0.58807104225499649417, +3.82127246397188494330 41 42 43/* ExponentFold specifies how many bits of the exponent will not be used to 44 look up entries in the table. This must match the ExponentFold used to 45 generate the table. 46 47 The behavior for infinite inputs relies on the table contents -- if the 48 table entries looked up for an infinite input are m0 and m1, then 49 m0*infinity + m1*infinity should generate a NaN. This happens of either m0 50 or m1 is a zero or they have opposite signs. If this were not true, 51 additional arrangements would have to be made for infinite inputs. 52*/ 53#define ExponentFold 1 54 55// Include the reduction table. See tanfMakeTable.c for more information. 56#include "tanfTable.s" 57 58 59// Rename the general registers (just to make it easier to keep track of them). 60#if defined __i386__ 61 #define r0 %eax 62 #define r1 %ecx 63 #define r2 %edx 64 #define r3 %ebx 65 #define r4 %esp 66 #define r5 %ebp 67 #define r6 %esi 68 #define r7 %edi 69#elif defined __x86_64__ 70 #define r0 %rax 71 #define r1 %rcx 72 #define r2 %rdx 73 #define r3 %rbx 74 #define r4 %rsp 75 #define r5 %rbp 76 #define r6 %rsi 77 #define r7 %rdi 78#else 79 #error "Unknown architecture." 80#endif 81 82 83 .text 84 85 86// Define symbols for tanf. 87#define BaseP r0 // Base address for position-independent addressing. 88#define xInteger r1 // The input x in an integer register. 89#define ki r1 // x*m0 rounded to an integer. 90 91#define p0 %xmm0 // Must be in %xmm0 for return on x86_64. 92#define p2 %xmm1 93#define x %xmm2 94#define k %xmm3 95#define m1 %xmm4 96#define x1 %xmm5 97#define Numerator %xmm6 98 99#if defined __i386__ 100 101 // Define location of argument x. 102 #define Argx 4(%esp) 103 104 // Define how to address data. BaseP must contain the address of label 0. 105 #define Address(label) label-0b(BaseP) 106 107#elif defined __x86_64__ 108 109 // Define location of argument x. 110 #define Argx %xmm0 111 112 // Define how to address data. 113 #define Address(label) label(%rip) 114 115#endif 116 117 118/* float tanf(float x). 119 120 Notes: 121 122 Citations in parentheses below indicate the source of a requirement. 123 124 "C" stands for ISO/IEC 9899:TC2. 125 126 The Open Group specification (IEEE Std 1003.1, 2004 edition) adds no 127 requirements since it defers to C and requires errno behavior only if 128 we choose to support it by arranging for "math_errhandling & 129 MATH_ERRNO" to be non-zero, which we do not. 130 131 Return value: 132 133 For +/- zero, return zero with same sign (C F.9 12 and F.9.1.7). 134 135 For +/- infinity, return a NaN (C F.9.1.7). 136 137 For a NaN, return the same NaN (C F.9 11 and 13). 138 139 Otherwise: 140 141 If the rounding mode is round-to-nearest, return tangent(x) 142 faithfully rounded. 143 144 Not currently implemented: In other rounding modes, return 145 tangent(x) possibly with slightly worse error, not necessarily 146 honoring the rounding mode (Ali Sazegari narrowing C F.9 10). 147 148 Exceptions: 149 150 Raise underflow for a denormal result (C F.9 7 and Draft Standard for 151 Floating-Point Arithmetic P754 Draft 1.2.5 9.5). If the input is the 152 smallest normal, underflow may or may not be raised. This is stricter 153 than the older 754 standard. 154 155 May or may not raise inexact, even if the result is exact (C F.9 8). 156 157 Raise invalid if the input is a signalling NaN (C 5.2.4.2.2 3, in spite 158 of C F.2.1) or an infinity (C F.9.1.7) but not if the input is a quiet 159 NaN (C F.9 11). 160 161 May not raise exceptions otherwise (C F.9 9). 162 163 Properties: 164 165 Desired to be monotonic. Not yet proven! 166 167 Exhaustive testing proved this routine returns faithfully rounded 168 results. 169*/ 170 .align 5 171 .globl _tanf 172_tanf: 173 174 // Put x into an integer register. 175 #if defined __i386__ 176 mov Argx, xInteger // Put x into xInteger. 177 #elif defined __x86_64__ 178 movd Argx, xInteger // Put x into xInteger. 179 #endif 180 181 cvtss2sd Argx, x // Convert x to double precision. 182 183 #if defined __i386__ 184 185 // Get address of 0 in BaseP. 186 call 0f // Push program counter onto stack. 187 0: 188 pop BaseP // Get program counter. 189 190 #elif defined __x86_64__ 191 192 // Get address of reduction table in BaseP. 193 lea ReductionTable(%rip), BaseP 194 195 #endif 196 197 movsd Address(NegativeOne), Numerator 198 199 unpcklpd x, x // Duplicate x. 200 201 /* Shift: 202 203 SignificandBits-1 bits right to remove significand. 204 205 ExponentFold bits right to remove unused bits of exponent. 206 207 4 bits left to multiply by size of a table entry. 208 */ 209 shr $SignificandBits-1+ExponentFold-4, xInteger 210 211 // Clear bits other than those we are using from exponent. 212 and $(1<<ExponentBits-ExponentFold)-1 << 4, xInteger 213 je ExponentIsZero // Branch so we can get -0 right. 214 215 ucomisd x, x // Test for Nan. 216 jpe HandleNaN 217 218 // Get table entry containing m0 and m1 and calculate x*m0 and x*m1. 219 #if defined __i386__ 220 mulpd ReductionTable-0b(BaseP, xInteger), x 221 #elif defined __x86_64__ 222 mulpd (BaseP, xInteger), x 223 #endif 224 /* Let the exponent of x be E, that is, E is the largest integer such 225 that 2**E <= x. We use e to index the table, where e is E with the 226 low ExponentFold bits set to zero. For example, if ExponentFold is 227 1, e = E & ~1. 228 229 The table entry indexed by e represents a number m that is (u/p % 230 1)/u*2**a, where u is the ULP of 2**e, p is the period (pi), and 231 2**a is the number of intervals we divide the period into (we use 232 two, so a is one). 233 234 Observe that when 2**e <= x, x/u is an integer, and x/u < 235 2**(24+ExponentFold). 236 237 Then for some integer k, x*m = x*(u/p % 1)/u*2**a = x*(u/p - 238 k)/u*2**a = x/p*2**a - x/u*k*2**a. x/u*k*2**a is an integer, so 239 x*m is congruent to x/p*2**a modulo 1. 240 241 So tangent(x*m*p*2**-a) equals tangent(x). 242 243 However, we do not have m exactly. The table contains two numbers 244 for 2**e, m0 and m1. m0 has the first 29 bits of m, and m1 has the 245 next 53 bits. 246 247 Since x has 23 bits and m0 has 29 bits, their multiplication in 248 double format is exact. From that product, we subtract the nearest 249 integer, which changes nothing modulo 1 but reduces the magnitude. 250 Then we add x*m1, yielding a precise but inexact approximation of 251 x*m0 % 1 + x*m1. 252 253 The magnitude of x*m0 % 1 is at most 1/2, and x*m1 may make the sum 254 slightly larger. 255 256 Recall that m is (u/p % 1)/u*2**a, so m's magnitude is at most 257 .5/u*2**a. m0 contains 29 bits of this, so m1's magnitude is at 258 most 2**-29*.5/u*2**a. With an exponent fold of 1 bit, x/u is less 259 than 2**25, so |x*m1| <= |x*2**-29*.5/u*2**a| < 260 2**25**2**-29*.5*2**a = 2**(a-5). 261 262 Therefore, |x*m0 % 1 + x*m1| < .5 + 2**(a-5). The polynomial that 263 approximates tangent(p*x*2**-a) has been engineered to be good in 264 this interval. 265 */ 266 267 movapd Address(c02), p0 // Get c02 and c12. 268 movapd Address(c22), p2 // Get c22 and c32. 269 270 cvtsd2si x, ki // Round to integer. 271 // This requires round-to-nearest mode in the MXCSR. 272 cvtsi2sd ki, k // Convert to floating-point. 273 movhlps x, m1 // Get x*m1. 274 subsd k, x // Subtract integer, leaving fraction. 275 addsd m1, x // Get x*m0%1 + x*m1. 276 277/* Below this point, references in the comments to "x" refer to the 278 result of the reduction, not to the original input argument. 279*/ 280 unpcklpd x, x // Duplicate x for SIMD work. 281 test $1, ki // Is 2**0 bit on? 282 movsd x, x1 // Save a copy of x for later. 283 mulpd x, x // Get x**2. 284#define x2 x // Use the name x2 to emphasize we have x**2. 285 addpd x2, p0 // Get x**2+c02 and x**2+c12. 286 mulpd x2, p0 // Get x**4+c02*x**2 and x**4+c12*x**2. 287 addpd Address(c00), p0 // Finish first pair of terms. 288 addsd x2, p2 // Get x**2+c22 and c32. 289 mulpd x2, p2 // Get x**4+c22*x**2 and c32*x**2. 290 addpd Address(c20), p2 // Finish second pair of terms. 291 mulpd p2, p0 // Multiply parallel pairs. 292 mulsd x1, p0 // Multiply isolated x term. 293 movhlps p0, p2 // Get high factor. 294 mulsd p2, p0 // Combine parts from SIMD work. 295 jne 2f // Branch if bit was on. 296 297// ki was even, tangent is in p0. 298 299 cvtsd2ss p0, p0 // Convert result to single precision. 300 #if defined __i386__ 301 movss p0, Argx // Shuttle result through memory. 302 // This uses the argument area for scratch space, which is allowed. 303 flds Argx // Return result on floating-point stack. 304 #else 305 // On x86_64, the return value is now in p0, which is %xmm0. 306 #endif 307 308 ret 309 310// ki was odd, tangent is -1/p0. 3112: 312 divsd p0, Numerator 313 cvtsd2ss Numerator, p0 // Convert result to single precision. 314 315ReturnSingle: 316 #if defined __i386__ 317 movss p0, Argx // Shuttle result through memory. 318 // This uses the argument area for scratch space, which is allowed. 319 flds Argx // Return result on floating-point stack. 320 #else 321 // On x86_64, the return value is now in p0, which is %xmm0. 322 #endif 323 324 ret 325 326 327// Handle a NaN input. 328HandleNaN: 329 330 #if defined __i386__ 331 flds Argx // Return result on floating-point stack. 332 #else 333 cvtsd2ss x, %xmm0 // Return in %xmm0. 334 /* We cannot just return the input, because we must quiet a 335 signalling NaN. 336 */ 337 #endif 338 339 ret 340 341 342/* The raw exponent field is zero, so the input is a zero or a denormal. We 343 need to handle this separately to get -0 right, and it is faster. 344*/ 345ExponentIsZero: 346 .literal8 347Finagle: .double 1.00000001 348 .text 349 mulsd Address(Finagle), x // Make non-zero results inexact. 350 cvtsd2ss x, p0 // Return x. 351 jmp ReturnSingle