this repo has no description
at fixPythonPipStalling 490 lines 15 kB view raw
1/* sinfcosf.s -- sinf and cosf for standard math library. 2 3 Written by Eric Postpischil, June 2007. 4*/ 5 6 7#define SignificandBits 24 8#define ExponentBits 8 9 10 .literal8 11 12 .align 3 13 14Half: .double .5 15 16 17 .const 18 19 .align 4 20 21/* Coefficients to calculate x*(x**4+c0*x**2+c1) * c4*(x**4+c2*x**2+c3). 22 23 c0 and c2 are paired for loading into an XMM register, and so are c1 and c3. 24*/ 25// Two binades, fractions of pi, minimize ULP error, refactored: 26C02: .double -4.1647719829166299, -3.6493923712207774 27C13: .double 15.485466333375713, 2.6522034727867632 28C4: .double 0.076492480587904088 29 30/* ExponentFold specifies how many bits of the exponent will not be used to 31 look up entries in the table. This must match the ExponentFold used to 32 generate the table. 33 34 The behavior for infinite inputs relies on the table contents -- if the 35 table entries looked up for an infinite input are m0 and m1, then 36 m0*infinity + m1*infinity should generate a NaN. This happens of either m0 37 or m1 is a zero or they have opposite signs. If this were not true, 38 additional arrangements would have to be made for infinite inputs. 39*/ 40#define ExponentFold 1 41 42// Include the reduction table. See sinfcosfMakeTable.c for more information. 43#include "sinfcosfTable.s" 44 45 46// Rename the general registers (just to make it easier to keep track of them). 47#if defined __i386__ 48 #define r0 %eax 49 #define r1 %ecx 50 #define r2 %edx 51 #define r3 %ebx 52 #define r4 %esp 53 #define r5 %ebp 54 #define r6 %esi 55 #define r7 %edi 56#elif defined __x86_64__ 57 #define r0 %rax 58 #define r1 %rcx 59 #define r2 %rdx 60 #define r3 %rbx 61 #define r4 %rsp 62 #define r5 %rbp 63 #define r6 %rsi 64 #define r7 %rdi 65#else 66 #error "Unknown architecture." 67#endif 68 69 70 .text 71 72 73// Define symbols common to sinf and cosf. 74#define BaseP r0 // Base address for position-independent addressing. 75#define xInteger r1 // The input x in an integer register. 76 77#define p %xmm0 // Must be in %xmm0 for return on x86_64. 78#define x %xmm1 79#define ki %xmm2 80#define k %xmm3 81#define m1 %xmm4 82#define pa m1 83 84#if defined __i386__ 85 86 // Define location of argument x. 87 #define Argx 4(%esp) 88 89 // Define how to address data. BaseP must contain the address of label 0. 90 #define Address(label) label-0b(BaseP) 91 92#elif defined __x86_64__ 93 94 // Define location of argument x. 95 #define Argx %xmm0 96 97 // Define how to address data. 98 #define Address(label) label(%rip) 99 100#endif 101 102 103/* float sinf(float x). 104 105 Notes: 106 107 Citations in parentheses below indicate the source of a requirement. 108 109 "C" stands for ISO/IEC 9899:TC2. 110 111 The Open Group specification (IEEE Std 1003.1, 2004 edition) adds no 112 requirements since it defers to C and requires errno behavior only if 113 we choose to support it by arranging for "math_errhandling & 114 MATH_ERRNO" to be non-zero, which we do not. 115 116 Return value: 117 118 For +/- zero, return zero with same sign (C F.9 12 and F.9.1.6). 119 120 For +/- infinity, return a NaN (C F.9.1.6). 121 122 For a NaN, return the same NaN (C F.9 11 and 13). 123 124 Otherwise: 125 126 If the rounding mode is round-to-nearest, return sine(x) faithfully 127 rounded. 128 129 Not currently implemented: In other rounding modes, return sine(x) 130 possibly with slightly worse error, not necessarily honoring the 131 rounding mode (Ali Sazegari narrowing C F.9 10). 132 133 All results are in [-1, 1]. This is corollary to faithful rounding. 134 135 Exceptions: 136 137 Raise underflow for a denormal result (C F.9 7 and Draft Standard for 138 Floating-Point Arithmetic P754 Draft 1.2.5 9.5). If the input is the 139 smallest normal, underflow may or may not be raised. This is stricter 140 than the older 754 standard. 141 142 May or may not raise inexact, even if the result is exact (C F.9 8). 143 144 Raise invalid if the input is a signalling NaN (C 5.2.4.2.2 3, in spite 145 of C 4.2.1) or an infinity (C F.9.1.6) but not if the input is a quiet 146 NaN (C F.9 11). 147 148 May not raise exceptions otherwise (C F.9 9). 149 150 Properties: 151 152 Desired to be monotonic. Not yet proven! 153 154 Exhaustive testing proved this routine returns faithfully rounded 155 results. 156*/ 157 .align 5 158 .globl _sinf 159_sinf: 160 161 // Put x into an integer register. 162 #if defined __i386__ 163 mov Argx, xInteger // Put x into xInteger. 164 #elif defined __x86_64__ 165 movd Argx, xInteger // Put x into xInteger. 166 #endif 167 168 cvtss2sd Argx, x // Convert x to double precision. 169 170 #if defined __i386__ 171 172 // Get address of 0 in BaseP. 173 call 0f // Push program counter onto stack. 174 0: 175 pop BaseP // Get program counter. 176 177 #elif defined __x86_64__ 178 179 // Get address of reduction table in BaseP. 180 lea ReductionTable(%rip), BaseP 181 182 #endif 183 184 unpcklpd x, x // Duplicate x. 185 186 /* Shift: 187 188 SignificandBits-1 bits right to remove significand. 189 190 ExponentFold bits right to remove unused bits of exponent. 191 192 4 bits left to multiply by size of a table entry. 193 */ 194 shr $SignificandBits-1+ExponentFold-4, xInteger 195 196 // Clear bits other than those we are using from exponent. 197 and $(1<<ExponentBits-ExponentFold)-1 << 4, xInteger 198 je ExponentIsZero // Branch so we can get -0 right. 199 200 ucomisd x, x // Test for NaN. 201 jpe HandleNaN 202 203 // Get table entry containing m0 and m1 and calculate x*m0 and x*m1. 204 #if defined __i386__ 205 mulpd ReductionTable-0b(BaseP, xInteger), x 206 #elif defined __x86_64__ 207 mulpd (BaseP, xInteger), x 208 #endif 209 /* Let the exponent of x be E, that is, E is the largest integer such 210 that 2**E <= x. We use e to index the table, where e is E with the 211 low ExponentFold bits set to zero. For example, if ExponentFold is 212 1, e = E & ~1. 213 214 The table entry indexed by e represents a number m that is (u/p % 215 1)/u*2**a, where u is the ULP of 2**e, p is the period (2*pi), and 216 2**a is the number of intervals we divide the period into (we use 217 two, so a is one). 218 219 Observe that when 2**e <= x, x/u is an integer, and x/u < 220 2**(24+ExponentFold). 221 222 Then for some integer k, x*m = x*(u/p % 1)/u*2**a = x*(u/p - 223 k)/u*2**a = x/p*2**a - x/u*k*2**a. x/u*k*2**a is an integer, so 224 x*m is congruent to x/p*2**a modulo 1. 225 226 So sin(x*m*p*2**-a) equals sin(x). 227 228 However, we do not have m exactly. The table contains two numbers 229 for 2**e, m0 and m1. m0 has the first 29 bits of m, and m1 has the 230 next 53 bits. 231 232 Since x has 23 bits and m0 has 29 bits, their multiplication in 233 double format is exact. From that product, we subtract the nearest 234 integer, which changes nothing modulo 1 but reduces the magnitude. 235 Then we add x*m1, yielding a precise but inexact approximation of 236 x*m0 % 1 + x*m1. 237 238 The magnitude of x*m0 % 1 is at most 1/2, and x*m1 may make the sum 239 slightly larger. 240 241 Recall that m is (u/p % 1)/u*2**a, so m's magnitude is at most 242 .5/u*2**a. m0 contains 29 bits of this, so m1's magnitude is at 243 most 2**-29*.5/u*2**a. With an exponent fold of 1 bit, x/u is less 244 than 2**25, so |x*m1| <= |x*2**-29*.5/u*2**a| < 245 2**25**2**-29*.5*2**a = 2**(a-5). 246 247 Therefore, |x*m0 % 1 + x*m1| < .5 + 2**(a-5). The polynomial that 248 approximates sin(p*x*2**-a) has been engineered to be good in this 249 interval. 250 */ 251 252 cvtpd2dq x, ki // Round to integer. 253 // This requires round-to-nearest mode in the MXCSR. 254 cvtdq2pd ki, k // Convert to floating-point. 255 movhlps x, m1 // Get x*m1. 256 movapd Address(C02), p // Get c0 and c2. 257 psllq $63, ki // Move low bit to high bit. 258 subsd k, x // Subtract integer, leaving fraction. 259 addsd m1, x // Get x*m0%1 + x*m1. 260 xorpd x, ki // Negate x if low bit of ki was set. 261 mulsd x, x // Square x. 262 unpcklpd x, x // Duplicate x**2. 263 movhpd Address(C4), ki // Get sign*x and c4. 264 addpd x, p // Make x**2+c0 and x**2+c2. 265 mulpd x, p // Make x**4+c0*x**2 and x**4+c2*x**2. 266 addpd Address(C13), p // Make x**4+c0*x**2+c1 and x**4+c2*x**2+c3. 267 mulpd ki, p // Make sign*x*(x**4+c0*x**2+c1) and 268 // c4*(x**4+c2*x**2+c3). 269 movhlps p, pa // Get c4*(x**4+c2*x**2+c3). 270 mulsd pa, p // Multiply final two factors. 271 cvtsd2ss p, p // Convert result to single precision. 272 273ReturnSingle: 274 #if defined __i386__ 275 movss p, Argx // Shuttle result through memory. 276 // This uses the argument area for scratch space, which is allowed. 277 flds Argx // Return input on floating-point stack. 278 #else 279 // On x86_64, the return value is now in p, which is %xmm0. 280 #endif 281 282 ret 283 284 285/* The raw exponent field is zero, so the input is a zero or a denormal. We 286 need to handle this separately to get -0 right, and it is faster. 287*/ 288ExponentIsZero: 289 .literal8 290Finagle: .double 1.00000001 291 .text 292 mulsd Address(Finagle), x // Make non-zero results inexact. 293 cvtsd2ss x, p // Return x. 294 jmp ReturnSingle 295 296 297// Handle a NaN input. 298HandleNaN: 299 300 #if defined __i386__ 301 flds Argx // Return result on floating-point stack. 302 #else 303 cvtsd2ss x, %xmm0 // Return in %xmm0. 304 /* We cannot just return the input, because we must quiet a 305 signalling NaN. 306 */ 307 #endif 308 309 ret 310 311 312/* float cosf(float x). 313 314 Notes: 315 316 Citations in parentheses below indicate the source of a requirement. 317 318 "C" stands for ISO/IEC 9899:TC2. 319 320 The Open Group specification (IEEE Std 1003.1, 2004 edition) adds no 321 requirements since it defers to C and requires errno behavior only if 322 we choose to support it by arranging for "math_errhandling & 323 MATH_ERRNO" to be non-zero, which we do not. 324 325 Return value: 326 327 For +/- infinity, return a NaN (C F.9.1.6). 328 329 For a NaN, return the same NaN (C F.9 11 and 13). 330 331 Otherwise: 332 333 If the rounding mode is round-to-nearest, return cosine(x) 334 faithfully rounded. 335 336 Not currently implemented: In other rounding modes, return sine(x) 337 possibly with slightly worse error, not necessarily honoring the 338 rounding mode (Ali Sazegari narrowing C F.9 10). 339 340 All results are in [-1, 1]. This is corollary to faithful rounding. 341 342 Exceptions: 343 344 Raise underflow for a denormal result (C F.9 7 and Draft Standard for 345 Floating-Point Arithmetic P754 Draft 1.2.5 9.5). If the input is the 346 smallest normal, underflow may or may not be raised. This is stricter 347 than the older 754 standard. 348 349 May or may not raise inexact, even if the result is exact (C F.9 8). 350 351 Raise invalid if the input is a signalling NaN (C 5.2.4.2.2 3, in spite 352 of C 4.2.1) or an infinity (C F.9.1.5) but not if the input is a quiet 353 NaN (C F.9 11). 354 355 May not raise exceptions otherwise (C F.9 9). 356 357 Properties: 358 359 Desired to be monotonic. Not yet proven! 360 361 Exhaustive testing proved this routine returns faithfully rounded 362 results. 363*/ 364 .align 5 365 .globl _cosf 366_cosf: 367 // This code is identical to _sinf except for the addition of Half. 368 369 // Put x into an integer register. 370 #if defined __i386__ 371 mov Argx, xInteger // Put x into xInteger. 372 #elif defined __x86_64__ 373 movd Argx, xInteger // Put x into xInteger. 374 #endif 375 376 cvtss2sd Argx, x // Convert x to double precision. 377 378 #if defined __i386__ 379 380 // Get address of 0 in BaseP. 381 call 0f // Push program counter onto stack. 382 0: 383 pop BaseP // Get program counter. 384 385 #elif defined __x86_64__ 386 387 // Get address of reduction table in BaseP. 388 lea ReductionTable(%rip), BaseP 389 390 #endif 391 392 unpcklpd x, x // Duplicate x. 393 394 /* Shift: 395 396 SignificandBits-1 bits right to remove significand. 397 398 ExponentFold bits right to remove unused bits of exponent. 399 400 4 bits left to multiply by size of a table entry. 401 */ 402 shr $SignificandBits-1+ExponentFold-4, xInteger 403 404 // Clear bits other than those we are using from exponent. 405 and $(1<<ExponentBits-ExponentFold)-1 << 4, xInteger 406 407 ucomisd x, x // Test for NaN. 408 jpe HandleNaN 409 410 // Get table entry containing m0 and m1 and calculate x*m0 and x*m1. 411 #if defined __i386__ 412 mulpd ReductionTable-0b+0*8(BaseP, xInteger), x 413 #elif defined __x86_64__ 414 mulpd (BaseP, xInteger), x 415 #endif 416 /* Let the exponent of x be E, that is, E is the largest integer such 417 that 2**E <= x. We use e to index the table, where e is E with the 418 low ExponentFold bits set to zero. For example, if ExponentFold is 419 1, e = E & ~1. 420 421 The table entry indexed by e represents a number m that is (u/p % 422 1)/u*2**a, where u is the ULP of 2**e, p is the period (2*pi), and 423 2**a is the number of intervals we divide the period into (we use 424 two, so a is one). 425 426 Observe that when 2**e <= x, x/u is an integer, and x/u < 427 2**(24+ExponentFold). 428 429 Then for some integer k, x*m = x*(u/p % 1)/u*2**a = x*(u/p - 430 k)/u*2**a = x/p*2**a - x/u*k*2**a. x/u*k*2**a is an integer, so 431 x*m is congruent to x/p*2**a modulo 1. 432 433 So cos(x*m*p*2**-a) equals cos(x). 434 435 However, we do not have m exactly. The table contains two numbers 436 for 2**e, m0 and m1. m0 has the first 29 bits of m, and m1 has the 437 next 53 bits. 438 439 Since x has 23 bits and m0 has 29 bits, their multiplication in 440 double format is exact. From that product, we subtract the nearest 441 integer, which changes nothing modulo 1 but reduces the magnitude. 442 Then we add x*m1, yielding a precise but inexact approximation of 443 x*m0 % 1 + x*m1. 444 445 The magnitude of x*m0 % 1 is at most 1/2, and x*m1 may make the sum 446 slightly larger. 447 448 Recall that m is (u/p % 1)/u*2**a, so m's magnitude is at most 449 .5/u*2**a. m0 contains 29 bits of this, so m1's magnitude is at 450 most 2**-29*.5/u*2**a. With an exponent fold of 1 bit, x/u is less 451 than 2**25, so |x*m1| <= |x*2**-29*.5/u*2**a| < 452 2**25**2**-29*.5*2**a = 2**(a-5). 453 454 Therefore, |x*m0 % 1 + x*m1| < .5 + 2**(a-5). The polynomial that 455 approximates sin(p*x*2**-a) has been engineered to be good in this 456 interval. 457 */ 458 459 addsd Address(Half), x 460 // Add a quarter circle, then calculate sine. 461 cvtpd2dq x, ki // Round to integer. 462 // This requires round-to-nearest mode in the MXCSR. 463 cvtdq2pd ki, k // Convert to floating-point. 464 movhlps x, m1 // Get x*m1. 465 movapd Address(C02), p // Get c0 and c2. 466 psllq $63, ki // Move low bit to high bit. 467 subsd k, x // Subtract integer, leaving fraction. 468 addsd m1, x // Get x*m0%1 + x*m1. 469 xorpd x, ki // Negate x if low bit of ki was set. 470 mulsd x, x // Square x. 471 unpcklpd x, x // Duplicate x**2. 472 movhpd Address(C4), ki // Get sign*x and c4. 473 addpd x, p // Make x**2+c0 and x**2+c2. 474 mulpd x, p // Make x**4+c0*x**2 and x**4+c2*x**2. 475 addpd Address(C13), p // Make x**4+c0*x**2+c1 and x**4+c2*x**2+c3. 476 mulpd ki, p // Make sign*x*(x**4+c0*x**2+c1) and 477 // c4*(x**4+c2*x**2+c3). 478 movhlps p, pa // Get c4*(x**4+c2*x**2+c3). 479 mulsd pa, p // Multiply final two factors. 480 cvtsd2ss p, p // Convert result to single precision. 481 482 #if defined __i386__ 483 movss p, Argx // Shuttle result through memory. 484 // This uses the argument area for scratch space, which is allowed. 485 flds Argx // Return result on floating-point stack. 486 #else 487 // On x86_64, the return value is now in p, which is %xmm0. 488 #endif 489 490 ret