this repo has no description
at fixPythonPipStalling 565 lines 21 kB view raw
1/* 2 * 3 * cosh.s 4 * 5 * by Stephen Canon 6 * 7 * Copyright (c) 2007, Apple Inc. All Rights Reserved. 8 * 9 * Implementation of the c99 cosh function for the MacOS X __i386__ and __x86_64__ ABIs. 10 */ 11 12#include <machine/asm.h> 13#include "abi.h" 14 15.const 16.align 4 17 // Polynomial coefficients, offset from exp2table 18 .quad 0x3f55e52272e0eaec, 0x3f55e52272e0eaec // c4 19 .quad 0x401cc9eea1e24220, 0x401cc9eea1e24220 // c3/c4 20 .quad 0x3fac6b08d78310b8, 0x3fac6b08d78310b8 // c2 21 .quad 0x3fcebfbdff82bda7, 0x3fcebfbdff82bda7 // c1 22 .quad 0x3fe62e42fefa39ef, 0x3fe62e42fefa39ef // c0 23 24 // Adapted from the table by jkidder in exp2.s 25 // -x 2^x 26exp2table: .quad 0x8000000000000000, 0x3ff0000000000000 27 .quad 0xbf8000003ac95980, 0x3ff0163daa4d2352 28 .quad 0xbf9000000f064dc0, 0x3ff02c9a3ea19cb9 29 .quad 0xbf98000018938b20, 0x3ff04315e8b3c115 30 .quad 0xbfa000000306d960, 0x3ff059b0d326ac37 31 .quad 0xbfa4000003824c90, 0x3ff0706b29f1f4cf 32 .quad 0xbfa8000004f61bf0, 0x3ff087451892075b 33 .quad 0xbfac00000e283450, 0x3ff09e3ecb187ccb 34 .quad 0xbfb00000078c7308, 0x3ff0b5586d50f577 35 .quad 0xbfb2000001591738, 0x3ff0cc922b81fa4a 36 .quad 0xbfb4000006507370, 0x3ff0e3ec331dbe33 37 .quad 0xbfb6000002e52340, 0x3ff0fb66b020e713 38 .quad 0xbfb8000005a20d30, 0x3ff11301d05505f2 39 .quad 0xbfba000000c21f28, 0x3ff12abdc07537b2 40 .quad 0xbfbc0000006f2418, 0x3ff1429aaeae5f8c 41 .quad 0xbfbe000004e59960, 0x3ff15a98c8e0759d 42 .quad 0xbfc0000002aea480, 0x3ff172b83cbe3227 43 .quad 0xbfc10000002d00d0, 0x3ff18af93890d45f 44 .quad 0xbfc20000017a069c, 0x3ff1a35beb93e6ce 45 .quad 0xbfc30000032ca860, 0x3ff1bbe084526787 46 .quad 0xbfc40000034f70a4, 0x3ff1d48731ba8c98 47 .quad 0xbfc50000026c3308, 0x3ff1ed5023390e5f 48 .quad 0xbfc6000002d6a13c, 0x3ff2063b88a9792c 49 .quad 0xbfc7000001164930, 0x3ff21f4991992be3 50 .quad 0xbfc80000027763f8, 0x3ff2387a6eb3ae9a 51 .quad 0xbfc9000003509a78, 0x3ff251ce5006d5a1 52 .quad 0xbfca000001a60348, 0x3ff26b45660c949f 53 .quad 0xbfcb000003b0e9d0, 0x3ff284dfe254261d 54 .quad 0xbfcc0000004cde7c, 0x3ff29e9df5279f0b 55 .quad 0xbfcd000000cfdaf4, 0x3ff2b87fd0efebe8 56 .quad 0xbfce00000396f458, 0x3ff2d285a741ad9e 57 .quad 0xbfcf0000001fc510, 0x3ff2ecafa94170d1 58 .quad 0xbfd00000011547ec, 0x3ff306fe0a6adb04 59 .quad 0xbfd0800000eec6c6, 0x3ff32170fc7e5139 60 .quad 0xbfd1000001d61dae, 0x3ff33c08b2c605fd 61 .quad 0xbfd1800001a52e62, 0x3ff356c55fead73c 62 .quad 0xbfd200000051ad30, 0x3ff371a7374bdcfb 63 .quad 0xbfd28000002c2c50, 0x3ff38cae6d0f32b4 64 .quad 0xbfd3000001d21df8, 0x3ff3a7db3548da03 65 .quad 0xbfd380000146a8dc, 0x3ff3c32dc3599392 66 .quad 0xbfd40000002a7356, 0x3ff3dea64c1b56c2 67 .quad 0xbfd480000054f350, 0x3ff3fa4504bee17d 68 .quad 0xbfd50000005ea89a, 0x3ff4160a220bc5bf 69 .quad 0xbfd5800001dc84b4, 0x3ff431f5d9b8e238 70 .quad 0xbfd600000003a7aa, 0x3ff44e08606256f0 71 .quad 0xbfd68000007c88ac, 0x3ff46a41ed388948 72 .quad 0xbfd7000000e3699a, 0x3ff486a2b5f3cad8 73 .quad 0xbfd7800001d7fb72, 0x3ff4a32af1415232 74 .quad 0xbfd800000036178e, 0x3ff4bfdad542520c 75 .quad 0xbfd880000103da68, 0x3ff4dcb29a38937c 76 .quad 0xbfd9000001d0e1f2, 0x3ff4f9b27706c869 77 .quad 0xbfd980000045a3a2, 0x3ff516daa2df4e31 78 .quad 0xbfda0000015743c6, 0x3ff5342b56ec23d4 79 .quad 0xbfda80000182b74e, 0x3ff551a4cab6dc4d 80 .quad 0xbfdb00000082fdae, 0x3ff56f4736d39096 81 .quad 0xbfdb8000014aa914, 0x3ff58d12d4e4f5b2 82 .quad 0xbfdc0000008a02aa, 0x3ff5ab07dd68b75f 83 .quad 0xbfdc800001be8102, 0x3ff5c9268ac2a0da 84 .quad 0xbfdd0000018186c2, 0x3ff5e76f160896a5 85 .quad 0xbfdd800001cbe9ce, 0x3ff605e1b9e48ea4 86 .quad 0xbfde0000013fb67c, 0x3ff6247eb0870165 87 .quad 0xbfde800000ef7448, 0x3ff6434635067f92 88 .quad 0xbfdf0000005a7522, 0x3ff66238826b1001 89 .quad 0xbfdf800001b5116a, 0x3ff68155d4b7317e 90 .quad 0xbfe0000000888e45, 0x3ff6a09e66c229dd 91 .quad 0xbfe0400000205a67, 0x3ff6c012751bcc3c 92 .quad 0xbfe0800000b6586f, 0x3ff6dfb23cbf72c3 93 .quad 0xbfe0c00000f74e0b, 0x3ff6ff7df9ccc6d4 94 .quad 0xbfe1000000a557b8, 0x3ff71f75e93f2fc7 95 .quad 0xbfe14000004dbf55, 0x3ff73f9a48cca865 96 .quad 0xbfe1800000641c09, 0x3ff75feb567517a8 97 .quad 0xbfe1c00000f97615, 0x3ff78069505d5b2f 98 .quad 0xbfe20000002ed0b2, 0x3ff7a1147402f7a4 99 .quad 0xbfe2400000a7c436, 0x3ff7c1ed018716b2 100 .quad 0xbfe28000007d3dbd, 0x3ff7e2f337101b34 101 .quad 0xbfe2c000000368bf, 0x3ff80427543fe015 102 .quad 0xbfe3000000adbadd, 0x3ff8258999a7ac0d 103 .quad 0xbfe3400000d62b8b, 0x3ff8471a46946832 104 .quad 0xbfe3800000ec0d48, 0x3ff868d99bc161d2 105 .quad 0xbfe3c0000054b58f, 0x3ff88ac7d9b76eb5 106 .quad 0xbfe40000006e90f6, 0x3ff8ace54265b990 107 .quad 0xbfe44000002abac4, 0x3ff8cf3216cc3af3 108 .quad 0xbfe4800000c48632, 0x3ff8f1ae997fa64d 109 .quad 0xbfe4c00000cad5c9, 0x3ff9145b0c00301f 110 .quad 0xbfe5000000350fe0, 0x3ff93737b0eac151 111 .quad 0xbfe5400000a31c3c, 0x3ff95a44cc21e4e0 112 .quad 0xbfe5800000ae6cc6, 0x3ff97d82a03e9cf0 113 .quad 0xbfe5c00000c26ee9, 0x3ff9a0f17135f7b9 114 .quad 0xbfe600000091b67d, 0x3ff9c49182f54513 115 .quad 0xbfe640000015c9de, 0x3ff9e86319ef5d59 116 .quad 0xbfe68000006ad404, 0x3ffa0c667b9a2c00 117 .quad 0xbfe6c000000cd09a, 0x3ffa309bec517245 118 .quad 0xbfe7000000fe5d74, 0x3ffa5503b2cf3ac1 119 .quad 0xbfe740000011ec75, 0x3ffa799e133afab3 120 .quad 0xbfe7800000249e28, 0x3ffa9e6b558f1ac2 121 .quad 0xbfe7c000002e4a1c, 0x3ffac36bbfeec930 122 .quad 0xbfe80000008c2c1b, 0x3ffae89f99ac8744 123 .quad 0xbfe8400000b96bed, 0x3ffb0e0729fa6005 124 .quad 0xbfe880000017a522, 0x3ffb33a2b85d048f 125 .quad 0xbfe8c000008666ed, 0x3ffb59728e34f847 126 .quad 0xbfe90000009231ee, 0x3ffb7f76f3527236 127 .quad 0xbfe9400000998280, 0x3ffba5b030fcf4ad 128 .quad 0xbfe9800000789634, 0x3ffbcc1e90945d34 129 .quad 0xbfe9c00000464c5a, 0x3ffbf2c25c01acba 130 .quad 0xbfea000000ac9edb, 0x3ffc199bddee6449 131 .quad 0xbfea4000009db0ed, 0x3ffc40ab60605147 132 .quad 0xbfea800000b25f74, 0x3ffc67f12ec591f4 133 .quad 0xbfeac00000dd93d8, 0x3ffc8f6d948ffb53 134 .quad 0xbfeb0000009a8e0d, 0x3ffcb720dd4fb272 135 .quad 0xbfeb4000006c9216, 0x3ffcdf0b55a1a9bc 136 .quad 0xbfeb80000029211c, 0x3ffd072d4a2165e4 137 .quad 0xbfebc00000acf6b0, 0x3ffd2f87087ae250 138 .quad 0xbfec000000c25639, 0x3ffd5818dd772ab4 139 .quad 0xbfec400000c79407, 0x3ffd80e317490efb 140 .quad 0xbfec8000001b543f, 0x3ffda9e603ecc1e4 141 .quad 0xbfecc00000bac791, 0x3ffdd321f37a5ea0 142 .quad 0xbfed000000264fb7, 0x3ffdfc9733947dd8 143 .quad 0xbfed4000007cc76d, 0x3ffe264615471e42 144 .quad 0xbfed8000006c7c5f, 0x3ffe502ee7d27b94 145 .quad 0xbfedc00000504b69, 0x3ffe7a51fbfc4eb0 146 .quad 0xbfee0000003582a0, 0x3ffea4afa2c81573 147 .quad 0xbfee400000ef29b3, 0x3ffecf482e2e03c7 148 .quad 0xbfee80000056b48b, 0x3ffefa1bee9b87c4 149 .quad 0xbfeec00000144453, 0x3fff252b377966cc 150 .quad 0xbfef0000002820d1, 0x3fff50765b897d3f 151 .quad 0xbfef400000dcd2f6, 0x3fff7bfdae3356ec 152 .quad 0xbfef800000132d42, 0x3fffa7c181abb707 153 .quad 0xbfefc00000cf5c89, 0x3fffd3c22c1e669f 154 155// Minimax polynomial coefficients constructed by Ali Sazegari 156small_poly: .quad 0x3fe0000000000000 157 .double 0.41666666666666676765549465759097287357999653e-1 158 .double 0.13888888888879080676272078806906929385566483e-2 159 .double 0.24801587336424046540631114342999222671468169e-4 160 .double 0.27557263298882991653449608369821879414016807e-6 161 .double 0.20918130318061568990950557203211923657510680e-8 162 163.literal8 164.align 3 165one_n7: .quad 0x3f80000000000000 166lge_p7: .quad 0x40671547652b82fe 167lge_hi: .quad 0x3ff7154760000000 168lge_lo: .quad 0x3e54ae0bf85ddf44 169one_p55: .quad 0x4360000000000000 170one_n55: .quad 0x3c80000000000000 171one: .quad 0x3ff0000000000000 172min_normal: .quad 0x0010000000000000 173huge_val: .quad 0x7fefffffffffffff 174poly_mask: .quad 0xfffffe0000000000 175 176.text 177#if defined(__x86_64__) 178 #define RELATIVE_ADDR(_a) (_a)(%rip) 179#else 180 #define RELATIVE_ADDR(_a) (_a)-cosh_body(%ecx) 181 182.align 4 183cosh_pic: 184 movl (%esp), %ecx 185 ret 186#endif 187 188ENTRY(cosh) 189#if defined(__x86_64__) 190 movapd %xmm0, %xmm1 191 psrlq $32, %xmm0 192 movd %xmm0, %eax 193#else // arch = i386 194 movl 4+FRAME_SIZE(STACKP), %eax 195 movsd FRAME_SIZE(STACKP), %xmm1 196 calll cosh_pic 197cosh_body: 198#endif 199 200 pcmpeqd %xmm0, %xmm0 201 andl $0x7fffffff, %eax 202 psrlq $1, %xmm0 203 subl $0x3fd62e42, %eax 204 cmpl $0x005ed1bd, %eax // If (|x| < .34657... or |x| > 21 or isnan(x)) 205 ja 1f // goto 1 206 207 /* If .3465.... < |x| < 21: 208 * 209 * 1. compute the head-tail product 210 * 211 * scaled_hi + scaled_lo = lg_e * |x| with relative error < 2^-80 212 * 213 * 2. set n = floor(lg_e * |x|) 214 * 215 * 3. set a = top 7 bits of the fractional part of lg_e * |x| 216 * 217 * 4. use a to lookup to find f_hi and g_hi such that 218 * 219 * |x| * lg_e = n + f_hi + f_lo with relative error < 2^-60 and -2^-24 < f_lo < 2^-7 + tiny 220 * |x| * lg_e = -n-1 + g_hi + g_lo with relative error < 2^-60 and -2^-24 < g_lo < 2^-7 + tiny 221 * 222 * 5. evaluate a minimax polynomial p(x) ~ 2^x at f_lo and g_lo 223 * 224 * 6. lookup b = 2^f_hi and c = 2^g_hi with relative error < 2^-75 225 * 226 * 7. if n < 16, split c into c_head and c_tail with c_head 21 bits wide. 227 * if n >= 16, then c_head = 0 and c_tail = c 228 * 229 * 8. Assemble the final result: 230 * 231 * (2^(n-1)b - 2^(-n-2)c_head) + ((2^(n-1)b * f_lo)p(f_lo) - (2^(-n-2)c * g_lo)p(g_lo) + c_tail) 232 * 233 * 9. The first subtraction is exact. The rounding point of the other subtractions is at least 6 bits to the 234 * right of the rounding point of the final result. The two summands are therefore correct to within 2^-5 ulps 235 * of the final result, so the error is bounded by ~ .53 ulps. 236 * 237 * The worst case errors occur in the binade from .25 - .5, as this is where b and c are of comparable magnitude. 238 * 239 */ 240 241 movsd RELATIVE_ADDR(lge_p7), %xmm3 242 movapd %xmm0, %xmm2 // xmm2 <-- 0x7fffffff....ffff 243 andpd %xmm1, %xmm0 // xmm0 <-- |x| 244 mulsd %xmm0, %xmm3 // xmm3 <-- lg(e) * |x| * 128.0 245 psllq $27, %xmm2 // xmm2 <-- 0xfffffffff8000000 (mask for the high 26 mantissa bits) 246 cvttsd2si %xmm3, %eax // %eax <-- (int)(lg(e) * |x| * 128.0) 247 movl %eax, %edx 248 and $0x7f, AX_P 249 shrl $7, %edx // %edx <-- n = (int)(lg(e) * |x|) 250 shll $4, %eax 251 252 /* Needed values are now: 253 xmm0 - |x| 254 xmm1 - signbit(x) 255 xmm2 - 0xfffffffff8000000 256 xmm3 - lg(e) * |x| * 0x1.0p7 257 %eax - lookup value from top 7 bits of the fractional part of |x| * lg_e 258 %edx - n = (int)(|x| * lg_e) 259 Everything else is fair game. */ 260 261 mulsd RELATIVE_ADDR(one_n7), %xmm3 // xmm3 <-- scaled_hi = lg(e) * |x| 262 andpd %xmm0, %xmm2 // xmm2 <-- x_hi 263 subsd %xmm2, %xmm0 // xmm0 <-- x_lo 264 movsd RELATIVE_ADDR(lge_hi), %xmm4 265 movsd RELATIVE_ADDR(lge_lo), %xmm5 266 movapd %xmm2, %xmm6 // xmm6 <-- x_hi 267 mulsd %xmm4, %xmm2 // xmm2 <-- x_hi * lge_hi 268 mulsd %xmm0, %xmm4 // xmm4 <-- x_lo * lge_hi 269 mulsd %xmm5, %xmm6 // xmm6 <-- x_hi * lge_lo 270 mulsd %xmm5, %xmm0 // xmm0 <-- x_lo * lge_lo 271 subsd %xmm3, %xmm2 // xmm2 <-- x_hi*lge_hi - x*lge 272 addsd %xmm4, %xmm2 // xmm2 <-- (x_hi*lge_hi - x*lge) + x_lo*lge_hi 273 addsd %xmm6, %xmm2 // xmm2 <-- ((x_hi*lge_hi - x*lge) + x_lo*lge_hi) + x_hi*lge_lo 274 addsd %xmm0, %xmm2 // xmm2 <-- (((x_hi*lge_hi - x*lge) + x_lo*lge_hi) + x_hi*lge_lo) + x_lo*lge_lo = scaled_lo 275 276 /* Needed values are now: 277 xmm1 - signbit of x 278 xmm2 - low part of |x| * lg_e 279 xmm3 - high part of |x| * lg_e 280 %eax - lookup value from top 7 bits of the fractional part of |x| * lg_e 281 %edx - n = (int)(|x| * lg_e) 282 Everything else is fair game. */ 283 284 cvtsi2sd %edx, %xmm4 // xmm4 <-- (double)n 285 subsd %xmm4, %xmm3 // xmm3 <-- frac = scaled_hi - (double)n 286 lea RELATIVE_ADDR(exp2table), CX_P 287 movapd (CX_P,AX_P,1), %xmm5 // xmm5 <-- -f_hi, exp2(f_hi) 288 negl %eax 289 addl $0x7f0, %eax 290 movapd (CX_P,AX_P,1), %xmm1 // xmm7 <-- -g_hi, exp2(g_hi) 291 addsd %xmm3, %xmm5 // xmm5 <-- frac - f_hi, exp2(f_hi) 292 subsd 8(CX_P), %xmm3 // xmm3 <-- frac - 1.0 293 addsd %xmm2, %xmm5 // xmm5 <-- f_lo = (frac - f_hi) + scaled_lo, exp2(f_hi) 294 subsd %xmm3, %xmm1 // xmm7 <-- (1.0 - frac) - g_hi, exp2(g_hi) 295 subsd %xmm2, %xmm1 // xmm7 <-- g_lo = ((1.0 - frac) - g_hi) - scaled_lo, exp2(g_hi) 296 297 movapd %xmm1, %xmm0 298 shufpd $3, %xmm5, %xmm1 // xmm7 <-- exp2(g_hi), exp2(f_hi) 299 shufpd $0, %xmm5, %xmm0 // xmm0 <-- g_lo, f_lo 300 301 /* Needed values are now: 302 xmm0 - g_lo, f_lo 303 xmm1 - exp2(g_hi), exp2(f_hi) 304 %edx - n 305 Everything else is fair game. */ 306 307 // put [2^(-n-2), 2^(n-1)] into xmm7 308 movl %edx, %eax 309 addl $(1023 - 1), %edx 310 movd %edx, %xmm6 311 movl $(1023 - 2), %edx 312 subl %eax, %edx 313 movd %edx, %xmm7 314 movlhps %xmm6, %xmm7 315 psllq $52, %xmm7 316 317 // For the purposes of commenting polynomial evaluation, "x" = [f_lo, g_lo] 318 movapd %xmm0, %xmm2 319 mulpd %xmm0, %xmm0 // xmm0 <-- x * x 320 movapd %xmm2, %xmm3 321 mulpd -64(CX_P), %xmm2 // xmm2 <-- c3/c4 * x 322 movapd %xmm3, %xmm4 // xmm4 <-- x 323 mulpd -32(CX_P), %xmm3 // xmm3 <-- c1 * x 324 movapd %xmm0, %xmm5 325 mulpd -80(CX_P), %xmm0 // xmm0 <-- c4 * xx 326 addpd %xmm5, %xmm2 // xmm2 <-- xx + c3x/c4 327 mulpd -48(CX_P), %xmm5 // xmm5 <-- c2 * xx 328 addpd -16(CX_P), %xmm3 // xmm3 <-- c1x + c0 329 mulpd %xmm0, %xmm2 // xmm2 <-- c4xx * (xx + c3x/c4) xmm0 freed 330 addpd %xmm5, %xmm3 // xmm3 <-- c2xx + (c1x + c0) xmm5 freed 331 addpd %xmm3, %xmm2 // xmm2 <-- exp2poly(x) xmm3 freed 332 333 // Compute a mask to separate the high and low parts of the lookup exponentials, if necessary. 334 movl $5, %edx 335 movd %eax, %xmm6 // xmm6 <-- n 336 movd %edx, %xmm5 // xmm5 <-- 4 337 pcmpgtd %xmm6, %xmm5 // xmm5 = (n < 4) ? [ 0, 0, 0, -1] : [ 0, 0, 0, 0] 338 pcmpeqd %xmm6, %xmm6 // xmm6 = -1ULL 339 movlhps %xmm6, %xmm5 // xmm5 = (n < 4) ? [ -1, -1, 0, -1] : [ -1, -1, 0, 0] 340 psllq $32, %xmm5 // xmm5 = (n < 4) ? [ -1, 0, -1, 0] : [ -1, 0, 0, 0] 341 342 // multiply the scale factors in xmm7 into [exp2(g_hi), exp2(f_hi)] 343 mulpd %xmm7, %xmm1 // xmm1 <-- exp2(-n-2 + g_hi), exp2(n-1 + f_hi) 344 // multiply the scaled results into [g_lo, f_lo] 345 mulpd %xmm1, %xmm4 346 // multiply that result into the polynomial 347 mulpd %xmm4, %xmm2 // xmm2 <-- g_lo*exp2(-n-2 + g_hi) * g_lo_poly, f_lo*exp2(n-1 + f_hi) * f_lo_poly 348 349 andpd %xmm1, %xmm5 // xmm5 <-- high parts of [ exp2(-n-2 + g_hi), exp2(n-1 + f_hi) ] 350 subpd %xmm5, %xmm1 // xmm1 <-- low parts of [ exp2(-n-2 + g_hi), exp2(n-1 + f_hi) ] 351 352 movhlps %xmm1, %xmm4 // xmm4 <-- low part of exp2(n-1 + f_hi) 353 movhlps %xmm2, %xmm3 // xmm3 <-- f_lo*exp2(n-1 + f_hi) * f_lo_poly 354 movhlps %xmm5, %xmm0 // xmm0 <-- high part of exp2(n-1 + f_hi) 355 356 addsd %xmm1, %xmm4 // xmm4 = low part of exp2(-n-2 + g_hi) + low part of exp2(n-1 + f_hi) 357 addsd %xmm2, %xmm4 // xmm4 += g_lo*exp2(-n-2 + g_hi) * g_lo_poly 358 addsd %xmm3, %xmm4 // xmm4 += f_lo*exp2(n-1 + f_hi) * f_lo_poly 359 addsd %xmm5, %xmm0 // xmm0 = high part of exp2(n-1 + f_hi) + high part of exp2(-n-2 + g_hi) 360 addsd %xmm4, %xmm0 // xmm0 += xmm4 361 362#if defined(__i386__) 363 movsd %xmm0, FRAME_SIZE(STACKP) 364 fldl FRAME_SIZE(STACKP) 365#endif 366 ret 367 3681: 369 jg 5f // If |x| > 21 or isnan(x), goto 4 370 cmpl $0xff39d1be, %eax // If |x| <= 2^-14 371 jbe 2f 372 373 // If .3465.... >= |x| > 2^-14, we use a minimax polynomial approximation with error < .53 ulps 374 movapd %xmm1, %xmm4 375 mulsd %xmm1, %xmm1 // xmm1 <-- u = xx 376 lea RELATIVE_ADDR(small_poly), AX_P 377 378 // Horner's is needed for numerics reasons, and the performance is actually pretty good. 379 movapd %xmm1, %xmm2 380 mulsd 40(AX_P), %xmm1 381 movapd %xmm2, %xmm3 382 mulsd %xmm2, %xmm2 383 addsd 32(AX_P), %xmm1 384 mulsd %xmm3, %xmm1 385 addsd 24(AX_P), %xmm1 386 mulsd %xmm3, %xmm1 387 addsd 16(AX_P), %xmm1 388 mulsd %xmm3, %xmm1 389 addsd 8(AX_P), %xmm1 390 mulsd %xmm2, %xmm1 // xmm1 <-- x^4 * p(x^2) 391 392 movsd RELATIVE_ADDR(poly_mask), %xmm0 393 andpd %xmm4, %xmm0 // xmm0 <-- x_hi 394 movapd %xmm4, %xmm5 // xmm5 <-- x 395 subsd %xmm0, %xmm4 // xmm4 <-- x_lo 396 movapd %xmm0, %xmm6 // xmm6 <-- x_hi 397 mulsd (AX_P), %xmm0 // xmm0 <-- 1/2 * x_hi 398 399 xorpd %xmm6, %xmm5 400 psrlq $1, %xmm5 401 orpd %xmm6, %xmm5 // xmm5 <-- x_hi + x_lo/2 402 403 mulsd %xmm6, %xmm0 // xmm0 <-- x_hi * x_hi/2 404 mulsd %xmm5, %xmm4 // xmm4 <-- x_lo * (x_hi + x_lo/2) 405 406 addsd RELATIVE_ADDR(one), %xmm0 // xmm0 <-- 1.0 + x_hi^2 / 2 -- this is exact, and strictly greater than 1. 407 addsd %xmm4, %xmm1 // xmm1 <-- x_lo * (x_hi + x_lo/2) + x^4 * p(x^2) -- not exact, but rounding point well below ulp(1) 408 409 addsd %xmm1, %xmm0 410 411#if defined(__i386__) 412 movsd %xmm0, FRAME_SIZE(STACKP) 413 fldl FRAME_SIZE(STACKP) 414#endif 415 ret 416 4172: 418 cmpl $0xfe69d1be, %eax // If |x| <= 2^-27 419 jbe 3f 420 421 // For 2^-14 > |x| > 2^-27 422 movsd RELATIVE_ADDR(one), %xmm0 423 mulsd %xmm1, %xmm1 424 mulsd RELATIVE_ADDR(small_poly), %xmm1 // xmm1 <-- x^2 / 2 425 addsd RELATIVE_ADDR(min_normal), %xmm1 // set inexact, round things that would have been exact the right way 426 addsd %xmm1, %xmm0 // xmm0 <-- 1 + x^2 / 2 427 428 #if defined(__i386__) 429 movsd %xmm0, FRAME_SIZE(STACKP) 430 fldl FRAME_SIZE(STACKP) 431#endif 432 ret 433 4343: 435 andpd %xmm1, %xmm0 // xmm0 <-- |x| 436 437 cmpl $0xc039d1be, %eax // If x is normal 438 jae 4f // goto 4 439 440 // Denormal handling. Zero passes through here just fine. 441 movsd RELATIVE_ADDR(one), %xmm2 442 orpd %xmm2, %xmm0 // xmm0 <-- 1.0 | x 443 subsd %xmm2, %xmm0 // xmm1 <-- 1.0 | x - 1.0 = 0x1p1022 * x 444 4454: 446 // Set inexact and round 1.0 return value 447 movsd RELATIVE_ADDR(one_p55), %xmm1 // xmm1 <-- 1.0p55 448 addsd %xmm1, %xmm0 // xmm0 <-- 1.0p55 + |x| (or |x|*0x1p1022 if |x| is denormal) 449 mulsd RELATIVE_ADDR(one_n55), %xmm0 // xmm0 <-- 1.0 + 1.0n55*|x| 450 451#if defined(__i386__) 452 movsd %xmm0, FRAME_SIZE(STACKP) 453 fldl FRAME_SIZE(STACKP) 454#endif 455 ret 456 4575: 458 cmpl $0x00b009be, %eax // If |x| > 711 or isnan(x) 459 jae 6f 460 461 // Here 21.0 < |x| < 711, and cosh(x) = exp(x) / 2.0 462 463 movsd RELATIVE_ADDR(lge_p7), %xmm3 464 movapd %xmm0, %xmm2 // xmm2 <-- 0x7fffffff....ffff 465 andpd %xmm1, %xmm0 // xmm0 <-- |x| 466 mulsd %xmm0, %xmm3 // xmm3 <-- lg(e) * |x| * 128.0 467 psllq $27, %xmm2 // xmm2 <-- 0xfffffffff8000000 (mask for the high 26 mantissa bits) 468 cvttsd2si %xmm3, %eax // %eax <-- (int)(lg(e) * |x| * 128.0) 469 movl %eax, %edx 470 and $0x7f, AX_P 471 shrl $7, %edx // %edx <-- n = (int)(lg(e) * |x|) 472 shll $4, %eax 473 474 /* Needed values are now: 475 xmm0 - |x| 476 xmm1 - signbit(x) 477 xmm2 - 0xfffffffff8000000 478 xmm3 - lg(e) * |x| * 0x1.0p7 479 %eax - lookup value from top 7 bits of the fractional part of |x| * lg_e 480 %edx - n = (int)(|x| * lg_e) 481 Everything else is fair game. */ 482 483 mulsd RELATIVE_ADDR(one_n7), %xmm3 // xmm3 <-- scaled_hi = lg(e) * |x| 484 andpd %xmm0, %xmm2 // xmm2 <-- x_hi 485 subsd %xmm2, %xmm0 // xmm0 <-- x_lo 486 movsd RELATIVE_ADDR(lge_hi), %xmm4 487 movsd RELATIVE_ADDR(lge_lo), %xmm5 488 movapd %xmm2, %xmm6 // xmm6 <-- x_hi 489 mulsd %xmm4, %xmm2 // xmm2 <-- x_hi * lge_hi 490 mulsd %xmm0, %xmm4 // xmm4 <-- x_lo * lge_hi 491 mulsd %xmm5, %xmm6 // xmm6 <-- x_hi * lge_lo 492 mulsd %xmm5, %xmm0 // xmm0 <-- x_lo * lge_lo 493 subsd %xmm3, %xmm2 // xmm2 <-- x_hi*lge_hi - x*lge 494 addsd %xmm4, %xmm2 // xmm2 <-- (x_hi*lge_hi - x*lge) + x_lo*lge_hi 495 addsd %xmm6, %xmm2 // xmm2 <-- ((x_hi*lge_hi - x*lge) + x_lo*lge_hi) + x_hi*lge_lo 496 addsd %xmm0, %xmm2 // xmm2 <-- (((x_hi*lge_hi - x*lge) + x_lo*lge_hi) + x_hi*lge_lo) + x_lo*lge_lo = scaled_lo 497 498 /* Needed values are now: 499 xmm1 - signbit of x 500 xmm2 - low part of |x| * lg_e 501 xmm3 - high part of |x| * lg_e 502 %eax - lookup value from top 7 bits of the fractional part of |x| * lg_e 503 %edx - n = (int)(|x| * lg_e) 504 Everything else is fair game. */ 505 506 cvtsi2sd %edx, %xmm4 // xmm4 <-- (double)n 507 subsd %xmm4, %xmm3 // xmm3 <-- frac = scaled_hi - (double)n 508 lea RELATIVE_ADDR(exp2table), CX_P 509 movsd (CX_P,AX_P,1), %xmm0 // xmm5 <-- -f_hi 510 movsd 8(CX_P,AX_P,1), %xmm1 // xmm6 <-- exp2(f_hi) 511 addsd %xmm3, %xmm0 // xmm5 <-- frac - f_hi 512 addsd %xmm2, %xmm0 // xmm5 <-- f_lo = (frac - f_hi) + scaled_lo 513 514 /* Needed values are now: 515 xmm0 - f_lo 516 xmm1 - signed exp2(f_hi) 517 %edx - n 518 Everything else is fair game. */ 519 520 // put 2^(n-1) into xmm7 521 addl $(1023 - 2), %edx 522 movd %edx, %xmm7 523 psllq $52, %xmm7 524 525 // polynomial in f_lo 526 movapd %xmm0, %xmm2 527 mulsd %xmm0, %xmm0 // xmm0 <-- x * x 528 movapd %xmm2, %xmm3 529 mulsd -64(CX_P), %xmm2 // xmm2 <-- c3/c4 * x 530 movapd %xmm3, %xmm4 // xmm4 <-- x 531 mulsd -32(CX_P), %xmm3 // xmm3 <-- c1 * x 532 533 movl $0x400, %eax 534 movd %eax, %xmm6 535 psllq $52, %xmm6 // 2.0 536 mulsd %xmm6, %xmm1 // high = exp2(1 + f_hi) 537 538 movapd %xmm0, %xmm5 539 mulsd -80(CX_P), %xmm0 // xmm0 <-- c4 * xx 540 addsd %xmm5, %xmm2 // xmm2 <-- xx + c3x/c4 541 mulsd -48(CX_P), %xmm5 // xmm5 <-- c2 * xx 542 addsd -16(CX_P), %xmm3 // xmm3 <-- c1x + c0 543 mulsd %xmm2, %xmm0 // xmm0 <-- c4xx * (xx + c3x/c4) xmm0 freed 544 addsd %xmm5, %xmm3 // xmm3 <-- c2xx + (c1x + c0) xmm5 freed 545 addsd %xmm3, %xmm0 // xmm0 <-- exp2poly(x) xmm3 freed 546 547 mulsd %xmm1, %xmm4 // xmm4 <-- high * f_lo 548 mulsd %xmm4, %xmm0 // xmm2 <-- high * f_lo * poly(f_lo) 549 addsd %xmm1, %xmm0 // xmm0 <-- high + high * f_lo * poly(f_lo) 550 mulsd %xmm7, %xmm0 // scale result by 2**(n - 2) 551 552#if defined(__i386__) 553 movsd %xmm0, FRAME_SIZE(STACKP) 554 fldl FRAME_SIZE(STACKP) 555#endif 556 ret 557 5586: 559 andpd %xmm1, %xmm0 // xmm0 <-- |x| 560 mulsd RELATIVE_ADDR(huge_val), %xmm0 // quiet NaNs, generate overflow for finite values 561#if defined(__i386__) 562 movsd %xmm0, FRAME_SIZE(STACKP) 563 fldl FRAME_SIZE(STACKP) 564#endif 565 ret