this repo has no description
at fixPythonPipStalling 567 lines 35 kB view raw
1/* 2 * asinhf.s 3 * 4 * by Stephen Canon 5 * 6 * Copyright (c) 2007, Apple Inc. All Rights Reserved. 7 * 8 * This file implements the C99 asinhf function for the MacOS X __i386__ and __x86_64__ ABIs. 9 */ 10 11/* 12 This code divides the floating-point numbers into three ranges, and uses a different approximation 13 to asinhf on each range. 14 15 For the "small" case, |x| < 1/4, a simpler approximation by a 7th-order polynomial suffices: 16 17 asinhf(x) = copysign(x) p(x) 18 19 p(x) has no constant term, and is computed via the factorization: 20 21 p(x) = cx(x + a)(x + b)(x^2 + cx + d)(x^2 + ex + f) 22 23 the two linear terms and the two quadratic terms are computed side-by-side in packed arithmetic. 24 Some care must be taken to avoid setting inexact in the product (x+b)(x^2 + ex + f) for the case 25 when x = 0, as b and f are full-width double-precision numbers. 26 27 28 The "middle" case is 1/4 < |x| < 4. Here a straightforward rational polynomial approximation is used: 29 p(x) 30 asinhf(x) = copysign(x) ---------- 31 q(x) 32 p(x) and q(x) are both order 6 polynomials, so they are computed side-by-side in packed arithmetic. 33 34 35 The "large" case is the most complicated, owing to the fact that asinh(|x|) = log(2|x|) + O(1/x^2) 36 for large x. Because of this, we take a divide and compute a logarithm (via polynomial approximation). 37 The logarithm code is copied from the file logf.s (Ians code), though some registers have been renamed, 38 so take care in making edits. A sketch of the algorithm for the log is provided where it is used in 39 this code, but more detailed comments may be found in logf.s. 40 41 After the log, a minimax polynomial in 1/x is used as a correction. 42 43 The "large" codepath is probably most suceptable to improvement - it is possible that a rational 44 approximation would prove faster (in terms of latency) for this case. The "small" and "middle" 45 code paths are believed to be more nearly optimal. 46 47 Globally, the error of this code is < .51 ulps 48 49 - stc (14 june, 2007) 50 */ 51 52#include <machine/asm.h> 53#include "abi.h" 54 55.const 56 57// Coefficients for 7th order polynomial approximation on [0, 1/4] 58// The polynomail is computed in packed factored form as follows: 59// 60// hi double: (x + ahi) * (x(x + b1hi) + b0hi) 61// lo double: (cx * (x + alo)) * (x(x + b1lo) + b0lo) 62// 63// The high and low parts are then unpacked and multiplied. 64.align 4 65asinhf_low: .quad 0x4008183efcaf7119, 0x4007eba0a6c21cf1 // b0hi, b0lo 66 .quad 0x3ffe9547f4507ace, 0xbffd2c173c2ad586 // ahi, alo 67 .quad 0x40021aa6afb79159, 0xc0015e2ed556dde5 // b1hi, b1lo 68 .quad 0xbfa050433953b0f9 // c 69 70// Coefficients for rational approximation on [1/4, 4] 71// p(x) and q(x) are computed side-by-side in packed arithmetic, then unpacked and divided. 72.align 4 73asinhf_mid: .quad 0x3e4328ccef61bd30, 0x3f80f6f9cf323b3c // p[0], q[0] 74 .quad 0x3f80f6e561f06785, 0x3f85b29a3e277523 // p[1], q[1] 75 .quad 0x3f85b32a2f11b40a, 0x3f8e26416e925090 // p[2], q[2] 76 .quad 0x3f8b5062b37338e9, 0x3f8417abb8190857 // p[3], q[3] 77 .quad 0x3f807d5f61f237fc, 0x3f74ccecc3a9e525 // p[4], q[4] 78 .quad 0x3f6a9bd617bd81f1, 0x3f5135659e34c549 // p[5], q[5] 79 .quad 0x3f26c17c7b263d18, 0x3f001e11059bddca // p[6], q[6] 80 81 82.align 4 83// Polynomial coefficients for the correction to log(2x) for the "large" case. 84// The polynomial is computed in factored form as follows: 85// 86// cx(x + a)(x^2 + b1hi x + b0hi)(x^2 + b1lo x + b0lo) 87// 88// The two quadratic terms are computed with packed arithmetic. 89 .quad 0x4005413ddf1d2ea5, 0x4003fbe28981b6b8 // b0hi, b0lo 90 .quad 0x4006dd1e92567458, 0xc0057cccb680cfdf // b1hi, b1lo 91 .quad 0x3e9564f782290c65, 0x3fa3493848c3ecc6 // a, c 92// Table used to compute log(2x) for the "large" case. Copied from logf.s 93asinhf_hi_table: .quad 0x0000000000000000, 0x3ff0000000000000 // log(1.00000), 1/1.00000 94 .quad 0x3f6ff00aa2b10bc0, 0x3fefe01fe01fe020 // log(1.00391), 1/1.00391 95 .quad 0x3f7fe02a6b106789, 0x3fefc07f01fc07f0 // log(1.00781), 1/1.00781 96 .quad 0x3f87dc475f810a77, 0x3fefa11caa01fa12 // log(1.01172), 1/1.01172 97 .quad 0x3f8fc0a8b0fc03e4, 0x3fef81f81f81f820 // log(1.01562), 1/1.01562 98 .quad 0x3f93cea44346a575, 0x3fef6310aca0dbb5 // log(1.01953), 1/1.01953 99 .quad 0x3f97b91b07d5b11b, 0x3fef44659e4a4271 // log(1.02344), 1/1.02344 100 .quad 0x3f9b9fc027af9198, 0x3fef25f644230ab5 // log(1.02734), 1/1.02734 101 .quad 0x3f9f829b0e783300, 0x3fef07c1f07c1f08 // log(1.03125), 1/1.03125 102 .quad 0x3fa1b0d98923d980, 0x3feee9c7f8458e02 // log(1.03516), 1/1.03516 103 .quad 0x3fa39e87b9febd60, 0x3feecc07b301ecc0 // log(1.03906), 1/1.03906 104 .quad 0x3fa58a5bafc8e4d5, 0x3feeae807aba01eb // log(1.04297), 1/1.04297 105 .quad 0x3fa77458f632dcfc, 0x3fee9131abf0b767 // log(1.04688), 1/1.04688 106 .quad 0x3fa95c830ec8e3eb, 0x3fee741aa59750e4 // log(1.05078), 1/1.05078 107 .quad 0x3fab42dd711971bf, 0x3fee573ac901e574 // log(1.05469), 1/1.05469 108 .quad 0x3fad276b8adb0b52, 0x3fee3a9179dc1a73 // log(1.05859), 1/1.05859 109 .quad 0x3faf0a30c01162a6, 0x3fee1e1e1e1e1e1e // log(1.06250), 1/1.06250 110 .quad 0x3fb075983598e471, 0x3fee01e01e01e01e // log(1.06641), 1/1.06641 111 .quad 0x3fb16536eea37ae1, 0x3fede5d6e3f8868a // log(1.07031), 1/1.07031 112 .quad 0x3fb253f62f0a1417, 0x3fedca01dca01dca // log(1.07422), 1/1.07422 113 .quad 0x3fb341d7961bd1d1, 0x3fedae6076b981db // log(1.07812), 1/1.07812 114 .quad 0x3fb42edcbea646f0, 0x3fed92f2231e7f8a // log(1.08203), 1/1.08203 115 .quad 0x3fb51b073f06183f, 0x3fed77b654b82c34 // log(1.08594), 1/1.08594 116 .quad 0x3fb60658a93750c4, 0x3fed5cac807572b2 // log(1.08984), 1/1.08984 117 .quad 0x3fb6f0d28ae56b4c, 0x3fed41d41d41d41d // log(1.09375), 1/1.09375 118 .quad 0x3fb7da766d7b12cd, 0x3fed272ca3fc5b1a // log(1.09766), 1/1.09766 119 .quad 0x3fb8c345d6319b21, 0x3fed0cb58f6ec074 // log(1.10156), 1/1.10156 120 .quad 0x3fb9ab42462033ad, 0x3fecf26e5c44bfc6 // log(1.10547), 1/1.10547 121 .quad 0x3fba926d3a4ad563, 0x3fecd85689039b0b // log(1.10938), 1/1.10938 122 .quad 0x3fbb78c82bb0eda1, 0x3fecbe6d9601cbe7 // log(1.11328), 1/1.11328 123 .quad 0x3fbc5e548f5bc743, 0x3feca4b3055ee191 // log(1.11719), 1/1.11719 124 .quad 0x3fbd4313d66cb35d, 0x3fec8b265afb8a42 // log(1.12109), 1/1.12109 125 .quad 0x3fbe27076e2af2e6, 0x3fec71c71c71c71c // log(1.12500), 1/1.12500 126 .quad 0x3fbf0a30c01162a6, 0x3fec5894d10d4986 // log(1.12891), 1/1.12891 127 .quad 0x3fbfec9131dbeabb, 0x3fec3f8f01c3f8f0 // log(1.13281), 1/1.13281 128 .quad 0x3fc0671512ca596e, 0x3fec26b5392ea01c // log(1.13672), 1/1.13672 129 .quad 0x3fc0d77e7cd08e59, 0x3fec0e070381c0e0 // log(1.14062), 1/1.14062 130 .quad 0x3fc14785846742ac, 0x3febf583ee868d8b // log(1.14453), 1/1.14453 131 .quad 0x3fc1b72ad52f67a0, 0x3febdd2b899406f7 // log(1.14844), 1/1.14844 132 .quad 0x3fc2266f190a5acb, 0x3febc4fd65883e7b // log(1.15234), 1/1.15234 133 .quad 0x3fc29552f81ff523, 0x3febacf914c1bad0 // log(1.15625), 1/1.15625 134 .quad 0x3fc303d718e47fd3, 0x3feb951e2b18ff23 // log(1.16016), 1/1.16016 135 .quad 0x3fc371fc201e8f74, 0x3feb7d6c3dda338b // log(1.16406), 1/1.16406 136 .quad 0x3fc3dfc2b0ecc62a, 0x3feb65e2e3beee05 // log(1.16797), 1/1.16797 137 .quad 0x3fc44d2b6ccb7d1e, 0x3feb4e81b4e81b4f // log(1.17188), 1/1.17188 138 .quad 0x3fc4ba36f39a55e5, 0x3feb37484ad806ce // log(1.17578), 1/1.17578 139 .quad 0x3fc526e5e3a1b438, 0x3feb2036406c80d9 // log(1.17969), 1/1.17969 140 .quad 0x3fc59338d9982086, 0x3feb094b31d922a4 // log(1.18359), 1/1.18359 141 .quad 0x3fc5ff3070a793d4, 0x3feaf286bca1af28 // log(1.18750), 1/1.18750 142 .quad 0x3fc66acd4272ad51, 0x3feadbe87f94905e // log(1.19141), 1/1.19141 143 .quad 0x3fc6d60fe719d21d, 0x3feac5701ac5701b // log(1.19531), 1/1.19531 144 .quad 0x3fc740f8f54037a5, 0x3feaaf1d2f87ebfd // log(1.19922), 1/1.19922 145 .quad 0x3fc7ab890210d909, 0x3fea98ef606a63be // log(1.20312), 1/1.20312 146 .quad 0x3fc815c0a14357eb, 0x3fea82e65130e159 // log(1.20703), 1/1.20703 147 .quad 0x3fc87fa06520c911, 0x3fea6d01a6d01a6d // log(1.21094), 1/1.21094 148 .quad 0x3fc8e928de886d41, 0x3fea574107688a4a // log(1.21484), 1/1.21484 149 .quad 0x3fc9525a9cf456b4, 0x3fea41a41a41a41a // log(1.21875), 1/1.21875 150 .quad 0x3fc9bb362e7dfb83, 0x3fea2c2a87c51ca0 // log(1.22266), 1/1.22266 151 .quad 0x3fca23bc1fe2b563, 0x3fea16d3f97a4b02 // log(1.22656), 1/1.22656 152 .quad 0x3fca8becfc882f19, 0x3fea01a01a01a01a // log(1.23047), 1/1.23047 153 .quad 0x3fcaf3c94e80bff3, 0x3fe9ec8e951033d9 // log(1.23438), 1/1.23438 154 .quad 0x3fcb5b519e8fb5a4, 0x3fe9d79f176b682d // log(1.23828), 1/1.23828 155 .quad 0x3fcbc286742d8cd6, 0x3fe9c2d14ee4a102 // log(1.24219), 1/1.24219 156 .quad 0x3fcc2968558c18c1, 0x3fe9ae24ea5510da // log(1.24609), 1/1.24609 157 .quad 0x3fcc8ff7c79a9a22, 0x3fe999999999999a // log(1.25000), 1/1.25000 158 .quad 0x3fccf6354e09c5dc, 0x3fe9852f0d8ec0ff // log(1.25391), 1/1.25391 159 .quad 0x3fcd5c216b4fbb91, 0x3fe970e4f80cb872 // log(1.25781), 1/1.25781 160 .quad 0x3fcdc1bca0abec7d, 0x3fe95cbb0be377ae // log(1.26172), 1/1.26172 161 .quad 0x3fce27076e2af2e6, 0x3fe948b0fcd6e9e0 // log(1.26562), 1/1.26562 162 .quad 0x3fce8c0252aa5a60, 0x3fe934c67f9b2ce6 // log(1.26953), 1/1.26953 163 .quad 0x3fcef0adcbdc5936, 0x3fe920fb49d0e229 // log(1.27344), 1/1.27344 164 .quad 0x3fcf550a564b7b37, 0x3fe90d4f120190d5 // log(1.27734), 1/1.27734 165 .quad 0x3fcfb9186d5e3e2b, 0x3fe8f9c18f9c18fa // log(1.28125), 1/1.28125 166 .quad 0x3fd00e6c45ad501d, 0x3fe8e6527af1373f // log(1.28516), 1/1.28516 167 .quad 0x3fd0402594b4d041, 0x3fe8d3018d3018d3 // log(1.28906), 1/1.28906 168 .quad 0x3fd071b85fcd590d, 0x3fe8bfce8062ff3a // log(1.29297), 1/1.29297 169 .quad 0x3fd0a324e27390e3, 0x3fe8acb90f6bf3aa // log(1.29688), 1/1.29688 170 .quad 0x3fd0d46b579ab74b, 0x3fe899c0f601899c // log(1.30078), 1/1.30078 171 .quad 0x3fd1058bf9ae4ad5, 0x3fe886e5f0abb04a // log(1.30469), 1/1.30469 172 .quad 0x3fd136870293a8b0, 0x3fe87427bcc092b9 // log(1.30859), 1/1.30859 173 .quad 0x3fd1675cababa60e, 0x3fe8618618618618 // log(1.31250), 1/1.31250 174 .quad 0x3fd1980d2dd4236f, 0x3fe84f00c2780614 // log(1.31641), 1/1.31641 175 .quad 0x3fd1c898c16999fb, 0x3fe83c977ab2bedd // log(1.32031), 1/1.32031 176 .quad 0x3fd1f8ff9e48a2f3, 0x3fe82a4a0182a4a0 // log(1.32422), 1/1.32422 177 .quad 0x3fd22941fbcf7966, 0x3fe8181818181818 // log(1.32812), 1/1.32812 178 .quad 0x3fd2596010df763a, 0x3fe8060180601806 // log(1.33203), 1/1.33203 179 .quad 0x3fd2895a13de86a3, 0x3fe7f405fd017f40 // log(1.33594), 1/1.33594 180 .quad 0x3fd2b9303ab89d25, 0x3fe7e225515a4f1d // log(1.33984), 1/1.33984 181 .quad 0x3fd2e8e2bae11d31, 0x3fe7d05f417d05f4 // log(1.34375), 1/1.34375 182 .quad 0x3fd31871c9544185, 0x3fe7beb3922e017c // log(1.34766), 1/1.34766 183 .quad 0x3fd347dd9a987d55, 0x3fe7ad2208e0ecc3 // log(1.35156), 1/1.35156 184 .quad 0x3fd3772662bfd85b, 0x3fe79baa6bb6398b // log(1.35547), 1/1.35547 185 .quad 0x3fd3a64c556945ea, 0x3fe78a4c8178a4c8 // log(1.35938), 1/1.35938 186 .quad 0x3fd3d54fa5c1f710, 0x3fe77908119ac60d // log(1.36328), 1/1.36328 187 .quad 0x3fd404308686a7e4, 0x3fe767dce434a9b1 // log(1.36719), 1/1.36719 188 .quad 0x3fd432ef2a04e814, 0x3fe756cac201756d // log(1.37109), 1/1.37109 189 .quad 0x3fd4618bc21c5ec2, 0x3fe745d1745d1746 // log(1.37500), 1/1.37500 190 .quad 0x3fd49006804009d1, 0x3fe734f0c541fe8d // log(1.37891), 1/1.37891 191 .quad 0x3fd4be5f957778a1, 0x3fe724287f46debc // log(1.38281), 1/1.38281 192 .quad 0x3fd4ec973260026a, 0x3fe713786d9c7c09 // log(1.38672), 1/1.38672 193 .quad 0x3fd51aad872df82d, 0x3fe702e05c0b8170 // log(1.39062), 1/1.39062 194 .quad 0x3fd548a2c3add263, 0x3fe6f26016f26017 // log(1.39453), 1/1.39453 195 .quad 0x3fd5767717455a6c, 0x3fe6e1f76b4337c7 // log(1.39844), 1/1.39844 196 .quad 0x3fd5a42ab0f4cfe2, 0x3fe6d1a62681c861 // log(1.40234), 1/1.40234 197 .quad 0x3fd5d1bdbf5809ca, 0x3fe6c16c16c16c17 // log(1.40625), 1/1.40625 198 .quad 0x3fd5ff3070a793d4, 0x3fe6b1490aa31a3d // log(1.41016), 1/1.41016 199 .quad 0x3fd62c82f2b9c795, 0x3fe6a13cd1537290 // log(1.41406), 1/1.41406 200 .quad 0x3fd659b57303e1f3, 0x3fe691473a88d0c0 // log(1.41797), 1/1.41797 201 .quad 0x3fd686c81e9b14af, 0x3fe6816816816817 // log(1.42188), 1/1.42188 202 .quad 0x3fd6b3bb2235943e, 0x3fe6719f3601671a // log(1.42578), 1/1.42578 203 .quad 0x3fd6e08eaa2ba1e4, 0x3fe661ec6a5122f9 // log(1.42969), 1/1.42969 204 .quad 0x3fd70d42e2789236, 0x3fe6524f853b4aa3 // log(1.43359), 1/1.43359 205 .quad 0x3fd739d7f6bbd007, 0x3fe642c8590b2164 // log(1.43750), 1/1.43750 206 .quad 0x3fd7664e1239dbcf, 0x3fe63356b88ac0de // log(1.44141), 1/1.44141 207 .quad 0x3fd792a55fdd47a2, 0x3fe623fa77016240 // log(1.44531), 1/1.44531 208 .quad 0x3fd7bede0a37afc0, 0x3fe614b36831ae94 // log(1.44922), 1/1.44922 209 .quad 0x3fd7eaf83b82afc3, 0x3fe6058160581606 // log(1.45312), 1/1.45312 210 .quad 0x3fd816f41da0d496, 0x3fe5f66434292dfc // log(1.45703), 1/1.45703 211 .quad 0x3fd842d1da1e8b17, 0x3fe5e75bb8d015e7 // log(1.46094), 1/1.46094 212 .quad 0x3fd86e919a330ba0, 0x3fe5d867c3ece2a5 // log(1.46484), 1/1.46484 213 .quad 0x3fd89a3386c1425b, 0x3fe5c9882b931057 // log(1.46875), 1/1.46875 214 .quad 0x3fd8c5b7c858b48b, 0x3fe5babcc647fa91 // log(1.47266), 1/1.47266 215 .quad 0x3fd8f11e873662c7, 0x3fe5ac056b015ac0 // log(1.47656), 1/1.47656 216 .quad 0x3fd91c67eb45a83e, 0x3fe59d61f123ccaa // log(1.48047), 1/1.48047 217 .quad 0x3fd947941c2116fb, 0x3fe58ed2308158ed // log(1.48438), 1/1.48438 218 .quad 0x3fd972a341135158, 0x3fe5805601580560 // log(1.48828), 1/1.48828 219 .quad 0x3fd99d958117e08b, 0x3fe571ed3c506b3a // log(1.49219), 1/1.49219 220 .quad 0x3fd9c86b02dc0863, 0x3fe56397ba7c52e2 // log(1.49609), 1/1.49609 221 .quad 0x3fd9f323ecbf984c, 0x3fe5555555555555 // log(1.50000), 1/1.50000 222 .quad 0x3fda1dc064d5b995, 0x3fe54725e6bb82fe // log(1.50391), 1/1.50391 223 .quad 0x3fda484090e5bb0a, 0x3fe5390948f40feb // log(1.50781), 1/1.50781 224 .quad 0x3fda72a4966bd9ea, 0x3fe52aff56a8054b // log(1.51172), 1/1.51172 225 .quad 0x3fda9cec9a9a084a, 0x3fe51d07eae2f815 // log(1.51562), 1/1.51562 226 .quad 0x3fdac718c258b0e4, 0x3fe50f22e111c4c5 // log(1.51953), 1/1.51953 227 .quad 0x3fdaf1293247786b, 0x3fe5015015015015 // log(1.52344), 1/1.52344 228 .quad 0x3fdb1b1e0ebdfc5b, 0x3fe4f38f62dd4c9b // log(1.52734), 1/1.52734 229 .quad 0x3fdb44f77bcc8f63, 0x3fe4e5e0a72f0539 // log(1.53125), 1/1.53125 230 .quad 0x3fdb6eb59d3cf35e, 0x3fe4d843bedc2c4c // log(1.53516), 1/1.53516 231 .quad 0x3fdb9858969310fb, 0x3fe4cab88725af6e // log(1.53906), 1/1.53906 232 .quad 0x3fdbc1e08b0dad0a, 0x3fe4bd3edda68fe1 // log(1.54297), 1/1.54297 233 .quad 0x3fdbeb4d9da71b7c, 0x3fe4afd6a052bf5b // log(1.54688), 1/1.54688 234 .quad 0x3fdc149ff115f027, 0x3fe4a27fad76014a // log(1.55078), 1/1.55078 235 .quad 0x3fdc3dd7a7cdad4d, 0x3fe49539e3b2d067 // log(1.55469), 1/1.55469 236 .quad 0x3fdc66f4e3ff6ff8, 0x3fe4880522014880 // log(1.55859), 1/1.55859 237 .quad 0x3fdc8ff7c79a9a22, 0x3fe47ae147ae147b // log(1.56250), 1/1.56250 238 .quad 0x3fdcb8e0744d7aca, 0x3fe46dce34596066 // log(1.56641), 1/1.56641 239 .quad 0x3fdce1af0b85f3eb, 0x3fe460cbc7f5cf9a // log(1.57031), 1/1.57031 240 .quad 0x3fdd0a63ae721e64, 0x3fe453d9e2c776ca // log(1.57422), 1/1.57422 241 .quad 0x3fdd32fe7e00ebd5, 0x3fe446f86562d9fb // log(1.57812), 1/1.57812 242 .quad 0x3fdd5b7f9ae2c684, 0x3fe43a2730abee4d // log(1.58203), 1/1.58203 243 .quad 0x3fdd83e7258a2f3e, 0x3fe42d6625d51f87 // log(1.58594), 1/1.58594 244 .quad 0x3fddac353e2c5954, 0x3fe420b5265e5951 // log(1.58984), 1/1.58984 245 .quad 0x3fddd46a04c1c4a1, 0x3fe4141414141414 // log(1.59375), 1/1.59375 246 .quad 0x3fddfc859906d5b5, 0x3fe40782d10e6566 // log(1.59766), 1/1.59766 247 .quad 0x3fde24881a7c6c26, 0x3fe3fb013fb013fb // log(1.60156), 1/1.60156 248 .quad 0x3fde4c71a8687704, 0x3fe3ee8f42a5af07 // log(1.60547), 1/1.60547 249 .quad 0x3fde744261d68788, 0x3fe3e22cbce4a902 // log(1.60938), 1/1.60938 250 .quad 0x3fde9bfa659861f5, 0x3fe3d5d991aa75c6 // log(1.61328), 1/1.61328 251 .quad 0x3fdec399d2468cc0, 0x3fe3c995a47babe7 // log(1.61719), 1/1.61719 252 .quad 0x3fdeeb20c640ddf4, 0x3fe3bd60d9232955 // log(1.62109), 1/1.62109 253 .quad 0x3fdf128f5faf06ed, 0x3fe3b13b13b13b14 // log(1.62500), 1/1.62500 254 .quad 0x3fdf39e5bc811e5c, 0x3fe3a524387ac822 // log(1.62891), 1/1.62891 255 .quad 0x3fdf6123fa7028ac, 0x3fe3991c2c187f63 // log(1.63281), 1/1.63281 256 .quad 0x3fdf884a36fe9ec2, 0x3fe38d22d366088e // log(1.63672), 1/1.63672 257 .quad 0x3fdfaf588f78f31f, 0x3fe3813813813814 // log(1.64062), 1/1.64062 258 .quad 0x3fdfd64f20f61572, 0x3fe3755bd1c945ee // log(1.64453), 1/1.64453 259 .quad 0x3fdffd2e0857f498, 0x3fe3698df3de0748 // log(1.64844), 1/1.64844 260 .quad 0x3fe011fab125ff8a, 0x3fe35dce5f9f2af8 // log(1.65234), 1/1.65234 261 .quad 0x3fe02552a5a5d0ff, 0x3fe3521cfb2b78c1 // log(1.65625), 1/1.65625 262 .quad 0x3fe0389eefce633b, 0x3fe34679ace01346 // log(1.66016), 1/1.66016 263 .quad 0x3fe04bdf9da926d2, 0x3fe33ae45b57bcb2 // log(1.66406), 1/1.66406 264 .quad 0x3fe05f14bd26459c, 0x3fe32f5ced6a1dfa // log(1.66797), 1/1.66797 265 .quad 0x3fe0723e5c1cdf40, 0x3fe323e34a2b10bf // log(1.67188), 1/1.67188 266 .quad 0x3fe0855c884b450e, 0x3fe3187758e9ebb6 // log(1.67578), 1/1.67578 267 .quad 0x3fe0986f4f573521, 0x3fe30d190130d190 // log(1.67969), 1/1.67969 268 .quad 0x3fe0ab76bece14d2, 0x3fe301c82ac40260 // log(1.68359), 1/1.68359 269 .quad 0x3fe0be72e4252a83, 0x3fe2f684bda12f68 // log(1.68750), 1/1.68750 270 .quad 0x3fe0d163ccb9d6b8, 0x3fe2eb4ea1fed14b // log(1.69141), 1/1.69141 271 .quad 0x3fe0e44985d1cc8c, 0x3fe2e025c04b8097 // log(1.69531), 1/1.69531 272 .quad 0x3fe0f7241c9b497d, 0x3fe2d50a012d50a0 // log(1.69922), 1/1.69922 273 .quad 0x3fe109f39e2d4c97, 0x3fe2c9fb4d812ca0 // log(1.70312), 1/1.70312 274 .quad 0x3fe11cb81787ccf8, 0x3fe2bef98e5a3711 // log(1.70703), 1/1.70703 275 .quad 0x3fe12f719593efbc, 0x3fe2b404ad012b40 // log(1.71094), 1/1.71094 276 .quad 0x3fe1422025243d45, 0x3fe2a91c92f3c105 // log(1.71484), 1/1.71484 277 .quad 0x3fe154c3d2f4d5ea, 0x3fe29e4129e4129e // log(1.71875), 1/1.71875 278 .quad 0x3fe1675cababa60e, 0x3fe293725bb804a5 // log(1.72266), 1/1.72266 279 .quad 0x3fe179eabbd899a1, 0x3fe288b01288b013 // log(1.72656), 1/1.72656 280 .quad 0x3fe18c6e0ff5cf06, 0x3fe27dfa38a1ce4d // log(1.73047), 1/1.73047 281 .quad 0x3fe19ee6b467c96f, 0x3fe27350b8812735 // log(1.73438), 1/1.73438 282 .quad 0x3fe1b154b57da29f, 0x3fe268b37cd60127 // log(1.73828), 1/1.73828 283 .quad 0x3fe1c3b81f713c25, 0x3fe25e22708092f1 // log(1.74219), 1/1.74219 284 .quad 0x3fe1d610fe677003, 0x3fe2539d7e9177b2 // log(1.74609), 1/1.74609 285 .quad 0x3fe1e85f5e7040d0, 0x3fe2492492492492 // log(1.75000), 1/1.75000 286 .quad 0x3fe1faa34b87094c, 0x3fe23eb79717605b // log(1.75391), 1/1.75391 287 .quad 0x3fe20cdcd192ab6e, 0x3fe23456789abcdf // log(1.75781), 1/1.75781 288 .quad 0x3fe21f0bfc65beec, 0x3fe22a0122a0122a // log(1.76172), 1/1.76172 289 .quad 0x3fe23130d7bebf43, 0x3fe21fb78121fb78 // log(1.76562), 1/1.76562 290 .quad 0x3fe2434b6f483934, 0x3fe21579804855e6 // log(1.76953), 1/1.76953 291 .quad 0x3fe2555bce98f7cb, 0x3fe20b470c67c0d9 // log(1.77344), 1/1.77344 292 .quad 0x3fe26762013430e0, 0x3fe2012012012012 // log(1.77734), 1/1.77734 293 .quad 0x3fe2795e1289b11b, 0x3fe1f7047dc11f70 // log(1.78125), 1/1.78125 294 .quad 0x3fe28b500df60783, 0x3fe1ecf43c7fb84c // log(1.78516), 1/1.78516 295 .quad 0x3fe29d37fec2b08b, 0x3fe1e2ef3b3fb874 // log(1.78906), 1/1.78906 296 .quad 0x3fe2af15f02640ad, 0x3fe1d8f5672e4abd // log(1.79297), 1/1.79297 297 .quad 0x3fe2c0e9ed448e8c, 0x3fe1cf06ada2811d // log(1.79688), 1/1.79688 298 .quad 0x3fe2d2b4012edc9e, 0x3fe1c522fc1ce059 // log(1.80078), 1/1.80078 299 .quad 0x3fe2e47436e40268, 0x3fe1bb4a4046ed29 // log(1.80469), 1/1.80469 300 .quad 0x3fe2f62a99509546, 0x3fe1b17c67f2bae3 // log(1.80859), 1/1.80859 301 .quad 0x3fe307d7334f10be, 0x3fe1a7b9611a7b96 // log(1.81250), 1/1.81250 302 .quad 0x3fe3197a0fa7fe6a, 0x3fe19e0119e0119e // log(1.81641), 1/1.81641 303 .quad 0x3fe32b1339121d71, 0x3fe19453808ca29c // log(1.82031), 1/1.82031 304 .quad 0x3fe33ca2ba328995, 0x3fe18ab083902bdb // log(1.82422), 1/1.82422 305 .quad 0x3fe34e289d9ce1d3, 0x3fe1811811811812 // log(1.82812), 1/1.82812 306 .quad 0x3fe35fa4edd36ea0, 0x3fe1778a191bd684 // log(1.83203), 1/1.83203 307 .quad 0x3fe37117b54747b6, 0x3fe16e0689427379 // log(1.83594), 1/1.83594 308 .quad 0x3fe38280fe58797f, 0x3fe1648d50fc3201 // log(1.83984), 1/1.83984 309 .quad 0x3fe393e0d3562a1a, 0x3fe15b1e5f75270d // log(1.84375), 1/1.84375 310 .quad 0x3fe3a5373e7ebdfa, 0x3fe151b9a3fdd5c9 // log(1.84766), 1/1.84766 311 .quad 0x3fe3b68449fffc23, 0x3fe1485f0e0acd3b // log(1.85156), 1/1.85156 312 .quad 0x3fe3c7c7fff73206, 0x3fe13f0e8d344724 // log(1.85547), 1/1.85547 313 .quad 0x3fe3d9026a7156fb, 0x3fe135c81135c811 // log(1.85938), 1/1.85938 314 .quad 0x3fe3ea33936b2f5c, 0x3fe12c8b89edc0ac // log(1.86328), 1/1.86328 315 .quad 0x3fe3fb5b84d16f42, 0x3fe12358e75d3033 // log(1.86719), 1/1.86719 316 .quad 0x3fe40c7a4880dce9, 0x3fe11a3019a74826 // log(1.87109), 1/1.87109 317 .quad 0x3fe41d8fe84672ae, 0x3fe1111111111111 // log(1.87500), 1/1.87500 318 .quad 0x3fe42e9c6ddf80bf, 0x3fe107fbbe011080 // log(1.87891), 1/1.87891 319 .quad 0x3fe43f9fe2f9ce67, 0x3fe0fef010fef011 // log(1.88281), 1/1.88281 320 .quad 0x3fe4509a5133bb0a, 0x3fe0f5edfab325a2 // log(1.88672), 1/1.88672 321 .quad 0x3fe4618bc21c5ec2, 0x3fe0ecf56be69c90 // log(1.89062), 1/1.89062 322 .quad 0x3fe472743f33aaad, 0x3fe0e40655826011 // log(1.89453), 1/1.89453 323 .quad 0x3fe48353d1ea88df, 0x3fe0db20a88f4696 // log(1.89844), 1/1.89844 324 .quad 0x3fe4942a83a2fc07, 0x3fe0d24456359e3a // log(1.90234), 1/1.90234 325 .quad 0x3fe4a4f85db03ebb, 0x3fe0c9714fbcda3b // log(1.90625), 1/1.90625 326 .quad 0x3fe4b5bd6956e274, 0x3fe0c0a7868b4171 // log(1.91016), 1/1.91016 327 .quad 0x3fe4c679afccee3a, 0x3fe0b7e6ec259dc8 // log(1.91406), 1/1.91406 328 .quad 0x3fe4d72d3a39fd00, 0x3fe0af2f722eecb5 // log(1.91797), 1/1.91797 329 .quad 0x3fe4e7d811b75bb1, 0x3fe0a6810a6810a7 // log(1.92188), 1/1.92188 330 .quad 0x3fe4f87a3f5026e9, 0x3fe09ddba6af8360 // log(1.92578), 1/1.92578 331 .quad 0x3fe50913cc01686b, 0x3fe0953f39010954 // log(1.92969), 1/1.92969 332 .quad 0x3fe519a4c0ba3446, 0x3fe08cabb37565e2 // log(1.93359), 1/1.93359 333 .quad 0x3fe52a2d265bc5ab, 0x3fe0842108421084 // log(1.93750), 1/1.93750 334 .quad 0x3fe53aad05b99b7d, 0x3fe07b9f29b8eae2 // log(1.94141), 1/1.94141 335 .quad 0x3fe54b2467999498, 0x3fe073260a47f7c6 // log(1.94531), 1/1.94531 336 .quad 0x3fe55b9354b40bcd, 0x3fe06ab59c7912fb // log(1.94922), 1/1.94922 337 .quad 0x3fe56bf9d5b3f399, 0x3fe0624dd2f1a9fc // log(1.95312), 1/1.95312 338 .quad 0x3fe57c57f336f191, 0x3fe059eea0727586 // log(1.95703), 1/1.95703 339 .quad 0x3fe58cadb5cd7989, 0x3fe05197f7d73404 // log(1.96094), 1/1.96094 340 .quad 0x3fe59cfb25fae87e, 0x3fe04949cc1664c5 // log(1.96484), 1/1.96484 341 .quad 0x3fe5ad404c359f2d, 0x3fe0410410410410 // log(1.96875), 1/1.96875 342 .quad 0x3fe5bd7d30e71c73, 0x3fe038c6b78247fc // log(1.97266), 1/1.97266 343 .quad 0x3fe5cdb1dc6c1765, 0x3fe03091b51f5e1a // log(1.97656), 1/1.97656 344 .quad 0x3fe5ddde57149923, 0x3fe02864fc7729e9 // log(1.98047), 1/1.98047 345 .quad 0x3fe5ee02a9241675, 0x3fe0204081020408 // log(1.98438), 1/1.98438 346 .quad 0x3fe5fe1edad18919, 0x3fe0182436517a37 // log(1.98828), 1/1.98828 347 .quad 0x3fe60e32f44788d9, 0x3fe0101010101010 // log(1.99219), 1/1.99219 348 .quad 0x3fe61e3efda46467, 0x3fe0080402010080 // log(1.99609), 1/1.99609 349 350.literal8 351.align 3 352one: .quad 0x3ff0000000000000 353onehalf: .quad 0x3fe0000000000000 354onethird: .quad 0x3fd5555555555555 355log2: .quad 0x3fe62e42fefa39ef // ln(2) 356 357.text 358#if defined( __x86_64__ ) 359 #define RELATIVE_ADDR( _a ) (_a)( %rip ) 360#else 361 #define RELATIVE_ADDR( _a ) (_a)-asinhf_body( %ecx ) 362 363.align 4 364asinhf_pic: 365 movl (%esp), %ecx // Copy address of this instruction to %ecx 366 ret 367#endif 368 369ENTRY(asinhf) 370#if defined(__i386__) 371 movl FRAME_SIZE(STACKP), %eax 372 movss FRAME_SIZE(STACKP), %xmm0 373 calll asinhf_pic 374asinhf_body: 375#else 376 movd %xmm0, %eax 377#endif 378 andnpd %xmm5, %xmm5 // zero out xmm5 379 380 andl $0x7fffffff, %eax // eax <-- |x| 381 382 movd %eax, %xmm1 // xmm1 <-- |x| 383 cvtss2sd %xmm1, %xmm2 // xmm2 <-- (double)|x| 384 andnps %xmm0, %xmm1 // xmm1 <-- signbit(x) 385 386 subl $0x3e800000, %eax // eax <-- |x| - 1/4 as integers 387 cmpl $0x02000000, %eax // if ( |x| > 4 or |x| < 1/4 or isnan(x) ) 388 ja 2f // goto 2 389 390 // 1/4 < |x| < 4 391 lea RELATIVE_ADDR(asinhf_mid), DX_P 392#if defined( __SSE3__ ) 393 movddup %xmm2, %xmm0 // xmm0 <-- [x, x] 394#else 395 movapd %xmm2, %xmm0 396 unpcklpd %xmm0, %xmm0 397#endif 398 399 mulsd %xmm2, %xmm2 // xmm2 <-- x*x 400 movapd %xmm0, %xmm3 // xmm3 <-- [x, x] 401 movapd %xmm0, %xmm4 // xmm4 <-- [x, x] 402 mulpd 48(DX_P), %xmm3 // xmm3 <-- [p3x, q3x] 403 mulpd 80(DX_P), %xmm4 // xmm4 <-- [p5x, q5x] 404 mulpd 16(DX_P), %xmm0 // xmm0 <-- [p1x, q1x] 405 movapd 96(DX_P), %xmm6 // xmm6 <-- [p6, q6] 406 407#if defined( __SSE3__ ) 408 movddup %xmm2, %xmm5 // xmm5 <-- [x2, x2] 409#else 410 movapd %xmm2, %xmm5 411 unpcklpd %xmm5, %xmm5 412#endif 413 414 mulsd %xmm2, %xmm2 // xmm2 <-- x4 415 mulpd %xmm5, %xmm6 // xmm6 <-- [p6x2, q6x2] 416 addpd 32(DX_P), %xmm3 // xmm3 <-- [p3x + p2, q3x + q2] 417 addpd 64(DX_P), %xmm4 // xmm4 <-- [p5x + p4, q5x + q4] 418 addpd (DX_P), %xmm0 // xmm0 <-- [p1x + p0, q1x + q0] 419 420 mulpd %xmm3, %xmm5 // xmm5 <-- [p3x3 + p2x2, q3x3 + q2x2] 421 addpd %xmm4, %xmm6 // xmm6 <-- [p6x2 + p5x + p4, q6x2 + q5x + q4] 422 unpcklpd %xmm2, %xmm2 // xmm2 <-- [x4, x4] 423 mulpd %xmm2, %xmm6 // xmm6 <-- [p6x6 + p5x5 + p4x4, q6x6 + q5x5 + q4x4] 424 addpd %xmm0, %xmm5 // xmm5 <-- [p3x3 + p2x2 + p1x + p0, q3x3 + q2x2 + q1x + q0] 425 addpd %xmm6, %xmm5 // xmm5 <-- [p(x), q(x)] 426 427 movhlps %xmm5, %xmm6 // xmm6 <-- q(x) 428 divsd %xmm6, %xmm5 // xmm5 <-- p(x)/q(x) 429 430 cvtsd2ss %xmm5, %xmm0 431 orps %xmm1, %xmm0 432#if defined(__i386__) 433 movss %xmm0, FRAME_SIZE( STACKP ) 434 flds FRAME_SIZE( STACKP ) 435#endif 436 ret 437 4382: 439 jge 3f // if ( |x| > 4 or isnan(x) ) goto 3 440 441// Polynomial approximation to asinhf(x) for |x| < 1/4. Details of the computation are at top with the constants. 442 443 lea RELATIVE_ADDR(asinhf_low), DX_P 444 445#if defined( __SSE3__ ) 446 movddup %xmm2, %xmm0 // xmm0 <-- [x, x] 447#else 448 movapd %xmm2, %xmm0 449 unpcklpd %xmm0, %xmm0 450#endif 451 452 cmppd $0, %xmm0, %xmm5 // detect |x| = 0 453 454 mulsd 48(DX_P), %xmm2 // xmm2 <-- [..., cx] 455 movapd %xmm0, %xmm3 // xmm3 <-- [x, x] 456 addpd 32(DX_P), %xmm0 // xmm0 <-- [x, x] + [b1hi, b1lo] 457 movapd %xmm3, %xmm4 // xmm4 <-- [x, x] 458 addpd 16(DX_P), %xmm3 // xmm3 <-- [x, x] + [ahi, alo] 459 mulpd %xmm4, %xmm0 // xmm0 <-- [x, x] * [x + b1hi, x + b1lo] 460 mulsd %xmm2, %xmm3 // xmm3 <-- [x + ahi, cx * (x + alo)] 461 addpd (DX_P), %xmm0 // xmm0 <-- [x^2 + b1hi x, x^2 + b1lo x] + [b0hi, b0lo] 462 463 andnpd %xmm3, %xmm5 // if |x| = 0, set xmm3 = 0 to suppress inexact from the next multiply. 464 465 mulpd %xmm5, %xmm0 // xmm0 <-- [(x + ahi)(x^2 + b1hi x + b0hi), cx(x + alo)(x^2 + b1lo x + b0lo)] 466 movhlps %xmm0, %xmm2 // xmm2 <-- [..., (x + ahi)(x^2 + b1hi x + b0hi)] 467 mulsd %xmm2, %xmm0 // xmm0 <-- cx(x + alo)(x + ahi)(x^2 + b1lo x + b0lo)(x^2 + b1hi x + b0hi) 468 469 cvtsd2ss %xmm0, %xmm0 470 orps %xmm1, %xmm0 471#if defined(__i386__) 472 movss %xmm0, FRAME_SIZE( STACKP ) 473 flds FRAME_SIZE( STACKP ) 474#endif 475 ret 476 4773: 478 addl $0x3e800000, %eax // %eax <-- |x| 479 cmpl $0x7f800000, %eax 480 ja 4f // if isnan(x) goto 4 481 je 5f // if (|x| = ) goto 5 482 483// Go off and compute the reciprocal of |x|. This will take a little while, but we have other stuff to do 484// while this is up in the air. 485 486 movsd RELATIVE_ADDR( one ), %xmm0 487 divsd %xmm2, %xmm0 // xmm0 <-- 1/|x| 488 489// We need to take a logarithm for this case. This code is shamelessly taken from our log2f.s implementation. 490// Consult that file for more details of operation. 491 492// begin by factoring x as 2^n*(mantissa) 493 494 movl %eax, %edx 495 shrl $23, %edx // right-align exponent bits 496 subl $126, %edx // subtract exponent bias, but add one to get log(2x) instead of log(x) 497 // i.e. we subtract 126 instead of 127 (the actual bias). 498 cvtsi2sd %edx, %xmm2 // xmm2 <-- trunc(lg |x|) = n 499 mulsd RELATIVE_ADDR(log2), %xmm2 // xmm2 <-- log(2^n) 500 501// now, let mantissa = 1 + a + b = (1 + a)(1 + r), if r = b/(1+a), and 502// 503// log(mantissa) = log(1+a) + log(1+r). 504// 505// we look up log(1+a) and 1/(1+a) in a table, keyed from a, and we compute log(1+r) via taylor series. 506 507 movl %eax, %edx 508 andl $0x00007fff, %eax // eax <-- b 509 andl $0x007f8000, %edx // edx <-- a << 15 510 orl $0x3f800000, %eax // eax <-- 1 + b 511 shrl $(15-4), %edx // edx <-- a << 4, lookup key for log(1+a) 512 movd %eax, %xmm3 // xmm3 <-- 1 + b 513 cvtss2sd %xmm3, %xmm3 514 subsd RELATIVE_ADDR( one ), %xmm3 // xmm3 <-- b 515 lea RELATIVE_ADDR(asinhf_hi_table), AX_P 516 mulsd 8(AX_P, DX_P, 1), %xmm3 // xmm3 <-- r = b * 1/(1+a) 517 movapd %xmm3, %xmm4 518 mulsd RELATIVE_ADDR(onethird), %xmm3 // xmm3 <-- 1/3 r 519 movapd %xmm4, %xmm5 // xmm5 <-- r 520 mulsd %xmm4, %xmm4 // xmm4 <-- r^2 521 subsd RELATIVE_ADDR(onehalf), %xmm3 // xmm3 <-- 1/3 r - 1/2 522 mulsd %xmm4, %xmm3 // xmm3 <-- 1/3 r^3 - 1/2 r^2 523 addsd %xmm5, %xmm3 // xmm3 <-- 1/3 r^3 - 1/2 r^2 + r ~ log(1+r) 524 addsd (AX_P, DX_P, 1), %xmm3 // xmm3 <-- log(a) + log(1+r) 525 addsd %xmm2, %xmm3 // xmm3 <-- log(2x) with error < .5002 ulps 526 527 528// Now we compute a correction from a polynomial in 1/|x|, which conveniently has landed in %xmm0 about now. 529 530 movsd -8(AX_P), %xmm6 // xmm6 <-- [ ... , c ] 531 movapd -32(AX_P), %xmm7 // xmm7 <-- [ b1hi, b1lo ] 532#if defined( __SSE3__ ) 533 movddup %xmm0, %xmm2 // xmm2 <-- [ 1/x , 1/x ] 534#else 535 movapd %xmm0, %xmm2 536 unpcklpd %xmm2, %xmm2 537#endif 538 mulsd %xmm0, %xmm6 // xmm6 <-- [ ... , c/x ] 539 addpd %xmm2, %xmm7 // xmm7 <-- [ 1/x + b1hi, 1/x + b1lo ] 540 addsd -16(AX_P), %xmm0 // xmm0 <-- [ ... , 1/x + a ] 541 mulpd %xmm7, %xmm2 // xmm2 <-- [ 1/x^2 + b1hi/x, 1/x^2 + b1lo/x ] 542 mulsd %xmm6, %xmm0 // xmm0 <-- [ ... , c/x(1/x + a) ] 543 addpd -48(AX_P), %xmm2 // xmm2 <-- [ 1/x^2 + b1hi/x + b0hi, 1/x^2 + b1lo/x + b0lo ] 544 movhlps %xmm2, %xmm4 // xmm4 <-- [ ... , 1/x^2 + b1hi/x + b0hi ] 545 mulsd %xmm2, %xmm0 // xmm0 <-- [ ... , c/x(1/x + a)(1/x^2 + b1lo/x + b0lo) ] 546 mulsd %xmm4, %xmm0 // xmm0 <-- [ ... , c/x(1/x + a)(1/x^2 + b1lo/x + b0lo)(1/x^2 + b1hi/x + b0hi) ] 547 548// Add log(2x) to the correction to get the final result 549 550 addsd %xmm3, %xmm0 551 cvtsd2ss %xmm0, %xmm0 552 orps %xmm1, %xmm0 553#if defined(__i386__) 554 movss %xmm0, FRAME_SIZE( STACKP ) 555 flds FRAME_SIZE( STACKP ) 556#endif 557 ret 558 5594: 560 addss %xmm0, %xmm0 // quiet the NaN 5615: 562 #if defined(__i386__) 563 movss %xmm0, FRAME_SIZE( STACKP ) 564 flds FRAME_SIZE( STACKP ) 565#endif 566 ret 567