this repo has no description
1/*
2 * asinhf.s
3 *
4 * by Stephen Canon
5 *
6 * Copyright (c) 2007, Apple Inc. All Rights Reserved.
7 *
8 * This file implements the C99 asinhf function for the MacOS X __i386__ and __x86_64__ ABIs.
9 */
10
11/*
12 This code divides the floating-point numbers into three ranges, and uses a different approximation
13 to asinhf on each range.
14
15 For the "small" case, |x| < 1/4, a simpler approximation by a 7th-order polynomial suffices:
16
17 asinhf(x) = copysign(x) p(x)
18
19 p(x) has no constant term, and is computed via the factorization:
20
21 p(x) = cx(x + a)(x + b)(x^2 + cx + d)(x^2 + ex + f)
22
23 the two linear terms and the two quadratic terms are computed side-by-side in packed arithmetic.
24 Some care must be taken to avoid setting inexact in the product (x+b)(x^2 + ex + f) for the case
25 when x = 0, as b and f are full-width double-precision numbers.
26
27
28 The "middle" case is 1/4 < |x| < 4. Here a straightforward rational polynomial approximation is used:
29 p(x)
30 asinhf(x) = copysign(x) ----------
31 q(x)
32 p(x) and q(x) are both order 6 polynomials, so they are computed side-by-side in packed arithmetic.
33
34
35 The "large" case is the most complicated, owing to the fact that asinh(|x|) = log(2|x|) + O(1/x^2)
36 for large x. Because of this, we take a divide and compute a logarithm (via polynomial approximation).
37 The logarithm code is copied from the file logf.s (Ians code), though some registers have been renamed,
38 so take care in making edits. A sketch of the algorithm for the log is provided where it is used in
39 this code, but more detailed comments may be found in logf.s.
40
41 After the log, a minimax polynomial in 1/x is used as a correction.
42
43 The "large" codepath is probably most suceptable to improvement - it is possible that a rational
44 approximation would prove faster (in terms of latency) for this case. The "small" and "middle"
45 code paths are believed to be more nearly optimal.
46
47 Globally, the error of this code is < .51 ulps
48
49 - stc (14 june, 2007)
50 */
51
52#include <machine/asm.h>
53#include "abi.h"
54
55.const
56
57// Coefficients for 7th order polynomial approximation on [0, 1/4]
58// The polynomail is computed in packed factored form as follows:
59//
60// hi double: (x + ahi) * (x(x + b1hi) + b0hi)
61// lo double: (cx * (x + alo)) * (x(x + b1lo) + b0lo)
62//
63// The high and low parts are then unpacked and multiplied.
64.align 4
65asinhf_low: .quad 0x4008183efcaf7119, 0x4007eba0a6c21cf1 // b0hi, b0lo
66 .quad 0x3ffe9547f4507ace, 0xbffd2c173c2ad586 // ahi, alo
67 .quad 0x40021aa6afb79159, 0xc0015e2ed556dde5 // b1hi, b1lo
68 .quad 0xbfa050433953b0f9 // c
69
70// Coefficients for rational approximation on [1/4, 4]
71// p(x) and q(x) are computed side-by-side in packed arithmetic, then unpacked and divided.
72.align 4
73asinhf_mid: .quad 0x3e4328ccef61bd30, 0x3f80f6f9cf323b3c // p[0], q[0]
74 .quad 0x3f80f6e561f06785, 0x3f85b29a3e277523 // p[1], q[1]
75 .quad 0x3f85b32a2f11b40a, 0x3f8e26416e925090 // p[2], q[2]
76 .quad 0x3f8b5062b37338e9, 0x3f8417abb8190857 // p[3], q[3]
77 .quad 0x3f807d5f61f237fc, 0x3f74ccecc3a9e525 // p[4], q[4]
78 .quad 0x3f6a9bd617bd81f1, 0x3f5135659e34c549 // p[5], q[5]
79 .quad 0x3f26c17c7b263d18, 0x3f001e11059bddca // p[6], q[6]
80
81
82.align 4
83// Polynomial coefficients for the correction to log(2x) for the "large" case.
84// The polynomial is computed in factored form as follows:
85//
86// cx(x + a)(x^2 + b1hi x + b0hi)(x^2 + b1lo x + b0lo)
87//
88// The two quadratic terms are computed with packed arithmetic.
89 .quad 0x4005413ddf1d2ea5, 0x4003fbe28981b6b8 // b0hi, b0lo
90 .quad 0x4006dd1e92567458, 0xc0057cccb680cfdf // b1hi, b1lo
91 .quad 0x3e9564f782290c65, 0x3fa3493848c3ecc6 // a, c
92// Table used to compute log(2x) for the "large" case. Copied from logf.s
93asinhf_hi_table: .quad 0x0000000000000000, 0x3ff0000000000000 // log(1.00000), 1/1.00000
94 .quad 0x3f6ff00aa2b10bc0, 0x3fefe01fe01fe020 // log(1.00391), 1/1.00391
95 .quad 0x3f7fe02a6b106789, 0x3fefc07f01fc07f0 // log(1.00781), 1/1.00781
96 .quad 0x3f87dc475f810a77, 0x3fefa11caa01fa12 // log(1.01172), 1/1.01172
97 .quad 0x3f8fc0a8b0fc03e4, 0x3fef81f81f81f820 // log(1.01562), 1/1.01562
98 .quad 0x3f93cea44346a575, 0x3fef6310aca0dbb5 // log(1.01953), 1/1.01953
99 .quad 0x3f97b91b07d5b11b, 0x3fef44659e4a4271 // log(1.02344), 1/1.02344
100 .quad 0x3f9b9fc027af9198, 0x3fef25f644230ab5 // log(1.02734), 1/1.02734
101 .quad 0x3f9f829b0e783300, 0x3fef07c1f07c1f08 // log(1.03125), 1/1.03125
102 .quad 0x3fa1b0d98923d980, 0x3feee9c7f8458e02 // log(1.03516), 1/1.03516
103 .quad 0x3fa39e87b9febd60, 0x3feecc07b301ecc0 // log(1.03906), 1/1.03906
104 .quad 0x3fa58a5bafc8e4d5, 0x3feeae807aba01eb // log(1.04297), 1/1.04297
105 .quad 0x3fa77458f632dcfc, 0x3fee9131abf0b767 // log(1.04688), 1/1.04688
106 .quad 0x3fa95c830ec8e3eb, 0x3fee741aa59750e4 // log(1.05078), 1/1.05078
107 .quad 0x3fab42dd711971bf, 0x3fee573ac901e574 // log(1.05469), 1/1.05469
108 .quad 0x3fad276b8adb0b52, 0x3fee3a9179dc1a73 // log(1.05859), 1/1.05859
109 .quad 0x3faf0a30c01162a6, 0x3fee1e1e1e1e1e1e // log(1.06250), 1/1.06250
110 .quad 0x3fb075983598e471, 0x3fee01e01e01e01e // log(1.06641), 1/1.06641
111 .quad 0x3fb16536eea37ae1, 0x3fede5d6e3f8868a // log(1.07031), 1/1.07031
112 .quad 0x3fb253f62f0a1417, 0x3fedca01dca01dca // log(1.07422), 1/1.07422
113 .quad 0x3fb341d7961bd1d1, 0x3fedae6076b981db // log(1.07812), 1/1.07812
114 .quad 0x3fb42edcbea646f0, 0x3fed92f2231e7f8a // log(1.08203), 1/1.08203
115 .quad 0x3fb51b073f06183f, 0x3fed77b654b82c34 // log(1.08594), 1/1.08594
116 .quad 0x3fb60658a93750c4, 0x3fed5cac807572b2 // log(1.08984), 1/1.08984
117 .quad 0x3fb6f0d28ae56b4c, 0x3fed41d41d41d41d // log(1.09375), 1/1.09375
118 .quad 0x3fb7da766d7b12cd, 0x3fed272ca3fc5b1a // log(1.09766), 1/1.09766
119 .quad 0x3fb8c345d6319b21, 0x3fed0cb58f6ec074 // log(1.10156), 1/1.10156
120 .quad 0x3fb9ab42462033ad, 0x3fecf26e5c44bfc6 // log(1.10547), 1/1.10547
121 .quad 0x3fba926d3a4ad563, 0x3fecd85689039b0b // log(1.10938), 1/1.10938
122 .quad 0x3fbb78c82bb0eda1, 0x3fecbe6d9601cbe7 // log(1.11328), 1/1.11328
123 .quad 0x3fbc5e548f5bc743, 0x3feca4b3055ee191 // log(1.11719), 1/1.11719
124 .quad 0x3fbd4313d66cb35d, 0x3fec8b265afb8a42 // log(1.12109), 1/1.12109
125 .quad 0x3fbe27076e2af2e6, 0x3fec71c71c71c71c // log(1.12500), 1/1.12500
126 .quad 0x3fbf0a30c01162a6, 0x3fec5894d10d4986 // log(1.12891), 1/1.12891
127 .quad 0x3fbfec9131dbeabb, 0x3fec3f8f01c3f8f0 // log(1.13281), 1/1.13281
128 .quad 0x3fc0671512ca596e, 0x3fec26b5392ea01c // log(1.13672), 1/1.13672
129 .quad 0x3fc0d77e7cd08e59, 0x3fec0e070381c0e0 // log(1.14062), 1/1.14062
130 .quad 0x3fc14785846742ac, 0x3febf583ee868d8b // log(1.14453), 1/1.14453
131 .quad 0x3fc1b72ad52f67a0, 0x3febdd2b899406f7 // log(1.14844), 1/1.14844
132 .quad 0x3fc2266f190a5acb, 0x3febc4fd65883e7b // log(1.15234), 1/1.15234
133 .quad 0x3fc29552f81ff523, 0x3febacf914c1bad0 // log(1.15625), 1/1.15625
134 .quad 0x3fc303d718e47fd3, 0x3feb951e2b18ff23 // log(1.16016), 1/1.16016
135 .quad 0x3fc371fc201e8f74, 0x3feb7d6c3dda338b // log(1.16406), 1/1.16406
136 .quad 0x3fc3dfc2b0ecc62a, 0x3feb65e2e3beee05 // log(1.16797), 1/1.16797
137 .quad 0x3fc44d2b6ccb7d1e, 0x3feb4e81b4e81b4f // log(1.17188), 1/1.17188
138 .quad 0x3fc4ba36f39a55e5, 0x3feb37484ad806ce // log(1.17578), 1/1.17578
139 .quad 0x3fc526e5e3a1b438, 0x3feb2036406c80d9 // log(1.17969), 1/1.17969
140 .quad 0x3fc59338d9982086, 0x3feb094b31d922a4 // log(1.18359), 1/1.18359
141 .quad 0x3fc5ff3070a793d4, 0x3feaf286bca1af28 // log(1.18750), 1/1.18750
142 .quad 0x3fc66acd4272ad51, 0x3feadbe87f94905e // log(1.19141), 1/1.19141
143 .quad 0x3fc6d60fe719d21d, 0x3feac5701ac5701b // log(1.19531), 1/1.19531
144 .quad 0x3fc740f8f54037a5, 0x3feaaf1d2f87ebfd // log(1.19922), 1/1.19922
145 .quad 0x3fc7ab890210d909, 0x3fea98ef606a63be // log(1.20312), 1/1.20312
146 .quad 0x3fc815c0a14357eb, 0x3fea82e65130e159 // log(1.20703), 1/1.20703
147 .quad 0x3fc87fa06520c911, 0x3fea6d01a6d01a6d // log(1.21094), 1/1.21094
148 .quad 0x3fc8e928de886d41, 0x3fea574107688a4a // log(1.21484), 1/1.21484
149 .quad 0x3fc9525a9cf456b4, 0x3fea41a41a41a41a // log(1.21875), 1/1.21875
150 .quad 0x3fc9bb362e7dfb83, 0x3fea2c2a87c51ca0 // log(1.22266), 1/1.22266
151 .quad 0x3fca23bc1fe2b563, 0x3fea16d3f97a4b02 // log(1.22656), 1/1.22656
152 .quad 0x3fca8becfc882f19, 0x3fea01a01a01a01a // log(1.23047), 1/1.23047
153 .quad 0x3fcaf3c94e80bff3, 0x3fe9ec8e951033d9 // log(1.23438), 1/1.23438
154 .quad 0x3fcb5b519e8fb5a4, 0x3fe9d79f176b682d // log(1.23828), 1/1.23828
155 .quad 0x3fcbc286742d8cd6, 0x3fe9c2d14ee4a102 // log(1.24219), 1/1.24219
156 .quad 0x3fcc2968558c18c1, 0x3fe9ae24ea5510da // log(1.24609), 1/1.24609
157 .quad 0x3fcc8ff7c79a9a22, 0x3fe999999999999a // log(1.25000), 1/1.25000
158 .quad 0x3fccf6354e09c5dc, 0x3fe9852f0d8ec0ff // log(1.25391), 1/1.25391
159 .quad 0x3fcd5c216b4fbb91, 0x3fe970e4f80cb872 // log(1.25781), 1/1.25781
160 .quad 0x3fcdc1bca0abec7d, 0x3fe95cbb0be377ae // log(1.26172), 1/1.26172
161 .quad 0x3fce27076e2af2e6, 0x3fe948b0fcd6e9e0 // log(1.26562), 1/1.26562
162 .quad 0x3fce8c0252aa5a60, 0x3fe934c67f9b2ce6 // log(1.26953), 1/1.26953
163 .quad 0x3fcef0adcbdc5936, 0x3fe920fb49d0e229 // log(1.27344), 1/1.27344
164 .quad 0x3fcf550a564b7b37, 0x3fe90d4f120190d5 // log(1.27734), 1/1.27734
165 .quad 0x3fcfb9186d5e3e2b, 0x3fe8f9c18f9c18fa // log(1.28125), 1/1.28125
166 .quad 0x3fd00e6c45ad501d, 0x3fe8e6527af1373f // log(1.28516), 1/1.28516
167 .quad 0x3fd0402594b4d041, 0x3fe8d3018d3018d3 // log(1.28906), 1/1.28906
168 .quad 0x3fd071b85fcd590d, 0x3fe8bfce8062ff3a // log(1.29297), 1/1.29297
169 .quad 0x3fd0a324e27390e3, 0x3fe8acb90f6bf3aa // log(1.29688), 1/1.29688
170 .quad 0x3fd0d46b579ab74b, 0x3fe899c0f601899c // log(1.30078), 1/1.30078
171 .quad 0x3fd1058bf9ae4ad5, 0x3fe886e5f0abb04a // log(1.30469), 1/1.30469
172 .quad 0x3fd136870293a8b0, 0x3fe87427bcc092b9 // log(1.30859), 1/1.30859
173 .quad 0x3fd1675cababa60e, 0x3fe8618618618618 // log(1.31250), 1/1.31250
174 .quad 0x3fd1980d2dd4236f, 0x3fe84f00c2780614 // log(1.31641), 1/1.31641
175 .quad 0x3fd1c898c16999fb, 0x3fe83c977ab2bedd // log(1.32031), 1/1.32031
176 .quad 0x3fd1f8ff9e48a2f3, 0x3fe82a4a0182a4a0 // log(1.32422), 1/1.32422
177 .quad 0x3fd22941fbcf7966, 0x3fe8181818181818 // log(1.32812), 1/1.32812
178 .quad 0x3fd2596010df763a, 0x3fe8060180601806 // log(1.33203), 1/1.33203
179 .quad 0x3fd2895a13de86a3, 0x3fe7f405fd017f40 // log(1.33594), 1/1.33594
180 .quad 0x3fd2b9303ab89d25, 0x3fe7e225515a4f1d // log(1.33984), 1/1.33984
181 .quad 0x3fd2e8e2bae11d31, 0x3fe7d05f417d05f4 // log(1.34375), 1/1.34375
182 .quad 0x3fd31871c9544185, 0x3fe7beb3922e017c // log(1.34766), 1/1.34766
183 .quad 0x3fd347dd9a987d55, 0x3fe7ad2208e0ecc3 // log(1.35156), 1/1.35156
184 .quad 0x3fd3772662bfd85b, 0x3fe79baa6bb6398b // log(1.35547), 1/1.35547
185 .quad 0x3fd3a64c556945ea, 0x3fe78a4c8178a4c8 // log(1.35938), 1/1.35938
186 .quad 0x3fd3d54fa5c1f710, 0x3fe77908119ac60d // log(1.36328), 1/1.36328
187 .quad 0x3fd404308686a7e4, 0x3fe767dce434a9b1 // log(1.36719), 1/1.36719
188 .quad 0x3fd432ef2a04e814, 0x3fe756cac201756d // log(1.37109), 1/1.37109
189 .quad 0x3fd4618bc21c5ec2, 0x3fe745d1745d1746 // log(1.37500), 1/1.37500
190 .quad 0x3fd49006804009d1, 0x3fe734f0c541fe8d // log(1.37891), 1/1.37891
191 .quad 0x3fd4be5f957778a1, 0x3fe724287f46debc // log(1.38281), 1/1.38281
192 .quad 0x3fd4ec973260026a, 0x3fe713786d9c7c09 // log(1.38672), 1/1.38672
193 .quad 0x3fd51aad872df82d, 0x3fe702e05c0b8170 // log(1.39062), 1/1.39062
194 .quad 0x3fd548a2c3add263, 0x3fe6f26016f26017 // log(1.39453), 1/1.39453
195 .quad 0x3fd5767717455a6c, 0x3fe6e1f76b4337c7 // log(1.39844), 1/1.39844
196 .quad 0x3fd5a42ab0f4cfe2, 0x3fe6d1a62681c861 // log(1.40234), 1/1.40234
197 .quad 0x3fd5d1bdbf5809ca, 0x3fe6c16c16c16c17 // log(1.40625), 1/1.40625
198 .quad 0x3fd5ff3070a793d4, 0x3fe6b1490aa31a3d // log(1.41016), 1/1.41016
199 .quad 0x3fd62c82f2b9c795, 0x3fe6a13cd1537290 // log(1.41406), 1/1.41406
200 .quad 0x3fd659b57303e1f3, 0x3fe691473a88d0c0 // log(1.41797), 1/1.41797
201 .quad 0x3fd686c81e9b14af, 0x3fe6816816816817 // log(1.42188), 1/1.42188
202 .quad 0x3fd6b3bb2235943e, 0x3fe6719f3601671a // log(1.42578), 1/1.42578
203 .quad 0x3fd6e08eaa2ba1e4, 0x3fe661ec6a5122f9 // log(1.42969), 1/1.42969
204 .quad 0x3fd70d42e2789236, 0x3fe6524f853b4aa3 // log(1.43359), 1/1.43359
205 .quad 0x3fd739d7f6bbd007, 0x3fe642c8590b2164 // log(1.43750), 1/1.43750
206 .quad 0x3fd7664e1239dbcf, 0x3fe63356b88ac0de // log(1.44141), 1/1.44141
207 .quad 0x3fd792a55fdd47a2, 0x3fe623fa77016240 // log(1.44531), 1/1.44531
208 .quad 0x3fd7bede0a37afc0, 0x3fe614b36831ae94 // log(1.44922), 1/1.44922
209 .quad 0x3fd7eaf83b82afc3, 0x3fe6058160581606 // log(1.45312), 1/1.45312
210 .quad 0x3fd816f41da0d496, 0x3fe5f66434292dfc // log(1.45703), 1/1.45703
211 .quad 0x3fd842d1da1e8b17, 0x3fe5e75bb8d015e7 // log(1.46094), 1/1.46094
212 .quad 0x3fd86e919a330ba0, 0x3fe5d867c3ece2a5 // log(1.46484), 1/1.46484
213 .quad 0x3fd89a3386c1425b, 0x3fe5c9882b931057 // log(1.46875), 1/1.46875
214 .quad 0x3fd8c5b7c858b48b, 0x3fe5babcc647fa91 // log(1.47266), 1/1.47266
215 .quad 0x3fd8f11e873662c7, 0x3fe5ac056b015ac0 // log(1.47656), 1/1.47656
216 .quad 0x3fd91c67eb45a83e, 0x3fe59d61f123ccaa // log(1.48047), 1/1.48047
217 .quad 0x3fd947941c2116fb, 0x3fe58ed2308158ed // log(1.48438), 1/1.48438
218 .quad 0x3fd972a341135158, 0x3fe5805601580560 // log(1.48828), 1/1.48828
219 .quad 0x3fd99d958117e08b, 0x3fe571ed3c506b3a // log(1.49219), 1/1.49219
220 .quad 0x3fd9c86b02dc0863, 0x3fe56397ba7c52e2 // log(1.49609), 1/1.49609
221 .quad 0x3fd9f323ecbf984c, 0x3fe5555555555555 // log(1.50000), 1/1.50000
222 .quad 0x3fda1dc064d5b995, 0x3fe54725e6bb82fe // log(1.50391), 1/1.50391
223 .quad 0x3fda484090e5bb0a, 0x3fe5390948f40feb // log(1.50781), 1/1.50781
224 .quad 0x3fda72a4966bd9ea, 0x3fe52aff56a8054b // log(1.51172), 1/1.51172
225 .quad 0x3fda9cec9a9a084a, 0x3fe51d07eae2f815 // log(1.51562), 1/1.51562
226 .quad 0x3fdac718c258b0e4, 0x3fe50f22e111c4c5 // log(1.51953), 1/1.51953
227 .quad 0x3fdaf1293247786b, 0x3fe5015015015015 // log(1.52344), 1/1.52344
228 .quad 0x3fdb1b1e0ebdfc5b, 0x3fe4f38f62dd4c9b // log(1.52734), 1/1.52734
229 .quad 0x3fdb44f77bcc8f63, 0x3fe4e5e0a72f0539 // log(1.53125), 1/1.53125
230 .quad 0x3fdb6eb59d3cf35e, 0x3fe4d843bedc2c4c // log(1.53516), 1/1.53516
231 .quad 0x3fdb9858969310fb, 0x3fe4cab88725af6e // log(1.53906), 1/1.53906
232 .quad 0x3fdbc1e08b0dad0a, 0x3fe4bd3edda68fe1 // log(1.54297), 1/1.54297
233 .quad 0x3fdbeb4d9da71b7c, 0x3fe4afd6a052bf5b // log(1.54688), 1/1.54688
234 .quad 0x3fdc149ff115f027, 0x3fe4a27fad76014a // log(1.55078), 1/1.55078
235 .quad 0x3fdc3dd7a7cdad4d, 0x3fe49539e3b2d067 // log(1.55469), 1/1.55469
236 .quad 0x3fdc66f4e3ff6ff8, 0x3fe4880522014880 // log(1.55859), 1/1.55859
237 .quad 0x3fdc8ff7c79a9a22, 0x3fe47ae147ae147b // log(1.56250), 1/1.56250
238 .quad 0x3fdcb8e0744d7aca, 0x3fe46dce34596066 // log(1.56641), 1/1.56641
239 .quad 0x3fdce1af0b85f3eb, 0x3fe460cbc7f5cf9a // log(1.57031), 1/1.57031
240 .quad 0x3fdd0a63ae721e64, 0x3fe453d9e2c776ca // log(1.57422), 1/1.57422
241 .quad 0x3fdd32fe7e00ebd5, 0x3fe446f86562d9fb // log(1.57812), 1/1.57812
242 .quad 0x3fdd5b7f9ae2c684, 0x3fe43a2730abee4d // log(1.58203), 1/1.58203
243 .quad 0x3fdd83e7258a2f3e, 0x3fe42d6625d51f87 // log(1.58594), 1/1.58594
244 .quad 0x3fddac353e2c5954, 0x3fe420b5265e5951 // log(1.58984), 1/1.58984
245 .quad 0x3fddd46a04c1c4a1, 0x3fe4141414141414 // log(1.59375), 1/1.59375
246 .quad 0x3fddfc859906d5b5, 0x3fe40782d10e6566 // log(1.59766), 1/1.59766
247 .quad 0x3fde24881a7c6c26, 0x3fe3fb013fb013fb // log(1.60156), 1/1.60156
248 .quad 0x3fde4c71a8687704, 0x3fe3ee8f42a5af07 // log(1.60547), 1/1.60547
249 .quad 0x3fde744261d68788, 0x3fe3e22cbce4a902 // log(1.60938), 1/1.60938
250 .quad 0x3fde9bfa659861f5, 0x3fe3d5d991aa75c6 // log(1.61328), 1/1.61328
251 .quad 0x3fdec399d2468cc0, 0x3fe3c995a47babe7 // log(1.61719), 1/1.61719
252 .quad 0x3fdeeb20c640ddf4, 0x3fe3bd60d9232955 // log(1.62109), 1/1.62109
253 .quad 0x3fdf128f5faf06ed, 0x3fe3b13b13b13b14 // log(1.62500), 1/1.62500
254 .quad 0x3fdf39e5bc811e5c, 0x3fe3a524387ac822 // log(1.62891), 1/1.62891
255 .quad 0x3fdf6123fa7028ac, 0x3fe3991c2c187f63 // log(1.63281), 1/1.63281
256 .quad 0x3fdf884a36fe9ec2, 0x3fe38d22d366088e // log(1.63672), 1/1.63672
257 .quad 0x3fdfaf588f78f31f, 0x3fe3813813813814 // log(1.64062), 1/1.64062
258 .quad 0x3fdfd64f20f61572, 0x3fe3755bd1c945ee // log(1.64453), 1/1.64453
259 .quad 0x3fdffd2e0857f498, 0x3fe3698df3de0748 // log(1.64844), 1/1.64844
260 .quad 0x3fe011fab125ff8a, 0x3fe35dce5f9f2af8 // log(1.65234), 1/1.65234
261 .quad 0x3fe02552a5a5d0ff, 0x3fe3521cfb2b78c1 // log(1.65625), 1/1.65625
262 .quad 0x3fe0389eefce633b, 0x3fe34679ace01346 // log(1.66016), 1/1.66016
263 .quad 0x3fe04bdf9da926d2, 0x3fe33ae45b57bcb2 // log(1.66406), 1/1.66406
264 .quad 0x3fe05f14bd26459c, 0x3fe32f5ced6a1dfa // log(1.66797), 1/1.66797
265 .quad 0x3fe0723e5c1cdf40, 0x3fe323e34a2b10bf // log(1.67188), 1/1.67188
266 .quad 0x3fe0855c884b450e, 0x3fe3187758e9ebb6 // log(1.67578), 1/1.67578
267 .quad 0x3fe0986f4f573521, 0x3fe30d190130d190 // log(1.67969), 1/1.67969
268 .quad 0x3fe0ab76bece14d2, 0x3fe301c82ac40260 // log(1.68359), 1/1.68359
269 .quad 0x3fe0be72e4252a83, 0x3fe2f684bda12f68 // log(1.68750), 1/1.68750
270 .quad 0x3fe0d163ccb9d6b8, 0x3fe2eb4ea1fed14b // log(1.69141), 1/1.69141
271 .quad 0x3fe0e44985d1cc8c, 0x3fe2e025c04b8097 // log(1.69531), 1/1.69531
272 .quad 0x3fe0f7241c9b497d, 0x3fe2d50a012d50a0 // log(1.69922), 1/1.69922
273 .quad 0x3fe109f39e2d4c97, 0x3fe2c9fb4d812ca0 // log(1.70312), 1/1.70312
274 .quad 0x3fe11cb81787ccf8, 0x3fe2bef98e5a3711 // log(1.70703), 1/1.70703
275 .quad 0x3fe12f719593efbc, 0x3fe2b404ad012b40 // log(1.71094), 1/1.71094
276 .quad 0x3fe1422025243d45, 0x3fe2a91c92f3c105 // log(1.71484), 1/1.71484
277 .quad 0x3fe154c3d2f4d5ea, 0x3fe29e4129e4129e // log(1.71875), 1/1.71875
278 .quad 0x3fe1675cababa60e, 0x3fe293725bb804a5 // log(1.72266), 1/1.72266
279 .quad 0x3fe179eabbd899a1, 0x3fe288b01288b013 // log(1.72656), 1/1.72656
280 .quad 0x3fe18c6e0ff5cf06, 0x3fe27dfa38a1ce4d // log(1.73047), 1/1.73047
281 .quad 0x3fe19ee6b467c96f, 0x3fe27350b8812735 // log(1.73438), 1/1.73438
282 .quad 0x3fe1b154b57da29f, 0x3fe268b37cd60127 // log(1.73828), 1/1.73828
283 .quad 0x3fe1c3b81f713c25, 0x3fe25e22708092f1 // log(1.74219), 1/1.74219
284 .quad 0x3fe1d610fe677003, 0x3fe2539d7e9177b2 // log(1.74609), 1/1.74609
285 .quad 0x3fe1e85f5e7040d0, 0x3fe2492492492492 // log(1.75000), 1/1.75000
286 .quad 0x3fe1faa34b87094c, 0x3fe23eb79717605b // log(1.75391), 1/1.75391
287 .quad 0x3fe20cdcd192ab6e, 0x3fe23456789abcdf // log(1.75781), 1/1.75781
288 .quad 0x3fe21f0bfc65beec, 0x3fe22a0122a0122a // log(1.76172), 1/1.76172
289 .quad 0x3fe23130d7bebf43, 0x3fe21fb78121fb78 // log(1.76562), 1/1.76562
290 .quad 0x3fe2434b6f483934, 0x3fe21579804855e6 // log(1.76953), 1/1.76953
291 .quad 0x3fe2555bce98f7cb, 0x3fe20b470c67c0d9 // log(1.77344), 1/1.77344
292 .quad 0x3fe26762013430e0, 0x3fe2012012012012 // log(1.77734), 1/1.77734
293 .quad 0x3fe2795e1289b11b, 0x3fe1f7047dc11f70 // log(1.78125), 1/1.78125
294 .quad 0x3fe28b500df60783, 0x3fe1ecf43c7fb84c // log(1.78516), 1/1.78516
295 .quad 0x3fe29d37fec2b08b, 0x3fe1e2ef3b3fb874 // log(1.78906), 1/1.78906
296 .quad 0x3fe2af15f02640ad, 0x3fe1d8f5672e4abd // log(1.79297), 1/1.79297
297 .quad 0x3fe2c0e9ed448e8c, 0x3fe1cf06ada2811d // log(1.79688), 1/1.79688
298 .quad 0x3fe2d2b4012edc9e, 0x3fe1c522fc1ce059 // log(1.80078), 1/1.80078
299 .quad 0x3fe2e47436e40268, 0x3fe1bb4a4046ed29 // log(1.80469), 1/1.80469
300 .quad 0x3fe2f62a99509546, 0x3fe1b17c67f2bae3 // log(1.80859), 1/1.80859
301 .quad 0x3fe307d7334f10be, 0x3fe1a7b9611a7b96 // log(1.81250), 1/1.81250
302 .quad 0x3fe3197a0fa7fe6a, 0x3fe19e0119e0119e // log(1.81641), 1/1.81641
303 .quad 0x3fe32b1339121d71, 0x3fe19453808ca29c // log(1.82031), 1/1.82031
304 .quad 0x3fe33ca2ba328995, 0x3fe18ab083902bdb // log(1.82422), 1/1.82422
305 .quad 0x3fe34e289d9ce1d3, 0x3fe1811811811812 // log(1.82812), 1/1.82812
306 .quad 0x3fe35fa4edd36ea0, 0x3fe1778a191bd684 // log(1.83203), 1/1.83203
307 .quad 0x3fe37117b54747b6, 0x3fe16e0689427379 // log(1.83594), 1/1.83594
308 .quad 0x3fe38280fe58797f, 0x3fe1648d50fc3201 // log(1.83984), 1/1.83984
309 .quad 0x3fe393e0d3562a1a, 0x3fe15b1e5f75270d // log(1.84375), 1/1.84375
310 .quad 0x3fe3a5373e7ebdfa, 0x3fe151b9a3fdd5c9 // log(1.84766), 1/1.84766
311 .quad 0x3fe3b68449fffc23, 0x3fe1485f0e0acd3b // log(1.85156), 1/1.85156
312 .quad 0x3fe3c7c7fff73206, 0x3fe13f0e8d344724 // log(1.85547), 1/1.85547
313 .quad 0x3fe3d9026a7156fb, 0x3fe135c81135c811 // log(1.85938), 1/1.85938
314 .quad 0x3fe3ea33936b2f5c, 0x3fe12c8b89edc0ac // log(1.86328), 1/1.86328
315 .quad 0x3fe3fb5b84d16f42, 0x3fe12358e75d3033 // log(1.86719), 1/1.86719
316 .quad 0x3fe40c7a4880dce9, 0x3fe11a3019a74826 // log(1.87109), 1/1.87109
317 .quad 0x3fe41d8fe84672ae, 0x3fe1111111111111 // log(1.87500), 1/1.87500
318 .quad 0x3fe42e9c6ddf80bf, 0x3fe107fbbe011080 // log(1.87891), 1/1.87891
319 .quad 0x3fe43f9fe2f9ce67, 0x3fe0fef010fef011 // log(1.88281), 1/1.88281
320 .quad 0x3fe4509a5133bb0a, 0x3fe0f5edfab325a2 // log(1.88672), 1/1.88672
321 .quad 0x3fe4618bc21c5ec2, 0x3fe0ecf56be69c90 // log(1.89062), 1/1.89062
322 .quad 0x3fe472743f33aaad, 0x3fe0e40655826011 // log(1.89453), 1/1.89453
323 .quad 0x3fe48353d1ea88df, 0x3fe0db20a88f4696 // log(1.89844), 1/1.89844
324 .quad 0x3fe4942a83a2fc07, 0x3fe0d24456359e3a // log(1.90234), 1/1.90234
325 .quad 0x3fe4a4f85db03ebb, 0x3fe0c9714fbcda3b // log(1.90625), 1/1.90625
326 .quad 0x3fe4b5bd6956e274, 0x3fe0c0a7868b4171 // log(1.91016), 1/1.91016
327 .quad 0x3fe4c679afccee3a, 0x3fe0b7e6ec259dc8 // log(1.91406), 1/1.91406
328 .quad 0x3fe4d72d3a39fd00, 0x3fe0af2f722eecb5 // log(1.91797), 1/1.91797
329 .quad 0x3fe4e7d811b75bb1, 0x3fe0a6810a6810a7 // log(1.92188), 1/1.92188
330 .quad 0x3fe4f87a3f5026e9, 0x3fe09ddba6af8360 // log(1.92578), 1/1.92578
331 .quad 0x3fe50913cc01686b, 0x3fe0953f39010954 // log(1.92969), 1/1.92969
332 .quad 0x3fe519a4c0ba3446, 0x3fe08cabb37565e2 // log(1.93359), 1/1.93359
333 .quad 0x3fe52a2d265bc5ab, 0x3fe0842108421084 // log(1.93750), 1/1.93750
334 .quad 0x3fe53aad05b99b7d, 0x3fe07b9f29b8eae2 // log(1.94141), 1/1.94141
335 .quad 0x3fe54b2467999498, 0x3fe073260a47f7c6 // log(1.94531), 1/1.94531
336 .quad 0x3fe55b9354b40bcd, 0x3fe06ab59c7912fb // log(1.94922), 1/1.94922
337 .quad 0x3fe56bf9d5b3f399, 0x3fe0624dd2f1a9fc // log(1.95312), 1/1.95312
338 .quad 0x3fe57c57f336f191, 0x3fe059eea0727586 // log(1.95703), 1/1.95703
339 .quad 0x3fe58cadb5cd7989, 0x3fe05197f7d73404 // log(1.96094), 1/1.96094
340 .quad 0x3fe59cfb25fae87e, 0x3fe04949cc1664c5 // log(1.96484), 1/1.96484
341 .quad 0x3fe5ad404c359f2d, 0x3fe0410410410410 // log(1.96875), 1/1.96875
342 .quad 0x3fe5bd7d30e71c73, 0x3fe038c6b78247fc // log(1.97266), 1/1.97266
343 .quad 0x3fe5cdb1dc6c1765, 0x3fe03091b51f5e1a // log(1.97656), 1/1.97656
344 .quad 0x3fe5ddde57149923, 0x3fe02864fc7729e9 // log(1.98047), 1/1.98047
345 .quad 0x3fe5ee02a9241675, 0x3fe0204081020408 // log(1.98438), 1/1.98438
346 .quad 0x3fe5fe1edad18919, 0x3fe0182436517a37 // log(1.98828), 1/1.98828
347 .quad 0x3fe60e32f44788d9, 0x3fe0101010101010 // log(1.99219), 1/1.99219
348 .quad 0x3fe61e3efda46467, 0x3fe0080402010080 // log(1.99609), 1/1.99609
349
350.literal8
351.align 3
352one: .quad 0x3ff0000000000000
353onehalf: .quad 0x3fe0000000000000
354onethird: .quad 0x3fd5555555555555
355log2: .quad 0x3fe62e42fefa39ef // ln(2)
356
357.text
358#if defined( __x86_64__ )
359 #define RELATIVE_ADDR( _a ) (_a)( %rip )
360#else
361 #define RELATIVE_ADDR( _a ) (_a)-asinhf_body( %ecx )
362
363.align 4
364asinhf_pic:
365 movl (%esp), %ecx // Copy address of this instruction to %ecx
366 ret
367#endif
368
369ENTRY(asinhf)
370#if defined(__i386__)
371 movl FRAME_SIZE(STACKP), %eax
372 movss FRAME_SIZE(STACKP), %xmm0
373 calll asinhf_pic
374asinhf_body:
375#else
376 movd %xmm0, %eax
377#endif
378 andnpd %xmm5, %xmm5 // zero out xmm5
379
380 andl $0x7fffffff, %eax // eax <-- |x|
381
382 movd %eax, %xmm1 // xmm1 <-- |x|
383 cvtss2sd %xmm1, %xmm2 // xmm2 <-- (double)|x|
384 andnps %xmm0, %xmm1 // xmm1 <-- signbit(x)
385
386 subl $0x3e800000, %eax // eax <-- |x| - 1/4 as integers
387 cmpl $0x02000000, %eax // if ( |x| > 4 or |x| < 1/4 or isnan(x) )
388 ja 2f // goto 2
389
390 // 1/4 < |x| < 4
391 lea RELATIVE_ADDR(asinhf_mid), DX_P
392#if defined( __SSE3__ )
393 movddup %xmm2, %xmm0 // xmm0 <-- [x, x]
394#else
395 movapd %xmm2, %xmm0
396 unpcklpd %xmm0, %xmm0
397#endif
398
399 mulsd %xmm2, %xmm2 // xmm2 <-- x*x
400 movapd %xmm0, %xmm3 // xmm3 <-- [x, x]
401 movapd %xmm0, %xmm4 // xmm4 <-- [x, x]
402 mulpd 48(DX_P), %xmm3 // xmm3 <-- [p3x, q3x]
403 mulpd 80(DX_P), %xmm4 // xmm4 <-- [p5x, q5x]
404 mulpd 16(DX_P), %xmm0 // xmm0 <-- [p1x, q1x]
405 movapd 96(DX_P), %xmm6 // xmm6 <-- [p6, q6]
406
407#if defined( __SSE3__ )
408 movddup %xmm2, %xmm5 // xmm5 <-- [x2, x2]
409#else
410 movapd %xmm2, %xmm5
411 unpcklpd %xmm5, %xmm5
412#endif
413
414 mulsd %xmm2, %xmm2 // xmm2 <-- x4
415 mulpd %xmm5, %xmm6 // xmm6 <-- [p6x2, q6x2]
416 addpd 32(DX_P), %xmm3 // xmm3 <-- [p3x + p2, q3x + q2]
417 addpd 64(DX_P), %xmm4 // xmm4 <-- [p5x + p4, q5x + q4]
418 addpd (DX_P), %xmm0 // xmm0 <-- [p1x + p0, q1x + q0]
419
420 mulpd %xmm3, %xmm5 // xmm5 <-- [p3x3 + p2x2, q3x3 + q2x2]
421 addpd %xmm4, %xmm6 // xmm6 <-- [p6x2 + p5x + p4, q6x2 + q5x + q4]
422 unpcklpd %xmm2, %xmm2 // xmm2 <-- [x4, x4]
423 mulpd %xmm2, %xmm6 // xmm6 <-- [p6x6 + p5x5 + p4x4, q6x6 + q5x5 + q4x4]
424 addpd %xmm0, %xmm5 // xmm5 <-- [p3x3 + p2x2 + p1x + p0, q3x3 + q2x2 + q1x + q0]
425 addpd %xmm6, %xmm5 // xmm5 <-- [p(x), q(x)]
426
427 movhlps %xmm5, %xmm6 // xmm6 <-- q(x)
428 divsd %xmm6, %xmm5 // xmm5 <-- p(x)/q(x)
429
430 cvtsd2ss %xmm5, %xmm0
431 orps %xmm1, %xmm0
432#if defined(__i386__)
433 movss %xmm0, FRAME_SIZE( STACKP )
434 flds FRAME_SIZE( STACKP )
435#endif
436 ret
437
4382:
439 jge 3f // if ( |x| > 4 or isnan(x) ) goto 3
440
441// Polynomial approximation to asinhf(x) for |x| < 1/4. Details of the computation are at top with the constants.
442
443 lea RELATIVE_ADDR(asinhf_low), DX_P
444
445#if defined( __SSE3__ )
446 movddup %xmm2, %xmm0 // xmm0 <-- [x, x]
447#else
448 movapd %xmm2, %xmm0
449 unpcklpd %xmm0, %xmm0
450#endif
451
452 cmppd $0, %xmm0, %xmm5 // detect |x| = 0
453
454 mulsd 48(DX_P), %xmm2 // xmm2 <-- [..., cx]
455 movapd %xmm0, %xmm3 // xmm3 <-- [x, x]
456 addpd 32(DX_P), %xmm0 // xmm0 <-- [x, x] + [b1hi, b1lo]
457 movapd %xmm3, %xmm4 // xmm4 <-- [x, x]
458 addpd 16(DX_P), %xmm3 // xmm3 <-- [x, x] + [ahi, alo]
459 mulpd %xmm4, %xmm0 // xmm0 <-- [x, x] * [x + b1hi, x + b1lo]
460 mulsd %xmm2, %xmm3 // xmm3 <-- [x + ahi, cx * (x + alo)]
461 addpd (DX_P), %xmm0 // xmm0 <-- [x^2 + b1hi x, x^2 + b1lo x] + [b0hi, b0lo]
462
463 andnpd %xmm3, %xmm5 // if |x| = 0, set xmm3 = 0 to suppress inexact from the next multiply.
464
465 mulpd %xmm5, %xmm0 // xmm0 <-- [(x + ahi)(x^2 + b1hi x + b0hi), cx(x + alo)(x^2 + b1lo x + b0lo)]
466 movhlps %xmm0, %xmm2 // xmm2 <-- [..., (x + ahi)(x^2 + b1hi x + b0hi)]
467 mulsd %xmm2, %xmm0 // xmm0 <-- cx(x + alo)(x + ahi)(x^2 + b1lo x + b0lo)(x^2 + b1hi x + b0hi)
468
469 cvtsd2ss %xmm0, %xmm0
470 orps %xmm1, %xmm0
471#if defined(__i386__)
472 movss %xmm0, FRAME_SIZE( STACKP )
473 flds FRAME_SIZE( STACKP )
474#endif
475 ret
476
4773:
478 addl $0x3e800000, %eax // %eax <-- |x|
479 cmpl $0x7f800000, %eax
480 ja 4f // if isnan(x) goto 4
481 je 5f // if (|x| = �) goto 5
482
483// Go off and compute the reciprocal of |x|. This will take a little while, but we have other stuff to do
484// while this is up in the air.
485
486 movsd RELATIVE_ADDR( one ), %xmm0
487 divsd %xmm2, %xmm0 // xmm0 <-- 1/|x|
488
489// We need to take a logarithm for this case. This code is shamelessly taken from our log2f.s implementation.
490// Consult that file for more details of operation.
491
492// begin by factoring x as 2^n*(mantissa)
493
494 movl %eax, %edx
495 shrl $23, %edx // right-align exponent bits
496 subl $126, %edx // subtract exponent bias, but add one to get log(2x) instead of log(x)
497 // i.e. we subtract 126 instead of 127 (the actual bias).
498 cvtsi2sd %edx, %xmm2 // xmm2 <-- trunc(lg |x|) = n
499 mulsd RELATIVE_ADDR(log2), %xmm2 // xmm2 <-- log(2^n)
500
501// now, let mantissa = 1 + a + b = (1 + a)(1 + r), if r = b/(1+a), and
502//
503// log(mantissa) = log(1+a) + log(1+r).
504//
505// we look up log(1+a) and 1/(1+a) in a table, keyed from a, and we compute log(1+r) via taylor series.
506
507 movl %eax, %edx
508 andl $0x00007fff, %eax // eax <-- b
509 andl $0x007f8000, %edx // edx <-- a << 15
510 orl $0x3f800000, %eax // eax <-- 1 + b
511 shrl $(15-4), %edx // edx <-- a << 4, lookup key for log(1+a)
512 movd %eax, %xmm3 // xmm3 <-- 1 + b
513 cvtss2sd %xmm3, %xmm3
514 subsd RELATIVE_ADDR( one ), %xmm3 // xmm3 <-- b
515 lea RELATIVE_ADDR(asinhf_hi_table), AX_P
516 mulsd 8(AX_P, DX_P, 1), %xmm3 // xmm3 <-- r = b * 1/(1+a)
517 movapd %xmm3, %xmm4
518 mulsd RELATIVE_ADDR(onethird), %xmm3 // xmm3 <-- 1/3 r
519 movapd %xmm4, %xmm5 // xmm5 <-- r
520 mulsd %xmm4, %xmm4 // xmm4 <-- r^2
521 subsd RELATIVE_ADDR(onehalf), %xmm3 // xmm3 <-- 1/3 r - 1/2
522 mulsd %xmm4, %xmm3 // xmm3 <-- 1/3 r^3 - 1/2 r^2
523 addsd %xmm5, %xmm3 // xmm3 <-- 1/3 r^3 - 1/2 r^2 + r ~ log(1+r)
524 addsd (AX_P, DX_P, 1), %xmm3 // xmm3 <-- log(a) + log(1+r)
525 addsd %xmm2, %xmm3 // xmm3 <-- log(2x) with error < .5002 ulps
526
527
528// Now we compute a correction from a polynomial in 1/|x|, which conveniently has landed in %xmm0 about now.
529
530 movsd -8(AX_P), %xmm6 // xmm6 <-- [ ... , c ]
531 movapd -32(AX_P), %xmm7 // xmm7 <-- [ b1hi, b1lo ]
532#if defined( __SSE3__ )
533 movddup %xmm0, %xmm2 // xmm2 <-- [ 1/x , 1/x ]
534#else
535 movapd %xmm0, %xmm2
536 unpcklpd %xmm2, %xmm2
537#endif
538 mulsd %xmm0, %xmm6 // xmm6 <-- [ ... , c/x ]
539 addpd %xmm2, %xmm7 // xmm7 <-- [ 1/x + b1hi, 1/x + b1lo ]
540 addsd -16(AX_P), %xmm0 // xmm0 <-- [ ... , 1/x + a ]
541 mulpd %xmm7, %xmm2 // xmm2 <-- [ 1/x^2 + b1hi/x, 1/x^2 + b1lo/x ]
542 mulsd %xmm6, %xmm0 // xmm0 <-- [ ... , c/x(1/x + a) ]
543 addpd -48(AX_P), %xmm2 // xmm2 <-- [ 1/x^2 + b1hi/x + b0hi, 1/x^2 + b1lo/x + b0lo ]
544 movhlps %xmm2, %xmm4 // xmm4 <-- [ ... , 1/x^2 + b1hi/x + b0hi ]
545 mulsd %xmm2, %xmm0 // xmm0 <-- [ ... , c/x(1/x + a)(1/x^2 + b1lo/x + b0lo) ]
546 mulsd %xmm4, %xmm0 // xmm0 <-- [ ... , c/x(1/x + a)(1/x^2 + b1lo/x + b0lo)(1/x^2 + b1hi/x + b0hi) ]
547
548// Add log(2x) to the correction to get the final result
549
550 addsd %xmm3, %xmm0
551 cvtsd2ss %xmm0, %xmm0
552 orps %xmm1, %xmm0
553#if defined(__i386__)
554 movss %xmm0, FRAME_SIZE( STACKP )
555 flds FRAME_SIZE( STACKP )
556#endif
557 ret
558
5594:
560 addss %xmm0, %xmm0 // quiet the NaN
5615:
562 #if defined(__i386__)
563 movss %xmm0, FRAME_SIZE( STACKP )
564 flds FRAME_SIZE( STACKP )
565#endif
566 ret
567