this repo has no description
1/*
2 * cbrtf.s
3 *
4 * by Stephen Canon
5 *
6 * Copyright (c) 2007, Apple Inc. All Rights Reserved.
7 *
8 * This file implements the cbrtf function for the MacOS X __i386__ and __x86_64__ ABIs.
9 */
10
11
12/*
13
14 Overview of algorithm used for cbrt(x):
15
16 1. Compute a reduced argument u in the range [1,8) such that 2^(3n) u = |x| for appropriate n.
17 2. Lookup an exact cube z such that |z - u| < 2^-6
18
19 We then have: cbrt(x) = sign(x) * 2^n * cbrt(z) * cbrt(1 + (u - z) * 1/z)
20
21 3. the mantissa of cbrt(z) is looked up in a table, and or-ed into the exponent 2^n
22 4. cbrt(1 + (u - z) * 1/z) is computed via a minimax polynomial of the form:
23
24 1 + cw(w + a)(w^2 + b1lo w + b0lo)(w^2 + b1hi w + b0hi)
25
26 where w = (u - z) * 1/z, u being the reduced argument from step 1
27
28 5. Finally, we multiply the two parts, or in the signbit and return.
29
30 This algorithm produces correctly rounded results for single precision, and avoids setting the inexact flag
31 for exact cube arguments.
32
33
34
35
36
37 The table also contains the value 1/cbrt(z). This gets us power-of-two alignment for the table entries, and also
38 can be used to implement a correctly rounded reciprocal cube root, which could be useful for computing small rational
39 powers if we ever need to provide an ipow function.
40
41 The algorithm for recip_cbrtf(x) would proceed exactly as above, except we instead compute:
42
43 recip_cbrt(x) = sign(x) * 2^-n * recip_cbrt(z) * p((u - z) * 1/z)
44
45 where p(w) is a polynomial approximation to 1 / cbrt(1 + w) on |w| < 2^-6. (6th order minimax will suffice)
46
47 - scanon, July 2007
48
49 */
50
51
52#include <machine/asm.h>
53#include "abi.h"
54
55.const
56.align 4 // NOTE(review): presumably a power-of-two exponent (16-byte alignment), which the movapd/movdqa loads of this data need -- confirm for the target assembler
57
58// Minimax polynomial coefficients (addressed by negative offsets from cbrt_table).
// The b coefficients are stored as 16-byte [ lo, hi ] pairs so that cbrtf can evaluate both
// quadratic factors at once with packed loads/adds:
//   cbrt_table-48: [ b1lo, b1hi ]   (movapd)
//   cbrt_table-32: [ b0lo, b0hi ]   (addpd)
//   cbrt_table-16: c                (movsd)
//   cbrt_table-8:  a                (addsd)
// Keep this block immediately before cbrt_table; the offsets above are hard-coded in cbrtf.
59 .quad 0x4000204182c17486, 0xbff8b5b876f0d973 // b1lo b1hi
60 .quad 0x4005826bbd26eb39, 0x40080ac10b5eee54 // b0lo b0hi
61 .quad 0xbf980b1460621e80, 0xbffc1c8e77867969 // c a
62
63// base point z | mantissa(cbrt(z)) | 1/cbrt(z) | 1/z (four doubles = 32 bytes per row)
64// ------------------------------------------------------------------------------
65cbrt_table: .quad 0x3ff0000000000000, 0x0000000000000000, 0x3ff0000000000000, 0x3ff0000000000000 // z ~ 0x1.00p0
66 .quad 0x3ff060c080000000, 0x0000200000000000, 0x3fefc07f01fc07f0, 0x3fef42f61dacddc6 // z ~ 0x1.04p0
67 .quad 0x3ff09fe97c0b2e80, 0x000034a000000000, 0x3fef9815c85b04a3, 0x3feecc3168ac46e4 // z ~ 0x1.08p0
68 .quad 0x3ff0c30400000000, 0x0000400000000000, 0x3fef81f81f81f820, 0x3fee8bb1d5b6e585 // z ~ 0x1.0cp0
69 .quad 0x3ff126cd80000000, 0x0000600000000000, 0x3fef44659e4a4271, 0x3fedd9fb30af3365 // z ~ 0x1.10p0
70 .quad 0x3ff15f9b5b480000, 0x0000720000000000, 0x3fef222c82dba316, 0x3fed786108fd7a9f // z ~ 0x1.14p0
71 .quad 0x3ff18c2000000000, 0x0000800000000000, 0x3fef07c1f07c1f08, 0x3fed2d9cbd756afd // z ~ 0x1.18p0
72 .quad 0x3ff1f2fe80000000, 0x0000a00000000000, 0x3feecc07b301ecc0, 0x3fec86636f753a66 // z ~ 0x1.1cp0
73 .quad 0x3ff21fac7ca59c00, 0x0000adc000000000, 0x3feeb2a412496abd, 0x3fec40112c606d3e // z ~ 0x1.20p0
74 .quad 0x3ff25b6c00000000, 0x0000c00000000000, 0x3fee9131abf0b767, 0x3febe41e7ee3f7ed // z ~ 0x1.24p0
75 .quad 0x3ff29ff9aaaa2c00, 0x0000d4c000000000, 0x3fee6b8275501adb, 0x3feb7d7596e80007 // z ~ 0x1.28p0
76 .quad 0x3ff2c56b80000000, 0x0000e00000000000, 0x3fee573ac901e574, 0x3feb469f4adc7794 // z ~ 0x1.2cp0
77 .quad 0x3ff3310000000000, 0x0001000000000000, 0x3fee1e1e1e1e1e1e, 0x3feaadb93d39ae9c // z ~ 0x1.30p0
78 .quad 0x3ff35fb6f4579c00, 0x00010dc000000000, 0x3fee05d5a24448c5, 0x3fea6d6548fa984d // z ~ 0x1.34p0
79 .quad 0x3ff39e2c80000000, 0x0001200000000000, 0x3fede5d6e3f8868a, 0x3fea1941b013022d // z ~ 0x1.38p0
80 .quad 0x3ff3dfc1312b0000, 0x0001330000000000, 0x3fedc4cfaf10eb5c, 0x3fe9c322b87f17e8 // z ~ 0x1.3cp0
81 .quad 0x3ff40cf400000000, 0x0001400000000000, 0x3fedae6076b981db, 0x3fe9890fd4bf368f // z ~ 0x1.40p0
82 .quad 0x3ff47d5980000000, 0x0001600000000000, 0x3fed77b654b82c34, 0x3fe8fcfc9c44e2f4 // z ~ 0x1.44p0
83 .quad 0x3ff49feb2bc0dc00, 0x000169c000000000, 0x3fed67366d6ddfd0, 0x3fe8d31a9f2d47fb // z ~ 0x1.48p0
84 .quad 0x3ff4ef6000000000, 0x0001800000000000, 0x3fed41d41d41d41d, 0x3fe874e2a121159f // z ~ 0x1.4cp0
85 .quad 0x3ff51ff889bc6000, 0x00018d8000000000, 0x3fed2b539aeee152, 0x3fe83ca00a5a8f32 // z ~ 0x1.50p0
86 .quad 0x3ff5630a80000000, 0x0001a00000000000, 0x3fed0cb58f6ec074, 0x3fe7f09e124e78b8 // z ~ 0x1.54p0
87 .quad 0x3ff59fc8db9a7e80, 0x0001b0a000000000, 0x3fecf1688b3b4e6a, 0x3fe7ad5e68ed5f8c // z ~ 0x1.58p0
88 .quad 0x3ff5d85c00000000, 0x0001c00000000000, 0x3fecd85689039b0b, 0x3fe7700c9f78cc63 // z ~ 0x1.5cp0
89 .quad 0x3ff61fbc0c515400, 0x0001d34000000000, 0x3fecb92ff3a86d65, 0x3fe7246f92d40d4c // z ~ 0x1.60p0
90 .quad 0x3ff64f5780000000, 0x0001e00000000000, 0x3feca4b3055ee191, 0x3fe6f30d6649f11b // z ~ 0x1.64p0
91 .quad 0x3ff69fc04b688980, 0x0001f56000000000, 0x3fec829b51036037, 0x3fe6a17c8a1a662e // z ~ 0x1.68p0
92 .quad 0x3ff6c80000000000, 0x0002000000000000, 0x3fec71c71c71c71c, 0x3fe67980e0bf08c7 // z ~ 0x1.6cp0
93 .quad 0x3ff71fc3c5870000, 0x0002170000000000, 0x3fec4d9cd40d7cfd, 0x3fe6243421ae7a84 // z ~ 0x1.70p0
94 .quad 0x3ff7425880000000, 0x0002200000000000, 0x3fec3f8f01c3f8f0, 0x3fe60348d4756756 // z ~ 0x1.74p0
95 .quad 0x3ff7be6400000000, 0x0002400000000000, 0x3fec0e070381c0e0, 0x3fe5904842e0271b // z ~ 0x1.78p0
96 .quad 0x3ff7dfa08e162000, 0x0002488000000000, 0x3fec00fc08dc4fbf, 0x3fe57242f8b50298 // z ~ 0x1.7cp0
97 .quad 0x3ff83c2580000000, 0x0002600000000000, 0x3febdd2b899406f7, 0x3fe520635a583b96 // z ~ 0x1.80p0
98 .quad 0x3ff85fd33ff90000, 0x0002690000000000, 0x3febcf8c69606a07, 0x3fe50176a58004f0 // z ~ 0x1.84p0
99 .quad 0x3ff8bba000000000, 0x0002800000000000, 0x3febacf914c1bad0, 0x3fe4b37f67f9d05c // z ~ 0x1.88p0
100 .quad 0x3ff8dfca52590000, 0x0002890000000000, 0x3feb9f88e001b9f9, 0x3fe495664ea7f47d // z ~ 0x1.8cp0
101 .quad 0x3ff93cd680000000, 0x0002a00000000000, 0x3feb7d6c3dda338b, 0x3fe44982ca42a2eb // z ~ 0x1.90p0
102 .quad 0x3ff95ff68a951e80, 0x0002a8a000000000, 0x3feb70b72f76e7dd, 0x3fe42d6dab45c848 // z ~ 0x1.94p0
103 .quad 0x3ff9bfcc00000000, 0x0002c00000000000, 0x3feb4e81b4e81b4f, 0x3fe3e254e465d72c // z ~ 0x1.98p0
104 .quad 0x3ff9dfc708557c00, 0x0002c7c000000000, 0x3feb433cf4756912, 0x3fe3c9c1357411b6 // z ~ 0x1.9cp0
105 .quad 0x3ffa1f8756df7480, 0x0002d72000000000, 0x3feb2cfd6b4a2ec0, 0x3fe39976b1b376fb // z ~ 0x1.a0p0
106 .quad 0x3ffa448380000000, 0x0002e00000000000, 0x3feb2036406c80d9, 0x3fe37dde124a87f2 // z ~ 0x1.a4p0
107 .quad 0x3ffa9fbaa05b1c00, 0x0002f5c000000000, 0x3feb01182b5ac1ce, 0x3fe33b1676d97a5b // z ~ 0x1.a8p0
108 .quad 0x3ffacb0000000000, 0x0003000000000000, 0x3feaf286bca1af28, 0x3fe31c079d2b089f // z ~ 0x1.acp0
109 .quad 0x3ffb1ff52f400000, 0x0003140000000000, 0x3fead646ddd321c2, 0x3fe2e02d4701d501 // z ~ 0x1.b0p0
110 .quad 0x3ffb534480000000, 0x0003200000000000, 0x3feac5701ac5701b, 0x3fe2bcbbb0cb73f6 // z ~ 0x1.b4p0
111 .quad 0x3ffb9fa0378e5c00, 0x000331c000000000, 0x3feaacae5fd5e77d, 0x3fe288f0567537ff // z ~ 0x1.b8p0
112 .quad 0x3ffbdd5400000000, 0x0003400000000000, 0x3fea98ef606a63be, 0x3fe25fe5513ebf45 // z ~ 0x1.bcp0
113 .quad 0x3ffc1fc1c0569400, 0x00034f4000000000, 0x3fea83eded1251e7, 0x3fe2347ec39d66b0 // z ~ 0x1.c0p0
114 .quad 0x3ffc693180000000, 0x0003600000000000, 0x3fea6d01a6d01a6d, 0x3fe2057051321929 // z ~ 0x1.c4p0
115 .quad 0x3ffc9fc4ad339d80, 0x00036c6000000000, 0x3fea5c2b87b4e25a, 0x3fe1e3144d16fd97 // z ~ 0x1.c8p0
116 .quad 0x3ffcf6e000000000, 0x0003800000000000, 0x3fea41a41a41a41a, 0x3fe1ad4948b6e145 // z ~ 0x1.ccp0
117 .quad 0x3ffd1f9c6201cc80, 0x0003892000000000, 0x3fea35607552f1cd, 0x3fe1948fa1f5ff30 // z ~ 0x1.d0p0
118 .quad 0x3ffd5f8615bde180, 0x0003976000000000, 0x3fea22504db000b7, 0x3fe16e4ee12da718 // z ~ 0x1.d4p0
119 .quad 0x3ffd866280000000, 0x0003a00000000000, 0x3fea16d3f97a4b02, 0x3fe1575d8c8402f4 // z ~ 0x1.d8p0
120 .quad 0x3ffddfdfe805bc00, 0x0003b3c000000000, 0x3fe9fcacece0b241, 0x3fe1236b509d4023 // z ~ 0x1.dcp0
121 .quad 0x3ffe17bc00000000, 0x0003c00000000000, 0x3fe9ec8e951033d9, 0x3fe1039b25a7f122 // z ~ 0x1.e0p0
122 .quad 0x3ffe5ff3ecf6fc00, 0x0003cfc000000000, 0x3fe9d7f292cef9ba, 0x3fe0db275be001a6 // z ~ 0x1.e4p0
123 .quad 0x3ffeaaef80000000, 0x0003e00000000000, 0x3fe9c2d14ee4a102, 0x3fe0b1f0c9a4ed7c // z ~ 0x1.e8p0
124 .quad 0x3ffedfb5912a5180, 0x0003eb6000000000, 0x3fe9b41b55ca11fc, 0x3fe0956733c0be03 // z ~ 0x1.ecp0
125 .quad 0x3fff1fd112ab0c80, 0x0003f92000000000, 0x3fe9a2696dd75ba1, 0x3fe0733ed7907e73 // z ~ 0x1.f0p0
126 .quad 0x3fff400000000000, 0x0004000000000000, 0x3fe999999999999a, 0x3fe0624dd2f1a9fc // z ~ 0x1.f4p0
127 .quad 0x3fff9fe36d7a7d80, 0x0004146000000000, 0x3fe97f9f956c92fd, 0x3fe030a055aebedd // z ~ 0x1.f8p0
128 .quad 0x3fffd6f080000000, 0x0004200000000000, 0x3fe970e4f80cb872, 0x3fe014a239d8b1a9 // z ~ 0x1.fcp0
129 .quad 0x400037e200000000, 0x0004400000000000, 0x3fe948b0fcd6e9e0, 0x3fdf91bd1b62b9cf // z ~ 0x1.00p1
130 .quad 0x40005ff4c356ff40, 0x000450a000000000, 0x3fe933fff9b30002, 0x3fdf447b132ca3ac // z ~ 0x1.04p1
131 .quad 0x4000853ec0000000, 0x0004600000000000, 0x3fe920fb49d0e229, 0x3fdefde7dcdacefd // z ~ 0x1.08p1
132 .quad 0x4000d39000000000, 0x0004800000000000, 0x3fe8f9c18f9c18fa, 0x3fde6da80ced1523 // z ~ 0x1.0cp1
133 .quad 0x400122d740000000, 0x0004a00000000000, 0x3fe8d3018d3018d3, 0x3fdde0e209af882e // z ~ 0x1.10p1
134 .quad 0x4001731600000000, 0x0004c00000000000, 0x3fe8acb90f6bf3aa, 0x3fdd577b2f5c6f87 // z ~ 0x1.14p1
135 .quad 0x40019fb2ce620540, 0x0004d1a000000000, 0x3fe897d564f5cf98, 0x3fdd0d34ccd78141 // z ~ 0x1.18p1
136 .quad 0x4001c44dc0000000, 0x0004e00000000000, 0x3fe886e5f0abb04a, 0x3fdcd159cdbba714 // z ~ 0x1.1cp1
137 .quad 0x4002168000000000, 0x0005000000000000, 0x3fe8618618618618, 0x3fdc4e651e0c37d7 // z ~ 0x1.20p1
138 .quad 0x400269ae40000000, 0x0005200000000000, 0x3fe83c977ab2bedd, 0x3fdbce853967753c // z ~ 0x1.24p1
139 .quad 0x4002bdda00000000, 0x0005400000000000, 0x3fe8181818181818, 0x3fdb51a30f9739f8 // z ~ 0x1.28p1
140 .quad 0x4002dfff74f29dc0, 0x00054ce000000000, 0x3fe80987c755886a, 0x3fdb203708429799 // z ~ 0x1.2cp1
141 .quad 0x40031304c0000000, 0x0005600000000000, 0x3fe7f405fd017f40, 0x3fdad7a85e593e54 // z ~ 0x1.30p1
142 .quad 0x4003693000000000, 0x0005800000000000, 0x3fe7d05f417d05f4, 0x3fda607fa909db1f // z ~ 0x1.34p1
143 .quad 0x40039fe541ac7840, 0x0005942000000000, 0x3fe7ba298eae8947, 0x3fda16f787114257 // z ~ 0x1.38p1
144 .quad 0x4003c05d40000000, 0x0005a00000000000, 0x3fe7ad2208e0ecc3, 0x3fd9ec1430b0dfc7 // z ~ 0x1.3cp1
145 .quad 0x4004188e00000000, 0x0005c00000000000, 0x3fe78a4c8178a4c8, 0x3fd97a51ec6b707e // z ~ 0x1.40p1
146 .quad 0x400471c3c0000000, 0x0005e00000000000, 0x3fe767dce434a9b1, 0x3fd90b25822e2a9f // z ~ 0x1.44p1
147 .quad 0x40049fcfb130a6c0, 0x0005f06000000000, 0x3fe75664a1a72c8d, 0x3fd8d33bb2686480 // z ~ 0x1.48p1
148 .quad 0x4004cc0000000000, 0x0006000000000000, 0x3fe745d1745d1746, 0x3fd89e7c3fdb1246 // z ~ 0x1.4cp1
149 .quad 0x4005274440000000, 0x0006200000000000, 0x3fe724287f46debc, 0x3fd8344414a70cbd // z ~ 0x1.50p1
150 .quad 0x40055fc05a5df140, 0x000633a000000000, 0x3fe70fb3e12b41c4, 0x3fd7f44d50c76c8e // z ~ 0x1.54p1
151 .quad 0x4005839200000000, 0x0006400000000000, 0x3fe702e05c0b8170, 0x3fd7cc6b8acae7cb // z ~ 0x1.58p1
152 .quad 0x4005e0eac0000000, 0x0006600000000000, 0x3fe6e1f76b4337c7, 0x3fd766e1c17c26ec // z ~ 0x1.5cp1
153 .quad 0x40063f5000000000, 0x0006800000000000, 0x3fe6c16c16c16c17, 0x3fd70396672a04e5 // z ~ 0x1.60p1
154 .quad 0x40065fa1cdfa11c0, 0x00068ae000000000, 0x3fe6b671c62a2d0a, 0x3fd6e257c2026aef // z ~ 0x1.64p1
155 .quad 0x40069ec340000000, 0x0006a00000000000, 0x3fe6a13cd1537290, 0x3fd6a279b3fb4a4e // z ~ 0x1.68p1
156 .quad 0x4006ff4600000000, 0x0006c00000000000, 0x3fe6816816816817, 0x3fd6437c6489c8e0 // z ~ 0x1.6cp1
157 .quad 0x40071fef1bff2600, 0x0006cac000000000, 0x3fe676caae4b2e0f, 0x3fd6240aa2fa0dfd // z ~ 0x1.70p1
158 .quad 0x400760d9c0000000, 0x0006e00000000000, 0x3fe661ec6a5122f9, 0x3fd5e68fb4d877a7 // z ~ 0x1.74p1
159 .quad 0x40079fec8fa79000, 0x0006f48000000000, 0x3fe64def50b37b22, 0x3fd5ac1740057116 // z ~ 0x1.78p1
160 .quad 0x4007c38000000000, 0x0007000000000000, 0x3fe642c8590b2164, 0x3fd58ba55b815609 // z ~ 0x1.7cp1
161 .quad 0x4008273a40000000, 0x0007200000000000, 0x3fe623fa77016240, 0x3fd532af851862ac // z ~ 0x1.80p1
162 .quad 0x40085fccde240000, 0x0007320000000000, 0x3fe612cc01b977f0, 0x3fd5017c2589970e // z ~ 0x1.84p1
163 .quad 0x40088c0a00000000, 0x0007400000000000, 0x3fe6058160581606, 0x3fd4dba0cfc11861 // z ~ 0x1.88p1
164 .quad 0x4008f1f0c0000000, 0x0007600000000000, 0x3fe5e75bb8d015e7, 0x3fd4866c46f405db // z ~ 0x1.8cp1
165 .quad 0x40091fabaf07d200, 0x00076e4000000000, 0x3fe5da09741396f7, 0x3fd461102bc1cb8f // z ~ 0x1.90p1
166 .quad 0x400958f000000000, 0x0007800000000000, 0x3fe5c9882b931057, 0x3fd433055f7235db // z ~ 0x1.94p1
167 .quad 0x40099ffaac1ec3c0, 0x000795e000000000, 0x3fe5b55320eae3fd, 0x3fd3fb056724ebb2 // z ~ 0x1.98p1
168 .quad 0x4009c10940000000, 0x0007a00000000000, 0x3fe5ac056b015ac0, 0x3fd3e15ff3643c49 // z ~ 0x1.9cp1
169 .quad 0x400a2a3e00000000, 0x0007c00000000000, 0x3fe58ed2308158ed, 0x3fd391703ea2d9b9 // z ~ 0x1.a0p1
170 .quad 0x400a5fad7a3ee040, 0x0007d02000000000, 0x3fe580391c97b3f3, 0x3fd369cab16c4bb8 // z ~ 0x1.a4p1
171 .quad 0x400a948fc0000000, 0x0007e00000000000, 0x3fe571ed3c506b3a, 0x3fd3432adb274266 // z ~ 0x1.a8p1
172 .quad 0x400adffcaf535000, 0x0007f68000000000, 0x3fe55dca75792aa1, 0x3fd30d1b5accf7d2 // z ~ 0x1.acp1
173 .quad 0x400b000000000000, 0x0008000000000000, 0x3fe5555555555555, 0x3fd2f684bda12f68 // z ~ 0x1.b0p1
174 .quad 0x400b6c9040000000, 0x0008200000000000, 0x3fe5390948f40feb, 0x3fd2ab733230f96f // z ~ 0x1.b4p1
175 .quad 0x400b9fd76ec78000, 0x00082f0000000000, 0x3fe52bdf6a7a2620, 0x3fd288cb4a41a9b5 // z ~ 0x1.b8p1
176 .quad 0x400bda4200000000, 0x0008400000000000, 0x3fe51d07eae2f815, 0x3fd261ebd944131e // z ~ 0x1.bcp1
177 .quad 0x400c1fd3bf5cf840, 0x0008542000000000, 0x3fe50b90cb22a299, 0x3fd234731d751ccc // z ~ 0x1.c0p1
178 .quad 0x400c4916c0000000, 0x0008600000000000, 0x3fe5015015015015, 0x3fd219e4a4924f1f // z ~ 0x1.c4p1
179 .quad 0x400cb91000000000, 0x0008800000000000, 0x3fe4e5e0a72f0539, 0x3fd1d353d43a7247 // z ~ 0x1.c8p1
180 .quad 0x400cdfd181598000, 0x00088b0000000000, 0x3fe4dc82df5d0542, 0x3fd1bb66cda74540 // z ~ 0x1.ccp1
181 .quad 0x400d2a2f40000000, 0x0008a00000000000, 0x3fe4cab88725af6e, 0x3fd18e2ff3fca5ac // z ~ 0x1.d0p1
182 .quad 0x400d5f9b87878000, 0x0008af0000000000, 0x3fe4be15f5393e98, 0x3fd16e4227697dbf // z ~ 0x1.d4p1
183 .quad 0x400d9c7600000000, 0x0008c00000000000, 0x3fe4afd6a052bf5b, 0x3fd14a6fd8916ecf // z ~ 0x1.d8p1
184 .quad 0x400ddff55aa1e600, 0x0008d2c000000000, 0x3fe4a036770fd266, 0x3fd1235f02ce295a // z ~ 0x1.dcp1
185 .quad 0x400e0fe5c0000000, 0x0008e00000000000, 0x3fe49539e3b2d067, 0x3fd1080a9d1be542 // z ~ 0x1.e0p1
186 .quad 0x400e5fefa40c0000, 0x0008f60000000000, 0x3fe48315b6c3fc79, 0x3fd0db29bc986108 // z ~ 0x1.e4p1
187 .quad 0x400e848000000000, 0x0009000000000000, 0x3fe47ae147ae147b, 0x3fd0c6f7a0b5ed8d // z ~ 0x1.e8p1
188 .quad 0x400efa4640000000, 0x0009200000000000, 0x3fe460cbc7f5cf9a, 0x3fd0872e8415508d // z ~ 0x1.ecp1
189 .quad 0x400f1fc8b255bc40, 0x00092a2000000000, 0x3fe45898cb57730c, 0x3fd0734344eaebef // z ~ 0x1.f0p1
190 .quad 0x400f713a00000000, 0x0009400000000000, 0x3fe446f86562d9fb, 0x3fd048a727489527 // z ~ 0x1.f4p1
191 .quad 0x400f9f8b6ce70ec0, 0x00094c6000000000, 0x3fe43d0d2af8e146, 0x3fd030cd637fd65e // z ~ 0x1.f8p1
192 .quad 0x400fe95cc0000000, 0x0009600000000000, 0x3fe42d6625d51f87, 0x3fd00b59a78a8ffc // z ~ 0x1.fcp1
193 .quad 0x4010315800000000, 0x0009800000000000, 0x3fe4141414141414, 0x3fcf9e7cba5753af // z ~ 0x1.00p2
194 .quad 0x40106e9aa0000000, 0x0009a00000000000, 0x3fe3fb013fb013fb, 0x3fcf289bb31fd41c // z ~ 0x1.04p2
195 .quad 0x4010ac7700000000, 0x0009c00000000000, 0x3fe3e22cbce4a902, 0x3fceb501ca81bb3e // z ~ 0x1.08p2
196 .quad 0x4010eaede0000000, 0x0009e00000000000, 0x3fe3c995a47babe7, 0x3fce43a0fc24fe4b // z ~ 0x1.0cp2
197 .quad 0x40112a0000000000, 0x000a000000000000, 0x3fe3b13b13b13b14, 0x3fcdd46baab49c24 // z ~ 0x1.10p2
198 .quad 0x401169ae20000000, 0x000a200000000000, 0x3fe3991c2c187f63, 0x3fcd67549c6f9b67 // z ~ 0x1.14p2
199 .quad 0x4011a9f900000000, 0x000a400000000000, 0x3fe3813813813814, 0x3fccfc4ef7db5bff // z ~ 0x1.18p2
200 .quad 0x4011eae160000000, 0x000a600000000000, 0x3fe3698df3de0748, 0x3fcc934e4095d202 // z ~ 0x1.1cp2
201 .quad 0x40122c6800000000, 0x000a800000000000, 0x3fe3521cfb2b78c1, 0x3fcc2c46544650c1 // z ~ 0x1.20p2
202 .quad 0x40126e8da0000000, 0x000aa00000000000, 0x3fe33ae45b57bcb2, 0x3fcbc72b67ab9ce7 // z ~ 0x1.24p2
203 .quad 0x4012b15300000000, 0x000ac00000000000, 0x3fe323e34a2b10bf, 0x3fcb63f203c60c07 // z ~ 0x1.28p2
204 .quad 0x4012f4b8e0000000, 0x000ae00000000000, 0x3fe30d190130d190, 0x3fcb028f031c8644 // z ~ 0x1.2cp2
205 .quad 0x401338c000000000, 0x000b000000000000, 0x3fe2f684bda12f68, 0x3fcaa2f78f1b4cc6 // z ~ 0x1.30p2
206 .quad 0x40137d6920000000, 0x000b200000000000, 0x3fe2e025c04b8097, 0x3fca45211d8b748a // z ~ 0x1.34p2
207 .quad 0x40139ffaac000000, 0x000b300000000000, 0x3fe2d50a012d50a0, 0x3fca16db0ec408b2 // z ~ 0x1.38p2
208 .quad 0x4013c2b500000000, 0x000b400000000000, 0x3fe2c9fb4d812ca0, 0x3fc9e9016e2211b6 // z ~ 0x1.3cp2
209 .quad 0x401408a460000000, 0x000b600000000000, 0x3fe2b404ad012b40, 0x3fc98e8e88261b62 // z ~ 0x1.40p2
210 .quad 0x40144f3800000000, 0x000b800000000000, 0x3fe29e4129e4129e, 0x3fc935beb82c1ae7 // z ~ 0x1.44p2
211 .quad 0x40149670a0000000, 0x000ba00000000000, 0x3fe288b01288b013, 0x3fc8de888de6c48f // z ~ 0x1.48p2
212 .quad 0x4014de4f00000000, 0x000bc00000000000, 0x3fe27350b8812735, 0x3fc888e2da0ba19d // z ~ 0x1.4cp2
213 .quad 0x401526d3e0000000, 0x000be00000000000, 0x3fe25e22708092f1, 0x3fc834c4ac4afd3b // z ~ 0x1.50p2
214 .quad 0x4015700000000000, 0x000c000000000000, 0x3fe2492492492492, 0x3fc7e225515a4f1d // z ~ 0x1.54p2
215 .quad 0x4015b9d420000000, 0x000c200000000000, 0x3fe23456789abcdf, 0x3fc790fc51106751 // z ~ 0x1.58p2
216 .quad 0x4015dfdce5811360, 0x000c306000000000, 0x3fe229c346a04441, 0x3fc7680273c586ed // z ~ 0x1.5cp2
217 .quad 0x4016045100000000, 0x000c400000000000, 0x3fe21fb78121fb78, 0x3fc741416c92a70b // z ~ 0x1.60p2
218 .quad 0x40164f7760000000, 0x000c600000000000, 0x3fe20b470c67c0d9, 0x3fc6f2ec9c929a29 // z ~ 0x1.64p2
219 .quad 0x40169b4800000000, 0x000c800000000000, 0x3fe1f7047dc11f70, 0x3fc6a5f60f9b4c97 // z ~ 0x1.68p2
220 .quad 0x4016e7c3a0000000, 0x000ca00000000000, 0x3fe1e2ef3b3fb874, 0x3fc65a56286dbe08 // z ~ 0x1.6cp2
221 .quad 0x401734eb00000000, 0x000cc00000000000, 0x3fe1cf06ada2811d, 0x3fc610057c6bdd38 // z ~ 0x1.70p2
222 .quad 0x40175fb34f0902a0, 0x000cd1a000000000, 0x3fe1c4227955e4f1, 0x3fc5e7a396f89f71 // z ~ 0x1.74p2
223 .quad 0x401782bee0000000, 0x000ce00000000000, 0x3fe1bb4a4046ed29, 0x3fc5c6fcd2117a65 // z ~ 0x1.78p2
224 .quad 0x4017d14000000000, 0x000d000000000000, 0x3fe1a7b9611a7b96, 0x3fc57f351f7aa6ea // z ~ 0x1.7cp2
225 .quad 0x4018206f20000000, 0x000d200000000000, 0x3fe19453808ca29c, 0x3fc538a788f6fdd6 // z ~ 0x1.80p2
226 .quad 0x4018704d00000000, 0x000d400000000000, 0x3fe1811811811812, 0x3fc4f34d5fa956d6 // z ~ 0x1.84p2
227 .quad 0x40189fbb1ca4e0e0, 0x000d52e000000000, 0x3fe175d3b160af03, 0x3fc4caf2b205f9dd // z ~ 0x1.88p2
228 .quad 0x4018c0da60000000, 0x000d600000000000, 0x3fe16e0689427379, 0x3fc4af2020336a59 // z ~ 0x1.8cp2
229 .quad 0x4019121800000000, 0x000d800000000000, 0x3fe15b1e5f75270d, 0x3fc46c19716cf2c0 // z ~ 0x1.90p2
230 .quad 0x40196406a0000000, 0x000da00000000000, 0x3fe1485f0e0acd3b, 0x3fc42a332325db6b // z ~ 0x1.94p2
231 .quad 0x4019b6a700000000, 0x000dc00000000000, 0x3fe135c81135c811, 0x3fc3e9672cf3131d // z ~ 0x1.98p2
232 .quad 0x4019dfe6c1816fe0, 0x000dcfe000000000, 0x3fe12c9df926137b, 0x3fc3c9a8f2a1f8a5 // z ~ 0x1.9cp2
233 .quad 0x401a09f9e0000000, 0x000de00000000000, 0x3fe12358e75d3033, 0x3fc3a9afad059b87 // z ~ 0x1.a0p2
234 .quad 0x401a5e0000000000, 0x000e000000000000, 0x3fe1111111111111, 0x3fc36b06e70b7421 // z ~ 0x1.a4p2
235 .quad 0x401ab2ba20000000, 0x000e200000000000, 0x3fe0fef010fef011, 0x3fc32d67431a0280 // z ~ 0x1.a8p2
236 .quad 0x401adfb1053dbae0, 0x000e30e000000000, 0x3fe0f57023f898dc, 0x3fc30d50fe844fd2 // z ~ 0x1.acp2
237 .quad 0x401b082900000000, 0x000e400000000000, 0x3fe0ecf56be69c90, 0x3fc2f0cb4ca19e1e // z ~ 0x1.b0p2
238 .quad 0x401b5e4d60000000, 0x000e600000000000, 0x3fe0db20a88f4696, 0x3fc2b52db169e95e // z ~ 0x1.b4p2
239 .quad 0x401bb52800000000, 0x000e800000000000, 0x3fe0c9714fbcda3b, 0x3fc27a894096a4f5 // z ~ 0x1.b8p2
240 .quad 0x401bdfd332712ca0, 0x000e8fa000000000, 0x3fe0c0dc264ce74b, 0x3fc25e3ff656ec87 // z ~ 0x1.bcp2
241 .quad 0x401c0cb9a0000000, 0x000ea00000000000, 0x3fe0b7e6ec259dc8, 0x3fc240d8e9b4ae5d // z ~ 0x1.c0p2
242 .quad 0x401c650300000000, 0x000ec00000000000, 0x3fe0a6810a6810a7, 0x3fc20817bbcedd1f // z ~ 0x1.c4p2
243 .quad 0x401cbe04e0000000, 0x000ee00000000000, 0x3fe0953f39010954, 0x3fc1d040e48a75cd // z ~ 0x1.c8p2
244 .quad 0x401cdfeef0724420, 0x000eec2000000000, 0x3fe08ebe9d4e24ae, 0x3fc1bb54ba55bb8e // z ~ 0x1.ccp2
245 .quad 0x401d17c000000000, 0x000f000000000000, 0x3fe0842108421084, 0x3fc1994faf4aec92 // z ~ 0x1.d0p2
246 .quad 0x401d723520000000, 0x000f200000000000, 0x3fe073260a47f7c6, 0x3fc1633f845cb3de // z ~ 0x1.d4p2
247 .quad 0x401d9fb5ac000000, 0x000f300000000000, 0x3fe06ab59c7912fb, 0x3fc1488a6b10c148 // z ~ 0x1.d8p2
248 .quad 0x401dcd6500000000, 0x000f400000000000, 0x3fe0624dd2f1a9fc, 0x3fc12e0be826d695 // z ~ 0x1.dcp2
249 .quad 0x401e295060000000, 0x000f600000000000, 0x3fe05197f7d73404, 0x3fc0f9b07a631f92 // z ~ 0x1.e0p2
250 .quad 0x401e5fe06d9140e0, 0x000f72e000000000, 0x3fe047cca585fbe4, 0x3fc0db322dce8431 // z ~ 0x1.e4p2
251 .quad 0x401e85f800000000, 0x000f800000000000, 0x3fe0410410410410, 0x3fc0c628f55c92de // z ~ 0x1.e8p2
252 .quad 0x401ee35ca0000000, 0x000fa00000000000, 0x3fe03091b51f5e1a, 0x3fc093712d33ff42 // z ~ 0x1.ecp2
253 .quad 0x401f1ff2ff2d4ba0, 0x000fb4a000000000, 0x3fe02609989a73cf, 0x3fc0732ce999c3d1 // z ~ 0x1.f0p2
254 .quad 0x401f417f00000000, 0x000fc00000000000, 0x3fe0204081020408, 0x3fc061850f2a7123 // z ~ 0x1.f4p2
255 .quad 0x401fa05fe0000000, 0x000fe00000000000, 0x3fe0101010101010, 0x3fc03060a0f151c2 // z ~ 0x1.f8p2
256 .quad 0x401fdfcad4496d00, 0x000ff54000000000, 0x3fe00561cedb7dbc, 0x3fc0102adb93eef1 // z ~ 0x1.fcp2
257
// Constants used by cbrtf to split the float exponent into quotient and remainder mod 3.
// Each symbol is a 16-byte pair [ low lane, high lane ]; the lanes are processed in parallel.
//
// magic_numbers: pmuludq multipliers, approximately 2^25/3 (low lane) and 2^23/3 (high lane).
//                Multiplying the integer exponent field by these and masking with exponent_mask
//                leaves the remainder mod 3 in the low lane and the quotient by 3 in the high lane.
// exponent_mask: low lane keeps bits 23-24 (the remainder), high lane keeps bits 23-30 (the quotient),
//                i.e. both land in the exponent-field position of a single-precision float.
// exponent_bias: added with paddq after both lanes are shifted left by 29 into double layout.
//                The low lane (0x3ff0...) is the double bit pattern of 1.0; the high lane carries
//                the exponent offset (field value 0x3d5) for normal inputs.
// denormal_bias: same role on the denormal path; its high-lane exponent field (0x3ab) is 42 less
//                than that of exponent_bias, compensating for the prescaling done on that path.
258magic_numbers: .quad 0x0000000000aaaaab, 0x00000000002aaaab
259exponent_mask: .quad 0x0000000001800000, 0x000000007f800000
260exponent_bias: .quad 0x3ff0000000000000, 0x3d50000000000000
261denormal_bias: .quad 0x3ff0000000000000, 0x3ab0000000000000
262
263.text
264.align 4
265
// RELATIVE_ADDR(_a) builds a position-independent memory operand for the symbol _a.
//   x86_64:    plain RIP-relative addressing.
//   otherwise: the offset of _a from cbrtf_body, based on %ecx. This is only valid while %ecx
//              holds the run-time address of cbrtf_body, which cbrtf sets up by calling
//              cbrtf_pic immediately before that label.
266#if defined( __x86_64__ )
267 #define RELATIVE_ADDR( _a ) (_a)( %rip )
268#else
269 #define RELATIVE_ADDR( _a ) (_a)-cbrtf_body( %ecx )
// cbrtf_pic: PIC helper. Returns in %ecx the return address of the call that reached it,
// i.e. the address of the instruction following the call (cbrtf_body). Leaves the stack unchanged.
270cbrtf_pic:
271 movl (%esp), %ecx // %ecx <-- return address at top of stack (address of the instruction after the call)
272 ret
273#endif
274
//-----------------------------------------------------------------------------------------------
// float cbrtf(float x)
//
// Single-precision cube root; see the algorithm overview in the header comment of this file.
//
// In:    x86_64: x in %xmm0.
//        i386:   x read from FRAME_SIZE(STACKP).
// Out:   x86_64: result in %xmm0.
//        i386:   result stored back to FRAME_SIZE(STACKP) and loaded onto the x87 stack with flds.
// Uses:  %eax, the register named by DX_P, %xmm0-%xmm6, flags; on i386 also %ecx (PIC base).
//
// ENTRY, FRAME_SIZE, STACKP, AX_P and DX_P are not defined in this file; presumably they come
// from <machine/asm.h> / "abi.h", with AX_P / DX_P being pointer-width names for the a / d
// registers -- confirm against abi.h.
//
// Special cases (handled at labels 3, 4, 5):
//   +-0      returned unchanged
//   +-inf    returned unchanged
//   NaN      returns x + x
//   denormal rescaled to a normal float, then re-enters the main path at label 2
//-----------------------------------------------------------------------------------------------
275ENTRY(cbrtf)
276#if defined(__i386__)
277 movl FRAME_SIZE(STACKP), %eax
278 movss FRAME_SIZE(STACKP), %xmm0
279// PIC setup: after this call %ecx holds the address of cbrtf_body, as RELATIVE_ADDR requires.
280 calll cbrtf_pic
281cbrtf_body:
282#else
283 movd %xmm0, %eax
284#endif
285
286 andl $0x7fffffff, %eax // eax <-- bits of |x|
287 movd %eax, %xmm1 // xmm1 <-- |x|
288
289 subl $0x00800000, %eax // subtract 1 from the exponent field: 0 and denormals wrap negative
290 cmpl $0x7f000000, %eax // unsigned test: if ( |x| < 2^-126 or |x| = inf or isnan(x) )
291 jae 3f // goto 3 (special cases); flags from this cmpl are reused there
292
293// Normal numbers code path
294 xorps %xmm1, %xmm0 // xmm0 <-- signbit(x) (x xor |x|)
295 movdqa RELATIVE_ADDR(exponent_bias), %xmm3 // xmm3 <-- double-format biases for normal inputs
296 movd %eax, %xmm1 // xmm1 <-- |x| with exponent field decremented by 1
297
298// Denormals re-enter code path here, using denormal exponent bias to rescale
// On entry to 2: eax and low(xmm1) hold the working float bits, xmm3 the bias pair, xmm0 the sign.
2992:
300 psrld $23, %xmm1 // xmm1 <-- biased exponent(x) - 1 as integer
301 movlhps %xmm1, %xmm1 // duplicate it into the high lane
302 pmuludq RELATIVE_ADDR(magic_numbers), %xmm1 // magic number multiply and mask to put (e-1)/3 and (e-1)%3
303 pand RELATIVE_ADDR(exponent_mask), %xmm1 // in the high and low parts of xmm1, respectively.
304 andl $0x007fffff, %eax // mask sign and exponent bits to 0.
305 movd %eax, %xmm2 // xmm2 <-- mantissa(x)
306 por %xmm2, %xmm1 // xmm1 <-- [ (e-1)/3, (e-1)%3 | mantissa ]
307
308 lea RELATIVE_ADDR(cbrt_table), DX_P // DX_P <-- address of cbrt_table
309 movd %xmm1, %eax // eax <-- (e-1)%3 | mantissa
310#if defined(__x86_64__)
// NOTE(review): the andl below writes %eax, which zero-extends into %rax, so this sign
// extension appears to have no effect on the index -- confirm before removing.
311 cdqe // sign extend rax
312#endif
313 andl $0x01fe0000, %eax // keep only the top six mantissa bits and the two (e-1)%3 bits (bits 17-24)
314 shrl $(17-5), %eax // shift into position for lookup (table entry size is 32 = 2^5)
315
316 psllq $29, %xmm1 // convert xmm1 to double via shift and bias add.
317 paddq %xmm3, %xmm1 // low lane: reduced argument as a double; high lane: power-of-two scale
318
319 movhlps %xmm1, %xmm2 // unpack high(xmm1) = 2^floor(exponent(x)/3)
320 // x is now reduced to the range 1-8 via multiplication by 2^(-3n)
321 // i.e., x has the biased exponent (e-1)%3 = (e-127)%3
// Table row layout (byte offsets from the row start): 0 = z, 8 = mantissa(cbrt(z)), 24 = 1/z.
322 subsd (DX_P,AX_P,1), %xmm1 // xmm1 <-- (x - z) where z ~ x is an exact cube
323 mulsd 24(DX_P,AX_P,1), %xmm1 // xmm1 <-- (x - z)/z = r
324 movsd 8(DX_P,AX_P,1), %xmm3 // xmm3 <-- mantissa(cbrt(z))
325 orpd %xmm3, %xmm2 // xmm2 <-- 2^floor(exponent(x)/3) * cbrt(z) = w
326 // computed by or-ing in the mantissa of cbrt(z)
327
328 // polynomial approximation p(r) ~ cbrt(1 + r) - 1:
//   p(r) = c r (r + a)(r^2 + b1lo r + b0lo)(r^2 + b1hi r + b0hi)
// The two quadratic factors are evaluated together in the low/high lanes of xmm6.
329 movsd -16(DX_P), %xmm4 // xmm4 <-- c
330 movapd -48(DX_P), %xmm5 // xmm5 <-- [ b1hi, b1lo ]
331#if defined( __SSE3__ )
332 movddup %xmm1, %xmm6 // xmm6 <-- [ r, r ]
333#else
334 movapd %xmm1, %xmm6 // xmm6 <-- [ r, r ] without SSE3: copy, then
335 unpcklpd %xmm6, %xmm6 // duplicate the low lane into the high lane
336#endif
337 mulsd %xmm1, %xmm4 // xmm4 <-- c r
338 addpd %xmm6, %xmm5 // xmm5 <-- [ r + b1hi, r + b1lo ]
339 addsd -8(DX_P), %xmm1 // xmm1 <-- r + a
340 mulpd %xmm5, %xmm6 // xmm6 <-- [ r^2 + b1hi r, r^2 + b1lo r ]
341 mulsd %xmm4, %xmm1 // xmm1 <-- c r (r + a)
342 addpd -32(DX_P), %xmm6 // xmm6 <-- [ r^2 + b1hi r + b0hi, r^2 + b1lo r + b0lo ]
343 movhlps %xmm6, %xmm3 // xmm3 <-- r^2 + b1hi r + b0hi
344 mulsd %xmm1, %xmm6 // xmm6 <-- c r (r + a)(r^2 + b1lo r + b0lo)
345 mulsd %xmm3, %xmm6 // xmm6 <-- c r (r + a)(r^2 + b1lo r + b0lo)(r^2 + b1hi r + b0hi) = p(r)
346
347 // y = w*((cbrt(1+r) - 1) + 1) ~ w + p(r)*w
348 mulsd %xmm2, %xmm6 // xmm6 <-- p(r) * w
349 addsd %xmm6, %xmm2 // xmm2 <-- w + p(r) * w
350
351 // convert to single and return.
352 cvtsd2ss %xmm2, %xmm1 // xmm1 <-- |result| rounded to single precision
353 orps %xmm1, %xmm0 // or the magnitude into the saved sign bit
354#if defined(__i386__)
355 movss %xmm0, FRAME_SIZE( STACKP ) // i386: pass the result through memory onto the x87 stack
356 flds FRAME_SIZE( STACKP )
357#endif
358 ret
359
// Special cases. Flags still hold the result of cmpl $0x7f000000, %eax from the entry sequence.
3603:
361 jge 4f // signed >=: eax is non-negative here only for inf / NaN
362 cmpl $0xff800000, %eax // eax == -0x00800000 means |x| was exactly 0
363 je 5f // +-0: return x unchanged
364
365 // Denormal code path
// Rescale the denormal to a normal float: place its bits under the exponent of 0.5 and
// subtract 0.5. denormal_bias (instead of exponent_bias) undoes the scaling at label 2.
366 xorps %xmm1, %xmm0 // xmm0 <-- signbit(x)
367 movdqa RELATIVE_ADDR(denormal_bias), %xmm3
368 addl $0x3f800000, %eax // 0.5 | denormal bits (also undoes the earlier subl of 0x00800000)
369 movl $0x3f000000, %edx // 0.5
370 movd %eax, %xmm1
371 movd %edx, %xmm2
372 subss %xmm2, %xmm1 // xmm1 <-- (0.5 | denormal bits) - 0.5, a normalized float
373 movd %xmm1, %eax // eax <-- its bits, as label 2 expects
374 jmp 2b
375
// inf or NaN. The flags are still those of the entry cmpl (jae / jge do not modify them).
3764:
377 je 5f // eax == 0x7f000000: |x| is inf, return x unchanged
378 addss %xmm0, %xmm0 // NaN: return x + x (presumably to quiet a signaling NaN -- confirm)
// Common exit for inputs returned as-is (or as x + x): xmm0 already holds the result.
3795:
380#if defined(__i386__)
381 movss %xmm0, FRAME_SIZE( STACKP )
382 flds FRAME_SIZE( STACKP )
383#endif
384 ret