this repo has no description
at fixPythonPipStalling 384 lines 28 kB view raw
/*
 *  cbrtf.s
 *
 *      by Stephen Canon
 *
 *  Copyright (c) 2007, Apple Inc.  All Rights Reserved.
 *
 *  This file implements the cbrtf function for the MacOS X __i386__ and __x86_64__ ABIs.
 */


/*

    Overview of algorithm used for cbrt(x):

    1. Compute a reduced argument u in the range [1,8) such that 2^(3n) u = |x| for appropriate n.
    2. Lookup an exact cube z such that |z - u| < 2^-6 * z  (i.e. |w| < 2^-6 for the w defined below)

    We then have: cbrt(x) = sign(x) * 2^n * cbrt(z) * cbrt(1 + (u - z) * 1/z)

    3. The mantissa of cbrt(z) is looked up in a table, and or-ed into the exponent 2^n
    4. cbrt(1 + (u - z) * 1/z) is computed via a minimax polynomial of the form:

            1 + cw(w + a)(w^2 + b1lo w + b0lo)(w^2 + b1hi w + b0hi)

       where w = (u - z) * 1/z

    5. Finally, we multiply the two parts, or in the signbit and return.

    This algorithm produces correctly rounded results for single precision, and avoids setting the inexact flag
    for exact cube arguments.



    The table also contains the value 1/cbrt(z).  This gets us power-of-two alignment for the table entries, and also
    can be used to implement a correctly rounded reciprocal cube root, which could be useful for computing small rational
    powers if we ever need to provide an ipow function.

    The algorithm for recip_cbrtf(x) would proceed exactly as above, except we instead compute:

        recip_cbrt(x) = sign(x) * 2^-n * recip_cbrt(z) * p((u - z) * 1/z)

    where p(w) is a polynomial approximation to 1 / cbrt(1 + w) on |w| < 2^-6.
(6th order minimax will suffice)

    - scanon, July 2007

*/


#include <machine/asm.h>
#include "abi.h"

.const
// NOTE(review): presumably a power-of-two alignment (2^4 = 16 bytes) under the Mach-O assembler -- confirm.
// The movapd / movdqa / addpd / pand / pmuludq memory operands used below require 16-byte aligned data.
.align 4

// Minimax polynomial coefficients (addressed by negative offset from cbrt_table):
//      -48(cbrt_table) = b1lo      -40(cbrt_table) = b1hi
//      -32(cbrt_table) = b0lo      -24(cbrt_table) = b0hi
//      -16(cbrt_table) = c          -8(cbrt_table) = a
            .quad 0x4000204182c17486, 0xbff8b5b876f0d973 // b1lo  b1hi
            .quad 0x4005826bbd26eb39, 0x40080ac10b5eee54 // b0lo  b0hi
            .quad 0xbf980b1460621e80, 0xbffc1c8e77867969 // c     a

// Lookup table: 192 entries of 32 bytes each (64 base points for each of the three binades [1,2), [2,4), [4,8)).
// Entry layout (offsets are the ones used by the code below):
//      +0   base point z (an exact cube, as a double)
//      +8   mantissa bits of cbrt(z), ready to be or-ed into a double holding a power of two
//      +16  1/cbrt(z)   (not read by cbrtf; see the overview comment at the top of the file)
//      +24  1/z
// The entry is selected by the two low bits of the reduced exponent and the top six mantissa bits of |x|.
//
//                base point z        mantissa(cbrt(z))       1/cbrt(z)              1/z
//          ------------------------------------------------------------------------------
cbrt_table: .quad 0x3ff0000000000000, 0x0000000000000000, 0x3ff0000000000000, 0x3ff0000000000000 // z ~ 0x1.00p0
            .quad 0x3ff060c080000000, 0x0000200000000000, 0x3fefc07f01fc07f0, 0x3fef42f61dacddc6 // z ~ 0x1.04p0
            .quad 0x3ff09fe97c0b2e80, 0x000034a000000000, 0x3fef9815c85b04a3, 0x3feecc3168ac46e4 // z ~ 0x1.08p0
            .quad 0x3ff0c30400000000, 0x0000400000000000, 0x3fef81f81f81f820, 0x3fee8bb1d5b6e585 // z ~ 0x1.0cp0
            .quad 0x3ff126cd80000000, 0x0000600000000000, 0x3fef44659e4a4271, 0x3fedd9fb30af3365 // z ~ 0x1.10p0
            .quad 0x3ff15f9b5b480000, 0x0000720000000000, 0x3fef222c82dba316, 0x3fed786108fd7a9f // z ~ 0x1.14p0
            .quad 0x3ff18c2000000000, 0x0000800000000000, 0x3fef07c1f07c1f08, 0x3fed2d9cbd756afd // z ~ 0x1.18p0
            .quad 0x3ff1f2fe80000000, 0x0000a00000000000, 0x3feecc07b301ecc0, 0x3fec86636f753a66 // z ~ 0x1.1cp0
            .quad 0x3ff21fac7ca59c00, 0x0000adc000000000, 0x3feeb2a412496abd, 0x3fec40112c606d3e // z ~ 0x1.20p0
            .quad 0x3ff25b6c00000000, 0x0000c00000000000, 0x3fee9131abf0b767, 0x3febe41e7ee3f7ed // z ~ 0x1.24p0
            .quad 0x3ff29ff9aaaa2c00, 0x0000d4c000000000, 0x3fee6b8275501adb, 0x3feb7d7596e80007 // z ~ 0x1.28p0
            .quad 0x3ff2c56b80000000, 0x0000e00000000000, 0x3fee573ac901e574, 0x3feb469f4adc7794 // z ~ 0x1.2cp0
            .quad 0x3ff3310000000000, 0x0001000000000000, 0x3fee1e1e1e1e1e1e, 0x3feaadb93d39ae9c // z ~ 0x1.30p0
            .quad 0x3ff35fb6f4579c00, 0x00010dc000000000, 0x3fee05d5a24448c5, 0x3fea6d6548fa984d // z ~ 0x1.34p0
            .quad 0x3ff39e2c80000000, 0x0001200000000000, 0x3fede5d6e3f8868a, 0x3fea1941b013022d // z ~ 0x1.38p0
            .quad 0x3ff3dfc1312b0000, 0x0001330000000000, 0x3fedc4cfaf10eb5c, 0x3fe9c322b87f17e8 // z ~ 0x1.3cp0
            .quad 0x3ff40cf400000000, 0x0001400000000000, 0x3fedae6076b981db, 0x3fe9890fd4bf368f // z ~ 0x1.40p0
            .quad 0x3ff47d5980000000, 0x0001600000000000, 0x3fed77b654b82c34, 0x3fe8fcfc9c44e2f4 // z ~ 0x1.44p0
            .quad 0x3ff49feb2bc0dc00, 0x000169c000000000, 0x3fed67366d6ddfd0, 0x3fe8d31a9f2d47fb // z ~ 0x1.48p0
            .quad 0x3ff4ef6000000000, 0x0001800000000000, 0x3fed41d41d41d41d, 0x3fe874e2a121159f // z ~ 0x1.4cp0
            .quad 0x3ff51ff889bc6000, 0x00018d8000000000, 0x3fed2b539aeee152, 0x3fe83ca00a5a8f32 // z ~ 0x1.50p0
            .quad 0x3ff5630a80000000, 0x0001a00000000000, 0x3fed0cb58f6ec074, 0x3fe7f09e124e78b8 // z ~ 0x1.54p0
            .quad 0x3ff59fc8db9a7e80, 0x0001b0a000000000, 0x3fecf1688b3b4e6a, 0x3fe7ad5e68ed5f8c // z ~ 0x1.58p0
            .quad 0x3ff5d85c00000000, 0x0001c00000000000, 0x3fecd85689039b0b, 0x3fe7700c9f78cc63 // z ~ 0x1.5cp0
            .quad 0x3ff61fbc0c515400, 0x0001d34000000000, 0x3fecb92ff3a86d65, 0x3fe7246f92d40d4c // z ~ 0x1.60p0
            .quad 0x3ff64f5780000000, 0x0001e00000000000, 0x3feca4b3055ee191, 0x3fe6f30d6649f11b // z ~ 0x1.64p0
            .quad 0x3ff69fc04b688980, 0x0001f56000000000, 0x3fec829b51036037, 0x3fe6a17c8a1a662e // z ~ 0x1.68p0
            .quad 0x3ff6c80000000000, 0x0002000000000000, 0x3fec71c71c71c71c, 0x3fe67980e0bf08c7 // z ~ 0x1.6cp0
            .quad 0x3ff71fc3c5870000, 0x0002170000000000, 0x3fec4d9cd40d7cfd, 0x3fe6243421ae7a84 // z ~ 0x1.70p0
            .quad 0x3ff7425880000000, 0x0002200000000000, 0x3fec3f8f01c3f8f0, 0x3fe60348d4756756 // z ~ 0x1.74p0
            .quad 0x3ff7be6400000000, 0x0002400000000000, 0x3fec0e070381c0e0, 0x3fe5904842e0271b // z ~ 0x1.78p0
            .quad 0x3ff7dfa08e162000, 0x0002488000000000, 0x3fec00fc08dc4fbf, 0x3fe57242f8b50298 // z ~ 0x1.7cp0
            .quad 0x3ff83c2580000000, 0x0002600000000000, 0x3febdd2b899406f7, 0x3fe520635a583b96 // z ~ 0x1.80p0
            .quad 0x3ff85fd33ff90000, 0x0002690000000000, 0x3febcf8c69606a07, 0x3fe50176a58004f0 // z ~ 0x1.84p0
            .quad 0x3ff8bba000000000, 0x0002800000000000, 0x3febacf914c1bad0, 0x3fe4b37f67f9d05c // z ~ 0x1.88p0
            .quad 0x3ff8dfca52590000, 0x0002890000000000, 0x3feb9f88e001b9f9, 0x3fe495664ea7f47d // z ~ 0x1.8cp0
            .quad 0x3ff93cd680000000, 0x0002a00000000000, 0x3feb7d6c3dda338b, 0x3fe44982ca42a2eb // z ~ 0x1.90p0
            .quad 0x3ff95ff68a951e80, 0x0002a8a000000000, 0x3feb70b72f76e7dd, 0x3fe42d6dab45c848 // z ~ 0x1.94p0
            .quad 0x3ff9bfcc00000000, 0x0002c00000000000, 0x3feb4e81b4e81b4f, 0x3fe3e254e465d72c // z ~ 0x1.98p0
            .quad 0x3ff9dfc708557c00, 0x0002c7c000000000, 0x3feb433cf4756912, 0x3fe3c9c1357411b6 // z ~ 0x1.9cp0
            .quad 0x3ffa1f8756df7480, 0x0002d72000000000, 0x3feb2cfd6b4a2ec0, 0x3fe39976b1b376fb // z ~ 0x1.a0p0
            .quad 0x3ffa448380000000, 0x0002e00000000000, 0x3feb2036406c80d9, 0x3fe37dde124a87f2 // z ~ 0x1.a4p0
            .quad 0x3ffa9fbaa05b1c00, 0x0002f5c000000000, 0x3feb01182b5ac1ce, 0x3fe33b1676d97a5b // z ~ 0x1.a8p0
            .quad 0x3ffacb0000000000, 0x0003000000000000, 0x3feaf286bca1af28, 0x3fe31c079d2b089f // z ~ 0x1.acp0
            .quad 0x3ffb1ff52f400000, 0x0003140000000000, 0x3fead646ddd321c2, 0x3fe2e02d4701d501 // z ~ 0x1.b0p0
            .quad 0x3ffb534480000000, 0x0003200000000000, 0x3feac5701ac5701b, 0x3fe2bcbbb0cb73f6 // z ~ 0x1.b4p0
            .quad 0x3ffb9fa0378e5c00, 0x000331c000000000, 0x3feaacae5fd5e77d, 0x3fe288f0567537ff // z ~ 0x1.b8p0
            .quad 0x3ffbdd5400000000, 0x0003400000000000, 0x3fea98ef606a63be, 0x3fe25fe5513ebf45 // z ~ 0x1.bcp0
            .quad 0x3ffc1fc1c0569400, 0x00034f4000000000, 0x3fea83eded1251e7, 0x3fe2347ec39d66b0 // z ~ 0x1.c0p0
            .quad 0x3ffc693180000000, 0x0003600000000000, 0x3fea6d01a6d01a6d, 0x3fe2057051321929 // z ~ 0x1.c4p0
            .quad 0x3ffc9fc4ad339d80, 0x00036c6000000000, 0x3fea5c2b87b4e25a, 0x3fe1e3144d16fd97 // z ~ 0x1.c8p0
            .quad 0x3ffcf6e000000000, 0x0003800000000000, 0x3fea41a41a41a41a, 0x3fe1ad4948b6e145 // z ~ 0x1.ccp0
            .quad 0x3ffd1f9c6201cc80, 0x0003892000000000, 0x3fea35607552f1cd, 0x3fe1948fa1f5ff30 // z ~ 0x1.d0p0
            .quad 0x3ffd5f8615bde180, 0x0003976000000000, 0x3fea22504db000b7, 0x3fe16e4ee12da718 // z ~ 0x1.d4p0
            .quad 0x3ffd866280000000, 0x0003a00000000000, 0x3fea16d3f97a4b02, 0x3fe1575d8c8402f4 // z ~ 0x1.d8p0
            .quad 0x3ffddfdfe805bc00, 0x0003b3c000000000, 0x3fe9fcacece0b241, 0x3fe1236b509d4023 // z ~ 0x1.dcp0
            .quad 0x3ffe17bc00000000, 0x0003c00000000000, 0x3fe9ec8e951033d9, 0x3fe1039b25a7f122 // z ~ 0x1.e0p0
            .quad 0x3ffe5ff3ecf6fc00, 0x0003cfc000000000, 0x3fe9d7f292cef9ba, 0x3fe0db275be001a6 // z ~ 0x1.e4p0
            .quad 0x3ffeaaef80000000, 0x0003e00000000000, 0x3fe9c2d14ee4a102, 0x3fe0b1f0c9a4ed7c // z ~ 0x1.e8p0
            .quad 0x3ffedfb5912a5180, 0x0003eb6000000000, 0x3fe9b41b55ca11fc, 0x3fe0956733c0be03 // z ~ 0x1.ecp0
            .quad 0x3fff1fd112ab0c80, 0x0003f92000000000, 0x3fe9a2696dd75ba1, 0x3fe0733ed7907e73 // z ~ 0x1.f0p0
            .quad 0x3fff400000000000, 0x0004000000000000, 0x3fe999999999999a, 0x3fe0624dd2f1a9fc // z ~ 0x1.f4p0
            .quad 0x3fff9fe36d7a7d80, 0x0004146000000000, 0x3fe97f9f956c92fd, 0x3fe030a055aebedd // z ~ 0x1.f8p0
            .quad 0x3fffd6f080000000, 0x0004200000000000, 0x3fe970e4f80cb872, 0x3fe014a239d8b1a9 // z ~ 0x1.fcp0
            .quad 0x400037e200000000, 0x0004400000000000, 0x3fe948b0fcd6e9e0, 0x3fdf91bd1b62b9cf // z ~ 0x1.00p1
            .quad 0x40005ff4c356ff40, 0x000450a000000000, 0x3fe933fff9b30002, 0x3fdf447b132ca3ac // z ~ 0x1.04p1
            .quad 0x4000853ec0000000, 0x0004600000000000, 0x3fe920fb49d0e229, 0x3fdefde7dcdacefd // z ~ 0x1.08p1
            .quad 0x4000d39000000000, 0x0004800000000000, 0x3fe8f9c18f9c18fa, 0x3fde6da80ced1523 // z ~ 0x1.0cp1
            .quad 0x400122d740000000, 0x0004a00000000000, 0x3fe8d3018d3018d3, 0x3fdde0e209af882e // z ~ 0x1.10p1
            .quad 0x4001731600000000, 0x0004c00000000000, 0x3fe8acb90f6bf3aa, 0x3fdd577b2f5c6f87 // z ~ 0x1.14p1
            .quad 0x40019fb2ce620540, 0x0004d1a000000000, 0x3fe897d564f5cf98, 0x3fdd0d34ccd78141 // z ~ 0x1.18p1
            .quad 0x4001c44dc0000000, 0x0004e00000000000, 0x3fe886e5f0abb04a, 0x3fdcd159cdbba714 // z ~ 0x1.1cp1
            .quad 0x4002168000000000, 0x0005000000000000, 0x3fe8618618618618, 0x3fdc4e651e0c37d7 // z ~ 0x1.20p1
            .quad 0x400269ae40000000, 0x0005200000000000, 0x3fe83c977ab2bedd, 0x3fdbce853967753c // z ~ 0x1.24p1
            .quad 0x4002bdda00000000, 0x0005400000000000, 0x3fe8181818181818, 0x3fdb51a30f9739f8 // z ~ 0x1.28p1
            .quad 0x4002dfff74f29dc0, 0x00054ce000000000, 0x3fe80987c755886a, 0x3fdb203708429799 // z ~ 0x1.2cp1
            .quad 0x40031304c0000000, 0x0005600000000000, 0x3fe7f405fd017f40, 0x3fdad7a85e593e54 // z ~ 0x1.30p1
            .quad 0x4003693000000000, 0x0005800000000000, 0x3fe7d05f417d05f4, 0x3fda607fa909db1f // z ~ 0x1.34p1
            .quad 0x40039fe541ac7840, 0x0005942000000000, 0x3fe7ba298eae8947, 0x3fda16f787114257 // z ~ 0x1.38p1
            .quad 0x4003c05d40000000, 0x0005a00000000000, 0x3fe7ad2208e0ecc3, 0x3fd9ec1430b0dfc7 // z ~ 0x1.3cp1
            .quad 0x4004188e00000000, 0x0005c00000000000, 0x3fe78a4c8178a4c8, 0x3fd97a51ec6b707e // z ~ 0x1.40p1
            .quad 0x400471c3c0000000, 0x0005e00000000000, 0x3fe767dce434a9b1, 0x3fd90b25822e2a9f // z ~ 0x1.44p1
            .quad 0x40049fcfb130a6c0, 0x0005f06000000000, 0x3fe75664a1a72c8d, 0x3fd8d33bb2686480 // z ~ 0x1.48p1
            .quad 0x4004cc0000000000, 0x0006000000000000, 0x3fe745d1745d1746, 0x3fd89e7c3fdb1246 // z ~ 0x1.4cp1
            .quad 0x4005274440000000, 0x0006200000000000, 0x3fe724287f46debc, 0x3fd8344414a70cbd // z ~ 0x1.50p1
            .quad 0x40055fc05a5df140, 0x000633a000000000, 0x3fe70fb3e12b41c4, 0x3fd7f44d50c76c8e // z ~ 0x1.54p1
            .quad 0x4005839200000000, 0x0006400000000000, 0x3fe702e05c0b8170, 0x3fd7cc6b8acae7cb // z ~ 0x1.58p1
            .quad 0x4005e0eac0000000, 0x0006600000000000, 0x3fe6e1f76b4337c7, 0x3fd766e1c17c26ec // z ~ 0x1.5cp1
            .quad 0x40063f5000000000, 0x0006800000000000, 0x3fe6c16c16c16c17, 0x3fd70396672a04e5 // z ~ 0x1.60p1
            .quad 0x40065fa1cdfa11c0, 0x00068ae000000000, 0x3fe6b671c62a2d0a, 0x3fd6e257c2026aef // z ~ 0x1.64p1
            .quad 0x40069ec340000000, 0x0006a00000000000, 0x3fe6a13cd1537290, 0x3fd6a279b3fb4a4e // z ~ 0x1.68p1
            .quad 0x4006ff4600000000, 0x0006c00000000000, 0x3fe6816816816817, 0x3fd6437c6489c8e0 // z ~ 0x1.6cp1
            .quad 0x40071fef1bff2600, 0x0006cac000000000, 0x3fe676caae4b2e0f, 0x3fd6240aa2fa0dfd // z ~ 0x1.70p1
            .quad 0x400760d9c0000000, 0x0006e00000000000, 0x3fe661ec6a5122f9, 0x3fd5e68fb4d877a7 // z ~ 0x1.74p1
            .quad 0x40079fec8fa79000, 0x0006f48000000000, 0x3fe64def50b37b22, 0x3fd5ac1740057116 // z ~ 0x1.78p1
            .quad 0x4007c38000000000, 0x0007000000000000, 0x3fe642c8590b2164, 0x3fd58ba55b815609 // z ~ 0x1.7cp1
            .quad 0x4008273a40000000, 0x0007200000000000, 0x3fe623fa77016240, 0x3fd532af851862ac // z ~ 0x1.80p1
            .quad 0x40085fccde240000, 0x0007320000000000, 0x3fe612cc01b977f0, 0x3fd5017c2589970e // z ~ 0x1.84p1
            .quad 0x40088c0a00000000, 0x0007400000000000, 0x3fe6058160581606, 0x3fd4dba0cfc11861 // z ~ 0x1.88p1
            .quad 0x4008f1f0c0000000, 0x0007600000000000, 0x3fe5e75bb8d015e7, 0x3fd4866c46f405db // z ~ 0x1.8cp1
            .quad 0x40091fabaf07d200, 0x00076e4000000000, 0x3fe5da09741396f7, 0x3fd461102bc1cb8f // z ~ 0x1.90p1
            .quad 0x400958f000000000, 0x0007800000000000, 0x3fe5c9882b931057, 0x3fd433055f7235db // z ~ 0x1.94p1
            .quad 0x40099ffaac1ec3c0, 0x000795e000000000, 0x3fe5b55320eae3fd, 0x3fd3fb056724ebb2 // z ~ 0x1.98p1
            .quad 0x4009c10940000000, 0x0007a00000000000, 0x3fe5ac056b015ac0, 0x3fd3e15ff3643c49 // z ~ 0x1.9cp1
            .quad 0x400a2a3e00000000, 0x0007c00000000000, 0x3fe58ed2308158ed, 0x3fd391703ea2d9b9 // z ~ 0x1.a0p1
            .quad 0x400a5fad7a3ee040, 0x0007d02000000000, 0x3fe580391c97b3f3, 0x3fd369cab16c4bb8 // z ~ 0x1.a4p1
            .quad 0x400a948fc0000000, 0x0007e00000000000, 0x3fe571ed3c506b3a, 0x3fd3432adb274266 // z ~ 0x1.a8p1
            .quad 0x400adffcaf535000, 0x0007f68000000000, 0x3fe55dca75792aa1, 0x3fd30d1b5accf7d2 // z ~ 0x1.acp1
            .quad 0x400b000000000000, 0x0008000000000000, 0x3fe5555555555555, 0x3fd2f684bda12f68 // z ~ 0x1.b0p1
            .quad 0x400b6c9040000000, 0x0008200000000000, 0x3fe5390948f40feb, 0x3fd2ab733230f96f // z ~ 0x1.b4p1
            .quad 0x400b9fd76ec78000, 0x00082f0000000000, 0x3fe52bdf6a7a2620, 0x3fd288cb4a41a9b5 // z ~ 0x1.b8p1
            .quad 0x400bda4200000000, 0x0008400000000000, 0x3fe51d07eae2f815, 0x3fd261ebd944131e // z ~ 0x1.bcp1
            .quad 0x400c1fd3bf5cf840, 0x0008542000000000, 0x3fe50b90cb22a299, 0x3fd234731d751ccc // z ~ 0x1.c0p1
            .quad 0x400c4916c0000000, 0x0008600000000000, 0x3fe5015015015015, 0x3fd219e4a4924f1f // z ~ 0x1.c4p1
            .quad 0x400cb91000000000, 0x0008800000000000, 0x3fe4e5e0a72f0539, 0x3fd1d353d43a7247 // z ~ 0x1.c8p1
            .quad 0x400cdfd181598000, 0x00088b0000000000, 0x3fe4dc82df5d0542, 0x3fd1bb66cda74540 // z ~ 0x1.ccp1
            .quad 0x400d2a2f40000000, 0x0008a00000000000, 0x3fe4cab88725af6e, 0x3fd18e2ff3fca5ac // z ~ 0x1.d0p1
            .quad 0x400d5f9b87878000, 0x0008af0000000000, 0x3fe4be15f5393e98, 0x3fd16e4227697dbf // z ~ 0x1.d4p1
            .quad 0x400d9c7600000000, 0x0008c00000000000, 0x3fe4afd6a052bf5b, 0x3fd14a6fd8916ecf // z ~ 0x1.d8p1
            .quad 0x400ddff55aa1e600, 0x0008d2c000000000, 0x3fe4a036770fd266, 0x3fd1235f02ce295a // z ~ 0x1.dcp1
            .quad 0x400e0fe5c0000000, 0x0008e00000000000, 0x3fe49539e3b2d067, 0x3fd1080a9d1be542 // z ~ 0x1.e0p1
            .quad 0x400e5fefa40c0000, 0x0008f60000000000, 0x3fe48315b6c3fc79, 0x3fd0db29bc986108 // z ~ 0x1.e4p1
            .quad 0x400e848000000000, 0x0009000000000000, 0x3fe47ae147ae147b, 0x3fd0c6f7a0b5ed8d // z ~ 0x1.e8p1
            .quad 0x400efa4640000000, 0x0009200000000000, 0x3fe460cbc7f5cf9a, 0x3fd0872e8415508d // z ~ 0x1.ecp1
            .quad 0x400f1fc8b255bc40, 0x00092a2000000000, 0x3fe45898cb57730c, 0x3fd0734344eaebef // z ~ 0x1.f0p1
            .quad 0x400f713a00000000, 0x0009400000000000, 0x3fe446f86562d9fb, 0x3fd048a727489527 // z ~ 0x1.f4p1
            .quad 0x400f9f8b6ce70ec0, 0x00094c6000000000, 0x3fe43d0d2af8e146, 0x3fd030cd637fd65e // z ~ 0x1.f8p1
            .quad 0x400fe95cc0000000, 0x0009600000000000, 0x3fe42d6625d51f87, 0x3fd00b59a78a8ffc // z ~ 0x1.fcp1
            .quad 0x4010315800000000, 0x0009800000000000, 0x3fe4141414141414, 0x3fcf9e7cba5753af // z ~ 0x1.00p2
            .quad 0x40106e9aa0000000, 0x0009a00000000000, 0x3fe3fb013fb013fb, 0x3fcf289bb31fd41c // z ~ 0x1.04p2
            .quad 0x4010ac7700000000, 0x0009c00000000000, 0x3fe3e22cbce4a902, 0x3fceb501ca81bb3e // z ~ 0x1.08p2
            .quad 0x4010eaede0000000, 0x0009e00000000000, 0x3fe3c995a47babe7, 0x3fce43a0fc24fe4b // z ~ 0x1.0cp2
            .quad 0x40112a0000000000, 0x000a000000000000, 0x3fe3b13b13b13b14, 0x3fcdd46baab49c24 // z ~ 0x1.10p2
            .quad 0x401169ae20000000, 0x000a200000000000, 0x3fe3991c2c187f63, 0x3fcd67549c6f9b67 // z ~ 0x1.14p2
            .quad 0x4011a9f900000000, 0x000a400000000000, 0x3fe3813813813814, 0x3fccfc4ef7db5bff // z ~ 0x1.18p2
            .quad 0x4011eae160000000, 0x000a600000000000, 0x3fe3698df3de0748, 0x3fcc934e4095d202 // z ~ 0x1.1cp2
            .quad 0x40122c6800000000, 0x000a800000000000, 0x3fe3521cfb2b78c1, 0x3fcc2c46544650c1 // z ~ 0x1.20p2
            .quad 0x40126e8da0000000, 0x000aa00000000000, 0x3fe33ae45b57bcb2, 0x3fcbc72b67ab9ce7 // z ~ 0x1.24p2
            .quad 0x4012b15300000000, 0x000ac00000000000, 0x3fe323e34a2b10bf, 0x3fcb63f203c60c07 // z ~ 0x1.28p2
            .quad 0x4012f4b8e0000000, 0x000ae00000000000, 0x3fe30d190130d190, 0x3fcb028f031c8644 // z ~ 0x1.2cp2
            .quad 0x401338c000000000, 0x000b000000000000, 0x3fe2f684bda12f68, 0x3fcaa2f78f1b4cc6 // z ~ 0x1.30p2
            .quad 0x40137d6920000000, 0x000b200000000000, 0x3fe2e025c04b8097, 0x3fca45211d8b748a // z ~ 0x1.34p2
            .quad 0x40139ffaac000000, 0x000b300000000000, 0x3fe2d50a012d50a0, 0x3fca16db0ec408b2 // z ~ 0x1.38p2
            .quad 0x4013c2b500000000, 0x000b400000000000, 0x3fe2c9fb4d812ca0, 0x3fc9e9016e2211b6 // z ~ 0x1.3cp2
            .quad 0x401408a460000000, 0x000b600000000000, 0x3fe2b404ad012b40, 0x3fc98e8e88261b62 // z ~ 0x1.40p2
            .quad 0x40144f3800000000, 0x000b800000000000, 0x3fe29e4129e4129e, 0x3fc935beb82c1ae7 // z ~ 0x1.44p2
            .quad 0x40149670a0000000, 0x000ba00000000000, 0x3fe288b01288b013, 0x3fc8de888de6c48f // z ~ 0x1.48p2
            .quad 0x4014de4f00000000, 0x000bc00000000000, 0x3fe27350b8812735, 0x3fc888e2da0ba19d // z ~ 0x1.4cp2
            .quad 0x401526d3e0000000, 0x000be00000000000, 0x3fe25e22708092f1, 0x3fc834c4ac4afd3b // z ~ 0x1.50p2
            .quad 0x4015700000000000, 0x000c000000000000, 0x3fe2492492492492, 0x3fc7e225515a4f1d // z ~ 0x1.54p2
            .quad 0x4015b9d420000000, 0x000c200000000000, 0x3fe23456789abcdf, 0x3fc790fc51106751 // z ~ 0x1.58p2
            .quad 0x4015dfdce5811360, 0x000c306000000000, 0x3fe229c346a04441, 0x3fc7680273c586ed // z ~ 0x1.5cp2
            .quad 0x4016045100000000, 0x000c400000000000, 0x3fe21fb78121fb78, 0x3fc741416c92a70b // z ~ 0x1.60p2
            .quad 0x40164f7760000000, 0x000c600000000000, 0x3fe20b470c67c0d9, 0x3fc6f2ec9c929a29 // z ~ 0x1.64p2
            .quad 0x40169b4800000000, 0x000c800000000000, 0x3fe1f7047dc11f70, 0x3fc6a5f60f9b4c97 // z ~ 0x1.68p2
            .quad 0x4016e7c3a0000000, 0x000ca00000000000, 0x3fe1e2ef3b3fb874, 0x3fc65a56286dbe08 // z ~ 0x1.6cp2
            .quad 0x401734eb00000000, 0x000cc00000000000, 0x3fe1cf06ada2811d, 0x3fc610057c6bdd38 // z ~ 0x1.70p2
            .quad 0x40175fb34f0902a0, 0x000cd1a000000000, 0x3fe1c4227955e4f1, 0x3fc5e7a396f89f71 // z ~ 0x1.74p2
            .quad 0x401782bee0000000, 0x000ce00000000000, 0x3fe1bb4a4046ed29, 0x3fc5c6fcd2117a65 // z ~ 0x1.78p2
            .quad 0x4017d14000000000, 0x000d000000000000, 0x3fe1a7b9611a7b96, 0x3fc57f351f7aa6ea // z ~ 0x1.7cp2
            .quad 0x4018206f20000000, 0x000d200000000000, 0x3fe19453808ca29c, 0x3fc538a788f6fdd6 // z ~ 0x1.80p2
            .quad 0x4018704d00000000, 0x000d400000000000, 0x3fe1811811811812, 0x3fc4f34d5fa956d6 // z ~ 0x1.84p2
            .quad 0x40189fbb1ca4e0e0, 0x000d52e000000000, 0x3fe175d3b160af03, 0x3fc4caf2b205f9dd // z ~ 0x1.88p2
            .quad 0x4018c0da60000000, 0x000d600000000000, 0x3fe16e0689427379, 0x3fc4af2020336a59 // z ~ 0x1.8cp2
            .quad 0x4019121800000000, 0x000d800000000000, 0x3fe15b1e5f75270d, 0x3fc46c19716cf2c0 // z ~ 0x1.90p2
            .quad 0x40196406a0000000, 0x000da00000000000, 0x3fe1485f0e0acd3b, 0x3fc42a332325db6b // z ~ 0x1.94p2
            .quad 0x4019b6a700000000, 0x000dc00000000000, 0x3fe135c81135c811, 0x3fc3e9672cf3131d // z ~ 0x1.98p2
            .quad 0x4019dfe6c1816fe0, 0x000dcfe000000000, 0x3fe12c9df926137b, 0x3fc3c9a8f2a1f8a5 // z ~ 0x1.9cp2
            .quad 0x401a09f9e0000000, 0x000de00000000000, 0x3fe12358e75d3033, 0x3fc3a9afad059b87 // z ~ 0x1.a0p2
            .quad 0x401a5e0000000000, 0x000e000000000000, 0x3fe1111111111111, 0x3fc36b06e70b7421 // z ~ 0x1.a4p2
            .quad 0x401ab2ba20000000, 0x000e200000000000, 0x3fe0fef010fef011, 0x3fc32d67431a0280 // z ~ 0x1.a8p2
            .quad 0x401adfb1053dbae0, 0x000e30e000000000, 0x3fe0f57023f898dc, 0x3fc30d50fe844fd2 // z ~ 0x1.acp2
            .quad 0x401b082900000000, 0x000e400000000000, 0x3fe0ecf56be69c90, 0x3fc2f0cb4ca19e1e // z ~ 0x1.b0p2
            .quad 0x401b5e4d60000000, 0x000e600000000000, 0x3fe0db20a88f4696, 0x3fc2b52db169e95e // z ~ 0x1.b4p2
            .quad 0x401bb52800000000, 0x000e800000000000, 0x3fe0c9714fbcda3b, 0x3fc27a894096a4f5 // z ~ 0x1.b8p2
            .quad 0x401bdfd332712ca0, 0x000e8fa000000000, 0x3fe0c0dc264ce74b, 0x3fc25e3ff656ec87 // z ~ 0x1.bcp2
            .quad 0x401c0cb9a0000000, 0x000ea00000000000, 0x3fe0b7e6ec259dc8, 0x3fc240d8e9b4ae5d // z ~ 0x1.c0p2
            .quad 0x401c650300000000, 0x000ec00000000000, 0x3fe0a6810a6810a7, 0x3fc20817bbcedd1f // z ~ 0x1.c4p2
            .quad 0x401cbe04e0000000, 0x000ee00000000000, 0x3fe0953f39010954, 0x3fc1d040e48a75cd // z ~ 0x1.c8p2
            .quad 0x401cdfeef0724420, 0x000eec2000000000, 0x3fe08ebe9d4e24ae, 0x3fc1bb54ba55bb8e // z ~ 0x1.ccp2
            .quad 0x401d17c000000000, 0x000f000000000000, 0x3fe0842108421084, 0x3fc1994faf4aec92 // z ~ 0x1.d0p2
            .quad 0x401d723520000000, 0x000f200000000000, 0x3fe073260a47f7c6, 0x3fc1633f845cb3de // z ~ 0x1.d4p2
            .quad 0x401d9fb5ac000000, 0x000f300000000000, 0x3fe06ab59c7912fb, 0x3fc1488a6b10c148 // z ~ 0x1.d8p2
            .quad 0x401dcd6500000000, 0x000f400000000000, 0x3fe0624dd2f1a9fc, 0x3fc12e0be826d695 // z ~ 0x1.dcp2
            .quad 0x401e295060000000, 0x000f600000000000, 0x3fe05197f7d73404, 0x3fc0f9b07a631f92 // z ~ 0x1.e0p2
            .quad 0x401e5fe06d9140e0, 0x000f72e000000000, 0x3fe047cca585fbe4, 0x3fc0db322dce8431 // z ~ 0x1.e4p2
            .quad 0x401e85f800000000, 0x000f800000000000, 0x3fe0410410410410, 0x3fc0c628f55c92de // z ~ 0x1.e8p2
            .quad 0x401ee35ca0000000, 0x000fa00000000000, 0x3fe03091b51f5e1a, 0x3fc093712d33ff42 // z ~ 0x1.ecp2
            .quad 0x401f1ff2ff2d4ba0, 0x000fb4a000000000, 0x3fe02609989a73cf, 0x3fc0732ce999c3d1 // z ~ 0x1.f0p2
            .quad 0x401f417f00000000, 0x000fc00000000000, 0x3fe0204081020408, 0x3fc061850f2a7123 // z ~ 0x1.f4p2
            .quad 0x401fa05fe0000000, 0x000fe00000000000, 0x3fe0101010101010, 0x3fc03060a0f151c2 // z ~ 0x1.f8p2
            .quad 0x401fdfcad4496d00, 0x000ff54000000000, 0x3fe00561cedb7dbc, 0x3fc0102adb93eef1 // z ~ 0x1.fcp2

// Two-lane (low qword, high qword) integer constants used to split the exponent E = (biased exponent - 1):
//
//   magic_numbers: low  lane 0xaaaaab ~ 2^25/3, so bits 23-24 of E*0xaaaaab hold E % 3
//                  high lane 0x2aaaab ~ 2^23/3, so bits 23-30 of E*0x2aaaab hold E / 3
//   exponent_mask: keeps exactly those bit fields in each lane.
//   exponent_bias: added after the fields are shifted into double-precision exponent position.
//                  low  lane 0x3ff... gives 2^(E%3) * 1.mantissa, i.e. the reduced argument in [1,8)
//                  high lane 0x3d5... gives 2^(E/3 - 42) = 2^floor((e - 127)/3), since E - 126 = e - 127 and 126 = 3*42
//   denormal_bias: same as exponent_bias, but the high lane is 0x2a = 42 smaller (a further factor of 2^-42),
//                  which undoes the 2^126 = 2^(3*42) rescaling applied to denormal inputs before they re-enter at 2:.
magic_numbers:  .quad 0x0000000000aaaaab, 0x00000000002aaaab
exponent_mask:  .quad 0x0000000001800000, 0x000000007f800000
exponent_bias:  .quad 0x3ff0000000000000, 0x3d50000000000000
denormal_bias:  .quad 0x3ff0000000000000, 0x3ab0000000000000

.text
.align 4

// RELATIVE_ADDR(sym) yields a position-independent memory operand for sym:
//   x86_64: plain %rip-relative addressing.
//   i386:   sym is addressed relative to cbrtf_body, whose run-time address is held in %ecx
//           (loaded by the cbrtf_pic thunk below).
#if defined( __x86_64__ )
    #define RELATIVE_ADDR( _a )     (_a)( %rip )
#else
    #define RELATIVE_ADDR( _a )     (_a)-cbrtf_body( %ecx )
// i386 PIC thunk: on entry (%esp) holds the return address pushed by the calll, which is the address
// of the instruction following that calll (the cbrtf_body label).  Copy it to %ecx and return.
cbrtf_pic:
    movl        (%esp),                 %ecx    // %ecx <-- return address = address of cbrtf_body
    ret
#endif

//----------------------------------------------------------------------------------------------------------
// float cbrtf( float x )
//
// In:      x86_64: x in %xmm0
//          i386:   x read from FRAME_SIZE(STACKP)
//                  NOTE(review): FRAME_SIZE / STACKP / DX_P / AX_P come from abi.h, which is not visible
//                  here -- presumably the first stack argument slot and pointer-width %edx/%rdx, %eax/%rax.
// Out:     x86_64: result in %xmm0
//          i386:   result stored back to FRAME_SIZE(STACKP) and loaded onto the x87 stack with flds
// Uses:    %eax, %edx (and DX_P), %ecx (i386 only), %xmm0-%xmm6, flags
//
// Special cases handled explicitly below:
//          +-0   --> returned unchanged             (label 5)
//          +-inf --> returned unchanged             (label 4 -> 5)
//          NaN   --> returns x + x                  (label 4)
//          denormals are rescaled and re-enter the main path at label 2 using denormal_bias (label 3)
//----------------------------------------------------------------------------------------------------------
ENTRY(cbrtf)
#if defined(__i386__)
    movl        FRAME_SIZE(STACKP),     %eax    // eax  <-- bits of x
    movss       FRAME_SIZE(STACKP),     %xmm0   // xmm0 <-- x
// pic code, boring.
    calll       cbrtf_pic                       // ecx  <-- address of cbrtf_body (base for RELATIVE_ADDR)
cbrtf_body:
#else
    movd        %xmm0,                  %eax    // eax  <-- bits of x
#endif

    andl        $0x7fffffff,            %eax    // eax  <-- bits of |x|
    movd        %eax,                   %xmm1   // xmm1 <-- |x|

    subl        $0x00800000,            %eax    // push 0 and denormals negative
    cmpl        $0x7f000000,            %eax    // if ( |x| < 2^-126 or |x| = inf or isnan(x) )
    jae         3f                              //      goto 3  (unsigned compare: wrapped-around values are large)

// Normal numbers code path
    xorps       %xmm1,                  %xmm0   // xmm0 <-- signbit(x)   (x ^ |x|)
    movdqa      RELATIVE_ADDR(exponent_bias), %xmm3
    movd        %eax,                   %xmm1   // xmm1 <-- bits of |x| with the exponent field decremented by one

// Denormals re-enter code path here, using denormal exponent bias to rescale
2:
    psrld       $23,                    %xmm1   // xmm1 <-- biased exponent(x) - 1 as integer
    movlhps     %xmm1,                  %xmm1   // duplicate it into the high qword
    pmuludq     RELATIVE_ADDR(magic_numbers), %xmm1 // magic number multiply and mask to put (e-1)/3 and (e-1)%3
    pand        RELATIVE_ADDR(exponent_mask), %xmm1 // in the high and low parts of xmm1, respectively.
    andl        $0x007fffff,            %eax    // mask sign and exponent bits to 0.
    movd        %eax,                   %xmm2   // xmm2 <-- mantissa(x)
    por         %xmm2,                  %xmm1   // xmm1 <-- [ (e-1)/3, (e-1)%3 | mantissa ]

    lea         RELATIVE_ADDR(cbrt_table), DX_P // DX_P <-- &cbrt_table (coefficients live at negative offsets)
    movd        %xmm1,                  %eax    // eax <-- (e-1)%3 | mantissa
#if defined(__x86_64__)
    // NOTE(review): this cdqe looks redundant -- the andl below writes %eax, which zero-extends into %rax
    // regardless of what the sign extension produced.  Left untouched.
    cdqe                                        // sign extend rax
#endif
    andl        $0x01fe0000,            %eax    // keep only the two (e-1)%3 bits and the top six mantissa bits
    shrl        $(17-5),                %eax    // shift into position for lookup (table entry size is 32 = 2^5)

    psllq       $29,                    %xmm1   // convert xmm1 to double via shift and bias add.
    paddq       %xmm3,                  %xmm1   // (xmm3 = exponent_bias, or denormal_bias on the denormal path)

    movhlps     %xmm1,                  %xmm2   // unpack high(xmm1) = 2^floor(exponent(x)/3)
                                                // x is now reduced to the range 1-8 via multiplication by 2^(-3n)
                                                // i.e., x has the biased exponent (e-1)%3 = (e-127)%3
    subsd       (DX_P,AX_P,1),          %xmm1   // xmm1 <-- (x - z) where z ~ x is an exact cube
    mulsd       24(DX_P,AX_P,1),        %xmm1   // xmm1 <-- (x - z)/z = r
    movsd       8(DX_P,AX_P,1),         %xmm3   // xmm3 <-- mantissa(cbrt(z))
    orpd        %xmm3,                  %xmm2   // xmm2 <-- 2^floor(exponent(x)/3) * cbrt(z) = w
                                                // computed by or-ing in the mantissa of cbrt(z)

    // polynomial approximation p(r) ~ cbrt(1 + r) - 1,
    // evaluated as p(r) = c r (r + a)(r^2 + b1lo r + b0lo)(r^2 + b1hi r + b0hi)
    // with the two quadratic factors computed side by side in the two lanes of xmm5/xmm6:
    movsd       -16(DX_P),              %xmm4   // xmm4 <-- c
    movapd      -48(DX_P),              %xmm5   // xmm5 <-- [ b1hi, b1lo ]
#if defined( __SSE3__ )
    movddup     %xmm1,                  %xmm6   // xmm6 <-- [ r, r ]
#else
    movapd      %xmm1,                  %xmm6
    unpcklpd    %xmm6,                  %xmm6   // xmm6 <-- [ r, r ]
#endif
    mulsd       %xmm1,                  %xmm4   // xmm4 <-- cr
    addpd       %xmm6,                  %xmm5   // xmm5 <-- [ r + b1hi, r + b1lo ]
    addsd       -8(DX_P),               %xmm1   // xmm1 <-- r + a
    mulpd       %xmm5,                  %xmm6   // xmm6 <-- [ r^2 + b1hi r, r^2 + b1lo r ]
    mulsd       %xmm4,                  %xmm1   // xmm1 <-- cr(r + a)
    addpd       -32(DX_P),              %xmm6   // xmm6 <-- [ r^2 + b1hi r + b0hi, r^2 + b1lo r + b0lo ]
    movhlps     %xmm6,                  %xmm3   // xmm3 <-- r^2 + b1hi r + b0hi
    mulsd       %xmm1,                  %xmm6   // xmm6 <-- cr(r + a)(r^2 + b1lo r + b0lo)
    mulsd       %xmm3,                  %xmm6   // xmm6 <-- cr(r + a)(r^2 + b1lo r + b0lo)(r^2 + b1hi r + b0hi) = p(r)

    // y = w*((cbrt(1+r) - 1) + 1) ~ w + p(r)*w
    mulsd       %xmm2,                  %xmm6   // xmm6 <-- p(r)*w
    addsd       %xmm6,                  %xmm2   // xmm2 <-- w + p(r)*w

    // convert to single and return.
    cvtsd2ss    %xmm2,                  %xmm1   // xmm1 <-- (float)|cbrt(x)|
    orps        %xmm1,                  %xmm0   // xmm0 <-- signbit(x) | |cbrt(x)|
#if defined(__i386__)
    movss       %xmm0,                  FRAME_SIZE( STACKP )    // the i386 path hands the result back on the
    flds        FRAME_SIZE( STACKP )                            // x87 stack; the argument slot is reused as scratch
#endif
    ret

// Special-case dispatch.  Flags here are still those of the cmpl $0x7f000000 above (jae does not change them).
3:
    jge         4f                              // signed >= : eax is in [0x7f000000, 0x7fffffff], i.e. |x| is inf or NaN
    cmpl        $0xff800000,            %eax    // eax = |x| - 0x00800000 = 0xff800000 exactly when |x| = 0
    je          5f                              // +-0: return x unchanged

    // Denormal code path
    xorps       %xmm1,                  %xmm0   // xmm0 <-- signbit(x)
    movdqa      RELATIVE_ADDR(denormal_bias), %xmm3
    addl        $0x3f800000,            %eax    // 0.5 | denormal bits   (eax = |x| - 0x00800000 + 0x3f800000)
    movl        $0x3f000000,            %edx    // 0.5
    movd        %eax,                   %xmm1
    movd        %edx,                   %xmm2
    subss       %xmm2,                  %xmm1   // xmm1 <-- (0.5 | denormal bits) - 0.5: the denormal value, renormalized
    movd        %xmm1,                  %eax    // eax  <-- its bits (mantissa is masked out of eax at 2:)
    jmp         2b

// |x| is inf or NaN.  Flags are still those of the cmpl $0x7f000000 (jge does not change them).
4:
    je          5f                              // equal: |x| = 0x7f800000 = inf, return x unchanged
    addss       %xmm0,                  %xmm0   // NaN: return x + x
5:
#if defined(__i386__)
    movss       %xmm0,                  FRAME_SIZE( STACKP )
    flds        FRAME_SIZE( STACKP )
#endif
    ret