/*
 *  expm1f.s
 *
 *      by Ian Ollmann
 *
 *  Copyright (c) 2007, Apple Inc. All Rights Reserved.
 *
 *  Implementation for C99 expm1f function for i386 and x86_64 ABIs.
 */
10
11#include <machine/asm.h>
12#include "abi.h"
13
.const

// Coefficients of the minimax polynomial for exp2(x)-1 used by expm1f on the
// |x| >= 1.0f path.
//
// 8th order minimax fit of exp2 on [-1.0,1.0]. |error| < 0.402865722354948566583852e-9
//
// Layout: five 16-byte rows. Each row is read as one packed-double operand
// (mulpd / addpd / mulsd / subsd in expm1f), so:
//      low  double (offset +0) = c[k+4]/c8     k = 0..3
//      high double (offset +8) = c[k]
// The last row is { low = c8, high = 1.0 }; expm1f reads c8 at offset 64 and
// 1.0 at offset 72.
//
// The rows must be 16-byte aligned because they are used as memory operands of
// packed SSE2 arithmetic. NOTE(review): ".align 4" is presumably a power-of-two
// alignment (2**4 = 16 bytes) under the target assembler -- confirm.
.align 4
expm1f_c:   .quad 0x40bc03f30399c376, 0x3dbea2a63403aaa8    // c4/c8 = 0.961813690023115610862381719985771e-2 / 0.134107709538786543922336536865157e-5, c0 = 0.278626872016317130037181614004e-10
            .quad 0x408f10e7f73e6d8f, 0x3fe62e42fd0933ee    // c5/c8 = 0.133318252930790403741964203236548e-2 / 0.134107709538786543922336536865157e-5, c1 = .693147176943623740308984004029708
            .quad 0x405cb616a9384e69, 0x3fcebfbdfd0f0afa    // c6/c8 = 0.154016177542147239746127455226575e-3 / 0.134107709538786543922336536865157e-5, c2 = .240226505817268621584559118975830
            .quad 0x4027173ebd288ba1, 0x3fac6b0a74f15403    // c7/c8 = 0.154832722143258821052933667742417e-4 / 0.134107709538786543922336536865157e-5, c3 = 0.555041568519883074165425891257052e-1
            .quad 0x3eb67fe1dc3105ba, 0x3ff0000000000000    // c8 = 0.134107709538786543922336536865157e-5, 1.0
24
25
// expm1f_table
//
// Lookup table for the |x| < 1.0f path of expm1f. Each 16-byte entry is a pair
// of doubles { i/64, expm1(i/64) } for i = -63 .. 63.
//
// The label expm1f_table is placed on the i = 0 entry, with the negative
// entries laid out immediately before it, so the code can index the table
// with a signed byte offset of 16 * trunc( 64 * x ) from the label.
            .quad 0xbfef800000000000, 0xbfe40adf8d149383    // -63/64, expm1(-63/64)
            .quad 0xbfef000000000000, 0xbfe3daaae2395759    // -62/64, expm1(-62/64)
            .quad 0xbfee800000000000, 0xbfe3a9b3e10921cd    // -61/64, expm1(-61/64)
            .quad 0xbfee000000000000, 0xbfe377f77a0fcb45    // -60/64, expm1(-60/64)
            .quad 0xbfed800000000000, 0xbfe345729182bf1f    // -59/64, expm1(-59/64)
            .quad 0xbfed000000000000, 0xbfe31221ff0f3ecc    // -58/64, expm1(-58/64)
            .quad 0xbfec800000000000, 0xbfe2de028da7dc59    // -57/64, expm1(-57/64)
            .quad 0xbfec000000000000, 0xbfe2a910fb51295a    // -56/64, expm1(-56/64)
            .quad 0xbfeb800000000000, 0xbfe27349f8ed96eb    // -55/64, expm1(-55/64)
            .quad 0xbfeb000000000000, 0xbfe23caa2a088391    // -54/64, expm1(-54/64)
            .quad 0xbfea800000000000, 0xbfe2052e24a073a5    // -53/64, expm1(-53/64)
            .quad 0xbfea000000000000, 0xbfe1ccd270f070f9    // -52/64, expm1(-52/64)
            .quad 0xbfe9800000000000, 0xbfe1939389388e3d    // -51/64, expm1(-51/64)
            .quad 0xbfe9000000000000, 0xbfe1596dd9858ab1    // -50/64, expm1(-50/64)
            .quad 0xbfe8800000000000, 0xbfe11e5dbf7792a9    // -49/64, expm1(-49/64)
            .quad 0xbfe8000000000000, 0xbfe0e25f8a081941    // -48/64, expm1(-48/64)
            .quad 0xbfe7800000000000, 0xbfe0a56f794ec7a4    // -47/64, expm1(-47/64)
            .quad 0xbfe7000000000000, 0xbfe06789be457e3b    // -46/64, expm1(-46/64)
            .quad 0xbfe6800000000000, 0xbfe028aa7a8b63f2    // -45/64, expm1(-45/64)
            .quad 0xbfe6000000000000, 0xbfdfd19b804dffbf    // -44/64, expm1(-44/64)
            .quad 0xbfe5800000000000, 0xbfdf4fdf228eb2ad    // -43/64, expm1(-43/64)
            .quad 0xbfe5000000000000, 0xbfdecc17c0083500    // -42/64, expm1(-42/64)
            .quad 0xbfe4800000000000, 0xbfde463d1c396301    // -41/64, expm1(-41/64)
            .quad 0xbfe4000000000000, 0xbfddbe46d96cd831    // -40/64, expm1(-40/64)
            .quad 0xbfe3800000000000, 0xbfdd342c7833133a    // -39/64, expm1(-39/64)
            .quad 0xbfe3000000000000, 0xbfdca7e556da7e48    // -38/64, expm1(-38/64)
            .quad 0xbfe2800000000000, 0xbfdc1968b0e55333    // -37/64, expm1(-37/64)
            .quad 0xbfe2000000000000, 0xbfdb88ad9e7d52ea    // -36/64, expm1(-36/64)
            .quad 0xbfe1800000000000, 0xbfdaf5ab13e5474f    // -35/64, expm1(-35/64)
            .quad 0xbfe1000000000000, 0xbfda6057e0e846a4    // -34/64, expm1(-34/64)
            .quad 0xbfe0800000000000, 0xbfd9c8aab046af7a    // -33/64, expm1(-33/64)
            .quad 0xbfe0000000000000, 0xbfd92e9a0720d3ec    // -32/64, expm1(-32/64)
            .quad 0xbfdf000000000000, 0xbfd8921c445f4add    // -31/64, expm1(-31/64)
            .quad 0xbfde000000000000, 0xbfd7f327a018ddb2    // -30/64, expm1(-30/64)
            .quad 0xbfdd000000000000, 0xbfd751b22af608f0    // -29/64, expm1(-29/64)
            .quad 0xbfdc000000000000, 0xbfd6adb1cd9205ee    // -28/64, expm1(-28/64)
            .quad 0xbfdb000000000000, 0xbfd6071c47d953b2    // -27/64, expm1(-27/64)
            .quad 0xbfda000000000000, 0xbfd55de73065b4df    // -26/64, expm1(-26/64)
            .quad 0xbfd9000000000000, 0xbfd4b207f3d79870    // -25/64, expm1(-25/64)
            .quad 0xbfd8000000000000, 0xbfd40373d42ce2e3    // -24/64, expm1(-24/64)
            .quad 0xbfd7000000000000, 0xbfd3521fe8150d2b    // -23/64, expm1(-23/64)
            .quad 0xbfd6000000000000, 0xbfd29e011a428ec6    // -22/64, expm1(-22/64)
            .quad 0xbfd5000000000000, 0xbfd1e70c28b987f3    // -21/64, expm1(-21/64)
            .quad 0xbfd4000000000000, 0xbfd12d35a41ba104    // -20/64, expm1(-20/64)
            .quad 0xbfd3000000000000, 0xbfd07071eef11388    // -19/64, expm1(-19/64)
            .quad 0xbfd2000000000000, 0xbfcf616a79dda3a8    // -18/64, expm1(-18/64)
            .quad 0xbfd1000000000000, 0xbfcddbe7247382af    // -17/64, expm1(-17/64)
            .quad 0xbfd0000000000000, 0xbfcc5041854df7d4    // -16/64, expm1(-16/64)
            .quad 0xbfce000000000000, 0xbfcabe60e1f21836    // -15/64, expm1(-15/64)
            .quad 0xbfcc000000000000, 0xbfc9262c1c3430a1    // -14/64, expm1(-14/64)
            .quad 0xbfca000000000000, 0xbfc78789b0a5e0c0    // -13/64, expm1(-13/64)
            .quad 0xbfc8000000000000, 0xbfc5e25fb4fde211    // -12/64, expm1(-12/64)
            .quad 0xbfc6000000000000, 0xbfc43693d679612d    // -11/64, expm1(-11/64)
            .quad 0xbfc4000000000000, 0xbfc2840b5836cf67    // -10/64, expm1(-10/64)
            .quad 0xbfc2000000000000, 0xbfc0caab118a1278    // -9/64, expm1(-9/64)
            .quad 0xbfc0000000000000, 0xbfbe14aed893eef4    // -8/64, expm1(-8/64)
            .quad 0xbfbc000000000000, 0xbfba85e8c62d9c13    // -7/64, expm1(-7/64)
            .quad 0xbfb8000000000000, 0xbfb6e8caff341fea    // -6/64, expm1(-6/64)
            .quad 0xbfb4000000000000, 0xbfb33d1bb17df2e7    // -5/64, expm1(-5/64)
            .quad 0xbfb0000000000000, 0xbfaf0540438fd5c3    // -4/64, expm1(-4/64)
            .quad 0xbfa8000000000000, 0xbfa7723950130405    // -3/64, expm1(-3/64)
            .quad 0xbfa0000000000000, 0xbf9f8152aee9450e    // -2/64, expm1(-2/64)
            .quad 0xbf90000000000000, 0xbf8fc055004416db    // -1/64, expm1(-1/64)
expm1f_table:   .quad 0x0000000000000000, 0x0000000000000000    // 0/64, expm1(0/64)
            .quad 0x3f90000000000000, 0x3f90202ad5778e46    // 1/64, expm1(1/64)
            .quad 0x3fa0000000000000, 0x3fa040ac0224fd93    // 2/64, expm1(2/64)
            .quad 0x3fa8000000000000, 0x3fa89246d053d178    // 3/64, expm1(3/64)
            .quad 0x3fb0000000000000, 0x3fb082b577d34ed8    // 4/64, expm1(4/64)
            .quad 0x3fb4000000000000, 0x3fb4cd4fc989cd64    // 5/64, expm1(5/64)
            .quad 0x3fb8000000000000, 0x3fb92937074e0cd7    // 6/64, expm1(6/64)
            .quad 0x3fbc000000000000, 0x3fbd96b0eff0e794    // 7/64, expm1(7/64)
            .quad 0x3fc0000000000000, 0x3fc10b022db7ae68    // 8/64, expm1(8/64)
            .quad 0x3fc2000000000000, 0x3fc353bc9fb00b21    // 9/64, expm1(9/64)
            .quad 0x3fc4000000000000, 0x3fc5a5ac59b963cb    // 10/64, expm1(10/64)
            .quad 0x3fc6000000000000, 0x3fc800f67b00d7b8    // 11/64, expm1(11/64)
            .quad 0x3fc8000000000000, 0x3fca65c0b85ac1a9    // 12/64, expm1(12/64)
            .quad 0x3fca000000000000, 0x3fccd4315e9e0833    // 13/64, expm1(13/64)
            .quad 0x3fcc000000000000, 0x3fcf4c6f5508ee5d    // 14/64, expm1(14/64)
            .quad 0x3fce000000000000, 0x3fd0e7510fd7c564    // 15/64, expm1(15/64)
            .quad 0x3fd0000000000000, 0x3fd22d78f0fa061a    // 16/64, expm1(16/64)
            .quad 0x3fd1000000000000, 0x3fd378c3b0847980    // 17/64, expm1(17/64)
            .quad 0x3fd2000000000000, 0x3fd4c946033eb3de    // 18/64, expm1(18/64)
            .quad 0x3fd3000000000000, 0x3fd61f14f169ebc1    // 19/64, expm1(19/64)
            .quad 0x3fd4000000000000, 0x3fd77a45d8117fd5    // 20/64, expm1(20/64)
            .quad 0x3fd5000000000000, 0x3fd8daee6a60c961    // 21/64, expm1(21/64)
            .quad 0x3fd6000000000000, 0x3fda4124b2fe50cb    // 22/64, expm1(22/64)
            .quad 0x3fd7000000000000, 0x3fdbacff156c79d7    // 23/64, expm1(23/64)
            .quad 0x3fd8000000000000, 0x3fdd1e944f6fbdaa    // 24/64, expm1(24/64)
            .quad 0x3fd9000000000000, 0x3fde95fb7a7a88f8    // 25/64, expm1(25/64)
            .quad 0x3fda000000000000, 0x3fe009a6068f6a8c    // 26/64, expm1(26/64)
            .quad 0x3fdb000000000000, 0x3fe0cb4eee42c98b    // 27/64, expm1(27/64)
            .quad 0x3fdc000000000000, 0x3fe190048ef60020    // 28/64, expm1(28/64)
            .quad 0x3fdd000000000000, 0x3fe257d334137dff    // 29/64, expm1(29/64)
            .quad 0x3fde000000000000, 0x3fe322c75a963b98    // 30/64, expm1(30/64)
            .quad 0x3fdf000000000000, 0x3fe3f0edb1d18acd    // 31/64, expm1(31/64)
            .quad 0x3fe0000000000000, 0x3fe4c2531c3c0d38    // 32/64, expm1(32/64)
            .quad 0x3fe0800000000000, 0x3fe59704b03ddca9    // 33/64, expm1(33/64)
            .quad 0x3fe1000000000000, 0x3fe66f0fb901f2bd    // 34/64, expm1(34/64)
            .quad 0x3fe1800000000000, 0x3fe74a81b74adcac    // 35/64, expm1(35/64)
            .quad 0x3fe2000000000000, 0x3fe82968624ac88d    // 36/64, expm1(36/64)
            .quad 0x3fe2800000000000, 0x3fe90bd1a87ef9a1    // 37/64, expm1(37/64)
            .quad 0x3fe3000000000000, 0x3fe9f1cbb08eb151    // 38/64, expm1(38/64)
            .quad 0x3fe3800000000000, 0x3feadb64da2d9acf    // 39/64, expm1(39/64)
            .quad 0x3fe4000000000000, 0x3febc8abbf01c781    // 40/64, expm1(40/64)
            .quad 0x3fe4800000000000, 0x3fecb9af338d4a9c    // 41/64, expm1(41/64)
            .quad 0x3fe5000000000000, 0x3fedae7e481b8284    // 42/64, expm1(42/64)
            .quad 0x3fe5800000000000, 0x3feea72849b21ebd    // 43/64, expm1(43/64)
            .quad 0x3fe6000000000000, 0x3fefa3bcc305f191    // 44/64, expm1(44/64)
            .quad 0x3fe6800000000000, 0x3ff05225beb9ce55    // 45/64, expm1(45/64)
            .quad 0x3fe7000000000000, 0x3ff0d47240fe1412    // 46/64, expm1(46/64)
            .quad 0x3fe7800000000000, 0x3ff158cc0d22ca02    // 47/64, expm1(47/64)
            .quad 0x3fe8000000000000, 0x3ff1df3b68cfb9ef    // 48/64, expm1(48/64)
            .quad 0x3fe8800000000000, 0x3ff267c8bb05d2a3    // 49/64, expm1(49/64)
            .quad 0x3fe9000000000000, 0x3ff2f27c8ca598a0    // 50/64, expm1(50/64)
            .quad 0x3fe9800000000000, 0x3ff37f5f88f7b4e5    // 51/64, expm1(51/64)
            .quad 0x3fea000000000000, 0x3ff40e7a7e37aa30    // 52/64, expm1(52/64)
            .quad 0x3fea800000000000, 0x3ff49fd65e20b96f    // 53/64, expm1(53/64)
            .quad 0x3feb000000000000, 0x3ff5337c3e7cfe38    // 54/64, expm1(54/64)
            .quad 0x3feb800000000000, 0x3ff5c97559b6cc28    // 55/64, expm1(55/64)
            .quad 0x3fec000000000000, 0x3ff661cb0f6c564f    // 56/64, expm1(56/64)
            .quad 0x3fec800000000000, 0x3ff6fc86e505a9dd    // 57/64, expm1(57/64)
            .quad 0x3fed000000000000, 0x3ff799b2864d0569    // 58/64, expm1(58/64)
            .quad 0x3fed800000000000, 0x3ff83957c6099668    // 59/64, expm1(59/64)
            .quad 0x3fee000000000000, 0x3ff8db809e9ca670    // 60/64, expm1(60/64)
            .quad 0x3fee800000000000, 0x3ff9803732a14221    // 61/64, expm1(61/64)
            .quad 0x3fef000000000000, 0x3ffa2785cd8e63ad    // 62/64, expm1(62/64)
            .quad 0x3fef800000000000, 0x3ffad176e45bab25    // 63/64, expm1(63/64)
154
155
// Taylor series coefficients for e**d - 1 beyond the linear term, used on the
// |x| < 1.0f path of expm1f:  d + d*d*( 1/2 + d*( 1/6 + d/24 ) )
expm1f_taylor_polynomial:   .double 0.5                                 // 1/2!   (offset 0)
                            .double 0.16666666666666666666666666666     // 1/3!   (offset 8)
                            .double 0.04166666666666666666666666666     // 1/4!   (offset 16)
159
// Scalar double constants loaded with movsd by expm1f.
.literal8
reciprocalLn2:  .quad 0x3ff71547652b82fe    // 1.0 / ln(2); scales x for the exp2-based polynomial
two6:           .quad 0x4050000000000000    // 0x1.0p6 (64.0); scales x to an expm1f_table index
163
164
.text

// Position-independent addressing of the constants above.
//
//  x86_64: every symbol is addressed relative to %rip.
//  i386:   expm1f calls expm1f_pic, which leaves the address of the label
//          following the call (expm1f_body) in %ecx; symbols are then
//          addressed as an offset from that label through CX_P.
//
// NOTE(review): RELATIVE_ADDR_B is not expanded anywhere in this file, and on
// i386 it refers to expm1f_no_fenv_body, a label that is not defined here.
// It would fail to assemble if used as-is on i386 -- confirm whether it is
// still needed.
#if defined( __x86_64__ )
    #define RELATIVE_ADDR( _a)      (_a)( %rip )
    #define RELATIVE_ADDR_B( _a)    (_a)( %rip )
#elif defined( __i386__ )
    #define RELATIVE_ADDR( _a)      (_a)-expm1f_body( CX_P )
    #define RELATIVE_ADDR_B( _a)    (_a)-expm1f_no_fenv_body( CX_P )

// expm1f_pic: i386 helper that returns its own return address in %ecx.
//   In:    (%esp) = return address pushed by the calll
//   Out:   %ecx   = that return address
//   Stack and all other registers are left untouched.
.align 4
expm1f_pic:     movl    (%esp),     %ecx    // copy the return address (address of the label after the call) to %ecx
                ret
#else
    #error arch not supported
#endif
181
// Bit pattern of the float 0x1.0p-24f. expm1f subtracts this from the bits of
// |x| so that one unsigned compare separates the tiny, huge and NaN inputs
// from the ordinary range.
#define LOW_CUTOFF  0x33800000
184
//-----------------------------------------------------------------------
// float expm1f( float x )
//
// Returns e**x - 1. The arithmetic is carried out in double precision and
// rounded to single precision once at the end.
//
// In:      i386:   x is read from FRAME_SIZE(STACKP)
//          x86_64: x is in %xmm0
// Out:     i386:   result is stored over the argument slot and pushed on the
//                  x87 stack with flds
//          x86_64: result is in %xmm0
// Writes:  %eax (AX_P), %ecx (CX_P), DX_P, %xmm0-%xmm4, %xmm6, %xmm7, flags
//
// Dispatch (k = bits of |x| minus LOW_CUTOFF, held in %eax):
//      |x| <  0x1.0p-24f                   -> 4:  x * (1 + 0x1.0p-52)
//      0x1.0p-24f <= |x| < 1.0f            -> 2:  table lookup + Taylor series
//      1.0f <= |x| < 128*ln(2)             -> fall through: exp2 polynomial
//      -150*ln(2) <= x <= -128*ln(2)       -> 3: bounces back to 1: (exp2 polynomial)
//      any other finite |x| >= 128*ln(2)   -> 3:  overflow, or a value that rounds to -1.0f
//      |x| == Inf                          -> 5:  +Inf or -1.0f
//      NaN                                 -> 6:  x + x
//
// Packed-register comments below are written { high double, low double }.
//-----------------------------------------------------------------------
ENTRY( expm1f )
#if defined( __i386__ )
    movl        FRAME_SIZE(STACKP),     %eax
    movss       FRAME_SIZE(STACKP),     %xmm0
#else
    movd        %xmm0,                  %eax
#endif

    movl        %eax,                   %ecx        // keep the signed bits of x for the test at 3:
    andl        $0x7fffffff,            %eax        // bits of |x|
    cvtss2sd    %xmm0,                  %xmm2       // (double) x
    subl        $LOW_CUTOFF,            %eax        // k = |x| - LOW_CUTOFF; wraps to a huge unsigned value when |x| < 0x1.0p-24f
    cmpl        $(0x42b17218-LOW_CUTOFF), %eax      // if( |x| >= 128.0f * ln(2) || |x| < 0x1.0p-24f || isnan(x) )
    jae         3f

1:
//PIC
#if defined( __i386__ )
    calll       expm1f_pic                          // set %ecx to the address of expm1f_body
expm1f_body:
#endif

    cmpl        $(0x3f800000-LOW_CUTOFF), %eax
    jl          2f                                  // |x| < 1.0f

    // |x| >= 1.0f
    // Write x / ln(2) = i + f with i = trunc( x / ln(2) ), so e**x = 2**i * 2**f.
    movsd       RELATIVE_ADDR( reciprocalLn2 ), %xmm1   // 1 / ln(2)
    mulsd       %xmm2,                  %xmm1       // x / ln(2)
    cvttsd2si   %xmm1,                  %eax        // i = trunc( x / ln(2) )
    lea         RELATIVE_ADDR( expm1f_c), CX_P
    cvtsi2sd    %eax,                   %xmm3       // (double) i
    addl        $1023,                  %eax        // add bias for exponent
    subsd       %xmm3,                  %xmm1       // f = x / ln(2) - trunc( x / ln(2) )
    movd        %eax,                   %xmm7       // biased exponent of 2**i in the low bits
    psllq       $52,                    %xmm7       // shift into the exponent field: 2**i as a double

    // Evaluate p(f) = c0 + c1 f + c2 f**2 + ... + c8 f**8 (approximately 2**f - 1).
    // The two halves of the polynomial are evaluated in parallel in the two lanes.
    // In the comments below x stands for f.
#if defined( __SSE3__ )
    movddup     %xmm1,                  %xmm2       // { x, x }
#else
    movapd      %xmm1,                  %xmm2       // x
    unpcklpd    %xmm2,                  %xmm2       // { x, x }
#endif
    mulsd       %xmm1,                  %xmm1       // x*x
    movapd      %xmm2,                  %xmm3
    mulpd       48(CX_P),               %xmm2       // { c3x, (c7/c8)x }
    mulpd       16(CX_P),               %xmm3       // { c1x, (c5/c8)x }
#if defined( __SSE3__ )
    movddup     %xmm1,                  %xmm4       // { xx, xx }
#else
    movapd      %xmm1,                  %xmm4       // xx
    unpcklpd    %xmm4,                  %xmm4       // { xx, xx }
#endif
    mulsd       %xmm1,                  %xmm1       // xx*xx
    addpd       32(CX_P),               %xmm2       // { c2 + c3x, (c6/c8) + (c7/c8)x }
    addpd       (CX_P),                 %xmm3       // { c0 + c1x, (c4/c8) + (c5/c8)x }
    mulpd       %xmm4,                  %xmm2       // { c2xx + c3xxx, (c6/c8)xx + (c7/c8)xxx }
    addsd       %xmm1,                  %xmm3       // { c0 + c1x, (c4/c8) + (c5/c8)x + xxxx }
    mulsd       64(CX_P),               %xmm1       // c8 * xxxx
    addpd       %xmm2,                  %xmm3       // { c0 + c1x + c2xx + c3xxx, (c4/c8) + (c5/c8)x + (c6/c8)xx + (c7/c8)xxx + xxxx }
    movhlps     %xmm3,                  %xmm6       // { ?, c0 + c1x + c2xx + c3xxx }
    mulsd       %xmm1,                  %xmm3       // { ..., c8xxxx * ((c4/c8) + (c5/c8)x + (c6/c8)xx + (c7/c8)xxx + xxxx) }
    addsd       %xmm6,                  %xmm3       // c0 + c1x + c2xx + c3xxx + c4xxxx + c5xxxxx + c6xxxxxx + c7xxxxxxx + c8xxxxxxxx

    //result = 2**i * ((2**f-1) + 1) - 1
    //       = 2**i * (2**f-1) + 2**i - 1
    mulsd       %xmm7,                  %xmm3       // 2**i * p(f)
    subsd       72(CX_P),               %xmm7       // 2**i - 1.0  (1.0 is the high double of the last coefficient row)
    addsd       %xmm7,                  %xmm3       // 2**i * p(f) + (2**i - 1.0)

// convert to single precision and return
    cvtsd2ss    %xmm3,                  %xmm0
#if defined( __i386__ )
    movss       %xmm0,                  FRAME_SIZE(STACKP)
    flds        FRAME_SIZE(STACKP)
#endif
    ret

2:  // |x| < 1.0f
    // Split x = i/64 + d with i = trunc( 64 * x ), so |d| < 1/64.
    movsd       RELATIVE_ADDR( two6 ),  %xmm1       // 0x1.0p6
    mulsd       %xmm2,                  %xmm1       // x * 0x1.0p6
    cvttsd2si   %xmm1,                  %eax        // i = trunc( x * 0x1.0p6 )
#if defined( __x86_64__ )
    cdqe                                            // sign extend eax; i may be negative
#endif
    shl         $4,                     AX_P        // sizeof( double[2] ) * trunc( x * 0x1.0p6 )
    lea         RELATIVE_ADDR( expm1f_table ), DX_P // address of the i = 0 entry
    subsd       (DX_P, AX_P, 1),        %xmm2       // d = x - i/64
    movsd       8(DX_P, AX_P, 1),       %xmm7       // y = e**(i/64) - 1

    //calculate z = e**d - 1 using a 4th order Taylor series
    lea         RELATIVE_ADDR( expm1f_taylor_polynomial), CX_P

    movsd       (2*8)(CX_P),            %xmm3       // 1/24
    mulsd       %xmm2,                  %xmm3       // d/24
    movapd      %xmm2,                  %xmm4       // d
    addsd       (1*8)(CX_P),            %xmm3       // 1/6 + d/24
    mulsd       %xmm4,                  %xmm4       // d * d
    mulsd       %xmm2,                  %xmm3       // d/6 + dd/24
    addsd       (CX_P),                 %xmm3       // 1/2 + d/6 + dd/24
    mulsd       %xmm4,                  %xmm3       // dd/2 + ddd/6 + dddd/24
    addsd       %xmm2,                  %xmm3       // z = d + dd/2 + ddd/6 + dddd/24

    //we have to do a little reduction here, otherwise we end up doing a 6th order Taylor series and it still is not very good:
    //  e**x-1 = e**(i/64 + d) - 1                          i/64 is a table entry chosen so |d| < 1/64
    //         = [(e**(i/64)-1)+1][(e**(d)-1)+1] - 1        = (y+1)(z+1)-1      y = e**(i/64)-1, z = e**(d)-1
    //         = yz + y + z + 1 - 1                         = y + z + yz
    movapd      %xmm7,                  %xmm4       // y
    mulsd       %xmm3,                  %xmm4       // y*z
    addsd       %xmm3,                  %xmm4       // yz + z
    addsd       %xmm7,                  %xmm4       // (yz + z) + y

// convert to single precision and return
    cvtsd2ss    %xmm4,                  %xmm0
#if defined( __i386__ )
    movss       %xmm0,                  FRAME_SIZE(STACKP)
    flds        FRAME_SIZE(STACKP)
#endif
    ret

3:  // |x| >= 128.0f * ln(2) || |x| < 0x1.0p-24f || isnan(x)
    // The flags are still those of the cmpl before the jae that brought us here.
    // For |x| < 0x1.0p-24f the subtraction of LOW_CUTOFF went negative, so the
    // signed test below selects exactly that case.
    jl          4f

    // |x| >= 128.0f * ln(2) || isnan(x)

    //handle inf and NaN arguments
    cmpl        $(0x7f800000-LOW_CUTOFF), %eax
    je          5f                                  // |x| == inf
    jg          6f                                  // isnan( x )

    //bounce moderately large negative arguments back to the main code path
    //  signed compare on the raw bits of x: true only for negative x with |x| <= 150.0f * ln(2)
    cmpl        $0xc2cff1b4,            %ecx        // if( -128.0f*ln(2) >= x >= -150.0f*ln(2) )
    jle         1b                                  //      go back to 1

    // remaining finite |x| >= 128.0f * ln(2): build the result arithmetically
    // so that the floating-point flags are raised by the hardware
    xorps       %xmm1,                  %xmm1       // 0.0f
    movl        $0x7f7fffff,            %ecx        // MAX_FLOAT
    movl        $0x21800000,            %eax        // 0x1.0p-60f
    cmpltss     %xmm1,                  %xmm0       // mask = x < 0
    movd        %ecx,                   %xmm3
    movd        %eax,                   %xmm4
    movl        $0x3f800000,            %eax        // 1.0f
    andps       %xmm0,                  %xmm4       // x < 0 ? 0x1.0p-60f : 0
    andnps      %xmm3,                  %xmm0       // x < 0 ? 0 : MAX_FLOAT
    orps        %xmm4,                  %xmm0       // x < 0 ? 0x1.0p-60f : MAX_FLOAT
    mulss       %xmm0,                  %xmm0       // x < 0 ? 0x1.0p-120f : Inf, overflow
    movd        %eax,                   %xmm1
    subss       %xmm1,                  %xmm0       // x < 0 ? 0x1.0p-120f - 1.0f (inexact, round to correct result) : Inf, overflow

#if defined( __i386__ )
    movss       %xmm0,                  FRAME_SIZE( STACKP )
    flds        FRAME_SIZE(STACKP)
#endif
    ret

4:  // |x| < 0x1.0p-24f: the result is x itself after rounding to single precision
    movl        $0x3cb00000,            %eax        // 0x1.0p-52 >> 32
    movl        $0x3ff00000,            %ecx        // 1.0 >> 32
    movd        %eax,                   %xmm1
    movd        %ecx,                   %xmm3
    psllq       $32,                    %xmm1       // 0x1.0p-52
    psllq       $32,                    %xmm3       // 1.0
    addsd       %xmm3,                  %xmm1       // 1.0 + DBL_EPSILON
    mulsd       %xmm1,                  %xmm2       // nudge towards correctly rounded; a zero input stays zero
    cvtsd2ss    %xmm2,                  %xmm0       // round correctly, set inexact
#if defined( __i386__ )
    movss       %xmm0,                  FRAME_SIZE( STACKP )
    flds        FRAME_SIZE(STACKP)
#endif
    ret

5:  // |x| == inf, return -1 or Inf
    movl        $0x3f800000,            %eax        // 1.0f
    movl        $0x7f800000,            %ecx        // +Inf
    movd        %eax,                   %xmm1
    movd        %ecx,                   %xmm3
    cmpeqss     %xmm0,                  %xmm3       // mask = x == inf
    andps       %xmm3,                  %xmm0       // x == inf ? inf : 0
    subss       %xmm1,                  %xmm0       // x == inf ? inf : -1.0
#if defined( __i386__ )
    movss       %xmm0,                  FRAME_SIZE( STACKP )
    flds        FRAME_SIZE(STACKP)
#endif
    ret

6:  // x is NaN
    addss       %xmm0,                  %xmm0       // x + x: the NaN propagates through the add
#if defined( __i386__ )
    movss       %xmm0,                  FRAME_SIZE( STACKP )
    flds        FRAME_SIZE(STACKP)
#endif
    ret
378
379