this repo has no description
1/*
2 * expf.s
3 *
4 * by Ian Ollmann
5 *
6 * Copyright (c) 2007, Apple Inc. All Rights Reserved.
7 *
8 * This file implements the C99 expf function for the MacOS X __i386__ and __x86_64__ abis.
9 */
10
11#include <machine/asm.h>
12#include "abi.h"
13
14//
15// Overall approach
16//
17// We calculate e**(y) as follows:
18//
19// e**y = 2**x x = y / ln(2)
20//
21// ...and proceed largely as we do in exp2f():
22//
23// i = trunc(x)
24// f = x - i
25//
26// 2**(x) = 2**(f+i) = 2**f * 2**i
27//
28// We choose trunc here, so that 2**f always has the property of pushing 2**i that much closer to the
29// extrema of finite, non-zero floating point computation. That is:
30//
31// x < 1 --> 2**f <= 1
32// x == 1 --> 2**f = 1
33// x > 1 --> 2**f >= 1
34//
35// This means that our 2**i calculation will not underflow/overflow prematurely before 2**f has had its say.
36// Round to nearest even would produce somewhat more accurate results, but accomplishing something like that
37// rounding without knowledge of the current rounding mode is expensive.
38//
39// In principle, the 2**f part can further be broken up as:
40//
41// 2**f = 2**(c+r) = 2**c * 2**r
42//
43// Where c is chosen so as to make r as close to 0 as possible and c can be determined fairly exactly by
44// consulting a lookup table. Our choice of trunc above means we cant sure of the sign of f,
45// so our table ends up being twice as large as it would have if we had chosen floor or ceil. On the bright
46// side, we never have to worry about f = x - trunc(f) being inexact in single precision.
47//
48// Unfortunately, in practice this reduction seemed to be bad for throughput, so this implementation went with
49// a large polynomial instead that covers the range -1,1 or -0.5,0.5 depending on whether we know the rounding
50// mode is nearest or not.
51//
52// Most of the arithmetic will be done in double precision. I investigated calculating the fractional term as 2**r-1, so as
53// to preserve precision -- we expect a lot of zeros between the 1 and the first non-zero bit. Normally, we dont want
54// throw that away until the final operation so that non-default rounding modes arrive at something close to the
55// right answer. Doing float in double precision means that this is unnecessary however.
56//
57// This code is probably worth revisiting once more to see if we can come up with a cheap reduction. These huge
58// polynomials just arent good for latency.
59//
60//
61
62.align 4
63// 8th order minimax fit of exp2 on [-1.0,1.0]. |error| < 0.402865722354948566583852e-9:
64expf_c: .quad 0x40bc03f30399c376, 0x3ff000000001ea2a // c4/c8 = 0.961813690023115610862381719985771e-2 / 0.134107709538786543922336536865157e-5, c0 = 1.0 + 0.278626872016317130037181614004e-10
65 .quad 0x408f10e7f73e6d8f, 0x3fe62e42fd0933ee // c5/c8 = 0.133318252930790403741964203236548e-2 / 0.134107709538786543922336536865157e-5, c1 = .693147176943623740308984004029708
66 .quad 0x405cb616a9384e69, 0x3fcebfbdfd0f0afa // c6/c8 = 0.154016177542147239746127455226575e-3 / 0.134107709538786543922336536865157e-5, c2 = .240226505817268621584559118975830
67 .quad 0x4027173ebd288ba1, 0x3fac6b0a74f15403 // c7/c8 = 0.154832722143258821052933667742417e-4 / 0.134107709538786543922336536865157e-5, c3 = 0.555041568519883074165425891257052e-1
68 .quad 0x3eb67fe1dc3105ba // c8 = 0.134107709538786543922336536865157e-5
69
70
71.align 3
72expf_nofenv_c: .quad 0x3ff0000000058fca // 1.0 + -8.09329727503262700660348520172e-11
73 .double 0.693147206709644041218074094717934
74 .double 0.240226515050550309232521176082490
75 .double 0.0555032721577134480296332498187050
76 .double 0.00961799451418197772157647729048837
77 .double 0.00134004316655903280893023589328151
78 .double 0.000154780222739739895295319810010147
79
80.literal8
81reciprocalLn2: .quad 0x3ff71547652b82fe // 1.0 / ln(2)
82
83.text
84
85#if defined( __x86_64__ )
86 #define RELATIVE_ADDR( _a) (_a)( %rip )
87 #define RELATIVE_ADDR_B( _a) (_a)( %rip )
88#elif defined( __i386__ )
89 #define RELATIVE_ADDR( _a) (_a)-expf_body( CX_P )
90 #define RELATIVE_ADDR_B( _a) (_a)-expf_no_fenv_body( CX_P )
91
92//a short routine to get the local address
93.align 4
94expf_pic: movl (%esp), %ecx //copy address of local_addr to %ecx
95 ret
96#else
97 #error arch not supported
98#endif
99
100//Accurate to within 0.5068 ulp in round to nearest.
101ENTRY( expf )
102#if defined( __i386__ )
103 movl FRAME_SIZE(STACKP), %eax
104 movss FRAME_SIZE(STACKP), %xmm0
105#else
106 movd %xmm0, %eax
107#endif
108
109 cvtss2sd %xmm0, %xmm2 // (double) x
110 movl %eax, %ecx // x
111 andl $0x7fffffff, %eax // |x|vl
112 movl $1023, %edx // double precision bias
113 subl $0x33b8aa3b, %eax // subtract 0x1.0p-24f / ln(2)
114 cmpl $0x0f7d1d57, %eax // if( |x| > 126/ln(2) || isnan(x) || |x| < 0x1.0p-26f /ln(2) )
115 ja 3f // goto 3
116
117 //The test above was a little too aggressive.
118 //values 126 < x < 128, and -150 < x < -126 will come back here:
119 //For x < -126, we change %edx to hold the exponent for 2**-126,
120 // and subtract -126 from x
121
1221:
123//PIC
124#if defined( __i386__ )
125 calll expf_pic // set %ecx to point to local_addr
126expf_body:
127#endif
128
129 mulsd RELATIVE_ADDR( reciprocalLn2 ), %xmm2
130 lea RELATIVE_ADDR( expf_c), CX_P
131
132
133 //extract fractional part and exit early if it is zero
134 cvttsd2si %xmm2, %eax // trunc(x)
135 addl %eax, %edx // calculate biased exponent of result
136 cvtsi2sd %eax, %xmm1 // trunc(x)
137 movd %edx, %xmm7 // exponent part of result
138 psllq $52, %xmm7 // shift result exponent into place
139 subsd %xmm1, %xmm2 // get the fractional part
140
141
142
143 // c0 + c1x1 + c2x2 + c3x3 + c4x4 + c5x5 + c6x6 + c7x7 + c8x8
144#if defined( __SSE3__ )
145 movddup %xmm2, %xmm1 // { x, x }
146#else
147 movapd %xmm2, %xmm1 // x
148 unpcklpd %xmm1, %xmm1 // { x, x }
149#endif
150 mulsd %xmm2, %xmm2 // x*x
151 movapd %xmm1, %xmm3
152 mulpd 48(CX_P), %xmm1 // { c3x, (c7/c8)x }
153 mulpd 16(CX_P), %xmm3 // { c1x, (c5/c8)x }
154#if defined( __SSE3__ )
155 movddup %xmm2, %xmm4 // { xx, xx }
156#else
157 movapd %xmm2, %xmm4 // xx
158 unpcklpd %xmm4, %xmm4 // { xx, xx }
159#endif
160 mulsd %xmm2, %xmm2 // xx*xx
161 addpd 32(CX_P), %xmm1 // { c2 + c3x, (c6/c8) + (c7/c8)x }
162 addpd (CX_P), %xmm3 // { c0 + c1x, (c4/c8) + (c5/c8)x }
163 mulpd %xmm4, %xmm1 // { c2xx + c3xxx, (c6/c8)xx + (c7/c8)xxx }
164 addsd %xmm2, %xmm3 // { c0 + c1x, (c4/c8) + (c5/c8)x + xxxx }
165 mulsd 64(CX_P), %xmm2 // c8 * xxxx
166 addpd %xmm1, %xmm3 // { c0 + c1x + c2xx + c3xxx, (c4/c8) + (c5/c8)x + (c6/c8)xx + (c7/c8)xxx + xxxx }
167 movhlps %xmm3, %xmm6 // { ?, c0 + c1x + c2xx + c3xxx }
168 mulsd %xmm2, %xmm3 // { ..., c8xxxx* ((c4/c8) + (c5/c8)x + (c6/c8)xx + (c7/c8)xxx + xxxx) }
169 addsd %xmm6, %xmm3 // c0 + c1x + c2xx + c3xxx + c4xxxx + c5xxxxx + c6xxxxxx + c7xxxxxxx + c8xxxxxxxxx
170 mulsd %xmm7, %xmm3 // 2**i * {c0 + c1x + c2xx + c3xxx + c4xxxx + c5xxxxx + c6xxxxxx + c7xxxxxxx + c8xxxxxxxxx}
171
172// convert to single precision and return
173 cvtsd2ss %xmm3, %xmm0
174#if defined( __i386__ )
175 movss %xmm0, FRAME_SIZE(STACKP)
176 flds FRAME_SIZE(STACKP)
177#endif
178 ret
179
180
1813: // |x| > 126 || isnan(x) || |x| < 0x1.0p-26f
182 jge 4f // if( |x| > 126 || isnan(x) ) goto 4
183
184 //fall through for common case of |x| < 0x1.0p-26f
185 movl $0x3f800000, %edx // 1.0f
186 movd %edx, %xmm1 // 1.0f
187 addss %xmm1, %xmm0 // round away from 1.0 as appropriate for current rounding mode
188#if defined( __i386__ )
189 movss %xmm0, FRAME_SIZE(STACKP)
190 flds FRAME_SIZE(STACKP)
191#endif
192 ret
193
194 // |x| > 126 || isnan(x)
1954: addl $0x33b8aa3b, %eax // restore |x|
196 cmpl $0x7f800000, %eax //
197 ja 5f // if isnan(x) goto 5
198 je 6f // if isinf(x) goto 6
199
200 cmpl $0xc358677d, %ecx // if( x <= -150 /ln(2) )
201 jae 7f // goto 7
202
203 cmpl $0x4338aa3b, %ecx // if( x >= 128 /ln(2) )
204 jge 8f // goto 8
205
206 jmp 1b
207
208
2095: //x is nan
210 addss %xmm0, %xmm0 //Quiet the NaN
211#if defined( __i386__ )
212 movss %xmm0, FRAME_SIZE(STACKP)
213 flds FRAME_SIZE(STACKP)
214#endif
215 ret
216
2176: // |x| is inf
218 movss %xmm0, %xmm1 // x
219 psrad $31, %xmm0 // x == -Inf ? -1U : 0
220 andnps %xmm1, %xmm0 // x == -Inf ? 0 : Inf
221#if defined( __i386__ )
222 movss %xmm0, FRAME_SIZE(STACKP)
223 flds FRAME_SIZE(STACKP)
224#endif
225 ret
226
2277: // x <= -150
228 movl $0x00800001, %eax
229 movd %eax, %xmm0
230 mulss %xmm0, %xmm0 //Create correctly rounded result, set inexact/underflow
231#if defined( __i386__ )
232 movss %xmm0, FRAME_SIZE(STACKP)
233 flds FRAME_SIZE(STACKP)
234#endif
235 ret
236
2378: // x >= 128.0
238 movl $0x7f7fffff, %eax // FLT_MAX
239 movd %eax, %xmm0 // FLT_MAX
240 addss %xmm0, %xmm0 // Inf, set overflow
241#if defined( __i386__ )
242 movss %xmm0, FRAME_SIZE(STACKP)
243 flds FRAME_SIZE(STACKP)
244#endif
245 ret
246
247//Uses round to nearest to deliver a reduced value between [ -0.5, 0.5 ]
248//This allows us to use a smaller polynomial
249//In addition, we dont bother to check for x is integer. This saves a few cycles in the general case.
250//Accurate to within 0.534 ulp
251ENTRY( expf$fenv_access_off )
252#if defined( __i386__ )
253 movl FRAME_SIZE(STACKP), %eax
254 movss FRAME_SIZE(STACKP), %xmm0
255#else
256 movd %xmm0, %eax
257#endif
258
259 cvtss2sd %xmm0, %xmm2 // (double) x
260 movl %eax, %ecx // x
261 andl $0x7fffffff, %eax // |x|vl
262 movl $1023, %edx // double precision bias
263 subl $0x33b8aa3b, %eax // subtract 0x1.0p-24f / ln(2)
264 cmpl $0x0f7d1d57, %eax // if( |x| > 126/ln(2) || isnan(x) || |x| < 0x1.0p-26f /ln(2) )
265 ja 3f // goto 3
266
267 //The test above was a little too aggressive.
268 //values 126 < x < 128, and -150 < x < -126 will come back here:
269 //For x < -126, we change %edx to hold the exponent for 2**-126,
270 // and subtract -126 from x
271
2721:
273//PIC
274#if defined( __i386__ )
275 calll expf_pic // set %ecx to point to local_addr
276expf_no_fenv_body:
277#endif
278
279 mulsd RELATIVE_ADDR( reciprocalLn2 ), %xmm2
280 lea RELATIVE_ADDR_B( expf_nofenv_c), CX_P
281
282 //extract fractional part and exit early if it is zero
283 andl $0x80000000, %ecx // signof(x)
284 orl $0x43300000, %ecx // copysign( 0x1.0p52, x ) >> 32
285 movd %ecx, %xmm6 // copysign( 0x1.0p52, x ) >> 32
286 movaps %xmm2, %xmm1 // x
287 psllq $32, %xmm6 // copysign( 0x1.0p52, x )
288 addsd %xmm6, %xmm2 // x + copysign( 0x1.0p23f, x )
289 subsd %xmm6, %xmm2 // rint(x)
290 cvttsd2si %xmm2, %eax // rint(x)
291 addl %eax, %edx // rint(x) + bias
292 movd %edx, %xmm7 // result exponent >> 32
293 subsd %xmm2, %xmm1 // fractional part
294 psllq $52, %xmm7 // shift result exponent into place
295
296 movsd (3*8)(CX_P), %xmm3 // c3
297 movsd (5*8)(CX_P), %xmm5 // c5
298 movsd (6*8)(CX_P), %xmm6 // c6
299 movapd %xmm1, %xmm2 // x
300 mulsd %xmm1, %xmm1 // x*x
301 mulsd %xmm2, %xmm3 // c3*x
302 mulsd %xmm2, %xmm5 // c5*x
303 mulsd (1*8)(CX_P), %xmm2 // c1*x
304 mulsd %xmm1, %xmm6 // c6*xx
305 addsd (2*8)(CX_P), %xmm3 // c2+c3x
306 movapd %xmm1, %xmm4 // xx
307 mulsd %xmm1, %xmm1 // xx*xx
308 addsd (4*8)(CX_P), %xmm5 // c4+c5x
309 addsd (CX_P), %xmm2 // c0+c1x
310 mulsd %xmm4, %xmm3 // c2xx+c3xxx
311 addsd %xmm6, %xmm5 // c4+c5x+c6xx
312 mulsd %xmm1, %xmm5 // c4xxxx+c5xxxxx+c6xxxxxx
313 addsd %xmm2, %xmm3 // c0+c1x+c2xx+c3xxx
314 addsd %xmm5, %xmm3 // c0+c1x+c2xx+c3xxx+c4xxxx+c5xxxxx+c6xxxxxx
315
316 mulsd %xmm7, %xmm3 // scale by 2**i
317
318// convert to single precision and return
319 cvtsd2ss %xmm3, %xmm0
320#if defined( __i386__ )
321 movss %xmm0, FRAME_SIZE(STACKP)
322 flds FRAME_SIZE(STACKP)
323#endif
324 ret
325
326
3273: // |x| > 126 || isnan(x) || |x| < 0x1.0p-26f
328 jge 4f // if( |x| > 126 || isnan(x) ) goto 4
329
330 //fall through for common case of |x| < 0x1.0p-26f
331 movl $0x3f800000, %edx // 1.0f
332 movd %edx, %xmm1 // 1.0f
333 addss %xmm1, %xmm0 // round away from 1.0 as appropriate for current rounding mode
334#if defined( __i386__ )
335 movss %xmm0, FRAME_SIZE(STACKP)
336 flds FRAME_SIZE(STACKP)
337#endif
338 ret
339
340 // |x| > 126 || isnan(x)
3414: addl $0x33b8aa3b, %eax // restore |x|
342 cmpl $0x7f800000, %eax //
343 ja 5f // if isnan(x) goto 5
344 je 6f // if isinf(x) goto 6
345
346 cmpl $0xc358677d, %ecx // if( x <= -150 /ln(2) )
347 jae 7f // goto 7
348
349 cmpl $0x4338aa3b, %ecx // if( x >= 128 /ln(2) )
350 jge 8f // goto 8
351
352 jmp 1b
353
354
3555: //x is nan
356 addss %xmm0, %xmm0 //Quiet the NaN
357#if defined( __i386__ )
358 movss %xmm0, FRAME_SIZE(STACKP)
359 flds FRAME_SIZE(STACKP)
360#endif
361 ret
362
3636: // |x| is inf
364 movss %xmm0, %xmm1 // x
365 psrad $31, %xmm0 // x == -Inf ? -1U : 0
366 andnps %xmm1, %xmm0 // x == -Inf ? 0 : Inf
367#if defined( __i386__ )
368 movss %xmm0, FRAME_SIZE(STACKP)
369 flds FRAME_SIZE(STACKP)
370#endif
371 ret
372
3737: // x <= -150
374 movl $0x00800001, %eax
375 movd %eax, %xmm0
376 mulss %xmm0, %xmm0 //Create correctly rounded result, set inexact/underflow
377#if defined( __i386__ )
378 movss %xmm0, FRAME_SIZE(STACKP)
379 flds FRAME_SIZE(STACKP)
380#endif
381 ret
382
3838: // x >= 128.0
384 movl $0x7f800000, %eax // Inf
385#if defined( __i386__ )
386 movl %eax, FRAME_SIZE(STACKP)
387 flds FRAME_SIZE(STACKP)
388#else
389 movd %eax, %xmm0
390#endif
391 ret
392