this repo has no description
1/*
2 * exp2f.s
3 *
4 * by Ian Ollmann
5 *
6 * Copyright (c) 2007, Apple Inc. All Rights Reserved.
7 *
8 * This file implements the C99 exp2f function for the MacOS X __i386__ and __x86_64__ abis.
9 */
10
11#include <machine/asm.h>
12#include "abi.h"
13
14//
15// Overall approach
16//
17// We break up 2**(x) as follows:
18//
19// i = trunc(x)
20// f = x - i
21//
22// 2**(x) = 2**(f+i) = 2**f * 2**i
23//
24// We choose trunc here, so that 2**f always has the property of pushing 2**i that much closer to the
25// extrema of finite, non-zero floating point computation. That is:
26//
27// x < 1 --> 2**f <= 1
28// x == 1 --> 2**f = 1
29// x > 1 --> 2**f >= 1
30//
31// This means that our 2**i calculation will not underflow/overflow prematurely before 2**f has had its say.
32// Round to nearest even would produce somewhat more accurate results, but accomplishing something like that
33// rounding without knowledge of the current rounding mode is expensive.
34//
35// In principle, the 2**f part can further be broken up as:
36//
37// 2**f = 2**(c+r) = 2**c * 2**r
38//
39// Where c is chosen so as to make r as close to 0 as possible and c can be determined fairly exactly by
40// consulting a lookup table. Our choice of trunc above means we cant sure of the sign of f,
41// so our table ends up being twice as large as it would have if we had chosen floor or ceil. On the bright
42// side, we never have to worry about f = x - trunc(f) being inexact in single precision.
43//
44// Unfortunately, in practice this reduction seemed to be bad for throughput, so this implementation went with
45// a large polynomial instead that covers the range -1,1 or -0.5,0.5 depending on whether we know the rounding
46// mode is nearest or not.
47//
48// Most of the arithmetic will be done in double precision. I investigated calculating the fractional term as 2**r-1, so as
49// to preserve precision -- we expect a lot of zeros between the 1 and the first non-zero bit. Normally, we dont want
50// throw that away until the final operation so that non-default rounding modes arrive at something close to the
51// right answer. Doing float in double precision means that this is unnecessary however.
52//
53// This code is probably worth revisiting once more to see if we can come up with a cheap reduction. These huge
54// polynomials just arent good for latency.
55//
56//
57
58.align 4
59// 8th order minimax fit of exp2 on [-1.0,1.0]. |error| < 0.402865722354948566583852e-9:
60exp2f_c: .quad 0x40bc03f30399c376, 0x3ff000000001ea2a // c4/c8 = 0.961813690023115610862381719985771e-2 / 0.134107709538786543922336536865157e-5, c0 = 1.0 + 0.278626872016317130037181614004e-10
61 .quad 0x408f10e7f73e6d8f, 0x3fe62e42fd0933ee // c5/c8 = 0.133318252930790403741964203236548e-2 / 0.134107709538786543922336536865157e-5, c1 = .693147176943623740308984004029708
62 .quad 0x405cb616a9384e69, 0x3fcebfbdfd0f0afa // c6/c8 = 0.154016177542147239746127455226575e-3 / 0.134107709538786543922336536865157e-5, c2 = .240226505817268621584559118975830
63 .quad 0x4027173ebd288ba1, 0x3fac6b0a74f15403 // c7/c8 = 0.154832722143258821052933667742417e-4 / 0.134107709538786543922336536865157e-5, c3 = 0.555041568519883074165425891257052e-1
64 .quad 0x3eb67fe1dc3105ba // c8 = 0.134107709538786543922336536865157e-5
65
66
67.align 3
68exp2f_nofenv_c: .quad 0x3ff0000000058fca // 1.0 + -8.09329727503262700660348520172e-11
69 .double 0.693147206709644041218074094717934
70 .double 0.240226515050550309232521176082490
71 .double 0.0555032721577134480296332498187050
72 .double 0.00961799451418197772157647729048837
73 .double 0.00134004316655903280893023589328151
74 .double 0.000154780222739739895295319810010147
75
76
77
78.text
79
80#if defined( __x86_64__ )
81 #define RELATIVE_ADDR( _a) (_a)( %rip )
82 #define RELATIVE_ADDR_B( _a) (_a)( %rip )
83#elif defined( __i386__ )
84 #define RELATIVE_ADDR( _a) (_a)-exp2f_body( CX_P )
85 #define RELATIVE_ADDR_B( _a) (_a)-exp2f_no_fenv_body( CX_P )
86
87//a short routine to get the local address
88.align 4
89exp2f_pic: movl (%esp), %ecx //copy address of local_addr to %ecx
90 ret
91#else
92 #error arch not supported
93#endif
94
95//Accurate to within 0.5034 ulp in round to nearest.
96ENTRY( exp2f )
97#if defined( __i386__ )
98 movl FRAME_SIZE(STACKP), %eax
99 movss FRAME_SIZE(STACKP), %xmm0
100#else
101 movd %xmm0, %eax
102#endif
103
104 movl %eax, %ecx // x
105 andl $0x7fffffff, %eax // |x|vl
106 movl $1023, %edx // double precision bias
107 subl $0x32800000, %eax // subtract 0x1.0p-26f
108 cmpl $0x107c0000, %eax // if( |x| > 126 || isnan(x) || |x| < 0x1.0p-26f )
109 ja 3f // goto 3
110
111 //The test above was a little too aggressive.
112 //values 126 < x < 128, and -150 < x < -126 will come back here:
113 //For x < -126, we change %edx to hold the exponent for 2**-126,
114 // and subtract -126 from x
115
1161: //extract fractional part and exit early if it is zero
117 cvttss2si %xmm0, %eax // trunc(x)
118 cvtss2sd %xmm0, %xmm2 // x
119 addl %eax, %edx // calculate biased exponent of result
120 cvtsi2sd %eax, %xmm1 // trunc(x)
121 movd %edx, %xmm7 // exponent part of result
122 ucomisd %xmm2, %xmm1 // check to see if x was an integer
123 psllq $52, %xmm7 // shift result exponent into place
124 je 2f // if x was an integer, goto 2
125
126//PIC
127#if defined( __i386__ )
128 calll exp2f_pic // set %ecx to point to local_addr
129exp2f_body:
130#endif
131
132 subsd %xmm1, %xmm2 // get the fractional part
133 lea RELATIVE_ADDR( exp2f_c), CX_P
134
135 // c0 + c1x1 + c2x2 + c3x3 + c4x4 + c5x5 + c6x6 + c7x7 + c8x8
136#if defined( __SSE3__ )
137 movddup %xmm2, %xmm1 // { x, x }
138#else
139 movapd %xmm2, %xmm1 // x
140 unpcklpd %xmm1, %xmm1 // { x, x }
141#endif
142 mulsd %xmm2, %xmm2 // x*x
143 movapd %xmm1, %xmm3
144 mulpd 48(CX_P), %xmm1 // { c3x, (c7/c8)x }
145 mulpd 16(CX_P), %xmm3 // { c1x, (c5/c8)x }
146#if defined( __SSE3__ )
147 movddup %xmm2, %xmm4 // { xx, xx }
148#else
149 movapd %xmm2, %xmm4 // xx
150 unpcklpd %xmm4, %xmm4 // { xx, xx }
151#endif
152 mulsd %xmm2, %xmm2 // xx*xx
153 addpd 32(CX_P), %xmm1 // { c2 + c3x, (c6/c8) + (c7/c8)x }
154 addpd (CX_P), %xmm3 // { c0 + c1x, (c4/c8) + (c5/c8)x }
155 mulpd %xmm4, %xmm1 // { c2xx + c3xxx, (c6/c8)xx + (c7/c8)xxx }
156 addsd %xmm2, %xmm3 // { c0 + c1x, (c4/c8) + (c5/c8)x + xxxx }
157 mulsd 64(CX_P), %xmm2 // c8 * xxxx
158 addpd %xmm1, %xmm3 // { c0 + c1x + c2xx + c3xxx, (c4/c8) + (c5/c8)x + (c6/c8)xx + (c7/c8)xxx + xxxx }
159 movhlps %xmm3, %xmm6 // { ?, c0 + c1x + c2xx + c3xxx }
160 mulsd %xmm2, %xmm3 // { ..., c8xxxx* ((c4/c8) + (c5/c8)x + (c6/c8)xx + (c7/c8)xxx + xxxx) }
161 addsd %xmm6, %xmm3 // c0 + c1x + c2xx + c3xxx + c4xxxx + c5xxxxx + c6xxxxxx + c7xxxxxxx + c8xxxxxxxxx
162 mulsd %xmm7, %xmm3 // 2**i * {c0 + c1x + c2xx + c3xxx + c4xxxx + c5xxxxx + c6xxxxxx + c7xxxxxxx + c8xxxxxxxxx}
163
164// convert to single precision and return
165 cvtsd2ss %xmm3, %xmm0
166#if defined( __i386__ )
167 movss %xmm0, FRAME_SIZE(STACKP)
168 flds FRAME_SIZE(STACKP)
169#endif
170 ret
171
172//x is an integer
1732: // find 2**x
174 xorps %xmm0, %xmm0
175 cvtsd2ss %xmm7, %xmm0
176#if defined( __i386__ )
177 movss %xmm0, FRAME_SIZE(STACKP)
178 flds FRAME_SIZE(STACKP)
179#endif
180 ret
181
182
1833: // |x| > 126 || isnan(x) || |x| < 0x1.0p-26f
184 jge 4f // if( |x| > 126 || isnan(x) ) goto 4
185
186 //fall through for common case of |x| < 0x1.0p-26f
187 movl $0x3f800000, %edx // 1.0f
188 movd %edx, %xmm1 // 1.0f
189 addss %xmm1, %xmm0 // round away from 1.0 as appropriate for current rounding mode
190#if defined( __i386__ )
191 movss %xmm0, FRAME_SIZE(STACKP)
192 flds FRAME_SIZE(STACKP)
193#endif
194 ret
195
196 // |x| > 126 || isnan(x)
1974: addl $0x32800000, %eax // restore |x|
198 cmpl $0x7f800000, %eax //
199 ja 5f // if isnan(x) goto 5
200 je 6f // if isinf(x) goto 6
201
202 cmpl $0xc3160000, %ecx // if( x <= -150 )
203 jae 7f // goto 7
204
205 cmpl $0x43000000, %ecx // if( x >= 128 )
206 jge 8f // goto 8
207
208 cmpl $0, %ecx // result is large finite
209 jge 1b // back to the main body
210
211 // result is between -150 < x < -126, denormal result
212 addl $-126, %edx // subtract 126 from exponent bias
213 movl $0x42fc0000, %eax // 126.0f
214 movd %eax, %xmm1 // 126.0f
215 addss %xmm1, %xmm0 // x += 126.0f
216 jmp 1b
217
218
2195: //x is nan
220 addss %xmm0, %xmm0 //Quiet the NaN
221#if defined( __i386__ )
222 movss %xmm0, FRAME_SIZE(STACKP)
223 flds FRAME_SIZE(STACKP)
224#endif
225 ret
226
2276: // |x| is inf
228 movss %xmm0, %xmm1 // x
229 psrad $31, %xmm0 // x == -Inf ? -1U : 0
230 andnps %xmm1, %xmm0 // x == -Inf ? 0 : Inf
231#if defined( __i386__ )
232 movss %xmm0, FRAME_SIZE(STACKP)
233 flds FRAME_SIZE(STACKP)
234#endif
235 ret
236
2377: // x <= -150
238 movl $0x00800001, %eax
239 movd %eax, %xmm0
240 mulss %xmm0, %xmm0 //Create correctly rounded result, set inexact/underflow
241#if defined( __i386__ )
242 movss %xmm0, FRAME_SIZE(STACKP)
243 flds FRAME_SIZE(STACKP)
244#endif
245 ret
246
2478: // x >= 128.0
248 movl $0x7f7fffff, %eax // FLT_MAX
249 movd %eax, %xmm0 // FLT_MAX
250 addss %xmm0, %xmm0 // Inf, set overflow
251#if defined( __i386__ )
252 movss %xmm0, FRAME_SIZE(STACKP)
253 flds FRAME_SIZE(STACKP)
254#endif
255 ret
256
257//Uses round to nearest to deliver a reduced value between [ -0.5, 0.5 ]
258//This allows us to use a smaller polynomial
259//In addition, we dont bother to check for x is integer. This saves a few cycles in the general case.
260//Accurate to within 0.534 ulp
261ENTRY( exp2f$fenv_access_off )
262#if defined( __i386__ )
263 movl FRAME_SIZE(STACKP), %eax
264 movss FRAME_SIZE(STACKP), %xmm0
265#else
266 movd %xmm0, %eax
267#endif
268
269 movl %eax, %ecx // x
270 andl $0x7fffffff, %eax // |x|vl
271 movl $1023, %edx // double precision bias
272 subl $0x32800000, %eax // subtract 0x1.0p-26f
273 cmpl $0x107c0000, %eax // if( |x| > 126 || isnan(x) || |x| < 0x1.0p-26f )
274 ja 3f // goto 3
275
276 //The test above was a little too aggressive.
277 //values 126 < x < 128, and -150 < x < -126 will come back here:
278 //For x < -126, we change %edx to hold the exponent for 2**-126,
279 // and subtract -126 from x
280
2811: //extract fractional part and exit early if it is zero
282 andl $0x80000000, %ecx // signof(x)
283 orl $0x4b000000, %ecx // copysign( 0x1.0p23f, x )
284 movd %ecx, %xmm2 // copysign( 0x1.0p23f, x )
285 movaps %xmm0, %xmm1 // x
286 addss %xmm2, %xmm0 // x + copysign( 0x1.0p23f, x )
287 subss %xmm2, %xmm0 // rint(x)
288 cvttss2si %xmm0, %eax // rint(x)
289 addl %eax, %edx // rint(x) + bias
290 movd %edx, %xmm7 // result exponent >> 32
291 subss %xmm0, %xmm1 // fractional part
292 psllq $52, %xmm7 // shift result exponent into place
293 cvtss2sd %xmm1, %xmm1 // fractional part as a double
294
295//PIC
296#if defined( __i386__ )
297 calll exp2f_pic // set %ecx to point to local_addr
298exp2f_no_fenv_body:
299#endif
300
301 lea RELATIVE_ADDR_B( exp2f_nofenv_c), CX_P
302 movsd (3*8)(CX_P), %xmm3 // c3
303 movsd (5*8)(CX_P), %xmm5 // c5
304 movsd (6*8)(CX_P), %xmm6 // c6
305 movapd %xmm1, %xmm2 // x
306 mulsd %xmm1, %xmm1 // x*x
307 mulsd %xmm2, %xmm3 // c3*x
308 mulsd %xmm2, %xmm5 // c5*x
309 mulsd (1*8)(CX_P), %xmm2 // c1*x
310 mulsd %xmm1, %xmm6 // c6*xx
311 addsd (2*8)(CX_P), %xmm3 // c2+c3x
312 movapd %xmm1, %xmm4 // xx
313 mulsd %xmm1, %xmm1 // xx*xx
314 addsd (4*8)(CX_P), %xmm5 // c4+c5x
315 addsd (CX_P), %xmm2 // c0+c1x
316 mulsd %xmm4, %xmm3 // c2xx+c3xxx
317 addsd %xmm6, %xmm5 // c4+c5x+c6xx
318 mulsd %xmm1, %xmm5 // c4xxxx+c5xxxxx+c6xxxxxx
319 addsd %xmm2, %xmm3 // c0+c1x+c2xx+c3xxx
320 addsd %xmm5, %xmm3 // c0+c1x+c2xx+c3xxx+c4xxxx+c5xxxxx+c6xxxxxx
321
322 mulsd %xmm7, %xmm3 // scale by 2**i
323
324// convert to single precision and return
325 cvtsd2ss %xmm3, %xmm0
326#if defined( __i386__ )
327 movss %xmm0, FRAME_SIZE(STACKP)
328 flds FRAME_SIZE(STACKP)
329#endif
330 ret
331
332
3333: // |x| > 126 || isnan(x) || |x| < 0x1.0p-26f
334 jge 4f // if( |x| > 126 || isnan(x) ) goto 4
335
336 //fall through for common case of |x| < 0x1.0p-26f
337 movl $0x3f800000, %edx // 1.0f
338 movd %edx, %xmm1 // 1.0f
339 addss %xmm1, %xmm0 // round away from 1.0 as appropriate for current rounding mode
340#if defined( __i386__ )
341 movss %xmm0, FRAME_SIZE(STACKP)
342 flds FRAME_SIZE(STACKP)
343#endif
344 ret
345
346 // |x| > 126 || isnan(x)
3474: addl $0x32800000, %eax // restore |x|
348 cmpl $0x7f800000, %eax //
349 ja 5f // if isnan(x) goto 5
350 je 6f // if isinf(x) goto 6
351
352 cmpl $0xc3160000, %ecx // if( x <= -150 )
353 jae 7f // goto 7
354
355 cmpl $0x43000000, %ecx // if( x >= 128 )
356 jge 8f // goto 8
357
358 cmpl $0, %ecx // result is large finite
359 jge 1b // back to the main body
360
361 // result is between -150 < x < -126, denormal result
362 addl $-126, %edx // subtract 126 from exponent bias
363 movl $0x42fc0000, %eax // 126.0f
364 movd %eax, %xmm1 // 126.0f
365 addss %xmm1, %xmm0 // x += 126.0f
366 jmp 1b
367
368
3695: //x is nan
370 addss %xmm0, %xmm0 //Quiet the NaN
371#if defined( __i386__ )
372 movss %xmm0, FRAME_SIZE(STACKP)
373 flds FRAME_SIZE(STACKP)
374#endif
375 ret
376
3776: // |x| is inf
378 movss %xmm0, %xmm1 // x
379 psrad $31, %xmm0 // x == -Inf ? -1U : 0
380 andnps %xmm1, %xmm0 // x == -Inf ? 0 : Inf
381#if defined( __i386__ )
382 movss %xmm0, FRAME_SIZE(STACKP)
383 flds FRAME_SIZE(STACKP)
384#endif
385 ret
386
3877: // x <= -150
388 movl $0x00800001, %eax
389 movd %eax, %xmm0
390 mulss %xmm0, %xmm0 //Create correctly rounded result, set inexact/underflow
391#if defined( __i386__ )
392 movss %xmm0, FRAME_SIZE(STACKP)
393 flds FRAME_SIZE(STACKP)
394#endif
395 ret
396
3978: // x >= 128.0
398 movl $0x7f800000, %eax // Inf
399#if defined( __i386__ )
400 movl %eax, FRAME_SIZE(STACKP)
401 flds FRAME_SIZE(STACKP)
402#else
403 movd %eax, %xmm0
404#endif
405 ret
406