this repo has no description
1// Note that while this file has a .s extension it is not intended to be assmbled directly
2// but rather included with appropriate #defines in place.
3
4// This begins the base-universal logarithm code.
5#ifndef BASE2
6#define BASE2 0
7#endif
8#ifndef BASEE
9#define BASEE 0
10#endif
11#ifndef BASE10
12#define BASE10 0
13#endif
14#ifndef LOGUP
15#define LOGUP 0
16#endif
17
18// Assume we need to respect the flags unless directed otherwise
19#ifndef FENVON
20#define FENVON 1
21#endif
22
23#if BASE2
24// double log2asm( double x )
25ENTRY( log2 )
26#elif BASEE
27#if LOGUP
28// private double logup( double x ) // log(x+ULP(x)/2)
29ENTRY( _logup )
30#else
31// double log( double x )
32ENTRY( log )
33#endif
34#elif BASE10
35// double log10( double x )
36ENTRY( log10 )
37#else
38#error "One of BASE2, BASEE, BASE10 needs to be set to 1"
39#endif
40 // Early out if x is NaN, Inf, 0, or <0
41 // I.e., if the encoding of x is not in (0, 7ff0000000000000)
42#if defined( __x86_64__ )
43 movd %xmm0, %rax
44 movd %xmm0, %rdx
45 subq REL_ADDR(small_cut), %rax
46 cmpq REL_ADDR(large_cut), %rax
47#else
48 movl 4+FRAME_SIZE(STACKP), %eax
49 movl 4+FRAME_SIZE(STACKP), %edx
50 movsd FRAME_SIZE(STACKP), %xmm0
51 call 0f // PIC code
520: pop %ecx // PIC base
53 sub $0x00100000, %eax
54 cmp $0x7fe00000, %eax
55#endif
56 jae 3f // Jump if NaN, Inf, Denorm, 0, or negative
57
58 SUBP $LOCAL_STACK_SIZE, STACKP
59
60#if LOGUP
61 // Need to adjust h with a small correction for log1p(x) when 1+x is not representable
62 //u = (x < 1.0) ? -0x1p-53 : 0x1p-54 = ...? 0xbca0000000000000 : 0x3c90000000000000;
63 // Add in the half ulp now that we know we have room.
64 //h = (f - a) + u; // h = R(f-a)+ulp(x)
65 movapd %xmm0, %xmm6 // x
66 cmpltsd REL_ADDR(one), %xmm6 // (x < 1.0)
67 movsd REL_ADDR(logup_ulp_mask), %xmm5 // xor(-0x1p-53, 0x1p-54) = 0x8030000000000000
68 andpd %xmm5, %xmm6 // m = (x < 1.0) ? 0x8030000000000000 : 0
69 movsd REL_ADDR(_1pm54), %xmm5 // 0x1p-54
70 xorpd %xmm5, %xmm6 // m ^ 0x1p-54 = (x < 1.0) ? 0x8030000000000000 ^ 0x1p-54 : 0x1p-54
71 movsd %xmm6, 24(STACKP) // (x < 1.0) ? -0x1p-53 : 0x1p-54
72#endif
73
74 // Break x down into exp, f with -0.25 <= f < 0.5 and x = 2^exp(1+f)
75#if defined( __x86_64__ )
76 movd %xmm0, %r8 // u.u = encoding of x
77 shrq $(52), %rdx // exponent >>= 52
78 andq REL_ADDR(frexp_mant_mask), %r8 // mantissa = u.u & 0x800fffffffffffffULL;
79
80 movd %xmm0, %rcx // encoding of xadjust
81
82 // or in -1 + bias to the new exponent for the mantissa
83
84 addq $-1023, %rdx // exponent + bias (= -1023 for normal data)
851: //This is where denormal inputs re-enter with bias compensated for denormal exponent and appropriate mantissa
86 //Note: for denormal xmm0 and rcx inputs are no longer the original x
87
88 shrq $51, %rcx // (encoding) >> 51
89 andq $0x1, %rcx // upperhalf = (encoding & 0x00080000ULL) >> 51 = (upperhalf != 0ULL);
90 addq %rcx, %rdx // exp = bias + (int) ( exponent >> 52 ) + (upperhalf != 0ULL);
91 movl %edx, 8(STACKP) // Save exp for later. Free dx and cx.
92
93 //Note: for denormal inputs this is no longer the original x
94 movd %xmm0, %rax // u.u
95 // grab before adjusting for which half we are in
96 andq REL_ADDR(frexp_head_mask), %rax // head = u.u & 0x000ff00000000000ULL;
97 shrq $((52-8)-5), %rax // k<<5 = (int)(head>>(52-8))<<5;
98 movd %xmm0, %rcx // u.u = encoding of xadjust
99 andq REL_ADDR(frexp_half_mask), %rcx // upperhalf = u.u & 0x0008000000000000ULL;
100 shlq $1, %rcx // (upperhalf << 1)
101 orq REL_ADDR(one), %r8 // (mantissa | 0x3ff0000000000000ULL)
102 // Divide by two if significand >= 3/2
103 xorq %rcx, %r8 // mantissa = (mantissa | 0x3ff0000000000000ULL) ^ (upperhalf << 1);
104 movd %r8, %xmm1 // fp1
105 // Now:
106 // k<<5 = %eax
107 // exp = %rdx
108 // fp1 = %xmm1 = (fp1 > 1.5) ? (0.5 * fp1 - 1.0) : (fp1 - 1.0);
109#else
110 movapd %xmm0, %xmm1 // u.u = encoding of x
111 andpd REL_ADDR(frexp_mant_mask), %xmm1 // mantissa = u.u & 0x800fffffffffffffULL;
112
113 movapd %xmm0, %xmm2 // u.u = encoding of adjusted x
114 psrlq $32, %xmm0 // top half of encoding of adjusted x
115
116 // or in -1 + bias to the new exponent for the mantissa
117 andpd REL_ADDR(frexp_half_mask), %xmm2 // upperhalf = u.u & 0x0008000000000000ULL;
118 shrl $(52-32), %edx // exponent >>= 52-32
119
120 addl $-1023, %edx // exponent + bias; bias = -1023 for normal data
121
1221: //This is where denormal inputs re-enter with bias compensated for denormal exponent and appropriate mantissa
123
124 movapd %xmm2, %xmm7 // upperhalf
125 psrlq $51, %xmm7 // (upperhalf != 0ULL) = upperhalf >> 51 == 1 or 0
126 movd %xmm7, 8(STACKP) // (int) ( exponent >> 52 )
127 add 8(STACKP), %edx // exp = bias + (int) ( exponent >> 52 ) + (upperhalf != 0ULL);
128 movl %edx, 8(STACKP) // exp
129 movd %xmm0, %eax // Encoding of top half of x
130 // grab before adjusting for which half we are in
131 andl $0x000ff000, %eax // head = u.u & 0x000ff00000000000ULL;
132 shrl $(12-5), %eax // k<<5 = (int)(head>>(52-8-32))<<5;
133 psllq $1, %xmm2 // (upperhalf << 1)
134 orpd REL_ADDR(xone), %xmm1 // (mantissa | 0x3ff0000000000000ULL)
135 // Divide by two if significand >= 3/2
136 xorpd %xmm2, %xmm1 // mantissa = (mantissa | 0x3ff0000000000000ULL) ^ (upperhalf << 1);
137
138 // Now:
139 // k<<5 = %eax
140 // exp = 8(STACKP)
141 // fp1 = %xmm1 = (fp1 > 1.5) ? (0.5 * fp1) : (fp1);
142#endif
143
144// We look up a+1 rather than a, so we can work with fp1 - ap1 rather than f - a
145
146#if BASE2
147 // if fp1 == 1.0 return nd // base 2 only
148 movsd REL_ADDR(one), %xmm2 // 1.0
149 ucomisd %xmm2, %xmm1 // f == 1.0 set cc
150 je 7f
151#endif
152
153 movsd %xmm1, (STACKP)
154
155// The lookup table is in a funny format since it has 2 long double and a single.
156 // {10-byte va ; 2-byte pad ; 4-byte single a ; 10-byte lg1pa ; 2-byte pad, 4-byte a+1}
157#if defined( __x86_64__ )
158 // in 64-bit need lea to get address
159 lea REL_ADDR(LOOKUP), %rdx
160 fldt (%rdx,AX_P,1) // [va]
161 fldl (STACKP) // [va ; f+1]
162
163 fsubs 28(%rdx,AX_P,1) // [va ; h = f+1 - a+1]
164#if LOGUP
165 //halfulp stuff goes here...
166 //h = (f - a) + u; // h = R(f-a)+ulp(x)
167 faddl 24(STACKP) // [va ; h = (f - a) + u]
168#endif //LOGUP
169 fmulp %st, %st(1) // [c = h * va]
170 fstl (STACKP) // [c] -- with double version stored on the stack
171
172 fldt 16(%rdx,AX_P,1) // [c ; lg1pa ]
173#if BASE2
174 // logbxhi = nd+lg1pa; // base 2
175 fiaddl 8(STACKP) // [c ; lghi = nd + lg1pa]
176#elif BASEE
177 // base e stuff goes here...
178 fiaddl 8(STACKP) // [c ; lghi = nd + lg1pa]
179 fldt REL_ADDR(ln2l) // [c ; lghi = nd + lg1pa ; ln2l]
180 fmulp %st, %st(1) // [ c ; logbxhi = ln2l*(nd+lg1pa)]
181#elif BASE10
182 // base 10 stuff goes here...
183 // free(%rax)
184 // free(%rdx)
185#if FENVON
186 movd %xmm0, %rdx // Original x encoding
187 shrq $(53-4), %rdx // 16*k = k << 4. k = key >> 53 = the top 6 of bottom 7 bits of exponent of x
188 andq $0x3f, %rdx // key = _mm_and_pd(vx,log10_key_mask);
189 movd %xmm0, %rax // Original x encoding
190 lea REL_ADDR(isPowerOf10), %rcx // Get base of table
191 cmp (%rcx,%rdx,1), %rax // if(isPowerOf10[k][0] == x) {return isPowerOf10[k][1];}
192 je 8f
193#endif //FENVON base 10 64-bit
194 fiaddl 8(STACKP) // [c ; lghi = nd + lg1pa]
195 fldt REL_ADDR(log102l) // [c ; lghi = nd + lg1pa ; log102l]
196 fmulp %st, %st(1) // [ c ; logbxhi = log102l*(nd+lg1pa)]
197#endif
198
199#else // i386
200 fldt (LOOKUP-0b)(%ecx,%eax,1) // [va]
201 fldl (STACKP) // [va ; f+1]
202 //halfulp stuff goes here...
203 fsubs (LOOKUP-0b+28)(%ecx,%eax,1) // [va ; h = f+1 - a+1]
204#if LOGUP
205 //h = (f - a) + u; // h = R(f-a)+ulp(x)
206 faddl 24(STACKP) // [va ; h = (f - a) + u]
207#endif //LOGUP
208 fmulp %st, %st(1) // [c = h * va]
209 fstl (STACKP) // [c] -- with double version stored on the stack
210 fldt (LOOKUP-0b+16)(%ecx,%eax,1) // [c ; lg1pa ]
211
212#if BASE2
213 // logbxhi = nd+lg1pa; // base 2
214 fiaddl 8(STACKP) // [c ; lghi = nd + lg1pa]
215#elif BASEE
216 // base e stuff goes here...
217 fiaddl 8(STACKP) // [c ; lghi = nd + lg1pa]
218 fldt REL_ADDR(ln2l) // [c ; lghi = nd + lg1pa ; ln2l]
219 fmulp %st, %st(1) // [ c ; logbxhi = ln2l*(nd+lg1pa)]
220#elif BASE10
221 // base 10 stuff goes here...
222 // free(%edx)
223#if FENVON
224 movl (4+LOCAL_STACK_SIZE)+FRAME_SIZE(STACKP), %edx
225 andl $0x07e00000, %edx // key = _mm_and_pd(vx,log10_key_mask);
226 shrl $((53-32)-4), %edx // 16*k = k << 4. k = key >> 53 = the top 6 of bottom 7 bits of exponent of x
227 movsd (LOCAL_STACK_SIZE)+FRAME_SIZE(STACKP), %xmm5
228 comisd (isPowerOf10-0b)(%ecx,%edx,1), %xmm5 // if(isPowerOf10[k][0] == x) {return isPowerOf10[k][1];}
229 je 8f
230#endif //FENVON base 10 i386
231 fiaddl 8(STACKP) // [c ; lghi = nd + lg1pa]
232 fldt REL_ADDR(log102l) // [c ; lghi = nd + lg1pa ; log102l]
233 fmulp %st, %st(1) // [ c ; logbxhi = log102l*(nd+lg1pa)]
234#endif //base 10 i386
235
236#endif // i386
237
238
239#ifdef __SSE3__
240 movddup (STACKP), %xmm0 // get cd, double version of c off stack in both registers
241#else
242 movsd (STACKP), %xmm0
243 movhpd (STACKP), %xmm0
244#endif
245 movapd %xmm0, %xmm1 // [cd,cd]
246 addpd REL_ADDR(a01), %xmm0 // [a0,a1] + [cd,cd]
247 mulpd %xmm1, %xmm0 // [a0+cd,a1+cd] * [cd,cd]
248 addpd REL_ADDR(b01), %xmm0 // vpoly = [b0,b1] + [(a0+cd)*cd,(a1+cd)*cd]
249
250//long double logbec is log_b(e) * c
251//double c5b is log_b(e) * c5
252 fld %st(1) // [c ; lghi ; c]
253#if BASE2
254 // in base 2 logbec = lgel * c; c5b = c5_2;
255 fldt REL_ADDR(lgel) // [c ; lghi ; c ; lgel]
256 fmulp %st, %st(1) // [c ; lghi ; logbec = lgel * c]
257#define c5b c5_2
258#elif BASEE
259 // in base e logbec = c; c5b = c5_e;
260 // [c ; lghi ; logbec = lnel * c = c]
261#define c5b c5_e
262#elif BASE10
263 // in base 10 logbec = log10el * c; c5b = c5_10;
264 fldt REL_ADDR(log10el) // [c ; lghi ; log10el ; c]
265 fmulp %st, %st(1) // [c ; lghi ; logbec = log10el * c]
266#define c5b c5_10
267#endif
268 fxch %st(2) // [logbec ; lghi ; c]
269 fldt REL_ADDR(c0) // [logbec ; lghi ; c ; c0]
270 fmulp %st, %st(1) // [logbec ; lghi ; c*c0]
271 fld1 // [logbec ; lghi ; c*c0 ; 1.0]
272 faddp %st, %st(1) // [logbec ; lghi ; 1.0 + c*c0]
273 fmulp %st, %st(2) // [linear = logbec*(1.0 + c*c0) ; lghi]
274 fxch %st(1) // [lghi ; linear]
275
276 movapd %xmm1, %xmm2 // [cd, cd]
277 movhpd REL_ADDR(c5b), %xmm1 // [c5b, cd]
278 mulpd %xmm2, %xmm1 // vccc5 = [c5b*cd, cd*cd]
279 mulpd %xmm1, %xmm0 // vp = vccc5 * vpoly = [c5b*cd*(b0+(a0+cd)*cd), cd*cd*(b1+(a1+cd)*cd)]
280 //Do a horizontal multiply.
281 movapd %xmm0, %xmm1 // [ *, vplo]
282 shufpd $3, %xmm0, %xmm0 // [ *, vphi]
283 mulsd %xmm1, %xmm0 // [ *, cccp = c5b*cd*(b0+(a0+cd)*cd) * cd*cd*(b1+(a1+cd)*cd)]
284 movsd %xmm0, (STACKP) // Store so we can load into x87
285 faddl (STACKP) // [lghi ; logbxlo = linear + cccp]
286 faddp %st, %st(1) // [longResult = lghi + logbxlo]
287 // Round to double
288 fstpl (STACKP) // result [<empty stack>]
289#if defined( __x86_64__ )
290 movq (STACKP), %xmm0 // result
291#else
292 fldl (STACKP)
293#endif
294#undef c5b
295
2962: // Clean up stack and return
297 ADDP $LOCAL_STACK_SIZE, STACKP
298 ret
299
300////////////////////////////////////////////////////////////////
301
302
3033: // Special operand handling: NaN, Inf, Denorm, 0, or negative
304 // When we get here AX has as encoding(x) - 0x00100000[00000000 in 64-bit]
305 // +denormal -> denormal handling
306 // +zero -> -inf #ZERO_DIVIDE
307 // -stuff -> sqrt(x) (pass quiet NaNs or raise invalid)
308 // -zero -> -inf #ZERO_DIVIDE
309 // +NaN -> x+x
310 // +inf -> x+x
311 xorpd %xmm2, %xmm2 // 0.0
312 ucomisd %xmm2, %xmm0 // f == 0 set cc
313 jp 9f // NaN: compares unequal and signals invalid only for signaling NaN
314 je 4f // Zero: x == 0.0 -> -1 / zero
315
316 // We now need to distinguish between Denormal, Negative, and NaN/Inf.
317 // Adding the small_cut back in sets the EFLAGS for conditional branches
318#if defined( __x86_64__ )
319 addq REL_ADDR(small_cut), %rax
320#else
321 addl $0x00100000, %eax
322#endif
323 jb 5f // Denormal (i.e., Inf > x > 0 and we have ruled out other cases)
324 js 6f // Negative sign (may be "-NaN", -Inf, -Normal, or -Denormal)
325
326 // If we fall through to here, we have "+NaN" or +Inf
3279:
328#if defined( __i386__ ) // Light cleanup required
329 movsd %xmm0, FRAME_SIZE(STACKP)
330 fldl FRAME_SIZE(STACKP)
331#endif
332 ret
333
3344: // Zero: x == 0.0 -> -1 / zero
335 //We need to return -Inf for zero and set div/0 flag
336 xorpd %xmm1, %xmm1
337 pcmpeqb %xmm0, %xmm0 // -1U
338 cvtdq2pd %xmm0, %xmm0 // -1.0f
339 divsd %xmm1, %xmm0 // -1.0f / 0 = -Inf + div/0 flag
340#if defined( __i386__ ) // Light cleanup required
341 movsd %xmm0, FRAME_SIZE(STACKP)
342 fldl FRAME_SIZE(STACKP)
343#endif
344 ret
345
3465: // Denormal
347//Denormal inputs are frexped with extra care and then we branch back into the mainline code.
348
349 SUBP $LOCAL_STACK_SIZE, STACKP
350 // Break x down into exp, f with -0.25 <= f < 0.5 and x = 2^exp(1+f)
351 // knowing that we are starting with denormal x
352#if defined( __x86_64__ )
353 movd %xmm0, %r8 // u.u = encoding of x
354 andq REL_ADDR(frexp_mant_mask), %r8 // mantissa = u.u & 0x800fffffffffffffULL;
355 // or in bias to the new exponent for the mantissa
356 orq REL_ADDR(one), %r8 // mantissa |= 0x3ff0000000000000ULL;
357 movd %r8, %xmm0 // mantissa
358 subsd REL_ADDR(one), %xmm0
359 movd %xmm0, %rdx // encoding rescaled input
360 movd %xmm0, %r8 // encoding rescaled input
361 shrq $(52), %rdx // exponent >>= 52
362 andq REL_ADDR(frexp_mant_mask), %r8 // mantissa = u.u & 0x800fffffffffffffULL;
363 movd %xmm0, %rcx // encoding of xadjust
364 // or in -1 + bias to the new exponent for the mantissa
365 addq $(-1023-1022), %rdx // exponent + bias (= -1023-1022 for subnormal data)
366#else
367 movapd %xmm0, %xmm1 // u.u = encoding of x
368 // or in bias to the new exponent for the mantissa
369 movq REL_ADDR(one), %xmm2 // 1.0
370 orpd %xmm2, %xmm0 // mantissa = _mm_or_pd(x, xOne);
371 subsd %xmm2, %xmm0 // u = mantissa - xOne;
372 movapd %xmm0, %xmm1 // u.u = encoding of adjusted x
373 movapd %xmm0, %xmm7 // u.u = encoding of adjusted x (but we only want the high half)
374 psrlq $32, %xmm7 // top half of encoding of adjusted x
375 movd %xmm7, %edx // top half of encoding of adjusted x
376 andpd REL_ADDR(frexp_mant_mask), %xmm1 // mantissa = u.u & 0x800fffffffffffffULL;
377
378 movapd %xmm0, %xmm2 // u.u = encoding of adjusted x
379 psrlq $32, %xmm0 // top half of encoding of adjusted x
380
381 // or in -1 + bias to the new exponent for the mantissa
382 andpd REL_ADDR(frexp_half_mask), %xmm2 // upperhalf = u.u & 0x0008000000000000ULL;
383 shrl $(52-32), %edx // exponent >>= 52-32
384
385 addl $(-1023-1022), %edx // exponent + bias; bias = -1023-1022 for subnormal data
386
387#endif
388 jmp 1b // Re-enter main line with bias compensated for denormal exponent and appropriate mantissa
389
390
3916: // Negative sign (may be "-NaN", -Inf, -Normal, or -Denormal)
392 // In any case sqrt(x) will do the right thing.
393 // -QNaN will pass through
394 // -SNaN will raise invalid and be quieted
395 // -N will raise invalid and return a quiet NaN
396 sqrtsd %xmm0, %xmm0
397#if defined( __i386__ ) // Light cleanup required
398 movsd %xmm0, FRAME_SIZE(STACKP)
399 fldl FRAME_SIZE(STACKP)
400#endif
401 ret
402
403#if BASE2
4047: //early out for power of 2
405 ADDP $LOCAL_STACK_SIZE, STACKP
406#if defined( __i386__ )
407 movl %edx, FRAME_SIZE( STACKP )
408 fildl FRAME_SIZE(STACKP)
409#else
410 cvtsi2sd %rdx, %xmm0
411#endif
412 ret
413#endif
414
415// This base10 branch only needed if FENVON
416#if FENVON
4178: // If(isPowerOf10[k][0] == x) {return isPowerOf10[k][1];}
418 // Come in with two things on the x87 stack // [c ; lg1pa ]
419 fstp %st // [c ]
420 fstp %st // [<empty>]
421#if defined( __i386__ )
422 movsd (isPowerOf10-0b+8)(%ecx,%edx,1), %xmm0 // isPowerOf10[k][1]
423#else
424 movsd 8(%rcx,%rdx,1), %xmm0 // isPowerOf10[k][1]
425#endif
426 // Clean up stack and return
427 ADDP $LOCAL_STACK_SIZE, STACKP
428
429#if defined( __i386__ ) // Light cleanup required
430 movsd %xmm0, FRAME_SIZE(STACKP)
431 fldl FRAME_SIZE(STACKP)
432#endif
433 ret
434#endif
435
436////////////////////////////////////////////////////////////////