this repo has no description
1// Note that while this file has a .s extension it is not intended to be assmbled directly
2// but rather included with appropriate #defines in place.
3
4#define _log log@PLT
5#define __logup _logup@PLT
6
7// double log1p( double x )
8ENTRY( log1p )
9// Cases
10// Main case: -0.25 <= x && x < 0.5
11// [0000000000000000, 3fe0000000000000)
12// [8000000000000000, bfd0000000000000]
13// Log1p_core: compute k and jump into log with f = x, n = 0.
14// Early out:
15// x is small: |x| in [0, 3f10000000000000)
16
17// x < -0.25 || x >= 0.5:
18// Call log(x+1) if x == (x+1)-1, otherwise
19// Call logup(x+1) = log(x+1+ulp(x)/2)
20// Note, the Special operands could be handled by the log(x+1) call:
21// x is NaN or inf: fff*, 7ff*
22// x+1 <= 0: [bff0000000000000, fff0000000000000)
23
24#if defined( __x86_64__ )
25 movd %xmm0, %rax
26 movd %xmm0, %rdx
27 andq REL_ADDR(mzero), %rdx // Sign bit
28 andq REL_ADDR(notmzero), %rax // absx
29 shrq $11, %rdx // sign >> 11
30 addq %rdx, %rax // t = x < 0?-2*x:x
31 movsd REL_ADDR(mone), %xmm2 // -1.0
32 movapd %xmm0, %xmm1 // x
33 cmpq REL_ADDR(half), %rax // t < 0.5
34#else
35 movl 4+FRAME_SIZE(STACKP), %eax
36 movl 4+FRAME_SIZE(STACKP), %edx
37 andl $0x80000000, %edx // Sign bit
38 andl $0x7fffffff, %eax // absx
39 call 0f // PIC code
400: pop %ecx // PIC base
41 movsd REL_ADDR(mone), %xmm2 // -1.0
42 shrl $11, %edx // sign >> 11
43 addl %edx, %eax // t = x < 0?-2*x:x
44 movsd FRAME_SIZE(STACKP), %xmm0
45 movsd FRAME_SIZE(STACKP), %xmm1
46 cmpl $0x3fe00000, %eax // t < 0.5
47#endif
48 jb 7f // Jump for main case: -0.25 < x && x < 0.5
49 // x < -0.25, or x >= 0.5, or x is NoN or Inf
50 // This does not work for x in [0.5-, 1.0) since we lose a bit when adding one.
51 //TBD: It might be faster to do this in parallel with the frexp in log
52 // and compute a correction that is always added.
53
54 ucomisd %xmm2, %xmm0 // x < -1.0 ? Since log(1+x) would be invalid
55 jb 5f
56 movapd %xmm2, %xmm3 // -1.0
57 subsd %xmm2, %xmm0 // xp = x + 1.0 = x - -1.0
58 addsd %xmm0, %xmm2 // xpm = xp - 1.0 = xp + -1.0
59 ucomisd %xmm1, %xmm2 // x == xpm?
60#if defined(__i386__)
61 movsd %xmm0, FRAME_SIZE(STACKP) //overwrite x with xp
62#endif
63 je _log // log(x+1.0)
64 andpd REL_ADDR(log1p_not_ulp_mask), %xmm1 // xm
65 movsd REL_ADDR(one), %xmm0 // 1.0
66 addsd %xmm1, %xmm0 // xp = 1.0+xm
67#if defined(__i386__)
68 movsd %xmm0, FRAME_SIZE(STACKP)
69#endif
70 jmp __logup // log((x-ulp)+1+ulp)
71
727:
73 // %xmm0 = x
74 // %xmm1 = x
75
76////////////////////////////////////////////////////////////////
77// log1p core early out for small values
78 movsd REL_ADDR(mzero), %xmm2 // 0x8000000000000000
79 andnpd %xmm0, %xmm2 // absx = fabs(x) // TBD: already have computed absx in 64-bit mode???
80 comisd REL_ADDR(_1pm14), %xmm2 // (absx < 0x1p-14)
81 jbe 3f
82
83//get_k: compute the table lookup index, k.
84 SUBP $LOCAL_STACK_SIZE, STACKP
85
86////////////////////////////////////////////////////////////////
87 // f = x
88 addsd REL_ADDR(one), %xmm0 // fp1 = f+1
89 movsd REL_ADDR(threehalves), %xmm2 // 1.5
90 cmpeqsd %xmm0, %xmm2 // (fp1 == 1.5) ? -1 : 0
91 psrlq $(52-8), %xmm0 // top 8 bits of significand encoding of adjusted x
92 paddq %xmm2, %xmm0 // (int)(fp1>>(52-8)) - (fp1 == 1.5);
93 movd %xmm0, AX_P
94 // grab before adjusting for which half we are in
95 and $0x000000ff, AX_P //k = (int)(head>>(52-8)) - (fp1 == 1.5);
96
97 // Now:
98 // k = %eax
99 // exp = 0
100 // fp1 = (fp1 > 1.5) ? (0.5 * fp1) : (fp1);
101 // %xmm1 = f = fp1 - 1 = x
102
103 movsd %xmm1, (STACKP)
104 shl $5, AX_P
105
106// The lookup table is in a funny format since it has 2 long double and a single.
107 // {10-byte va ; 2-byte pad ; 4-byte single a ; 10-byte lg1pa ; 6-byte pad}
108#if defined( __x86_64__ )
109 // in 64-bit need lea to get address
110 lea REL_ADDR(LOOKUP), %rdx
111 fldt (%rdx,AX_P,1) // [va]
112 fldl (STACKP) // [va ; f]
113 //halfulp stuff goes here...
114 fsubs 12(%rdx,AX_P,1) // [va ; h = f - a] // should we do f-a in x87 or SSE??? In x87 it is easy to write the load and convert from single
115 fmulp %st, %st(1) // [c = h * va]
116 fstl (STACKP) // [c] -- with double version stored on the stack
117
118 fldt 16(%rdx,AX_P,1) // [c ; lg1pa ]
119#else // i386
120 fldt (LOOKUP-0b)(%ecx,%eax,1) // [va]
121 fldl (STACKP) // [va ; f]
122
123 fsubs (LOOKUP-0b+12)(%ecx,%eax,1) // [va ; h = f - a]
124 fmulp %st, %st(1) // [c = h * va]
125 fstl (STACKP) // [c] -- with double version stored on the stack
126 fldt (LOOKUP-0b+16)(%ecx,%eax,1) // [c ; lg1pa ]
127#endif // i386
128
129 // base e specific stuff here...
130 // [c ; lghi = 0 + lg1pa] since nd = 0;
131 fldt REL_ADDR(ln2l) // [c ; lghi = nd + lg1pa ; ln2l]
132 fmulp %st, %st(1) // [ c ; logbxhi = ln2l*(nd+lg1pa)]
133
134#ifdef __SSE3__
135 movddup (STACKP), %xmm0 // get cd, double version of c off stack in both registers
136#else
137 movsd (STACKP), %xmm0
138 movhpd (STACKP), %xmm0
139#endif
140 movapd %xmm0, %xmm1 // [cd,cd]
141 addpd REL_ADDR(a01), %xmm0 // [a0,a1] + [cd,cd]
142 mulpd %xmm1, %xmm0 // [a0+cd,a1+cd] * [cd,cd]
143 addpd REL_ADDR(b01), %xmm0 // vpoly = [b0,b1] + [(a0+cd)*cd,(a1+cd)*cd]
144
145//long double logbec is log_b(e) * c
146//double c5b is log_b(e) * c5
147 fld %st(1) // [c ; lghi ; c]
148 // in base e logbec = c; c5b = c5_e;
149 // [c ; lghi ; logbec = lnel * c = c]
150#define c5b c5_e
151 fxch %st(2) // [logbec ; lghi ; c]
152 fldt REL_ADDR(c0) // [logbec ; lghi ; c ; c0]
153 fmulp %st, %st(1) // [logbec ; lghi ; c*c0]
154 fld1 // [logbec ; lghi ; c*c0 ; 1.0]
155 faddp %st, %st(1) // [logbec ; lghi ; 1.0 + c*c0]
156 fmulp %st, %st(2) // [linear = logbec*(1.0 + c*c0) ; lghi]
157 fxch %st(1) // [lghi ; linear]
158
159 movapd %xmm1, %xmm2 // [cd, cd]
160 movhpd REL_ADDR(c5b), %xmm1 // [c5b, cd]
161 mulpd %xmm2, %xmm1 // vccc5 = [c5b*cd, cd*cd]
162 mulpd %xmm1, %xmm0 // vp = vccc5 * vpoly = [c5b*cd*(b0+(a0+cd)*cd), cd*cd*(b1+(a1+cd)*cd)]
163 //Need to do a horizontal multiply
164 movapd %xmm0, %xmm1 // [ *, vplo]
165 shufpd $3, %xmm0, %xmm0 // [ *, vphi]
166 mulsd %xmm1, %xmm0 // [ *, cccp = c5b*cd*(b0+(a0+cd)*cd) * cd*cd*(b1+(a1+cd)*cd)]
167 movsd %xmm0, (STACKP) // Store so we can load into x87
168 faddl (STACKP) // [lghi ; logbxlo = linear + cccp]
169 faddp %st, %st(1) // [longResult = lghi + logbxlo]
170 // Convert to double
171 fstpl (STACKP) // result [<empty stack>]
172#if defined( __x86_64__ )
173 movq (STACKP), %xmm0 // result
174#else
175 fldl (STACKP) // result
176#endif
177#undef c5b
178
179 // Clean up stack and return
180 ADDP $LOCAL_STACK_SIZE, STACKP
181 ret
182
183////////////////////////////////////////////////////////////////
184// Here we know that |x| <= 0x1p-14
185// %xmm0 x
186// %xmm1 x
187// %xmm2 absx
188
189// TODO could have a better poly for less error, or wider interval
1903:
191 comisd REL_ADDR(_1pm54), %xmm2 // (absx <= 0x1p-54)
192 jbe 4f
193 // So 0x1p-54 < |x| < 0x1p-14
194 // We compute a small poly:
195 // p = ( (-0.25 * x + 0x0.55555555555555p0) * x + -0.5) * (x*x) + x
196 mulsd REL_ADDR(mquarter), %xmm1 // -0.25 * x
197 mulsd %xmm2, %xmm2 // x*x = absx * absx
198 addsd REL_ADDR(third), %xmm1 // (-0.25 * x + 0x0.55555555555555p0)
199 mulsd %xmm0, %xmm1 // (-0.25 * x + 0x0.55555555555555p0) * x
200 subsd REL_ADDR(half), %xmm1 // ( (-0.25 * x + 0x0.55555555555555p0) * x + -0.5)
201 mulsd %xmm2, %xmm1 // p = ( (-0.25 * x + 0x0.55555555555555p0) * x + -0.5) * (x*x)
202 addsd %xmm1, %xmm0 // return p
203#if defined( __i386__ ) // Light cleanup required
204 movsd %xmm0, FRAME_SIZE(STACKP)
205 fldl FRAME_SIZE(STACKP)
206#endif
207 ret
208
209
2104: // if(absx < 0x1p-54) ...
211 // Note: this handles denormals, 0, and -0 also
212#if FENVON
213 addsd REL_ADDR(one), %xmm1 // x + 1.0 to raise inexact since x is small (except when x == 0)
214#endif
215 // retrun x
216#if defined( __i386__ ) // Light cleanup required
217 movsd %xmm0, FRAME_SIZE(STACKP)
218 fldl FRAME_SIZE(STACKP)
219#endif
220 ret
221
2225: // x < -1.0
223 sqrtsd %xmm0, %xmm0
224#if defined( __i386__ ) // Light cleanup required
225 movsd %xmm0, FRAME_SIZE(STACKP)
226 fldl FRAME_SIZE(STACKP)
227#endif
228 ret