src/libm/Source/Intel/log_universal.h at fixPythonPipStalling

overby.me / darling-nix
fork atom
this repo has no description
fork atom
darling-nix / src / libm / Source / Intel / log_universal.h
at fixPythonPipStalling 436 lines 16 kB view raw
wrap content
Lubos Dolezel Restructured source tree to prepare for merge with the "darling" repo 10y ago
f228ae16
  1// Note that while this file has a .s extension it is not intended to be assmbled directly
  2// but rather included with appropriate #defines in place.
  3	
  4// This begins the base-universal logarithm code. 
  5#ifndef BASE2	
  6#define BASE2			0
  7#endif
  8#ifndef BASEE	
  9#define BASEE			0	
 10#endif
 11#ifndef BASE10
 12#define BASE10			0
 13#endif
 14#ifndef LOGUP
 15#define LOGUP			0
 16#endif
 17
 18// Assume we need to respect the flags unless directed otherwise	
 19#ifndef FENVON
 20#define FENVON 			1
 21#endif
 22	
 23#if BASE2
 24// double log2asm( double x )
 25ENTRY( log2 )
 26#elif BASEE
 27#if LOGUP	
 28// private double logup( double x ) // log(x+ULP(x)/2)
 29ENTRY( _logup )
 30#else	
 31// double log( double x )
 32ENTRY( log )
 33#endif	
 34#elif BASE10
 35// double log10( double x )
 36ENTRY( log10 )
 37#else
 38#error "One of BASE2, BASEE, BASE10 needs to be set to 1"
 39#endif
 40	// Early out if x is NaN, Inf, 0, or <0
 41	// I.e., if the encoding of x is not in (0, 7ff0000000000000)
 42#if defined( __x86_64__ )
 43	movd		%xmm0,				%rax
 44	movd		%xmm0,				%rdx
 45	subq		REL_ADDR(small_cut),		%rax
 46	cmpq		REL_ADDR(large_cut),		%rax
 47#else
 48	movl		4+FRAME_SIZE(STACKP),		%eax
 49	movl		4+FRAME_SIZE(STACKP),		%edx
 50	movsd		FRAME_SIZE(STACKP),		%xmm0
 51	call		0f				// PIC code
 520:	pop		%ecx				// PIC base
 53	sub		$0x00100000,			%eax
 54	cmp		$0x7fe00000,			%eax
 55#endif
 56	jae		3f	// Jump if NaN, Inf, Denorm, 0, or negative
 57
 58	SUBP		$LOCAL_STACK_SIZE, 		STACKP
 59
 60#if LOGUP	
 61	// Need to adjust h with a small correction for log1p(x) when 1+x is not representable
 62        //u = (x < 1.0) ? -0x1p-53 : 0x1p-54 = ...? 0xbca0000000000000 : 0x3c90000000000000; 
 63            // Add in the half ulp now that we know we have room.
 64        //h = (f - a) + u; // h = R(f-a)+ulp(x)
 65	movapd		%xmm0,			%xmm6		// x
 66	cmpltsd		REL_ADDR(one),		%xmm6		// (x < 1.0)
 67	movsd		REL_ADDR(logup_ulp_mask),	%xmm5	// xor(-0x1p-53, 0x1p-54) = 0x8030000000000000
 68	andpd		%xmm5,			%xmm6		// m = (x < 1.0) ? 0x8030000000000000 : 0
 69	movsd		REL_ADDR(_1pm54),	%xmm5		// 0x1p-54
 70	xorpd		%xmm5,			%xmm6		// m ^ 0x1p-54 = (x < 1.0) ? 0x8030000000000000 ^ 0x1p-54 : 0x1p-54
 71	movsd		%xmm6,			24(STACKP)	// (x < 1.0) ? -0x1p-53 : 0x1p-54
 72#endif	
 73
 74	// Break x down into exp, f with -0.25 <= f < 0.5 and x = 2^exp(1+f)
 75#if defined( __x86_64__ )
 76	movd		%xmm0,				%r8	// u.u = encoding of x
 77        shrq		$(52),				%rdx	// exponent >>= 52
 78	andq		REL_ADDR(frexp_mant_mask),	%r8	// mantissa = u.u & 0x800fffffffffffffULL;
 79
 80	movd		%xmm0,				%rcx	// encoding of xadjust
 81	
 82	// or in -1 + bias to the new exponent for the mantissa
 83
 84	addq		$-1023,				%rdx	// exponent + bias (= -1023 for normal data)
 851:	//This is where denormal inputs re-enter with bias compensated for denormal exponent and appropriate mantissa
 86	//Note: for denormal xmm0 and rcx inputs are no longer the original x
 87	
 88	shrq		$51,				%rcx	// (encoding) >> 51
 89        andq		$0x1,				%rcx	// upperhalf = (encoding & 0x00080000ULL) >> 51 = (upperhalf != 0ULL);
 90	addq		%rcx,				%rdx	// exp = bias + (int) ( exponent >> 52 ) + (upperhalf != 0ULL);
 91	movl		%edx,				8(STACKP) // Save exp for later. Free dx and cx.
 92
 93	//Note: for denormal inputs this is no longer the original x
 94	movd		%xmm0,				%rax	// u.u
 95	// grab before adjusting for which half we are in
 96        andq		REL_ADDR(frexp_head_mask),	%rax	// head = u.u & 0x000ff00000000000ULL;
 97        shrq		$((52-8)-5),			%rax	// k<<5 = (int)(head>>(52-8))<<5;
 98	movd		%xmm0,				%rcx	// u.u = encoding of xadjust
 99        andq		REL_ADDR(frexp_half_mask),	%rcx	// upperhalf = u.u & 0x0008000000000000ULL;
100	shlq		$1,				%rcx	// (upperhalf << 1)
101	orq		REL_ADDR(one),			%r8	// (mantissa | 0x3ff0000000000000ULL)
102	// Divide by two if significand >= 3/2
103	xorq		%rcx,				%r8	// mantissa = (mantissa | 0x3ff0000000000000ULL) ^ (upperhalf << 1);
104	movd		%r8,				%xmm1	// fp1
105	// Now:
106	// k<<5 = %eax
107	// exp = %rdx
108	// fp1 = %xmm1 = (fp1 > 1.5) ? (0.5 * fp1 - 1.0) : (fp1 - 1.0);
109#else
110	movapd		%xmm0,				%xmm1	// u.u = encoding of x
111	andpd		REL_ADDR(frexp_mant_mask),	%xmm1	// mantissa = u.u & 0x800fffffffffffffULL;
112
113	movapd		%xmm0,				%xmm2	// u.u = encoding of adjusted x
114	psrlq		$32,				%xmm0	// top half of encoding of adjusted x
115
116	// or in -1 + bias to the new exponent for the mantissa
117        andpd		REL_ADDR(frexp_half_mask),		%xmm2	// upperhalf = u.u & 0x0008000000000000ULL;
118        shrl		$(52-32),				%edx	// exponent >>= 52-32
119	
120	addl		$-1023,				%edx	// exponent + bias; bias = -1023 for normal data
121	
1221:	//This is where denormal inputs re-enter with bias compensated for denormal exponent and appropriate mantissa
123
124	movapd		%xmm2,				%xmm7	// upperhalf
125	psrlq		$51,				%xmm7	// (upperhalf != 0ULL) = upperhalf >> 51 == 1 or 0
126	movd		%xmm7,				8(STACKP)	// (int) ( exponent >> 52 )
127	add		8(STACKP),			%edx		// exp = bias + (int) ( exponent >> 52 ) + (upperhalf != 0ULL);
128	movl		%edx,				8(STACKP)	// exp
129	movd		%xmm0,				%eax	// Encoding of top half of x 
130	// grab before adjusting for which half we are in
131        andl		$0x000ff000,			%eax	// head = u.u & 0x000ff00000000000ULL;
132        shrl		$(12-5),			%eax	// k<<5 = (int)(head>>(52-8-32))<<5;
133	psllq		$1,				%xmm2	// (upperhalf << 1)
134	orpd		REL_ADDR(xone),			%xmm1	// (mantissa | 0x3ff0000000000000ULL)
135	// Divide by two if significand >= 3/2
136	xorpd		%xmm2,				%xmm1	// mantissa = (mantissa | 0x3ff0000000000000ULL) ^ (upperhalf << 1);
137
138	// Now:
139	// k<<5 = %eax
140	// exp = 8(STACKP)
141	// fp1 = %xmm1 = (fp1 > 1.5) ? (0.5 * fp1) : (fp1);
142#endif
143	
144// We look up a+1 rather than a, so we can work with fp1 - ap1 rather than f - a
145	
146#if BASE2	
147	// if fp1 == 1.0 return nd // base 2 only
148	movsd		REL_ADDR(one),			%xmm2	// 1.0
149	ucomisd		%xmm2,				%xmm1	// f == 1.0 set cc
150	je		7f
151#endif 
152	
153	movsd		%xmm1,	(STACKP)
154
155// The lookup table is in a funny format since it has 2 long double and a single.
156	// {10-byte va ; 2-byte pad ; 4-byte single a ; 10-byte lg1pa ; 2-byte pad, 4-byte a+1}
157#if defined( __x86_64__ )
158	// in 64-bit need lea to get address
159	lea		REL_ADDR(LOOKUP),	%rdx
160	fldt		(%rdx,AX_P,1)	// [va]
161	fldl		(STACKP)	// [va ; f+1]
162
163	fsubs		28(%rdx,AX_P,1)	// [va ; h = f+1 - a+1]
164#if LOGUP	
165	//halfulp stuff goes here...
166        //h = (f - a) + u; // h = R(f-a)+ulp(x)
167	faddl		24(STACKP)			// [va ; h = (f - a) + u]
168#endif //LOGUP	
169	fmulp		%st,	%st(1)	// [c = h * va]
170	fstl		(STACKP) 	// [c] -- with double version stored on the stack
171	
172	fldt		16(%rdx,AX_P,1)	// [c ; lg1pa ]
173#if BASE2
174	// logbxhi = nd+lg1pa; // base 2
175	fiaddl		8(STACKP) 	// [c ; lghi = nd + lg1pa]
176#elif BASEE
177	// base e stuff goes here...
178	fiaddl		8(STACKP) 	// [c ; lghi = nd + lg1pa]
179	fldt		REL_ADDR(ln2l)	// [c ; lghi = nd + lg1pa ; ln2l]
180        fmulp		%st, %st(1)	// [ c ; logbxhi = ln2l*(nd+lg1pa)]
181#elif BASE10
182	// base 10 stuff goes here...
183	// free(%rax)
184	// free(%rdx)
185#if FENVON
186	movd		%xmm0,					%rdx	// Original x encoding
187	shrq		$(53-4),				%rdx	// 16*k = k << 4. k = key >> 53 = the top 6 of bottom 7 bits of exponent of x
188        andq		$0x3f,					%rdx	// key = _mm_and_pd(vx,log10_key_mask);
189	movd		%xmm0,					%rax	// Original x encoding
190	lea		REL_ADDR(isPowerOf10),			%rcx	// Get base of table
191	cmp		(%rcx,%rdx,1),				%rax	// if(isPowerOf10[k][0] == x) {return  isPowerOf10[k][1];}
192	je		8f
193#endif //FENVON base 10 64-bit
194	fiaddl		8(STACKP) 			// [c ; lghi = nd + lg1pa]
195	fldt		REL_ADDR(log102l)		// [c ; lghi = nd + lg1pa ; log102l]
196        fmulp		%st, %st(1)			// [ c ; logbxhi = log102l*(nd+lg1pa)]
197#endif
198	
199#else	// i386
200	fldt		(LOOKUP-0b)(%ecx,%eax,1)	// [va]
201	fldl		(STACKP)			// [va ; f+1]
202	//halfulp stuff goes here...
203	fsubs		(LOOKUP-0b+28)(%ecx,%eax,1)	// [va ; h = f+1 - a+1]
204#if LOGUP	
205        //h = (f - a) + u; // h = R(f-a)+ulp(x)
206	faddl		24(STACKP)			// [va ; h = (f - a) + u]
207#endif //LOGUP	
208	fmulp		%st,	%st(1)			// [c = h * va]
209	fstl		(STACKP) 			// [c] -- with double version stored on the stack
210	fldt		(LOOKUP-0b+16)(%ecx,%eax,1)	// [c ; lg1pa ]
211	
212#if BASE2
213	// logbxhi = nd+lg1pa; // base 2
214	fiaddl		8(STACKP) 			// [c ; lghi = nd + lg1pa]
215#elif BASEE
216	// base e stuff goes here...
217	fiaddl		8(STACKP) 			// [c ; lghi = nd + lg1pa]
218	fldt		REL_ADDR(ln2l)			// [c ; lghi = nd + lg1pa ; ln2l]
219        fmulp		%st, %st(1)			// [ c ; logbxhi = ln2l*(nd+lg1pa)]
220#elif BASE10
221	// base 10 stuff goes here...
222	// free(%edx)
223#if FENVON
224	movl		(4+LOCAL_STACK_SIZE)+FRAME_SIZE(STACKP),	%edx
225        andl		$0x07e00000,				%edx	// key = _mm_and_pd(vx,log10_key_mask);
226	shrl		$((53-32)-4),				%edx	// 16*k = k << 4. k = key >> 53 = the top 6 of bottom 7 bits of exponent of x
227	movsd		(LOCAL_STACK_SIZE)+FRAME_SIZE(STACKP),	%xmm5
228	comisd		(isPowerOf10-0b)(%ecx,%edx,1),		%xmm5	// if(isPowerOf10[k][0] == x) {return  isPowerOf10[k][1];}
229	je		8f
230#endif //FENVON base 10 i386
231	fiaddl		8(STACKP) 			// [c ; lghi = nd + lg1pa]
232	fldt		REL_ADDR(log102l)		// [c ; lghi = nd + lg1pa ; log102l]
233        fmulp		%st, %st(1)			// [ c ; logbxhi = log102l*(nd+lg1pa)]
234#endif //base 10 i386
235	
236#endif // i386
237
238	
239#ifdef __SSE3__	
240	movddup		(STACKP),	%xmm0	// get cd, double version of c off stack in both registers
241#else
242	movsd		(STACKP),	%xmm0
243	movhpd		(STACKP),	%xmm0	
244#endif
245	movapd		%xmm0,		%xmm1	// [cd,cd]
246	addpd		REL_ADDR(a01),	%xmm0	// [a0,a1] + [cd,cd]
247	mulpd		%xmm1,		%xmm0	// [a0+cd,a1+cd] * [cd,cd]
248	addpd		REL_ADDR(b01),	%xmm0	// vpoly = [b0,b1] + [(a0+cd)*cd,(a1+cd)*cd]
249	
250//long double logbec is log_b(e) * c
251//double c5b is log_b(e) * c5
252	fld		%st(1)	// [c ; lghi ; c]
253#if BASE2
254	// in base 2 logbec = lgel * c; c5b = c5_2;
255	fldt		REL_ADDR(lgel)	// [c ; lghi ; c ; lgel]
256	fmulp		%st, %st(1)	// [c ; lghi ; logbec = lgel * c]
257#define c5b c5_2	
258#elif BASEE
259	// in base e logbec = c; c5b = c5_e;
260					// [c ; lghi ; logbec = lnel * c = c]
261#define c5b c5_e
262#elif BASE10
263	// in base 10 logbec = log10el * c; c5b = c5_10;
264	fldt		REL_ADDR(log10el)	// [c ; lghi ; log10el ; c]
265	fmulp		%st, %st(1)	// [c ; lghi ; logbec = log10el * c]
266#define c5b c5_10
267#endif
268	fxch		%st(2)		// [logbec ; lghi ; c]
269	fldt		REL_ADDR(c0)	// [logbec ; lghi ; c ; c0]
270	fmulp		%st, %st(1)	// [logbec ; lghi ; c*c0]
271	fld1				// [logbec ; lghi ; c*c0 ; 1.0]
272	faddp		%st, %st(1)	// [logbec ; lghi ; 1.0 + c*c0]
273	fmulp		%st, %st(2)	// [linear = logbec*(1.0 + c*c0) ; lghi]
274	fxch		%st(1)		// [lghi ; linear]
275	
276	movapd		%xmm1,		%xmm2	// [cd, cd]
277	movhpd		REL_ADDR(c5b),	%xmm1	// [c5b, cd]
278	mulpd		%xmm2,		%xmm1	// vccc5 = [c5b*cd, cd*cd]
279	mulpd		%xmm1,		%xmm0	// vp = vccc5 * vpoly = [c5b*cd*(b0+(a0+cd)*cd), cd*cd*(b1+(a1+cd)*cd)]
280	//Do a horizontal multiply.
281	movapd		%xmm0,		%xmm1	// [ *, vplo]
282	shufpd		$3,	%xmm0,	%xmm0	// [ *, vphi]
283	mulsd		%xmm1,		%xmm0	// [ *, cccp = c5b*cd*(b0+(a0+cd)*cd) * cd*cd*(b1+(a1+cd)*cd)]
284	movsd		%xmm0,		(STACKP)	// Store so we can load into x87
285	faddl		(STACKP)		// [lghi ; logbxlo = linear + cccp]
286	faddp		%st,		%st(1)	// [longResult = lghi + logbxlo]
287						// Round to double
288	fstpl		(STACKP)		// result [<empty stack>]
289#if defined( __x86_64__ )
290	movq		(STACKP),	%xmm0	// result
291#else
292	fldl 		(STACKP)
293#endif
294#undef c5b
295
2962:	// Clean up stack and return
297	ADDP		$LOCAL_STACK_SIZE,	STACKP
298	ret
299
300////////////////////////////////////////////////////////////////	
301	
302	
3033:	// Special operand handling: NaN, Inf, Denorm, 0, or negative
304	// When we get here AX has as encoding(x) - 0x00100000[00000000 in 64-bit]
305	// +denormal -> denormal handling
306	// +zero -> -inf #ZERO_DIVIDE
307	// -stuff -> sqrt(x) (pass quiet NaNs or raise invalid)
308	// -zero -> -inf #ZERO_DIVIDE
309	// +NaN -> x+x
310	// +inf -> x+x	
311	xorpd		%xmm2,			%xmm2	// 0.0
312	ucomisd		%xmm2,			%xmm0	// f == 0 set cc
313	jp		9f	// NaN:  compares unequal and signals invalid only for signaling NaN
314	je		4f	// Zero: x == 0.0 -> -1 / zero
315
316        // We now need to distinguish between Denormal, Negative, and NaN/Inf.
317        // Adding the small_cut back in sets the EFLAGS for conditional branches
318#if defined( __x86_64__ )
319	addq		REL_ADDR(small_cut),	%rax
320#else
321	addl		$0x00100000,		%eax
322#endif
323	jb		5f	// Denormal (i.e., Inf > x > 0 and we have ruled out other cases)
324	js		6f	// Negative sign (may be "-NaN", -Inf, -Normal, or -Denormal)
325	
326	// If we fall through to here, we have "+NaN" or +Inf
3279:
328#if defined( __i386__ ) 	// Light cleanup required
329	movsd 		%xmm0,                  FRAME_SIZE(STACKP)
330	fldl 		FRAME_SIZE(STACKP)
331#endif
332	ret
333
3344:	// Zero: x == 0.0 -> -1 / zero
335	//We need to return -Inf for zero and set div/0 flag
336	xorpd 		%xmm1,			%xmm1
337	pcmpeqb 	%xmm0,                  %xmm0              // -1U
338	cvtdq2pd 	%xmm0, 			%xmm0              // -1.0f
339	divsd   	%xmm1,			%xmm0              // -1.0f / 0 = -Inf + div/0 flag
340#if defined( __i386__ ) 	// Light cleanup required
341	movsd 		%xmm0,                  FRAME_SIZE(STACKP)
342	fldl 		FRAME_SIZE(STACKP)
343#endif
344	ret
345	
3465:	// Denormal						     
347//Denormal inputs are frexped with extra care and then we branch back into the mainline code.
348	
349	SUBP		$LOCAL_STACK_SIZE, 		STACKP
350	// Break x down into exp, f with -0.25 <= f < 0.5 and x = 2^exp(1+f)
351	// knowing that we are starting with denormal x
352#if defined( __x86_64__ )
353	movd		%xmm0,				%r8	// u.u = encoding of x
354	andq		REL_ADDR(frexp_mant_mask),	%r8	// mantissa = u.u & 0x800fffffffffffffULL;
355        // or in bias to the new exponent for the mantissa
356        orq		REL_ADDR(one),			%r8	// mantissa |= 0x3ff0000000000000ULL;
357	movd		%r8,				%xmm0	// mantissa
358	subsd		REL_ADDR(one),			%xmm0
359	movd		%xmm0,				%rdx	// encoding rescaled input
360	movd		%xmm0,				%r8	// encoding rescaled input
361        shrq		$(52),				%rdx	// exponent >>= 52
362	andq		REL_ADDR(frexp_mant_mask),	%r8	// mantissa = u.u & 0x800fffffffffffffULL;	
363	movd		%xmm0,				%rcx	// encoding of xadjust
364	// or in -1 + bias to the new exponent for the mantissa
365	addq		$(-1023-1022),			%rdx	// exponent + bias (= -1023-1022 for subnormal data)
366#else
367	movapd		%xmm0,				%xmm1	// u.u = encoding of x
368        // or in bias to the new exponent for the mantissa
369	movq		REL_ADDR(one),			%xmm2	// 1.0
370        orpd		%xmm2,				%xmm0	// mantissa =  _mm_or_pd(x, xOne);
371	subsd		%xmm2,				%xmm0	// u = mantissa - xOne;
372	movapd		%xmm0,				%xmm1	// u.u = encoding of adjusted x
373	movapd		%xmm0,				%xmm7	// u.u = encoding of adjusted x (but we only want the high half)
374	psrlq		$32,				%xmm7	// top half of encoding of adjusted x
375	movd		%xmm7,				%edx	// top half of encoding of adjusted x
376	andpd		REL_ADDR(frexp_mant_mask),	%xmm1	// mantissa = u.u & 0x800fffffffffffffULL;
377
378	movapd		%xmm0,				%xmm2	// u.u = encoding of adjusted x
379	psrlq		$32,				%xmm0	// top half of encoding of adjusted x
380	
381	// or in -1 + bias to the new exponent for the mantissa
382        andpd		REL_ADDR(frexp_half_mask),		%xmm2	// upperhalf = u.u & 0x0008000000000000ULL;
383        shrl		$(52-32),				%edx	// exponent >>= 52-32
384	
385	addl		$(-1023-1022),				%edx	// exponent + bias; bias = -1023-1022 for subnormal data
386	
387#endif
388	jmp		1b	// Re-enter main line with bias compensated for denormal exponent and appropriate mantissa
389
390
3916:	// Negative sign (may be "-NaN", -Inf, -Normal, or -Denormal)	
392	// In any case sqrt(x) will do the right thing.
393	// -QNaN will pass through
394	// -SNaN will raise invalid and be quieted	
395	// -N will raise invalid and return a quiet NaN
396	sqrtsd		%xmm0,				%xmm0
397#if defined( __i386__ ) 	// Light cleanup required
398	movsd 		%xmm0,                  FRAME_SIZE(STACKP)
399	fldl 		FRAME_SIZE(STACKP)
400#endif
401	ret
402    
403#if BASE2	
4047:	 //early out for power of 2
405	ADDP 		$LOCAL_STACK_SIZE,	STACKP
406#if defined( __i386__ )
407	movl 		%edx, 			FRAME_SIZE( STACKP )
408	fildl 		FRAME_SIZE(STACKP)
409#else
410	cvtsi2sd 	%rdx, 			%xmm0
411#endif
412	ret
413#endif
414	
415// This base10 branch only needed if FENVON
416#if FENVON	
4178: 	// If(isPowerOf10[k][0] == x) {return  isPowerOf10[k][1];}
418	// Come in with two things on the x87 stack			// [c ; lg1pa ]
419	fstp		%st						// [c ]
420	fstp		%st						// [<empty>]
421#if defined( __i386__ )
422	movsd		(isPowerOf10-0b+8)(%ecx,%edx,1),	%xmm0	// isPowerOf10[k][1]
423#else	
424	movsd		8(%rcx,%rdx,1),				%xmm0	// isPowerOf10[k][1]
425#endif
426	// Clean up stack and return
427	ADDP		$LOCAL_STACK_SIZE,	STACKP
428	
429#if defined( __i386__ ) 	// Light cleanup required
430	movsd 		%xmm0,                  FRAME_SIZE(STACKP)
431	fldl 		FRAME_SIZE(STACKP)
432#endif
433	ret
434#endif
435	
436////////////////////////////////////////////////////////////////