src/libm/Source/Intel/e_cbrtl.S at fixPythonPipStalling

overby.me / darling-nix
fork atom
this repo has no description
fork atom
darling-nix / src / libm / Source / Intel / e_cbrtl.S
at fixPythonPipStalling 203 lines 11 kB view raw
wrap content
Lubos Dolezel More progress 9y ago
a76db625
  1/*
  2 *  e_cbrtl.s
  3 *  LibmV5
  4 *
  5 *  Created by Ian Ollmann on 8/27/05.
  6 *  Copyright 2005 Apple Computer. All rights reserved.
  7 *
  8 */
  9
 10#define	ENTRY(name)							      \
 11  .globl _##name;				      			      \
 12  .align 4;							      	      \
 13  _##name##:								      
 14
 15#if defined( __LP64__ )
 16	#define LOCAL_STACK_SIZE	20
 17#else
 18	#define LOCAL_STACK_SIZE	28
 19#endif
 20
 21#include "abi.h"
 22
 23.const
 24.align 4
 25onethird:       .long           0xaaaaaaab, 0xaaaaaaaa, 0x00003ffd, 0x00000000                  //(long double) 1.0L/3.0L
 26
 27.align 3
 28correction:     .double         0.62996052494743658238361, 0.79370052598409973737585, 1.0, 1.2599210498948731647672, 1.5874010519681994747517
 29coeffs:         .double         1.7830491344381518, -1.5730724799776633, 1.2536000054780357, -0.60460822457398278, 0.15834924310704463, -0.017322841453552703
 30
 31.align 2
 32infinity:       .single         +Infinity
 33
 34// Stack:
 35//      old ebp
 36//      old ebx
 37
 38
 39.text
 40
 41#if defined( __x86_64__ )
 42	#define RELATIVE_ADDR( _a)								(_a)( %rip )
 43#elif defined( __i386__ )
 44	#define RELATIVE_ADDR( _a)								(_a)-rel_addr( BX_P )
 45#else
 46	#error arch not supported
 47#endif
 48
 49
 50#if defined( __i386__ )
 51	//a short routine to get the local address
 52	local_addr:
 53		MOVP    (STACKP), BX_P
 54		ret
 55#endif
 56
 57
 58ENTRY( cbrtl )
 59	#if defined( __LP64__ )
 60		SUBP	$LOCAL_STACK_SIZE, STACKP
 61        movl	$0x55555556, 16(STACKP)           //write out fixed point 1/3, align stack to 16 bytes
 62        movq    $0, 8(STACKP)
 63        movq    $0, 0(STACKP)
 64		leaq	(correction + 16)(%rip), %r8
 65	#else
 66        pushl   BASEP                           //push ebp
 67        movl    STACKP, BASEP                   //copy stack pointer to ebp
 68        pushl   %ebx                            //push ebx
 69        pushl   $0x55555556                     //write out fixed point 1/3, align stack to 16 bytes
 70        pushl   $0
 71        pushl   $0
 72        pushl   $0
 73        pushl   $0
 74        CALLP   local_addr                      //load the address of rel_addr into %ebx
 75rel_addr:
 76	#endif
 77
 78        //load our argument
 79        fldt            FIRST_ARG_OFFSET(STACKP)        //{x}
 80
 81        //if( fabs(x) == INF || fabs(x) is NaN )
 82        //      return x + x
 83        fld             %ST(0)                          //{x, x}
 84        fabs                                            //{|x|, x}
 85        flds            RELATIVE_ADDR(infinity)         //{inf, |x|,    x}
 86        fucomip         %ST(1), %ST                     //{|x|, x}
 87        jne             test_zero
 88        fstp            %ST(0)                          //{x}
 89        fadd            %ST(0)                          //{x + x}
 90        jmp             my_cbrtl_exit
 91
 92test_zero:
 93        //if( x == 0.0 )
 94        //      return x;
 95        fldz                                            //{0, |x|, x}
 96        fucomip         %ST(1), %ST                     //{|x|, x}
 97        jne             main_part                       //{|x|, x}
 98        fstp            %ST(0)                          //{x}
 99        jmp             my_cbrtl_exit                   //{x}
100
101main_part:                                              //{|x|, x}
102        //extract significand and exponent parts
103        fxtract                                         //{ |significand|, exponent, x }
104
105        //write out the exponent as an integer
106        fxch                                            //{ exponent, |significand|, x }
107        fistpl          (STACKP)                        //{ |significand|, x }
108
109        //apply polynomial to significand, store in s,                                                  figure out what the new exponent is
110        fld             %ST(0)                          //{ s, |significand|, x }
111        fmull           RELATIVE_ADDR(coeffs+5*8)       //{ s*c5, |significand|, x }
112        movl            (STACKP), %eax                  //                                              load the exponent
113        faddl           RELATIVE_ADDR(coeffs+4*8)       //{ s*c5+c4, |significand|, x }
114        imull           16(STACKP)                      //                                              divide the exponent by 3 (multiply by 0x55555556 and take high 32 bits, place in %edx)
115        movl            (STACKP), %eax                  //                                              get exponent >> 1
116        sarl            $31, %eax
117        fmul            %ST(1)                          //{ (c4+c5*s)s, |significand|, x }
118        faddl           RELATIVE_ADDR(coeffs+3*8)       //{ c3+(c4+c5*s)s, |significand|, x }
119        subl            %eax,   %edx                    //                                              subtract the sign of the exponent (makes our approximation work for neg numbers)
120        movl            %edx,   %eax                    //                                              copy exponent/3 to eax       
121        fmul            %ST(1)                          //{ (c3+(c4+c5*s)s)s, |significand|, x }
122        faddl           RELATIVE_ADDR(coeffs+2*8)       //{ c2+(c3+(c4+c5*s)s)s, |significand|, x }
123        imul            $3,     %edx                    //                                              exponent/3 *= 3
124        fmul            %ST(1)                          //{ (c2+(c3+(c4+c5*s)s)s)s, |significand|, x }
125        faddl           RELATIVE_ADDR(coeffs+1*8)       //{ c1+(c2+(c3+(c4+c5*s)s)s)s, |significand|, x }
126        subl            (STACKP), %edx                  //                                              remainder = (exponent/3)*3 - original exponent   (edx)
127        fmul            %ST(1)                          //{ (c1+(c2+(c3+(c4+c5*s)s)s)s)s, |significand|, x }
128        faddl           RELATIVE_ADDR(coeffs+0*8)       //{ c0+(c1+(c2+(c3+(c4+c5*s)s)s)s)s, |significand|, x }
129        neg             %eax                            //                                              exponent = -exponent
130        shld            $16, %eax, %eax                 //                                              exponent <<= 16
131        andl            $0xFFFF0000, %eax               //                                              mask off the other mantissa bits
132        addl            $0x3FFF8000, %eax               //                                              bias the exponent, set the top mantissa bit
133        movl            %eax,   6(STACKP)               //                                              write out exponent/3
134
135
136        //correct for exponent remainder to get our estimate
137#if defined( __LP64__ )		
138		movslq			%edx,	%rdx									//sign extend edx
139		fmull           ( %r8, %rdx, 8 )								//{ e, |significand|, x}
140#else
141		fmull           correction + 16 - rel_addr( %ebx, %edx, 8 )		//{ e, |significand|, x}
142#endif
143
144        //fix up the sign of the estimate
145        fld             %ST(0)                          //{ e, e, |significand|, x}
146        fchs                                            //{-e, e, |significand|, x}
147        fldz                                            //{0, -e, e, |significand|, x}
148        fucomip         %ST(4), %ST                     //{-e, e, |significand|, x}     if( 0 < x )
149        fcmovb          %ST(1), %ST(0)                  //{+-e, e, |significand|, x} 
150        fstp            %ST(1)                          //{+-e, |significand|, x}
151        fstp            %ST(1)                          //{+-e, x}
152
153        //apply the appropriate exponent
154        fldt            (STACKP)                        //{ new exponent, +-e, x }
155        fmulp                                           //{ +-e with correct exponent, x }
156
157        //       e += oneThird * e * (1.0L - x * e * e * e);
158        fldt            RELATIVE_ADDR( onethird)        //{0.3333, e, x}
159        fld             %ST(1)                          //{ e, 0.3333, e, x }
160        fmul            %ST(3)                          //{ x*e, 0.3333, e, x}
161        fmul            %ST(2)                          //{ e*x*e, 0.3333, e, x}
162        fmul            %ST(2)                          //{ e*e*x*e, 0.3333, e, x}
163        fld1                                            //{1.0, e*e*x*e, 0.3333, e, x}
164        fsubp                                           //{1.0 - e*e*x*e, 0.3333, e, x}
165        fld             %ST(2)                          //{e, 1.0 - e*e*x*e, 0.3333, e, x}
166        fmul            %ST(2)                          //{0.3333*e, 1.0 - e*e*x*e, 0.3333, e, x}
167        fmulp                                           //{0.3333*e*(1.0 - e*e*x*e), 0.3333, e, x}
168        faddp           %ST(0), %ST(2)                  //{0.3333, e+0.3333*e*(1.0 - e*e*x*e), x }
169
170        //       e += oneThird * e * (1.0L - x * e * e * e);
171        fld             %ST(1)                          //{ e, 0.3333, e, x }
172        fmul            %ST(3)                          //{ x*e, 0.3333, e, x}
173        fmul            %ST(2)                          //{ e*x*e, 0.3333, e, x}
174        fmul            %ST(2)                          //{ e*e*x*e, 0.3333, e, x}
175        fld1                                            //{1.0, e*e*x*e, 0.3333, e, x}
176        fsubp                                           //{1.0 - e*e*x*e, 0.3333, e, x}
177        fld             %ST(2)                          //{e, 1.0 - e*e*x*e, 0.3333, e, x}
178        fmul            %ST(2)                          //{0.3333*e, 1.0 - e*e*x*e, 0.3333, e, x}
179        fmulp                                           //{0.3333*e*(1.0 - e*e*x*e), 0.3333, e, x}
180        faddp           %ST(0), %ST(2)                  //{0.3333, e+0.3333*e*(1.0 - e*e*x*e), x }
181
182
183        //       e = (e*x)*e;
184        fld             %ST(1)                          //{e, 0.3333, e, x}
185        fmul            %ST(3)                          //{e*x, 0.3333, e, x}
186        fmulp           %ST(0), %ST(2)                  //{0.3333, (e*x)*e, x}
187
188        //       e -= ( e - (x/(e*e)) ) * oneThird;
189        fld             %ST(1)                          //{ e, 0.3333, e, x }
190        fmul            %ST(0)                          //{ e*e, 0.3333, e, x}
191        fdivr           %ST(3), %ST(0)                  //{ x/(e*e), 0.3333, e, x }
192        fsubr           %ST(2), %ST(0)                  //{ e - x/(e*e), 0.3333, e, x }
193        fmulp                                           //{ 0.3333*(e - x/(e*e)), e, x }
194        fsubrp          %ST(0), %ST(1)                  //{ e - 0.3333*(e - x/(e*e)), x }
195        fstp            %ST(1)
196
197my_cbrtl_exit:
198	#if defined( __i386__ )
199        movl    20(STACKP),       %ebx
200        movl    24(STACKP),       BASEP
201	#endif
202        ADDP    $LOCAL_STACK_SIZE,    STACKP
203        ret