// Provenance: darling-nix (overby.me), src/libm/Source/Intel/lroundl.S,
// branch fixPythonPipStalling, last commit a76db625 "More progress" (Lubos Dolezel).

/*
 *  lroundl.s
 *
 *      by Ian Ollmann
 *
 *  Apple Inc. Copyright (c) 2007.  All rights reserved.
 *
 */

 // NOTE(review): FRAME_SIZE / STACKP presumably come from abi.h and ENTRY from <machine/asm.h> -- confirm.
#include "abi.h"
#include <machine/asm.h>

// 4-byte literals. The first, second and fourth are raw IEEE-754 single-precision bit patterns.
.align 2
.literal4
two63:          .long           0x5f000000              // 0x1.0p63f
mtwo63:         .long           0xdf000000              // -0x1.0p63f
one:            .long           1                       // integer 1: mask that isolates the leading fractional bit
inf:            .long           0x7f800000              // +infinity

// Double-precision overflow thresholds used by the 32-bit lroundl.
.align 3
.literal8
cutoff32:        .double     2147483647.5            // 2**31 - 0.5: smallest x whose rounded value is 2**31
mcutoff32:       .double    -2147483648.5            // -(2**31 + 0.5): largest x whose rounded value is below -2**31

// 16-byte literals.
.align 4
.literal16
cutoff:         .quad           0xffffffffffffffff, 0x403d      // 80-bit long double 0x1.0p63 - 0.5
sign:           .quad           0x0, 0xffffffffffffffff         // indexed by the sign bit: 0 for x >= 0, all ones for x < 0

 // ---------------------------------------------------------------------------
.text
#if defined( __x86_64__ )

//  lroundl / llroundl, x86-64 (both names label the same code).
//
//  Rounds the 80-bit long double x to the nearest integer, ties away from
//  zero, using integer shifts on the significand instead of the x87 rounding
//  mode.
//
//  In:    x at FRAME_SIZE( STACKP ): 64-bit significand, then the 16-bit
//         sign + exponent word at offset 8.
//  Out:   %rax = rounded value.
//           x >= 0x1.0p63 - 0.5      -> 0x7fffffffffffffff, invalid raised
//           x <  -0x1.0p63           -> 0x8000000000000000, invalid raised
//           x == -0x1.0p63           -> 0x8000000000000000, no flag
//           NaN                      -> 0x8000000000000000, invalid raised
//         Inexact is raised when x has a fractional part.
//  Uses:  %rcx, %rdx, flags; the x87 stack is left as it was on entry.
//         The special-case paths overwrite the argument slot on the stack.
//  NOTE(review): FRAME_SIZE, STACKP and ENTRY are presumably supplied by
//         abi.h / <machine/asm.h> -- confirm.

ENTRY( lroundl )
ENTRY( llroundl )
    movswl  8+FRAME_SIZE( STACKP ), %edx        // sign + exponent word of x
    andl    $0x7fff,                %edx        // exponent of x
    movq    FRAME_SIZE( STACKP ),   %rax        // 64-bit significand of x
    subl    $0x3ffe,                %edx        // push exponents less than -1 negative
    fldt    FRAME_SIZE( STACKP )                // { x }
    cmpl    $(63+1),                %edx        // if( |x| >= 0x1.0p63 || |x| < 0.5 || isnan(x) )
    jae     1f                                  //      goto 1   (unsigned test, so negative edx is caught too)

    // 0.5 <= |x| < 0x1.0p63
    fldt    cutoff( %rip )                      // { 0x1.0p63 - 0.5, x }
    fucomip %st(1),         %st(0)              // { x }
    je      3f                                  // x == 0x1.0p63 - 0.5 rounds to 0x1.0p63: overflow

    //Shift the significand right so that units bit is at units + 1 position
    movl    $63,                    %ecx
    subl    %edx,                   %ecx        // 63 - (exponent+1)
    shrq    %cl,                    %rax        // shift so that the units bit is at the +1 position
    movq    %rax,                   %rdx        // set aside a copy
    shrq    $1,                     %rax        // finish with a separate 1-bit shift: the total can be 64 bits, which one shift cannot do
    andq    $1,                     %rdx        // isolate the leading fractional bit
    addq    %rdx,                   %rax        // round the result up.

    //fix sign
    movswq  8+FRAME_SIZE( STACKP ), %rdx        // read the sign + exponent
    sarq    $16,                    %rdx        // remove exponent: rdx = 0 for x >= 0, -1 for x < 0
    xorq    %rdx,                   %rax        // flip the sign of the result
    subq    %rdx,                   %rax        // correct for 2's complement

    // set inexact as necessary
    fabs                                        // { |x| }
    fadds   two63(%rip)                         // { |x| + 0x1.0p63 }   set inexact as necessary
    fstp    %st(0)                              // throw away numerical result.

    ret

//  |x| >= 0x1.0p63 || |x| < 0.5 || isnan(x)
1:  jge     2f                                  // signed view of the same cmpl: edx >= 64

    // |x| < 0.5
    xorq    %rax,                   %rax        // result is 0
    fistpl  FRAME_SIZE( STACKP )                // set inexact  as necessary
    ret

//  |x| >= 0x1.0p63 || isnan(x)
2:  movswq  8+FRAME_SIZE( STACKP ), %rdx        // sign + exponent word, sign-extended to 64 bits
    flds    mtwo63( %rip )                      // { -0x1.0p63, x }
    fucomip %st(1), %st(0)                      // { x }
    je      4f                                  // x == -0x1.0p63, or unordered (NaN also sets ZF)
    fistpl  FRAME_SIZE( STACKP )                // set invalid
    shrq    $63,                    %rdx        // 1 for x < 0, 0 for x > 0
    subq    $1,                     %rdx        // 0 for x < 0, all ones for x > 0
    movq    $0x8000000000000000,    %rax
    xorq    %rdx,                   %rax        // 0x8000... for x < 0, 0x7fff... for x > 0
    ret


//  0x1.0p63 - 0.5, positive overflow
3:  fistpl  FRAME_SIZE( STACKP )                // set invalid
    movq    $0x7fffffffffffffff,    %rax
    ret

//  -0x1.0p63 or nan
4:  jp      5f                                  // PF set by fucomip means unordered: NaN
    fstp    %st(0)                              // exactly -0x1.0p63: representable, no flags raised
    movq    $0x8000000000000000,    %rax
    ret

//  nan
5:  fistpl  FRAME_SIZE( STACKP )                // set invalid
    movq    $0x8000000000000000,    %rax
    ret

#else

//  lroundl, i386.
//
//  Rounds the 80-bit long double x to the nearest 32-bit integer, ties away
//  from zero. Only the high 32 bits of the significand are needed because
//  |x| < 2**31 on the main path.
//
//  In:    x at FRAME_SIZE( STACKP ): 64-bit significand, then the 16-bit
//         sign + exponent word at offset 8.
//  Out:   %eax = rounded value.
//           x >= 2**31 - 0.5         -> 0x7fffffff, invalid raised
//           x <= -(2**31 + 0.5)      -> 0x80000000, invalid raised
//           NaN                      -> 0x80000000, invalid raised
//         Inexact is raised when x has a fractional part.
//  Uses:  %ecx (PIC base obtained with call/pop, later the shift count),
//         %edx, flags; the x87 stack is left as it was on entry.
//         The special-case paths overwrite the argument slot on the stack.
//  NOTE(review): FRAME_SIZE, STACKP and ENTRY are presumably supplied by
//         abi.h / <machine/asm.h> -- confirm.

ENTRY( lroundl )
    movswl  8+FRAME_SIZE( STACKP ), %edx        // sign + exponent word of x
    andl    $0x7fff,                %edx        // exponent of x
    movl    4+FRAME_SIZE( STACKP ), %eax        // high 32 bits of the significand
    subl    $0x3ffe,                %edx        // push exponents less than -1 negative
    fldt    FRAME_SIZE( STACKP )                // { x }
    cmpl    $(31+1),                %edx        // if( |x| >= 0x1.0p31 || |x| < 0.5 || isnan(x) )
    jae     1f                                  //      goto 1   (unsigned test, so negative edx is caught too)

    // 0.5 <= |x| < 0x1.0p31
    call    0f
0:  popl    %ecx                                // ecx = address of label 0, base for the literal loads
    fldl    (cutoff32-0b)(%ecx)                 // { 2**31 - 0.5, x }
    fucomip %st(1),     %st(0)                  // { x }
    jbe     3f                                  // x >= 2**31 - 0.5 rounds to 2**31: positive overflow

    // set inexact
    fabs                                        // { |x| }
    fadds   (two63-0b)(%ecx)                    // { |x| + 0x1.0p63 }   inexact if x has a fractional part
    fstp    %st(0)                              // throw away numerical result.

    // round
    movl    $31,                    %ecx        // PIC base no longer needed; reuse ecx as shift count
    subl    %edx,                   %ecx        // 31 - (exponent+1)
    shrl    %cl,                    %eax        // leading fractional bit is now bit 0
    movl    %eax,                   %edx        // set aside a copy
    shrl    $1,                     %eax        // eax = integer part of |x|
    andl    $1,                     %edx        // isolate the leading fractional bit
    addl    %edx,                   %eax        // round the magnitude up when the fraction is >= 0.5

    // fix sign
    movswl  8+FRAME_SIZE( STACKP ), %edx        // read the sign + exponent
    sarl    $16,                    %edx        // edx = 0 for x >= 0, -1 for x < 0
    xorl    %edx,                   %eax        // flip the sign of the result
    subl    %edx,                   %eax        // correct for 2's complement
    ret

//  |x| >= 0x1.0p31 || |x| < 0.5 || isnan(x)
1:  jge     2f                                  // signed view of the same cmpl: edx >= 32

    // |x| < 0.5
    xorl    %eax,                   %eax        // result is 0
    fistpl  FRAME_SIZE( STACKP )                // set inexact  as necessary
    ret

//  |x| >= 0x1.0p31 || isnan(x)
2:  movswl  8+FRAME_SIZE( STACKP),  %edx        // sign + exponent word, sign-extended
    call    0f
0:  popl    %ecx                                // ecx = address of label 0, base for the literal loads
    fldl    ( mcutoff32-0b)(%ecx)               // { -(2**31 + 0.5), x }
    fucomip %st(1),         %st(0)              // { x }
    jae     4f                                  // x <= -(2**31 + 0.5): negative overflow (NaN sets CF, so it falls through)
    fldl    ( cutoff32-0b )(%ecx)               // { 2**31 - 0.5, x }
    fucomip %st(1),         %st(0)              // { x }
    jbe     3f                                  // x >= 2**31 - 0.5, or NaN (unordered sets CF and ZF)

    // non overflowing result: only -(2**31 + 0.5) < x <= -2**31 reaches here
    shrl    $31,                    %edx        // sign bit: 1
    subl    $1,                     %edx        // 0
    movl    $0x80000000,            %eax
    xorl    %edx,                   %eax        // eax = 0x80000000

    //set inexact
    fabs                                        // { |x| }
    fadds   (two63-0b)(%ecx)                    // { |x| + 0x1.0p63 }   inexact if x has a fractional part
    fstp    %st(0)                              // throw away numerical result.
    ret

//  positive overflow
3:  jp      5f                                  // PF still set from fucomip means unordered: NaN
    fistps  FRAME_SIZE( STACKP )                // set invalid: 16-bit store of an out-of-range value
    movl    $0x7fffffff,            %eax
    ret

// negative overflow
4:  fistps  FRAME_SIZE( STACKP )                // set invalid: 16-bit store of an out-of-range value
    movl    $0x80000000,            %eax
    ret

// nan
5:  fistpl  FRAME_SIZE( STACKP )                // set invalid
    movl    $0x80000000,            %eax
    ret


//  llroundl, i386.
//
//  Rounds the 80-bit long double x to the nearest 64-bit integer, ties away
//  from zero. The 64-bit shifts, add and negate are done in XMM registers
//  because the integer registers are only 32 bits wide here.
//
//  In:    x at FRAME_SIZE( STACKP ): 64-bit significand, then the 16-bit
//         sign + exponent word at offset 8.
//  Out:   %edx:%eax = rounded value (high:low).
//           x >= 0x1.0p63 - 0.5      -> 0x7fffffffffffffff, invalid raised
//           x <  -0x1.0p63           -> 0x8000000000000000, invalid raised
//           x == -0x1.0p63           -> 0x8000000000000000, no flag
//           NaN                      -> 0x8000000000000000, invalid raised
//         Inexact is raised when x has a fractional part.
//  Uses:  %ecx (PIC base obtained with call/pop), %xmm0-%xmm2, flags;
//         the x87 stack is left as it was on entry.
//         The special-case paths overwrite the argument slot on the stack.
//  NOTE(review): FRAME_SIZE, STACKP and ENTRY are presumably supplied by
//         abi.h / <machine/asm.h> -- confirm.

ENTRY( llroundl )
    movswl  8+FRAME_SIZE( STACKP ), %edx        // sign + exponent word of x
    andl    $0x7fff,                %edx        // exponent of x
    movq    FRAME_SIZE( STACKP ),   %xmm0       // 64-bit significand of x
    subl    $0x3ffe,                %edx        // push exponents less than -1 negative
    fldt    FRAME_SIZE( STACKP )                // { x }
    cmpl    $(63+1),                %edx        // if( |x| >= 0x1.0p63 || |x| < 0.5 || isnan(x) )
    jae     1f                                  //      goto 1   (unsigned test, so negative edx is caught too)

    call    0f
0:  popl    %ecx                                // ecx = address of label 0, base for the literal loads

    // 0.5 <= |x| < 0x1.0p63
    fldt    (cutoff-0b)( %ecx )                 // { 0x1.0p63 - 0.5, x }
    fucomip %st(1),         %st(0)              // { x }
    je      3f                                  // x == 0x1.0p63 - 0.5 rounds to 0x1.0p63: overflow

    //Shift the significand right so that units bit is at units + 1 position
    movl    $63,                    %eax
    movd    (one-0b)(%ecx),         %xmm2       // 1
    subl    %edx,                   %eax        // 63 - (exponent+1)
    movd    %eax,                   %xmm1       // shift count
    psrlq   %xmm1,                  %xmm0       // shift so that the units bit is at the +1 position
    movq    %xmm0,                  %xmm1       // set aside a copy
    psrlq   $1,                     %xmm0       // finish with a separate 1-bit shift: the total can be 64 bits
    pand    %xmm2,                  %xmm1       // isolate the leading fractional bit
    paddq   %xmm1,                  %xmm0       // round the result up.

    // set inexact as necessary
    fabs                                        // { |x| }
    fadds   (two63-0b)(%ecx)                    // { |x| + 0x1.0p63 }   set inexact as necessary
    fstp    %st(0)                              // throw away numerical result.

    //fix sign
    movswl  8+FRAME_SIZE( STACKP ), %eax        // read the sign + exponent
    shrl    $31,                    %eax        // remove exponent: eax = 1 for x < 0, 0 otherwise
    movq    (sign-0b)(%ecx, %eax,8), %xmm1      // 0 for x >= 0, all ones for x < 0
    pxor    %xmm1,                  %xmm0       // flip the sign of the result
    psubq   %xmm1,                  %xmm0       // correct for 2's complement
    movd    %xmm0,                  %eax        // low 32 bits of the result
    psrlq   $32,                    %xmm0
    movd    %xmm0,                  %edx        // high 32 bits of the result

    ret

//  |x| >= 0x1.0p63 || |x| < 0.5 || isnan(x)
1:  jge     2f                                  // signed view of the same cmpl: edx >= 64

    // |x| < 0.5
    xorl    %eax,                   %eax        // result is 0
    xorl    %edx,                   %edx
    fistpl  FRAME_SIZE( STACKP )                // set inexact  as necessary
    ret

//  |x| >= 0x1.0p63 || isnan(x)
2:  movswl  8+FRAME_SIZE( STACKP ), %eax        // sign + exponent word, sign-extended
    call    0f
0:  popl    %ecx                                // ecx = address of label 0, base for the literal loads
    flds    (mtwo63-0b)( %ecx )                 // { -0x1.0p63, x }
    fucomip %st(1), %st(0)                      // { x }
    je      4f                                  // x == -0x1.0p63, or unordered (NaN also sets ZF)
    fistpl  FRAME_SIZE( STACKP )                // set invalid
    shrl    $31,                    %eax        // 1 for x < 0, 0 for x > 0
    subl    $1,                     %eax        // low word: 0 for x < 0, 0xffffffff for x > 0
    movl    $0x80000000,            %edx
    xorl    %eax,                   %edx        // high word: 0x80000000 for x < 0, 0x7fffffff for x > 0
    ret


//  0x1.0p63 - 0.5, positive overflow
3:  fistpl  FRAME_SIZE( STACKP )                // set invalid
    movl    $-1,                    %eax
    movl    $0x7fffffff,            %edx
    ret

//  -0x1.0p63 or nan
4:  jp      5f                                  // PF set by fucomip means unordered: NaN
    fstp    %st(0)                              // exactly -0x1.0p63: representable, no flags raised
    movl    $0x80000000,            %edx
    xorl    %eax,                   %eax
    ret

//  nan
5:  fistpl  FRAME_SIZE( STACKP )                // set invalid
    movl    $0x80000000,            %edx
    xorl    %eax,                   %eax
    ret



#endif