src/libm/Source/Intel/roundl.S at fixPythonPipStalling

overby.me / darling-nix
fork atom
this repo has no description
fork atom
darling-nix / src / libm / Source / Intel / roundl.S
at fixPythonPipStalling 155 lines 6.7 kB view raw
wrap content
Lubos Dolezel Restructured source tree to prepare for merge with the "darling" repo 10y ago
f228ae16
/*
 *  roundl.s
 *
 *		by Ian Ollmann
 *
 *  Copyright (c) 2007 Apple Inc. All rights reserved.
 *
 *	Implementation for C99 round, lround and llround functions for __i386__ and __x86_64__.
 */

 11#include "machine/asm.h"
 12
 13#define LOCAL_STACK_SIZE	12
 14#include "abi.h"
 15
 16.literal8
 17zero:           .long   0,          0x80000000      // { 0.0f, -0.0f }
 18one:            .long   0x3f800000, 0xbf800000      // { 1.0f, -1.0f }
 19large:          .long   0x5f000000, 0xdf000000      // { 0x1.0p63, -0x1.0p63 }
 20
 21.literal16
 22explicitBit:    .quad   0x8000000000000000,     0
 23roundMask62:    .quad   0xFFFFFFFFFFFFFFFE,     0
 24
 25.text
 26#if defined( __x86_64__ )
 27ENTRY( roundl )
 28    movzwq  8+FRAME_SIZE( STACKP ),     %rdx    // sign + biased exponent
 29    movq    FRAME_SIZE( STACKP ),       %rax    // mantissa
 30    fldt    FRAME_SIZE( STACKP )                // {x}
 31    movq    %rdx,                       %r8     // sign + biased exponent
 32    andq    $0x7fff,                    %rdx    // exponent + bias
 33    shrq    $15,                        %r8     // x < 0 ? 1 : 0
 34    subq    $0x3ffe,                    %rdx    // push |x| < 0.5 negative
 35    cmp     $(63),                      %rdx    // if( |x| >= 0x1.0p62 || |x| < 0.5 || isnan(x) )
 36    jae     1f                                  //      goto 1
 37  
 38//  |x| >= 0.5 and conversion does not overflow.
 39    movq    $63,                        %rcx    // 63
 40    subq    %rdx,                       %rcx    // 63-(exponent+1)
 41    leaq    large(%rip),                %r9     // address of large array
 42    fadds   (%r9, %r8, 4)                       // { x + (x < 0 ? -0x1.0p63 : 0x1.0p63) }       set inexact as necessary
 43    shrq    %cl,                        %rax    // shift units bit into 2's position
 44    fstp    %st(0)                              // { }
 45    addq    $1,                         %rax    // round away from zero
 46    shrq    $1,                         %rax    // shift units bit to 1's position
 47
 48    // find new exponent
 49    bsrq    %rax,                       %r9     // position of leading set bit. rax is never zero.
 50    movq    $0x3fff,                    %rdx    // bias
 51    movq    $63,                        %rcx    // 63
 52    addq    %r9,                        %rdx    // biased exponent
 53    subq    %r9,                        %rcx    // 63 - position of leading set bit
 54    movw    %dx,        8+FRAME_SIZE( STACKP )  // write out new exponent
 55    
 56    // shift significand into position
 57    shlq    %cl,                        %rax    // shift leading bit to higest position
 58    movq    %rax,       FRAME_SIZE( STACKP )    // write mantissa
 59
 60    // get sign
 61    fldt    FRAME_SIZE( STACKP )                // { |result| }
 62    leaq    one( %rip ),                %rax    // address of one array
 63    fmuls   (%rax, %r8, 4 )                     // { result }       multiply by +1 or -1 according to sign of original result
 64    ret
 65    
 66//  |x| >= 0x1.0p62 || |x| < 0.5 || isnan(x)
 671:  je      3f
 68    jg      2f
 69
 70//  |x| < 0.5
 71    fistpl  FRAME_SIZE( STACKP )                // { } set inexact if x != 0
 72    leaq    zero( %rip),                %rax    // address of zero array
 73    flds    (%rax, %r8, 4 )                     // load result
 742:  ret
 75  
 76//  0x1.0p62 <= |x| < 0x1.0p63
 773:  leaq    large(%rip),                %r9     // address of large array
 78    fadds   (%r9, %r8, 4)                       // { x + (x < 0 ? -0x1.0p63 : 0x1.0p63) }       set inexact as necessary
 79    fstp    %st(0)                              // { }
 80    addq    $1,                         %rax    // add 0.5 to significand
 81    jz      4f                                  // handle overflow
 82    
 83    andq    roundMask62(%rip),          %rax    // prune fractional bits
 84    movq    %rax,                       FRAME_SIZE( STACKP )    // write to mantissa
 85    fldt    FRAME_SIZE( STACKP )                // load result
 86    ret
 87
 88// result is +- 0x1.0p63
 894:  flds    (%r9, %r8, 4)                       // load result
 90    ret
 91    
 92    
 93
 94#else
 95ENTRY( roundl )
 96    movzwl  8+FRAME_SIZE( STACKP ),     %edx
 97    movq    FRAME_SIZE( STACKP ),       %xmm0
 98    fldt    FRAME_SIZE( STACKP )
 99    calll   0f
1000:  popl    %ecx
101    movl    %edx,                       %eax    // sign + biased exponent
102    andl    $0x7fff,                    %edx    // biased exponent
103    shrl    $15,                        %eax    // signof( x )
104    subl    $0x3ffe,                    %edx    // push |x| < 0.5 negative
105    cmp     $63,                        %edx    // if( |x| >= 0x1.0p62 || |x| < 0.5 || isnan(x) )
106    jae     1f                                  //      goto 1
107
108//  |x| >= 0.5 and conversion does not overflow.
109    subl    $63,                        %edx    // (exponent+1) - 63
110    fadds   (large-0b)(%ecx, %eax, 4)           // set inexact if necessary
111    negl    %edx                                // 63 - (exponent+1)
112    fstp    %st(0)                              // {}
113    movd    %edx,                       %xmm1   // 63 - (exponent+1)
114    psrlq   %xmm1,                      %xmm0   // move 0.5 bit to units position
115    pcmpeqb %xmm1,                      %xmm1   // -1
116    psubq   %xmm1,                      %xmm0   // add 1
117    psrlq   $1,                         %xmm0   // move 1's bit to units position
118    movq    %xmm0,                      FRAME_SIZE( STACKP )    // write out
119    
120    fildll  FRAME_SIZE( STACKP )                // { |result| }
121    fmuls   (one-0b)(%ecx, %eax, 4)             // { result }
122    ret
123
124
125//  |x| >= 0x1.0p62 || |x| < 0.5 || isnan(x)
1261:  je      3f
127    jg      2f
128
129//  |x| < 0.5
130    fistpl  FRAME_SIZE( STACKP )                // { } set inexact if x != 0
131    flds    (zero-0b)(%ecx, %eax, 4 )                     // load result
1322:  ret
133
134//  0x1.0p62 <= |x| < 0x1.0p63
1353:  fadds   (large-0b)(%ecx, %eax, 4)           // { x + (x < 0 ? -0x1.0p63 : 0x1.0p63) }       set inexact as necessary
136    fstp    %st(0)                              // { }
137    movdqa  %xmm0,                      %xmm2   // significand
138    pcmpeqb %xmm1,                      %xmm1   // -1LL
139    psubq   %xmm1,                      %xmm0   // add 0.5 to significand
140    pxor    %xmm0,                      %xmm2   // set leading bit if leading bit changed (overflow)
141    movmskpd    %xmm2,                  %edx
142    test    $1,                         %edx
143    jnz     4f
144    
145    pand    (roundMask62-0b)(%ecx),     %xmm0    // prune fractional bits
146    movq    %xmm0,                      FRAME_SIZE( STACKP )    // write to mantissa
147    fldt    FRAME_SIZE( STACKP )                // load result
148    ret
149
150// result is +- 0x1.0p63
1514:  flds    (large-0b)(%ecx, %eax, 4)           // load result
152    ret
153
154
155#endif