this repo has no description
at fixPythonPipStalling 155 lines 6.7 kB view raw
/*
 *  roundl.s
 *
 *      by Ian Ollmann
 *
 *  Copyright (c) 2007 Apple Inc. All rights reserved.
 *
 *  Implementation for C99 round, lround and llround functions for __i386__ and __x86_64__.
 */

#include "machine/asm.h"

#define LOCAL_STACK_SIZE    12
#include "abi.h"

/*
 *  long double roundl( long double x )
 *
 *  Syntax:   AT&T, run through the C preprocessor.
 *  In:       x is read from memory at FRAME_SIZE( STACKP ): 64-bit significand
 *            first, then the 16-bit sign + biased exponent word 8 bytes later.
 *  Out:      result is left on top of the x87 stack.
 *  Clobbers: x86_64: rax, rcx, rdx, r8, r9, flags.
 *            i386:   eax, ecx, edx, xmm0, xmm1, xmm2, flags.
 *            Both variants overwrite the argument slot on the stack.
 *
 *  NOTE(review): FRAME_SIZE and STACKP come from abi.h, which is not visible
 *  here. This code never adjusts the stack pointer (the i386 call/pop pair is
 *  balanced), so FRAME_SIZE is presumably the offset from the entry stack
 *  pointer to the first argument. Confirm against abi.h.
 *
 *  Method: let E = (biased exponent) - 0x3ffe. One unsigned compare of E
 *  against 63 splits the input into four cases:
 *
 *      E in [0, 62]    0.5 <= |x| < 0x1.0p62. The significand is shifted right
 *                      until the bit worth 0.5 is bit 0, 1 is added, and the
 *                      sum is shifted right once more. That rounds halfway
 *                      cases away from zero. The sign is reapplied with a
 *                      multiply by +1.0f or -1.0f.
 *      E negative      |x| < 0.5. The result is a zero with the sign of x.
 *      E == 63         0x1.0p62 <= |x| < 0x1.0p63. Bit 0 of the significand is
 *                      already the 0.5 bit, so 1 is added and bit 0 cleared.
 *                      A carry out of the significand means +-0x1.0p63.
 *      E > 63          large values, infinities and NaNs. x is returned as
 *                      loaded.
 *
 *  The adds of +-0x1.0p63 whose sums are discarded, and the fistpl in the
 *  |x| < 0.5 case, are executed only so that the x87 unit raises the inexact
 *  flag when x is not already integral.
 */

#define ROUNDL_SE_OFFSET    8           // byte offset of the sign + exponent word within x
#define ROUNDL_EXP_MASK     0x7fff      // biased exponent field of that word
#define ROUNDL_EXP_HALF     0x3ffe      // biased exponent of 0.5
#define ROUNDL_EXP_BIAS     0x3fff      // biased exponent of 1.0

// Pairs of single precision constants, indexed by the sign bit of x (0 or 1).
.literal8
zero:           .long   0,          0x80000000      // { 0.0f, -0.0f }
one:            .long   0x3f800000, 0xbf800000      // { 1.0f, -1.0f }
large:          .long   0x5f000000, 0xdf000000      // { 0x1.0p63, -0x1.0p63 }

.literal16
explicitBit:    .quad   0x8000000000000000, 0
roundMask62:    .quad   0xFFFFFFFFFFFFFFFE, 0       // clears bit 0 (the 0.5 bit when E == 63)

.text
#if defined( __x86_64__ )
ENTRY( roundl )
    movzwq  ROUNDL_SE_OFFSET+FRAME_SIZE( STACKP ), %rdx    // rdx = sign + biased exponent
    movq    FRAME_SIZE( STACKP ), %rax          // rax = significand
    fldt    FRAME_SIZE( STACKP )                // { x }
    movq    %rdx, %r8                           // r8  = sign + biased exponent
    andq    $(ROUNDL_EXP_MASK), %rdx            // rdx = biased exponent
    shrq    $15, %r8                            // r8  = x < 0 ? 1 : 0
    subq    $(ROUNDL_EXP_HALF), %rdx            // rdx = E, negative when |x| < 0.5
    cmp     $(63), %rdx                         // unsigned: E outside [0, 62] ?
    jae     Lroundl_special                     // |x| >= 0x1.0p62 || |x| < 0.5 || isnan(x)

    // 0.5 <= |x| < 0x1.0p62: the rounded magnitude fits in 63 bits.
    movq    $63, %rcx
    subq    %rdx, %rcx                          // rcx = 63 - E
    leaq    large(%rip), %r9                    // r9  = &large[0]
    fadds   (%r9, %r8, 4)                       // { x +- 0x1.0p63 }  raises inexact if x is not integral
    shrq    %cl, %rax                           // bit 0 is now the 0.5 bit, bit 1 the units bit
    fstp    %st(0)                              // { }
    addq    $1, %rax                            // add 0.5 to the magnitude
    shrq    $1, %rax                            // rax = |result| as an integer

    // Renormalize: rebuild exponent and significand from the integer in rax.
    bsrq    %rax, %r9                           // r9 = index of leading set bit (rax is nonzero when the
                                                //      explicit integer bit of x was set)
    movq    $(ROUNDL_EXP_BIAS), %rdx
    movq    $63, %rcx
    addq    %r9, %rdx                           // rdx = biased exponent of |result|
    subq    %r9, %rcx                           // rcx = 63 - index of leading set bit
    movw    %dx, ROUNDL_SE_OFFSET+FRAME_SIZE( STACKP )     // store exponent word; sign bit is clear
    shlq    %cl, %rax                           // move leading bit up to bit 63
    movq    %rax, FRAME_SIZE( STACKP )          // store significand

    // Reload and restore the sign of x.
    fldt    FRAME_SIZE( STACKP )                // { |result| }
    leaq    one( %rip ), %rax                   // rax = &one[0]
    fmuls   (%rax, %r8, 4 )                     // { result }  times +1.0f or -1.0f
    ret

// |x| >= 0x1.0p62 || |x| < 0.5 || isnan(x). Flags still hold the cmp against 63.
Lroundl_special:
    je      Lroundl_exp62                       // E == 63
    jg      Lroundl_return                      // E >  63 (signed): return x unchanged

    // |x| < 0.5
    fistpl  FRAME_SIZE( STACKP )                // { }  raises inexact if x != 0
    leaq    zero( %rip), %rax                   // rax = &zero[0]
    flds    (%rax, %r8, 4 )                     // { +-0 } with the sign of x
Lroundl_return:
    ret

// 0x1.0p62 <= |x| < 0x1.0p63
Lroundl_exp62:
    leaq    large(%rip), %r9                    // r9 = &large[0]
    fadds   (%r9, %r8, 4)                       // { x +- 0x1.0p63 }  raises inexact if the 0.5 bit is set
    fstp    %st(0)                              // { }
    addq    $1, %rax                            // add 0.5 to the significand
    jz      Lroundl_carry                       // significand wrapped to zero

    andq    roundMask62(%rip), %rax             // drop the 0.5 bit
    movq    %rax, FRAME_SIZE( STACKP )          // store significand; sign and exponent are unchanged
    fldt    FRAME_SIZE( STACKP )                // { result }
    ret

// Rounding carried out of the significand: result is +-0x1.0p63.
Lroundl_carry:
    flds    (%r9, %r8, 4)                       // { +-0x1.0p63 }
    ret



#else
ENTRY( roundl )
    movzwl  ROUNDL_SE_OFFSET+FRAME_SIZE( STACKP ), %edx    // edx  = sign + biased exponent
    movq    FRAME_SIZE( STACKP ), %xmm0         // xmm0 = significand (low quadword)
    fldt    FRAME_SIZE( STACKP )                // { x }
    calll   0f                                  // fetch the address of label 0 for
0:  popl    %ecx                                // position independent access to the constants
    movl    %edx, %eax                          // eax = sign + biased exponent
    andl    $(ROUNDL_EXP_MASK), %edx            // edx = biased exponent
    shrl    $15, %eax                           // eax = x < 0 ? 1 : 0
    subl    $(ROUNDL_EXP_HALF), %edx            // edx = E, negative when |x| < 0.5
    cmp     $63, %edx                           // unsigned: E outside [0, 62] ?
    jae     Lroundl_special                     // |x| >= 0x1.0p62 || |x| < 0.5 || isnan(x)

    // 0.5 <= |x| < 0x1.0p62: the rounded magnitude fits in 63 bits.
    subl    $63, %edx                           // edx = E - 63
    fadds   (large-0b)(%ecx, %eax, 4)           // { x +- 0x1.0p63 }  raises inexact if x is not integral
    negl    %edx                                // edx = 63 - E
    fstp    %st(0)                              // { }
    movd    %edx, %xmm1                         // xmm1 = shift count
    psrlq   %xmm1, %xmm0                        // bit 0 is now the 0.5 bit
    pcmpeqb %xmm1, %xmm1                        // xmm1 = all ones, i.e. -1 in each quadword
    psubq   %xmm1, %xmm0                        // add 1 (0.5 in value) by subtracting -1
    psrlq   $1, %xmm0                           // low quadword = |result| as an integer
    movq    %xmm0, FRAME_SIZE( STACKP )         // store the low quadword

    fildll  FRAME_SIZE( STACKP )                // { |result| }
    fmuls   (one-0b)(%ecx, %eax, 4)             // { result }  times +1.0f or -1.0f
    ret


// |x| >= 0x1.0p62 || |x| < 0.5 || isnan(x). Flags still hold the cmp against 63.
Lroundl_special:
    je      Lroundl_exp62                       // E == 63
    jg      Lroundl_return                      // E >  63 (signed): return x unchanged

    // |x| < 0.5
    fistpl  FRAME_SIZE( STACKP )                // { }  raises inexact if x != 0
    flds    (zero-0b)(%ecx, %eax, 4 )           // { +-0 } with the sign of x
Lroundl_return:
    ret

// 0x1.0p62 <= |x| < 0x1.0p63
Lroundl_exp62:
    fadds   (large-0b)(%ecx, %eax, 4)           // { x +- 0x1.0p63 }  raises inexact if the 0.5 bit is set
    fstp    %st(0)                              // { }
    movdqa  %xmm0, %xmm2                        // xmm2 = copy of the original significand
    pcmpeqb %xmm1, %xmm1                        // xmm1 = -1LL in each quadword
    psubq   %xmm1, %xmm0                        // add 0.5 to the significand
    pxor    %xmm0, %xmm2                        // top bit of low quadword set if the leading bit flipped
    movmskpd %xmm2, %edx                        // edx bit 0 = top bit of the low quadword
    test    $1, %edx
    jnz     Lroundl_carry                       // leading bit changed: the add overflowed

    pand    (roundMask62-0b)(%ecx), %xmm0       // drop the 0.5 bit
    movq    %xmm0, FRAME_SIZE( STACKP )         // store significand; sign and exponent are unchanged
    fldt    FRAME_SIZE( STACKP )                // { result }
    ret

// Rounding carried out of the significand: result is +-0x1.0p63.
Lroundl_carry:
    flds    (large-0b)(%ecx, %eax, 4)           // { +-0x1.0p63 }
    ret


#endif