this repo has no description
1/*
2 * roundl.s
3 *
4 * by Ian Ollmann
5 *
6 * Copyright (c) 2007 Apple Inc. All rights reserved.
7 *
8 * Implementation for C99 round, lround and llround functions for __i386__ and __x86_64__.
9 */
10
11#include "machine/asm.h"
12
13#define LOCAL_STACK_SIZE 12
14#include "abi.h"
15
// Constant tables for roundl.
// zero, one and large each hold two single-precision values: the positive one
// first, then the negative one. The code indexes them with the sign bit of x
// (0 or 1) scaled by 4 bytes.
16.literal8
17zero: .long 0, 0x80000000 // { 0.0f, -0.0f } result when |x| < 0.5
18one: .long 0x3f800000, 0xbf800000 // { 1.0f, -1.0f } multiplier that restores the sign of the result
19large: .long 0x5f000000, 0xdf000000 // { 0x1.0p63, -0x1.0p63 } added to x to raise inexact; also the result when rounding carries up to 2^63
20
// 16-byte constants. Only the low quadword of each is non-zero.
21.literal16
22explicitBit: .quad 0x8000000000000000, 0 // bit 63 only; not referenced by the code visible in this file
23roundMask62: .quad 0xFFFFFFFFFFFFFFFE, 0 // clears bit 0 of the significand (the 0.5 bit when 0x1.0p62 <= |x| < 0x1.0p63)
24
25.text
26#if defined( __x86_64__ )
// roundl, x86_64 variant.
// Rounds the 80-bit extended-precision argument stored at FRAME_SIZE(STACKP)
// to an integral value, with halfway cases rounded away from zero, and leaves
// the result in st(0).
// ENTRY, FRAME_SIZE and STACKP are not defined in this file (presumably they
// come from the included headers).
//
// Register roles once the prologue has run:
//   rax     = 64-bit significand of x
//   rdx     = biased exponent - 0x3ffe
//   r8      = sign bit of x (0 or 1), index into the two-entry constant tables
//   rcx, r9 = scratch
// The argument's own stack slot is reused as scratch storage on every path
// except the one that returns x unchanged.
//
// Dispatch on rdx, decided by a single cmp against 63:
//   0 .. 62  fall through: round the significand with integer arithmetic
//   == 63    label 3: bit 0 of the significand is the 0.5 bit
//   >  63    label 2: return x as loaded (large values, infinities, NaNs)
//   <  0     |x| < 0.5: return a zero carrying the sign of x
// The je/jg at label 1 consume the flags of that same cmp; the only
// instruction between them is the jae, which does not modify flags.
27ENTRY( roundl )
28 movzwq 8+FRAME_SIZE( STACKP ), %rdx // sign + biased exponent (16-bit word, zero-extended)
29 movq FRAME_SIZE( STACKP ), %rax // mantissa (64-bit significand)
30 fldt FRAME_SIZE( STACKP ) // {x}
31 movq %rdx, %r8 // sign + biased exponent
32 andq $0x7fff, %rdx // exponent + bias (sign bit masked off)
33 shrq $15, %r8 // sign bit of x: 1 if set, else 0
34 subq $0x3ffe, %rdx // push |x| < 0.5 negative
35 cmp $(63), %rdx // if( |x| >= 0x1.0p62 || |x| < 0.5 || isnan(x) )
36 jae 1f // goto 1 (unsigned test, so a negative rdx branches too)
37
38// 0.5 <= |x| < 0x1.0p62: the rounded magnitude fits in 63 bits.
39 movq $63, %rcx // 63
40 subq %rdx, %rcx // 63-(exponent+1): shift count that leaves the 0.5 bit in bit 0
41 leaq large(%rip), %r9 // address of large array
42 fadds (%r9, %r8, 4) // { x + (x < 0 ? -0x1.0p63 : 0x1.0p63) } set inexact as necessary; the sum is discarded below
43 shrq %cl, %rax // shift units bit into 2's position (bit 1); bit 0 is now the 0.5 bit
44 fstp %st(0) // { } drop the sum
45 addq $1, %rax // round away from zero (adds 0.5)
46 shrq $1, %rax // shift units bit to 1's position; rax = |result| as an integer
47
48 // find new exponent
49 bsrq %rax, %r9 // position of leading set bit. rax is never zero (holds when the explicit leading bit of x is set).
50 movq $0x3fff, %rdx // bias
51 movq $63, %rcx // 63
52 addq %r9, %rdx // biased exponent
53 subq %r9, %rcx // 63 - position of leading set bit
54 movw %dx, 8+FRAME_SIZE( STACKP ) // write out new exponent; bit 15 (the sign) is written as 0
55
56 // shift significand into position
57 shlq %cl, %rax // shift leading bit to highest position
58 movq %rax, FRAME_SIZE( STACKP ) // write mantissa
59
60 // get sign
61 fldt FRAME_SIZE( STACKP ) // { |result| }
62 leaq one( %rip ), %rax // address of one array
63 fmuls (%rax, %r8, 4 ) // { result } multiply by +1 or -1 according to the sign bit of x
64 ret
65
66// |x| >= 0x1.0p62 || |x| < 0.5 || isnan(x)
// Flags here are still those of the cmp against 63 above.
671: je 3f // rdx == 63
68 jg 2f // rdx > 63 (signed): return x, which is already in st(0)
69
70// |x| < 0.5
71 fistpl FRAME_SIZE( STACKP ) // { } set inexact if x != 0; the stored integer is not read back
72 leaq zero( %rip), %rax // address of zero array
73 flds (%rax, %r8, 4 ) // load result: +0 or -0 according to the sign bit of x
742: ret
75
76// 0x1.0p62 <= |x| < 0x1.0p63
773: leaq large(%rip), %r9 // address of large array
78 fadds (%r9, %r8, 4) // { x + (x < 0 ? -0x1.0p63 : 0x1.0p63) } set inexact as necessary
79 fstp %st(0) // { }
80 addq $1, %rax // add 0.5 to significand (bit 0 has weight 0.5 in this range)
81 jz 4f // handle overflow: the add wrapped the significand to zero
82
83 andq roundMask62(%rip), %rax // prune fractional bits (clears bit 0)
84 movq %rax, FRAME_SIZE( STACKP ) // write to mantissa; the sign + exponent word in the slot is left untouched
85 fldt FRAME_SIZE( STACKP ) // load result
86 ret
87
88// result is +- 0x1.0p63
894: flds (%r9, %r8, 4) // load result from the large array (r9 was set at label 3)
90 ret
91
92
93
94#else
// roundl, i386 variant.
// Rounds the 80-bit extended-precision argument stored at FRAME_SIZE(STACKP)
// to an integral value, with halfway cases rounded away from zero, and leaves
// the result in st(0). The 64-bit significand is processed in xmm0 with SSE2
// integer instructions.
// ENTRY, FRAME_SIZE and STACKP are not defined in this file (presumably they
// come from the included headers).
//
// Register roles once the prologue has run:
//   xmm0 (low quadword) = 64-bit significand of x
//   edx                 = biased exponent - 0x3ffe
//   eax                 = sign bit of x (0 or 1), index into the constant tables
//   ecx                 = address of local label 0, base for (symbol-0b) references
// The je/jg at label 1 consume the flags of the cmp against 63; the only
// instruction between them is the jae, which does not modify flags.
95ENTRY( roundl )
96 movzwl 8+FRAME_SIZE( STACKP ), %edx // sign + biased exponent (16-bit word, zero-extended)
97 movq FRAME_SIZE( STACKP ), %xmm0 // 64-bit significand into the low quadword of xmm0
98 fldt FRAME_SIZE( STACKP ) // {x}
99 calll 0f // push the address of label 0 ...
1000: popl %ecx // ... and pop it: ecx = address of label 0, used to address the constant tables
101 movl %edx, %eax // sign + biased exponent
102 andl $0x7fff, %edx // biased exponent
103 shrl $15, %eax // signof( x ): 1 if the sign bit is set, else 0
104 subl $0x3ffe, %edx // push |x| < 0.5 negative
105 cmp $63, %edx // if( |x| >= 0x1.0p62 || |x| < 0.5 || isnan(x) )
106 jae 1f // goto 1 (unsigned test, so a negative edx branches too)
107
108// 0.5 <= |x| < 0x1.0p62: the rounded magnitude fits in 63 bits.
109 subl $63, %edx // (exponent+1) - 63
110 fadds (large-0b)(%ecx, %eax, 4) // set inexact if necessary; the sum is discarded below
111 negl %edx // 63 - (exponent+1)
112 fstp %st(0) // {}
113 movd %edx, %xmm1 // 63 - (exponent+1), used as the shift count
114 psrlq %xmm1, %xmm0 // move 0.5 bit to units position (bit 0)
115 pcmpeqb %xmm1, %xmm1 // -1 (all bits set)
116 psubq %xmm1, %xmm0 // add 1 (subtracting -1), i.e. add 0.5 to the value
117 psrlq $1, %xmm0 // move 1's bit to units position; low quadword = |result| as an integer
118 movq %xmm0, FRAME_SIZE( STACKP ) // write out the low quadword over the argument's significand
119
120 fildll FRAME_SIZE( STACKP ) // { |result| } loaded as a 64-bit integer
121 fmuls (one-0b)(%ecx, %eax, 4) // { result } multiply by +1 or -1 according to the sign bit of x
122 ret
123
124
125// |x| >= 0x1.0p62 || |x| < 0.5 || isnan(x)
// Flags here are still those of the cmp against 63 above.
1261: je 3f // edx == 63
127 jg 2f // edx > 63 (signed): return x, which is already in st(0)
128
129// |x| < 0.5
130 fistpl FRAME_SIZE( STACKP ) // { } set inexact if x != 0; the stored integer is not read back
131 flds (zero-0b)(%ecx, %eax, 4 ) // load result: +0 or -0 according to the sign bit of x
1322: ret
133
134// 0x1.0p62 <= |x| < 0x1.0p63
1353: fadds (large-0b)(%ecx, %eax, 4) // { x + (x < 0 ? -0x1.0p63 : 0x1.0p63) } set inexact as necessary
136 fstp %st(0) // { }
137 movdqa %xmm0, %xmm2 // significand (copy kept for the overflow test)
138 pcmpeqb %xmm1, %xmm1 // -1LL
139 psubq %xmm1, %xmm0 // add 0.5 to significand (bit 0 has weight 0.5 in this range)
140 pxor %xmm0, %xmm2 // set leading bit if leading bit changed (overflow)
141 movmskpd %xmm2, %edx // edx bit 0 = bit 63 of the low quadword of xmm2
142 test $1, %edx // did the leading bit of the significand change?
143 jnz 4f // yes: the add carried out of the significand
144
145 pand (roundMask62-0b)(%ecx), %xmm0 // prune fractional bits (clears bit 0); 16-byte memory operand, presumably why roundMask62 is in .literal16
146 movq %xmm0, FRAME_SIZE( STACKP ) // write to mantissa; the sign + exponent word in the slot is left untouched
147 fldt FRAME_SIZE( STACKP ) // load result
148 ret
149
150// result is +- 0x1.0p63
1514: flds (large-0b)(%ecx, %eax, 4) // load result from the large array
152 ret
153
154
155#endif