this repo has no description
/* Single-precision roundf, reimplemented using integer operations
 * for improved performance, especially on in-order machines.
 *
 * Steve Canon, March 2009.
 */
6
7#if defined __i386__
8
.text
.align 4
.globl _roundf
//
// float roundf(float x)                                  -- i386 variant
//
// Rounds x to the nearest integral value, with halfway cases going away
// from zero, by manipulating the IEEE-754 single-precision bit pattern
// with integer instructions.
//
// In:       x as a 32-bit float at 4(%esp).
// Out:      result loaded onto the x87 stack (flds).
// Clobbers: eax, ecx, edx, xmm0, flags; the argument slot at 4(%esp) is
//           overwritten in place with the result.
//
// xmm0 holds an untouched copy of x whose only job is to feed a truncating
// float-to-int conversion, which raises the inexact flag when x is not
// already an integer.
//
_roundf:
    movss       4(%esp), %xmm0              // xmm0 = x, upper lanes zeroed by the load
    movl        4(%esp), %eax               // eax = raw bits of x
    movl        $1, %edx                    // edx = 1, shifted into the 0.5 bit later
    movl        $23, %ecx                   // ecx = width of the mantissa field
    andl        $0x7f800000, %eax           // keep only the exponent field
    shrl        %cl, %eax                   // eax = biased exponent
    subl        $0x7e, %eax                 // eax = unbiased exponent + 1
    jbe         L_roundf_below_one          // biased exponent <= 126: |x| < 1.0f

    subl        %eax, %ecx                  // ecx = 22 - unbiased exponent = index of the 0.5 bit
    js          L_roundf_done               // negative: |x| >= 0x1.0p23, inf or NaN; return x as is

    cvttps2dq   %xmm0, %xmm0                // raise inexact if x has a fractional part
    movl        $0xfffffffe, %eax
    shll        %cl, %eax                   // eax = mask that clears every fractional bit
    shll        %cl, %edx                   // edx = the mantissa bit worth 0.5
    addl        %edx, 4(%esp)               // add 0.5 to the magnitude (carry may bump the exponent)
    andl        %eax, 4(%esp)               // drop the fractional bits
L_roundf_done:
    flds        4(%esp)
    ret

.align 4
L_roundf_below_one:
    je          L_roundf_half_to_one        // ZF from the subl above: 0.5f <= |x| < 1.0f
    cvttps2dq   %xmm0, %xmm0                // raise inexact (nothing is raised for +-0)
    andl        $0x80000000, 4(%esp)        // keep the sign bit only: copysign(0.0, x)
    flds        4(%esp)
    ret

.align 4
L_roundf_half_to_one:
    cvttps2dq   %xmm0, %xmm0                // raise inexact
    addl        $0x00800000, 4(%esp)        // bump the biased exponent from 126 to 127
    andl        $0xff800000, 4(%esp)        // clear the mantissa: copysign(1.0, x)
    flds        4(%esp)
    ret
46
47#elif defined __x86_64__
48
// Constants used as 16-byte memory operands by andps / paddd / pand below.
// Each one is spelled out as a full four-lane vector so that the 128-bit
// load never reads past the object; only lane 0 carries a meaningful value.
// NOTE(review): .align 4 is assumed to mean 2^4 = 16 bytes, as it does for
// the Darwin/Mach-O assembler this file appears to target (.const, _roundf);
// the aligned 16-byte operands depend on that.
.const
.align 4
mzero:  .long 0x80000000, 0, 0, 0           // sign bit
.align 4
expbit: .long 0x00800000, 0, 0, 0           // lowest bit of the exponent field
.align 4
expmask:.long 0xff800000, 0, 0, 0           // sign + exponent, mantissa cleared

.text
.align 4
.globl _roundf
//
// float roundf(float x)                                  -- x86_64 variant
//
// Rounds x to the nearest integral value, with halfway cases going away
// from zero, by manipulating the IEEE-754 single-precision bit pattern
// with integer instructions.
//
// In:       x in lane 0 of xmm0.  Lanes 1-3 are not read for their value.
// Out:      result in lane 0 of xmm0.
// Clobbers: eax, ecx, edx, xmm2, xmm3, flags.
//
// The inexact flag is raised with a scalar cvttss2si on the unmodified
// input.  A packed conversion must not be used here: the upper lanes of
// xmm0 are whatever the caller left behind and could raise spurious
// invalid or inexact flags.  Every path that converts has |x| < 0x1.0p23,
// so the scalar conversion never overflows a 32-bit integer.
//
_roundf:
    movd        %xmm0, %eax                 // eax = raw bits of x
    and         $0x7f800000, %eax           // keep only the exponent field
    mov         $23, %ecx                   // ecx = width of the mantissa field
    mov         $0x1, %edx                  // edx = 1, shifted into the 0.5 bit later
    sar         %cl, %eax                   // eax = biased exponent (sign bit already cleared)

    sub         $0x7e, %eax                 // eax = unbiased exponent + 1
    jbe         2f                          // biased exponent <= 126: |x| < 1.0f

    sub         %eax, %ecx                  // ecx = 22 - unbiased exponent = index of the 0.5 bit
    js          1f                          // negative: |x| >= 0x1.0p23, inf or NaN; return x as is

    shl         %cl, %edx                   // edx = the mantissa bit worth 0.5
    mov         $0xfffffffe, %eax
    shl         %cl, %eax                   // eax = mask that clears every fractional bit
    movd        %edx, %xmm2
    movd        %eax, %xmm3
    cvttss2si   %xmm0, %eax                 // raise inexact if x has a fractional part (eax is dead)
    paddd       %xmm2, %xmm0                // add 0.5 to the magnitude (carry may bump the exponent)
    pand        %xmm3, %xmm0                // drop the fractional bits
1:  ret

.align 4
2:  je          3f                          // ZF from the sub above: 0.5f <= |x| < 1.0f
    cvttss2si   %xmm0, %eax                 // raise inexact (nothing is raised for +-0)
    andps       mzero(%rip), %xmm0          // keep the sign bit only: copysign(0.0, x)
    ret

.align 4
3:  cvttss2si   %xmm0, %eax                 // raise inexact
    paddd       expbit(%rip), %xmm0         // bump the biased exponent from 126 to 127
    pand        expmask(%rip), %xmm0        // clear the mantissa: copysign(1.0, x)
    ret
92
93#else
94 #error unknown arch
95#endif