/* Single-precision truncf
 *
 * Reimplemented for improved performance on in-order machines and
 * machines that support SSE 4.1
 *
 * Steve Canon, March 2009.
 */

#include <System/i386/cpu_capabilities.h>

// cpubits aliases _COMM_PAGE_CPU_CAPABILITIES (supplied by the header above).
// The i386 entry point tests the kHasSSE4_1 bit in the word at this address
// to choose between the roundss fast path and the integer fallback.
.set cpubits, _COMM_PAGE_CPU_CAPABILITIES

#if defined __i386__

//-----------------------------------------------------------------------
// float truncf(float x) -- i386 variant
//
// Rounds x toward zero to an integral value.
//
// In:    x is read from the stack slot at 4(%esp)
// Out:   result is pushed on the x87 stack (st(0))
// Clobb: xmm0 and flags always; eax, ecx, edx on the fallback path.
//        The argument slot at 4(%esp) is overwritten with the result.
//
// Two implementations are selected at run time:
//   * if the kHasSSE4_1 bit is set in the word at cpubits, a single
//     roundss does the work;
//   * otherwise the fractional bits are masked off with integer
//     instructions, and a float->int conversion is issued only for its
//     side effect of raising the inexact flag.
//-----------------------------------------------------------------------
.text
.align 4
.globl _truncf
_truncf:
    movss     4(%esp), %xmm0            // xmm0 = x; a movss load zeroes lanes 1-3
    testl     $(kHasSSE4_1), cpubits    // is SSE 4.1 available?
    jz        0f                        // no: take the integer fallback

    roundss   $0x3, %xmm0, %xmm0        // fast path: imm 0x3 = round toward zero
    movss     %xmm0, 4(%esp)            // bounce the result through memory ...
    flds      4(%esp)                   // ... to return it on the x87 stack
    ret

.align 4
0:  mov       4(%esp), %eax             // eax = raw bits of x
    and       $0x7f800000, %eax         // keep only the biased exponent field
    mov       $23, %ecx                 // 23 = width of the fraction field
    sub       $0x3f800000, %eax         // subtract the exponent field of 1.0f
    js        2f                        // negative => |x| < 1.0f
    sar       %cl, %eax                 // eax = unbiased exponent(x)
    mov       $0xffffffff, %edx         // start from an all-ones mask
    sub       %eax, %ecx                // ecx = 23 - exponent(x) = fractional bit count
    js        1f                        // exponent > 23: return x if |x| >= 0x1.0p24 (also Inf/NaN)
    shl       %cl, %edx                 // edx = mask with the fractional bits cleared
    and       %edx, 4(%esp)             // mask off non-integral bits in place
    cvttps2dq %xmm0, %xmm0              // raise inexact if x was not integral;
                                        // lanes 1-3 are zero, so they add no flags
1:  flds      4(%esp)                   // return the (possibly masked) value
    ret

.align 4
2:  // Handle |x| < 1.0 here: the result is a zero carrying the sign of x.
    andl      $0x80000000, 4(%esp)      // copysign(0.0, x)
    cvttps2dq %xmm0, %xmm0              // raise inexact if x was nonzero
    flds      4(%esp)
    ret

#elif defined __x86_64__

.const
.align 4
// Sign-bit mask for lane 0. andps with a memory operand loads a full
// 16-byte word, so the constant is padded out to 16 bytes instead of
// letting the load pick up whatever data happens to follow it.
// NOTE(review): relies on ".align 4" meaning 2^4 = 16-byte alignment, as in
// the Darwin assembler; andps requires its memory operand to be 16-byte aligned.
mzero:  .long   0x80000000, 0, 0, 0

//-----------------------------------------------------------------------
// float truncf(float x) -- x86_64 variant
//
// Rounds x toward zero to an integral value.
//
// In:    x in lane 0 of xmm0
// Out:   result in lane 0 of xmm0
// Clobb: eax, ecx, edx, xmm2, flags
//
// The fractional bits of x are masked off with integer-computed masks.
// A scalar float->int conversion (cvttss2si) is issued purely for its
// side effect of raising the inexact flag when x is not integral.  The
// scalar form is used deliberately: a packed conversion would also
// convert lanes 1-3 of xmm0, and whatever the caller left there could
// raise spurious invalid/inexact flags.
//-----------------------------------------------------------------------
.text
.align 4
.globl _truncf
_truncf:
    movd      %xmm0, %eax               // eax = raw bits of x
    and       $0x7f800000, %eax         // keep only the biased exponent field
    mov       $23, %ecx                 // 23 = width of the fraction field
    sub       $0x3f800000, %eax         // subtract the exponent field of 1.0f
    js        2f                        // negative => |x| < 1.0f
    sar       %cl, %eax                 // eax = unbiased exponent(x)
    mov       $0xffffffff, %edx         // start from an all-ones mask
    sub       %eax, %ecx                // ecx = 23 - exponent(x) = fractional bit count
    js        1f                        // exponent > 23: return x if |x| >= 0x1.0p24 (also Inf/NaN)
    shl       %cl, %edx                 // edx = mask with the fractional bits cleared
    movd      %edx, %xmm2               // xmm2 lane 0 = mask, lanes 1-3 = 0
    cvttss2si %xmm0, %eax               // raise inexact if x is not integral (lane 0 only);
                                        // |x| < 0x1.0p24 here, so the conversion cannot overflow
    andps     %xmm2, %xmm0              // mask off non-integral bits
1:  ret

.align 4
2:  // Handle |x| < 1.0 here: the result is a zero carrying the sign of x.
    cvttss2si %xmm0, %eax               // raise inexact if x is nonzero (lane 0 only)
    andps     mzero(%rip), %xmm0        // copysign(0.0, x)
    ret

#else
    #error unknown arch
#endif