this repo has no description
1/* Single-precision floorf, reimplemented using integer operations
2 * for improved performance, especially on in-order machines.
3 *
4 * Steve Canon, March 2009.
5 */
6
7#if defined __i386__
8
9#include <System/i386/cpu_capabilities.h>
.set cpubits, _COMM_PAGE_CPU_CAPABILITIES

/*
 * float floorf(float x) -- i386 implementation.
 *
 * In:   x at 4(%esp); the argument slot is reused as scratch space.
 * Out:  floor(x), pushed on the x87 stack by flds.
 * Uses: %eax, %ecx, %edx, %xmm0, flags.
 *
 * If the capability word at cpubits has kHasSSE4_1 set, roundss does all
 * of the work.  Otherwise the result is assembled from the raw IEEE-754
 * bits of x with integer instructions; on that path cvttps2dq is executed
 * only for its effect on the floating-point status flags.
 */
.text
.align 4
.globl _floorf
_floorf:
    movss       4(%esp), %xmm0              // xmm0 = x
    testl       $(kHasSSE4_1), cpubits
    jz          L_floorf_integer_path

    // SSE 4.1 present: roundss with immediate 1 rounds toward -infinity.
    roundss     $0x1, %xmm0, %xmm0
    movss       %xmm0, 4(%esp)
    flds        4(%esp)                     // result goes back through the argument slot
    ret

.align 4
L_floorf_integer_path:                      // no SSE 4.1
    movl        4(%esp), %eax               // eax = bits(x)
    movl        %eax, %edx
    decl        %edx                        // edx = bits(x) - 1
    movb        $23, %cl                    // 23 = width of the mantissa field
    shrl        %cl, %eax                   // al = biased exponent
    subb        $0x7f, %al                  // al = unbiased exponent e
    jb          L_floorf_below_one          // borrow means e < 0, i.e. |x| < 1.0

    subb        %al, %cl                    // cl = 23 - e = number of fraction bits
    movl        $0xffffffff, %eax           // mov keeps the flags of the sub above
    jbe         L_floorf_load_result        // e >= 23: no fraction bits (or Inf/NaN), x is returned unchanged

    shll        %cl, %eax                   // eax = m, mask that keeps everything except the fraction bits
    movl        %edx, %ecx                  // ecx = bits(x) - 1
    notl        %edx
    sarl        $31, %edx                   // edx = (x < 0) ? 0 : -1
    orl         %eax, %edx                  // edx = (x < 0) ? m : -1
    subl        %edx, %ecx                  // ecx = (x < 0) ? bits(x) + ~m : bits(x)
    andl        %ecx, %eax                  // drop the fraction bits: eax = bits(floor(x))
    movl        %eax, 4(%esp)
    cvttps2dq   %xmm0, %xmm0                // flag side effect only: inexact when x is not an integer
L_floorf_load_result:
    flds        4(%esp)
    ret

.align 4
L_floorf_below_one:                         // |x| < 1.0
    cvttps2dq   %xmm0, %xmm0                // flag side effect only: inexact when x is nonzero
    cmpl        $0xffffffff, %edx           // signed test: bits(x) - 1 < -1 only for negative nonzero x
    jl          L_floorf_minus_one
    andl        $0x80000000, 4(%esp)        // keep just the sign bit: +0.0, or -0.0 for x = -0.0
    flds        4(%esp)
    ret

.align 4
L_floorf_minus_one:                         // -1.0 < x < 0.0
    movl        $0xbf800000, 4(%esp)        // 0xbf800000 = bits of -1.0f
    flds        4(%esp)
    ret
62
63#elif defined __x86_64__
64
/*
 * Constants for the x86_64 floorf below.
 *
 * andps reads a full 16-byte memory operand that must be 16-byte aligned,
 * so mone has to stay first after the alignment directive (.align takes a
 * power of two in the Mach-O assembler, so .align 4 is 16 bytes).  Only
 * the low 32 bits of that load affect the returned value.  absmask is not
 * referenced by name in the code below.
 */
.const
.align 4
mone:   .long 0xbf800000                    // bits of -1.0f
absmask:.long 0x7fffffff

/*
 * float floorf(float x) -- x86_64 implementation.
 *
 * In:   x in the low lane of %xmm0; the upper lanes are not assumed to
 *       hold anything meaningful.
 * Out:  floor(x) in the low lane of %xmm0.
 * Uses: %ecx, %xmm2, flags.
 *
 * Values with |x| >= 2^23, infinities, NaNs and zeros are returned
 * unchanged.  Everything else is truncated toward zero and then reduced
 * by one when the truncation rounded up (negative non-integers).
 */
.text
.align 4
.globl _floorf
_floorf:
    movd        %xmm0, %ecx
    andl        $0x7fffffff, %ecx           // ecx = bits(|x|)
    subl        $1, %ecx                    // bits(|x|) == 0 wraps around to 0xffffffff
    cmpl        $0x4afffffe, %ecx           // 0x4b000000 = bits(2^23)
    ja          L_floorf_done               // unsigned: |x| >= 2^23, Inf, NaN, or +-0

    // Convert a copy whose upper three lanes are zero.  cvttps2dq works on
    // all four lanes, and stray data in the upper lanes of %xmm0 could
    // otherwise raise invalid or inexact flags unrelated to x.
    xorps       %xmm2, %xmm2
    movss       %xmm0, %xmm2                // xmm2 = { x, 0, 0, 0 }
    cvttps2dq   %xmm2, %xmm2                // truncate toward zero; inexact when x is not an integer
    cvtdq2ps    %xmm2, %xmm2                // low lane of xmm2 = trunc(x)
    cmpltss     %xmm2, %xmm0                // low lane = all ones if x < trunc(x), else zero
    andps       mone(%rip), %xmm0           // low lane = -1.0 or +0.0
    addss       %xmm2, %xmm0                // trunc(x) - 1.0 or trunc(x)
L_floorf_done:
    ret
86
87#else
88 #error unknown arch
89#endif