this repo has no description
1/* Single-precision ceilf, reimplemented using integer operations
2 * for improved performance, especially on in-order machines.
3 *
4 * Steve Canon, March 2009.
5 */
6
7#if defined __i386__
8
9#include <System/i386/cpu_capabilities.h>
// NOTE(review): every line in this listing starts with a decimal number that
// looks like a line-number artifact of text extraction (so "270:" below is
// presumably local label "0:") -- confirm and strip before assembling.
10.set cpubits, _COMM_PAGE_CPU_CAPABILITIES // alias for the word tested against kHasSSE4_1 below
11
12.text
13.align 4
14.globl _ceilf
// _ceilf, i386 variant: single-precision ceiling of the argument.
//   In:   4(%esp) holds x.
//   Out:  the result is loaded onto the x87 stack by flds on every path.
//   Uses: eax, ecx, edx, xmm0 and the flags. The argument slot 4(%esp) is
//         overwritten with the result bits on every path except label 1.
15_ceilf:
16 movss 4(%esp), %xmm0 // load argument
17 testl $(kHasSSE4_1), cpubits // is the SSE 4.1 capability bit set?
18 jz 0f // no: take the integer fallback at 0
19
20 // fast path using SSE 4.1
21 roundss $0x2, %xmm0, %xmm0 // immediate 2 selects rounding toward +infinity
22 movss %xmm0, 4(%esp) // spill so the result can be loaded onto the x87 stack
23 flds 4(%esp)
24 ret
25
26.align 4
270: // no SSE 4.1: build ceil(x) with integer operations on the bits of x
28 mov 4(%esp), %eax // eax = raw bits of x
29 mov $23, %cl // shift count, the width of the mantissa field
30 mov %eax, %edx // edx = second copy of the raw bits (still live at label 2)
31 shr %cl, %eax // x >> 23, so al = biased exponent field
32 sub $0x7f, %al // unbiased exponent of x
33 jb 2f // borrow means the exponent is negative: if |x| < 1.0, goto 2
34
35 sub %al, %cl // 23 - exponent of x = number of fractional mantissa bits
36 mov $0xffffffff, %eax // all-ones seed for the mask (mov leaves the flags of the sub intact)
37 jbe 1f // if |x| >= 0x1.0p23 (inf and NaN included), goto 1 and return x unchanged
38
39 dec %edx // (x - 1) on the raw bits (cannot flip the sign bit, the exponent field is nonzero here)
40 shl %cl, %eax // m = mask for integral bits of x (sign, exponent and integral mantissa bits)
41 mov %edx, %ecx // ecx = x - 1
42 sar $31, %edx // (x < 0) ? -1 : 0
43 or %eax, %edx // (x < 0) ? -1 : m
44 sub %edx, %ecx // (x < 0) ? x : (x + (1.0 - ulp(x))), the latter being x - 1 - m = x + ~m
45 and %ecx, %eax // ceil(x): clearing the fractional bits truncates negative x and completes the round-up of positive x
46 mov %eax, 4(%esp) // store the result bits over the argument
47 cvttps2dq %xmm0, %xmm0 // set inexact: executed for its flag side effect only, xmm0 is not read again
481: flds 4(%esp) // return whatever is in the argument slot
49 ret
50.align 4
512: cvttps2dq %xmm0, %xmm0 // |x| < 1.0 path: set inexact (flag side effect only)
52 cmp $1, %edx // edx still holds the raw bits of x, and signed bits >= 1 means x > 0
53 jge 3f // if x > 0, goto 3
54 andl $0x80000000, 4(%esp) // copysign(0.0, x): keep only the sign bit
55 flds 4(%esp)
56 ret
57.align 4
583: movl $0x3f800000, 4(%esp) // return 1.0 (0 < x < 1.0 on this path)
59 flds 4(%esp)
60 ret
61
62#elif defined __x86_64__
63
64.const
65.align 4
66one: .long 0x3f800000 // bit pattern of 1.0f
67absmask:.long 0x7fffffff // every bit except the sign bit
68
69.text
70.align 4
71.globl _ceilf
// _ceilf, x86_64 variant: single-precision ceiling of the argument.
//   In:   low lane of xmm0 holds x.
//   Out:  low lane of xmm0 holds the result.
//   Uses: eax, xmm1, xmm2, xmm3 and the flags.
// NOTE(review): the packed instructions below also process the upper three
// lanes of xmm0, whose contents this code never establishes -- confirm that
// any floating-point flags raised from those lanes are acceptable.
// NOTE(review): "andps one(%rip)" loads 16 bytes starting at "one", while only
// "one" and "absmask" (8 bytes) are defined in the visible source. Only the
// low lane feeds the addss that follows. Presumably ".align 4" means 2^4 bytes
// here, giving the 16-byte alignment andps needs -- confirm both points.
72_ceilf:
73 movd %xmm0, %eax // eax = raw bits of x
74 andl absmask(%rip), %eax // eax = bits of |x|
75 movd absmask(%rip), %xmm1 // xmm1 low lane = absmask
76 cmpl $0x4b000000, %eax // compare |x| against 0x1.0p23
77 andnps %xmm0, %xmm1 // xmm1 = x & ~absmask = sign bit of x (flags from the cmpl are kept)
78 jae 1f // |x| >= 0x1.0p23 (inf and NaN included): return x unchanged
79
80 cvttps2dq %xmm0, %xmm2 // trunc(x) as an integer
81 movdqa %xmm0, %xmm3 // xmm3 = copy of x for the compare below
82 psrad $31, %xmm0 // (x < 0) ? -1 : 0
83 cvtdq2ps %xmm2, %xmm2 // trunc(x) converted back to float
84 pcmpgtd %xmm2, %xmm3 // (x >i trunc(x)) ? -1 : 0, a signed integer compare of the bit patterns
85 andnps %xmm3, %xmm0 // (x > trunc(x)) ? -1 : 0, forced to 0 for negative x where integer order is not float order
86 andps one(%rip), %xmm0 // (x > trunc(x)) ? 1.0 : 0.0
87 addss %xmm2, %xmm0 // (x > trunc(x)) ? trunc(x) + 1.0 : trunc(x)
88 orps %xmm1, %xmm0 // ceil(x): OR the sign of x back in so a zero result for negative x is -0.0
891: ret
90
91#else
92 #error unknown arch
93#endif