this repo has no description
at fixPythonPipStalling 93 lines 2.1 kB view raw
/* Single-precision ceilf, reimplemented using integer operations
 * for improved performance, especially on in-order machines.
 *
 * Steve Canon, March 2009.
 */

/*----------------------------------------------------------------------
 * float ceilf(float x)
 *
 * Syntax: AT&T, run through the C preprocessor.
 *
 * i386:   In:    x at 4(%esp)
 *         Out:   result in st(0)
 *         Clobb: eax, ecx, edx, xmm0, flags; the argument slot 4(%esp)
 *                is overwritten with the result before it is loaded
 *                onto the x87 stack.
 *
 * x86_64: In:    x in lane 0 of xmm0
 *         Out:   result in lane 0 of xmm0
 *         Clobb: eax, xmm1, xmm2, xmm3, flags
 *
 * Both variants return x unchanged once |x| >= 0x1.0p23 (this range
 * also covers infinities and NaNs), and keep the sign of x on a zero
 * result.
 *----------------------------------------------------------------------*/

#if defined __i386__

#include <System/i386/cpu_capabilities.h>
.set cpubits, _COMM_PAGE_CPU_CAPABILITIES

.text
.align 4
.globl _ceilf
_ceilf:
        movss       4(%esp), %xmm0          // load argument (upper lanes of xmm0 become 0)
        testl       $(kHasSSE4_1), cpubits
        jz          0f

        // fast path using SSE 4.1
        roundss     $0x2, %xmm0, %xmm0      // immediate 2 = round toward +infinity
        movss       %xmm0, 4(%esp)          // bounce through memory: result is returned in st(0)
        flds        4(%esp)
        ret

.align 4
0:      // no SSE 4.1
        mov         4(%esp), %eax
        mov         $23, %cl                // 23 = number of stored significand bits
        mov         %eax, %edx              // edx = raw bits of x
        shr         %cl, %eax               // x >> 23: al = biased exponent, sign lands in bit 8
        sub         $0x7f, %al              // unbiased exponent of x
        jb          2f                      // if |x| < 1.0, goto 2

        sub         %al, %cl                // 23 - exponent of x = number of fractional bits
        mov         $0xffffffff, %eax       // (mov leaves the flags from the sub intact)
        jbe         1f                      // if |x| >= 0x1.0p23, goto 1

        dec         %edx                    // (x - 1)
        shl         %cl, %eax               // m = mask for integral bits of x
        mov         %edx, %ecx
        sar         $31, %edx               // (x < 0) ? -1 : 0
        or          %eax, %edx              // (x < 0) ? -1 : m
        sub         %edx, %ecx              // (x < 0) ? x : (x + (1.0 - ulp(x)))
        and         %ecx, %eax              // ceil(x)
        mov         %eax, 4(%esp)
        cvttps2dq   %xmm0, %xmm0            // set inexact (only if x had fractional bits)
1:      flds        4(%esp)                 // also reached directly with x untouched when |x| >= 0x1.0p23
        ret

.align 4
2:      // |x| < 1.0: the result is 1.0 or a zero carrying the sign of x
        cvttps2dq   %xmm0, %xmm0            // set inexact
        cmp         $1, %edx                // if x > 0, goto 3 (signed compare on the raw bits)
        jge         3f
        andl        $0x80000000, 4(%esp)    // copysign(0.0, x)
        flds        4(%esp)
        ret

.align 4
3:      movl        $0x3f800000, 4(%esp)    // return 1.0
        flds        4(%esp)
        ret

#elif defined __x86_64__

.const
.align 4
// `one` is the memory operand of a 128-bit andps below, so it is padded
// to a complete, fully defined 16-byte vector and must stay 16-byte
// aligned (.align 4 is taken as 2^4 bytes by the Mach-O assembler this
// file is written for).
one:        .long   0x3f800000, 0, 0, 0     // { 1.0f, 0, 0, 0 }
absmask:    .long   0x7fffffff              // everything except the sign bit

.text
.align 4
.globl _ceilf
_ceilf:
        movd        %xmm0, %eax             // eax = raw bits of x
        // Only lane 0 of xmm0 carries the argument.  Writing the bits
        // back with movd zeroes lanes 1-3, so the packed conversions
        // below cannot raise invalid/inexact for leftover data there.
        movd        %eax, %xmm0
        andl        absmask(%rip), %eax     // bits of |x|
        movd        absmask(%rip), %xmm1
        cmpl        $0x4b000000, %eax       // |x| >= 0x1.0p23 ?  (flags consumed by jae below)
        andnps      %xmm0, %xmm1            // xmm1 = sign bit of x (SSE ops leave EFLAGS alone)
        jae         1f                      // already integral, or inf/NaN: return x as is

        cvttps2dq   %xmm0, %xmm2            // (int)x, rounded toward zero
        movdqa      %xmm0, %xmm3
        psrad       $31, %xmm0              // (x < 0) ? -1 : 0
        cvtdq2ps    %xmm2, %xmm2            // trunc(x) as a float
        pcmpgtd     %xmm2, %xmm3            // (x >i trunc(x)) ? -1 : 0   (integer compare of the bits)
        andnps      %xmm3, %xmm0            // (x > trunc(x)) ? -1 : 0   (negative x masked out)
        andps       one(%rip), %xmm0        // (x > trunc(x)) ? 1.0 : 0.0
        addss       %xmm2, %xmm0            // (x > trunc(x)) ? trunc(x) + 1.0 : trunc(x)
        orps        %xmm1, %xmm0            // ceil(x), sign of x restored (keeps -0.0 for -1 < x <= -0)
1:      ret

#else
        #error unknown arch
#endif