this repo has no description
at fixPythonPipStalling 81 lines 1.8 kB view raw
/*	Single-precision truncf
 *
 *	float truncf(float x)
 *	Returns x rounded toward zero to an integral value.
 *
 *	Reimplemented for improved performance on in-order machines and
 *	machines that support SSE 4.1
 *
 *	Steve Canon, March 2009.
 *
 *	Syntax:	AT&T (op src, dst); the file is run through the C preprocessor.
 *
 *	i386:	In:	x at 4(%esp)
 *		Out:	result on the x87 stack (flds)
 *		Clobb:	eax, ecx, edx, xmm0, flags; the argument slot at
 *			4(%esp) is overwritten with the result.
 *
 *	x86_64:	In:	x in the low lane of xmm0
 *		Out:	result in the low lane of xmm0
 *		Clobb:	eax, ecx, edx, xmm2, flags
 *
 *	Method (integer path): the float has 23 fraction bits, so with an
 *	unbiased exponent e in [0, 23] the low (23 - e) bits of the encoding
 *	are fractional and are cleared with a mask.  If e < 0 the result is a
 *	zero with the sign of x; if e > 23 the value is already integral
 *	(or is inf/NaN) and is returned unchanged.
 */

#include <System/i386/cpu_capabilities.h>
.set cpubits, _COMM_PAGE_CPU_CAPABILITIES

#if defined __i386__

.text
.align 4
.globl _truncf
_truncf:
	movss	4(%esp),	%xmm0		// load argument (upper lanes of xmm0 are zeroed)
	testl	$(kHasSSE4_1),	cpubits
	jz	0f				// no SSE 4.1: use the integer path

	roundss	$0x3,	%xmm0,	%xmm0		// fast path using SSE 4.1: round toward zero
	movss	%xmm0,	4(%esp)			// bounce through memory to return
	flds	4(%esp)				// the result on the x87 stack
	ret

.align 4
0:	mov	4(%esp),	%eax		// load the input, x
	and	$0x7f800000,	%eax		// biased exponent field of x
	mov	$23,	%ecx
	sub	$0x3f800000,	%eax		// remove the bias; negative iff |x| < 1.0f
	js	2f				// if |x| < 1.0f, goto 2
	sar	%cl,	%eax			// exponent(x)
	mov	$0xffffffff,	%edx
	sub	%eax,	%ecx			// 23 - exponent(x) = number of fractional bits
	js	1f				// return x if |x| >= 0x1.0p24 (also inf and NaN)
	shl	%cl,	%edx			// mask with the fractional bits cleared
	and	%edx,	4(%esp)			// mask off non-integral bits
	cvttps2dq	%xmm0,	%xmm0		// raise inexact; lanes 1-3 are zero after movss
1:	flds	4(%esp)
	ret

.align 4
2:						// Handle |x| < 1.0 here.
	andl	$0x80000000,	4(%esp)		// copysign(0.0, x)
	cvttps2dq	%xmm0,	%xmm0		// raise inexact; lanes 1-3 are zero after movss
	flds	4(%esp)
	ret

#elif defined __x86_64__

.const
.align 4
// Sign-bit mask for lane 0.  Padded to a full 16 bytes because andps with a
// memory operand loads 128 bits; the padding keeps that load inside this
// object.  NOTE(review): andps also needs the operand 16-byte aligned, which
// assumes .align 4 means 2^4 bytes with this assembler -- confirm.
mzero:	.long	0x80000000, 0, 0, 0

.text
.align 4
.globl _truncf
_truncf:
	movd	%xmm0,	%eax
	and	$0x7f800000,	%eax		// biased exponent field of x
	mov	$23,	%ecx
	sub	$0x3f800000,	%eax		// remove the bias; negative iff |x| < 1.0f
	js	2f				// if |x| < 1.0f, goto 2
	sar	%cl,	%eax			// exponent(x)
	mov	$0xffffffff,	%edx
	sub	%eax,	%ecx			// 23 - exponent(x) = number of fractional bits
	js	1f				// return x if |x| >= 0x1.0p24 (also inf and NaN)
	shl	%cl,	%edx			// mask with the fractional bits cleared
	movd	%edx,	%xmm2			// lane 0 = mask, lanes 1-3 = 0
	// Scalar convert: only lane 0 of xmm0 holds the argument, so a packed
	// convert could raise flags for whatever the caller left in lanes 1-3.
	// eax is dead here.  Must precede the andps, which makes x integral.
	cvttss2si	%xmm0,	%eax		// raise inexact
	andps	%xmm2,	%xmm0			// mask off non-integral bits
1:	ret

.align 4
2:						// Handle |x| < 1.0 here.
	cvttss2si	%xmm0,	%eax		// raise inexact (scalar: ignore lanes 1-3)
	andps	mzero(%rip),	%xmm0		// copysign(0.0, x)
	ret

#else
	#error unknown arch
#endif