this repo has no description
at fixPythonPipStalling 89 lines 1.9 kB view raw
/*  Single-precision floorf, reimplemented using integer operations
 *  for improved performance, especially on in-order machines.
 *
 *  Steve Canon, March 2009.
 *
 *  C signature:  float floorf(float x)
 *
 *  Syntax:  AT&T, run through the C preprocessor.  Mach-O conventions are
 *           assumed (underscore-prefixed export, the .const section, and
 *           .align taking a power-of-two exponent so that .align 4 gives
 *           the 16-byte alignment that andps requires of its operand).
 *
 *  i386:    x is read from 4(%esp); the result is returned on the x87
 *           stack.  Clobbers eax, ecx, edx, xmm0, flags, and overwrites
 *           the argument slot at 4(%esp).
 *  x86_64:  x is read from and returned in the low lane of xmm0.
 *           Clobbers eax, ecx, xmm2, flags.
 */

#if defined __i386__

#include <System/i386/cpu_capabilities.h>
.set cpubits, _COMM_PAGE_CPU_CAPABILITIES

.text
.align 4
.globl _floorf
_floorf:
    movss       4(%esp),        %xmm0       // load argument (load from memory zeroes the upper lanes)
    testl       $(kHasSSE4_1),  cpubits
    jz          0f

    // fast path using SSE 4.1
    roundss     $0x1,   %xmm0,  %xmm0       // immediate 0x1 selects round toward -infinity
    movss       %xmm0,          4(%esp)
    flds        4(%esp)                     // move the result onto the x87 stack
    ret

.align 4
0:  // no SSE 4.1
    mov         4(%esp),        %eax        // eax = bits(x)
    mov         $23,            %cl         // width of the stored significand
    mov         %eax,           %edx
    shr         %cl,            %eax        // al = biased exponent (sign bit lands in bit 8, outside al)
    dec         %edx                        // edx = bits(x) - 1
    sub         $0x7f,          %al         // al = unbiased exponent e
    jb          2f                          // e < 0, i.e. |x| < 1.0

    sub         %al,            %cl         // cl = 23 - e = number of fractional bits
    mov         $0xffffffff,    %eax        // mov does not touch the flags set by the sub
    jbe         1f                          // e >= 23: x is already an integer, Inf or NaN

    shl         %cl,            %eax        // m = mask for integral bits of x
    mov         %edx,           %ecx        // ecx = bits(x) - 1
    not         %edx                        // edx = ~(bits(x) - 1) = -bits(x)
    sar         $31,            %edx        // (x < 0) ? 0 : -1
    or          %eax,           %edx        // (x < 0) ? m : -1
    sub         %edx,           %ecx        // (x < 0) ? bits(x) + ~m : bits(x)
                                            //   (negative x: bump the magnitude past the next integer)
    and         %ecx,           %eax        // drop the fractional bits: floor(x)
    mov         %eax,           4(%esp)
    cvttps2dq   %xmm0,          %xmm0       // set inexact (only fires when x had a fraction)
1:  flds        4(%esp)
    ret

.align 4
2:  // |x| < 1.0: the result is +0, -0 or -1.0
    cvttps2dq   %xmm0,          %xmm0       // set inexact (not raised for +-0)
    cmp         $0xffffffff,    %edx        // bits(x) - 1 < -1 (signed): if x < 0.0, goto 3
    jl          3f                          //   (-0 wraps to 0x7fffffff and stays on this path)
    andl        $0x80000000,    4(%esp)     // copysign(0.0, x)
    flds        4(%esp)
    ret

.align 4
3:  movl        $0xbf800000,    4(%esp)     // return -1.0
    flds        4(%esp)
    ret

#elif defined __x86_64__

.const
.align 4
mone:   .long   0xbf800000, 0, 0, 0         // -1.0f in lane 0; padded so the 16-byte andps load is fully defined

.text
.align 4
.globl _floorf
_floorf:
    movd        %xmm0,          %ecx
    andl        $0x7fffffff,    %ecx        // |x|
    subl        $1,             %ecx        // subtract 1.  This wraps |+-0| around to 0xffffffff
    cmpl        $0x4afffffe,    %ecx        // values >= 0x4b000000 - 1 are either integers, NaN or Inf
    ja          1f                          // unsigned compare adds 0 to the list; return x unchanged

    // Here 0 < |x| < 2^23, so trunc(x) fits in an int32 and converts back exactly.
    // Scalar conversions are used so that whatever the caller left in the upper
    // lanes of xmm0 cannot raise spurious invalid or inexact flags.
    cvttss2si   %xmm0,          %eax        // eax = trunc(x); sets inexact if x has a fraction
    xorps       %xmm2,          %xmm2       // break the dependency cvtsi2ss has on its destination
    cvtsi2ss    %eax,           %xmm2       // xmm2 = (float)trunc(x)
    cmpltss     %xmm2,          %xmm0       // (x < trunc(x)) ? all ones : 0   (true only for negative non-integers)
    andps       mone(%rip),     %xmm0       // (x < trunc(x)) ? -1.0 : 0.0
    addss       %xmm2,          %xmm0       // floor(x) = trunc(x) + correction
1:  ret

#else
    #error unknown arch
#endif