this repo has no description
at fixPythonPipStalling 95 lines 2.2 kB view raw
/* Single-precision roundf, reimplemented using integer operations
 * for improved performance, especially on in-order machines.
 *
 * Steve Canon, March 2009.
 *
 * C signature:  float roundf(float x);
 * Result:       x rounded to the nearest integral value, with halfway
 *               cases rounded away from zero.  The sign of x is always
 *               preserved (so results in (-0.5, 0.5) come back as +/-0).
 *               Values with |x| >= 0x1.0p23, infinities and NaNs are
 *               returned as passed in.
 * FP flags:     a truncating float->int conversion of the original x is
 *               executed on every path that can change the value, so that
 *               inexact is raised exactly when x is not already integral.
 *
 * Syntax:       AT&T, run through the C preprocessor.
 *
 * i386 entry:   x at 4(%esp); result returned in st(0).
 *               Clobbers eax, ecx, edx, xmm0, flags, and overwrites the
 *               argument slot at 4(%esp).
 * x86_64 entry: x in lane 0 of xmm0; result returned in xmm0.
 *               Clobbers eax, ecx, edx, xmm2, xmm3, flags.
 *
 * How the main path works (1.0 <= |x| < 0x1.0p23):
 *   Let e be the unbiased exponent of x.  The significand bit whose place
 *   value is 0.5 sits at bit position (22 - e).  Adding that bit to the
 *   raw encoding adds 0.5 to |x| (a carry into the exponent field is the
 *   correct behaviour when the significand overflows), and clearing bits
 *   0..(22 - e) afterwards discards every fractional bit.  The sign bit is
 *   never touched, so the rounding is symmetric about zero.
 */

#if defined __i386__

.text
.align 4
.globl _roundf
_roundf:
        mov     4(%esp), %eax           // eax = raw bits of x
        movss   4(%esp), %xmm0          // xmm0 = {x, 0, 0, 0}; used only to raise inexact
        and     $0x7f800000, %eax       // isolate the exponent field
        mov     $23, %ecx
        mov     $0x1, %edx
        sar     %cl, %eax               // eax = biased exponent (0..255)

        sub     $0x7e, %eax             // eax = unbiased exponent + 1
        jbe     2f                      // biased exponent <= 126: |x| < 1.0f (flags reused at 2:)

        sub     %eax, %ecx              // ecx = 22 - exponent = position of the 0.5 bit
        js      1f                      // return x if |x| >= 0x1.0p23 (also inf / NaN)

        shl     %cl, %edx               // edx = the 0.5 bit
        mov     $0xfffffffe, %eax
        shl     %cl, %eax               // eax = mask clearing the 0.5 bit and everything below
        add     %edx, 4(%esp)           // add 0.5 (ish -- non-integral bits are garbage)
        and     %eax, 4(%esp)           // truncate
        cvttps2dq %xmm0, %xmm0          // raise inexact (upper lanes are zero, so they are quiet)
1:      flds    4(%esp)                 // return value goes back on the x87 stack
        ret
.align 4
2:      je      3f                      // ZF from the sub above: biased exponent == 126, |x| >= 0.5f
        andl    $0x80000000, 4(%esp)    // |x| < 0.5f: copysign(0.0, x)
        cvttps2dq %xmm0, %xmm0          // raise inexact
        flds    4(%esp)
        ret
.align 4
3:      addl    $0x00800000, 4(%esp)    // bump biased exponent 126 -> 127
        andl    $0xff800000, 4(%esp)    // drop the significand: copysign(1.0, x)
        cvttps2dq %xmm0, %xmm0          // raise inexact
        flds    4(%esp)
        ret

#elif defined __x86_64__

/* Constants used as 16-byte memory operands of andps / paddd / pand.
 * Each one is declared as a full 16-byte vector so that the whole operand
 * is defined data; only lane 0 is meaningful.
 *
 * NOTE(review): those instructions require 16-byte aligned operands.  This
 * relies on ".align 4" meaning 2^4 bytes, which is how the Mach-O style
 * assembler this file appears to target (.const, _roundf) reads it --
 * confirm before assembling for another object format.
 */
.const
.align 4
mzero:  .long   0x80000000, 0, 0, 0     // sign bit
.align 4
expbit: .long   0x00800000, 0, 0, 0     // least significant exponent bit
.align 4
expmask:.long   0xff800000, 0, 0, 0     // sign + exponent field

.text
.align 4
.globl _roundf
_roundf:
        movd    %xmm0, %eax             // eax = raw bits of x
        and     $0x7f800000, %eax       // isolate the exponent field
        mov     $23, %ecx
        mov     $0x1, %edx
        sar     %cl, %eax               // eax = biased exponent (0..255)

        sub     $0x7e, %eax             // eax = unbiased exponent + 1
        jbe     2f                      // biased exponent <= 126: |x| < 1.0f (flags reused at 2:)

        sub     %eax, %ecx              // ecx = 22 - exponent = position of the 0.5 bit
        js      1f                      // return x if |x| >= 0x1.0p23 (also inf / NaN)

        shl     %cl, %edx               // edx = the 0.5 bit
        mov     $0xfffffffe, %eax
        shl     %cl, %eax               // eax = mask clearing the 0.5 bit and everything below
        movd    %edx, %xmm2
        movd    %eax, %xmm3
        // Raise inexact from lane 0 only.  A scalar conversion is used here
        // because lanes 1-3 of xmm0 are not part of the argument; a packed
        // conversion could raise invalid/inexact from whatever they hold.
        // eax is dead at this point, so it serves as the throwaway target.
        cvttss2si %xmm0, %eax
        paddd   %xmm2, %xmm0            // add 0.5 (ish -- non-integral bits are garbage)
        pand    %xmm3, %xmm0            // truncate
1:      ret
.align 4
2:      je      3f                      // ZF from the sub above: biased exponent == 126, |x| >= 0.5f
        cvttss2si %xmm0, %eax           // raise inexact (lane 0 only; eax is scratch)
        andps   mzero(%rip), %xmm0      // |x| < 0.5f: copysign(0.0, x)
        ret
.align 4
3:      cvttss2si %xmm0, %eax           // raise inexact (lane 0 only; eax is scratch)
        paddd   expbit(%rip), %xmm0     // bump biased exponent 126 -> 127
        pand    expmask(%rip), %xmm0    // drop the significand: copysign(1.0, x)
        ret

#else
        #error unknown arch
#endif