/* double round( double ) * * Reimplemented by Steve Canon, based on Ian Ollmann's implementations * tuned for increased performance on in-order machines (but faster on * out-of-order machines as well). * * Copyright 2009, Apple Inc. */ #ifdef __i386__ #ifdef __SSE3__ #define TRUNCATE \ fisttpll 4(%esp); \ fildll 4(%esp) #else #define TRUNCATE \ fnstcw 4(%esp); \ movw 4(%esp), %dx; \ orw $0xc00, 4(%esp); \ fldcw 4(%esp); \ frndint; \ movw %dx, 4(%esp); \ fldcw 4(%esp) #endif .text .align 4 .globl _round _round: movl 8(%esp), %ecx cmpl $0x43300000, %ecx fldl 4(%esp) jae 2f fld %st(0) // { x, x } TRUNCATE // { trunc(x), x } fxch // { x, trunc(x) } fsub %st(1), %st(0) // { frac(x), x } fadd %st(0), %st(0) // { 2*frac(x), x } fld1 // { 1.0, 2*frac(x), x } fucomi %st(1), %st(0) fstp %st(1) // { 1.0, trunc(x) } ja 1f fadd %st(0), %st(1) // { 1.0, trunc(x) + 1.0 } 1: fstp %st(0) // { round(x) } ret 2: andl $0x7fffffff, %ecx cmpl $0x43300000, %ecx jge 4f fabs fld %st(0) TRUNCATE // { trunc(x), x } fxch // { x, trunc(x) } fsub %st(1), %st(0) // { frac(x), x } fadd %st(0), %st(0) // { 2*frac(x), x } fld1 // { 1.0, 2*frac(x), x } fucomi %st(1), %st(0) fstp %st(1) // { 1.0, trunc(x) } ja 3f fadd %st(0), %st(1) // { 1.0, trunc(x) + 1.0 } 3: fstp %st(0) // { round(x) } fchs 4: ret #else //x86_64 .const .align 4 one: .quad 0x3ff0000000000000 absmask:.quad 0x7fffffffffffffff half: .quad 0x3fe0000000000000 thresh: .quad 0x4330000000000000 .text .align 4 .globl _round _round: movd %xmm0, %rcx andq absmask(%rip), %rcx movsd absmask(%rip), %xmm2 cmpq thresh(%rip), %rcx jae 1f cvttsd2si %xmm0, %rax andnpd %xmm0, %xmm2 // signbit(x) movsd half(%rip), %xmm3 cvtsi2sd %rax, %xmm1 // trunc(x) subsd %xmm1, %xmm0 // frac(x) orpd %xmm2, %xmm1 xorpd %xmm2, %xmm0 // |frac(x)| cmpltpd %xmm3, %xmm0 // (|frac(x)| < 0.5) ? andnpd one(%rip), %xmm0 // (|frac(x)| < 0.5) ? 0.0 : 1.0 orpd %xmm2, %xmm0 addsd %xmm1, %xmm0 // round(x) 1: ret #endif