this repo has no description
at fixPythonPipStalling 129 lines 3.4 kB view raw
/*
 *  lroundf.s
 *
 *      by Ian Ollmann
 *
 *  Copyright (c) 2007, Apple Inc. All Rights Reserved.
 *
 *  C99 lroundf for __i386__ and __x86_64__
 *
 */

/*
 *  Reader notes
 *
 *  Syntax:  AT&T / GAS, passed through the C preprocessor.
 *
 *  Layout:
 *      __i386__    llroundf has its own x87 based body (64-bit result in edx:eax).
 *                  lroundf loads its argument and falls through into the shared
 *                  body that follows the conditional block.
 *      __x86_64__  lroundf and llroundf are two names for the same entry point and
 *                  both use the shared body (the overflow threshold is 0x1.0p63f
 *                  for both).
 *
 *  STACKP, FRAME_SIZE, SUBP, ADDP, MOVP, XORP, AX_P, CX_P and DX_P are not defined
 *  in this file; presumably abi.h supplies them as pointer-width aliases for the
 *  stack pointer, the offset of the first stack argument, and the a/c/d registers
 *  and matching instructions -- confirm against abi.h.
 *
 *  Float bit patterns used below:
 *      0x7fffffff  mask that clears the sign bit
 *      0x3effffff  0.5f - 1 ulp
 *      0x3f000000  0.5f
 *      0x4b000000  0x1.0p23f   (every float at least this large is an integer)
 *      0x4f000000  0x1.0p31f
 *      0x5f000000  0x1.0p63f
 *      0x7f800000  +infinity   (anything larger, sign cleared, is a NaN)
 */

#include <machine/asm.h>
#include "abi.h"


#if defined( __i386__ )

//
//  long long llroundf( float x )                       (i386 only)
//
//  In:     x at FRAME_SIZE(STACKP)
//  Out:    edx:eax = x rounded to the nearest integer, halfway cases away from zero
//  Uses:   eax, edx, xmm0, xmm1, x87 stack (left empty on every return path), flags
//  Stack:  reserves 16-FRAME_SIZE bytes of scratch; (STACKP) holds the 64-bit llrint(x)
//
//  Register roles after the setup sequence:
//      eax = bits of |x|      edx = sign bit of x (0x80000000 or 0)
//
ENTRY( llroundf )
    movl        FRAME_SIZE(STACKP), %eax            // raw bits of x
    flds        FRAME_SIZE(STACKP)                  // { x }
    fld         %st(0)                              // { x, x }
    SUBP        $16-FRAME_SIZE, STACKP              // make scratch room; x now lives at 16(STACKP)

    fistpll     (STACKP)                            // { x },  llrint(x), set invalid / inexact if necessary
    movl        %eax, %edx                          // x
    andl        $0x7fffffff, %eax                   // |x|
    xorl        %eax, %edx                          // signof( x )
    cmpl        $0x4b000000, %eax                   // |x| >= 0x1.0p23f or NaN
    jae         1f

    fildll      (STACKP)                            // { llrint(x), x }
    fucomip     %st(1), %st(0)                      // { x }    x == llrint(x)
    fstp        %st(0)                              // { }      (does not touch EFLAGS, the je below still sees the compare)
    je          2f                                  // x is already an integer: return llrint(x)

    // Adding 0.5f to 0.5f - 1 ulp lands exactly halfway between two floats and,
    // under round to nearest even, becomes 1.0f. The correct answer is 0, so
    // that one value is handled separately.
    cmpl        $0x3effffff, %eax                   // |x| == 0.5f - 1 ulp
    je          4f

    // At this point |x| < 0x1.0p23f, so a full 64-bit conversion is not needed,
    // which is good because a truncating conversion is what is required now.
    orl         $0x3f000000, %edx                   // copysign( 0.5f, x )
    movss       16(STACKP), %xmm0                   // x
    movd        %edx, %xmm1                         // copysign( 0.5f, x )
    addss       %xmm1, %xmm0                        // x + copysign( 0.5f, x )
    cvttss2si   %xmm0, %edx                         // result = (int32_t) ( x + copysign( 0.5f, x ))
    movl        %edx, %eax                          // low 32 bits of result
    sarl        $31, %edx                           // high 32 bits = sign extension of result
    ADDP        $16-FRAME_SIZE, STACKP
    ret

1:  // |x| >= 0x1.0p23f or NaN
    fstp        %st(0)                              // { }
    cmpl        $0x5f000000, %eax                   // |x| >= 0x1.0p63f
    jae         3f

    // |x| is a non-overflowing integer (NaN ends up here eventually too)
2:  movl        (STACKP), %eax                      // low 32 bits of llrint(x)
    movl        4(STACKP), %edx                     // high 32 bits of llrint(x)
    ADDP        $16-FRAME_SIZE, STACKP
    ret

3:  // |x| overflows or is NaN
    cmpl        $0x7f800000, %eax                   // |x| bits above +infinity: x is NaN
    ja          2b                                  // NaN: return what fistpll stored

    // |x| overflows (infinity included). The stored value is flipped bitwise
    // when x is positive and returned unchanged when x is negative.
    subl        $1, %edx                            // x < 0 ? 0x7fffffff : -1U
    sarl        $31, %edx                           // x < 0 ? 0 : -1U
    movl        (STACKP), %eax
    xorl        %edx, %eax                          // x < 0 ? low result : low result ^ -1U
    xorl        4(STACKP), %edx                     // x < 0 ? high result : high result ^ -1U
    ADDP        $16-FRAME_SIZE, STACKP
    ret

4:  // |x| == 0.5f - 1 ulp, return 0
    xorl        %eax, %eax
    xorl        %edx, %edx
    ADDP        $16-FRAME_SIZE, STACKP
    ret

// Smallest magnitude that no longer fits in a 32-bit long: 0x1.0p31f
#define LONG_MIN_f      0x4f000000

//
//  long lroundf( float x )                             (i386 entry)
//
//  Loads the raw bits of x into edx and x itself into xmm0, then falls through
//  the preprocessor conditional into the shared body below.
//
ENTRY( lroundf )
    movl        FRAME_SIZE(STACKP), %edx            // raw bits of x
    movss       FRAME_SIZE(STACKP), %xmm0           // x
#elif defined( __x86_64__ )

// Smallest magnitude that no longer fits in a 64-bit long: 0x1.0p63f
#define LONG_MIN_f      0x5f000000

//
//  long lroundf( float x )  /  long long llroundf( float x )      (x86_64 entry)
//
//  In:     xmm0 = x
//  Both names share this entry point and the shared body below.
//
ENTRY( lroundf )
ENTRY( llroundf )
    movd        %xmm0, %edx                         // raw bits of x; the 32-bit write also zeroes the upper half of rdx
#endif

//
//  Shared body.
//
//  On entry:   xmm0 = x,  DX_P = raw bits of x (zero extended)
//  Out:        AX_P = x rounded to the nearest integer, halfway cases away from zero
//  Uses:       AX_P, CX_P, DX_P, xmm0, xmm1, flags
//
//  Register roles after the setup sequence:
//      AX_P = (long) x (truncated)    DX_P = bits of |x|    CX_P = sign bit of x
//
    cvttss2si   %xmm0, AX_P                         // (long) x, set invalid / inexact if necessary
    MOVP        DX_P, CX_P                          // x
    and         $0x7fffffff, DX_P                   // |x|
    XORP        DX_P, CX_P                          // signof( x )
    cmpl        $0x4b000000, %edx                   // |x| >= 0x1.0p23f or NaN
    jae         2f

    cvtsi2ss    AX_P, %xmm1                         // trunc(x)
    ucomiss     %xmm0, %xmm1                        // x == trunc(x)
    je          1f                                  // x is already an integer: return (long) x

    orl         $0x3f000000, %ecx                   // copysign( 0.5f, x )
    movd        %ecx, %xmm1                         // copysign( 0.5f, x )
    // 0.5f - 1 ulp plus 0.5f would round up to 1.0f; the truncated value
    // already in AX_P is the right answer for that input.
    cmpl        $0x3effffff, %edx                   // |x| == 0.5f - 1 ulp
    je          1f                                  // return (long) x

    addss       %xmm1, %xmm0                        // x += copysign( 0.5, x )
    cvttss2si   %xmm0, AX_P                         // (long) (x + copysign( 0.5, x ) )

 1: ret

 2: // |x| >= 0x1.0p23f or NaN
    cmpl        $0x7f800000, %edx                   // |x| bits above +infinity: x is NaN
    ja          3f                                  // NaN: return what cvttss2si produced
    cmpl        $LONG_MIN_f, %edx                   // |x| below the overflow threshold
    jb          1b                                  // in range and already an integer: return (long) x
    SUBP        $1, CX_P                            // x < 0 ? 0x7fffffff : -1LL
    sar         $31, CX_P                           // x < 0 ? 0 : -1LL
    XORP        CX_P, AX_P                          // flip LONG_LONG_MIN to LONG_LONG_MAX if needed
 3: ret