at v2.6.12-rc4 217 lines 3.6 kB view raw
#include <asm/ppc_asm.h>
#include <asm/processor.h>

/*
 * Everything below is hand-written assembler so that the use of the
 * floating-point registers stays under our exact control.  Callers
 * must have preemption disabled.
 */
	.data
fpzero:
	.long	0
fpone:
	.long	0x3f800000	/* 1.0 in single-precision FP */
fphalf:
	.long	0x3f000000	/* 0.5 in single-precision FP */

	.text
/*
 * Internal helper: turn floating point on and load FPSCR with 0.
 * Never call this from C; it ignores the normal calling convention.
 *
 * On return:
 *   r10        = MSR value from before MSR_FP was set
 *   fr31       = FPSCR value from before it was zeroed
 *   8(r1)      = saved fr31
 *   16(r1)     = saved fr1
 *   24(r1)     = saved fr0
 * r11 and fr1 are used as scratch.
 */
fpenable:
	mfmsr	r10
	ori	r11,r10,MSR_FP
	mtmsr	r11
	isync				/* FP must be usable before the stfd below */
	stfd	fr31,8(r1)
	stfd	fr1,16(r1)
	stfd	fr0,24(r1)
	mffs	fr31			/* keep the old FPSCR in fr31 */
	lis	r11,fpzero@ha
	lfs	fr1,fpzero@l(r11)
	mtfsf	0xff,fr1		/* FPSCR = 0 */
	blr

/*
 * Internal helper: inverse of fpenable.  Writes fr31 back to FPSCR,
 * reloads fr0/fr1/fr31 from the slots fpenable filled, and puts the
 * MSR value held in r10 back in place.
 */
fpdisable:
	mtfsf	0xff,fr31		/* must precede the reload of fr31 */
	lfd	fr0,24(r1)
	lfd	fr1,16(r1)
	lfd	fr31,8(r1)
	mtmsr	r10
	isync
	blr

/*
 * Vector add, floating point.
 * r3 -> destination, r4 -> first source, r5 -> second source.
 * Processes four single-precision elements: dst[i] = a[i] + b[i].
 */
	.globl	vaddfp
vaddfp:
	stwu	r1,-32(r1)
	mflr	r0
	stw	r0,36(r1)		/* LR save word of the caller's frame */
	bl	fpenable
	li	r6,0			/* r6 = byte offset of current element */
	li	r0,4
	mtctr	r0			/* four elements to process */
1:	lfsx	fr0,r4,r6
	lfsx	fr1,r5,r6
	fadds	fr0,fr0,fr1
	stfsx	fr0,r3,r6
	addi	r6,r6,4
	bdnz	1b
	bl	fpdisable
	lwz	r0,36(r1)
	addi	r1,r1,32
	mtlr	r0
	blr

/*
 * Vector subtract, floating point.
 * r3 -> destination, r4 -> first source, r5 -> second source.
 * Processes four single-precision elements: dst[i] = a[i] - b[i].
 */
	.globl	vsubfp
vsubfp:
	stwu	r1,-32(r1)
	mflr	r0
	stw	r0,36(r1)		/* LR save word of the caller's frame */
	bl	fpenable
	li	r6,0			/* r6 = byte offset of current element */
	li	r0,4
	mtctr	r0			/* four elements to process */
1:	lfsx	fr0,r4,r6
	lfsx	fr1,r5,r6
	fsubs	fr0,fr0,fr1
	stfsx	fr0,r3,r6
	addi	r6,r6,4
	bdnz	1b
	bl	fpdisable
	lwz	r0,36(r1)
	addi	r1,r1,32
	mtlr	r0
	blr

/*
 * Vector multiply and add, floating point.
*/
/*
 * vmaddfp: r3 -> destination, r4 -> a, r5 -> b, r6 -> c.
 * Processes four single-precision elements: dst[i] = a[i] * c[i] + b[i].
 * Uses a 48-byte frame; fr2 is preserved at 32(r1).
 */
	.globl	vmaddfp
vmaddfp:
	stwu	r1,-48(r1)
	mflr	r0
	stw	r0,52(r1)		/* LR save word of the caller's frame */
	bl	fpenable
	stfd	fr2,32(r1)
	li	r7,0			/* r7 = byte offset of current element */
	li	r0,4
	mtctr	r0			/* four elements to process */
1:	lfsx	fr0,r4,r7
	lfsx	fr1,r5,r7
	lfsx	fr2,r6,r7
	fmadds	fr0,fr0,fr2,fr1		/* a * c + b */
	stfsx	fr0,r3,r7
	addi	r7,r7,4
	bdnz	1b
	lfd	fr2,32(r1)
	bl	fpdisable
	lwz	r0,52(r1)
	addi	r1,r1,48
	mtlr	r0
	blr

/*
 * Vector negative multiply and subtract, floating point.
 * r3 -> destination, r4 -> a, r5 -> b, r6 -> c.
 * Processes four single-precision elements: dst[i] = -(a[i] * c[i] - b[i]).
 * Uses a 48-byte frame; fr2 is preserved at 32(r1).
 */
	.globl	vnmsubfp
vnmsubfp:
	stwu	r1,-48(r1)
	mflr	r0
	stw	r0,52(r1)		/* LR save word of the caller's frame */
	bl	fpenable
	stfd	fr2,32(r1)
	li	r7,0			/* r7 = byte offset of current element */
	li	r0,4
	mtctr	r0			/* four elements to process */
1:	lfsx	fr0,r4,r7
	lfsx	fr1,r5,r7
	lfsx	fr2,r6,r7
	fnmsubs	fr0,fr0,fr2,fr1		/* -(a * c - b) */
	stfsx	fr0,r3,r7
	addi	r7,r7,4
	bdnz	1b
	lfd	fr2,32(r1)
	bl	fpdisable
	lwz	r0,52(r1)
	addi	r1,r1,48
	mtlr	r0
	blr

/*
 * Vector reciprocal estimate.  We just compute 1.0/x.
 * r3 -> destination, r4 -> source.
 * Processes four single-precision elements.
 */
	.globl	vrefp
vrefp:
	stwu	r1,-32(r1)
	mflr	r0
	stw	r0,36(r1)		/* LR save word of the caller's frame */
	bl	fpenable
	lis	r9,fpone@ha
	lfs	fr1,fpone@l(r9)		/* fr1 = 1.0 for the whole loop */
	li	r6,0			/* r6 = byte offset of current element */
	li	r0,4
	mtctr	r0			/* four elements to process */
1:	lfsx	fr0,r4,r6
	fdivs	fr0,fr1,fr0		/* 1.0 / x */
	stfsx	fr0,r3,r6
	addi	r6,r6,4
	bdnz	1b
	bl	fpdisable
	lwz	r0,36(r1)
	addi	r1,r1,32
	mtlr	r0
	blr

/*
 * Vector reciprocal square-root estimate, floating point.
 * We use the frsqrte instruction for the initial estimate followed
 * by 2 iterations of Newton-Raphson to get sufficient accuracy.
 * r3 -> destination, r4 -> source.
*/
/*
 * vrsqrtefp processes four single-precision elements:
 *   dst[i] ~= 1 / sqrt(src[i])
 *
 * Stack frame (64 bytes, keeps r1 16-byte aligned):
 *    0(r1)  back chain
 *    8(r1)  fr31  \
 *   16(r1)  fr1    > written by fpenable, reloaded by fpdisable
 *   24(r1)  fr0   /
 *   32(r1)  fr2
 *   40(r1)  fr3
 *   48(r1)  fr4
 *   56(r1)  fr5
 *   68(r1)  LR, in the LR save word of the caller's frame
 *
 * The frame has to hold all four extra FP save slots: with anything
 * smaller than 64 bytes the stores to 48(r1) and 56(r1) land on the
 * caller's back chain and on the saved LR.  The epilogue must reload
 * LR from, and pop, exactly what the prologue set up.
 */
	.globl	vrsqrtefp
vrsqrtefp:
	stwu	r1,-64(r1)
	mflr	r0
	stw	r0,68(r1)
	bl	fpenable
	stfd	fr2,32(r1)
	stfd	fr3,40(r1)
	stfd	fr4,48(r1)
	stfd	fr5,56(r1)
	lis	r9,fpone@ha
	lis	r8,fphalf@ha
	li	r0,4
	lfs	fr4,fpone@l(r9)		/* fr4 = 1.0 */
	lfs	fr5,fphalf@l(r8)	/* fr5 = 0.5 */
	mtctr	r0			/* four elements to process */
	li	r6,0			/* r6 = byte offset of current element */
1:	lfsx	fr0,r4,r6
	frsqrte	fr1,fr0			/* r = frsqrte(s) */
	fmuls	fr3,fr1,fr0		/* r * s */
	fmuls	fr2,fr1,fr5		/* r * 0.5 */
	fnmsubs	fr3,fr1,fr3,fr4		/* 1 - s * r * r */
	fmadds	fr1,fr2,fr3,fr1		/* r = r + 0.5 * r * (1 - s * r * r) */
	fmuls	fr3,fr1,fr0		/* r * s */
	fmuls	fr2,fr1,fr5		/* r * 0.5 */
	fnmsubs	fr3,fr1,fr3,fr4		/* 1 - s * r * r */
	fmadds	fr1,fr2,fr3,fr1		/* r = r + 0.5 * r * (1 - s * r * r) */
	stfsx	fr1,r3,r6
	addi	r6,r6,4
	bdnz	1b
	lfd	fr5,56(r1)
	lfd	fr4,48(r1)
	lfd	fr3,40(r1)
	lfd	fr2,32(r1)
	bl	fpdisable
	lwz	r0,68(r1)
	mtlr	r0
	addi	r1,r1,64
	blr