#include <linux/types.h>
#include <linux/string.h>
#include <linux/sched.h>
#include <linux/hardirq.h>
#include <linux/module.h>

#include <asm/i387.h>


/*
 *	MMX 3DNow! library helper functions
 *
 *	To do:
 *	We can use MMX just for prefetch in IRQ's. This may be a win.
 *		(reported so on K6-III)
 *	We should use a better code neutral filler for the short jump
 *		leal ebx. [ebx] is apparently best for K6-2, but Cyrix ??
 *	We also want to clobber the filler register so we don't get any
 *		register forwarding stalls on the filler.
 *
 *	Add *user handling. Checksums are not a win with MMX on any CPU
 *	tested so far for any MMX solution figured.
 *
 *	22/09/2000 - Arjan van de Ven
 *		Improved for non-engineering-sample Athlons
 *
 */

void *_mmx_memcpy(void *to, const void *from, size_t len)
{
	void *p;
	int i;

	if (unlikely(in_interrupt()))
		return __memcpy(to, from, len);

	p = to;
	i = len >> 6;	/* len/64 */

	kernel_fpu_begin();

	__asm__ __volatile__ (
		"1: prefetch (%0)\n"	/* This set is 28 bytes */
		"   prefetch 64(%0)\n"
		"   prefetch 128(%0)\n"
		"   prefetch 192(%0)\n"
		"   prefetch 256(%0)\n"
		"2:\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x1AEB, 1b\n"	/* jmp on 26 bytes */
		"   jmp 2b\n"
		".previous\n"
		".section __ex_table,\"a\"\n"
		"	.align 4\n"
		"	.long 1b, 3b\n"
		".previous"
		: : "r" (from));

	for (; i > 5; i--) {
		__asm__ __volatile__ (
		"1: prefetch 320(%0)\n"
		"2: movq (%0), %%mm0\n"
		"   movq 8(%0), %%mm1\n"
		"   movq 16(%0), %%mm2\n"
		"   movq 24(%0), %%mm3\n"
		"   movq %%mm0, (%1)\n"
		"   movq %%mm1, 8(%1)\n"
		"   movq %%mm2, 16(%1)\n"
		"   movq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm0\n"
		"   movq 40(%0), %%mm1\n"
		"   movq 48(%0), %%mm2\n"
		"   movq 56(%0), %%mm3\n"
		"   movq %%mm0, 32(%1)\n"
		"   movq %%mm1, 40(%1)\n"
		"   movq %%mm2, 48(%1)\n"
		"   movq %%mm3, 56(%1)\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x05EB, 1b\n"	/* jmp on 5 bytes */
		"   jmp 2b\n"
		".previous\n"
		".section __ex_table,\"a\"\n"
		"	.align 4\n"
		"	.long 1b, 3b\n"
		".previous"
		: : "r" (from), "r" (to) : "memory");
		from += 64;
		to += 64;
	}

	for (; i > 0; i--) {
		__asm__ __volatile__ (
		"   movq (%0), %%mm0\n"
		"   movq 8(%0), %%mm1\n"
		"   movq 16(%0), %%mm2\n"
		"   movq 24(%0), %%mm3\n"
		"   movq %%mm0, (%1)\n"
		"   movq %%mm1, 8(%1)\n"
		"   movq %%mm2, 16(%1)\n"
		"   movq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm0\n"
		"   movq 40(%0), %%mm1\n"
		"   movq 48(%0), %%mm2\n"
		"   movq 56(%0), %%mm3\n"
		"   movq %%mm0, 32(%1)\n"
		"   movq %%mm1, 40(%1)\n"
		"   movq %%mm2, 48(%1)\n"
		"   movq %%mm3, 56(%1)\n"
		: : "r" (from), "r" (to) : "memory");
		from += 64;
		to += 64;
	}
	/*
	 * Now do the tail of the block
	 */
	__memcpy(to, from, len & 63);
	kernel_fpu_end();
	return p;
}
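/*
 * How the .fixup/__ex_table pairs above work (per the "This set is
 * 28 bytes" and "jmp on 26/5 bytes" comments): on a CPU without
 * 3DNow! the prefetch at label 1 raises an invalid-opcode fault, and
 * the __ex_table entry sends the fault handler to label 3.  There,
 * "movw $0x1AEB, 1b" stores the bytes EB 1A (little-endian) over the
 * faulting instruction: a 2-byte short jmp with a 26-byte
 * displacement, spanning exactly the 28-byte prefetch block, so
 * execution resumes at label 2 and every later pass skips the
 * prefetches at no cost.  $0x05EB is the same patch for the single
 * 7-byte "prefetch 320(%0)" (2-byte jmp plus 5 bytes skipped).
 */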
#ifdef CONFIG_MK7

/*
 *	The K7 has streaming cache bypass load/store. The Cyrix III, K6
 *	and other MMX-using processors do not.
 */

static void fast_clear_page(void *page)
{
	int i;

	kernel_fpu_begin();

	__asm__ __volatile__ (
		"   pxor %%mm0, %%mm0\n" : :
	);

	for (i = 0; i < 4096/64; i++) {
		__asm__ __volatile__ (
		"   movntq %%mm0, (%0)\n"
		"   movntq %%mm0, 8(%0)\n"
		"   movntq %%mm0, 16(%0)\n"
		"   movntq %%mm0, 24(%0)\n"
		"   movntq %%mm0, 32(%0)\n"
		"   movntq %%mm0, 40(%0)\n"
		"   movntq %%mm0, 48(%0)\n"
		"   movntq %%mm0, 56(%0)\n"
		: : "r" (page) : "memory");
		page += 64;
	}
	/*
	 * movntq stores are weakly ordered; an sfence is needed to make
	 * them ordered again.
	 */
	__asm__ __volatile__ (
		"   sfence\n" : :
	);
	kernel_fpu_end();
}

static void fast_copy_page(void *to, void *from)
{
	int i;

	kernel_fpu_begin();

	/*
	 * maybe the prefetch stuff can go before the expensive fnsave...
	 * but that is for later. -AV
	 */
	__asm__ __volatile__ (
		"1: prefetch (%0)\n"
		"   prefetch 64(%0)\n"
		"   prefetch 128(%0)\n"
		"   prefetch 192(%0)\n"
		"   prefetch 256(%0)\n"
		"2:\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x1AEB, 1b\n"	/* jmp on 26 bytes */
		"   jmp 2b\n"
		".previous\n"
		".section __ex_table,\"a\"\n"
		"	.align 4\n"
		"	.long 1b, 3b\n"
		".previous"
		: : "r" (from));

	for (i = 0; i < (4096-320)/64; i++) {
		__asm__ __volatile__ (
		"1: prefetch 320(%0)\n"
		"2: movq (%0), %%mm0\n"
		"   movntq %%mm0, (%1)\n"
		"   movq 8(%0), %%mm1\n"
		"   movntq %%mm1, 8(%1)\n"
		"   movq 16(%0), %%mm2\n"
		"   movntq %%mm2, 16(%1)\n"
		"   movq 24(%0), %%mm3\n"
		"   movntq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm4\n"
		"   movntq %%mm4, 32(%1)\n"
		"   movq 40(%0), %%mm5\n"
		"   movntq %%mm5, 40(%1)\n"
		"   movq 48(%0), %%mm6\n"
		"   movntq %%mm6, 48(%1)\n"
		"   movq 56(%0), %%mm7\n"
		"   movntq %%mm7, 56(%1)\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x05EB, 1b\n"	/* jmp on 5 bytes */
		"   jmp 2b\n"
		".previous\n"
		".section __ex_table,\"a\"\n"
		"	.align 4\n"
		"	.long 1b, 3b\n"
		".previous"
		: : "r" (from), "r" (to) : "memory");
		from += 64;
		to += 64;
	}
	for (i = (4096-320)/64; i < 4096/64; i++) {
		__asm__ __volatile__ (
		"2: movq (%0), %%mm0\n"
		"   movntq %%mm0, (%1)\n"
		"   movq 8(%0), %%mm1\n"
		"   movntq %%mm1, 8(%1)\n"
		"   movq 16(%0), %%mm2\n"
		"   movntq %%mm2, 16(%1)\n"
		"   movq 24(%0), %%mm3\n"
		"   movntq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm4\n"
		"   movntq %%mm4, 32(%1)\n"
		"   movq 40(%0), %%mm5\n"
		"   movntq %%mm5, 40(%1)\n"
		"   movq 48(%0), %%mm6\n"
		"   movntq %%mm6, 48(%1)\n"
		"   movq 56(%0), %%mm7\n"
		"   movntq %%mm7, 56(%1)\n"
		: : "r" (from), "r" (to) : "memory");
		from += 64;
		to += 64;
	}
	/*
	 * movntq stores are weakly ordered; an sfence is needed to make
	 * them ordered again.
	 */
	__asm__ __volatile__ (
		"   sfence\n" : :
	);
	kernel_fpu_end();
}
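/*
 * Why the K7 fast_copy_page() above is split into two loops: the
 * first loop prefetches 320 bytes ahead of the read pointer and so
 * stops 320 bytes short of the end of the page ((4096-320)/64
 * iterations); the last five 64-byte blocks are copied without
 * prefetch, presumably so the CPU is never asked to prefetch beyond
 * the source page.  The movntq stores bypass the cache entirely and
 * are weakly ordered, which is why each function ends with an
 * explicit sfence before kernel_fpu_end().
 */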
#else

/*
 *	Generic MMX implementation without K7 specific streaming
 */

static void fast_clear_page(void *page)
{
	int i;

	kernel_fpu_begin();

	__asm__ __volatile__ (
		"   pxor %%mm0, %%mm0\n" : :
	);

	for (i = 0; i < 4096/128; i++) {
		__asm__ __volatile__ (
		"   movq %%mm0, (%0)\n"
		"   movq %%mm0, 8(%0)\n"
		"   movq %%mm0, 16(%0)\n"
		"   movq %%mm0, 24(%0)\n"
		"   movq %%mm0, 32(%0)\n"
		"   movq %%mm0, 40(%0)\n"
		"   movq %%mm0, 48(%0)\n"
		"   movq %%mm0, 56(%0)\n"
		"   movq %%mm0, 64(%0)\n"
		"   movq %%mm0, 72(%0)\n"
		"   movq %%mm0, 80(%0)\n"
		"   movq %%mm0, 88(%0)\n"
		"   movq %%mm0, 96(%0)\n"
		"   movq %%mm0, 104(%0)\n"
		"   movq %%mm0, 112(%0)\n"
		"   movq %%mm0, 120(%0)\n"
		: : "r" (page) : "memory");
		page += 128;
	}

	kernel_fpu_end();
}

static void fast_copy_page(void *to, void *from)
{
	int i;

	kernel_fpu_begin();

	__asm__ __volatile__ (
		"1: prefetch (%0)\n"
		"   prefetch 64(%0)\n"
		"   prefetch 128(%0)\n"
		"   prefetch 192(%0)\n"
		"   prefetch 256(%0)\n"
		"2:\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x1AEB, 1b\n"	/* jmp on 26 bytes */
		"   jmp 2b\n"
		".previous\n"
		".section __ex_table,\"a\"\n"
		"	.align 4\n"
		"	.long 1b, 3b\n"
		".previous"
		: : "r" (from));

	for (i = 0; i < 4096/64; i++) {
		__asm__ __volatile__ (
		"1: prefetch 320(%0)\n"
		"2: movq (%0), %%mm0\n"
		"   movq 8(%0), %%mm1\n"
		"   movq 16(%0), %%mm2\n"
		"   movq 24(%0), %%mm3\n"
		"   movq %%mm0, (%1)\n"
		"   movq %%mm1, 8(%1)\n"
		"   movq %%mm2, 16(%1)\n"
		"   movq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm0\n"
		"   movq 40(%0), %%mm1\n"
		"   movq 48(%0), %%mm2\n"
		"   movq 56(%0), %%mm3\n"
		"   movq %%mm0, 32(%1)\n"
		"   movq %%mm1, 40(%1)\n"
		"   movq %%mm2, 48(%1)\n"
		"   movq %%mm3, 56(%1)\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x05EB, 1b\n"	/* jmp on 5 bytes */
		"   jmp 2b\n"
		".previous\n"
		".section __ex_table,\"a\"\n"
		"	.align 4\n"
		"	.long 1b, 3b\n"
		".previous"
		: : "r" (from), "r" (to) : "memory");
		from += 64;
		to += 64;
	}
	kernel_fpu_end();
}

#endif

/*
 * Favour MMX for page clear and copy.
 */

static void slow_zero_page(void *page)
{
	int d0, d1;
	__asm__ __volatile__(
		"cld\n\t"
		"rep ; stosl"
		: "=&c" (d0), "=&D" (d1)
		: "a" (0), "1" (page), "0" (1024)	/* 1024 dwords == 4096 bytes */
		: "memory");
}

void mmx_clear_page(void *page)
{
	if (unlikely(in_interrupt()))
		slow_zero_page(page);
	else
		fast_clear_page(page);
}

static void slow_copy_page(void *to, void *from)
{
	int d0, d1, d2;
	__asm__ __volatile__(
		"cld\n\t"
		"rep ; movsl"
		: "=&c" (d0), "=&D" (d1), "=&S" (d2)
		: "0" (1024), "1" ((long) to), "2" ((long) from)	/* 1024 dwords */
		: "memory");
}

void mmx_copy_page(void *to, void *from)
{
	if (unlikely(in_interrupt()))
		slow_copy_page(to, from);
	else
		fast_copy_page(to, from);
}

EXPORT_SYMBOL(_mmx_memcpy);
EXPORT_SYMBOL(mmx_clear_page);
EXPORT_SYMBOL(mmx_copy_page);
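/*
 * Usage sketch (illustrative only, not part of the original file):
 * both exported page helpers operate on whole 4096-byte pages, and
 * callers in interrupt context are routed to the rep stosl/movsl
 * fallbacks because the MMX/FPU state cannot safely be claimed there.
 */
#if 0	/* example only */
static void example_replace_page(void *dst, void *src)
{
	mmx_copy_page(dst, src);	/* copy one 4 KB page */
	mmx_clear_page(src);		/* then zero the source page */
}
#endif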