Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
at v5.5 (378 lines, 8.3 kB)
// SPDX-License-Identifier: GPL-2.0
/*
 *	MMX 3DNow! library helper functions
 *
 *	To do:
 *	We can use MMX just for prefetch in IRQ's. This may be a win.
 *		(reported so on K6-III)
 *	We should use a better code neutral filler for the short jump
 *		leal ebx. [ebx] is apparently best for K6-2, but Cyrix ??
 *	We also want to clobber the filler register so we don't get any
 *		register forwarding stalls on the filler.
 *
 *	Add *user handling. Checksums are not a win with MMX on any CPU
 *	tested so far for any MMX solution figured.
 *
 *	22/09/2000 - Arjan van de Ven
 *		Improved for non-engineering-sample Athlons
 *
 */
#include <linux/hardirq.h>
#include <linux/string.h>
#include <linux/export.h>
#include <linux/sched.h>
#include <linux/types.h>

#include <asm/fpu/api.h>
#include <asm/asm.h>

void *_mmx_memcpy(void *to, const void *from, size_t len)
{
	void *p;
	int i;

	if (unlikely(in_interrupt()))
		return __memcpy(to, from, len);

	p = to;
	i = len >> 6; /* len/64 */

	kernel_fpu_begin();

	__asm__ __volatile__ (
		"1: prefetch (%0)\n"		/* This set is 28 bytes */
		"   prefetch 64(%0)\n"
		"   prefetch 128(%0)\n"
		"   prefetch 192(%0)\n"
		"   prefetch 256(%0)\n"
		"2:  \n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x1AEB, 1b\n"	/* jmp on 26 bytes */
		"   jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b)
			: : "r" (from));

	for ( ; i > 5; i--) {
		__asm__ __volatile__ (
		"1: prefetch 320(%0)\n"
		"2: movq (%0), %%mm0\n"
		"   movq 8(%0), %%mm1\n"
		"   movq 16(%0), %%mm2\n"
		"   movq 24(%0), %%mm3\n"
		"   movq %%mm0, (%1)\n"
		"   movq %%mm1, 8(%1)\n"
		"   movq %%mm2, 16(%1)\n"
		"   movq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm0\n"
		"   movq 40(%0), %%mm1\n"
		"   movq 48(%0), %%mm2\n"
		"   movq 56(%0), %%mm3\n"
		"   movq %%mm0, 32(%1)\n"
		"   movq %%mm1, 40(%1)\n"
		"   movq %%mm2, 48(%1)\n"
		"   movq %%mm3, 56(%1)\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x05EB, 1b\n"	/* jmp on 5 bytes */
		"   jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b)
			: : "r" (from), "r" (to) : "memory");

		from += 64;
		to += 64;
	}

	for ( ; i > 0; i--) {
		__asm__ __volatile__ (
		"   movq (%0), %%mm0\n"
		"   movq 8(%0), %%mm1\n"
		"   movq 16(%0), %%mm2\n"
		"   movq 24(%0), %%mm3\n"
		"   movq %%mm0, (%1)\n"
		"   movq %%mm1, 8(%1)\n"
		"   movq %%mm2, 16(%1)\n"
		"   movq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm0\n"
		"   movq 40(%0), %%mm1\n"
		"   movq 48(%0), %%mm2\n"
		"   movq 56(%0), %%mm3\n"
		"   movq %%mm0, 32(%1)\n"
		"   movq %%mm1, 40(%1)\n"
		"   movq %%mm2, 48(%1)\n"
		"   movq %%mm3, 56(%1)\n"
			: : "r" (from), "r" (to) : "memory");

		from += 64;
		to += 64;
	}
	/*
	 * Now do the tail of the block:
	 */
	__memcpy(to, from, len & 63);
	kernel_fpu_end();

	return p;
}
EXPORT_SYMBOL(_mmx_memcpy);

#ifdef CONFIG_MK7

/*
 *	The K7 has streaming cache bypass load/store. The Cyrix III, K6 and
 *	other MMX using processors do not.
 */

static void fast_clear_page(void *page)
{
	int i;

	kernel_fpu_begin();

	__asm__ __volatile__ (
		"  pxor %%mm0, %%mm0\n" : :
	);

	for (i = 0; i < 4096/64; i++) {
		__asm__ __volatile__ (
		"  movntq %%mm0, (%0)\n"
		"  movntq %%mm0, 8(%0)\n"
		"  movntq %%mm0, 16(%0)\n"
		"  movntq %%mm0, 24(%0)\n"
		"  movntq %%mm0, 32(%0)\n"
		"  movntq %%mm0, 40(%0)\n"
		"  movntq %%mm0, 48(%0)\n"
		"  movntq %%mm0, 56(%0)\n"
			: : "r" (page) : "memory");
		page += 64;
	}

	/*
	 * Since movntq is weakly-ordered, a "sfence" is needed to become
	 * ordered again:
	 */
	__asm__ __volatile__("sfence\n"::);

	kernel_fpu_end();
}

static void fast_copy_page(void *to, void *from)
{
	int i;

	kernel_fpu_begin();

	/*
	 * maybe the prefetch stuff can go before the expensive fnsave...
	 * but that is for later. -AV
	 */
	__asm__ __volatile__(
		"1: prefetch (%0)\n"
		"   prefetch 64(%0)\n"
		"   prefetch 128(%0)\n"
		"   prefetch 192(%0)\n"
		"   prefetch 256(%0)\n"
		"2:  \n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x1AEB, 1b\n"	/* jmp on 26 bytes */
		"   jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b) : : "r" (from));

	for (i = 0; i < (4096-320)/64; i++) {
		__asm__ __volatile__ (
		"1: prefetch 320(%0)\n"
		"2: movq (%0), %%mm0\n"
		"   movntq %%mm0, (%1)\n"
		"   movq 8(%0), %%mm1\n"
		"   movntq %%mm1, 8(%1)\n"
		"   movq 16(%0), %%mm2\n"
		"   movntq %%mm2, 16(%1)\n"
		"   movq 24(%0), %%mm3\n"
		"   movntq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm4\n"
		"   movntq %%mm4, 32(%1)\n"
		"   movq 40(%0), %%mm5\n"
		"   movntq %%mm5, 40(%1)\n"
		"   movq 48(%0), %%mm6\n"
		"   movntq %%mm6, 48(%1)\n"
		"   movq 56(%0), %%mm7\n"
		"   movntq %%mm7, 56(%1)\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x05EB, 1b\n"	/* jmp on 5 bytes */
		"   jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b) : : "r" (from), "r" (to) : "memory");

		from += 64;
		to += 64;
	}

	for (i = (4096-320)/64; i < 4096/64; i++) {
		__asm__ __volatile__ (
		"2: movq (%0), %%mm0\n"
		"   movntq %%mm0, (%1)\n"
		"   movq 8(%0), %%mm1\n"
		"   movntq %%mm1, 8(%1)\n"
		"   movq 16(%0), %%mm2\n"
		"   movntq %%mm2, 16(%1)\n"
		"   movq 24(%0), %%mm3\n"
		"   movntq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm4\n"
		"   movntq %%mm4, 32(%1)\n"
		"   movq 40(%0), %%mm5\n"
		"   movntq %%mm5, 40(%1)\n"
		"   movq 48(%0), %%mm6\n"
		"   movntq %%mm6, 48(%1)\n"
		"   movq 56(%0), %%mm7\n"
		"   movntq %%mm7, 56(%1)\n"
			: : "r" (from), "r" (to) : "memory");
		from += 64;
		to += 64;
	}
	/*
	 * Since movntq is weakly-ordered, a "sfence" is needed to become
	 * ordered again:
	 */
	__asm__ __volatile__("sfence \n"::);
	kernel_fpu_end();
}

#else /* CONFIG_MK7 */

/*
 *	Generic MMX implementation without K7 specific streaming
 */
static void fast_clear_page(void *page)
{
	int i;

	kernel_fpu_begin();

	__asm__ __volatile__ (
		"  pxor %%mm0, %%mm0\n" : :
	);

	for (i = 0; i < 4096/128; i++) {
		__asm__ __volatile__ (
		"  movq %%mm0, (%0)\n"
		"  movq %%mm0, 8(%0)\n"
		"  movq %%mm0, 16(%0)\n"
		"  movq %%mm0, 24(%0)\n"
		"  movq %%mm0, 32(%0)\n"
		"  movq %%mm0, 40(%0)\n"
		"  movq %%mm0, 48(%0)\n"
		"  movq %%mm0, 56(%0)\n"
		"  movq %%mm0, 64(%0)\n"
		"  movq %%mm0, 72(%0)\n"
		"  movq %%mm0, 80(%0)\n"
		"  movq %%mm0, 88(%0)\n"
		"  movq %%mm0, 96(%0)\n"
		"  movq %%mm0, 104(%0)\n"
		"  movq %%mm0, 112(%0)\n"
		"  movq %%mm0, 120(%0)\n"
			: : "r" (page) : "memory");
		page += 128;
	}

	kernel_fpu_end();
}

static void fast_copy_page(void *to, void *from)
{
	int i;

	kernel_fpu_begin();

	__asm__ __volatile__ (
		"1: prefetch (%0)\n"
		"   prefetch 64(%0)\n"
		"   prefetch 128(%0)\n"
		"   prefetch 192(%0)\n"
		"   prefetch 256(%0)\n"
		"2:  \n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x1AEB, 1b\n"	/* jmp on 26 bytes */
		"   jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b) : : "r" (from));

	for (i = 0; i < 4096/64; i++) {
		__asm__ __volatile__ (
		"1: prefetch 320(%0)\n"
		"2: movq (%0), %%mm0\n"
		"   movq 8(%0), %%mm1\n"
		"   movq 16(%0), %%mm2\n"
		"   movq 24(%0), %%mm3\n"
		"   movq %%mm0, (%1)\n"
		"   movq %%mm1, 8(%1)\n"
		"   movq %%mm2, 16(%1)\n"
		"   movq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm0\n"
		"   movq 40(%0), %%mm1\n"
		"   movq 48(%0), %%mm2\n"
		"   movq 56(%0), %%mm3\n"
		"   movq %%mm0, 32(%1)\n"
		"   movq %%mm1, 40(%1)\n"
		"   movq %%mm2, 48(%1)\n"
		"   movq %%mm3, 56(%1)\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x05EB, 1b\n"	/* jmp on 5 bytes */
		"   jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b)
			: : "r" (from), "r" (to) : "memory");

		from += 64;
		to += 64;
	}
	kernel_fpu_end();
}

#endif /* !CONFIG_MK7 */

/*
 * Favour MMX for page clear and copy:
 */
static void slow_zero_page(void *page)
{
	int d0, d1;

	__asm__ __volatile__(
		"cld\n\t"
		"rep ; stosl"

			: "=&c" (d0), "=&D" (d1)
			:"a" (0), "1" (page), "0" (1024)
			:"memory");
}

void mmx_clear_page(void *page)
{
	if (unlikely(in_interrupt()))
		slow_zero_page(page);
	else
		fast_clear_page(page);
}
EXPORT_SYMBOL(mmx_clear_page);

static void slow_copy_page(void *to, void *from)
{
	int d0, d1, d2;

	__asm__ __volatile__(
		"cld\n\t"
		"rep ; movsl"
		: "=&c" (d0), "=&D" (d1), "=&S" (d2)
		: "0" (1024), "1" ((long) to), "2" ((long) from)
		: "memory");
}

void mmx_copy_page(void *to, void *from)
{
	if (unlikely(in_interrupt()))
		slow_copy_page(to, from);
	else
		fast_copy_page(to, from);
}
EXPORT_SYMBOL(mmx_copy_page);
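
For context, the sketch below shows roughly how a 32-bit x86 build might route its generic page primitives through the helpers exported above. The CONFIG_X86_USE_3DNOW guard and the clear_page()/copy_page() wrapper shapes are assumptions made for illustration (the real glue lives in the arch headers, not in this file):

/*
 * Illustrative sketch only, not part of this file: one way a caller
 * could select the MMX helpers as its page clear/copy primitives.
 * The config guard and wrapper names are assumptions for illustration.
 */
#ifdef CONFIG_X86_USE_3DNOW
#include <asm/mmx.h>

static inline void clear_page(void *page)
{
	mmx_clear_page(page);	/* falls back to rep stosl in IRQ context */
}

static inline void copy_page(void *to, void *from)
{
	mmx_copy_page(to, from);	/* falls back to rep movsl in IRQ context */
}
#endif

Because mmx_clear_page() and mmx_copy_page() already check in_interrupt() and fall back to the rep-string versions, a caller does not need to worry about FPU/MMX state when running in interrupt context.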