Merge branch 'x86-mem-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip

* 'x86-mem-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip:
x86, mem: Optimize memmove for small size and unaligned cases
x86, mem: Optimize memcpy by avoiding memory false dependence
x86, mem: Don't implement forward memmove() as memcpy()
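
The third commit matters for correctness, not only speed: memcpy() gives no
ordering guarantee for overlapping buffers, so a memcpy() that copies in large
chunks or partly out of order (as the new 64-bit tail handling does) can
overwrite source bytes before reading them. A minimal user-space sketch of the
semantics memmove() has to preserve, illustrative only and not kernel code:

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            char buf[16] = "AABBCCDDEE";

            /* Overlapping forward move: dest < src, regions overlap. */
            memmove(buf, buf + 2, 8);
            buf[8] = '\0';

            /* Must print "BBCCDDEE"; routing this through a memcpy() that
             * copies the tail first could clobber unread source bytes. */
            printf("%s\n", buf);
            return 0;
    }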

+469 -85
+181 -16
arch/x86/lib/memcpy_32.c
@@ -22,22 +22,187 @@
 
 void *memmove(void *dest, const void *src, size_t n)
 {
-        int d0, d1, d2;
+        int d0,d1,d2,d3,d4,d5;
+        char *ret = dest;
 
-        if (dest < src) {
-                memcpy(dest, src, n);
-        } else {
-                __asm__ __volatile__(
-                        "std\n\t"
-                        "rep\n\t"
-                        "movsb\n\t"
-                        "cld"
-                        : "=&c" (d0), "=&S" (d1), "=&D" (d2)
-                        :"0" (n),
-                         "1" (n-1+src),
-                         "2" (n-1+dest)
-                        :"memory");
-        }
-        return dest;
+        __asm__ __volatile__(
+                /* Handle more 16bytes in loop */
+                "cmp $0x10, %0\n\t"
+                "jb 1f\n\t"
+
+                /* Decide forward/backward copy mode */
+                "cmp %2, %1\n\t"
+                "jb 2f\n\t"
+
+                /*
+                 * movs instruction have many startup latency
+                 * so we handle small size by general register.
+                 */
+                "cmp $680, %0\n\t"
+                "jb 3f\n\t"
+                /*
+                 * movs instruction is only good for aligned case.
+                 */
+                "mov %1, %3\n\t"
+                "xor %2, %3\n\t"
+                "and $0xff, %3\n\t"
+                "jz 4f\n\t"
+                "3:\n\t"
+                "sub $0x10, %0\n\t"
+
+                /*
+                 * We gobble 16byts forward in each loop.
+                 */
+                "3:\n\t"
+                "sub $0x10, %0\n\t"
+                "mov 0*4(%1), %3\n\t"
+                "mov 1*4(%1), %4\n\t"
+                "mov %3, 0*4(%2)\n\t"
+                "mov %4, 1*4(%2)\n\t"
+                "mov 2*4(%1), %3\n\t"
+                "mov 3*4(%1), %4\n\t"
+                "mov %3, 2*4(%2)\n\t"
+                "mov %4, 3*4(%2)\n\t"
+                "lea 0x10(%1), %1\n\t"
+                "lea 0x10(%2), %2\n\t"
+                "jae 3b\n\t"
+                "add $0x10, %0\n\t"
+                "jmp 1f\n\t"
+
+                /*
+                 * Handle data forward by movs.
+                 */
+                ".p2align 4\n\t"
+                "4:\n\t"
+                "mov -4(%1, %0), %3\n\t"
+                "lea -4(%2, %0), %4\n\t"
+                "shr $2, %0\n\t"
+                "rep movsl\n\t"
+                "mov %3, (%4)\n\t"
+                "jmp 11f\n\t"
+                /*
+                 * Handle data backward by movs.
+                 */
+                ".p2align 4\n\t"
+                "6:\n\t"
+                "mov (%1), %3\n\t"
+                "mov %2, %4\n\t"
+                "lea -4(%1, %0), %1\n\t"
+                "lea -4(%2, %0), %2\n\t"
+                "shr $2, %0\n\t"
+                "std\n\t"
+                "rep movsl\n\t"
+                "mov %3,(%4)\n\t"
+                "cld\n\t"
+                "jmp 11f\n\t"
+
+                /*
+                 * Start to prepare for backward copy.
+                 */
+                ".p2align 4\n\t"
+                "2:\n\t"
+                "cmp $680, %0\n\t"
+                "jb 5f\n\t"
+                "mov %1, %3\n\t"
+                "xor %2, %3\n\t"
+                "and $0xff, %3\n\t"
+                "jz 6b\n\t"
+
+                /*
+                 * Calculate copy position to tail.
+                 */
+                "5:\n\t"
+                "add %0, %1\n\t"
+                "add %0, %2\n\t"
+                "sub $0x10, %0\n\t"
+
+                /*
+                 * We gobble 16byts backward in each loop.
+                 */
+                "7:\n\t"
+                "sub $0x10, %0\n\t"
+
+                "mov -1*4(%1), %3\n\t"
+                "mov -2*4(%1), %4\n\t"
+                "mov %3, -1*4(%2)\n\t"
+                "mov %4, -2*4(%2)\n\t"
+                "mov -3*4(%1), %3\n\t"
+                "mov -4*4(%1), %4\n\t"
+                "mov %3, -3*4(%2)\n\t"
+                "mov %4, -4*4(%2)\n\t"
+                "lea -0x10(%1), %1\n\t"
+                "lea -0x10(%2), %2\n\t"
+                "jae 7b\n\t"
+                /*
+                 * Calculate copy position to head.
+                 */
+                "add $0x10, %0\n\t"
+                "sub %0, %1\n\t"
+                "sub %0, %2\n\t"
+
+                /*
+                 * Move data from 8 bytes to 15 bytes.
+                 */
+                ".p2align 4\n\t"
+                "1:\n\t"
+                "cmp $8, %0\n\t"
+                "jb 8f\n\t"
+                "mov 0*4(%1), %3\n\t"
+                "mov 1*4(%1), %4\n\t"
+                "mov -2*4(%1, %0), %5\n\t"
+                "mov -1*4(%1, %0), %1\n\t"
+
+                "mov %3, 0*4(%2)\n\t"
+                "mov %4, 1*4(%2)\n\t"
+                "mov %5, -2*4(%2, %0)\n\t"
+                "mov %1, -1*4(%2, %0)\n\t"
+                "jmp 11f\n\t"
+
+                /*
+                 * Move data from 4 bytes to 7 bytes.
+                 */
+                ".p2align 4\n\t"
+                "8:\n\t"
+                "cmp $4, %0\n\t"
+                "jb 9f\n\t"
+                "mov 0*4(%1), %3\n\t"
+                "mov -1*4(%1, %0), %4\n\t"
+                "mov %3, 0*4(%2)\n\t"
+                "mov %4, -1*4(%2, %0)\n\t"
+                "jmp 11f\n\t"
+
+                /*
+                 * Move data from 2 bytes to 3 bytes.
+                 */
+                ".p2align 4\n\t"
+                "9:\n\t"
+                "cmp $2, %0\n\t"
+                "jb 10f\n\t"
+                "movw 0*2(%1), %%dx\n\t"
+                "movw -1*2(%1, %0), %%bx\n\t"
+                "movw %%dx, 0*2(%2)\n\t"
+                "movw %%bx, -1*2(%2, %0)\n\t"
+                "jmp 11f\n\t"
+
+                /*
+                 * Move data for 1 byte.
+                 */
+                ".p2align 4\n\t"
+                "10:\n\t"
+                "cmp $1, %0\n\t"
+                "jb 11f\n\t"
+                "movb (%1), %%cl\n\t"
+                "movb %%cl, (%2)\n\t"
+                ".p2align 4\n\t"
+                "11:"
+                : "=&c" (d0), "=&S" (d1), "=&D" (d2),
+                  "=r" (d3),"=r" (d4), "=r"(d5)
+                :"0" (n),
+                 "1" (src),
+                 "2" (dest)
+                :"memory");
+
+        return ret;
+
 }
 EXPORT_SYMBOL(memmove);
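
A hedged C restatement of the control flow above, illustrative only: the
helper names copy16() and memmove_sketch() are hypothetical, and the rep movsl
fast path (taken for copies of 680 bytes or more when (src ^ dest) & 0xff is
zero) as well as the overlapping word-sized tail moves are omitted here in
favour of a plain byte loop. The key points it keeps are the direction choice
from the pointer order and loading a whole 16-byte block into temporaries
before storing any of it.

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Load the whole 16-byte block before storing, like the register loop
     * in the assembly; that keeps overlapping moves well defined. */
    static void copy16(unsigned char *d, const unsigned char *s)
    {
            uint32_t w0, w1, w2, w3;

            memcpy(&w0, s +  0, 4);
            memcpy(&w1, s +  4, 4);
            memcpy(&w2, s +  8, 4);
            memcpy(&w3, s + 12, 4);
            memcpy(d +  0, &w0, 4);
            memcpy(d +  4, &w1, 4);
            memcpy(d +  8, &w2, 4);
            memcpy(d + 12, &w3, 4);
    }

    static void *memmove_sketch(void *dest, const void *src, size_t n)
    {
            unsigned char *d = dest;
            const unsigned char *s = src;

            if (d < s) {                    /* forward copy is overlap-safe */
                    while (n >= 16) {
                            copy16(d, s);
                            d += 16;
                            s += 16;
                            n -= 16;
                    }
                    while (n--)             /* 0..15 byte tail */
                            *d++ = *s++;
            } else if (d > s) {             /* backward copy, start at the tail */
                    while (n >= 16) {
                            n -= 16;
                            copy16(d + n, s + n);
                    }
                    while (n--)             /* remaining head, highest byte first */
                            d[n] = s[n];
            }
            return dest;
    }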
+108 -60
arch/x86/lib/memcpy_64.S
@@ -40,84 +40,132 @@
 ENTRY(__memcpy)
 ENTRY(memcpy)
         CFI_STARTPROC
-
-        /*
-         * Put the number of full 64-byte blocks into %ecx.
-         * Tail portion is handled at the end:
-         */
         movq %rdi, %rax
-        movl %edx, %ecx
-        shrl $6, %ecx
-        jz .Lhandle_tail
 
+        /*
+         * Use 32bit CMP here to avoid long NOP padding.
+         */
+        cmp $0x20, %edx
+        jb .Lhandle_tail
+
+        /*
+         * We check whether memory false dependece could occur,
+         * then jump to corresponding copy mode.
+         */
+        cmp %dil, %sil
+        jl .Lcopy_backward
+        subl $0x20, %edx
+.Lcopy_forward_loop:
+        subq $0x20, %rdx
+
+        /*
+         * Move in blocks of 4x8 bytes:
+         */
+        movq 0*8(%rsi), %r8
+        movq 1*8(%rsi), %r9
+        movq 2*8(%rsi), %r10
+        movq 3*8(%rsi), %r11
+        leaq 4*8(%rsi), %rsi
+
+        movq %r8, 0*8(%rdi)
+        movq %r9, 1*8(%rdi)
+        movq %r10, 2*8(%rdi)
+        movq %r11, 3*8(%rdi)
+        leaq 4*8(%rdi), %rdi
+        jae .Lcopy_forward_loop
+        addq $0x20, %rdx
+        jmp .Lhandle_tail
+
+.Lcopy_backward:
+        /*
+         * Calculate copy position to tail.
+         */
+        addq %rdx, %rsi
+        addq %rdx, %rdi
+        subq $0x20, %rdx
+        /*
+         * At most 3 ALU operations in one cycle,
+         * so append NOPS in the same 16bytes trunk.
+         */
         .p2align 4
-.Lloop_64:
-        /*
-         * We decrement the loop index here - and the zero-flag is
-         * checked at the end of the loop (instructions inbetween do
-         * not change the zero flag):
-         */
-        decl %ecx
+.Lcopy_backward_loop:
+        subq $0x20, %rdx
+        movq -1*8(%rsi), %r8
+        movq -2*8(%rsi), %r9
+        movq -3*8(%rsi), %r10
+        movq -4*8(%rsi), %r11
+        leaq -4*8(%rsi), %rsi
+        movq %r8, -1*8(%rdi)
+        movq %r9, -2*8(%rdi)
+        movq %r10, -3*8(%rdi)
+        movq %r11, -4*8(%rdi)
+        leaq -4*8(%rdi), %rdi
+        jae .Lcopy_backward_loop
 
         /*
-         * Move in blocks of 4x16 bytes:
+         * Calculate copy position to head.
          */
-        movq 0*8(%rsi), %r11
-        movq 1*8(%rsi), %r8
-        movq %r11, 0*8(%rdi)
-        movq %r8, 1*8(%rdi)
-
-        movq 2*8(%rsi), %r9
-        movq 3*8(%rsi), %r10
-        movq %r9, 2*8(%rdi)
-        movq %r10, 3*8(%rdi)
-
-        movq 4*8(%rsi), %r11
-        movq 5*8(%rsi), %r8
-        movq %r11, 4*8(%rdi)
-        movq %r8, 5*8(%rdi)
-
-        movq 6*8(%rsi), %r9
-        movq 7*8(%rsi), %r10
-        movq %r9, 6*8(%rdi)
-        movq %r10, 7*8(%rdi)
-
-        leaq 64(%rsi), %rsi
-        leaq 64(%rdi), %rdi
-
-        jnz .Lloop_64
-
+        addq $0x20, %rdx
+        subq %rdx, %rsi
+        subq %rdx, %rdi
 .Lhandle_tail:
-        movl %edx, %ecx
-        andl $63, %ecx
-        shrl $3, %ecx
-        jz .Lhandle_7
+        cmpq $16, %rdx
+        jb .Lless_16bytes
 
+        /*
+         * Move data from 16 bytes to 31 bytes.
+         */
+        movq 0*8(%rsi), %r8
+        movq 1*8(%rsi), %r9
+        movq -2*8(%rsi, %rdx), %r10
+        movq -1*8(%rsi, %rdx), %r11
+        movq %r8, 0*8(%rdi)
+        movq %r9, 1*8(%rdi)
+        movq %r10, -2*8(%rdi, %rdx)
+        movq %r11, -1*8(%rdi, %rdx)
+        retq
         .p2align 4
-.Lloop_8:
-        decl %ecx
-        movq (%rsi), %r8
-        movq %r8, (%rdi)
-        leaq 8(%rdi), %rdi
-        leaq 8(%rsi), %rsi
-        jnz .Lloop_8
-
-.Lhandle_7:
-        movl %edx, %ecx
-        andl $7, %ecx
-        jz .Lend
-
+.Lless_16bytes:
+        cmpq $8, %rdx
+        jb .Lless_8bytes
+        /*
+         * Move data from 8 bytes to 15 bytes.
+         */
+        movq 0*8(%rsi), %r8
+        movq -1*8(%rsi, %rdx), %r9
+        movq %r8, 0*8(%rdi)
+        movq %r9, -1*8(%rdi, %rdx)
+        retq
         .p2align 4
+.Lless_8bytes:
+        cmpq $4, %rdx
+        jb .Lless_3bytes
+
+        /*
+         * Move data from 4 bytes to 7 bytes.
+         */
+        movl (%rsi), %ecx
+        movl -4(%rsi, %rdx), %r8d
+        movl %ecx, (%rdi)
+        movl %r8d, -4(%rdi, %rdx)
+        retq
+        .p2align 4
+.Lless_3bytes:
+        cmpl $0, %edx
+        je .Lend
+        /*
+         * Move data from 1 bytes to 3 bytes.
+         */
 .Lloop_1:
         movb (%rsi), %r8b
         movb %r8b, (%rdi)
         incq %rdi
         incq %rsi
-        decl %ecx
+        decl %edx
         jnz .Lloop_1
 
 .Lend:
-        ret
+        retq
         CFI_ENDPROC
 ENDPROC(memcpy)
 ENDPROC(__memcpy)
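
The "cmp %dil, %sil; jl .Lcopy_backward" test above looks only at the low 8
bits of the two pointers. A small C sketch of that decision follows;
copy_backward_preferred() is a hypothetical name, and the microarchitectural
reasoning is my reading of the commit subject and the in-code comment, not a
statement from the source: memcpy()'s buffers never overlap, so either
direction is correct, and copying backward when the source's low byte is
smaller presumably keeps loads from landing on addresses whose low bits match
recently stored destination bytes, which the CPU's memory disambiguation could
otherwise treat as a dependence.

    #include <stdint.h>

    /* Mirrors "cmp %dil, %sil; jl .Lcopy_backward": a signed compare of the
     * low bytes of src and dest decides the copy direction. */
    static int copy_backward_preferred(const void *dest, const void *src)
    {
            int8_t d = (int8_t)(uintptr_t)dest;
            int8_t s = (int8_t)(uintptr_t)src;

            return s < d;   /* signed, like the jl in the assembly */
    }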
+180 -9
arch/x86/lib/memmove_64.c
@@ -8,14 +8,185 @@
 #undef memmove
 void *memmove(void *dest, const void *src, size_t count)
 {
-        if (dest < src) {
-                return memcpy(dest, src, count);
-        } else {
-                char *p = dest + count;
-                const char *s = src + count;
-                while (count--)
-                        *--p = *--s;
-        }
-        return dest;
+        unsigned long d0,d1,d2,d3,d4,d5,d6,d7;
+        char *ret;
+
+        __asm__ __volatile__(
+                /* Handle more 32bytes in loop */
+                "mov %2, %3\n\t"
+                "cmp $0x20, %0\n\t"
+                "jb 1f\n\t"
+
+                /* Decide forward/backward copy mode */
+                "cmp %2, %1\n\t"
+                "jb 2f\n\t"
+
+                /*
+                 * movsq instruction have many startup latency
+                 * so we handle small size by general register.
+                 */
+                "cmp $680, %0\n\t"
+                "jb 3f\n\t"
+                /*
+                 * movsq instruction is only good for aligned case.
+                 */
+                "cmpb %%dil, %%sil\n\t"
+                "je 4f\n\t"
+                "3:\n\t"
+                "sub $0x20, %0\n\t"
+                /*
+                 * We gobble 32byts forward in each loop.
+                 */
+                "5:\n\t"
+                "sub $0x20, %0\n\t"
+                "movq 0*8(%1), %4\n\t"
+                "movq 1*8(%1), %5\n\t"
+                "movq 2*8(%1), %6\n\t"
+                "movq 3*8(%1), %7\n\t"
+                "leaq 4*8(%1), %1\n\t"
+
+                "movq %4, 0*8(%2)\n\t"
+                "movq %5, 1*8(%2)\n\t"
+                "movq %6, 2*8(%2)\n\t"
+                "movq %7, 3*8(%2)\n\t"
+                "leaq 4*8(%2), %2\n\t"
+                "jae 5b\n\t"
+                "addq $0x20, %0\n\t"
+                "jmp 1f\n\t"
+                /*
+                 * Handle data forward by movsq.
+                 */
+                ".p2align 4\n\t"
+                "4:\n\t"
+                "movq %0, %8\n\t"
+                "movq -8(%1, %0), %4\n\t"
+                "lea -8(%2, %0), %5\n\t"
+                "shrq $3, %8\n\t"
+                "rep movsq\n\t"
+                "movq %4, (%5)\n\t"
+                "jmp 13f\n\t"
+                /*
+                 * Handle data backward by movsq.
+                 */
+                ".p2align 4\n\t"
+                "7:\n\t"
+                "movq %0, %8\n\t"
+                "movq (%1), %4\n\t"
+                "movq %2, %5\n\t"
+                "leaq -8(%1, %0), %1\n\t"
+                "leaq -8(%2, %0), %2\n\t"
+                "shrq $3, %8\n\t"
+                "std\n\t"
+                "rep movsq\n\t"
+                "cld\n\t"
+                "movq %4, (%5)\n\t"
+                "jmp 13f\n\t"
+
+                /*
+                 * Start to prepare for backward copy.
+                 */
+                ".p2align 4\n\t"
+                "2:\n\t"
+                "cmp $680, %0\n\t"
+                "jb 6f \n\t"
+                "cmp %%dil, %%sil\n\t"
+                "je 7b \n\t"
+                "6:\n\t"
+                /*
+                 * Calculate copy position to tail.
+                 */
+                "addq %0, %1\n\t"
+                "addq %0, %2\n\t"
+                "subq $0x20, %0\n\t"
+                /*
+                 * We gobble 32byts backward in each loop.
+                 */
+                "8:\n\t"
+                "subq $0x20, %0\n\t"
+                "movq -1*8(%1), %4\n\t"
+                "movq -2*8(%1), %5\n\t"
+                "movq -3*8(%1), %6\n\t"
+                "movq -4*8(%1), %7\n\t"
+                "leaq -4*8(%1), %1\n\t"
+
+                "movq %4, -1*8(%2)\n\t"
+                "movq %5, -2*8(%2)\n\t"
+                "movq %6, -3*8(%2)\n\t"
+                "movq %7, -4*8(%2)\n\t"
+                "leaq -4*8(%2), %2\n\t"
+                "jae 8b\n\t"
+                /*
+                 * Calculate copy position to head.
+                 */
+                "addq $0x20, %0\n\t"
+                "subq %0, %1\n\t"
+                "subq %0, %2\n\t"
+                "1:\n\t"
+                "cmpq $16, %0\n\t"
+                "jb 9f\n\t"
+                /*
+                 * Move data from 16 bytes to 31 bytes.
+                 */
+                "movq 0*8(%1), %4\n\t"
+                "movq 1*8(%1), %5\n\t"
+                "movq -2*8(%1, %0), %6\n\t"
+                "movq -1*8(%1, %0), %7\n\t"
+                "movq %4, 0*8(%2)\n\t"
+                "movq %5, 1*8(%2)\n\t"
+                "movq %6, -2*8(%2, %0)\n\t"
+                "movq %7, -1*8(%2, %0)\n\t"
+                "jmp 13f\n\t"
+                ".p2align 4\n\t"
+                "9:\n\t"
+                "cmpq $8, %0\n\t"
+                "jb 10f\n\t"
+                /*
+                 * Move data from 8 bytes to 15 bytes.
+                 */
+                "movq 0*8(%1), %4\n\t"
+                "movq -1*8(%1, %0), %5\n\t"
+                "movq %4, 0*8(%2)\n\t"
+                "movq %5, -1*8(%2, %0)\n\t"
+                "jmp 13f\n\t"
+                "10:\n\t"
+                "cmpq $4, %0\n\t"
+                "jb 11f\n\t"
+                /*
+                 * Move data from 4 bytes to 7 bytes.
+                 */
+                "movl (%1), %4d\n\t"
+                "movl -4(%1, %0), %5d\n\t"
+                "movl %4d, (%2)\n\t"
+                "movl %5d, -4(%2, %0)\n\t"
+                "jmp 13f\n\t"
+                "11:\n\t"
+                "cmp $2, %0\n\t"
+                "jb 12f\n\t"
+                /*
+                 * Move data from 2 bytes to 3 bytes.
+                 */
+                "movw (%1), %4w\n\t"
+                "movw -2(%1, %0), %5w\n\t"
+                "movw %4w, (%2)\n\t"
+                "movw %5w, -2(%2, %0)\n\t"
+                "jmp 13f\n\t"
+                "12:\n\t"
+                "cmp $1, %0\n\t"
+                "jb 13f\n\t"
+                /*
+                 * Move data for 1 byte.
+                 */
+                "movb (%1), %4b\n\t"
+                "movb %4b, (%2)\n\t"
+                "13:\n\t"
+                : "=&d" (d0), "=&S" (d1), "=&D" (d2), "=&a" (ret) ,
+                  "=r"(d3), "=r"(d4), "=r"(d5), "=r"(d6), "=&c" (d7)
+                :"0" (count),
+                 "1" (src),
+                 "2" (dest)
+                :"memory");
+
+        return ret;
+
 }
 EXPORT_SYMBOL(memmove);
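
The tail branches above (labels 9, 10, 11 and 12) replace a byte-at-a-time
loop with a pair of possibly overlapping moves, one anchored at the head and
one at the tail of the remainder. A hedged C restatement for a 0..15 byte
remainder follows; copy_tail() is a hypothetical name, and the 16..31 byte
case uses the same idea with two quadwords at each end. Both ends are loaded
before anything is stored, which is what keeps the trick safe even when source
and destination overlap.

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    static void copy_tail(unsigned char *d, const unsigned char *s, size_t n)
    {
            if (n >= 8) {
                    uint64_t head, tail;

                    memcpy(&head, s, 8);            /* load both ends first */
                    memcpy(&tail, s + n - 8, 8);
                    memcpy(d, &head, 8);
                    memcpy(d + n - 8, &tail, 8);
            } else if (n >= 4) {
                    uint32_t head, tail;

                    memcpy(&head, s, 4);
                    memcpy(&tail, s + n - 4, 4);
                    memcpy(d, &head, 4);
                    memcpy(d + n - 4, &tail, 4);
            } else if (n >= 2) {
                    uint16_t head, tail;

                    memcpy(&head, s, 2);
                    memcpy(&tail, s + n - 2, 2);
                    memcpy(d, &head, 2);
                    memcpy(d + n - 2, &tail, 2);
            } else if (n == 1) {
                    *d = *s;
            }
    }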