Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

x86: improve on the non-rep 'copy_user' function

The old 'copy_user_generic_unrolled' function was oddly implemented,
largely for historical reasons: it had been based on the uncached
copy case, which has concerns of its own.

For example, the __copy_user_nocache() function uses 'movnti' for the
destination stores, and those want the destination to be aligned. In
contrast, the regular copy function doesn't really care, and trying to
align things only complicates matters.

Also, like the clear_user function, the copy function had some odd
handling of the repeat counts, complicating the exception handling for
no good reason. So, as with clear_user, just write it to keep the whole
byte count in the %rcx register, exactly like the 'rep movs'
functionality that this replaces.

Unlike a real 'rep movs', we do allow this to clobber a few temporary
registers, so that it does not have to save and restore registers on
the stack unnecessarily.

And like the clearing case, rename this to what it now clearly is:
'rep_movs_alternative', and make it one coherent function, so that it
shows up as such in profiles (instead of the odd split between
"copy_user_generic_unrolled" and "copy_user_short_string", the latter of
which was not about strings at all, and which was shared with the
uncached case).

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

+147 -170
+4 -4
arch/x86/include/asm/uaccess_64.h
··· 18 18 19 19 /* Handles exceptions in both to and from, but doesn't do access_ok */ 20 20 __must_check unsigned long 21 - copy_user_generic_unrolled(void *to, const void *from, unsigned len); 21 + rep_movs_alternative(void *to, const void *from, unsigned len); 22 22 23 23 static __always_inline __must_check unsigned long 24 24 copy_user_generic(void *to, const void *from, unsigned long len) ··· 26 26 stac(); 27 27 /* 28 28 * If CPU has FSRM feature, use 'rep movs'. 29 - * Otherwise, use copy_user_generic_unrolled. 29 + * Otherwise, use rep_movs_alternative. 30 30 */ 31 31 asm volatile( 32 32 "1:\n\t" 33 33 ALTERNATIVE("rep movsb", 34 - "call copy_user_generic_unrolled", ALT_NOT(X86_FEATURE_FSRM)) 34 + "call rep_movs_alternative", ALT_NOT(X86_FEATURE_FSRM)) 35 35 "2:\n" 36 36 _ASM_EXTABLE_UA(1b, 2b) 37 37 :"+c" (len), "+D" (to), "+S" (from), ASM_CALL_CONSTRAINT 38 - : : "memory", "rax", "rdx", "r8", "r9", "r10", "r11"); 38 + : : "memory", "rax", "r8", "r9", "r10", "r11"); 39 39 clac(); 40 40 return len; 41 41 }
+142 -165
arch/x86/lib/copy_user_64.S
··· 17 17 #include <asm/export.h> 18 18 #include <asm/trapnr.h> 19 19 20 + /* 21 + * rep_movs_alternative - memory copy with exception handling. 22 + * This version is for CPUs that don't have FSRM (Fast Short Rep Movs) 23 + * 24 + * Input: 25 + * rdi destination 26 + * rsi source 27 + * rcx count 28 + * 29 + * Output: 30 + * rcx uncopied bytes or 0 if successful. 31 + * 32 + * NOTE! The calling convention is very intentionally the same as 33 + * for 'rep movs', so that we can rewrite the function call with 34 + * just a plain 'rep movs' on machines that have FSRM. But to make 35 + * it simpler for us, we can clobber rsi/rdi and rax/r8-r11 freely. 36 + */ 37 + SYM_FUNC_START(rep_movs_alternative) 38 + cmpq $64,%rcx 39 + jae .Lunrolled 40 + 41 + cmp $8,%ecx 42 + jae .Lword 43 + 44 + testl %ecx,%ecx 45 + je .Lexit 46 + 47 + .Lcopy_user_tail: 48 + 0: movb (%rsi),%al 49 + 1: movb %al,(%rdi) 50 + inc %rdi 51 + inc %rsi 52 + dec %rcx 53 + jne .Lcopy_user_tail 54 + .Lexit: 55 + RET 56 + 57 + _ASM_EXTABLE_UA( 0b, .Lexit) 58 + _ASM_EXTABLE_UA( 1b, .Lexit) 59 + 60 + .p2align 4 61 + .Lword: 62 + 2: movq (%rsi),%rax 63 + 3: movq %rax,(%rdi) 64 + addq $8,%rsi 65 + addq $8,%rdi 66 + sub $8,%ecx 67 + je .Lexit 68 + cmp $8,%ecx 69 + jae .Lword 70 + jmp .Lcopy_user_tail 71 + 72 + _ASM_EXTABLE_UA( 2b, .Lcopy_user_tail) 73 + _ASM_EXTABLE_UA( 3b, .Lcopy_user_tail) 74 + 75 + .p2align 4 76 + .Lunrolled: 77 + 10: movq (%rsi),%r8 78 + 11: movq 8(%rsi),%r9 79 + 12: movq 16(%rsi),%r10 80 + 13: movq 24(%rsi),%r11 81 + 14: movq %r8,(%rdi) 82 + 15: movq %r9,8(%rdi) 83 + 16: movq %r10,16(%rdi) 84 + 17: movq %r11,24(%rdi) 85 + 20: movq 32(%rsi),%r8 86 + 21: movq 40(%rsi),%r9 87 + 22: movq 48(%rsi),%r10 88 + 23: movq 56(%rsi),%r11 89 + 24: movq %r8,32(%rdi) 90 + 25: movq %r9,40(%rdi) 91 + 26: movq %r10,48(%rdi) 92 + 27: movq %r11,56(%rdi) 93 + addq $64,%rsi 94 + addq $64,%rdi 95 + subq $64,%rcx 96 + cmpq $64,%rcx 97 + jae .Lunrolled 98 + cmpl $8,%ecx 99 + jae .Lword 100 + testl %ecx,%ecx 101 + 
jne .Lcopy_user_tail 102 + RET 103 + 104 + _ASM_EXTABLE_UA(10b, .Lcopy_user_tail) 105 + _ASM_EXTABLE_UA(11b, .Lcopy_user_tail) 106 + _ASM_EXTABLE_UA(12b, .Lcopy_user_tail) 107 + _ASM_EXTABLE_UA(13b, .Lcopy_user_tail) 108 + _ASM_EXTABLE_UA(14b, .Lcopy_user_tail) 109 + _ASM_EXTABLE_UA(15b, .Lcopy_user_tail) 110 + _ASM_EXTABLE_UA(16b, .Lcopy_user_tail) 111 + _ASM_EXTABLE_UA(17b, .Lcopy_user_tail) 112 + _ASM_EXTABLE_UA(20b, .Lcopy_user_tail) 113 + _ASM_EXTABLE_UA(21b, .Lcopy_user_tail) 114 + _ASM_EXTABLE_UA(22b, .Lcopy_user_tail) 115 + _ASM_EXTABLE_UA(23b, .Lcopy_user_tail) 116 + _ASM_EXTABLE_UA(24b, .Lcopy_user_tail) 117 + _ASM_EXTABLE_UA(25b, .Lcopy_user_tail) 118 + _ASM_EXTABLE_UA(26b, .Lcopy_user_tail) 119 + _ASM_EXTABLE_UA(27b, .Lcopy_user_tail) 120 + SYM_FUNC_END(rep_movs_alternative) 121 + EXPORT_SYMBOL(rep_movs_alternative) 122 + 123 + /* 124 + * The uncached copy needs to align the destination for 125 + * movnti and friends. 126 + */ 20 127 .macro ALIGN_DESTINATION 21 128 /* check for bad alignment of destination */ 22 129 movl %edi,%ecx ··· 144 37 _ASM_EXTABLE_CPY(101b, .Lcopy_user_handle_align) 145 38 .endm 146 39 147 - /* 148 - * copy_user_generic_unrolled - memory copy with exception handling. 149 - * This version is for CPUs like P4 that don't have efficient micro 150 - * code for rep movsq 151 - * 152 - * Input: 153 - * rdi destination 154 - * rsi source 155 - * rcx count 156 - * 157 - * Output: 158 - * rcx uncopied bytes or 0 if successful. 159 - * 160 - * NOTE! The calling convention is very intentionally the same as 161 - * for 'rep movs', so that we can rewrite the function call with 162 - * just a plain 'rep movs' on machines that have FSRM. 163 - * 164 - * HOWEVER! This function ends up having a lot of the code common 165 - * with __copy_user_nocache(), which is a normal C function, and 166 - * has a similar calling convention, but gets the 'count' in %rdx, 167 - * and returns the result in %rax. 
168 - * 169 - * To share as much code as possible, we end up returning the 170 - * result in *both* %rcx/%rax, and we also move the initial count 171 - * into %rdx. 172 - * 173 - * We can clobber rdx/rsi/rdi and r8-r11 174 - */ 175 - SYM_FUNC_START(copy_user_generic_unrolled) 176 - movl %ecx,%edx 177 - cmpl $8,%ecx 178 - jb .Lcopy_user_short_string_bytes 179 - ALIGN_DESTINATION 180 - movl %edx,%ecx 181 - andl $63,%edx 182 - shrl $6,%ecx 183 - jz copy_user_short_string 184 - 1: movq (%rsi),%r8 185 - 2: movq 1*8(%rsi),%r9 186 - 3: movq 2*8(%rsi),%r10 187 - 4: movq 3*8(%rsi),%r11 188 - 5: movq %r8,(%rdi) 189 - 6: movq %r9,1*8(%rdi) 190 - 7: movq %r10,2*8(%rdi) 191 - 8: movq %r11,3*8(%rdi) 192 - 9: movq 4*8(%rsi),%r8 193 - 10: movq 5*8(%rsi),%r9 194 - 11: movq 6*8(%rsi),%r10 195 - 12: movq 7*8(%rsi),%r11 196 - 13: movq %r8,4*8(%rdi) 197 - 14: movq %r9,5*8(%rdi) 198 - 15: movq %r10,6*8(%rdi) 199 - 16: movq %r11,7*8(%rdi) 200 - leaq 64(%rsi),%rsi 201 - leaq 64(%rdi),%rdi 202 - decl %ecx 203 - jnz 1b 204 - jmp copy_user_short_string 205 - 206 - 30: shll $6,%ecx 207 - addl %ecx,%edx 208 - jmp .Lcopy_user_handle_tail 209 - 210 - _ASM_EXTABLE_CPY(1b, 30b) 211 - _ASM_EXTABLE_CPY(2b, 30b) 212 - _ASM_EXTABLE_CPY(3b, 30b) 213 - _ASM_EXTABLE_CPY(4b, 30b) 214 - _ASM_EXTABLE_CPY(5b, 30b) 215 - _ASM_EXTABLE_CPY(6b, 30b) 216 - _ASM_EXTABLE_CPY(7b, 30b) 217 - _ASM_EXTABLE_CPY(8b, 30b) 218 - _ASM_EXTABLE_CPY(9b, 30b) 219 - _ASM_EXTABLE_CPY(10b, 30b) 220 - _ASM_EXTABLE_CPY(11b, 30b) 221 - _ASM_EXTABLE_CPY(12b, 30b) 222 - _ASM_EXTABLE_CPY(13b, 30b) 223 - _ASM_EXTABLE_CPY(14b, 30b) 224 - _ASM_EXTABLE_CPY(15b, 30b) 225 - _ASM_EXTABLE_CPY(16b, 30b) 226 - SYM_FUNC_END(copy_user_generic_unrolled) 227 - EXPORT_SYMBOL(copy_user_generic_unrolled) 228 - 229 - /* 230 - * Try to copy last bytes and clear the rest if needed. 231 - * Since protection fault in copy_from/to_user is not a normal situation, 232 - * it is not necessary to optimize tail handling. 
233 - * Don't try to copy the tail if machine check happened 234 - * 235 - * Input: 236 - * eax trap number written by ex_handler_copy() 237 - * rdi destination 238 - * rsi source 239 - * rdx count 240 - * 241 - * Output: 242 - * eax uncopied bytes or 0 if successful. 243 - */ 244 - SYM_CODE_START_LOCAL(.Lcopy_user_handle_tail) 245 - cmp $X86_TRAP_MC,%eax 246 - je 3f 247 - 248 - movl %edx,%ecx 249 - 1: rep movsb 250 - 2: mov %ecx,%eax 251 - RET 252 - 253 - 3: 254 - movl %edx,%eax 255 - movl %edx,%ecx 256 - RET 257 - 258 - _ASM_EXTABLE_CPY(1b, 2b) 259 - 260 - .Lcopy_user_handle_align: 261 - addl %ecx,%edx /* ecx is zerorest also */ 262 - jmp .Lcopy_user_handle_tail 263 - 264 - SYM_CODE_END(.Lcopy_user_handle_tail) 265 - 266 - /* 267 - * Finish memcpy of less than 64 bytes. #AC should already be set. 268 - * 269 - * Input: 270 - * rdi destination 271 - * rsi source 272 - * rdx count (< 64) 273 - * 274 - * Output: 275 - * eax uncopied bytes or 0 if successful. 276 - */ 277 - SYM_CODE_START_LOCAL(copy_user_short_string) 278 - movl %edx,%ecx 279 - andl $7,%edx 280 - shrl $3,%ecx 281 - jz .Lcopy_user_short_string_bytes 282 - 18: movq (%rsi),%r8 283 - 19: movq %r8,(%rdi) 284 - leaq 8(%rsi),%rsi 285 - leaq 8(%rdi),%rdi 286 - decl %ecx 287 - jnz 18b 288 - .Lcopy_user_short_string_bytes: 289 - andl %edx,%edx 290 - jz 23f 291 - movl %edx,%ecx 292 - 21: movb (%rsi),%al 293 - 22: movb %al,(%rdi) 294 - incq %rsi 295 - incq %rdi 296 - decl %ecx 297 - jnz 21b 298 - 23: xor %eax,%eax 299 - xor %ecx,%ecx 300 - RET 301 - 302 - 40: leal (%rdx,%rcx,8),%edx 303 - jmp 60f 304 - 50: movl %ecx,%edx /* ecx is zerorest also */ 305 - 60: jmp .Lcopy_user_handle_tail 306 - 307 - _ASM_EXTABLE_CPY(18b, 40b) 308 - _ASM_EXTABLE_CPY(19b, 40b) 309 - _ASM_EXTABLE_CPY(21b, 50b) 310 - _ASM_EXTABLE_CPY(22b, 50b) 311 - SYM_CODE_END(copy_user_short_string) 312 40 313 41 /* 314 42 * copy_user_nocache - Uncached memory copy with exception handling ··· 288 346 _ASM_EXTABLE_CPY(31b, .L_fixup_4b_copy) 289 347 
_ASM_EXTABLE_CPY(40b, .L_fixup_1b_copy) 290 348 _ASM_EXTABLE_CPY(41b, .L_fixup_1b_copy) 349 + 350 + /* 351 + * Try to copy last bytes and clear the rest if needed. 352 + * Since protection fault in copy_from/to_user is not a normal situation, 353 + * it is not necessary to optimize tail handling. 354 + * Don't try to copy the tail if machine check happened 355 + * 356 + * Input: 357 + * eax trap number written by ex_handler_copy() 358 + * rdi destination 359 + * rsi source 360 + * rdx count 361 + * 362 + * Output: 363 + * eax uncopied bytes or 0 if successful. 364 + */ 365 + .Lcopy_user_handle_tail: 366 + cmp $X86_TRAP_MC,%eax 367 + je 3f 368 + 369 + movl %edx,%ecx 370 + 1: rep movsb 371 + 2: mov %ecx,%eax 372 + RET 373 + 374 + 3: 375 + movl %edx,%eax 376 + RET 377 + 378 + _ASM_EXTABLE_CPY(1b, 2b) 379 + 380 + .Lcopy_user_handle_align: 381 + addl %ecx,%edx /* ecx is zerorest also */ 382 + jmp .Lcopy_user_handle_tail 383 + 291 384 SYM_FUNC_END(__copy_user_nocache) 292 385 EXPORT_SYMBOL(__copy_user_nocache)
+1 -1
tools/objtool/check.c
··· 1285 1285 "copy_mc_enhanced_fast_string", 1286 1286 "ftrace_likely_update", /* CONFIG_TRACE_BRANCH_PROFILING */ 1287 1287 "rep_stos_alternative", 1288 - "copy_user_generic_unrolled", 1288 + "rep_movs_alternative", 1289 1289 "__copy_user_nocache", 1290 1290 NULL 1291 1291 };