amd64: switch csum_partial_copy_generic() to new calling conventions

Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

kernel os linux

... and fold the handling of the misaligned case into it.

Implementation note: we stash the "will we need to rol8 the sum in the end"
flag (i.e. whether the final 32-bit sum must be rotated left by 8 bits) into
the MSB of %rcx (the lower 32 bits are used for length); the rest
is pretty straightforward.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

Al Viro 5 years ago daf52375 fdf8bee9

+94 -123

3 changed files

expand all

arch

x86

include

asm

checksum_64.h

lib

csum-copy_64.S

csum-wrappers_64.c

+1 -4

arch/x86/include/asm/checksum_64.h

··· 130 130 extern __wsum csum_partial(const void *buff, int len, __wsum sum); 131 131 132 132 /* Do not call this directly. Use the wrappers below */ 133 - extern __visible __wsum csum_partial_copy_generic(const void *src, const void *dst, 134 - int len, __wsum sum, 135 - int *src_err_ptr, int *dst_err_ptr); 136 - 133 + extern __visible __wsum csum_partial_copy_generic(const void *src, void *dst, int len); 137 134 138 135 extern __wsum csum_and_copy_from_user(const void __user *src, void *dst, int len); 139 136 extern __wsum csum_and_copy_to_user(const void *src, void __user *dst, int len);

+84 -56

arch/x86/lib/csum-copy_64.S

··· 18 18 * rdi source 19 19 * rsi destination 20 20 * edx len (32bit) 21 - * ecx sum (32bit) 22 - * r8 src_err_ptr (int) 23 - * r9 dst_err_ptr (int) 24 21 * 25 22 * Output 26 23 * eax 64bit sum. undefined in case of exception. ··· 28 31 29 32 .macro source 30 33 10: 31 - _ASM_EXTABLE_UA(10b, .Lbad_source) 34 + _ASM_EXTABLE_UA(10b, .Lfault) 32 35 .endm 33 36 34 37 .macro dest 35 38 20: 36 - _ASM_EXTABLE_UA(20b, .Lbad_dest) 39 + _ASM_EXTABLE_UA(20b, .Lfault) 37 40 .endm 38 - 39 - /* 40 - * No _ASM_EXTABLE_UA; this is used for intentional prefetch on a 41 - * potentially unmapped kernel address. 42 - */ 43 - .macro ignore L=.Lignore 44 - 30: 45 - _ASM_EXTABLE(30b, \L) 46 - .endm 47 - 48 41 49 42 SYM_FUNC_START(csum_partial_copy_generic) 50 - cmpl $3*64, %edx 51 - jle .Lignore 43 + subq $5*8, %rsp 44 + movq %rbx, 0*8(%rsp) 45 + movq %r12, 1*8(%rsp) 46 + movq %r14, 2*8(%rsp) 47 + movq %r13, 3*8(%rsp) 48 + movq %r15, 4*8(%rsp) 52 49 53 - .Lignore: 54 - subq $7*8, %rsp 55 - movq %rbx, 2*8(%rsp) 56 - movq %r12, 3*8(%rsp) 57 - movq %r14, 4*8(%rsp) 58 - movq %r13, 5*8(%rsp) 59 - movq %r15, 6*8(%rsp) 60 - 61 - movq %r8, (%rsp) 62 - movq %r9, 1*8(%rsp) 63 - 64 - movl %ecx, %eax 65 - movl %edx, %ecx 66 - 50 + movl $-1, %eax 67 51 xorl %r9d, %r9d 68 - movq %rcx, %r12 52 + movl %edx, %ecx 53 + cmpl $8, %ecx 54 + jb .Lshort 55 + 56 + testb $7, %sil 57 + jne .Lunaligned 58 + .Laligned: 59 + movl %ecx, %r12d 69 60 70 61 shrq $6, %r12 71 62 jz .Lhandle_tail /* < 64 */ ··· 84 99 source 85 100 movq 56(%rdi), %r13 86 101 87 - ignore 2f 102 + 30: 103 + /* 104 + * No _ASM_EXTABLE_UA; this is used for intentional prefetch on a 105 + * potentially unmapped kernel address. 
106 + */ 107 + _ASM_EXTABLE(30b, 2f) 88 108 prefetcht0 5*64(%rdi) 89 109 2: 90 110 adcq %rbx, %rax ··· 121 131 dest 122 132 movq %r13, 56(%rsi) 123 133 124 - 3: 125 - 126 134 leaq 64(%rdi), %rdi 127 135 leaq 64(%rsi), %rsi 128 136 ··· 130 142 131 143 /* do last up to 56 bytes */ 132 144 .Lhandle_tail: 133 - /* ecx: count */ 134 - movl %ecx, %r10d 145 + /* ecx: count, rcx.63: the end result needs to be rol8 */ 146 + movq %rcx, %r10 135 147 andl $63, %ecx 136 148 shrl $3, %ecx 137 149 jz .Lfold ··· 160 172 .Lhandle_7: 161 173 movl %r10d, %ecx 162 174 andl $7, %ecx 175 + .L1: /* .Lshort rejoins the common path here */ 163 176 shrl $1, %ecx 164 177 jz .Lhandle_1 165 178 movl $2, %edx ··· 192 203 adcl %r9d, %eax /* carry */ 193 204 194 205 .Lende: 195 - movq 2*8(%rsp), %rbx 196 - movq 3*8(%rsp), %r12 197 - movq 4*8(%rsp), %r14 198 - movq 5*8(%rsp), %r13 199 - movq 6*8(%rsp), %r15 200 - addq $7*8, %rsp 206 + testq %r10, %r10 207 + js .Lwas_odd 208 + .Lout: 209 + movq 0*8(%rsp), %rbx 210 + movq 1*8(%rsp), %r12 211 + movq 2*8(%rsp), %r14 212 + movq 3*8(%rsp), %r13 213 + movq 4*8(%rsp), %r15 214 + addq $5*8, %rsp 201 215 ret 216 + .Lshort: 217 + movl %ecx, %r10d 218 + jmp .L1 219 + .Lunaligned: 220 + xorl %ebx, %ebx 221 + testb $1, %sil 222 + jne .Lodd 223 + 1: testb $2, %sil 224 + je 2f 225 + source 226 + movw (%rdi), %bx 227 + dest 228 + movw %bx, (%rsi) 229 + leaq 2(%rdi), %rdi 230 + subq $2, %rcx 231 + leaq 2(%rsi), %rsi 232 + addq %rbx, %rax 233 + 2: testb $4, %sil 234 + je .Laligned 235 + source 236 + movl (%rdi), %ebx 237 + dest 238 + movl %ebx, (%rsi) 239 + leaq 4(%rdi), %rdi 240 + subq $4, %rcx 241 + leaq 4(%rsi), %rsi 242 + addq %rbx, %rax 243 + jmp .Laligned 202 244 203 - /* Exception handlers. 
Very simple, zeroing is done in the wrappers */ 204 - .Lbad_source: 205 - movq (%rsp), %rax 206 - testq %rax, %rax 207 - jz .Lende 208 - movl $-EFAULT, (%rax) 209 - jmp .Lende 245 + .Lodd: 246 + source 247 + movb (%rdi), %bl 248 + dest 249 + movb %bl, (%rsi) 250 + leaq 1(%rdi), %rdi 251 + leaq 1(%rsi), %rsi 252 + /* decrement, set MSB */ 253 + leaq -1(%rcx, %rcx), %rcx 254 + rorq $1, %rcx 255 + shll $8, %ebx 256 + addq %rbx, %rax 257 + jmp 1b 210 258 211 - .Lbad_dest: 212 - movq 8(%rsp), %rax 213 - testq %rax, %rax 214 - jz .Lende 215 - movl $-EFAULT, (%rax) 216 - jmp .Lende 259 + .Lwas_odd: 260 + roll $8, %eax 261 + jmp .Lout 262 + 263 + /* Exception: just return 0 */ 264 + .Lfault: 265 + xorl %eax, %eax 266 + jmp .Lout 217 267 SYM_FUNC_END(csum_partial_copy_generic)

+9 -63

arch/x86/lib/csum-wrappers_64.c

··· 21 21 * src and dst are best aligned to 64bits. 22 22 */ 23 23 __wsum 24 - csum_and_copy_from_user(const void __user *src, void *dst, 25 - int len) 24 + csum_and_copy_from_user(const void __user *src, void *dst, int len) 26 25 { 27 - int err = 0; 28 - __wsum isum = ~0U; 26 + __wsum sum; 29 27 30 28 might_sleep(); 31 - 32 29 if (!user_access_begin(src, len)) 33 30 return 0; 34 - 35 - /* 36 - * Why 6, not 7? To handle odd addresses aligned we 37 - * would need to do considerable complications to fix the 38 - * checksum which is defined as an 16bit accumulator. The 39 - * fix alignment code is primarily for performance 40 - * compatibility with 32bit and that will handle odd 41 - * addresses slowly too. 42 - */ 43 - if (unlikely((unsigned long)src & 6)) { 44 - while (((unsigned long)src & 6) && len >= 2) { 45 - __u16 val16; 46 - 47 - unsafe_get_user(val16, (const __u16 __user *)src, out); 48 - 49 - *(__u16 *)dst = val16; 50 - isum = (__force __wsum)add32_with_carry( 51 - (__force unsigned)isum, val16); 52 - src += 2; 53 - dst += 2; 54 - len -= 2; 55 - } 56 - } 57 - isum = csum_partial_copy_generic((__force const void *)src, 58 - dst, len, isum, &err, NULL); 31 + sum = csum_partial_copy_generic((__force const void *)src, dst, len); 59 32 user_access_end(); 60 - if (unlikely(err)) 61 - isum = 0; 62 - return isum; 63 - 64 - out: 65 - user_access_end(); 66 - return 0; 33 + return sum; 67 34 } 68 35 EXPORT_SYMBOL(csum_and_copy_from_user); 69 36 ··· 46 79 * src and dst are best aligned to 64bits. 
47 80 */ 48 81 __wsum 49 - csum_and_copy_to_user(const void *src, void __user *dst, 50 - int len) 82 + csum_and_copy_to_user(const void *src, void __user *dst, int len) 51 83 { 52 - __wsum ret, isum = ~0U; 53 - int err = 0; 84 + __wsum sum; 54 85 55 86 might_sleep(); 56 - 57 87 if (!user_access_begin(dst, len)) 58 88 return 0; 59 - 60 - if (unlikely((unsigned long)dst & 6)) { 61 - while (((unsigned long)dst & 6) && len >= 2) { 62 - __u16 val16 = *(__u16 *)src; 63 - 64 - isum = (__force __wsum)add32_with_carry( 65 - (__force unsigned)isum, val16); 66 - unsafe_put_user(val16, (__u16 __user *)dst, out); 67 - src += 2; 68 - dst += 2; 69 - len -= 2; 70 - } 71 - } 72 - 73 - ret = csum_partial_copy_generic(src, (void __force *)dst, 74 - len, isum, NULL, &err); 89 + sum = csum_partial_copy_generic(src, (void __force *)dst, len); 75 90 user_access_end(); 76 - return err ? 0 : ret; 77 - out: 78 - user_access_end(); 79 - return 0; 91 + return sum; 80 92 } 81 93 EXPORT_SYMBOL(csum_and_copy_to_user); 82 94 ··· 71 125 __wsum 72 126 csum_partial_copy_nocheck(const void *src, void *dst, int len) 73 127 { 74 - return csum_partial_copy_generic(src, dst, len, 0, NULL, NULL); 128 + return csum_partial_copy_generic(src, dst, len); 75 129 } 76 130 EXPORT_SYMBOL(csum_partial_copy_nocheck); 77 131