Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

x86/entry/64: Always run ptregs-using syscalls on the slow path

64-bit syscalls currently have an optimization in which they are
called with partial pt_regs. A small handful require full
pt_regs.

In the 32-bit and compat cases, I cleaned this up by forcing
full pt_regs for all syscalls. The performance hit doesn't
really matter as the affected system calls are fundamentally
heavy and this is the 32-bit compat case.

I want to clean up the 64-bit case as well, but I don't want to
hurt fast path performance. To do that, I want to force the
syscalls that use pt_regs onto the slow path. This will enable
us to make slow path syscalls be real ABI-compliant C functions.

Use the new syscall entry qualification machinery for this.
'stub_clone' is now 'stub_clone/ptregs'.

The next patch will eliminate the stubs, and we'll just have
'sys_clone/ptregs'.

As of this patch, two-phase entry tracing is no longer used. It
has served its purpose (namely a huge speedup on some workloads
prior to more general opportunistic SYSRET support), and once
the dust settles I'll send patches to back it out.

The implementation is heavily based on a patch from Brian Gerst:

http://lkml.kernel.org/g/1449666173-15366-1-git-send-email-brgerst@gmail.com

Originally-From: Brian Gerst <brgerst@gmail.com>
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Linux Kernel Mailing List <linux-kernel@vger.kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/b9beda88460bcefec6e7d792bd44eca9b760b0c4.1454022279.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>

Authored by Andy Lutomirski; committed by Ingo Molnar
302f5b26 cfcbadb4

+55 -24
+42 -14
arch/x86/entry/entry_64.S
··· 182 182 #endif 183 183 ja 1f /* return -ENOSYS (already in pt_regs->ax) */ 184 184 movq %r10, %rcx 185 + 186 + /* 187 + * This call instruction is handled specially in stub_ptregs_64. 188 + * It might end up jumping to the slow path. If it jumps, RAX is 189 + * clobbered. 190 + */ 185 191 call *sys_call_table(, %rax, 8) 192 + .Lentry_SYSCALL_64_after_fastpath_call: 193 + 186 194 movq %rax, RAX(%rsp) 187 195 1: 188 196 /* ··· 243 235 244 236 /* Do syscall entry tracing */ 245 237 tracesys: 246 - movq %rsp, %rdi 247 - movl $AUDIT_ARCH_X86_64, %esi 248 - call syscall_trace_enter_phase1 249 - test %rax, %rax 250 - jnz tracesys_phase2 /* if needed, run the slow path */ 251 - RESTORE_C_REGS_EXCEPT_RAX /* else restore clobbered regs */ 252 - movq ORIG_RAX(%rsp), %rax 253 - jmp entry_SYSCALL_64_fastpath /* and return to the fast path */ 254 - 255 - tracesys_phase2: 256 238 SAVE_EXTRA_REGS 257 239 movq %rsp, %rdi 258 - movl $AUDIT_ARCH_X86_64, %esi 259 - movq %rax, %rdx 260 - call syscall_trace_enter_phase2 240 + call syscall_trace_enter 261 241 262 242 /* 263 243 * Reload registers from stack in case ptrace changed them. 264 - * We don't reload %rax because syscall_trace_entry_phase2() returned 244 + * We don't reload %rax because syscall_trace_enter() returned 265 245 * the value it wants us to use in the table lookup. 266 246 */ 267 247 RESTORE_C_REGS_EXCEPT_RAX ··· 351 355 jmp restore_c_regs_and_iret 352 356 END(entry_SYSCALL_64) 353 357 358 + ENTRY(stub_ptregs_64) 359 + /* 360 + * Syscalls marked as needing ptregs land here. 361 + * If we are on the fast path, we need to save the extra regs. 362 + * If we are on the slow path, the extra regs are already saved. 363 + * 364 + * RAX stores a pointer to the C function implementing the syscall. 
365 + */ 366 + cmpq $.Lentry_SYSCALL_64_after_fastpath_call, (%rsp) 367 + jne 1f 368 + 369 + /* Called from fast path -- pop return address and jump to slow path */ 370 + popq %rax 371 + jmp tracesys /* called from fast path */ 372 + 373 + 1: 374 + /* Called from C */ 375 + jmp *%rax /* called from C */ 376 + END(stub_ptregs_64) 377 + 378 + .macro ptregs_stub func 379 + ENTRY(ptregs_\func) 380 + leaq \func(%rip), %rax 381 + jmp stub_ptregs_64 382 + END(ptregs_\func) 383 + .endm 384 + 385 + /* Instantiate ptregs_stub for each ptregs-using syscall */ 386 + #define __SYSCALL_64_QUAL_(sym) 387 + #define __SYSCALL_64_QUAL_ptregs(sym) ptregs_stub sym 388 + #define __SYSCALL_64(nr, sym, qual) __SYSCALL_64_QUAL_##qual(sym) 389 + #include <asm/syscalls_64.h> 354 390 355 391 .macro FORK_LIKE func 356 392 ENTRY(stub_\func)
+5 -2
arch/x86/entry/syscall_64.c
··· 6 6 #include <asm/asm-offsets.h> 7 7 #include <asm/syscall.h> 8 8 9 - #define __SYSCALL_64(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; 9 + #define __SYSCALL_64_QUAL_(sym) sym 10 + #define __SYSCALL_64_QUAL_ptregs(sym) ptregs_##sym 11 + 12 + #define __SYSCALL_64(nr, sym, qual) extern asmlinkage long __SYSCALL_64_QUAL_##qual(sym)(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); 10 13 #include <asm/syscalls_64.h> 11 14 #undef __SYSCALL_64 12 15 13 - #define __SYSCALL_64(nr, sym, qual) [nr] = sym, 16 + #define __SYSCALL_64(nr, sym, qual) [nr] = __SYSCALL_64_QUAL_##qual(sym), 14 17 15 18 extern long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); 16 19
+8 -8
arch/x86/entry/syscalls/syscall_64.tbl
··· 21 21 12 common brk sys_brk 22 22 13 64 rt_sigaction sys_rt_sigaction 23 23 14 common rt_sigprocmask sys_rt_sigprocmask 24 - 15 64 rt_sigreturn stub_rt_sigreturn 24 + 15 64 rt_sigreturn stub_rt_sigreturn/ptregs 25 25 16 64 ioctl sys_ioctl 26 26 17 common pread64 sys_pread64 27 27 18 common pwrite64 sys_pwrite64 ··· 62 62 53 common socketpair sys_socketpair 63 63 54 64 setsockopt sys_setsockopt 64 64 55 64 getsockopt sys_getsockopt 65 - 56 common clone stub_clone 66 - 57 common fork stub_fork 67 - 58 common vfork stub_vfork 68 - 59 64 execve stub_execve 65 + 56 common clone stub_clone/ptregs 66 + 57 common fork stub_fork/ptregs 67 + 58 common vfork stub_vfork/ptregs 68 + 59 64 execve stub_execve/ptregs 69 69 60 common exit sys_exit 70 70 61 common wait4 sys_wait4 71 71 62 common kill sys_kill ··· 328 328 319 common memfd_create sys_memfd_create 329 329 320 common kexec_file_load sys_kexec_file_load 330 330 321 common bpf sys_bpf 331 - 322 64 execveat stub_execveat 331 + 322 64 execveat stub_execveat/ptregs 332 332 323 common userfaultfd sys_userfaultfd 333 333 324 common membarrier sys_membarrier 334 334 325 common mlock2 sys_mlock2 ··· 346 346 517 x32 recvfrom compat_sys_recvfrom 347 347 518 x32 sendmsg compat_sys_sendmsg 348 348 519 x32 recvmsg compat_sys_recvmsg 349 - 520 x32 execve stub_x32_execve 349 + 520 x32 execve stub_x32_execve/ptregs 350 350 521 x32 ptrace compat_sys_ptrace 351 351 522 x32 rt_sigpending compat_sys_rt_sigpending 352 352 523 x32 rt_sigtimedwait compat_sys_rt_sigtimedwait ··· 371 371 542 x32 getsockopt compat_sys_getsockopt 372 372 543 x32 io_setup compat_sys_io_setup 373 373 544 x32 io_submit compat_sys_io_submit 374 - 545 x32 execveat stub_x32_execveat 374 + 545 x32 execveat stub_x32_execveat/ptregs