Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

x86/entry/64: Migrate the 64-bit syscall slow path to C

This is more complicated than the 32-bit and compat cases
because it preserves an asm fast path for the case where the
callee-saved regs aren't needed in pt_regs and no entry or exit
work needs to be done.

This appears to slow down fastpath syscalls by no more than one
cycle on my Skylake laptop.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/ce2335a4d42dc164b24132ee5e8c7716061f947b.1454022279.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>

Authored by Andy Lutomirski and committed by Ingo Molnar
(commit 1e423bff, parent 24d978b7)

+67 -80
+26
arch/x86/entry/common.c
···
344 344  	prepare_exit_to_usermode(regs);
345 345  }
346 346
    347 +#ifdef CONFIG_X86_64
    348 +__visible void do_syscall_64(struct pt_regs *regs)
    349 +{
    350 +	struct thread_info *ti = pt_regs_to_thread_info(regs);
    351 +	unsigned long nr = regs->orig_ax;
    352 +
    353 +	local_irq_enable();
    354 +
    355 +	if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY)
    356 +		nr = syscall_trace_enter(regs);
    357 +
    358 +	/*
    359 +	 * NB: Native and x32 syscalls are dispatched from the same
    360 +	 * table.  The only functional difference is the x32 bit in
    361 +	 * regs->orig_ax, which changes the behavior of some syscalls.
    362 +	 */
    363 +	if (likely((nr & __SYSCALL_MASK) < NR_syscalls)) {
    364 +		regs->ax = sys_call_table[nr & __SYSCALL_MASK](
    365 +			regs->di, regs->si, regs->dx,
    366 +			regs->r10, regs->r8, regs->r9);
    367 +	}
    368 +
    369 +	syscall_return_slowpath(regs);
    370 +}
    371 +#endif
    372 +
347 373  #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
348 374  /*
349 375   * Does a 32-bit syscall.  Called with IRQs on and does all entry and
+41 -80
arch/x86/entry/entry_64.S
···
145 145  	movq	%rsp, PER_CPU_VAR(rsp_scratch)
146 146  	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
147 147
    148 +	TRACE_IRQS_OFF
    149 +
148 150  	/* Construct struct pt_regs on stack */
149 151  	pushq	$__USER_DS			/* pt_regs->ss */
150 152  	pushq	PER_CPU_VAR(rsp_scratch)	/* pt_regs->sp */
151     -	/*
152     -	 * Re-enable interrupts.
153     -	 * We use 'rsp_scratch' as a scratch space, hence irq-off block above
154     -	 * must execute atomically in the face of possible interrupt-driven
155     -	 * task preemption. We must enable interrupts only after we're done
156     -	 * with using rsp_scratch:
157     -	 */
158     -	ENABLE_INTERRUPTS(CLBR_NONE)
159 153  	pushq	%r11				/* pt_regs->flags */
160 154  	pushq	$__USER_CS			/* pt_regs->cs */
161 155  	pushq	%rcx				/* pt_regs->ip */
···
165 171  	pushq	%r11				/* pt_regs->r11 */
166 172  	sub	$(6*8), %rsp			/* pt_regs->bp, bx, r12-15 not saved */
167 173
168     -	testl	$_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
169     -	jnz	tracesys
    174 +	/*
    175 +	 * If we need to do entry work or if we guess we'll need to do
    176 +	 * exit work, go straight to the slow path.
    177 +	 */
    178 +	testl	$_TIF_WORK_SYSCALL_ENTRY|_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
    179 +	jnz	entry_SYSCALL64_slow_path
    180 +
170 181  entry_SYSCALL_64_fastpath:
    182 +	/*
    183 +	 * Easy case: enable interrupts and issue the syscall.  If the syscall
    184 +	 * needs pt_regs, we'll call a stub that disables interrupts again
    185 +	 * and jumps to the slow path.
    186 +	 */
    187 +	TRACE_IRQS_ON
    188 +	ENABLE_INTERRUPTS(CLBR_NONE)
171 189  #if __SYSCALL_MASK == ~0
172 190  	cmpq	$__NR_syscall_max, %rax
173 191  #else
···
199 193
200 194  	movq	%rax, RAX(%rsp)
201 195  1:
202     -	/*
203     -	 * Syscall return path ending with SYSRET (fast path).
204     -	 * Has incompletely filled pt_regs.
205     -	 */
206     -	LOCKDEP_SYS_EXIT
    196 +
207 197  	/*
208     -	 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
209     -	 * it is too small to ever cause noticeable irq latency.
    198 +	 * If we get here, then we know that pt_regs is clean for SYSRET64.
    199 +	 * If we see that no exit work is required (which we are required
    200 +	 * to check with IRQs off), then we can go straight to SYSRET64.
210 201  	 */
211 202  	DISABLE_INTERRUPTS(CLBR_NONE)
212     -
213     -	/*
214     -	 * We must check ti flags with interrupts (or at least preemption)
215     -	 * off because we must *never* return to userspace without
216     -	 * processing exit work that is enqueued if we're preempted here.
217     -	 * In particular, returning to userspace with any of the one-shot
218     -	 * flags (TIF_NOTIFY_RESUME, TIF_USER_RETURN_NOTIFY, etc) set is
219     -	 * very bad.
220     -	 */
    203 +	TRACE_IRQS_OFF
221 204  	testl	$_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
222     -	jnz	int_ret_from_sys_call_irqs_off	/* Go to the slow path */
    205 +	jnz	1f
223 206
224     -	RESTORE_C_REGS_EXCEPT_RCX_R11
225     -	movq	RIP(%rsp), %rcx
226     -	movq	EFLAGS(%rsp), %r11
    207 +	LOCKDEP_SYS_EXIT
    208 +	TRACE_IRQS_ON		/* user mode is traced as IRQs on */
    209 +	RESTORE_C_REGS
227 210  	movq	RSP(%rsp), %rsp
228     -	/*
229     -	 * 64-bit SYSRET restores rip from rcx,
230     -	 * rflags from r11 (but RF and VM bits are forced to 0),
231     -	 * cs and ss are loaded from MSRs.
232     -	 * Restoration of rflags re-enables interrupts.
233     -	 *
234     -	 * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss
235     -	 * descriptor is not reinitialized. This means that we should
236     -	 * avoid SYSRET with SS == NULL, which could happen if we schedule,
237     -	 * exit the kernel, and re-enter using an interrupt vector. (All
238     -	 * interrupt entries on x86_64 set SS to NULL.) We prevent that
239     -	 * from happening by reloading SS in __switch_to. (Actually
240     -	 * detecting the failure in 64-bit userspace is tricky but can be
241     -	 * done.)
242     -	 */
243 211  	USERGS_SYSRET64
244 212
245     -GLOBAL(int_ret_from_sys_call_irqs_off)
    213 +1:
    214 +	/*
    215 +	 * The fast path looked good when we started, but something changed
    216 +	 * along the way and we need to switch to the slow path.  Calling
    217 +	 * raise(3) will trigger this, for example.  IRQs are off.
    218 +	 */
246 219  	TRACE_IRQS_ON
247 220  	ENABLE_INTERRUPTS(CLBR_NONE)
248     -	jmp int_ret_from_sys_call
249     -
250     -	/* Do syscall entry tracing */
251     -tracesys:
252     -	SAVE_EXTRA_REGS
253     -	movq	%rsp, %rdi
254     -	call	syscall_trace_enter
255     -
256     -	/*
257     -	 * Reload registers from stack in case ptrace changed them.
258     -	 * We don't reload %rax because syscall_trace_enter() returned
259     -	 * the value it wants us to use in the table lookup.
260     -	 */
261     -	RESTORE_C_REGS_EXCEPT_RAX
262     -#if __SYSCALL_MASK == ~0
263     -	cmpq	$__NR_syscall_max, %rax
264     -#else
265     -	andl	$__SYSCALL_MASK, %eax
266     -	cmpl	$__NR_syscall_max, %eax
267     -#endif
268     -	ja	1f	/* return -ENOSYS (already in pt_regs->ax) */
269     -	movq	%r10, %rcx	/* fixup for C */
270     -	call	*sys_call_table(, %rax, 8)
271     -	movq	%rax, RAX(%rsp)
272     -	RESTORE_EXTRA_REGS
273     -1:
274     -	/* Use IRET because user could have changed pt_regs->foo */
275     -
276     -	/*
277     -	 * Syscall return path ending with IRET.
278     -	 * Has correct iret frame.
279     -	 */
280     -GLOBAL(int_ret_from_sys_call)
281 221  	SAVE_EXTRA_REGS
282 222  	movq	%rsp, %rdi
283 223  	call	syscall_return_slowpath	/* returns with IRQs disabled */
    224 +	jmp	return_from_SYSCALL_64
    225 +
    226 +entry_SYSCALL64_slow_path:
    227 +	/* IRQs are off. */
    228 +	SAVE_EXTRA_REGS
    229 +	movq	%rsp, %rdi
    230 +	call	do_syscall_64		/* returns with IRQs disabled */
    231 +
    232 +return_from_SYSCALL_64:
284 233  	RESTORE_EXTRA_REGS
285 234  	TRACE_IRQS_IRETQ		/* we're about to change IF */
···
325 364
326 365  	/* Called from fast path -- pop return address and jump to slow path */
327 366  	popq	%rax
328     -	jmp	tracesys /* called from fast path */
    367 +	jmp	entry_SYSCALL64_slow_path /* called from fast path */
329 368
330 369  1:
331 370  	/* Called from C */