Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

x86/entry/32: Simplify and fix up the SYSENTER stack #DB/NMI fixup

Right after SYSENTER, we can get a #DB or NMI. On x86_32, there's no IST,
so the exception handler is invoked on the temporary SYSENTER stack.

Because the SYSENTER stack is very small, we have a fixup to switch
off the stack quickly when this happens. The old fixup had several issues:

1. It checked the interrupt frame's CS and EIP. This wasn't
obviously correct on Xen or if vm86 mode was in use [1].

2. In the NMI handler, it did some frightening digging into the
stack frame. I'm not convinced this digging was correct.

3. The fixup didn't switch stacks and then switch back. Instead, it
synthesized a brand new stack frame that would redirect the IRET
back to the SYSENTER code. That frame was highly questionable.
For one thing, if an NMI nested inside #DB, we would effectively
abort the #DB prologue, which was probably safe but was
frightening. For another, the code used PUSHFL to write the
FLAGS portion of the frame, which was simply bogus -- by the time
PUSHFL was executed, at least TF, NT, VM, and all of the arithmetic
flags had already been clobbered.

Simplify this considerably. Instead of looking at the saved frame
to see where we came from, check the hardware ESP register against
the SYSENTER stack directly. Malicious user code cannot spoof the
kernel ESP register, and by moving the check after SAVE_ALL, we can
use normal PER_CPU accesses to find all the relevant addresses.

With this patch applied, the improved syscall_nt_32 test finally
passes on 32-bit kernels.

[1] It isn't obviously correct, but it is nonetheless safe from vm86
shenanigans as far as I can tell. A user can't point EIP at
entry_SYSENTER_32 while in vm86 mode because entry_SYSENTER_32,
like all kernel addresses, is greater than 0xffff and would thus
violate the CS segment limit.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Andrew Cooper <andrew.cooper3@citrix.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/b2cdbc037031c07ecf2c40a96069318aec0e7971.1457578375.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>

Authored by Andy Lutomirski and committed by Ingo Molnar
7536656f 6dcc9414

+56 -63
+51 -63
arch/x86/entry/entry_32.S
··· 976 976 jmp ret_from_exception 977 977 END(page_fault) 978 978 979 - /* 980 - * Debug traps and NMI can happen at the one SYSENTER instruction 981 - * that sets up the real kernel stack. Check here, since we can't 982 - * allow the wrong stack to be used. 983 - * 984 - * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have 985 - * already pushed 3 words if it hits on the sysenter instruction: 986 - * eflags, cs and eip. 987 - * 988 - * We just load the right stack, and push the three (known) values 989 - * by hand onto the new stack - while updating the return eip past 990 - * the instruction that would have done it for sysenter. 991 - */ 992 - .macro FIX_STACK offset ok label 993 - cmpw $__KERNEL_CS, 4(%esp) 994 - jne \ok 995 - \label: 996 - movl TSS_sysenter_sp0 + \offset(%esp), %esp 997 - pushfl 998 - pushl $__KERNEL_CS 999 - pushl $sysenter_past_esp 1000 - .endm 1001 - 1002 979 ENTRY(debug) 980 + /* 981 + * #DB can happen at the first instruction of 982 + * entry_SYSENTER_32 or in Xen's SYSENTER prologue. If this 983 + * happens, then we will be running on a very small stack. We 984 + * need to detect this condition and switch to the thread 985 + * stack before calling any C code at all. 986 + * 987 + * If you edit this code, keep in mind that NMIs can happen in here. 988 + */ 1003 989 ASM_CLAC 1004 - cmpl $entry_SYSENTER_32, (%esp) 1005 - jne debug_stack_correct 1006 - FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn 1007 - debug_stack_correct: 1008 990 pushl $-1 # mark this as an int 1009 991 SAVE_ALL 1010 - TRACE_IRQS_OFF 1011 992 xorl %edx, %edx # error code 0 1012 993 movl %esp, %eax # pt_regs pointer 994 + 995 + /* Are we currently on the SYSENTER stack? 
*/ 996 + PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx) 997 + subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */ 998 + cmpl $SIZEOF_SYSENTER_stack, %ecx 999 + jb .Ldebug_from_sysenter_stack 1000 + 1001 + TRACE_IRQS_OFF 1013 1002 call do_debug 1003 + jmp ret_from_exception 1004 + 1005 + .Ldebug_from_sysenter_stack: 1006 + /* We're on the SYSENTER stack. Switch off. */ 1007 + movl %esp, %ebp 1008 + movl PER_CPU_VAR(cpu_current_top_of_stack), %esp 1009 + TRACE_IRQS_OFF 1010 + call do_debug 1011 + movl %ebp, %esp 1014 1012 jmp ret_from_exception 1015 1013 END(debug) 1016 1014 1017 1015 /* 1018 - * NMI is doubly nasty. It can happen _while_ we're handling 1019 - * a debug fault, and the debug fault hasn't yet been able to 1020 - * clear up the stack. So we first check whether we got an 1021 - * NMI on the sysenter entry path, but after that we need to 1022 - * check whether we got an NMI on the debug path where the debug 1023 - * fault happened on the sysenter path. 1016 + * NMI is doubly nasty. It can happen on the first instruction of 1017 + * entry_SYSENTER_32 (just like #DB), but it can also interrupt the beginning 1018 + * of the #DB handler even if that #DB in turn hit before entry_SYSENTER_32 1019 + * switched stacks. We handle both conditions by simply checking whether we 1020 + * interrupted kernel code running on the SYSENTER stack. 1024 1021 */ 1025 1022 ENTRY(nmi) 1026 1023 ASM_CLAC ··· 1028 1031 popl %eax 1029 1032 je nmi_espfix_stack 1030 1033 #endif 1031 - cmpl $entry_SYSENTER_32, (%esp) 1032 - je nmi_stack_fixup 1033 - pushl %eax 1034 - movl %esp, %eax 1035 - /* 1036 - * Do not access memory above the end of our stack page, 1037 - * it might not exist. 
1038 - */ 1039 - andl $(THREAD_SIZE-1), %eax 1040 - cmpl $(THREAD_SIZE-20), %eax 1041 - popl %eax 1042 - jae nmi_stack_correct 1043 - cmpl $entry_SYSENTER_32, 12(%esp) 1044 - je nmi_debug_stack_check 1045 - nmi_stack_correct: 1046 - pushl %eax 1034 + 1035 + pushl %eax # pt_regs->orig_ax 1047 1036 SAVE_ALL 1048 1037 xorl %edx, %edx # zero error code 1049 1038 movl %esp, %eax # pt_regs pointer 1039 + 1040 + /* Are we currently on the SYSENTER stack? */ 1041 + PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx) 1042 + subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */ 1043 + cmpl $SIZEOF_SYSENTER_stack, %ecx 1044 + jb .Lnmi_from_sysenter_stack 1045 + 1046 + /* Not on SYSENTER stack. */ 1050 1047 call do_nmi 1051 1048 jmp restore_all_notrace 1052 1049 1053 - nmi_stack_fixup: 1054 - FIX_STACK 12, nmi_stack_correct, 1 1055 - jmp nmi_stack_correct 1056 - 1057 - nmi_debug_stack_check: 1058 - cmpw $__KERNEL_CS, 16(%esp) 1059 - jne nmi_stack_correct 1060 - cmpl $debug, (%esp) 1061 - jb nmi_stack_correct 1062 - cmpl $debug_esp_fix_insn, (%esp) 1063 - ja nmi_stack_correct 1064 - FIX_STACK 24, nmi_stack_correct, 1 1065 - jmp nmi_stack_correct 1050 + .Lnmi_from_sysenter_stack: 1051 + /* 1052 + * We're on the SYSENTER stack. Switch off. No one (not even debug) 1053 + * is using the thread stack right now, so it's safe for us to use it. 1054 + */ 1055 + movl %esp, %ebp 1056 + movl PER_CPU_VAR(cpu_current_top_of_stack), %esp 1057 + call do_nmi 1058 + movl %ebp, %esp 1059 + jmp restore_all_notrace 1066 1060 1067 1061 #ifdef CONFIG_X86_ESPFIX32 1068 1062 nmi_espfix_stack:
+5
arch/x86/kernel/asm-offsets_32.c
··· 52 52 DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) - 53 53 offsetofend(struct tss_struct, SYSENTER_stack)); 54 54 55 + /* Offset from cpu_tss to SYSENTER_stack */ 56 + OFFSET(CPU_TSS_SYSENTER_stack, tss_struct, SYSENTER_stack); 57 + /* Size of SYSENTER_stack */ 58 + DEFINE(SIZEOF_SYSENTER_stack, sizeof(((struct tss_struct *)0)->SYSENTER_stack)); 59 + 55 60 #if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) 56 61 BLANK(); 57 62 OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);