Merge tag 'x86-urgent-2020-09-06' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 fixes from Ingo Molnar:

- more generic entry code ABI fallout

- debug register handling bugfixes

- fix vmalloc mappings on 32-bit kernels

- kprobes instrumentation output fix on 32-bit kernels

- fix over-eager WARN_ON_ONCE() on !SMAP hardware

- NUMA debugging fix

- fix Clang related crash on !RETPOLINE kernels

* tag 'x86-urgent-2020-09-06' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
x86/entry: Unbreak 32bit fast syscall
x86/debug: Allow a single level of #DB recursion
x86/entry: Fix AC assertion
tracing/kprobes, x86/ptrace: Fix regs argument order for i386
x86, fakenuma: Fix invalid starting node ID
x86/mm/32: Bring back vmalloc faulting on x86_32
x86/cmdline: Disable jump tables for cmdline.c

+213 -63
+20 -9
arch/x86/entry/common.c
Before:
···
 #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
 static __always_inline unsigned int syscall_32_enter(struct pt_regs *regs)
 {
-        unsigned int nr = (unsigned int)regs->orig_ax;
-
         if (IS_ENABLED(CONFIG_IA32_EMULATION))
                 current_thread_info()->status |= TS_COMPAT;
-        /*
-         * Subtlety here: if ptrace pokes something larger than 2^32-1 into
-         * orig_ax, the unsigned int return value truncates it. This may
-         * or may not be necessary, but it matches the old asm behavior.
-         */
-        return (unsigned int)syscall_enter_from_user_mode(regs, nr);
 }

 /*
···
 {
         unsigned int nr = syscall_32_enter(regs);

         do_syscall_32_irqs_on(regs, nr);
         syscall_exit_to_user_mode(regs);
 }

 static noinstr bool __do_fast_syscall_32(struct pt_regs *regs)
 {
-        unsigned int nr = syscall_32_enter(regs);
         int res;

         instrumentation_begin();
         /* Fetch EBP from where the vDSO stashed it. */
···
                 syscall_exit_to_user_mode(regs);
                 return false;
         }

         /* Now this is just like a normal syscall. */
         do_syscall_32_irqs_on(regs, nr);
After:
···
 #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
 static __always_inline unsigned int syscall_32_enter(struct pt_regs *regs)
 {
         if (IS_ENABLED(CONFIG_IA32_EMULATION))
                 current_thread_info()->status |= TS_COMPAT;
+
+        return (unsigned int)regs->orig_ax;
 }

 /*
···
 {
         unsigned int nr = syscall_32_enter(regs);

+        /*
+         * Subtlety here: if ptrace pokes something larger than 2^32-1 into
+         * orig_ax, the unsigned int return value truncates it. This may
+         * or may not be necessary, but it matches the old asm behavior.
+         */
+        nr = (unsigned int)syscall_enter_from_user_mode(regs, nr);
+
         do_syscall_32_irqs_on(regs, nr);
         syscall_exit_to_user_mode(regs);
 }

 static noinstr bool __do_fast_syscall_32(struct pt_regs *regs)
 {
+        unsigned int nr = syscall_32_enter(regs);
         int res;
+
+        /*
+         * This cannot use syscall_enter_from_user_mode() as it has to
+         * fetch EBP before invoking any of the syscall entry work
+         * functions.
+         */
+        syscall_enter_from_user_mode_prepare(regs);

         instrumentation_begin();
         /* Fetch EBP from where the vDSO stashed it. */
···
                 syscall_exit_to_user_mode(regs);
                 return false;
         }
+
+        /* The case truncates any ptrace induced syscall nr > 2^32 -1 */
+        nr = (unsigned int)syscall_enter_from_user_mode_work(regs, nr);

         /* Now this is just like a normal syscall. */
         do_syscall_32_irqs_on(regs, nr);
+10 -2
arch/x86/include/asm/entry-common.h
Before:
···
          * state, not the interrupt state as imagined by Xen.
          */
         unsigned long flags = native_save_fl();
-        WARN_ON_ONCE(flags & (X86_EFLAGS_AC | X86_EFLAGS_DF |
-                              X86_EFLAGS_NT));

         /* We think we came from user mode. Make sure pt_regs agrees. */
         WARN_ON_ONCE(!user_mode(regs));
After:
···
          * state, not the interrupt state as imagined by Xen.
          */
         unsigned long flags = native_save_fl();
+        unsigned long mask = X86_EFLAGS_DF | X86_EFLAGS_NT;
+
+        /*
+         * For !SMAP hardware we patch out CLAC on entry.
+         */
+        if (boot_cpu_has(X86_FEATURE_SMAP) ||
+            (IS_ENABLED(CONFIG_64_BIT) && boot_cpu_has(X86_FEATURE_XENPV)))
+                mask |= X86_EFLAGS_AC;
+
+        WARN_ON_ONCE(flags & mask);

         /* We think we came from user mode. Make sure pt_regs agrees. */
         WARN_ON_ONCE(!user_mode(regs));
+1 -1
arch/x86/include/asm/ptrace.h
Before:
···
         static const unsigned int argument_offs[] = {
 #ifdef __i386__
                 offsetof(struct pt_regs, ax),
-                offsetof(struct pt_regs, cx),
                 offsetof(struct pt_regs, dx),
 #define NR_REG_ARGUMENTS 3
 #else
                 offsetof(struct pt_regs, di),
After:
···
         static const unsigned int argument_offs[] = {
 #ifdef __i386__
                 offsetof(struct pt_regs, ax),
                 offsetof(struct pt_regs, dx),
+                offsetof(struct pt_regs, cx),
 #define NR_REG_ARGUMENTS 3
 #else
                 offsetof(struct pt_regs, di),
+31 -34
arch/x86/kernel/traps.c
Before:
···
 #endif
 }

-static __always_inline void debug_enter(unsigned long *dr6, unsigned long *dr7)
 {
-        /*
-         * Disable breakpoints during exception handling; recursive exceptions
-         * are exceedingly 'fun'.
-         *
-         * Since this function is NOKPROBE, and that also applies to
-         * HW_BREAKPOINT_X, we can't hit a breakpoint before this (XXX except a
-         * HW_BREAKPOINT_W on our stack)
-         *
-         * Entry text is excluded for HW_BP_X and cpu_entry_area, which
-         * includes the entry stack is excluded for everything.
-         */
-        *dr7 = local_db_save();

         /*
          * The Intel SDM says:
···
          *
          * Keep it simple: clear DR6 immediately.
          */
-        get_debugreg(*dr6, 6);
         set_debugreg(0, 6);
         /* Filter out all the reserved bits which are preset to 1 */
-        *dr6 &= ~DR6_RESERVED;
-}

-static __always_inline void debug_exit(unsigned long dr7)
-{
-        local_db_restore(dr7);
 }

 /*
···
 static __always_inline void exc_debug_kernel(struct pt_regs *regs,
                                              unsigned long dr6)
 {
         bool irq_state = idtentry_enter_nmi(regs);
         instrumentation_begin();

···

         instrumentation_end();
         idtentry_exit_nmi(regs, irq_state);
 }

 static __always_inline void exc_debug_user(struct pt_regs *regs,
···
          * #DB, we will malfunction.
          */
         WARN_ON_ONCE(!user_mode(regs));

         irqentry_enter_from_user_mode(regs);
         instrumentation_begin();
···
 /* IST stack entry */
 DEFINE_IDTENTRY_DEBUG(exc_debug)
 {
-        unsigned long dr6, dr7;
-
-        debug_enter(&dr6, &dr7);
-        exc_debug_kernel(regs, dr6);
-        debug_exit(dr7);
 }

 /* User entry, runs on regular task stack */
 DEFINE_IDTENTRY_DEBUG_USER(exc_debug)
 {
-        unsigned long dr6, dr7;
-
-        debug_enter(&dr6, &dr7);
-        exc_debug_user(regs, dr6);
-        debug_exit(dr7);
 }
 #else
 /* 32 bit does not have separate entry points. */
 DEFINE_IDTENTRY_RAW(exc_debug)
 {
-        unsigned long dr6, dr7;
-
-        debug_enter(&dr6, &dr7);

         if (user_mode(regs))
                 exc_debug_user(regs, dr6);
         else
                 exc_debug_kernel(regs, dr6);
-
-        debug_exit(dr7);
 }
 #endif
After:
···
 #endif
 }

+static __always_inline unsigned long debug_read_clear_dr6(void)
 {
+        unsigned long dr6;

         /*
          * The Intel SDM says:
···
          *
          * Keep it simple: clear DR6 immediately.
          */
+        get_debugreg(dr6, 6);
         set_debugreg(0, 6);
         /* Filter out all the reserved bits which are preset to 1 */
+        dr6 &= ~DR6_RESERVED;

+        return dr6;
 }

 /*
···
 static __always_inline void exc_debug_kernel(struct pt_regs *regs,
                                              unsigned long dr6)
 {
+        /*
+         * Disable breakpoints during exception handling; recursive exceptions
+         * are exceedingly 'fun'.
+         *
+         * Since this function is NOKPROBE, and that also applies to
+         * HW_BREAKPOINT_X, we can't hit a breakpoint before this (XXX except a
+         * HW_BREAKPOINT_W on our stack)
+         *
+         * Entry text is excluded for HW_BP_X and cpu_entry_area, which
+         * includes the entry stack is excluded for everything.
+         */
+        unsigned long dr7 = local_db_save();
         bool irq_state = idtentry_enter_nmi(regs);
         instrumentation_begin();

···

         instrumentation_end();
         idtentry_exit_nmi(regs, irq_state);
+
+        local_db_restore(dr7);
 }

 static __always_inline void exc_debug_user(struct pt_regs *regs,
···
          * #DB, we will malfunction.
          */
         WARN_ON_ONCE(!user_mode(regs));
+
+        /*
+         * NB: We can't easily clear DR7 here because
+         * idtentry_exit_to_usermode() can invoke ptrace, schedule, access
+         * user memory, etc. This means that a recursive #DB is possible. If
+         * this happens, that #DB will hit exc_debug_kernel() and clear DR7.
+         * Since we're not on the IST stack right now, everything will be
+         * fine.
+         */

         irqentry_enter_from_user_mode(regs);
         instrumentation_begin();
···
 /* IST stack entry */
 DEFINE_IDTENTRY_DEBUG(exc_debug)
 {
+        exc_debug_kernel(regs, debug_read_clear_dr6());
 }

 /* User entry, runs on regular task stack */
 DEFINE_IDTENTRY_DEBUG_USER(exc_debug)
 {
+        exc_debug_user(regs, debug_read_clear_dr6());
 }
 #else
 /* 32 bit does not have separate entry points. */
 DEFINE_IDTENTRY_RAW(exc_debug)
 {
+        unsigned long dr6 = debug_read_clear_dr6();

         if (user_mode(regs))
                 exc_debug_user(regs, dr6);
         else
                 exc_debug_kernel(regs, dr6);
 }
 #endif
+1 -1
arch/x86/lib/Makefile
Before:
···
 CFLAGS_REMOVE_cmdline.o = -pg
 endif

-CFLAGS_cmdline.o := -fno-stack-protector
 endif

 inat_tables_script = $(srctree)/arch/x86/tools/gen-insn-attr-x86.awk
After:
···
 CFLAGS_REMOVE_cmdline.o = -pg
 endif

+CFLAGS_cmdline.o := -fno-stack-protector -fno-jump-tables
 endif

 inat_tables_script = $(srctree)/arch/x86/tools/gen-insn-attr-x86.awk
+78
arch/x86/mm/fault.c
Before:
···
         return pmd_k;
 }

 void arch_sync_kernel_mappings(unsigned long start, unsigned long end)
 {
         unsigned long addr;
···
          * space, so do not expect them here.
          */
         WARN_ON_ONCE(hw_error_code & X86_PF_PK);

         /* Was the fault spurious, caused by lazy TLB invalidation? */
         if (spurious_kernel_fault(hw_error_code, address))
After:
···
         return pmd_k;
 }

+/*
+ * Handle a fault on the vmalloc or module mapping area
+ *
+ * This is needed because there is a race condition between the time
+ * when the vmalloc mapping code updates the PMD to the point in time
+ * where it synchronizes this update with the other page-tables in the
+ * system.
+ *
+ * In this race window another thread/CPU can map an area on the same
+ * PMD, finds it already present and does not synchronize it with the
+ * rest of the system yet. As a result v[mz]alloc might return areas
+ * which are not mapped in every page-table in the system, causing an
+ * unhandled page-fault when they are accessed.
+ */
+static noinline int vmalloc_fault(unsigned long address)
+{
+        unsigned long pgd_paddr;
+        pmd_t *pmd_k;
+        pte_t *pte_k;
+
+        /* Make sure we are in vmalloc area: */
+        if (!(address >= VMALLOC_START && address < VMALLOC_END))
+                return -1;
+
+        /*
+         * Synchronize this task's top level page-table
+         * with the 'reference' page table.
+         *
+         * Do _not_ use "current" here. We might be inside
+         * an interrupt in the middle of a task switch..
+         */
+        pgd_paddr = read_cr3_pa();
+        pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
+        if (!pmd_k)
+                return -1;
+
+        if (pmd_large(*pmd_k))
+                return 0;
+
+        pte_k = pte_offset_kernel(pmd_k, address);
+        if (!pte_present(*pte_k))
+                return -1;
+
+        return 0;
+}
+NOKPROBE_SYMBOL(vmalloc_fault);
+
 void arch_sync_kernel_mappings(unsigned long start, unsigned long end)
 {
         unsigned long addr;
···
          * space, so do not expect them here.
          */
         WARN_ON_ONCE(hw_error_code & X86_PF_PK);
+
+#ifdef CONFIG_X86_32
+        /*
+         * We can fault-in kernel-space virtual memory on-demand. The
+         * 'reference' page table is init_mm.pgd.
+         *
+         * NOTE! We MUST NOT take any locks for this case. We may
+         * be in an interrupt or a critical region, and should
+         * only copy the information from the master page table,
+         * nothing more.
+         *
+         * Before doing this on-demand faulting, ensure that the
+         * fault is not any of the following:
+         * 1. A fault on a PTE with a reserved bit set.
+         * 2. A fault caused by a user-mode access.  (Do not demand-
+         *    fault kernel memory due to user-mode accesses).
+         * 3. A fault caused by a page-level protection violation.
+         *    (A demand fault would be on a non-present page which
+         *     would have X86_PF_PROT==0).
+         *
+         * This is only needed to close a race condition on x86-32 in
+         * the vmalloc mapping/unmapping code. See the comment above
+         * vmalloc_fault() for details. On x86-64 the race does not
+         * exist as the vmalloc mappings don't need to be synchronized
+         * there.
+         */
+        if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
+                if (vmalloc_fault(address) >= 0)
+                        return;
+        }
+#endif

         /* Was the fault spurious, caused by lazy TLB invalidation? */
         if (spurious_kernel_fault(hw_error_code, address))
+1 -1
arch/x86/mm/numa_emulation.c
Before:
···
                                               u64 addr, u64 max_addr, u64 size)
 {
         return split_nodes_size_interleave_uniform(ei, pi, addr, max_addr, size,
-                                                   0, NULL, NUMA_NO_NODE);
 }

 static int __init setup_emu2phys_nid(int *dfl_phys_nid)
After:
···
                                               u64 addr, u64 max_addr, u64 size)
 {
         return split_nodes_size_interleave_uniform(ei, pi, addr, max_addr, size,
+                                                   0, NULL, 0);
 }

 static int __init setup_emu2phys_nid(int *dfl_phys_nid)
+42 -9
include/linux/entry-common.h
Before:
···
 #endif

 /**
- * syscall_enter_from_user_mode - Check and handle work before invoking
- *                                a syscall
  * @regs: Pointer to currents pt_regs
  * @syscall: The syscall number
  *
  * Invoked from architecture specific syscall entry code with interrupts
- * disabled. The calling code has to be non-instrumentable. When the
- * function returns all state is correct and the subsequent functions can be
- * instrumented.
  *
  * Returns: The original or a modified syscall number
  *
···
  * syscall_set_return_value() first. If neither of those are called and -1
  * is returned, then the syscall will fail with ENOSYS.
  *
- * The following functionality is handled here:
  *
- * 1) Establish state (lockdep, RCU (context tracking), tracing)
- * 2) TIF flag dependent invocations of arch_syscall_enter_tracehook(),
  *    __secure_computing(), trace_sys_enter()
- * 3) Invocation of audit_syscall_entry()
  */
 long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall);
After:
···
 #endif

 /**
+ * syscall_enter_from_user_mode_prepare - Establish state and enable interrupts
+ * @regs: Pointer to currents pt_regs
+ *
+ * Invoked from architecture specific syscall entry code with interrupts
+ * disabled. The calling code has to be non-instrumentable. When the
+ * function returns all state is correct, interrupts are enabled and the
+ * subsequent functions can be instrumented.
+ *
+ * This handles lockdep, RCU (context tracking) and tracing state.
+ *
+ * This is invoked when there is extra architecture specific functionality
+ * to be done between establishing state and handling user mode entry work.
+ */
+void syscall_enter_from_user_mode_prepare(struct pt_regs *regs);
+
+/**
+ * syscall_enter_from_user_mode_work - Check and handle work before invoking
+ *                                     a syscall
  * @regs: Pointer to currents pt_regs
  * @syscall: The syscall number
  *
  * Invoked from architecture specific syscall entry code with interrupts
+ * enabled after invoking syscall_enter_from_user_mode_prepare() and extra
+ * architecture specific work.
  *
  * Returns: The original or a modified syscall number
  *
···
  * syscall_set_return_value() first. If neither of those are called and -1
  * is returned, then the syscall will fail with ENOSYS.
  *
+ * It handles the following work items:
  *
+ *  1) TIF flag dependent invocations of arch_syscall_enter_tracehook(),
  *    __secure_computing(), trace_sys_enter()
+ *  2) Invocation of audit_syscall_entry()
+ */
+long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall);
+
+/**
+ * syscall_enter_from_user_mode - Establish state and check and handle work
+ *                                before invoking a syscall
+ * @regs: Pointer to currents pt_regs
+ * @syscall: The syscall number
+ *
+ * Invoked from architecture specific syscall entry code with interrupts
+ * disabled. The calling code has to be non-instrumentable. When the
+ * function returns all state is correct, interrupts are enabled and the
+ * subsequent functions can be instrumented.
+ *
+ * This is combination of syscall_enter_from_user_mode_prepare() and
+ * syscall_enter_from_user_mode_work().
+ *
+ * Returns: The original or a modified syscall number. See
+ * syscall_enter_from_user_mode_work() for further explanation.
+ */
 long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall);
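For orientation, here is a minimal sketch of how an architecture is expected to combine the split helpers when it has to do work between establishing state and running the syscall entry work; it mirrors the 32-bit fast-syscall path above. The arch_fetch_extra_argument() helper is a hypothetical placeholder, not an in-tree function:

        static noinstr void arch_do_syscall(struct pt_regs *regs, long nr)
        {
                /* Establish lockdep/RCU/tracing state and enable interrupts. */
                syscall_enter_from_user_mode_prepare(regs);

                instrumentation_begin();
                /* Architecture specific step, e.g. fetching an argument from user memory. */
                arch_fetch_extra_argument(regs);        /* hypothetical helper */
                instrumentation_end();

                /* TIF flag dependent entry work: tracehook, seccomp, audit. */
                nr = syscall_enter_from_user_mode_work(regs, nr);

                /* ... look up and invoke the syscall for 'nr' ... */

                syscall_exit_to_user_mode(regs);
        }

Architectures without such an intermediate step keep calling syscall_enter_from_user_mode(), which combines the two helpers.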
+29 -6
kernel/entry/common.c
Before:
···
         return ret ? : syscall_get_nr(current, regs);
 }

-noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
 {
         unsigned long ti_work;

-        enter_from_user_mode(regs);
-        instrumentation_begin();
-
-        local_irq_enable();
         ti_work = READ_ONCE(current_thread_info()->flags);
         if (ti_work & SYSCALL_ENTER_WORK)
                 syscall = syscall_trace_enter(regs, syscall, ti_work);
-        instrumentation_end();

         return syscall;
 }

 /**
After:
···
         return ret ? : syscall_get_nr(current, regs);
 }

+static __always_inline long
+__syscall_enter_from_user_work(struct pt_regs *regs, long syscall)
 {
         unsigned long ti_work;

         ti_work = READ_ONCE(current_thread_info()->flags);
         if (ti_work & SYSCALL_ENTER_WORK)
                 syscall = syscall_trace_enter(regs, syscall, ti_work);

         return syscall;
+}
+
+long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall)
+{
+        return __syscall_enter_from_user_work(regs, syscall);
+}
+
+noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
+{
+        long ret;
+
+        enter_from_user_mode(regs);
+
+        instrumentation_begin();
+        local_irq_enable();
+        ret = __syscall_enter_from_user_work(regs, syscall);
+        instrumentation_end();
+
+        return ret;
+}
+
+noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs)
+{
+        enter_from_user_mode(regs);
+        instrumentation_begin();
+        local_irq_enable();
+        instrumentation_end();
 }

 /**