Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'x86-asm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 asm changes from Ingo Molnar:
"The biggest changes in this cycle were:

- Revamp, simplify (and in some cases fix) Time Stamp Counter (TSC)
primitives. (Andy Lutomirski)

- Add new, comprehensible entry and exit handlers written in C.
(Andy Lutomirski)

- vm86 mode cleanups and fixes. (Brian Gerst)

- 32-bit compat code cleanups. (Brian Gerst)

The amount of simplification in low level assembly code is already
palpable:

arch/x86/entry/entry_32.S | 130 +----
arch/x86/entry/entry_64.S | 197 ++-----

but more simplifications are planned.

There's also the usual laundry mix of low level changes - see the
changelog for details"

* 'x86-asm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (83 commits)
x86/asm: Drop repeated macro of X86_EFLAGS_AC definition
x86/asm/msr: Make wrmsrl() a function
x86/asm/delay: Introduce an MWAITX-based delay with a configurable timer
x86/asm: Add MONITORX/MWAITX instruction support
x86/traps: Weaken context tracking entry assertions
x86/asm/tsc: Add rdtscll() merge helper
selftests/x86: Add syscall_nt selftest
selftests/x86: Disable sigreturn_64
x86/vdso: Emit a GNU hash
x86/entry: Remove do_notify_resume(), syscall_trace_leave(), and their TIF masks
x86/entry/32: Migrate to C exit path
x86/entry/32: Remove 32-bit syscall audit optimizations
x86/vm86: Rename vm86->v86flags and v86mask
x86/vm86: Rename vm86->vm86_info to user_vm86
x86/vm86: Clean up vm86.h includes
x86/vm86: Move the vm86 IRQ definitions to vm86.h
x86/vm86: Use the normal pt_regs area for vm86
x86/vm86: Eliminate 'struct kernel_vm86_struct'
x86/vm86: Move fields from 'struct kernel_vm86_struct' to 'struct vm86'
x86/vm86: Move vm86 fields out of 'thread_struct'
...

+2196 -1383
+2 -1
arch/um/include/shared/kern_util.h
··· 22 22 extern unsigned long alloc_stack(int order, int atomic); 23 23 extern void free_stack(unsigned long stack, int order); 24 24 25 - extern int do_signal(void); 25 + struct pt_regs; 26 + extern void do_signal(struct pt_regs *regs); 26 27 extern void interrupt_end(void); 27 28 extern void relay_signal(int sig, struct siginfo *si, struct uml_pt_regs *regs); 28 29
+4 -2
arch/um/kernel/process.c
··· 90 90 91 91 void interrupt_end(void) 92 92 { 93 + struct pt_regs *regs = &current->thread.regs; 94 + 93 95 if (need_resched()) 94 96 schedule(); 95 97 if (test_thread_flag(TIF_SIGPENDING)) 96 - do_signal(); 98 + do_signal(regs); 97 99 if (test_and_clear_thread_flag(TIF_NOTIFY_RESUME)) 98 - tracehook_notify_resume(&current->thread.regs); 100 + tracehook_notify_resume(regs); 99 101 } 100 102 101 103 void exit_thread(void)
+1 -7
arch/um/kernel/signal.c
··· 64 64 signal_setup_done(err, ksig, singlestep); 65 65 } 66 66 67 - static int kern_do_signal(struct pt_regs *regs) 67 + void do_signal(struct pt_regs *regs) 68 68 { 69 69 struct ksignal ksig; 70 70 int handled_sig = 0; ··· 110 110 */ 111 111 if (!handled_sig) 112 112 restore_saved_sigmask(); 113 - return handled_sig; 114 - } 115 - 116 - int do_signal(void) 117 - { 118 - return kern_do_signal(&current->thread.regs); 119 113 }
+1 -1
arch/um/kernel/tlb.c
··· 291 291 /* We are under mmap_sem, release it such that current can terminate */ 292 292 up_write(&current->mm->mmap_sem); 293 293 force_sig(SIGKILL, current); 294 - do_signal(); 294 + do_signal(&current->thread.regs); 295 295 } 296 296 } 297 297
+1 -1
arch/um/kernel/trap.c
··· 173 173 void fatal_sigsegv(void) 174 174 { 175 175 force_sigsegv(SIGSEGV, current); 176 - do_signal(); 176 + do_signal(&current->thread.regs); 177 177 /* 178 178 * This is to tell gcc that we're not returning - do_signal 179 179 * can, in general, return, but in this case, it's not, since
+49 -11
arch/x86/Kconfig
··· 133 133 select HAVE_PERF_USER_STACK_DUMP 134 134 select HAVE_REGS_AND_STACK_ACCESS_API 135 135 select HAVE_SYSCALL_TRACEPOINTS 136 - select HAVE_UID16 if X86_32 136 + select HAVE_UID16 if X86_32 || IA32_EMULATION 137 137 select HAVE_UNSTABLE_SCHED_CLOCK 138 138 select HAVE_USER_RETURN_NOTIFIER 139 139 select IRQ_FORCED_THREADING ··· 1003 1003 def_bool y 1004 1004 depends on X86_MCE_INTEL 1005 1005 1006 - config VM86 1007 - bool "Enable VM86 support" if EXPERT 1008 - default y 1006 + config X86_LEGACY_VM86 1007 + bool "Legacy VM86 support (obsolete)" 1008 + default n 1009 1009 depends on X86_32 1010 1010 ---help--- 1011 - This option is required by programs like DOSEMU to run 1012 - 16-bit real mode legacy code on x86 processors. It also may 1013 - be needed by software like XFree86 to initialize some video 1014 - cards via BIOS. Disabling this option saves about 6K. 1011 + This option allows user programs to put the CPU into V8086 1012 + mode, which is an 80286-era approximation of 16-bit real mode. 1013 + 1014 + Some very old versions of X and/or vbetool require this option 1015 + for user mode setting. Similarly, DOSEMU will use it if 1016 + available to accelerate real mode DOS programs. However, any 1017 + recent version of DOSEMU, X, or vbetool should be fully 1018 + functional even without kernel VM86 support, as they will all 1019 + fall back to (pretty well performing) software emulation. 1020 + 1021 + Anything that works on a 64-bit kernel is unlikely to need 1022 + this option, as 64-bit kernels don't, and can't, support V8086 1023 + mode. This option is also unrelated to 16-bit protected mode 1024 + and is not needed to run most 16-bit programs under Wine. 1025 + 1026 + Enabling this option adds considerable attack surface to the 1027 + kernel and slows down system calls and exception handling. 1028 + 1029 + Unless you use very old userspace or need the last drop of 1030 + performance in your real mode DOS games and can't use KVM, 1031 + say N here. 
1032 + 1033 + config VM86 1034 + bool 1035 + default X86_LEGACY_VM86 1015 1036 1016 1037 config X86_16BIT 1017 1038 bool "Enable support for 16-bit segments" if EXPERT 1018 1039 default y 1040 + depends on MODIFY_LDT_SYSCALL 1019 1041 ---help--- 1020 1042 This option is required by programs like Wine to run 16-bit 1021 1043 protected mode legacy code on x86 processors. Disabling ··· 1532 1510 1533 1511 config MATH_EMULATION 1534 1512 bool 1513 + depends on MODIFY_LDT_SYSCALL 1535 1514 prompt "Math emulation" if X86_32 1536 1515 ---help--- 1537 1516 Linux can emulate a math coprocessor (used for floating point ··· 2077 2054 This is used to work around broken boot loaders. This should 2078 2055 be set to 'N' under normal conditions. 2079 2056 2057 + config MODIFY_LDT_SYSCALL 2058 + bool "Enable the LDT (local descriptor table)" if EXPERT 2059 + default y 2060 + ---help--- 2061 + Linux can allow user programs to install a per-process x86 2062 + Local Descriptor Table (LDT) using the modify_ldt(2) system 2063 + call. This is required to run 16-bit or segmented code such as 2064 + DOSEMU or some Wine programs. It is also used by some very old 2065 + threading libraries. 2066 + 2067 + Enabling this feature adds a small amount of overhead to 2068 + context switches and increases the low-level kernel attack 2069 + surface. Disabling it removes the modify_ldt(2) system call. 2070 + 2071 + Saying 'N' here may make sense for embedded or server kernels. 2072 + 2080 2073 source "kernel/livepatch/Kconfig" 2081 2074 2082 2075 endmenu ··· 2562 2523 depends on X86_64 2563 2524 select BINFMT_ELF 2564 2525 select COMPAT_BINFMT_ELF 2565 - select HAVE_UID16 2526 + select ARCH_WANT_OLD_COMPAT_IPC 2566 2527 ---help--- 2567 2528 Include code to run legacy 32-bit programs under a 2568 2529 64-bit kernel. 
You should likely turn this on, unless you're ··· 2576 2537 2577 2538 config X86_X32 2578 2539 bool "x32 ABI for 64-bit mode" 2579 - depends on X86_64 && IA32_EMULATION 2540 + depends on X86_64 2580 2541 ---help--- 2581 2542 Include code to run binaries for the x32 native 32-bit ABI 2582 2543 for 64-bit processors. An x32 process gets access to the ··· 2590 2551 config COMPAT 2591 2552 def_bool y 2592 2553 depends on IA32_EMULATION || X86_X32 2593 - select ARCH_WANT_OLD_COMPAT_IPC 2594 2554 2595 2555 if COMPAT 2596 2556 config COMPAT_FOR_U64_ALIGNMENT
+10 -3
arch/x86/Makefile
··· 39 39 LDFLAGS_vmlinux := --emit-relocs 40 40 endif 41 41 42 + # 43 + # Prevent GCC from generating any FP code by mistake. 44 + # 45 + # This must happen before we try the -mpreferred-stack-boundary, see: 46 + # 47 + # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53383 48 + # 49 + KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow 50 + KBUILD_CFLAGS += $(call cc-option,-mno-avx,) 51 + 42 52 ifeq ($(CONFIG_X86_32),y) 43 53 BITS := 32 44 54 UTS_MACHINE := i386 ··· 177 167 KBUILD_CFLAGS += -Wno-sign-compare 178 168 # 179 169 KBUILD_CFLAGS += -fno-asynchronous-unwind-tables 180 - # prevent gcc from generating any FP code by mistake 181 - KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow 182 - KBUILD_CFLAGS += $(call cc-option,-mno-avx,) 183 170 184 171 KBUILD_CFLAGS += $(mflags-y) 185 172 KBUILD_AFLAGS += $(mflags-y)
+1 -1
arch/x86/boot/compressed/aslr.c
··· 82 82 83 83 if (has_cpuflag(X86_FEATURE_TSC)) { 84 84 debug_putstr(" RDTSC"); 85 - rdtscll(raw); 85 + raw = rdtsc(); 86 86 87 87 random ^= raw; 88 88 use_i8254 = false;
+1
arch/x86/entry/Makefile
··· 2 2 # Makefile for the x86 low level entry code 3 3 # 4 4 obj-y := entry_$(BITS).o thunk_$(BITS).o syscall_$(BITS).o 5 + obj-y += common.o 5 6 6 7 obj-y += vdso/ 7 8 obj-y += vsyscall/
-9
arch/x86/entry/calling.h
··· 135 135 movq %rbp, 4*8+\offset(%rsp) 136 136 movq %rbx, 5*8+\offset(%rsp) 137 137 .endm 138 - .macro SAVE_EXTRA_REGS_RBP offset=0 139 - movq %rbp, 4*8+\offset(%rsp) 140 - .endm 141 138 142 139 .macro RESTORE_EXTRA_REGS offset=0 143 140 movq 0*8+\offset(%rsp), %r15 ··· 189 192 .endm 190 193 .macro RESTORE_C_REGS_EXCEPT_RCX_R11 191 194 RESTORE_C_REGS_HELPER 1,0,0,1,1 192 - .endm 193 - .macro RESTORE_RSI_RDI 194 - RESTORE_C_REGS_HELPER 0,0,0,0,0 195 - .endm 196 - .macro RESTORE_RSI_RDI_RDX 197 - RESTORE_C_REGS_HELPER 0,0,0,0,1 198 195 .endm 199 196 200 197 .macro REMOVE_PT_GPREGS_FROM_STACK addskip=0
+318
arch/x86/entry/common.c
··· 1 + /* 2 + * common.c - C code for kernel entry and exit 3 + * Copyright (c) 2015 Andrew Lutomirski 4 + * GPL v2 5 + * 6 + * Based on asm and ptrace code by many authors. The code here originated 7 + * in ptrace.c and signal.c. 8 + */ 9 + 10 + #include <linux/kernel.h> 11 + #include <linux/sched.h> 12 + #include <linux/mm.h> 13 + #include <linux/smp.h> 14 + #include <linux/errno.h> 15 + #include <linux/ptrace.h> 16 + #include <linux/tracehook.h> 17 + #include <linux/audit.h> 18 + #include <linux/seccomp.h> 19 + #include <linux/signal.h> 20 + #include <linux/export.h> 21 + #include <linux/context_tracking.h> 22 + #include <linux/user-return-notifier.h> 23 + #include <linux/uprobes.h> 24 + 25 + #include <asm/desc.h> 26 + #include <asm/traps.h> 27 + 28 + #define CREATE_TRACE_POINTS 29 + #include <trace/events/syscalls.h> 30 + 31 + #ifdef CONFIG_CONTEXT_TRACKING 32 + /* Called on entry from user mode with IRQs off. */ 33 + __visible void enter_from_user_mode(void) 34 + { 35 + CT_WARN_ON(ct_state() != CONTEXT_USER); 36 + user_exit(); 37 + } 38 + #endif 39 + 40 + static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch) 41 + { 42 + #ifdef CONFIG_X86_64 43 + if (arch == AUDIT_ARCH_X86_64) { 44 + audit_syscall_entry(regs->orig_ax, regs->di, 45 + regs->si, regs->dx, regs->r10); 46 + } else 47 + #endif 48 + { 49 + audit_syscall_entry(regs->orig_ax, regs->bx, 50 + regs->cx, regs->dx, regs->si); 51 + } 52 + } 53 + 54 + /* 55 + * We can return 0 to resume the syscall or anything else to go to phase 56 + * 2. If we resume the syscall, we need to put something appropriate in 57 + * regs->orig_ax. 58 + * 59 + * NB: We don't have full pt_regs here, but regs->orig_ax and regs->ax 60 + * are fully functional. 
61 + * 62 + * For phase 2's benefit, our return value is: 63 + * 0: resume the syscall 64 + * 1: go to phase 2; no seccomp phase 2 needed 65 + * anything else: go to phase 2; pass return value to seccomp 66 + */ 67 + unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch) 68 + { 69 + unsigned long ret = 0; 70 + u32 work; 71 + 72 + BUG_ON(regs != task_pt_regs(current)); 73 + 74 + work = ACCESS_ONCE(current_thread_info()->flags) & 75 + _TIF_WORK_SYSCALL_ENTRY; 76 + 77 + #ifdef CONFIG_CONTEXT_TRACKING 78 + /* 79 + * If TIF_NOHZ is set, we are required to call user_exit() before 80 + * doing anything that could touch RCU. 81 + */ 82 + if (work & _TIF_NOHZ) { 83 + enter_from_user_mode(); 84 + work &= ~_TIF_NOHZ; 85 + } 86 + #endif 87 + 88 + #ifdef CONFIG_SECCOMP 89 + /* 90 + * Do seccomp first -- it should minimize exposure of other 91 + * code, and keeping seccomp fast is probably more valuable 92 + * than the rest of this. 93 + */ 94 + if (work & _TIF_SECCOMP) { 95 + struct seccomp_data sd; 96 + 97 + sd.arch = arch; 98 + sd.nr = regs->orig_ax; 99 + sd.instruction_pointer = regs->ip; 100 + #ifdef CONFIG_X86_64 101 + if (arch == AUDIT_ARCH_X86_64) { 102 + sd.args[0] = regs->di; 103 + sd.args[1] = regs->si; 104 + sd.args[2] = regs->dx; 105 + sd.args[3] = regs->r10; 106 + sd.args[4] = regs->r8; 107 + sd.args[5] = regs->r9; 108 + } else 109 + #endif 110 + { 111 + sd.args[0] = regs->bx; 112 + sd.args[1] = regs->cx; 113 + sd.args[2] = regs->dx; 114 + sd.args[3] = regs->si; 115 + sd.args[4] = regs->di; 116 + sd.args[5] = regs->bp; 117 + } 118 + 119 + BUILD_BUG_ON(SECCOMP_PHASE1_OK != 0); 120 + BUILD_BUG_ON(SECCOMP_PHASE1_SKIP != 1); 121 + 122 + ret = seccomp_phase1(&sd); 123 + if (ret == SECCOMP_PHASE1_SKIP) { 124 + regs->orig_ax = -1; 125 + ret = 0; 126 + } else if (ret != SECCOMP_PHASE1_OK) { 127 + return ret; /* Go directly to phase 2 */ 128 + } 129 + 130 + work &= ~_TIF_SECCOMP; 131 + } 132 + #endif 133 + 134 + /* Do our best to finish without phase 2. 
*/ 135 + if (work == 0) 136 + return ret; /* seccomp and/or nohz only (ret == 0 here) */ 137 + 138 + #ifdef CONFIG_AUDITSYSCALL 139 + if (work == _TIF_SYSCALL_AUDIT) { 140 + /* 141 + * If there is no more work to be done except auditing, 142 + * then audit in phase 1. Phase 2 always audits, so, if 143 + * we audit here, then we can't go on to phase 2. 144 + */ 145 + do_audit_syscall_entry(regs, arch); 146 + return 0; 147 + } 148 + #endif 149 + 150 + return 1; /* Something is enabled that we can't handle in phase 1 */ 151 + } 152 + 153 + /* Returns the syscall nr to run (which should match regs->orig_ax). */ 154 + long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch, 155 + unsigned long phase1_result) 156 + { 157 + long ret = 0; 158 + u32 work = ACCESS_ONCE(current_thread_info()->flags) & 159 + _TIF_WORK_SYSCALL_ENTRY; 160 + 161 + BUG_ON(regs != task_pt_regs(current)); 162 + 163 + /* 164 + * If we stepped into a sysenter/syscall insn, it trapped in 165 + * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP. 166 + * If user-mode had set TF itself, then it's still clear from 167 + * do_debug() and we need to set it again to restore the user 168 + * state. If we entered on the slow path, TF was already set. 169 + */ 170 + if (work & _TIF_SINGLESTEP) 171 + regs->flags |= X86_EFLAGS_TF; 172 + 173 + #ifdef CONFIG_SECCOMP 174 + /* 175 + * Call seccomp_phase2 before running the other hooks so that 176 + * they can see any changes made by a seccomp tracer. 177 + */ 178 + if (phase1_result > 1 && seccomp_phase2(phase1_result)) { 179 + /* seccomp failures shouldn't expose any additional code. 
*/ 180 + return -1; 181 + } 182 + #endif 183 + 184 + if (unlikely(work & _TIF_SYSCALL_EMU)) 185 + ret = -1L; 186 + 187 + if ((ret || test_thread_flag(TIF_SYSCALL_TRACE)) && 188 + tracehook_report_syscall_entry(regs)) 189 + ret = -1L; 190 + 191 + if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) 192 + trace_sys_enter(regs, regs->orig_ax); 193 + 194 + do_audit_syscall_entry(regs, arch); 195 + 196 + return ret ?: regs->orig_ax; 197 + } 198 + 199 + long syscall_trace_enter(struct pt_regs *regs) 200 + { 201 + u32 arch = is_ia32_task() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64; 202 + unsigned long phase1_result = syscall_trace_enter_phase1(regs, arch); 203 + 204 + if (phase1_result == 0) 205 + return regs->orig_ax; 206 + else 207 + return syscall_trace_enter_phase2(regs, arch, phase1_result); 208 + } 209 + 210 + static struct thread_info *pt_regs_to_thread_info(struct pt_regs *regs) 211 + { 212 + unsigned long top_of_stack = 213 + (unsigned long)(regs + 1) + TOP_OF_KERNEL_STACK_PADDING; 214 + return (struct thread_info *)(top_of_stack - THREAD_SIZE); 215 + } 216 + 217 + /* Called with IRQs disabled. */ 218 + __visible void prepare_exit_to_usermode(struct pt_regs *regs) 219 + { 220 + if (WARN_ON(!irqs_disabled())) 221 + local_irq_disable(); 222 + 223 + /* 224 + * In order to return to user mode, we need to have IRQs off with 225 + * none of _TIF_SIGPENDING, _TIF_NOTIFY_RESUME, _TIF_USER_RETURN_NOTIFY, 226 + * _TIF_UPROBE, or _TIF_NEED_RESCHED set. Several of these flags 227 + * can be set at any time on preemptable kernels if we have IRQs on, 228 + * so we need to loop. Disabling preemption wouldn't help: doing the 229 + * work to clear some of the flags can sleep. 230 + */ 231 + while (true) { 232 + u32 cached_flags = 233 + READ_ONCE(pt_regs_to_thread_info(regs)->flags); 234 + 235 + if (!(cached_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | 236 + _TIF_UPROBE | _TIF_NEED_RESCHED | 237 + _TIF_USER_RETURN_NOTIFY))) 238 + break; 239 + 240 + /* We have work to do. 
*/ 241 + local_irq_enable(); 242 + 243 + if (cached_flags & _TIF_NEED_RESCHED) 244 + schedule(); 245 + 246 + if (cached_flags & _TIF_UPROBE) 247 + uprobe_notify_resume(regs); 248 + 249 + /* deal with pending signal delivery */ 250 + if (cached_flags & _TIF_SIGPENDING) 251 + do_signal(regs); 252 + 253 + if (cached_flags & _TIF_NOTIFY_RESUME) { 254 + clear_thread_flag(TIF_NOTIFY_RESUME); 255 + tracehook_notify_resume(regs); 256 + } 257 + 258 + if (cached_flags & _TIF_USER_RETURN_NOTIFY) 259 + fire_user_return_notifiers(); 260 + 261 + /* Disable IRQs and retry */ 262 + local_irq_disable(); 263 + } 264 + 265 + user_enter(); 266 + } 267 + 268 + /* 269 + * Called with IRQs on and fully valid regs. Returns with IRQs off in a 270 + * state such that we can immediately switch to user mode. 271 + */ 272 + __visible void syscall_return_slowpath(struct pt_regs *regs) 273 + { 274 + struct thread_info *ti = pt_regs_to_thread_info(regs); 275 + u32 cached_flags = READ_ONCE(ti->flags); 276 + bool step; 277 + 278 + CT_WARN_ON(ct_state() != CONTEXT_KERNEL); 279 + 280 + if (WARN(irqs_disabled(), "syscall %ld left IRQs disabled", 281 + regs->orig_ax)) 282 + local_irq_enable(); 283 + 284 + /* 285 + * First do one-time work. If these work items are enabled, we 286 + * want to run them exactly once per syscall exit with IRQs on. 287 + */ 288 + if (cached_flags & (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | 289 + _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT)) { 290 + audit_syscall_exit(regs); 291 + 292 + if (cached_flags & _TIF_SYSCALL_TRACEPOINT) 293 + trace_sys_exit(regs, regs->ax); 294 + 295 + /* 296 + * If TIF_SYSCALL_EMU is set, we only get here because of 297 + * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP). 298 + * We already reported this syscall instruction in 299 + * syscall_trace_enter(). 
300 + */ 301 + step = unlikely( 302 + (cached_flags & (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU)) 303 + == _TIF_SINGLESTEP); 304 + if (step || cached_flags & _TIF_SYSCALL_TRACE) 305 + tracehook_report_syscall_exit(regs, step); 306 + } 307 + 308 + #ifdef CONFIG_COMPAT 309 + /* 310 + * Compat syscalls set TS_COMPAT. Make sure we clear it before 311 + * returning to user mode. 312 + */ 313 + ti->status &= ~TS_COMPAT; 314 + #endif 315 + 316 + local_irq_disable(); 317 + prepare_exit_to_usermode(regs); 318 + }
+12 -118
arch/x86/entry/entry_32.S
··· 45 45 #include <asm/asm.h> 46 46 #include <asm/smap.h> 47 47 48 - /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ 49 - #include <linux/elf-em.h> 50 - #define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE) 51 - #define __AUDIT_ARCH_LE 0x40000000 52 - 53 - #ifndef CONFIG_AUDITSYSCALL 54 - # define sysenter_audit syscall_trace_entry 55 - # define sysexit_audit syscall_exit_work 56 - #endif 57 - 58 48 .section .entry.text, "ax" 59 49 60 50 /* ··· 256 266 257 267 ENTRY(resume_userspace) 258 268 LOCKDEP_SYS_EXIT 259 - DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt 260 - # setting need_resched or sigpending 261 - # between sampling and the iret 269 + DISABLE_INTERRUPTS(CLBR_ANY) 262 270 TRACE_IRQS_OFF 263 - movl TI_flags(%ebp), %ecx 264 - andl $_TIF_WORK_MASK, %ecx # is there any work to be done on 265 - # int/exception return? 266 - jne work_pending 271 + movl %esp, %eax 272 + call prepare_exit_to_usermode 267 273 jmp restore_all 268 274 END(ret_from_exception) 269 275 ··· 325 339 GET_THREAD_INFO(%ebp) 326 340 327 341 testl $_TIF_WORK_SYSCALL_ENTRY, TI_flags(%ebp) 328 - jnz sysenter_audit 342 + jnz syscall_trace_entry 329 343 sysenter_do_call: 330 344 cmpl $(NR_syscalls), %eax 331 345 jae sysenter_badsys ··· 337 351 TRACE_IRQS_OFF 338 352 movl TI_flags(%ebp), %ecx 339 353 testl $_TIF_ALLWORK_MASK, %ecx 340 - jnz sysexit_audit 354 + jnz syscall_exit_work_irqs_off 341 355 sysenter_exit: 342 356 /* if something modifies registers it must also disable sysexit */ 343 357 movl PT_EIP(%esp), %edx ··· 347 361 1: mov PT_FS(%esp), %fs 348 362 PTGS_TO_GS 349 363 ENABLE_INTERRUPTS_SYSEXIT 350 - 351 - #ifdef CONFIG_AUDITSYSCALL 352 - sysenter_audit: 353 - testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), TI_flags(%ebp) 354 - jnz syscall_trace_entry 355 - /* movl PT_EAX(%esp), %eax already set, syscall number: 1st arg to audit */ 356 - movl PT_EBX(%esp), %edx /* ebx/a0: 2nd arg to audit */ 357 - /* movl PT_ECX(%esp), %ecx already set, a1: 3nd 
arg to audit */ 358 - pushl PT_ESI(%esp) /* a3: 5th arg */ 359 - pushl PT_EDX+4(%esp) /* a2: 4th arg */ 360 - call __audit_syscall_entry 361 - popl %ecx /* get that remapped edx off the stack */ 362 - popl %ecx /* get that remapped esi off the stack */ 363 - movl PT_EAX(%esp), %eax /* reload syscall number */ 364 - jmp sysenter_do_call 365 - 366 - sysexit_audit: 367 - testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx 368 - jnz syscall_exit_work 369 - TRACE_IRQS_ON 370 - ENABLE_INTERRUPTS(CLBR_ANY) 371 - movl %eax, %edx /* second arg, syscall return value */ 372 - cmpl $-MAX_ERRNO, %eax /* is it an error ? */ 373 - setbe %al /* 1 if so, 0 if not */ 374 - movzbl %al, %eax /* zero-extend that */ 375 - call __audit_syscall_exit 376 - DISABLE_INTERRUPTS(CLBR_ANY) 377 - TRACE_IRQS_OFF 378 - movl TI_flags(%ebp), %ecx 379 - testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx 380 - jnz syscall_exit_work 381 - movl PT_EAX(%esp), %eax /* reload syscall return value */ 382 - jmp sysenter_exit 383 - #endif 384 364 385 365 .pushsection .fixup, "ax" 386 366 2: movl $0, PT_FS(%esp) ··· 373 421 movl %eax, PT_EAX(%esp) # store the return value 374 422 syscall_exit: 375 423 LOCKDEP_SYS_EXIT 376 - DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt 377 - # setting need_resched or sigpending 378 - # between sampling and the iret 379 - TRACE_IRQS_OFF 380 - movl TI_flags(%ebp), %ecx 381 - testl $_TIF_ALLWORK_MASK, %ecx # current->work 382 - jnz syscall_exit_work 424 + jmp syscall_exit_work 383 425 384 426 restore_all: 385 427 TRACE_IRQS_IRET ··· 450 504 #endif 451 505 ENDPROC(entry_INT80_32) 452 506 453 - # perform work that needs to be done immediately before resumption 454 - ALIGN 455 - work_pending: 456 - testb $_TIF_NEED_RESCHED, %cl 457 - jz work_notifysig 458 - work_resched: 459 - call schedule 460 - LOCKDEP_SYS_EXIT 461 - DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt 462 - # setting need_resched or sigpending 463 - # between sampling 
and the iret 464 - TRACE_IRQS_OFF 465 - movl TI_flags(%ebp), %ecx 466 - andl $_TIF_WORK_MASK, %ecx # is there any work to be done other 467 - # than syscall tracing? 468 - jz restore_all 469 - testb $_TIF_NEED_RESCHED, %cl 470 - jnz work_resched 471 - 472 - work_notifysig: # deal with pending signals and 473 - # notify-resume requests 474 - #ifdef CONFIG_VM86 475 - testl $X86_EFLAGS_VM, PT_EFLAGS(%esp) 476 - movl %esp, %eax 477 - jnz work_notifysig_v86 # returning to kernel-space or 478 - # vm86-space 479 - 1: 480 - #else 481 - movl %esp, %eax 482 - #endif 483 - TRACE_IRQS_ON 484 - ENABLE_INTERRUPTS(CLBR_NONE) 485 - movb PT_CS(%esp), %bl 486 - andb $SEGMENT_RPL_MASK, %bl 487 - cmpb $USER_RPL, %bl 488 - jb resume_kernel 489 - xorl %edx, %edx 490 - call do_notify_resume 491 - jmp resume_userspace 492 - 493 - #ifdef CONFIG_VM86 494 - ALIGN 495 - work_notifysig_v86: 496 - pushl %ecx # save ti_flags for do_notify_resume 497 - call save_v86_state # %eax contains pt_regs pointer 498 - popl %ecx 499 - movl %eax, %esp 500 - jmp 1b 501 - #endif 502 - END(work_pending) 503 - 504 507 # perform syscall exit tracing 505 508 ALIGN 506 509 syscall_trace_entry: ··· 464 569 465 570 # perform syscall exit tracing 466 571 ALIGN 467 - syscall_exit_work: 468 - testl $_TIF_WORK_SYSCALL_EXIT, %ecx 469 - jz work_pending 572 + syscall_exit_work_irqs_off: 470 573 TRACE_IRQS_ON 471 - ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call 472 - # schedule() instead 574 + ENABLE_INTERRUPTS(CLBR_ANY) 575 + 576 + syscall_exit_work: 473 577 movl %esp, %eax 474 - call syscall_trace_leave 475 - jmp resume_userspace 578 + call syscall_return_slowpath 579 + jmp restore_all 476 580 END(syscall_exit_work) 477 581 478 582 syscall_fault:
+53 -144
arch/x86/entry/entry_64.S
··· 33 33 #include <asm/paravirt.h> 34 34 #include <asm/percpu.h> 35 35 #include <asm/asm.h> 36 - #include <asm/context_tracking.h> 37 36 #include <asm/smap.h> 38 37 #include <asm/pgtable_types.h> 39 38 #include <linux/err.h> ··· 228 229 */ 229 230 USERGS_SYSRET64 230 231 232 + GLOBAL(int_ret_from_sys_call_irqs_off) 233 + TRACE_IRQS_ON 234 + ENABLE_INTERRUPTS(CLBR_NONE) 235 + jmp int_ret_from_sys_call 236 + 231 237 /* Do syscall entry tracing */ 232 238 tracesys: 233 239 movq %rsp, %rdi ··· 276 272 * Has correct iret frame. 277 273 */ 278 274 GLOBAL(int_ret_from_sys_call) 279 - DISABLE_INTERRUPTS(CLBR_NONE) 280 - int_ret_from_sys_call_irqs_off: /* jumps come here from the irqs-off SYSRET path */ 281 - TRACE_IRQS_OFF 282 - movl $_TIF_ALLWORK_MASK, %edi 283 - /* edi: mask to check */ 284 - GLOBAL(int_with_check) 285 - LOCKDEP_SYS_EXIT_IRQ 286 - GET_THREAD_INFO(%rcx) 287 - movl TI_flags(%rcx), %edx 288 - andl %edi, %edx 289 - jnz int_careful 290 - andl $~TS_COMPAT, TI_status(%rcx) 291 - jmp syscall_return 292 - 293 - /* 294 - * Either reschedule or signal or syscall exit tracking needed. 295 - * First do a reschedule test. 
296 - * edx: work, edi: workmask 297 - */ 298 - int_careful: 299 - bt $TIF_NEED_RESCHED, %edx 300 - jnc int_very_careful 301 - TRACE_IRQS_ON 302 - ENABLE_INTERRUPTS(CLBR_NONE) 303 - pushq %rdi 304 - SCHEDULE_USER 305 - popq %rdi 306 - DISABLE_INTERRUPTS(CLBR_NONE) 307 - TRACE_IRQS_OFF 308 - jmp int_with_check 309 - 310 - /* handle signals and tracing -- both require a full pt_regs */ 311 - int_very_careful: 312 - TRACE_IRQS_ON 313 - ENABLE_INTERRUPTS(CLBR_NONE) 314 275 SAVE_EXTRA_REGS 315 - /* Check for syscall exit trace */ 316 - testl $_TIF_WORK_SYSCALL_EXIT, %edx 317 - jz int_signal 318 - pushq %rdi 319 - leaq 8(%rsp), %rdi /* &ptregs -> arg1 */ 320 - call syscall_trace_leave 321 - popq %rdi 322 - andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU), %edi 323 - jmp int_restore_rest 324 - 325 - int_signal: 326 - testl $_TIF_DO_NOTIFY_MASK, %edx 327 - jz 1f 328 - movq %rsp, %rdi /* &ptregs -> arg1 */ 329 - xorl %esi, %esi /* oldset -> arg2 */ 330 - call do_notify_resume 331 - 1: movl $_TIF_WORK_MASK, %edi 332 - int_restore_rest: 276 + movq %rsp, %rdi 277 + call syscall_return_slowpath /* returns with IRQs disabled */ 333 278 RESTORE_EXTRA_REGS 334 - DISABLE_INTERRUPTS(CLBR_NONE) 335 - TRACE_IRQS_OFF 336 - jmp int_with_check 337 - 338 - syscall_return: 339 - /* The IRETQ could re-enable interrupts: */ 340 - DISABLE_INTERRUPTS(CLBR_ANY) 341 - TRACE_IRQS_IRETQ 279 + TRACE_IRQS_IRETQ /* we're about to change IF */ 342 280 343 281 /* 344 282 * Try to use SYSRET instead of IRET if we're returning to ··· 501 555 /* 0(%rsp): ~(interrupt number) */ 502 556 .macro interrupt func 503 557 cld 504 - /* 505 - * Since nothing in interrupt handling code touches r12...r15 members 506 - * of "struct pt_regs", and since interrupts can nest, we can save 507 - * four stack slots and simultaneously provide 508 - * an unwind-friendly stack layout by saving "truncated" pt_regs 509 - * exactly up to rbp slot, without these members. 
510 - */ 511 - ALLOC_PT_GPREGS_ON_STACK -RBP 512 - SAVE_C_REGS -RBP 513 - /* this goes to 0(%rsp) for unwinder, not for saving the value: */ 514 - SAVE_EXTRA_REGS_RBP -RBP 558 + ALLOC_PT_GPREGS_ON_STACK 559 + SAVE_C_REGS 560 + SAVE_EXTRA_REGS 515 561 516 - leaq -RBP(%rsp), %rdi /* arg1 for \func (pointer to pt_regs) */ 517 - 518 - testb $3, CS-RBP(%rsp) 562 + testb $3, CS(%rsp) 519 563 jz 1f 564 + 565 + /* 566 + * IRQ from user mode. Switch to kernel gsbase and inform context 567 + * tracking that we're in kernel mode. 568 + */ 520 569 SWAPGS 570 + #ifdef CONFIG_CONTEXT_TRACKING 571 + call enter_from_user_mode 572 + #endif 573 + 521 574 1: 522 575 /* 523 576 * Save previous stack pointer, optionally switch to interrupt stack. ··· 525 580 * a little cheaper to use a separate counter in the PDA (short of 526 581 * moving irq_enter into assembly, which would be too much work) 527 582 */ 528 - movq %rsp, %rsi 583 + movq %rsp, %rdi 529 584 incl PER_CPU_VAR(irq_count) 530 585 cmovzq PER_CPU_VAR(irq_stack_ptr), %rsp 531 - pushq %rsi 586 + pushq %rdi 532 587 /* We entered an interrupt context - irqs are off: */ 533 588 TRACE_IRQS_OFF 534 589 535 - call \func 590 + call \func /* rdi points to pt_regs */ 536 591 .endm 537 592 538 593 /* ··· 551 606 decl PER_CPU_VAR(irq_count) 552 607 553 608 /* Restore saved previous stack */ 554 - popq %rsi 555 - /* return code expects complete pt_regs - adjust rsp accordingly: */ 556 - leaq -RBP(%rsi), %rsp 609 + popq %rsp 557 610 558 611 testb $3, CS(%rsp) 559 612 jz retint_kernel 613 + 560 614 /* Interrupt came from user space */ 561 - retint_user: 562 - GET_THREAD_INFO(%rcx) 563 - 564 - /* %rcx: thread info. Interrupts are off. 
*/ 565 - retint_with_reschedule: 566 - movl $_TIF_WORK_MASK, %edi 567 - retint_check: 568 615 LOCKDEP_SYS_EXIT_IRQ 569 - movl TI_flags(%rcx), %edx 570 - andl %edi, %edx 571 - jnz retint_careful 572 - 573 - retint_swapgs: /* return to user-space */ 574 - /* 575 - * The iretq could re-enable interrupts: 576 - */ 577 - DISABLE_INTERRUPTS(CLBR_ANY) 616 + GLOBAL(retint_user) 617 + mov %rsp,%rdi 618 + call prepare_exit_to_usermode 578 619 TRACE_IRQS_IRETQ 579 - 580 620 SWAPGS 581 - jmp restore_c_regs_and_iret 621 + jmp restore_regs_and_iret 582 622 583 623 /* Returning to kernel space */ 584 624 retint_kernel: ··· 587 657 * At this label, code paths which return to kernel and to user, 588 658 * which come from interrupts/exception and from syscalls, merge. 589 659 */ 660 + restore_regs_and_iret: 661 + RESTORE_EXTRA_REGS 590 662 restore_c_regs_and_iret: 591 663 RESTORE_C_REGS 592 664 REMOVE_PT_GPREGS_FROM_STACK 8 ··· 639 707 popq %rax 640 708 jmp native_irq_return_iret 641 709 #endif 642 - 643 - /* edi: workmask, edx: work */ 644 - retint_careful: 645 - bt $TIF_NEED_RESCHED, %edx 646 - jnc retint_signal 647 - TRACE_IRQS_ON 648 - ENABLE_INTERRUPTS(CLBR_NONE) 649 - pushq %rdi 650 - SCHEDULE_USER 651 - popq %rdi 652 - GET_THREAD_INFO(%rcx) 653 - DISABLE_INTERRUPTS(CLBR_NONE) 654 - TRACE_IRQS_OFF 655 - jmp retint_check 656 - 657 - retint_signal: 658 - testl $_TIF_DO_NOTIFY_MASK, %edx 659 - jz retint_swapgs 660 - TRACE_IRQS_ON 661 - ENABLE_INTERRUPTS(CLBR_NONE) 662 - SAVE_EXTRA_REGS 663 - movq $-1, ORIG_RAX(%rsp) 664 - xorl %esi, %esi /* oldset */ 665 - movq %rsp, %rdi /* &pt_regs */ 666 - call do_notify_resume 667 - RESTORE_EXTRA_REGS 668 - DISABLE_INTERRUPTS(CLBR_NONE) 669 - TRACE_IRQS_OFF 670 - GET_THREAD_INFO(%rcx) 671 - jmp retint_with_reschedule 672 - 673 710 END(common_interrupt) 674 711 675 712 /* ··· 1044 1143 SAVE_EXTRA_REGS 8 1045 1144 xorl %ebx, %ebx 1046 1145 testb $3, CS+8(%rsp) 1047 - jz error_kernelspace 1146 + jz .Lerror_kernelspace 1048 1147 1049 - /* We 
entered from user mode */ 1148 + .Lerror_entry_from_usermode_swapgs: 1149 + /* 1150 + * We entered from user mode or we're pretending to have entered 1151 + * from user mode due to an IRET fault. 1152 + */ 1050 1153 SWAPGS 1051 1154 1052 - error_entry_done: 1155 + .Lerror_entry_from_usermode_after_swapgs: 1156 + #ifdef CONFIG_CONTEXT_TRACKING 1157 + call enter_from_user_mode 1158 + #endif 1159 + 1160 + .Lerror_entry_done: 1161 + 1053 1162 TRACE_IRQS_OFF 1054 1163 ret 1055 1164 ··· 1069 1158 * truncated RIP for IRET exceptions returning to compat mode. Check 1070 1159 * for these here too. 1071 1160 */ 1072 - error_kernelspace: 1161 + .Lerror_kernelspace: 1073 1162 incl %ebx 1074 1163 leaq native_irq_return_iret(%rip), %rcx 1075 1164 cmpq %rcx, RIP+8(%rsp) 1076 - je error_bad_iret 1165 + je .Lerror_bad_iret 1077 1166 movl %ecx, %eax /* zero extend */ 1078 1167 cmpq %rax, RIP+8(%rsp) 1079 - je bstep_iret 1168 + je .Lbstep_iret 1080 1169 cmpq $gs_change, RIP+8(%rsp) 1081 - jne error_entry_done 1170 + jne .Lerror_entry_done 1082 1171 1083 1172 /* 1084 1173 * hack: gs_change can fail with user gsbase. If this happens, fix up 1085 1174 * gsbase and proceed. We'll fix up the exception and land in 1086 1175 * gs_change's error handler with kernel gsbase. 1087 1176 */ 1088 - SWAPGS 1089 - jmp error_entry_done 1177 + jmp .Lerror_entry_from_usermode_swapgs 1090 1178 1091 - bstep_iret: 1179 + .Lbstep_iret: 1092 1180 /* Fix truncated RIP */ 1093 1181 movq %rcx, RIP+8(%rsp) 1094 1182 /* fall through */ 1095 1183 1096 - error_bad_iret: 1184 + .Lerror_bad_iret: 1097 1185 /* 1098 1186 * We came from an IRET to user mode, so we have user gsbase. 
1099 1187 * Switch to kernel gsbase: ··· 1108 1198 call fixup_bad_iret 1109 1199 mov %rax, %rsp 1110 1200 decl %ebx 1111 - jmp error_entry_done 1201 + jmp .Lerror_entry_from_usermode_after_swapgs 1112 1202 END(error_entry) 1113 1203 1114 1204 ··· 1119 1209 */ 1120 1210 ENTRY(error_exit) 1121 1211 movl %ebx, %eax 1122 - RESTORE_EXTRA_REGS 1123 1212 DISABLE_INTERRUPTS(CLBR_NONE) 1124 1213 TRACE_IRQS_OFF 1125 1214 testl %eax, %eax
+52 -9
arch/x86/entry/entry_64_compat.S
··· 22 22 #define __AUDIT_ARCH_LE 0x40000000 23 23 24 24 #ifndef CONFIG_AUDITSYSCALL 25 - # define sysexit_audit ia32_ret_from_sys_call 26 - # define sysretl_audit ia32_ret_from_sys_call 25 + # define sysexit_audit ia32_ret_from_sys_call_irqs_off 26 + # define sysretl_audit ia32_ret_from_sys_call_irqs_off 27 27 #endif 28 28 29 29 .section .entry.text, "ax" ··· 141 141 andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) 142 142 movl RIP(%rsp), %ecx /* User %eip */ 143 143 movq RAX(%rsp), %rax 144 - RESTORE_RSI_RDI 144 + movl RSI(%rsp), %esi 145 + movl RDI(%rsp), %edi 145 146 xorl %edx, %edx /* Do not leak kernel information */ 146 147 xorq %r8, %r8 147 148 xorq %r9, %r9 ··· 210 209 .endm 211 210 212 211 .macro auditsys_exit exit 213 - testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) 214 - jnz ia32_ret_from_sys_call 215 212 TRACE_IRQS_ON 216 213 ENABLE_INTERRUPTS(CLBR_NONE) 214 + testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) 215 + jnz ia32_ret_from_sys_call 217 216 movl %eax, %esi /* second arg, syscall return value */ 218 217 cmpl $-MAX_ERRNO, %eax /* is it an error ? */ 219 218 jbe 1f ··· 231 230 movq %rax, R10(%rsp) 232 231 movq %rax, R9(%rsp) 233 232 movq %rax, R8(%rsp) 234 - jmp int_with_check 233 + jmp int_ret_from_sys_call_irqs_off 235 234 .endm 236 235 237 236 sysenter_auditsys: ··· 366 365 367 366 sysretl_from_sys_call: 368 367 andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) 369 - RESTORE_RSI_RDI_RDX 368 + movl RDX(%rsp), %edx 369 + movl RSI(%rsp), %esi 370 + movl RDI(%rsp), %edi 370 371 movl RIP(%rsp), %ecx 371 372 movl EFLAGS(%rsp), %r11d 372 373 movq RAX(%rsp), %rax ··· 433 430 END(entry_SYSCALL_compat) 434 431 435 432 ia32_badarg: 436 - ASM_CLAC 437 - movq $-EFAULT, RAX(%rsp) 433 + /* 434 + * So far, we've entered kernel mode, set AC, turned on IRQs, and 435 + * saved C regs except r8-r11. 
We haven't done any of the other 436 + * standard entry work, though. We want to bail, but we shouldn't 437 + * treat this as a syscall entry since we don't even know what the 438 + * args are. Instead, treat this as a non-syscall entry, finish 439 + * the entry work, and immediately exit after setting AX = -EFAULT. 440 + * 441 + * We're really just being polite here. Killing the task outright 442 + * would be a reasonable action, too. Given that the only valid 443 + * way to have gotten here is through the vDSO, and we already know 444 + * that the stack pointer is bad, the task isn't going to survive 445 + * for long no matter what we do. 446 + */ 447 + 448 + ASM_CLAC /* undo STAC */ 449 + movq $-EFAULT, RAX(%rsp) /* return -EFAULT if possible */ 450 + 451 + /* Fill in the rest of pt_regs */ 452 + xorl %eax, %eax 453 + movq %rax, R11(%rsp) 454 + movq %rax, R10(%rsp) 455 + movq %rax, R9(%rsp) 456 + movq %rax, R8(%rsp) 457 + SAVE_EXTRA_REGS 458 + 459 + /* Turn IRQs back off. */ 460 + DISABLE_INTERRUPTS(CLBR_NONE) 461 + TRACE_IRQS_OFF 462 + 463 + /* Now finish entering normal kernel mode. */ 464 + #ifdef CONFIG_CONTEXT_TRACKING 465 + call enter_from_user_mode 466 + #endif 467 + 468 + /* And exit again. */ 469 + jmp retint_user 470 + 471 + ia32_ret_from_sys_call_irqs_off: 472 + TRACE_IRQS_ON 473 + ENABLE_INTERRUPTS(CLBR_NONE) 474 + 438 475 ia32_ret_from_sys_call: 439 476 xorl %eax, %eax /* Do not leak kernel information */ 440 477 movq %rax, R11(%rsp)
+15
arch/x86/entry/syscalls/syscall_32.tbl
··· 365 365 356 i386 memfd_create sys_memfd_create 366 366 357 i386 bpf sys_bpf 367 367 358 i386 execveat sys_execveat stub32_execveat 368 + 359 i386 socket sys_socket 369 + 360 i386 socketpair sys_socketpair 370 + 361 i386 bind sys_bind 371 + 362 i386 connect sys_connect 372 + 363 i386 listen sys_listen 373 + 364 i386 accept4 sys_accept4 374 + 365 i386 getsockopt sys_getsockopt compat_sys_getsockopt 375 + 366 i386 setsockopt sys_setsockopt compat_sys_setsockopt 376 + 367 i386 getsockname sys_getsockname 377 + 368 i386 getpeername sys_getpeername 378 + 369 i386 sendto sys_sendto 379 + 370 i386 sendmsg sys_sendmsg compat_sys_sendmsg 380 + 371 i386 recvfrom sys_recvfrom compat_sys_recvfrom 381 + 372 i386 recvmsg sys_recvmsg compat_sys_recvmsg 382 + 373 i386 shutdown sys_shutdown
+4 -4
arch/x86/entry/vdso/Makefile
··· 8 8 VDSO64-$(CONFIG_X86_64) := y 9 9 VDSOX32-$(CONFIG_X86_X32_ABI) := y 10 10 VDSO32-$(CONFIG_X86_32) := y 11 - VDSO32-$(CONFIG_COMPAT) := y 11 + VDSO32-$(CONFIG_IA32_EMULATION) := y 12 12 13 13 # files to link into the vdso 14 14 vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o ··· 20 20 vdso_img-$(VDSO64-y) += 64 21 21 vdso_img-$(VDSOX32-y) += x32 22 22 vdso_img-$(VDSO32-y) += 32-int80 23 - vdso_img-$(CONFIG_COMPAT) += 32-syscall 23 + vdso_img-$(CONFIG_IA32_EMULATION) += 32-syscall 24 24 vdso_img-$(VDSO32-y) += 32-sysenter 25 25 26 26 obj-$(VDSO32-y) += vdso32-setup.o ··· 126 126 # Build multiple 32-bit vDSO images to choose from at boot time. 127 127 # 128 128 vdso32.so-$(VDSO32-y) += int80 129 - vdso32.so-$(CONFIG_COMPAT) += syscall 129 + vdso32.so-$(CONFIG_IA32_EMULATION) += syscall 130 130 vdso32.so-$(VDSO32-y) += sysenter 131 131 132 132 vdso32-images = $(vdso32.so-y:%=vdso32-%.so) ··· 175 175 -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^) && \ 176 176 sh $(srctree)/$(src)/checkundef.sh '$(NM)' '$@' 177 177 178 - VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=sysv) \ 178 + VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=both) \ 179 179 $(call cc-ldoption, -Wl$(comma)--build-id) -Wl,-Bsymbolic $(LTO_CFLAGS) 180 180 GCOV_PROFILE := n 181 181
+2 -14
arch/x86/entry/vdso/vclock_gettime.c
··· 175 175 176 176 notrace static cycle_t vread_tsc(void) 177 177 { 178 - cycle_t ret; 179 - u64 last; 180 - 181 - /* 182 - * Empirically, a fence (of type that depends on the CPU) 183 - * before rdtsc is enough to ensure that rdtsc is ordered 184 - * with respect to loads. The various CPU manuals are unclear 185 - * as to whether rdtsc can be reordered with later loads, 186 - * but no one has ever seen it happen. 187 - */ 188 - rdtsc_barrier(); 189 - ret = (cycle_t)__native_read_tsc(); 190 - 191 - last = gtod->cycle_last; 178 + cycle_t ret = (cycle_t)rdtsc_ordered(); 179 + u64 last = gtod->cycle_last; 192 180 193 181 if (likely(ret >= last)) 194 182 return ret;
+5 -2
arch/x86/entry/vdso/vma.c
··· 177 177 return ret; 178 178 } 179 179 180 - #if defined(CONFIG_X86_32) || defined(CONFIG_COMPAT) 180 + #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) 181 181 static int load_vdso32(void) 182 182 { 183 183 int ret; ··· 219 219 return map_vdso(&vdso_image_x32, true); 220 220 } 221 221 #endif 222 - 222 + #ifdef CONFIG_IA32_EMULATION 223 223 return load_vdso32(); 224 + #else 225 + return 0; 226 + #endif 224 227 } 225 228 #endif 226 229 #else
+1 -1
arch/x86/entry/vsyscall/vsyscall_64.c
··· 290 290 291 291 struct vm_area_struct *get_gate_vma(struct mm_struct *mm) 292 292 { 293 - #ifdef CONFIG_IA32_EMULATION 293 + #ifdef CONFIG_COMPAT 294 294 if (!mm || mm->context.ia32_compat) 295 295 return NULL; 296 296 #endif
-93
arch/x86/ia32/ia32_signal.c
··· 34 34 #include <asm/sys_ia32.h> 35 35 #include <asm/smap.h> 36 36 37 - int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from) 38 - { 39 - int err = 0; 40 - bool ia32 = test_thread_flag(TIF_IA32); 41 - 42 - if (!access_ok(VERIFY_WRITE, to, sizeof(compat_siginfo_t))) 43 - return -EFAULT; 44 - 45 - put_user_try { 46 - /* If you change siginfo_t structure, please make sure that 47 - this code is fixed accordingly. 48 - It should never copy any pad contained in the structure 49 - to avoid security leaks, but must copy the generic 50 - 3 ints plus the relevant union member. */ 51 - put_user_ex(from->si_signo, &to->si_signo); 52 - put_user_ex(from->si_errno, &to->si_errno); 53 - put_user_ex((short)from->si_code, &to->si_code); 54 - 55 - if (from->si_code < 0) { 56 - put_user_ex(from->si_pid, &to->si_pid); 57 - put_user_ex(from->si_uid, &to->si_uid); 58 - put_user_ex(ptr_to_compat(from->si_ptr), &to->si_ptr); 59 - } else { 60 - /* 61 - * First 32bits of unions are always present: 62 - * si_pid === si_band === si_tid === si_addr(LS half) 63 - */ 64 - put_user_ex(from->_sifields._pad[0], 65 - &to->_sifields._pad[0]); 66 - switch (from->si_code >> 16) { 67 - case __SI_FAULT >> 16: 68 - break; 69 - case __SI_SYS >> 16: 70 - put_user_ex(from->si_syscall, &to->si_syscall); 71 - put_user_ex(from->si_arch, &to->si_arch); 72 - break; 73 - case __SI_CHLD >> 16: 74 - if (ia32) { 75 - put_user_ex(from->si_utime, &to->si_utime); 76 - put_user_ex(from->si_stime, &to->si_stime); 77 - } else { 78 - put_user_ex(from->si_utime, &to->_sifields._sigchld_x32._utime); 79 - put_user_ex(from->si_stime, &to->_sifields._sigchld_x32._stime); 80 - } 81 - put_user_ex(from->si_status, &to->si_status); 82 - /* FALL THROUGH */ 83 - default: 84 - case __SI_KILL >> 16: 85 - put_user_ex(from->si_uid, &to->si_uid); 86 - break; 87 - case __SI_POLL >> 16: 88 - put_user_ex(from->si_fd, &to->si_fd); 89 - break; 90 - case __SI_TIMER >> 16: 91 - put_user_ex(from->si_overrun, 
&to->si_overrun); 92 - put_user_ex(ptr_to_compat(from->si_ptr), 93 - &to->si_ptr); 94 - break; 95 - /* This is not generated by the kernel as of now. */ 96 - case __SI_RT >> 16: 97 - case __SI_MESGQ >> 16: 98 - put_user_ex(from->si_uid, &to->si_uid); 99 - put_user_ex(from->si_int, &to->si_int); 100 - break; 101 - } 102 - } 103 - } put_user_catch(err); 104 - 105 - return err; 106 - } 107 - 108 - int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from) 109 - { 110 - int err = 0; 111 - u32 ptr32; 112 - 113 - if (!access_ok(VERIFY_READ, from, sizeof(compat_siginfo_t))) 114 - return -EFAULT; 115 - 116 - get_user_try { 117 - get_user_ex(to->si_signo, &from->si_signo); 118 - get_user_ex(to->si_errno, &from->si_errno); 119 - get_user_ex(to->si_code, &from->si_code); 120 - 121 - get_user_ex(to->si_pid, &from->si_pid); 122 - get_user_ex(to->si_uid, &from->si_uid); 123 - get_user_ex(ptr32, &from->si_ptr); 124 - to->si_ptr = compat_ptr(ptr32); 125 - } get_user_catch(err); 126 - 127 - return err; 128 - } 129 - 130 37 /* 131 38 * Do a signal return; undo the signal stack. 132 39 */
-11
arch/x86/include/asm/barrier.h
··· 91 91 #define smp_mb__before_atomic() barrier() 92 92 #define smp_mb__after_atomic() barrier() 93 93 94 - /* 95 - * Stop RDTSC speculation. This is needed when you need to use RDTSC 96 - * (or get_cycles or vread that possibly accesses the TSC) in a defined 97 - * code region. 98 - */ 99 - static __always_inline void rdtsc_barrier(void) 100 - { 101 - alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC, 102 - "lfence", X86_FEATURE_LFENCE_RDTSC); 103 - } 104 - 105 94 #endif /* _ASM_X86_BARRIER_H */
-10
arch/x86/include/asm/context_tracking.h
··· 1 - #ifndef _ASM_X86_CONTEXT_TRACKING_H 2 - #define _ASM_X86_CONTEXT_TRACKING_H 3 - 4 - #ifdef CONFIG_CONTEXT_TRACKING 5 - # define SCHEDULE_USER call schedule_user 6 - #else 7 - # define SCHEDULE_USER call schedule 8 - #endif 9 - 10 - #endif
+1
arch/x86/include/asm/cpufeature.h
··· 176 176 #define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */ 177 177 #define X86_FEATURE_BPEXT (6*32+26) /* data breakpoint extension */ 178 178 #define X86_FEATURE_PERFCTR_L2 ( 6*32+28) /* L2 performance counter extensions */ 179 + #define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX) */ 179 180 180 181 /* 181 182 * Auxiliary flags: Linux defined - For features scattered in various
+1
arch/x86/include/asm/delay.h
··· 4 4 #include <asm-generic/delay.h> 5 5 6 6 void use_tsc_delay(void); 7 + void use_mwaitx_delay(void); 7 8 8 9 #endif /* _ASM_X86_DELAY_H */
+6 -11
arch/x86/include/asm/elf.h
··· 78 78 #ifdef CONFIG_X86_64 79 79 extern unsigned int vdso64_enabled; 80 80 #endif 81 - #if defined(CONFIG_X86_32) || defined(CONFIG_COMPAT) 81 + #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) 82 82 extern unsigned int vdso32_enabled; 83 83 #endif 84 84 ··· 187 187 #define COMPAT_ELF_PLAT_INIT(regs, load_addr) \ 188 188 elf_common_init(&current->thread, regs, __USER_DS) 189 189 190 - void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp); 191 - #define compat_start_thread start_thread_ia32 190 + void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp); 191 + #define compat_start_thread compat_start_thread 192 192 193 193 void set_personality_ia32(bool); 194 194 #define COMPAT_SET_PERSONALITY(ex) \ ··· 344 344 */ 345 345 static inline int mmap_is_ia32(void) 346 346 { 347 - #ifdef CONFIG_X86_32 348 - return 1; 349 - #endif 350 - #ifdef CONFIG_IA32_EMULATION 351 - if (test_thread_flag(TIF_ADDR32)) 352 - return 1; 353 - #endif 354 - return 0; 347 + return config_enabled(CONFIG_X86_32) || 348 + (config_enabled(CONFIG_COMPAT) && 349 + test_thread_flag(TIF_ADDR32)); 355 350 } 356 351 357 352 /* Do not change the values. See get_align_mask() */
-9
arch/x86/include/asm/ia32.h
··· 22 22 compat_sigset_t uc_sigmask; /* mask last for extensibility */ 23 23 }; 24 24 25 - struct ucontext_x32 { 26 - unsigned int uc_flags; 27 - unsigned int uc_link; 28 - compat_stack_t uc_stack; 29 - unsigned int uc__pad0; /* needed for alignment */ 30 - struct sigcontext uc_mcontext; /* the 64-bit sigcontext type */ 31 - compat_sigset_t uc_sigmask; /* mask last for extensibility */ 32 - }; 33 - 34 25 /* This matches struct stat64 in glibc2.2, hence the absolutely 35 26 * insane amounts of padding around dev_t's. 36 27 */
-10
arch/x86/include/asm/irq_vectors.h
··· 117 117 118 118 #define FPU_IRQ 13 119 119 120 - #define FIRST_VM86_IRQ 3 121 - #define LAST_VM86_IRQ 15 122 - 123 - #ifndef __ASSEMBLY__ 124 - static inline int invalid_vm86_irq(int irq) 125 - { 126 - return irq < FIRST_VM86_IRQ || irq > LAST_VM86_IRQ; 127 - } 128 - #endif 129 - 130 120 /* 131 121 * Size the maximum number of interrupts. 132 122 *
+1 -5
arch/x86/include/asm/math_emu.h
··· 2 2 #define _ASM_X86_MATH_EMU_H 3 3 4 4 #include <asm/ptrace.h> 5 - #include <asm/vm86.h> 6 5 7 6 /* This structure matches the layout of the data saved to the stack 8 7 following a device-not-present interrupt, part of it saved ··· 9 10 */ 10 11 struct math_emu_info { 11 12 long ___orig_eip; 12 - union { 13 - struct pt_regs *regs; 14 - struct kernel_vm86_regs *vm86; 15 - }; 13 + struct pt_regs *regs; 16 14 }; 17 15 #endif /* _ASM_X86_MATH_EMU_H */
+2
arch/x86/include/asm/mmu.h
··· 9 9 * we put the segment information here. 10 10 */ 11 11 typedef struct { 12 + #ifdef CONFIG_MODIFY_LDT_SYSCALL 12 13 struct ldt_struct *ldt; 14 + #endif 13 15 14 16 #ifdef CONFIG_X86_64 15 17 /* True if mm supports a task running in 32 bit compatibility mode. */
+21 -7
arch/x86/include/asm/mmu_context.h
··· 33 33 static inline void load_mm_cr4(struct mm_struct *mm) {} 34 34 #endif 35 35 36 + #ifdef CONFIG_MODIFY_LDT_SYSCALL 36 37 /* 37 38 * ldt_structs can be allocated, used, and freed, but they are never 38 39 * modified while live. ··· 49 48 int size; 50 49 }; 51 50 51 + /* 52 + * Used for LDT copy/destruction. 53 + */ 54 + int init_new_context(struct task_struct *tsk, struct mm_struct *mm); 55 + void destroy_context(struct mm_struct *mm); 56 + #else /* CONFIG_MODIFY_LDT_SYSCALL */ 57 + static inline int init_new_context(struct task_struct *tsk, 58 + struct mm_struct *mm) 59 + { 60 + return 0; 61 + } 62 + static inline void destroy_context(struct mm_struct *mm) {} 63 + #endif 64 + 52 65 static inline void load_mm_ldt(struct mm_struct *mm) 53 66 { 67 + #ifdef CONFIG_MODIFY_LDT_SYSCALL 54 68 struct ldt_struct *ldt; 55 69 56 70 /* lockless_dereference synchronizes with smp_store_release */ ··· 89 73 set_ldt(ldt->entries, ldt->size); 90 74 else 91 75 clear_LDT(); 76 + #else 77 + clear_LDT(); 78 + #endif 92 79 93 80 DEBUG_LOCKS_WARN_ON(preemptible()); 94 81 } 95 - 96 - /* 97 - * Used for LDT copy/destruction. 98 - */ 99 - int init_new_context(struct task_struct *tsk, struct mm_struct *mm); 100 - void destroy_context(struct mm_struct *mm); 101 - 102 82 103 83 static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) 104 84 { ··· 126 114 /* Load per-mm CR4 state */ 127 115 load_mm_cr4(next); 128 116 117 + #ifdef CONFIG_MODIFY_LDT_SYSCALL 129 118 /* 130 119 * Load the LDT, if the LDT is different. 131 120 * ··· 141 128 */ 142 129 if (unlikely(prev->context.ldt != next->context.ldt)) 143 130 load_mm_ldt(next); 131 + #endif 144 132 } 145 133 #ifdef CONFIG_SMP 146 134 else {
+46 -24
arch/x86/include/asm/msr.h
··· 47 47 * it means rax *or* rdx. 48 48 */ 49 49 #ifdef CONFIG_X86_64 50 - #define DECLARE_ARGS(val, low, high) unsigned low, high 51 - #define EAX_EDX_VAL(val, low, high) ((low) | ((u64)(high) << 32)) 52 - #define EAX_EDX_ARGS(val, low, high) "a" (low), "d" (high) 50 + /* Using 64-bit values saves one instruction clearing the high half of low */ 51 + #define DECLARE_ARGS(val, low, high) unsigned long low, high 52 + #define EAX_EDX_VAL(val, low, high) ((low) | (high) << 32) 53 53 #define EAX_EDX_RET(val, low, high) "=a" (low), "=d" (high) 54 54 #else 55 55 #define DECLARE_ARGS(val, low, high) unsigned long long val 56 56 #define EAX_EDX_VAL(val, low, high) (val) 57 - #define EAX_EDX_ARGS(val, low, high) "A" (val) 58 57 #define EAX_EDX_RET(val, low, high) "=A" (val) 59 58 #endif 60 59 ··· 105 106 return err; 106 107 } 107 108 108 - extern unsigned long long native_read_tsc(void); 109 - 110 109 extern int rdmsr_safe_regs(u32 regs[8]); 111 110 extern int wrmsr_safe_regs(u32 regs[8]); 112 111 113 - static __always_inline unsigned long long __native_read_tsc(void) 112 + /** 113 + * rdtsc() - returns the current TSC without ordering constraints 114 + * 115 + * rdtsc() returns the result of RDTSC as a 64-bit integer. The 116 + * only ordering constraint it supplies is the ordering implied by 117 + * "asm volatile": it will put the RDTSC in the place you expect. The 118 + * CPU can and will speculatively execute that RDTSC, though, so the 119 + * results can be non-monotonic if compared on different CPUs. 120 + */ 121 + static __always_inline unsigned long long rdtsc(void) 114 122 { 115 123 DECLARE_ARGS(val, low, high); 116 124 ··· 125 119 126 120 return EAX_EDX_VAL(val, low, high); 127 121 } 122 + 123 + /** 124 + * rdtsc_ordered() - read the current TSC in program order 125 + * 126 + * rdtsc_ordered() returns the result of RDTSC as a 64-bit integer. 127 + * It is ordered like a load to a global in-memory counter. 
It should 128 + * be impossible to observe non-monotonic rdtsc_unordered() behavior 129 + * across multiple CPUs as long as the TSC is synced. 130 + */ 131 + static __always_inline unsigned long long rdtsc_ordered(void) 132 + { 133 + /* 134 + * The RDTSC instruction is not ordered relative to memory 135 + * access. The Intel SDM and the AMD APM are both vague on this 136 + * point, but empirically an RDTSC instruction can be 137 + * speculatively executed before prior loads. An RDTSC 138 + * immediately after an appropriate barrier appears to be 139 + * ordered as a normal load, that is, it provides the same 140 + * ordering guarantees as reading from a global memory location 141 + * that some other imaginary CPU is updating continuously with a 142 + * time stamp. 143 + */ 144 + alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC, 145 + "lfence", X86_FEATURE_LFENCE_RDTSC); 146 + return rdtsc(); 147 + } 148 + 149 + /* Deprecated, keep it for a cycle for easier merging: */ 150 + #define rdtscll(now) do { (now) = rdtsc_ordered(); } while (0) 128 151 129 152 static inline unsigned long long native_read_pmc(int counter) 130 153 { ··· 188 153 #define rdmsrl(msr, val) \ 189 154 ((val) = native_read_msr((msr))) 190 155 191 - #define wrmsrl(msr, val) \ 192 - native_write_msr((msr), (u32)((u64)(val)), (u32)((u64)(val) >> 32)) 156 + static inline void wrmsrl(unsigned msr, u64 val) 157 + { 158 + native_write_msr(msr, (u32)val, (u32)(val >> 32)); 159 + } 193 160 194 161 /* wrmsr with exception handling */ 195 162 static inline int wrmsr_safe(unsigned msr, unsigned low, unsigned high) ··· 217 180 return err; 218 181 } 219 182 220 - #define rdtscl(low) \ 221 - ((low) = (u32)__native_read_tsc()) 222 - 223 - #define rdtscll(val) \ 224 - ((val) = __native_read_tsc()) 225 - 226 183 #define rdpmc(counter, low, high) \ 227 184 do { \ 228 185 u64 _l = native_read_pmc((counter)); \ ··· 225 194 } while (0) 226 195 227 196 #define rdpmcl(counter, val) ((val) = native_read_pmc(counter)) 
228 - 229 - #define rdtscp(low, high, aux) \ 230 - do { \ 231 - unsigned long long _val = native_read_tscp(&(aux)); \ 232 - (low) = (u32)_val; \ 233 - (high) = (u32)(_val >> 32); \ 234 - } while (0) 235 - 236 - #define rdtscpll(val, aux) (val) = native_read_tscp(&(aux)) 237 197 238 198 #endif /* !CONFIG_PARAVIRT */ 239 199
+45
arch/x86/include/asm/mwait.h
··· 14 14 #define CPUID5_ECX_INTERRUPT_BREAK 0x2 15 15 16 16 #define MWAIT_ECX_INTERRUPT_BREAK 0x1 17 + #define MWAITX_ECX_TIMER_ENABLE BIT(1) 18 + #define MWAITX_MAX_LOOPS ((u32)-1) 19 + #define MWAITX_DISABLE_CSTATES 0xf 17 20 18 21 static inline void __monitor(const void *eax, unsigned long ecx, 19 22 unsigned long edx) ··· 26 23 :: "a" (eax), "c" (ecx), "d"(edx)); 27 24 } 28 25 26 + static inline void __monitorx(const void *eax, unsigned long ecx, 27 + unsigned long edx) 28 + { 29 + /* "monitorx %eax, %ecx, %edx;" */ 30 + asm volatile(".byte 0x0f, 0x01, 0xfa;" 31 + :: "a" (eax), "c" (ecx), "d"(edx)); 32 + } 33 + 29 34 static inline void __mwait(unsigned long eax, unsigned long ecx) 30 35 { 31 36 /* "mwait %eax, %ecx;" */ 32 37 asm volatile(".byte 0x0f, 0x01, 0xc9;" 33 38 :: "a" (eax), "c" (ecx)); 39 + } 40 + 41 + /* 42 + * MWAITX allows for a timer expiration to get the core out a wait state in 43 + * addition to the default MWAIT exit condition of a store appearing at a 44 + * monitored virtual address. 45 + * 46 + * Registers: 47 + * 48 + * MWAITX ECX[1]: enable timer if set 49 + * MWAITX EBX[31:0]: max wait time expressed in SW P0 clocks. The software P0 50 + * frequency is the same as the TSC frequency. 
51 + * 52 + * Below is a comparison between MWAIT and MWAITX on AMD processors: 53 + * 54 + * MWAIT MWAITX 55 + * opcode 0f 01 c9 | 0f 01 fb 56 + * ECX[0] value of RFLAGS.IF seen by instruction 57 + * ECX[1] unused/#GP if set | enable timer if set 58 + * ECX[31:2] unused/#GP if set 59 + * EAX unused (reserve for hint) 60 + * EBX[31:0] unused | max wait time (P0 clocks) 61 + * 62 + * MONITOR MONITORX 63 + * opcode 0f 01 c8 | 0f 01 fa 64 + * EAX (logical) address to monitor 65 + * ECX #GP if not zero 66 + */ 67 + static inline void __mwaitx(unsigned long eax, unsigned long ebx, 68 + unsigned long ecx) 69 + { 70 + /* "mwaitx %eax, %ebx, %ecx;" */ 71 + asm volatile(".byte 0x0f, 0x01, 0xfb;" 72 + :: "a" (eax), "b" (ebx), "c" (ecx)); 34 73 } 35 74 36 75 static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
+5 -35
arch/x86/include/asm/paravirt.h
··· 153 153 val = paravirt_read_msr(msr, &_err); \ 154 154 } while (0) 155 155 156 - #define wrmsrl(msr, val) wrmsr(msr, (u32)((u64)(val)), ((u64)(val))>>32) 156 + static inline void wrmsrl(unsigned msr, u64 val) 157 + { 158 + wrmsr(msr, (u32)val, (u32)(val>>32)); 159 + } 160 + 157 161 #define wrmsr_safe(msr, a, b) paravirt_write_msr(msr, a, b) 158 162 159 163 /* rdmsr with exception handling */ ··· 177 173 *p = paravirt_read_msr(msr, &err); 178 174 return err; 179 175 } 180 - 181 - static inline u64 paravirt_read_tsc(void) 182 - { 183 - return PVOP_CALL0(u64, pv_cpu_ops.read_tsc); 184 - } 185 - 186 - #define rdtscl(low) \ 187 - do { \ 188 - u64 _l = paravirt_read_tsc(); \ 189 - low = (int)_l; \ 190 - } while (0) 191 - 192 - #define rdtscll(val) (val = paravirt_read_tsc()) 193 176 194 177 static inline unsigned long long paravirt_sched_clock(void) 195 178 { ··· 205 214 } while (0) 206 215 207 216 #define rdpmcl(counter, val) ((val) = paravirt_read_pmc(counter)) 208 - 209 - static inline unsigned long long paravirt_rdtscp(unsigned int *aux) 210 - { 211 - return PVOP_CALL1(u64, pv_cpu_ops.read_tscp, aux); 212 - } 213 - 214 - #define rdtscp(low, high, aux) \ 215 - do { \ 216 - int __aux; \ 217 - unsigned long __val = paravirt_rdtscp(&__aux); \ 218 - (low) = (u32)__val; \ 219 - (high) = (u32)(__val >> 32); \ 220 - (aux) = __aux; \ 221 - } while (0) 222 - 223 - #define rdtscpll(val, aux) \ 224 - do { \ 225 - unsigned long __aux; \ 226 - val = paravirt_rdtscp(&__aux); \ 227 - (aux) = __aux; \ 228 - } while (0) 229 217 230 218 static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries) 231 219 {
-2
arch/x86/include/asm/paravirt_types.h
··· 156 156 u64 (*read_msr)(unsigned int msr, int *err); 157 157 int (*write_msr)(unsigned int msr, unsigned low, unsigned high); 158 158 159 - u64 (*read_tsc)(void); 160 159 u64 (*read_pmc)(int counter); 161 - unsigned long long (*read_tscp)(unsigned int *aux); 162 160 163 161 #ifdef CONFIG_X86_32 164 162 /*
+3 -10
arch/x86/include/asm/processor.h
··· 6 6 /* Forward declaration, a strange C thing */ 7 7 struct task_struct; 8 8 struct mm_struct; 9 + struct vm86; 9 10 10 - #include <asm/vm86.h> 11 11 #include <asm/math_emu.h> 12 12 #include <asm/segment.h> 13 13 #include <asm/types.h> ··· 400 400 unsigned long cr2; 401 401 unsigned long trap_nr; 402 402 unsigned long error_code; 403 - #ifdef CONFIG_X86_32 403 + #ifdef CONFIG_VM86 404 404 /* Virtual 86 mode info */ 405 - struct vm86_struct __user *vm86_info; 406 - unsigned long screen_bitmap; 407 - unsigned long v86flags; 408 - unsigned long v86mask; 409 - unsigned long saved_sp0; 410 - unsigned int saved_fs; 411 - unsigned int saved_gs; 405 + struct vm86 *vm86; 412 406 #endif 413 407 /* IO permissions: */ 414 408 unsigned long *io_bitmap_ptr; ··· 714 720 715 721 #define INIT_THREAD { \ 716 722 .sp0 = TOP_OF_INIT_STACK, \ 717 - .vm86_info = NULL, \ 718 723 .sysenter_cs = __KERNEL_CS, \ 719 724 .io_bitmap_ptr = NULL, \ 720 725 }
-1
arch/x86/include/asm/ptrace.h
··· 88 88 unsigned long phase1_result); 89 89 90 90 extern long syscall_trace_enter(struct pt_regs *); 91 - extern void syscall_trace_leave(struct pt_regs *); 92 91 93 92 static inline unsigned long regs_return_value(struct pt_regs *regs) 94 93 {
+2 -8
arch/x86/include/asm/pvclock.h
··· 62 62 static __always_inline 63 63 u64 pvclock_get_nsec_offset(const struct pvclock_vcpu_time_info *src) 64 64 { 65 - u64 delta = __native_read_tsc() - src->tsc_timestamp; 65 + u64 delta = rdtsc_ordered() - src->tsc_timestamp; 66 66 return pvclock_scale_delta(delta, src->tsc_to_system_mul, 67 67 src->tsc_shift); 68 68 } ··· 76 76 u8 ret_flags; 77 77 78 78 version = src->version; 79 - /* Note: emulated platforms which do not advertise SSE2 support 80 - * result in kvmclock not using the necessary RDTSC barriers. 81 - * Without barriers, it is possible that RDTSC instruction reads from 82 - * the time stamp counter outside rdtsc_barrier protected section 83 - * below, resulting in violation of monotonicity. 84 - */ 85 - rdtsc_barrier(); 79 + 86 80 offset = pvclock_get_nsec_offset(src); 87 81 ret = src->system_time + offset; 88 82 ret_flags = src->flags;
+10
arch/x86/include/asm/sigframe.h
··· 4 4 #include <asm/sigcontext.h> 5 5 #include <asm/siginfo.h> 6 6 #include <asm/ucontext.h> 7 + #include <linux/compat.h> 7 8 8 9 #ifdef CONFIG_X86_32 9 10 #define sigframe_ia32 sigframe ··· 69 68 }; 70 69 71 70 #ifdef CONFIG_X86_X32_ABI 71 + 72 + struct ucontext_x32 { 73 + unsigned int uc_flags; 74 + unsigned int uc_link; 75 + compat_stack_t uc_stack; 76 + unsigned int uc__pad0; /* needed for alignment */ 77 + struct sigcontext uc_mcontext; /* the 64-bit sigcontext type */ 78 + compat_sigset_t uc_sigmask; /* mask last for extensibility */ 79 + }; 72 80 73 81 struct rt_sigframe_x32 { 74 82 u64 pretcode;
+1 -1
arch/x86/include/asm/signal.h
··· 30 30 #endif /* __ASSEMBLY__ */ 31 31 #include <uapi/asm/signal.h> 32 32 #ifndef __ASSEMBLY__ 33 - extern void do_notify_resume(struct pt_regs *, void *, __u32); 33 + extern void do_signal(struct pt_regs *regs); 34 34 35 35 #define __ARCH_HAS_SA_RESTORER 36 36
+1 -1
arch/x86/include/asm/stackprotector.h
··· 72 72 * on during the bootup the random pool has true entropy too. 73 73 */ 74 74 get_random_bytes(&canary, sizeof(canary)); 75 - tsc = __native_read_tsc(); 75 + tsc = rdtsc(); 76 76 canary += tsc + (tsc << 32UL); 77 77 78 78 current->stack_canary = canary;
+1
arch/x86/include/asm/syscalls.h
··· 37 37 asmlinkage unsigned long sys_sigreturn(void); 38 38 39 39 /* kernel/vm86_32.c */ 40 + struct vm86_struct; 40 41 asmlinkage long sys_vm86old(struct vm86_struct __user *); 41 42 asmlinkage long sys_vm86(unsigned long, unsigned long); 42 43
+7 -20
arch/x86/include/asm/thread_info.h
··· 27 27 * Without this offset, that can result in a page fault. (We are 28 28 * careful that, in this case, the value we read doesn't matter.) 29 29 * 30 - * In vm86 mode, the hardware frame is much longer still, but we neither 31 - * access the extra members from NMI context, nor do we write such a 32 - * frame at sp0 at all. 30 + * In vm86 mode, the hardware frame is much longer still, so add 16 31 + * bytes to make room for the real-mode segments. 33 32 * 34 33 * x86_64 has a fixed-length stack frame. 35 34 */ 36 35 #ifdef CONFIG_X86_32 37 - # define TOP_OF_KERNEL_STACK_PADDING 8 36 + # ifdef CONFIG_VM86 37 + # define TOP_OF_KERNEL_STACK_PADDING 16 38 + # else 39 + # define TOP_OF_KERNEL_STACK_PADDING 8 40 + # endif 38 41 #else 39 42 # define TOP_OF_KERNEL_STACK_PADDING 0 40 43 #endif ··· 143 140 _TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT | \ 144 141 _TIF_NOHZ) 145 142 146 - /* work to do in syscall_trace_leave() */ 147 - #define _TIF_WORK_SYSCALL_EXIT \ 148 - (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SINGLESTEP | \ 149 - _TIF_SYSCALL_TRACEPOINT | _TIF_NOHZ) 150 - 151 - /* work to do on interrupt/exception return */ 152 - #define _TIF_WORK_MASK \ 153 - (0x0000FFFF & \ 154 - ~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT| \ 155 - _TIF_SINGLESTEP|_TIF_SECCOMP|_TIF_SYSCALL_EMU)) 156 - 157 143 /* work to do on any return to user space */ 158 144 #define _TIF_ALLWORK_MASK \ 159 145 ((0x0000FFFF & ~_TIF_SECCOMP) | _TIF_SYSCALL_TRACEPOINT | \ 160 146 _TIF_NOHZ) 161 - 162 - /* Only used for 64 bit */ 163 - #define _TIF_DO_NOTIFY_MASK \ 164 - (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | \ 165 - _TIF_USER_RETURN_NOTIFY | _TIF_UPROBE) 166 147 167 148 /* flags to check in __switch_to() */ 168 149 #define _TIF_WORK_CTXSW \
+2 -2
arch/x86/include/asm/traps.h
··· 112 112 asmlinkage void smp_deferred_error_interrupt(void); 113 113 #endif 114 114 115 - extern enum ctx_state ist_enter(struct pt_regs *regs); 116 - extern void ist_exit(struct pt_regs *regs, enum ctx_state prev_state); 115 + extern void ist_enter(struct pt_regs *regs); 116 + extern void ist_exit(struct pt_regs *regs); 117 117 extern void ist_begin_non_atomic(struct pt_regs *regs); 118 118 extern void ist_end_non_atomic(void); 119 119
+1 -17
arch/x86/include/asm/tsc.h
··· 21 21 22 22 static inline cycles_t get_cycles(void) 23 23 { 24 - unsigned long long ret = 0; 25 - 26 24 #ifndef CONFIG_X86_TSC 27 25 if (!cpu_has_tsc) 28 26 return 0; 29 27 #endif 30 - rdtscll(ret); 31 28 32 - return ret; 33 - } 34 - 35 - static __always_inline cycles_t vget_cycles(void) 36 - { 37 - /* 38 - * We only do VDSOs on TSC capable CPUs, so this shouldn't 39 - * access boot_cpu_data (which is not VDSO-safe): 40 - */ 41 - #ifndef CONFIG_X86_TSC 42 - if (!cpu_has_tsc) 43 - return 0; 44 - #endif 45 - return (cycles_t)__native_read_tsc(); 29 + return rdtsc(); 46 30 } 47 31 48 32 extern void tsc_init(void);
+33 -24
arch/x86/include/asm/vm86.h
··· 1 1 #ifndef _ASM_X86_VM86_H 2 2 #define _ASM_X86_VM86_H 3 3 4 - 5 4 #include <asm/ptrace.h> 6 5 #include <uapi/asm/vm86.h> 7 6 ··· 27 28 unsigned short gs, __gsh; 28 29 }; 29 30 30 - struct kernel_vm86_struct { 31 - struct kernel_vm86_regs regs; 32 - /* 33 - * the below part remains on the kernel stack while we are in VM86 mode. 34 - * 'tss.esp0' then contains the address of VM86_TSS_ESP0 below, and when we 35 - * get forced back from VM86, the CPU and "SAVE_ALL" will restore the above 36 - * 'struct kernel_vm86_regs' with the then actual values. 37 - * Therefore, pt_regs in fact points to a complete 'kernel_vm86_struct' 38 - * in kernelspace, hence we need not reget the data from userspace. 39 - */ 40 - #define VM86_TSS_ESP0 flags 31 + struct vm86 { 32 + struct vm86plus_struct __user *user_vm86; 33 + struct pt_regs regs32; 34 + unsigned long veflags; 35 + unsigned long veflags_mask; 36 + unsigned long saved_sp0; 37 + 41 38 unsigned long flags; 42 39 unsigned long screen_bitmap; 43 40 unsigned long cpu_type; 44 41 struct revectored_struct int_revectored; 45 42 struct revectored_struct int21_revectored; 46 43 struct vm86plus_info_struct vm86plus; 47 - struct pt_regs *regs32; /* here we save the pointer to the old regs */ 48 - /* 49 - * The below is not part of the structure, but the stack layout continues 50 - * this way. In front of 'return-eip' may be some data, depending on 51 - * compilation, so we don't rely on this and save the pointer to 'oldregs' 52 - * in 'regs32' above. 
53 - * However, with GCC-2.7.2 and the current CFLAGS you see exactly this: 54 - 55 - long return-eip; from call to vm86() 56 - struct pt_regs oldregs; user space registers as saved by syscall 57 - */ 58 44 }; 59 45 60 46 #ifdef CONFIG_VM86 61 47 62 48 void handle_vm86_fault(struct kernel_vm86_regs *, long); 63 49 int handle_vm86_trap(struct kernel_vm86_regs *, long, int); 64 - struct pt_regs *save_v86_state(struct kernel_vm86_regs *); 50 + void save_v86_state(struct kernel_vm86_regs *, int); 65 51 66 52 struct task_struct; 53 + 54 + #define free_vm86(t) do { \ 55 + struct thread_struct *__t = (t); \ 56 + if (__t->vm86 != NULL) { \ 57 + kfree(__t->vm86); \ 58 + __t->vm86 = NULL; \ 59 + } \ 60 + } while (0) 61 + 62 + /* 63 + * Support for VM86 programs to request interrupts for 64 + * real mode hardware drivers: 65 + */ 66 + #define FIRST_VM86_IRQ 3 67 + #define LAST_VM86_IRQ 15 68 + 69 + static inline int invalid_vm86_irq(int irq) 70 + { 71 + return irq < FIRST_VM86_IRQ || irq > LAST_VM86_IRQ; 72 + } 73 + 67 74 void release_vm86_irqs(struct task_struct *); 68 75 69 76 #else ··· 81 76 { 82 77 return 0; 83 78 } 79 + 80 + static inline void save_v86_state(struct kernel_vm86_regs *a, int b) { } 81 + 82 + #define free_vm86(t) do { } while(0) 84 83 85 84 #endif /* CONFIG_VM86 */ 86 85
-2
arch/x86/include/uapi/asm/processor-flags.h
··· 37 37 #define X86_EFLAGS_VM _BITUL(X86_EFLAGS_VM_BIT) 38 38 #define X86_EFLAGS_AC_BIT 18 /* Alignment Check/Access Control */ 39 39 #define X86_EFLAGS_AC _BITUL(X86_EFLAGS_AC_BIT) 40 - #define X86_EFLAGS_AC_BIT 18 /* Alignment Check/Access Control */ 41 - #define X86_EFLAGS_AC _BITUL(X86_EFLAGS_AC_BIT) 42 40 #define X86_EFLAGS_VIF_BIT 19 /* Virtual Interrupt Flag */ 43 41 #define X86_EFLAGS_VIF _BITUL(X86_EFLAGS_VIF_BIT) 44 42 #define X86_EFLAGS_VIP_BIT 20 /* Virtual Interrupt Pending */
+3 -1
arch/x86/kernel/Makefile
··· 23 23 CFLAGS_irq.o := -I$(src)/../include/asm/trace 24 24 25 25 obj-y := process_$(BITS).o signal.o 26 + obj-$(CONFIG_COMPAT) += signal_compat.o 26 27 obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o 27 - obj-y += time.o ioport.o ldt.o dumpstack.o nmi.o 28 + obj-y += time.o ioport.o dumpstack.o nmi.o 29 + obj-$(CONFIG_MODIFY_LDT_SYSCALL) += ldt.o 28 30 obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o 29 31 obj-$(CONFIG_IRQ_WORK) += irq_work.o 30 32 obj-y += probe_roms.o
+4 -4
arch/x86/kernel/apb_timer.c
··· 263 263 264 264 /* Verify whether apbt counter works */ 265 265 t1 = dw_apb_clocksource_read(clocksource_apbt); 266 - rdtscll(start); 266 + start = rdtsc(); 267 267 268 268 /* 269 269 * We don't know the TSC frequency yet, but waiting for ··· 273 273 */ 274 274 do { 275 275 rep_nop(); 276 - rdtscll(now); 276 + now = rdtsc(); 277 277 } while ((now - start) < 200000UL); 278 278 279 279 /* APBT is the only always on clocksource, it has to work! */ ··· 390 390 old = dw_apb_clocksource_read(clocksource_apbt); 391 391 old += loop; 392 392 393 - t1 = __native_read_tsc(); 393 + t1 = rdtsc(); 394 394 395 395 do { 396 396 new = dw_apb_clocksource_read(clocksource_apbt); 397 397 } while (new < old); 398 398 399 - t2 = __native_read_tsc(); 399 + t2 = rdtsc(); 400 400 401 401 shift = 5; 402 402 if (unlikely(loop >> shift == 0)) {
+4 -4
arch/x86/kernel/apic/apic.c
··· 457 457 { 458 458 u64 tsc; 459 459 460 - rdtscll(tsc); 460 + tsc = rdtsc(); 461 461 wrmsrl(MSR_IA32_TSC_DEADLINE, tsc + (((u64) delta) * TSC_DIVISOR)); 462 462 return 0; 463 463 } ··· 592 592 unsigned long pm = acpi_pm_read_early(); 593 593 594 594 if (cpu_has_tsc) 595 - rdtscll(tsc); 595 + tsc = rdtsc(); 596 596 597 597 switch (lapic_cal_loops++) { 598 598 case 0: ··· 1209 1209 long long max_loops = cpu_khz ? cpu_khz : 1000000; 1210 1210 1211 1211 if (cpu_has_tsc) 1212 - rdtscll(tsc); 1212 + tsc = rdtsc(); 1213 1213 1214 1214 if (disable_apic) { 1215 1215 disable_ioapic_support(); ··· 1293 1293 } 1294 1294 if (queued) { 1295 1295 if (cpu_has_tsc && cpu_khz) { 1296 - rdtscll(ntsc); 1296 + ntsc = rdtsc(); 1297 1297 max_loops = (cpu_khz << 10) - (ntsc - tsc); 1298 1298 } else 1299 1299 max_loops--;
+7 -3
arch/x86/kernel/cpu/amd.c
··· 11 11 #include <asm/cpu.h> 12 12 #include <asm/smp.h> 13 13 #include <asm/pci-direct.h> 14 + #include <asm/delay.h> 14 15 15 16 #ifdef CONFIG_X86_64 16 17 # include <asm/mmconfig.h> ··· 115 114 const int K6_BUG_LOOP = 1000000; 116 115 int n; 117 116 void (*f_vide)(void); 118 - unsigned long d, d2; 117 + u64 d, d2; 119 118 120 119 printk(KERN_INFO "AMD K6 stepping B detected - "); 121 120 ··· 126 125 127 126 n = K6_BUG_LOOP; 128 127 f_vide = vide; 129 - rdtscl(d); 128 + d = rdtsc(); 130 129 while (n--) 131 130 f_vide(); 132 - rdtscl(d2); 131 + d2 = rdtsc(); 133 132 d = d2-d; 134 133 135 134 if (d > 20*K6_BUG_LOOP) ··· 507 506 /* A random value per boot for bit slice [12:upper_bit) */ 508 507 va_align.bits = get_random_int() & va_align.mask; 509 508 } 509 + 510 + if (cpu_has(c, X86_FEATURE_MWAITX)) 511 + use_mwaitx_delay(); 510 512 } 511 513 512 514 static void early_init_amd(struct cpuinfo_x86 *c)
+3 -3
arch/x86/kernel/cpu/common.c
··· 1185 1185 * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip. 1186 1186 */ 1187 1187 wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32); 1188 - wrmsrl(MSR_LSTAR, entry_SYSCALL_64); 1188 + wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); 1189 1189 1190 1190 #ifdef CONFIG_IA32_EMULATION 1191 - wrmsrl(MSR_CSTAR, entry_SYSCALL_compat); 1191 + wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat); 1192 1192 /* 1193 1193 * This only works on Intel CPUs. 1194 1194 * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP. ··· 1199 1199 wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); 1200 1200 wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat); 1201 1201 #else 1202 - wrmsrl(MSR_CSTAR, ignore_sysret); 1202 + wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret); 1203 1203 wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG); 1204 1204 wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); 1205 1205 wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
+4 -5
arch/x86/kernel/cpu/mcheck/mce.c
··· 127 127 { 128 128 memset(m, 0, sizeof(struct mce)); 129 129 m->cpu = m->extcpu = smp_processor_id(); 130 - rdtscll(m->tsc); 130 + m->tsc = rdtsc(); 131 131 /* We hope get_seconds stays lockless */ 132 132 m->time = get_seconds(); 133 133 m->cpuvendor = boot_cpu_data.x86_vendor; ··· 974 974 { 975 975 struct mca_config *cfg = &mca_cfg; 976 976 struct mce m, *final; 977 - enum ctx_state prev_state; 978 977 int i; 979 978 int worst = 0; 980 979 int severity; ··· 999 1000 int flags = MF_ACTION_REQUIRED; 1000 1001 int lmce = 0; 1001 1002 1002 - prev_state = ist_enter(regs); 1003 + ist_enter(regs); 1003 1004 1004 1005 this_cpu_inc(mce_exception_count); 1005 1006 ··· 1165 1166 local_irq_disable(); 1166 1167 ist_end_non_atomic(); 1167 1168 done: 1168 - ist_exit(regs, prev_state); 1169 + ist_exit(regs); 1169 1170 } 1170 1171 EXPORT_SYMBOL_GPL(do_machine_check); 1171 1172 ··· 1753 1754 { 1754 1755 unsigned long *cpu_tsc = (unsigned long *)data; 1755 1756 1756 - rdtscll(cpu_tsc[smp_processor_id()]); 1757 + cpu_tsc[smp_processor_id()] = rdtsc(); 1757 1758 } 1758 1759 1759 1760 static int mce_apei_read_done;
+2 -3
arch/x86/kernel/cpu/mcheck/p5.c
··· 19 19 /* Machine check handler for Pentium class Intel CPUs: */ 20 20 static void pentium_machine_check(struct pt_regs *regs, long error_code) 21 21 { 22 - enum ctx_state prev_state; 23 22 u32 loaddr, hi, lotype; 24 23 25 - prev_state = ist_enter(regs); 24 + ist_enter(regs); 26 25 27 26 rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi); 28 27 rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi); ··· 38 39 39 40 add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); 40 41 41 - ist_exit(regs, prev_state); 42 + ist_exit(regs); 42 43 } 43 44 44 45 /* Set up machine check reporting for processors with Intel style MCE: */
+2 -2
arch/x86/kernel/cpu/mcheck/winchip.c
··· 15 15 /* Machine check handler for WinChip C6: */ 16 16 static void winchip_machine_check(struct pt_regs *regs, long error_code) 17 17 { 18 - enum ctx_state prev_state = ist_enter(regs); 18 + ist_enter(regs); 19 19 20 20 printk(KERN_EMERG "CPU0: Machine Check Exception.\n"); 21 21 add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); 22 22 23 - ist_exit(regs, prev_state); 23 + ist_exit(regs); 24 24 } 25 25 26 26 /* Set up machine check reporting on the Winchip C6 series */
+5 -1
arch/x86/kernel/cpu/perf_event.c
··· 2179 2179 int idx = segment >> 3; 2180 2180 2181 2181 if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) { 2182 + #ifdef CONFIG_MODIFY_LDT_SYSCALL 2182 2183 struct ldt_struct *ldt; 2183 2184 2184 2185 if (idx > LDT_ENTRIES) ··· 2191 2190 return 0; 2192 2191 2193 2192 desc = &ldt->entries[idx]; 2193 + #else 2194 + return 0; 2195 + #endif 2194 2196 } else { 2195 2197 if (idx > GDT_ENTRIES) 2196 2198 return 0; ··· 2204 2200 return get_desc_base(desc); 2205 2201 } 2206 2202 2207 - #ifdef CONFIG_COMPAT 2203 + #ifdef CONFIG_IA32_EMULATION 2208 2204 2209 2205 #include <asm/compat.h> 2210 2206
+1 -1
arch/x86/kernel/espfix_64.c
··· 110 110 */ 111 111 if (!arch_get_random_long(&rand)) { 112 112 /* The constant is an arbitrary large prime */ 113 - rdtscll(rand); 113 + rand = rdtsc(); 114 114 rand *= 0xc345c6b72fd16123UL; 115 115 } 116 116
+2 -2
arch/x86/kernel/hpet.c
··· 735 735 736 736 /* Verify whether hpet counter works */ 737 737 t1 = hpet_readl(HPET_COUNTER); 738 - rdtscll(start); 738 + start = rdtsc(); 739 739 740 740 /* 741 741 * We don't know the TSC frequency yet, but waiting for ··· 745 745 */ 746 746 do { 747 747 rep_nop(); 748 - rdtscll(now); 748 + now = rdtsc(); 749 749 } while ((now - start) < 200000UL); 750 750 751 751 if (t1 == hpet_readl(HPET_COUNTER)) {
+15
arch/x86/kernel/irq.c
··· 216 216 unsigned vector = ~regs->orig_ax; 217 217 unsigned irq; 218 218 219 + /* 220 + * NB: Unlike exception entries, IRQ entries do not reliably 221 + * handle context tracking in the low-level entry code. This is 222 + * because syscall entries execute briefly with IRQs on before 223 + * updating context tracking state, so we can take an IRQ from 224 + * kernel mode with CONTEXT_USER. The low-level entry code only 225 + * updates the context if we came from user mode, so we won't 226 + * switch to CONTEXT_KERNEL. We'll fix that once the syscall 227 + * code is cleaned up enough that we can cleanly defer enabling 228 + * IRQs. 229 + */ 230 + 219 231 entering_irq(); 232 + 233 + /* entering_irq() tells RCU that we're not quiescent. Check it. */ 234 + RCU_LOCKDEP_WARN(!rcu_is_watching(), "IRQ failed to wake up RCU"); 220 235 221 236 irq = __this_cpu_read(vector_irq[vector]); 222 237
+5 -5
arch/x86/kernel/nmi.c
··· 110 110 a->handler, whole_msecs, decimal_msecs); 111 111 } 112 112 113 - static int nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b) 113 + static int nmi_handle(unsigned int type, struct pt_regs *regs) 114 114 { 115 115 struct nmi_desc *desc = nmi_to_desc(type); 116 116 struct nmiaction *a; ··· 213 213 pci_serr_error(unsigned char reason, struct pt_regs *regs) 214 214 { 215 215 /* check to see if anyone registered against these types of errors */ 216 - if (nmi_handle(NMI_SERR, regs, false)) 216 + if (nmi_handle(NMI_SERR, regs)) 217 217 return; 218 218 219 219 pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n", ··· 247 247 unsigned long i; 248 248 249 249 /* check to see if anyone registered against these types of errors */ 250 - if (nmi_handle(NMI_IO_CHECK, regs, false)) 250 + if (nmi_handle(NMI_IO_CHECK, regs)) 251 251 return; 252 252 253 253 pr_emerg( ··· 284 284 * as only the first one is ever run (unless it can actually determine 285 285 * if it caused the NMI) 286 286 */ 287 - handled = nmi_handle(NMI_UNKNOWN, regs, false); 287 + handled = nmi_handle(NMI_UNKNOWN, regs); 288 288 if (handled) { 289 289 __this_cpu_add(nmi_stats.unknown, handled); 290 290 return; ··· 332 332 333 333 __this_cpu_write(last_nmi_rip, regs->ip); 334 334 335 - handled = nmi_handle(NMI_LOCAL, regs, b2b); 335 + handled = nmi_handle(NMI_LOCAL, regs); 336 336 __this_cpu_add(nmi_stats.normal, handled); 337 337 if (handled) { 338 338 /*
-2
arch/x86/kernel/paravirt.c
··· 351 351 .wbinvd = native_wbinvd, 352 352 .read_msr = native_read_msr_safe, 353 353 .write_msr = native_write_msr_safe, 354 - .read_tsc = native_read_tsc, 355 354 .read_pmc = native_read_pmc, 356 - .read_tscp = native_read_tscp, 357 355 .load_tr_desc = native_load_tr_desc, 358 356 .set_ldt = native_set_ldt, 359 357 .load_gdt = native_load_gdt,
-2
arch/x86/kernel/paravirt_patch_32.c
··· 10 10 DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3"); 11 11 DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax"); 12 12 DEF_NATIVE(pv_cpu_ops, clts, "clts"); 13 - DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc"); 14 13 15 14 #if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS) 16 15 DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%eax)"); ··· 51 52 PATCH_SITE(pv_mmu_ops, read_cr3); 52 53 PATCH_SITE(pv_mmu_ops, write_cr3); 53 54 PATCH_SITE(pv_cpu_ops, clts); 54 - PATCH_SITE(pv_cpu_ops, read_tsc); 55 55 #if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS) 56 56 case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock): 57 57 if (pv_is_native_spin_unlock()) {
+3
arch/x86/kernel/process.c
··· 30 30 #include <asm/nmi.h> 31 31 #include <asm/tlbflush.h> 32 32 #include <asm/mce.h> 33 + #include <asm/vm86.h> 33 34 34 35 /* 35 36 * per-CPU TSS segments. Threads are completely 'soft' on Linux, ··· 111 110 put_cpu(); 112 111 kfree(bp); 113 112 } 113 + 114 + free_vm86(t); 114 115 115 116 fpu__drop(fpu); 116 117 }
+1
arch/x86/kernel/process_32.c
··· 53 53 #include <asm/syscalls.h> 54 54 #include <asm/debugreg.h> 55 55 #include <asm/switch_to.h> 56 + #include <asm/vm86.h> 56 57 57 58 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); 58 59 asmlinkage void ret_from_kernel_thread(void) __asm__("ret_from_kernel_thread");
+4 -2
arch/x86/kernel/process_64.c
··· 121 121 void release_thread(struct task_struct *dead_task) 122 122 { 123 123 if (dead_task->mm) { 124 + #ifdef CONFIG_MODIFY_LDT_SYSCALL 124 125 if (dead_task->mm->context.ldt) { 125 126 pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n", 126 127 dead_task->comm, ··· 129 128 dead_task->mm->context.ldt->size); 130 129 BUG(); 131 130 } 131 + #endif 132 132 } 133 133 } 134 134 ··· 250 248 __USER_CS, __USER_DS, 0); 251 249 } 252 250 253 - #ifdef CONFIG_IA32_EMULATION 254 - void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp) 251 + #ifdef CONFIG_COMPAT 252 + void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp) 255 253 { 256 254 start_thread_common(regs, new_ip, new_sp, 257 255 test_thread_flag(TIF_X32)
+75 -265
arch/x86/kernel/ptrace.c
··· 37 37 #include <asm/proto.h> 38 38 #include <asm/hw_breakpoint.h> 39 39 #include <asm/traps.h> 40 + #include <asm/syscall.h> 40 41 41 42 #include "tls.h" 42 - 43 - #define CREATE_TRACE_POINTS 44 - #include <trace/events/syscalls.h> 45 43 46 44 enum x86_regset { 47 45 REGSET_GENERAL, ··· 1121 1123 return ret; 1122 1124 } 1123 1125 1126 + static long ia32_arch_ptrace(struct task_struct *child, compat_long_t request, 1127 + compat_ulong_t caddr, compat_ulong_t cdata) 1128 + { 1129 + unsigned long addr = caddr; 1130 + unsigned long data = cdata; 1131 + void __user *datap = compat_ptr(data); 1132 + int ret; 1133 + __u32 val; 1134 + 1135 + switch (request) { 1136 + case PTRACE_PEEKUSR: 1137 + ret = getreg32(child, addr, &val); 1138 + if (ret == 0) 1139 + ret = put_user(val, (__u32 __user *)datap); 1140 + break; 1141 + 1142 + case PTRACE_POKEUSR: 1143 + ret = putreg32(child, addr, data); 1144 + break; 1145 + 1146 + case PTRACE_GETREGS: /* Get all gp regs from the child. */ 1147 + return copy_regset_to_user(child, &user_x86_32_view, 1148 + REGSET_GENERAL, 1149 + 0, sizeof(struct user_regs_struct32), 1150 + datap); 1151 + 1152 + case PTRACE_SETREGS: /* Set all gp regs in the child. */ 1153 + return copy_regset_from_user(child, &user_x86_32_view, 1154 + REGSET_GENERAL, 0, 1155 + sizeof(struct user_regs_struct32), 1156 + datap); 1157 + 1158 + case PTRACE_GETFPREGS: /* Get the child FPU state. */ 1159 + return copy_regset_to_user(child, &user_x86_32_view, 1160 + REGSET_FP, 0, 1161 + sizeof(struct user_i387_ia32_struct), 1162 + datap); 1163 + 1164 + case PTRACE_SETFPREGS: /* Set the child FPU state. */ 1165 + return copy_regset_from_user( 1166 + child, &user_x86_32_view, REGSET_FP, 1167 + 0, sizeof(struct user_i387_ia32_struct), datap); 1168 + 1169 + case PTRACE_GETFPXREGS: /* Get the child extended FPU state. 
*/ 1170 + return copy_regset_to_user(child, &user_x86_32_view, 1171 + REGSET_XFP, 0, 1172 + sizeof(struct user32_fxsr_struct), 1173 + datap); 1174 + 1175 + case PTRACE_SETFPXREGS: /* Set the child extended FPU state. */ 1176 + return copy_regset_from_user(child, &user_x86_32_view, 1177 + REGSET_XFP, 0, 1178 + sizeof(struct user32_fxsr_struct), 1179 + datap); 1180 + 1181 + case PTRACE_GET_THREAD_AREA: 1182 + case PTRACE_SET_THREAD_AREA: 1183 + return arch_ptrace(child, request, addr, data); 1184 + 1185 + default: 1186 + return compat_ptrace_request(child, request, addr, data); 1187 + } 1188 + 1189 + return ret; 1190 + } 1191 + #endif /* CONFIG_IA32_EMULATION */ 1192 + 1124 1193 #ifdef CONFIG_X86_X32_ABI 1125 1194 static long x32_arch_ptrace(struct task_struct *child, 1126 1195 compat_long_t request, compat_ulong_t caddr, ··· 1276 1211 } 1277 1212 #endif 1278 1213 1214 + #ifdef CONFIG_COMPAT 1279 1215 long compat_arch_ptrace(struct task_struct *child, compat_long_t request, 1280 1216 compat_ulong_t caddr, compat_ulong_t cdata) 1281 1217 { 1282 - unsigned long addr = caddr; 1283 - unsigned long data = cdata; 1284 - void __user *datap = compat_ptr(data); 1285 - int ret; 1286 - __u32 val; 1287 - 1288 1218 #ifdef CONFIG_X86_X32_ABI 1289 1219 if (!is_ia32_task()) 1290 1220 return x32_arch_ptrace(child, request, caddr, cdata); 1291 1221 #endif 1292 - 1293 - switch (request) { 1294 - case PTRACE_PEEKUSR: 1295 - ret = getreg32(child, addr, &val); 1296 - if (ret == 0) 1297 - ret = put_user(val, (__u32 __user *)datap); 1298 - break; 1299 - 1300 - case PTRACE_POKEUSR: 1301 - ret = putreg32(child, addr, data); 1302 - break; 1303 - 1304 - case PTRACE_GETREGS: /* Get all gp regs from the child. */ 1305 - return copy_regset_to_user(child, &user_x86_32_view, 1306 - REGSET_GENERAL, 1307 - 0, sizeof(struct user_regs_struct32), 1308 - datap); 1309 - 1310 - case PTRACE_SETREGS: /* Set all gp regs in the child. 
*/ 1311 - return copy_regset_from_user(child, &user_x86_32_view, 1312 - REGSET_GENERAL, 0, 1313 - sizeof(struct user_regs_struct32), 1314 - datap); 1315 - 1316 - case PTRACE_GETFPREGS: /* Get the child FPU state. */ 1317 - return copy_regset_to_user(child, &user_x86_32_view, 1318 - REGSET_FP, 0, 1319 - sizeof(struct user_i387_ia32_struct), 1320 - datap); 1321 - 1322 - case PTRACE_SETFPREGS: /* Set the child FPU state. */ 1323 - return copy_regset_from_user( 1324 - child, &user_x86_32_view, REGSET_FP, 1325 - 0, sizeof(struct user_i387_ia32_struct), datap); 1326 - 1327 - case PTRACE_GETFPXREGS: /* Get the child extended FPU state. */ 1328 - return copy_regset_to_user(child, &user_x86_32_view, 1329 - REGSET_XFP, 0, 1330 - sizeof(struct user32_fxsr_struct), 1331 - datap); 1332 - 1333 - case PTRACE_SETFPXREGS: /* Set the child extended FPU state. */ 1334 - return copy_regset_from_user(child, &user_x86_32_view, 1335 - REGSET_XFP, 0, 1336 - sizeof(struct user32_fxsr_struct), 1337 - datap); 1338 - 1339 - case PTRACE_GET_THREAD_AREA: 1340 - case PTRACE_SET_THREAD_AREA: 1341 - return arch_ptrace(child, request, addr, data); 1342 - 1343 - default: 1344 - return compat_ptrace_request(child, request, addr, data); 1345 - } 1346 - 1347 - return ret; 1222 + #ifdef CONFIG_IA32_EMULATION 1223 + return ia32_arch_ptrace(child, request, caddr, cdata); 1224 + #else 1225 + return 0; 1226 + #endif 1348 1227 } 1349 - 1350 - #endif /* CONFIG_IA32_EMULATION */ 1228 + #endif /* CONFIG_COMPAT */ 1351 1229 1352 1230 #ifdef CONFIG_X86_64 1353 1231 ··· 1441 1433 fill_sigtrap_info(tsk, regs, error_code, si_code, &info); 1442 1434 /* Send us the fake SIGTRAP */ 1443 1435 force_sig_info(SIGTRAP, &info, tsk); 1444 - } 1445 - 1446 - static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch) 1447 - { 1448 - #ifdef CONFIG_X86_64 1449 - if (arch == AUDIT_ARCH_X86_64) { 1450 - audit_syscall_entry(regs->orig_ax, regs->di, 1451 - regs->si, regs->dx, regs->r10); 1452 - } else 1453 - #endif 1454 - { 
1455 - audit_syscall_entry(regs->orig_ax, regs->bx, 1456 - regs->cx, regs->dx, regs->si); 1457 - } 1458 - } 1459 - 1460 - /* 1461 - * We can return 0 to resume the syscall or anything else to go to phase 1462 - * 2. If we resume the syscall, we need to put something appropriate in 1463 - * regs->orig_ax. 1464 - * 1465 - * NB: We don't have full pt_regs here, but regs->orig_ax and regs->ax 1466 - * are fully functional. 1467 - * 1468 - * For phase 2's benefit, our return value is: 1469 - * 0: resume the syscall 1470 - * 1: go to phase 2; no seccomp phase 2 needed 1471 - * anything else: go to phase 2; pass return value to seccomp 1472 - */ 1473 - unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch) 1474 - { 1475 - unsigned long ret = 0; 1476 - u32 work; 1477 - 1478 - BUG_ON(regs != task_pt_regs(current)); 1479 - 1480 - work = ACCESS_ONCE(current_thread_info()->flags) & 1481 - _TIF_WORK_SYSCALL_ENTRY; 1482 - 1483 - /* 1484 - * If TIF_NOHZ is set, we are required to call user_exit() before 1485 - * doing anything that could touch RCU. 1486 - */ 1487 - if (work & _TIF_NOHZ) { 1488 - user_exit(); 1489 - work &= ~_TIF_NOHZ; 1490 - } 1491 - 1492 - #ifdef CONFIG_SECCOMP 1493 - /* 1494 - * Do seccomp first -- it should minimize exposure of other 1495 - * code, and keeping seccomp fast is probably more valuable 1496 - * than the rest of this. 
1497 - */ 1498 - if (work & _TIF_SECCOMP) { 1499 - struct seccomp_data sd; 1500 - 1501 - sd.arch = arch; 1502 - sd.nr = regs->orig_ax; 1503 - sd.instruction_pointer = regs->ip; 1504 - #ifdef CONFIG_X86_64 1505 - if (arch == AUDIT_ARCH_X86_64) { 1506 - sd.args[0] = regs->di; 1507 - sd.args[1] = regs->si; 1508 - sd.args[2] = regs->dx; 1509 - sd.args[3] = regs->r10; 1510 - sd.args[4] = regs->r8; 1511 - sd.args[5] = regs->r9; 1512 - } else 1513 - #endif 1514 - { 1515 - sd.args[0] = regs->bx; 1516 - sd.args[1] = regs->cx; 1517 - sd.args[2] = regs->dx; 1518 - sd.args[3] = regs->si; 1519 - sd.args[4] = regs->di; 1520 - sd.args[5] = regs->bp; 1521 - } 1522 - 1523 - BUILD_BUG_ON(SECCOMP_PHASE1_OK != 0); 1524 - BUILD_BUG_ON(SECCOMP_PHASE1_SKIP != 1); 1525 - 1526 - ret = seccomp_phase1(&sd); 1527 - if (ret == SECCOMP_PHASE1_SKIP) { 1528 - regs->orig_ax = -1; 1529 - ret = 0; 1530 - } else if (ret != SECCOMP_PHASE1_OK) { 1531 - return ret; /* Go directly to phase 2 */ 1532 - } 1533 - 1534 - work &= ~_TIF_SECCOMP; 1535 - } 1536 - #endif 1537 - 1538 - /* Do our best to finish without phase 2. */ 1539 - if (work == 0) 1540 - return ret; /* seccomp and/or nohz only (ret == 0 here) */ 1541 - 1542 - #ifdef CONFIG_AUDITSYSCALL 1543 - if (work == _TIF_SYSCALL_AUDIT) { 1544 - /* 1545 - * If there is no more work to be done except auditing, 1546 - * then audit in phase 1. Phase 2 always audits, so, if 1547 - * we audit here, then we can't go on to phase 2. 1548 - */ 1549 - do_audit_syscall_entry(regs, arch); 1550 - return 0; 1551 - } 1552 - #endif 1553 - 1554 - return 1; /* Something is enabled that we can't handle in phase 1 */ 1555 - } 1556 - 1557 - /* Returns the syscall nr to run (which should match regs->orig_ax). 
*/ 1558 - long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch, 1559 - unsigned long phase1_result) 1560 - { 1561 - long ret = 0; 1562 - u32 work = ACCESS_ONCE(current_thread_info()->flags) & 1563 - _TIF_WORK_SYSCALL_ENTRY; 1564 - 1565 - BUG_ON(regs != task_pt_regs(current)); 1566 - 1567 - /* 1568 - * If we stepped into a sysenter/syscall insn, it trapped in 1569 - * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP. 1570 - * If user-mode had set TF itself, then it's still clear from 1571 - * do_debug() and we need to set it again to restore the user 1572 - * state. If we entered on the slow path, TF was already set. 1573 - */ 1574 - if (work & _TIF_SINGLESTEP) 1575 - regs->flags |= X86_EFLAGS_TF; 1576 - 1577 - #ifdef CONFIG_SECCOMP 1578 - /* 1579 - * Call seccomp_phase2 before running the other hooks so that 1580 - * they can see any changes made by a seccomp tracer. 1581 - */ 1582 - if (phase1_result > 1 && seccomp_phase2(phase1_result)) { 1583 - /* seccomp failures shouldn't expose any additional code. */ 1584 - return -1; 1585 - } 1586 - #endif 1587 - 1588 - if (unlikely(work & _TIF_SYSCALL_EMU)) 1589 - ret = -1L; 1590 - 1591 - if ((ret || test_thread_flag(TIF_SYSCALL_TRACE)) && 1592 - tracehook_report_syscall_entry(regs)) 1593 - ret = -1L; 1594 - 1595 - if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) 1596 - trace_sys_enter(regs, regs->orig_ax); 1597 - 1598 - do_audit_syscall_entry(regs, arch); 1599 - 1600 - return ret ?: regs->orig_ax; 1601 - } 1602 - 1603 - long syscall_trace_enter(struct pt_regs *regs) 1604 - { 1605 - u32 arch = is_ia32_task() ? 
AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64; 1606 - unsigned long phase1_result = syscall_trace_enter_phase1(regs, arch); 1607 - 1608 - if (phase1_result == 0) 1609 - return regs->orig_ax; 1610 - else 1611 - return syscall_trace_enter_phase2(regs, arch, phase1_result); 1612 - } 1613 - 1614 - void syscall_trace_leave(struct pt_regs *regs) 1615 - { 1616 - bool step; 1617 - 1618 - /* 1619 - * We may come here right after calling schedule_user() 1620 - * or do_notify_resume(), in which case we can be in RCU 1621 - * user mode. 1622 - */ 1623 - user_exit(); 1624 - 1625 - audit_syscall_exit(regs); 1626 - 1627 - if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) 1628 - trace_sys_exit(regs, regs->ax); 1629 - 1630 - /* 1631 - * If TIF_SYSCALL_EMU is set, we only get here because of 1632 - * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP). 1633 - * We already reported this syscall instruction in 1634 - * syscall_trace_enter(). 1635 - */ 1636 - step = unlikely(test_thread_flag(TIF_SINGLESTEP)) && 1637 - !test_thread_flag(TIF_SYSCALL_EMU); 1638 - if (step || test_thread_flag(TIF_SYSCALL_TRACE)) 1639 - tracehook_report_syscall_exit(regs, step); 1640 - 1641 - user_enter(); 1642 1436 }
+5 -28
arch/x86/kernel/signal.c
··· 31 31 #include <asm/vdso.h> 32 32 #include <asm/mce.h> 33 33 #include <asm/sighandling.h> 34 + #include <asm/vm86.h> 34 35 35 36 #ifdef CONFIG_X86_64 36 37 #include <asm/proto.h> 37 38 #include <asm/ia32_unistd.h> 38 - #include <asm/sys_ia32.h> 39 39 #endif /* CONFIG_X86_64 */ 40 40 41 41 #include <asm/syscall.h> ··· 632 632 bool stepping, failed; 633 633 struct fpu *fpu = &current->thread.fpu; 634 634 635 + if (v8086_mode(regs)) 636 + save_v86_state((struct kernel_vm86_regs *) regs, VM86_SIGNAL); 637 + 635 638 /* Are we from a system call? */ 636 639 if (syscall_get_nr(current, regs) >= 0) { 637 640 /* If so, check system call restarting.. */ ··· 700 697 * want to handle. Thus you cannot kill init even with a SIGKILL even by 701 698 * mistake. 702 699 */ 703 - static void do_signal(struct pt_regs *regs) 700 + void do_signal(struct pt_regs *regs) 704 701 { 705 702 struct ksignal ksig; 706 703 ··· 733 730 * back. 734 731 */ 735 732 restore_saved_sigmask(); 736 - } 737 - 738 - /* 739 - * notification of userspace execution resumption 740 - * - triggered by the TIF_WORK_MASK flags 741 - */ 742 - __visible void 743 - do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) 744 - { 745 - user_exit(); 746 - 747 - if (thread_info_flags & _TIF_UPROBE) 748 - uprobe_notify_resume(regs); 749 - 750 - /* deal with pending signal delivery */ 751 - if (thread_info_flags & _TIF_SIGPENDING) 752 - do_signal(regs); 753 - 754 - if (thread_info_flags & _TIF_NOTIFY_RESUME) { 755 - clear_thread_flag(TIF_NOTIFY_RESUME); 756 - tracehook_notify_resume(regs); 757 - } 758 - if (thread_info_flags & _TIF_USER_RETURN_NOTIFY) 759 - fire_user_return_notifiers(); 760 - 761 - user_enter(); 762 733 } 763 734 764 735 void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
+95
arch/x86/kernel/signal_compat.c
··· 1 + #include <linux/compat.h> 2 + #include <linux/uaccess.h> 3 + 4 + int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from) 5 + { 6 + int err = 0; 7 + bool ia32 = test_thread_flag(TIF_IA32); 8 + 9 + if (!access_ok(VERIFY_WRITE, to, sizeof(compat_siginfo_t))) 10 + return -EFAULT; 11 + 12 + put_user_try { 13 + /* If you change siginfo_t structure, please make sure that 14 + this code is fixed accordingly. 15 + It should never copy any pad contained in the structure 16 + to avoid security leaks, but must copy the generic 17 + 3 ints plus the relevant union member. */ 18 + put_user_ex(from->si_signo, &to->si_signo); 19 + put_user_ex(from->si_errno, &to->si_errno); 20 + put_user_ex((short)from->si_code, &to->si_code); 21 + 22 + if (from->si_code < 0) { 23 + put_user_ex(from->si_pid, &to->si_pid); 24 + put_user_ex(from->si_uid, &to->si_uid); 25 + put_user_ex(ptr_to_compat(from->si_ptr), &to->si_ptr); 26 + } else { 27 + /* 28 + * First 32bits of unions are always present: 29 + * si_pid === si_band === si_tid === si_addr(LS half) 30 + */ 31 + put_user_ex(from->_sifields._pad[0], 32 + &to->_sifields._pad[0]); 33 + switch (from->si_code >> 16) { 34 + case __SI_FAULT >> 16: 35 + break; 36 + case __SI_SYS >> 16: 37 + put_user_ex(from->si_syscall, &to->si_syscall); 38 + put_user_ex(from->si_arch, &to->si_arch); 39 + break; 40 + case __SI_CHLD >> 16: 41 + if (ia32) { 42 + put_user_ex(from->si_utime, &to->si_utime); 43 + put_user_ex(from->si_stime, &to->si_stime); 44 + } else { 45 + put_user_ex(from->si_utime, &to->_sifields._sigchld_x32._utime); 46 + put_user_ex(from->si_stime, &to->_sifields._sigchld_x32._stime); 47 + } 48 + put_user_ex(from->si_status, &to->si_status); 49 + /* FALL THROUGH */ 50 + default: 51 + case __SI_KILL >> 16: 52 + put_user_ex(from->si_uid, &to->si_uid); 53 + break; 54 + case __SI_POLL >> 16: 55 + put_user_ex(from->si_fd, &to->si_fd); 56 + break; 57 + case __SI_TIMER >> 16: 58 + put_user_ex(from->si_overrun, &to->si_overrun); 
59 + put_user_ex(ptr_to_compat(from->si_ptr), 60 + &to->si_ptr); 61 + break; 62 + /* This is not generated by the kernel as of now. */ 63 + case __SI_RT >> 16: 64 + case __SI_MESGQ >> 16: 65 + put_user_ex(from->si_uid, &to->si_uid); 66 + put_user_ex(from->si_int, &to->si_int); 67 + break; 68 + } 69 + } 70 + } put_user_catch(err); 71 + 72 + return err; 73 + } 74 + 75 + int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from) 76 + { 77 + int err = 0; 78 + u32 ptr32; 79 + 80 + if (!access_ok(VERIFY_READ, from, sizeof(compat_siginfo_t))) 81 + return -EFAULT; 82 + 83 + get_user_try { 84 + get_user_ex(to->si_signo, &from->si_signo); 85 + get_user_ex(to->si_errno, &from->si_errno); 86 + get_user_ex(to->si_code, &from->si_code); 87 + 88 + get_user_ex(to->si_pid, &from->si_pid); 89 + get_user_ex(to->si_uid, &from->si_uid); 90 + get_user_ex(ptr32, &from->si_ptr); 91 + to->si_ptr = compat_ptr(ptr32); 92 + } get_user_catch(err); 93 + 94 + return err; 95 + }
+2
arch/x86/kernel/step.c
··· 18 18 return addr; 19 19 } 20 20 21 + #ifdef CONFIG_MODIFY_LDT_SYSCALL 21 22 /* 22 23 * We'll assume that the code segments in the GDT 23 24 * are all zero-based. That is largely true: the ··· 46 45 } 47 46 mutex_unlock(&child->mm->context.lock); 48 47 } 48 + #endif 49 49 50 50 return addr; 51 51 }
+1 -6
arch/x86/kernel/trace_clock.c
··· 12 12 */ 13 13 u64 notrace trace_clock_x86_tsc(void) 14 14 { 15 - u64 ret; 16 - 17 - rdtsc_barrier(); 18 - rdtscll(ret); 19 - 20 - return ret; 15 + return rdtsc_ordered(); 21 16 }
+29 -59
arch/x86/kernel/traps.c
··· 62 62 #include <asm/fpu/xstate.h> 63 63 #include <asm/trace/mpx.h> 64 64 #include <asm/mpx.h> 65 + #include <asm/vm86.h> 65 66 66 67 #ifdef CONFIG_X86_64 67 68 #include <asm/x86_init.h> ··· 109 108 preempt_count_dec(); 110 109 } 111 110 112 - enum ctx_state ist_enter(struct pt_regs *regs) 111 + void ist_enter(struct pt_regs *regs) 113 112 { 114 - enum ctx_state prev_state; 115 - 116 113 if (user_mode(regs)) { 117 - /* Other than that, we're just an exception. */ 118 - prev_state = exception_enter(); 114 + RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); 119 115 } else { 120 116 /* 121 117 * We might have interrupted pretty much anything. In ··· 121 123 * but we need to notify RCU. 122 124 */ 123 125 rcu_nmi_enter(); 124 - prev_state = CONTEXT_KERNEL; /* the value is irrelevant. */ 125 126 } 126 127 127 128 /* 128 - * We are atomic because we're on the IST stack (or we're on x86_32, 129 - * in which case we still shouldn't schedule). 130 - * 131 - * This must be after exception_enter(), because exception_enter() 132 - * won't do anything if in_interrupt() returns true. 129 + * We are atomic because we're on the IST stack; or we're on 130 + * x86_32, in which case we still shouldn't schedule; or we're 131 + * on x86_64 and entered from user mode, in which case we're 132 + * still atomic unless ist_begin_non_atomic is called. 133 133 */ 134 134 preempt_count_add(HARDIRQ_OFFSET); 135 135 136 136 /* This code is a bit fragile. Test it. */ 137 137 RCU_LOCKDEP_WARN(!rcu_is_watching(), "ist_enter didn't work"); 138 - 139 - return prev_state; 140 138 } 141 139 142 - void ist_exit(struct pt_regs *regs, enum ctx_state prev_state) 140 + void ist_exit(struct pt_regs *regs) 143 141 { 144 - /* Must be before exception_exit. 
*/ 145 142 preempt_count_sub(HARDIRQ_OFFSET); 146 143 147 - if (user_mode(regs)) 148 - return exception_exit(prev_state); 149 - else 144 + if (!user_mode(regs)) 150 145 rcu_nmi_exit(); 151 146 } 152 147 ··· 153 162 * a double fault, it can be safe to schedule. ist_begin_non_atomic() 154 163 * begins a non-atomic section within an ist_enter()/ist_exit() region. 155 164 * Callers are responsible for enabling interrupts themselves inside 156 - * the non-atomic section, and callers must call is_end_non_atomic() 165 + * the non-atomic section, and callers must call ist_end_non_atomic() 157 166 * before ist_exit(). 158 167 */ 159 168 void ist_begin_non_atomic(struct pt_regs *regs) ··· 280 289 static void do_error_trap(struct pt_regs *regs, long error_code, char *str, 281 290 unsigned long trapnr, int signr) 282 291 { 283 - enum ctx_state prev_state = exception_enter(); 284 292 siginfo_t info; 293 + 294 + RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); 285 295 286 296 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) != 287 297 NOTIFY_STOP) { ··· 290 298 do_trap(trapnr, signr, str, regs, error_code, 291 299 fill_trap_info(regs, signr, trapnr, &info)); 292 300 } 293 - 294 - exception_exit(prev_state); 295 301 } 296 302 297 303 #define DO_ERROR(trapnr, signr, str, name) \ ··· 341 351 } 342 352 #endif 343 353 344 - ist_enter(regs); /* Discard prev_state because we won't return. 
*/ 354 + ist_enter(regs); 345 355 notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV); 346 356 347 357 tsk->thread.error_code = error_code; ··· 361 371 362 372 dotraplinkage void do_bounds(struct pt_regs *regs, long error_code) 363 373 { 364 - enum ctx_state prev_state; 365 374 const struct bndcsr *bndcsr; 366 375 siginfo_t *info; 367 376 368 - prev_state = exception_enter(); 377 + RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); 369 378 if (notify_die(DIE_TRAP, "bounds", regs, error_code, 370 379 X86_TRAP_BR, SIGSEGV) == NOTIFY_STOP) 371 - goto exit; 380 + return; 372 381 conditional_sti(regs); 373 382 374 383 if (!user_mode(regs)) ··· 424 435 die("bounds", regs, error_code); 425 436 } 426 437 427 - exit: 428 - exception_exit(prev_state); 429 438 return; 439 + 430 440 exit_trap: 431 441 /* 432 442 * This path out is for all the cases where we could not ··· 435 447 * time.. 436 448 */ 437 449 do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, NULL); 438 - exception_exit(prev_state); 439 450 } 440 451 441 452 dotraplinkage void 442 453 do_general_protection(struct pt_regs *regs, long error_code) 443 454 { 444 455 struct task_struct *tsk; 445 - enum ctx_state prev_state; 446 456 447 - prev_state = exception_enter(); 457 + RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); 448 458 conditional_sti(regs); 449 459 450 460 if (v8086_mode(regs)) { 451 461 local_irq_enable(); 452 462 handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); 453 - goto exit; 463 + return; 454 464 } 455 465 456 466 tsk = current; 457 467 if (!user_mode(regs)) { 458 468 if (fixup_exception(regs)) 459 - goto exit; 469 + return; 460 470 461 471 tsk->thread.error_code = error_code; 462 472 tsk->thread.trap_nr = X86_TRAP_GP; 463 473 if (notify_die(DIE_GPF, "general protection fault", regs, error_code, 464 474 X86_TRAP_GP, SIGSEGV) != NOTIFY_STOP) 465 475 die("general protection fault", regs, error_code); 466 - goto exit; 476 + return; 
467 477 } 468 478 469 479 tsk->thread.error_code = error_code; ··· 477 491 } 478 492 479 493 force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk); 480 - exit: 481 - exception_exit(prev_state); 482 494 } 483 495 NOKPROBE_SYMBOL(do_general_protection); 484 496 485 497 /* May run on IST stack. */ 486 498 dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) 487 499 { 488 - enum ctx_state prev_state; 489 - 490 500 #ifdef CONFIG_DYNAMIC_FTRACE 491 501 /* 492 502 * ftrace must be first, everything else may cause a recursive crash. ··· 495 513 if (poke_int3_handler(regs)) 496 514 return; 497 515 498 - prev_state = ist_enter(regs); 516 + ist_enter(regs); 517 + RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); 499 518 #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP 500 519 if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, 501 520 SIGTRAP) == NOTIFY_STOP) ··· 522 539 preempt_conditional_cli(regs); 523 540 debug_stack_usage_dec(); 524 541 exit: 525 - ist_exit(regs, prev_state); 542 + ist_exit(regs); 526 543 } 527 544 NOKPROBE_SYMBOL(do_int3); 528 545 ··· 598 615 dotraplinkage void do_debug(struct pt_regs *regs, long error_code) 599 616 { 600 617 struct task_struct *tsk = current; 601 - enum ctx_state prev_state; 602 618 int user_icebp = 0; 603 619 unsigned long dr6; 604 620 int si_code; 605 621 606 - prev_state = ist_enter(regs); 622 + ist_enter(regs); 607 623 608 624 get_debugreg(dr6, 6); 609 625 ··· 677 695 debug_stack_usage_dec(); 678 696 679 697 exit: 680 - ist_exit(regs, prev_state); 698 + ist_exit(regs); 681 699 } 682 700 NOKPROBE_SYMBOL(do_debug); 683 701 ··· 729 747 730 748 dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code) 731 749 { 732 - enum ctx_state prev_state; 733 - 734 - prev_state = exception_enter(); 750 + RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); 735 751 math_error(regs, error_code, X86_TRAP_MF); 736 - exception_exit(prev_state); 737 752 } 738 753 739 754 dotraplinkage void 
740 755 do_simd_coprocessor_error(struct pt_regs *regs, long error_code) 741 756 { 742 - enum ctx_state prev_state; 743 - 744 - prev_state = exception_enter(); 757 + RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); 745 758 math_error(regs, error_code, X86_TRAP_XF); 746 - exception_exit(prev_state); 747 759 } 748 760 749 761 dotraplinkage void ··· 749 773 dotraplinkage void 750 774 do_device_not_available(struct pt_regs *regs, long error_code) 751 775 { 752 - enum ctx_state prev_state; 753 - 754 - prev_state = exception_enter(); 776 + RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); 755 777 BUG_ON(use_eager_fpu()); 756 778 757 779 #ifdef CONFIG_MATH_EMULATION ··· 760 786 761 787 info.regs = regs; 762 788 math_emulate(&info); 763 - exception_exit(prev_state); 764 789 return; 765 790 } 766 791 #endif ··· 767 794 #ifdef CONFIG_X86_32 768 795 conditional_sti(regs); 769 796 #endif 770 - exception_exit(prev_state); 771 797 } 772 798 NOKPROBE_SYMBOL(do_device_not_available); 773 799 ··· 774 802 dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) 775 803 { 776 804 siginfo_t info; 777 - enum ctx_state prev_state; 778 805 779 - prev_state = exception_enter(); 806 + RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); 780 807 local_irq_enable(); 781 808 782 809 info.si_signo = SIGILL; ··· 787 816 do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, error_code, 788 817 &info); 789 818 } 790 - exception_exit(prev_state); 791 819 } 792 820 #endif 793 821
+3 -9
arch/x86/kernel/tsc.c
··· 248 248 249 249 data = cyc2ns_write_begin(cpu); 250 250 251 - rdtscll(tsc_now); 251 + tsc_now = rdtsc(); 252 252 ns_now = cycles_2_ns(tsc_now); 253 253 254 254 /* ··· 290 290 } 291 291 292 292 /* read the Time Stamp Counter: */ 293 - rdtscll(tsc_now); 293 + tsc_now = rdtsc(); 294 294 295 295 /* return the value in ns */ 296 296 return cycles_2_ns(tsc_now); ··· 315 315 unsigned long long 316 316 sched_clock(void) __attribute__((alias("native_sched_clock"))); 317 317 #endif 318 - 319 - unsigned long long native_read_tsc(void) 320 - { 321 - return __native_read_tsc(); 322 - } 323 - EXPORT_SYMBOL(native_read_tsc); 324 318 325 319 int check_tsc_unstable(void) 326 320 { ··· 978 984 */ 979 985 static cycle_t read_tsc(struct clocksource *cs) 980 986 { 981 - return (cycle_t)get_cycles(); 987 + return (cycle_t)rdtsc_ordered(); 982 988 } 983 989 984 990 /*
+6 -8
arch/x86/kernel/tsc_sync.c
··· 39 39 static int nr_warps; 40 40 41 41 /* 42 - * TSC-warp measurement loop running on both CPUs: 42 + * TSC-warp measurement loop running on both CPUs. This is not called 43 + * if there is no TSC. 43 44 */ 44 45 static void check_tsc_warp(unsigned int timeout) 45 46 { 46 47 cycles_t start, now, prev, end; 47 48 int i; 48 49 49 - rdtsc_barrier(); 50 - start = get_cycles(); 51 - rdtsc_barrier(); 50 + start = rdtsc_ordered(); 52 51 /* 53 52 * The measurement runs for 'timeout' msecs: 54 53 */ ··· 62 63 */ 63 64 arch_spin_lock(&sync_lock); 64 65 prev = last_tsc; 65 - rdtsc_barrier(); 66 - now = get_cycles(); 67 - rdtsc_barrier(); 66 + now = rdtsc_ordered(); 68 67 last_tsc = now; 69 68 arch_spin_unlock(&sync_lock); 70 69 ··· 123 126 124 127 /* 125 128 * No need to check if we already know that the TSC is not 126 - * synchronized: 129 + * synchronized or if we have no TSC. 127 130 */ 128 131 if (unsynchronized_tsc()) 129 132 return; ··· 187 190 { 188 191 int cpus = 2; 189 192 193 + /* Also aborts if there is no TSC. */ 190 194 if (unsynchronized_tsc() || tsc_clocksource_reliable) 191 195 return; 192 196
+183 -190
arch/x86/kernel/vm86_32.c
··· 44 44 #include <linux/ptrace.h> 45 45 #include <linux/audit.h> 46 46 #include <linux/stddef.h> 47 + #include <linux/slab.h> 47 48 48 49 #include <asm/uaccess.h> 49 50 #include <asm/io.h> 50 51 #include <asm/tlbflush.h> 51 52 #include <asm/irq.h> 53 + #include <asm/traps.h> 54 + #include <asm/vm86.h> 52 55 53 56 /* 54 57 * Known problems: ··· 69 66 */ 70 67 71 68 72 - #define KVM86 ((struct kernel_vm86_struct *)regs) 73 - #define VMPI KVM86->vm86plus 74 - 75 - 76 69 /* 77 70 * 8- and 16-bit register defines.. 78 71 */ ··· 80 81 /* 81 82 * virtual flags (16 and 32-bit versions) 82 83 */ 83 - #define VFLAGS (*(unsigned short *)&(current->thread.v86flags)) 84 - #define VEFLAGS (current->thread.v86flags) 84 + #define VFLAGS (*(unsigned short *)&(current->thread.vm86->veflags)) 85 + #define VEFLAGS (current->thread.vm86->veflags) 85 86 86 87 #define set_flags(X, new, mask) \ 87 88 ((X) = ((X) & ~(mask)) | ((new) & (mask))) ··· 89 90 #define SAFE_MASK (0xDD5) 90 91 #define RETURN_MASK (0xDFF) 91 92 92 - /* convert kernel_vm86_regs to vm86_regs */ 93 - static int copy_vm86_regs_to_user(struct vm86_regs __user *user, 94 - const struct kernel_vm86_regs *regs) 95 - { 96 - int ret = 0; 97 - 98 - /* 99 - * kernel_vm86_regs is missing gs, so copy everything up to 100 - * (but not including) orig_eax, and then rest including orig_eax. 
101 - */ 102 - ret += copy_to_user(user, regs, offsetof(struct kernel_vm86_regs, pt.orig_ax)); 103 - ret += copy_to_user(&user->orig_eax, &regs->pt.orig_ax, 104 - sizeof(struct kernel_vm86_regs) - 105 - offsetof(struct kernel_vm86_regs, pt.orig_ax)); 106 - 107 - return ret; 108 - } 109 - 110 - /* convert vm86_regs to kernel_vm86_regs */ 111 - static int copy_vm86_regs_from_user(struct kernel_vm86_regs *regs, 112 - const struct vm86_regs __user *user, 113 - unsigned extra) 114 - { 115 - int ret = 0; 116 - 117 - /* copy ax-fs inclusive */ 118 - ret += copy_from_user(regs, user, offsetof(struct kernel_vm86_regs, pt.orig_ax)); 119 - /* copy orig_ax-__gsh+extra */ 120 - ret += copy_from_user(&regs->pt.orig_ax, &user->orig_eax, 121 - sizeof(struct kernel_vm86_regs) - 122 - offsetof(struct kernel_vm86_regs, pt.orig_ax) + 123 - extra); 124 - return ret; 125 - } 126 - 127 - struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs) 93 + void save_v86_state(struct kernel_vm86_regs *regs, int retval) 128 94 { 129 95 struct tss_struct *tss; 130 - struct pt_regs *ret; 131 - unsigned long tmp; 96 + struct task_struct *tsk = current; 97 + struct vm86plus_struct __user *user; 98 + struct vm86 *vm86 = current->thread.vm86; 99 + long err = 0; 132 100 133 101 /* 134 102 * This gets called from entry.S with interrupts disabled, but ··· 104 138 */ 105 139 local_irq_enable(); 106 140 107 - if (!current->thread.vm86_info) { 108 - pr_alert("no vm86_info: BAD\n"); 141 + if (!vm86 || !vm86->user_vm86) { 142 + pr_alert("no user_vm86: BAD\n"); 109 143 do_exit(SIGSEGV); 110 144 } 111 - set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | current->thread.v86mask); 112 - tmp = copy_vm86_regs_to_user(&current->thread.vm86_info->regs, regs); 113 - tmp += put_user(current->thread.screen_bitmap, &current->thread.vm86_info->screen_bitmap); 114 - if (tmp) { 115 - pr_alert("could not access userspace vm86_info\n"); 145 + set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | vm86->veflags_mask); 146 + 
user = vm86->user_vm86; 147 + 148 + if (!access_ok(VERIFY_WRITE, user, vm86->vm86plus.is_vm86pus ? 149 + sizeof(struct vm86plus_struct) : 150 + sizeof(struct vm86_struct))) { 151 + pr_alert("could not access userspace vm86 info\n"); 152 + do_exit(SIGSEGV); 153 + } 154 + 155 + put_user_try { 156 + put_user_ex(regs->pt.bx, &user->regs.ebx); 157 + put_user_ex(regs->pt.cx, &user->regs.ecx); 158 + put_user_ex(regs->pt.dx, &user->regs.edx); 159 + put_user_ex(regs->pt.si, &user->regs.esi); 160 + put_user_ex(regs->pt.di, &user->regs.edi); 161 + put_user_ex(regs->pt.bp, &user->regs.ebp); 162 + put_user_ex(regs->pt.ax, &user->regs.eax); 163 + put_user_ex(regs->pt.ip, &user->regs.eip); 164 + put_user_ex(regs->pt.cs, &user->regs.cs); 165 + put_user_ex(regs->pt.flags, &user->regs.eflags); 166 + put_user_ex(regs->pt.sp, &user->regs.esp); 167 + put_user_ex(regs->pt.ss, &user->regs.ss); 168 + put_user_ex(regs->es, &user->regs.es); 169 + put_user_ex(regs->ds, &user->regs.ds); 170 + put_user_ex(regs->fs, &user->regs.fs); 171 + put_user_ex(regs->gs, &user->regs.gs); 172 + 173 + put_user_ex(vm86->screen_bitmap, &user->screen_bitmap); 174 + } put_user_catch(err); 175 + if (err) { 176 + pr_alert("could not access userspace vm86 info\n"); 116 177 do_exit(SIGSEGV); 117 178 } 118 179 119 180 tss = &per_cpu(cpu_tss, get_cpu()); 120 - current->thread.sp0 = current->thread.saved_sp0; 121 - current->thread.sysenter_cs = __KERNEL_CS; 122 - load_sp0(tss, &current->thread); 123 - current->thread.saved_sp0 = 0; 181 + tsk->thread.sp0 = vm86->saved_sp0; 182 + tsk->thread.sysenter_cs = __KERNEL_CS; 183 + load_sp0(tss, &tsk->thread); 184 + vm86->saved_sp0 = 0; 124 185 put_cpu(); 125 186 126 - ret = KVM86->regs32; 187 + memcpy(&regs->pt, &vm86->regs32, sizeof(struct pt_regs)); 127 188 128 - ret->fs = current->thread.saved_fs; 129 - set_user_gs(ret, current->thread.saved_gs); 189 + lazy_load_gs(vm86->regs32.gs); 130 190 131 - return ret; 191 + regs->pt.ax = retval; 132 192 } 133 193 134 194 static void 
mark_screen_rdonly(struct mm_struct *mm) ··· 192 200 193 201 194 202 static int do_vm86_irq_handling(int subfunction, int irqnumber); 195 - static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk); 203 + static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus); 196 204 197 - SYSCALL_DEFINE1(vm86old, struct vm86_struct __user *, v86) 205 + SYSCALL_DEFINE1(vm86old, struct vm86_struct __user *, user_vm86) 198 206 { 199 - struct kernel_vm86_struct info; /* declare this _on top_, 200 - * this avoids wasting of stack space. 201 - * This remains on the stack until we 202 - * return to 32 bit user space. 203 - */ 204 - struct task_struct *tsk = current; 205 - int tmp; 206 - 207 - if (tsk->thread.saved_sp0) 208 - return -EPERM; 209 - tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs, 210 - offsetof(struct kernel_vm86_struct, vm86plus) - 211 - sizeof(info.regs)); 212 - if (tmp) 213 - return -EFAULT; 214 - memset(&info.vm86plus, 0, (int)&info.regs32 - (int)&info.vm86plus); 215 - info.regs32 = current_pt_regs(); 216 - tsk->thread.vm86_info = v86; 217 - do_sys_vm86(&info, tsk); 218 - return 0; /* we never return here */ 207 + return do_sys_vm86((struct vm86plus_struct __user *) user_vm86, false); 219 208 } 220 209 221 210 222 211 SYSCALL_DEFINE2(vm86, unsigned long, cmd, unsigned long, arg) 223 212 { 224 - struct kernel_vm86_struct info; /* declare this _on top_, 225 - * this avoids wasting of stack space. 226 - * This remains on the stack until we 227 - * return to 32 bit user space. 
228 - */ 229 - struct task_struct *tsk; 230 - int tmp; 231 - struct vm86plus_struct __user *v86; 232 - 233 - tsk = current; 234 213 switch (cmd) { 235 214 case VM86_REQUEST_IRQ: 236 215 case VM86_FREE_IRQ: ··· 219 256 } 220 257 221 258 /* we come here only for functions VM86_ENTER, VM86_ENTER_NO_BYPASS */ 222 - if (tsk->thread.saved_sp0) 223 - return -EPERM; 224 - v86 = (struct vm86plus_struct __user *)arg; 225 - tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs, 226 - offsetof(struct kernel_vm86_struct, regs32) - 227 - sizeof(info.regs)); 228 - if (tmp) 229 - return -EFAULT; 230 - info.regs32 = current_pt_regs(); 231 - info.vm86plus.is_vm86pus = 1; 232 - tsk->thread.vm86_info = (struct vm86_struct __user *)v86; 233 - do_sys_vm86(&info, tsk); 234 - return 0; /* we never return here */ 259 + return do_sys_vm86((struct vm86plus_struct __user *) arg, true); 235 260 } 236 261 237 262 238 - static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk) 263 + static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) 239 264 { 240 265 struct tss_struct *tss; 241 - /* 242 - * make sure the vm86() system call doesn't try to do anything silly 243 - */ 244 - info->regs.pt.ds = 0; 245 - info->regs.pt.es = 0; 246 - info->regs.pt.fs = 0; 247 - #ifndef CONFIG_X86_32_LAZY_GS 248 - info->regs.pt.gs = 0; 249 - #endif 266 + struct task_struct *tsk = current; 267 + struct vm86 *vm86 = tsk->thread.vm86; 268 + struct kernel_vm86_regs vm86regs; 269 + struct pt_regs *regs = current_pt_regs(); 270 + unsigned long err = 0; 271 + 272 + if (!vm86) { 273 + if (!(vm86 = kzalloc(sizeof(*vm86), GFP_KERNEL))) 274 + return -ENOMEM; 275 + tsk->thread.vm86 = vm86; 276 + } 277 + if (vm86->saved_sp0) 278 + return -EPERM; 279 + 280 + if (!access_ok(VERIFY_READ, user_vm86, plus ? 
281 + sizeof(struct vm86_struct) : 282 + sizeof(struct vm86plus_struct))) 283 + return -EFAULT; 284 + 285 + memset(&vm86regs, 0, sizeof(vm86regs)); 286 + get_user_try { 287 + unsigned short seg; 288 + get_user_ex(vm86regs.pt.bx, &user_vm86->regs.ebx); 289 + get_user_ex(vm86regs.pt.cx, &user_vm86->regs.ecx); 290 + get_user_ex(vm86regs.pt.dx, &user_vm86->regs.edx); 291 + get_user_ex(vm86regs.pt.si, &user_vm86->regs.esi); 292 + get_user_ex(vm86regs.pt.di, &user_vm86->regs.edi); 293 + get_user_ex(vm86regs.pt.bp, &user_vm86->regs.ebp); 294 + get_user_ex(vm86regs.pt.ax, &user_vm86->regs.eax); 295 + get_user_ex(vm86regs.pt.ip, &user_vm86->regs.eip); 296 + get_user_ex(seg, &user_vm86->regs.cs); 297 + vm86regs.pt.cs = seg; 298 + get_user_ex(vm86regs.pt.flags, &user_vm86->regs.eflags); 299 + get_user_ex(vm86regs.pt.sp, &user_vm86->regs.esp); 300 + get_user_ex(seg, &user_vm86->regs.ss); 301 + vm86regs.pt.ss = seg; 302 + get_user_ex(vm86regs.es, &user_vm86->regs.es); 303 + get_user_ex(vm86regs.ds, &user_vm86->regs.ds); 304 + get_user_ex(vm86regs.fs, &user_vm86->regs.fs); 305 + get_user_ex(vm86regs.gs, &user_vm86->regs.gs); 306 + 307 + get_user_ex(vm86->flags, &user_vm86->flags); 308 + get_user_ex(vm86->screen_bitmap, &user_vm86->screen_bitmap); 309 + get_user_ex(vm86->cpu_type, &user_vm86->cpu_type); 310 + } get_user_catch(err); 311 + if (err) 312 + return err; 313 + 314 + if (copy_from_user(&vm86->int_revectored, 315 + &user_vm86->int_revectored, 316 + sizeof(struct revectored_struct))) 317 + return -EFAULT; 318 + if (copy_from_user(&vm86->int21_revectored, 319 + &user_vm86->int21_revectored, 320 + sizeof(struct revectored_struct))) 321 + return -EFAULT; 322 + if (plus) { 323 + if (copy_from_user(&vm86->vm86plus, &user_vm86->vm86plus, 324 + sizeof(struct vm86plus_info_struct))) 325 + return -EFAULT; 326 + vm86->vm86plus.is_vm86pus = 1; 327 + } else 328 + memset(&vm86->vm86plus, 0, 329 + sizeof(struct vm86plus_info_struct)); 330 + 331 + memcpy(&vm86->regs32, regs, 
sizeof(struct pt_regs)); 332 + vm86->user_vm86 = user_vm86; 250 333 251 334 /* 252 335 * The flags register is also special: we cannot trust that the user 253 336 * has set it up safely, so this makes sure interrupt etc flags are 254 337 * inherited from protected mode. 255 338 */ 256 - VEFLAGS = info->regs.pt.flags; 257 - info->regs.pt.flags &= SAFE_MASK; 258 - info->regs.pt.flags |= info->regs32->flags & ~SAFE_MASK; 259 - info->regs.pt.flags |= X86_VM_MASK; 339 + VEFLAGS = vm86regs.pt.flags; 340 + vm86regs.pt.flags &= SAFE_MASK; 341 + vm86regs.pt.flags |= regs->flags & ~SAFE_MASK; 342 + vm86regs.pt.flags |= X86_VM_MASK; 260 343 261 - switch (info->cpu_type) { 344 + vm86regs.pt.orig_ax = regs->orig_ax; 345 + 346 + switch (vm86->cpu_type) { 262 347 case CPU_286: 263 - tsk->thread.v86mask = 0; 348 + vm86->veflags_mask = 0; 264 349 break; 265 350 case CPU_386: 266 - tsk->thread.v86mask = X86_EFLAGS_NT | X86_EFLAGS_IOPL; 351 + vm86->veflags_mask = X86_EFLAGS_NT | X86_EFLAGS_IOPL; 267 352 break; 268 353 case CPU_486: 269 - tsk->thread.v86mask = X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL; 354 + vm86->veflags_mask = X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL; 270 355 break; 271 356 default: 272 - tsk->thread.v86mask = X86_EFLAGS_ID | X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL; 357 + vm86->veflags_mask = X86_EFLAGS_ID | X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL; 273 358 break; 274 359 } 275 360 276 361 /* 277 - * Save old state, set default return value (%ax) to 0 (VM86_SIGNAL) 362 + * Save old state 278 363 */ 279 - info->regs32->ax = VM86_SIGNAL; 280 - tsk->thread.saved_sp0 = tsk->thread.sp0; 281 - tsk->thread.saved_fs = info->regs32->fs; 282 - tsk->thread.saved_gs = get_user_gs(info->regs32); 364 + vm86->saved_sp0 = tsk->thread.sp0; 365 + lazy_save_gs(vm86->regs32.gs); 283 366 284 367 tss = &per_cpu(cpu_tss, get_cpu()); 285 - tsk->thread.sp0 = (unsigned long) &info->VM86_TSS_ESP0; 368 + /* make room for real-mode segments */ 369 + tsk->thread.sp0 
+= 16; 286 370 if (cpu_has_sep) 287 371 tsk->thread.sysenter_cs = 0; 288 372 load_sp0(tss, &tsk->thread); 289 373 put_cpu(); 290 374 291 - tsk->thread.screen_bitmap = info->screen_bitmap; 292 - if (info->flags & VM86_SCREEN_BITMAP) 375 + if (vm86->flags & VM86_SCREEN_BITMAP) 293 376 mark_screen_rdonly(tsk->mm); 294 377 295 - /*call __audit_syscall_exit since we do not exit via the normal paths */ 296 - #ifdef CONFIG_AUDITSYSCALL 297 - if (unlikely(current->audit_context)) 298 - __audit_syscall_exit(1, 0); 299 - #endif 300 - 301 - __asm__ __volatile__( 302 - "movl %0,%%esp\n\t" 303 - "movl %1,%%ebp\n\t" 304 - #ifdef CONFIG_X86_32_LAZY_GS 305 - "mov %2, %%gs\n\t" 306 - #endif 307 - "jmp resume_userspace" 308 - : /* no outputs */ 309 - :"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0)); 310 - /* we never return here */ 311 - } 312 - 313 - static inline void return_to_32bit(struct kernel_vm86_regs *regs16, int retval) 314 - { 315 - struct pt_regs *regs32; 316 - 317 - regs32 = save_v86_state(regs16); 318 - regs32->ax = retval; 319 - __asm__ __volatile__("movl %0,%%esp\n\t" 320 - "movl %1,%%ebp\n\t" 321 - "jmp resume_userspace" 322 - : : "r" (regs32), "r" (current_thread_info())); 378 + memcpy((struct kernel_vm86_regs *)regs, &vm86regs, sizeof(vm86regs)); 379 + force_iret(); 380 + return regs->ax; 323 381 } 324 382 325 383 static inline void set_IF(struct kernel_vm86_regs *regs) 326 384 { 327 385 VEFLAGS |= X86_EFLAGS_VIF; 328 - if (VEFLAGS & X86_EFLAGS_VIP) 329 - return_to_32bit(regs, VM86_STI); 330 386 } 331 387 332 388 static inline void clear_IF(struct kernel_vm86_regs *regs) ··· 377 395 378 396 static inline void set_vflags_long(unsigned long flags, struct kernel_vm86_regs *regs) 379 397 { 380 - set_flags(VEFLAGS, flags, current->thread.v86mask); 398 + set_flags(VEFLAGS, flags, current->thread.vm86->veflags_mask); 381 399 set_flags(regs->pt.flags, flags, SAFE_MASK); 382 400 if (flags & X86_EFLAGS_IF) 383 401 set_IF(regs); ··· 387 405 388 406 static inline 
void set_vflags_short(unsigned short flags, struct kernel_vm86_regs *regs) 389 407 { 390 - set_flags(VFLAGS, flags, current->thread.v86mask); 408 + set_flags(VFLAGS, flags, current->thread.vm86->veflags_mask); 391 409 set_flags(regs->pt.flags, flags, SAFE_MASK); 392 410 if (flags & X86_EFLAGS_IF) 393 411 set_IF(regs); ··· 402 420 if (VEFLAGS & X86_EFLAGS_VIF) 403 421 flags |= X86_EFLAGS_IF; 404 422 flags |= X86_EFLAGS_IOPL; 405 - return flags | (VEFLAGS & current->thread.v86mask); 423 + return flags | (VEFLAGS & current->thread.vm86->veflags_mask); 406 424 } 407 425 408 426 static inline int is_revectored(int nr, struct revectored_struct *bitmap) ··· 500 518 { 501 519 unsigned long __user *intr_ptr; 502 520 unsigned long segoffs; 521 + struct vm86 *vm86 = current->thread.vm86; 503 522 504 523 if (regs->pt.cs == BIOSSEG) 505 524 goto cannot_handle; 506 - if (is_revectored(i, &KVM86->int_revectored)) 525 + if (is_revectored(i, &vm86->int_revectored)) 507 526 goto cannot_handle; 508 - if (i == 0x21 && is_revectored(AH(regs), &KVM86->int21_revectored)) 527 + if (i == 0x21 && is_revectored(AH(regs), &vm86->int21_revectored)) 509 528 goto cannot_handle; 510 529 intr_ptr = (unsigned long __user *) (i << 2); 511 530 if (get_user(segoffs, intr_ptr)) ··· 525 542 return; 526 543 527 544 cannot_handle: 528 - return_to_32bit(regs, VM86_INTx + (i << 8)); 545 + save_v86_state(regs, VM86_INTx + (i << 8)); 529 546 } 530 547 531 548 int handle_vm86_trap(struct kernel_vm86_regs *regs, long error_code, int trapno) 532 549 { 533 - if (VMPI.is_vm86pus) { 550 + struct vm86 *vm86 = current->thread.vm86; 551 + 552 + if (vm86->vm86plus.is_vm86pus) { 534 553 if ((trapno == 3) || (trapno == 1)) { 535 - KVM86->regs32->ax = VM86_TRAP + (trapno << 8); 536 - /* setting this flag forces the code in entry_32.S to 537 - the path where we call save_v86_state() and change 538 - the stack pointer to KVM86->regs32 */ 539 - set_thread_flag(TIF_NOTIFY_RESUME); 554 + save_v86_state(regs, VM86_TRAP + 
(trapno << 8)); 540 555 return 0; 541 556 } 542 557 do_int(regs, trapno, (unsigned char __user *) (regs->pt.ss << 4), SP(regs)); ··· 555 574 unsigned char __user *ssp; 556 575 unsigned short ip, sp, orig_flags; 557 576 int data32, pref_done; 577 + struct vm86plus_info_struct *vmpi = &current->thread.vm86->vm86plus; 558 578 559 579 #define CHECK_IF_IN_TRAP \ 560 - if (VMPI.vm86dbg_active && VMPI.vm86dbg_TFpendig) \ 580 + if (vmpi->vm86dbg_active && vmpi->vm86dbg_TFpendig) \ 561 581 newflags |= X86_EFLAGS_TF 562 - #define VM86_FAULT_RETURN do { \ 563 - if (VMPI.force_return_for_pic && (VEFLAGS & (X86_EFLAGS_IF | X86_EFLAGS_VIF))) \ 564 - return_to_32bit(regs, VM86_PICRETURN); \ 565 - if (orig_flags & X86_EFLAGS_TF) \ 566 - handle_vm86_trap(regs, 0, 1); \ 567 - return; } while (0) 568 582 569 583 orig_flags = *(unsigned short *)&regs->pt.flags; 570 584 ··· 598 622 SP(regs) -= 2; 599 623 } 600 624 IP(regs) = ip; 601 - VM86_FAULT_RETURN; 625 + goto vm86_fault_return; 602 626 603 627 /* popf */ 604 628 case 0x9d: ··· 618 642 else 619 643 set_vflags_short(newflags, regs); 620 644 621 - VM86_FAULT_RETURN; 645 + goto check_vip; 622 646 } 623 647 624 648 /* int xx */ 625 649 case 0xcd: { 626 650 int intno = popb(csp, ip, simulate_sigsegv); 627 651 IP(regs) = ip; 628 - if (VMPI.vm86dbg_active) { 629 - if ((1 << (intno & 7)) & VMPI.vm86dbg_intxxtab[intno >> 3]) 630 - return_to_32bit(regs, VM86_INTx + (intno << 8)); 652 + if (vmpi->vm86dbg_active) { 653 + if ((1 << (intno & 7)) & vmpi->vm86dbg_intxxtab[intno >> 3]) { 654 + save_v86_state(regs, VM86_INTx + (intno << 8)); 655 + return; 656 + } 631 657 } 632 658 do_int(regs, intno, ssp, sp); 633 659 return; ··· 660 682 } else { 661 683 set_vflags_short(newflags, regs); 662 684 } 663 - VM86_FAULT_RETURN; 685 + goto check_vip; 664 686 } 665 687 666 688 /* cli */ 667 689 case 0xfa: 668 690 IP(regs) = ip; 669 691 clear_IF(regs); 670 - VM86_FAULT_RETURN; 692 + goto vm86_fault_return; 671 693 672 694 /* sti */ 673 695 /* ··· 679 701 
case 0xfb: 680 702 IP(regs) = ip; 681 703 set_IF(regs); 682 - VM86_FAULT_RETURN; 704 + goto check_vip; 683 705 684 706 default: 685 - return_to_32bit(regs, VM86_UNKNOWN); 707 + save_v86_state(regs, VM86_UNKNOWN); 686 708 } 687 709 710 + return; 711 + 712 + check_vip: 713 + if (VEFLAGS & X86_EFLAGS_VIP) { 714 + save_v86_state(regs, VM86_STI); 715 + return; 716 + } 717 + 718 + vm86_fault_return: 719 + if (vmpi->force_return_for_pic && (VEFLAGS & (X86_EFLAGS_IF | X86_EFLAGS_VIF))) { 720 + save_v86_state(regs, VM86_PICRETURN); 721 + return; 722 + } 723 + if (orig_flags & X86_EFLAGS_TF) 724 + handle_vm86_trap(regs, 0, X86_TRAP_DB); 688 725 return; 689 726 690 727 simulate_sigsegv: ··· 713 720 * should be a mixture of the two, but how do we 714 721 * get the information? [KD] 715 722 */ 716 - return_to_32bit(regs, VM86_UNKNOWN); 723 + save_v86_state(regs, VM86_UNKNOWN); 717 724 } 718 725 719 726 /* ---------------- vm86 special IRQ passing stuff ----------------- */
+2 -2
arch/x86/kvm/lapic.c
··· 1172 1172 1173 1173 tsc_deadline = apic->lapic_timer.expired_tscdeadline; 1174 1174 apic->lapic_timer.expired_tscdeadline = 0; 1175 - guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc()); 1175 + guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, rdtsc()); 1176 1176 trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline); 1177 1177 1178 1178 /* __delay is delay_tsc whenever the hardware has TSC, thus always. */ ··· 1240 1240 local_irq_save(flags); 1241 1241 1242 1242 now = apic->lapic_timer.timer.base->get_time(); 1243 - guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc()); 1243 + guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, rdtsc()); 1244 1244 if (likely(tscdeadline > guest_tsc)) { 1245 1245 ns = (tscdeadline - guest_tsc) * 1000000ULL; 1246 1246 do_div(ns, this_tsc_khz);
+2 -2
arch/x86/kvm/svm.c
··· 1139 1139 { 1140 1140 u64 tsc; 1141 1141 1142 - tsc = svm_scale_tsc(vcpu, native_read_tsc()); 1142 + tsc = svm_scale_tsc(vcpu, rdtsc()); 1143 1143 1144 1144 return target_tsc - tsc; 1145 1145 } ··· 3174 3174 switch (msr_info->index) { 3175 3175 case MSR_IA32_TSC: { 3176 3176 msr_info->data = svm->vmcb->control.tsc_offset + 3177 - svm_scale_tsc(vcpu, native_read_tsc()); 3177 + svm_scale_tsc(vcpu, rdtsc()); 3178 3178 3179 3179 break; 3180 3180 }
+2 -2
arch/x86/kvm/vmx.c
··· 2236 2236 { 2237 2237 u64 host_tsc, tsc_offset; 2238 2238 2239 - rdtscll(host_tsc); 2239 + host_tsc = rdtsc(); 2240 2240 tsc_offset = vmcs_read64(TSC_OFFSET); 2241 2241 return host_tsc + tsc_offset; 2242 2242 } ··· 2317 2317 2318 2318 static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) 2319 2319 { 2320 - return target_tsc - native_read_tsc(); 2320 + return target_tsc - rdtsc(); 2321 2321 } 2322 2322 2323 2323 static bool guest_cpuid_has_vmx(struct kvm_vcpu *vcpu)
+7 -19
arch/x86/kvm/x86.c
··· 1441 1441 1442 1442 static cycle_t read_tsc(void) 1443 1443 { 1444 - cycle_t ret; 1445 - u64 last; 1446 - 1447 - /* 1448 - * Empirically, a fence (of type that depends on the CPU) 1449 - * before rdtsc is enough to ensure that rdtsc is ordered 1450 - * with respect to loads. The various CPU manuals are unclear 1451 - * as to whether rdtsc can be reordered with later loads, 1452 - * but no one has ever seen it happen. 1453 - */ 1454 - rdtsc_barrier(); 1455 - ret = (cycle_t)vget_cycles(); 1456 - 1457 - last = pvclock_gtod_data.clock.cycle_last; 1444 + cycle_t ret = (cycle_t)rdtsc_ordered(); 1445 + u64 last = pvclock_gtod_data.clock.cycle_last; 1458 1446 1459 1447 if (likely(ret >= last)) 1460 1448 return ret; ··· 1631 1643 return 1; 1632 1644 } 1633 1645 if (!use_master_clock) { 1634 - host_tsc = native_read_tsc(); 1646 + host_tsc = rdtsc(); 1635 1647 kernel_ns = get_kernel_ns(); 1636 1648 } 1637 1649 ··· 2608 2620 2609 2621 if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) { 2610 2622 s64 tsc_delta = !vcpu->arch.last_host_tsc ? 
0 : 2611 - native_read_tsc() - vcpu->arch.last_host_tsc; 2623 + rdtsc() - vcpu->arch.last_host_tsc; 2612 2624 if (tsc_delta < 0) 2613 2625 mark_tsc_unstable("KVM discovered backwards TSC"); 2614 2626 if (check_tsc_unstable()) { ··· 2636 2648 { 2637 2649 kvm_x86_ops->vcpu_put(vcpu); 2638 2650 kvm_put_guest_fpu(vcpu); 2639 - vcpu->arch.last_host_tsc = native_read_tsc(); 2651 + vcpu->arch.last_host_tsc = rdtsc(); 2640 2652 } 2641 2653 2642 2654 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, ··· 6375 6387 hw_breakpoint_restore(); 6376 6388 6377 6389 vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, 6378 - native_read_tsc()); 6390 + rdtsc()); 6379 6391 6380 6392 vcpu->mode = OUTSIDE_GUEST_MODE; 6381 6393 smp_wmb(); ··· 7184 7196 if (ret != 0) 7185 7197 return ret; 7186 7198 7187 - local_tsc = native_read_tsc(); 7199 + local_tsc = rdtsc(); 7188 7200 stable = !check_tsc_unstable(); 7189 7201 list_for_each_entry(kvm, &vm_list, vm_list) { 7190 7202 kvm_for_each_vcpu(i, vcpu, kvm) {
+51 -9
arch/x86/lib/delay.c
··· 20 20 #include <asm/processor.h> 21 21 #include <asm/delay.h> 22 22 #include <asm/timer.h> 23 + #include <asm/mwait.h> 23 24 24 25 #ifdef CONFIG_SMP 25 26 # include <asm/smp.h> ··· 50 49 /* TSC based delay: */ 51 50 static void delay_tsc(unsigned long __loops) 52 51 { 53 - u32 bclock, now, loops = __loops; 52 + u64 bclock, now, loops = __loops; 54 53 int cpu; 55 54 56 55 preempt_disable(); 57 56 cpu = smp_processor_id(); 58 - rdtsc_barrier(); 59 - rdtscl(bclock); 57 + bclock = rdtsc_ordered(); 60 58 for (;;) { 61 - rdtsc_barrier(); 62 - rdtscl(now); 59 + now = rdtsc_ordered(); 63 60 if ((now - bclock) >= loops) 64 61 break; 65 62 ··· 78 79 if (unlikely(cpu != smp_processor_id())) { 79 80 loops -= (now - bclock); 80 81 cpu = smp_processor_id(); 81 - rdtsc_barrier(); 82 - rdtscl(bclock); 82 + bclock = rdtsc_ordered(); 83 83 } 84 84 } 85 85 preempt_enable(); 86 + } 87 + 88 + /* 89 + * On some AMD platforms, MWAITX has a configurable 32-bit timer, that 90 + * counts with TSC frequency. The input value is the loop of the 91 + * counter, it will exit when the timer expires. 92 + */ 93 + static void delay_mwaitx(unsigned long __loops) 94 + { 95 + u64 start, end, delay, loops = __loops; 96 + 97 + start = rdtsc_ordered(); 98 + 99 + for (;;) { 100 + delay = min_t(u64, MWAITX_MAX_LOOPS, loops); 101 + 102 + /* 103 + * Use cpu_tss as a cacheline-aligned, seldomly 104 + * accessed per-cpu variable as the monitor target. 105 + */ 106 + __monitorx(this_cpu_ptr(&cpu_tss), 0, 0); 107 + 108 + /* 109 + * AMD, like Intel, supports the EAX hint and EAX=0xf 110 + * means, do not enter any deep C-state and we use it 111 + * here in delay() to minimize wakeup latency. 
112 + */ 113 + __mwaitx(MWAITX_DISABLE_CSTATES, delay, MWAITX_ECX_TIMER_ENABLE); 114 + 115 + end = rdtsc_ordered(); 116 + 117 + if (loops <= end - start) 118 + break; 119 + 120 + loops -= end - start; 121 + 122 + start = end; 123 + } 86 124 } 87 125 88 126 /* ··· 130 94 131 95 void use_tsc_delay(void) 132 96 { 133 - delay_fn = delay_tsc; 97 + if (delay_fn == delay_loop) 98 + delay_fn = delay_tsc; 99 + } 100 + 101 + void use_mwaitx_delay(void) 102 + { 103 + delay_fn = delay_mwaitx; 134 104 } 135 105 136 106 int read_current_timer(unsigned long *timer_val) 137 107 { 138 108 if (delay_fn == delay_tsc) { 139 - rdtscll(*timer_val); 109 + *timer_val = rdtsc(); 140 110 return 0; 141 111 } 142 112 return -1;
+1
arch/x86/math-emu/get_address.c
··· 20 20 #include <linux/stddef.h> 21 21 22 22 #include <asm/uaccess.h> 23 + #include <asm/vm86.h> 23 24 24 25 #include "fpu_system.h" 25 26 #include "exception.h"
+5 -2
arch/x86/mm/fault.c
··· 20 20 #include <asm/kmemcheck.h> /* kmemcheck_*(), ... */ 21 21 #include <asm/fixmap.h> /* VSYSCALL_ADDR */ 22 22 #include <asm/vsyscall.h> /* emulate_vsyscall */ 23 + #include <asm/vm86.h> /* struct vm86 */ 23 24 24 25 #define CREATE_TRACE_POINTS 25 26 #include <asm/trace/exceptions.h> ··· 302 301 check_v8086_mode(struct pt_regs *regs, unsigned long address, 303 302 struct task_struct *tsk) 304 303 { 304 + #ifdef CONFIG_VM86 305 305 unsigned long bit; 306 306 307 - if (!v8086_mode(regs)) 307 + if (!v8086_mode(regs) || !tsk->thread.vm86) 308 308 return; 309 309 310 310 bit = (address - 0xA0000) >> PAGE_SHIFT; 311 311 if (bit < 32) 312 - tsk->thread.screen_bitmap |= 1 << bit; 312 + tsk->thread.vm86->screen_bitmap |= 1 << bit; 313 + #endif 313 314 } 314 315 315 316 static bool low_pfn(unsigned long pfn)
-13
arch/x86/um/asm/barrier.h
··· 45 45 #define read_barrier_depends() do { } while (0) 46 46 #define smp_read_barrier_depends() do { } while (0) 47 47 48 - /* 49 - * Stop RDTSC speculation. This is needed when you need to use RDTSC 50 - * (or get_cycles or vread that possibly accesses the TSC) in a defined 51 - * code region. 52 - * 53 - * (Could use an alternative three way for this if there was one.) 54 - */ 55 - static inline void rdtsc_barrier(void) 56 - { 57 - alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC, 58 - "lfence", X86_FEATURE_LFENCE_RDTSC); 59 - } 60 - 61 48 #endif
-3
arch/x86/xen/enlighten.c
··· 1215 1215 .read_msr = xen_read_msr_safe, 1216 1216 .write_msr = xen_write_msr_safe, 1217 1217 1218 - .read_tsc = native_read_tsc, 1219 1218 .read_pmc = native_read_pmc, 1220 - 1221 - .read_tscp = native_read_tscp, 1222 1219 1223 1220 .iret = xen_iret, 1224 1221 #ifdef CONFIG_X86_64
+1 -1
drivers/cpufreq/intel_pstate.c
··· 766 766 local_irq_save(flags); 767 767 rdmsrl(MSR_IA32_APERF, aperf); 768 768 rdmsrl(MSR_IA32_MPERF, mperf); 769 - tsc = native_read_tsc(); 769 + tsc = rdtsc(); 770 770 local_irq_restore(flags); 771 771 772 772 cpu->last_sample_time = cpu->sample.time;
+2 -2
drivers/input/gameport/gameport.c
··· 149 149 150 150 for(i = 0; i < 50; i++) { 151 151 local_irq_save(flags); 152 - rdtscl(t1); 152 + t1 = rdtsc(); 153 153 for (t = 0; t < 50; t++) gameport_read(gameport); 154 - rdtscl(t2); 154 + t2 = rdtsc(); 155 155 local_irq_restore(flags); 156 156 udelay(i * 10); 157 157 if (t2 - t1 < tx) tx = t2 - t1;
+2 -2
drivers/input/joystick/analog.c
··· 143 143 144 144 #include <linux/i8253.h> 145 145 146 - #define GET_TIME(x) do { if (cpu_has_tsc) rdtscl(x); else x = get_time_pit(); } while (0) 146 + #define GET_TIME(x) do { if (cpu_has_tsc) x = (unsigned int)rdtsc(); else x = get_time_pit(); } while (0) 147 147 #define DELTA(x,y) (cpu_has_tsc ? ((y) - (x)) : ((x) - (y) + ((x) < (y) ? PIT_TICK_RATE / HZ : 0))) 148 148 #define TIME_NAME (cpu_has_tsc?"TSC":"PIT") 149 149 static unsigned int get_time_pit(void) ··· 160 160 return count; 161 161 } 162 162 #elif defined(__x86_64__) 163 - #define GET_TIME(x) rdtscl(x) 163 + #define GET_TIME(x) do { x = (unsigned int)rdtsc(); } while (0) 164 164 #define DELTA(x,y) ((y)-(x)) 165 165 #define TIME_NAME "TSC" 166 166 #elif defined(__alpha__) || defined(CONFIG_MN10300) || defined(CONFIG_ARM) || defined(CONFIG_ARM64) || defined(CONFIG_TILE)
+1 -1
drivers/net/hamradio/baycom_epp.c
··· 638 638 #define GETTICK(x) \ 639 639 ({ \ 640 640 if (cpu_has_tsc) \ 641 - rdtscl(x); \ 641 + x = (unsigned int)rdtsc(); \ 642 642 }) 643 643 #else /* __i386__ */ 644 644 #define GETTICK(x)
+3
drivers/scsi/dpt_i2o.c
··· 1924 1924 #endif 1925 1925 1926 1926 #if defined __i386__ 1927 + 1928 + #include <uapi/asm/vm86.h> 1929 + 1927 1930 static void adpt_i386_info(sysInfo_S* si) 1928 1931 { 1929 1932 // This is all the info we need for now
+4 -59
drivers/staging/media/lirc/lirc_serial.c
··· 327 327 * time 328 328 */ 329 329 330 - /* So send_pulse can quickly convert microseconds to clocks */ 331 - static unsigned long conv_us_to_clocks; 332 - 333 330 static int init_timing_params(unsigned int new_duty_cycle, 334 331 unsigned int new_freq) 335 332 { ··· 341 344 /* How many clocks in a microsecond?, avoiding long long divide */ 342 345 work = loops_per_sec; 343 346 work *= 4295; /* 4295 = 2^32 / 1e6 */ 344 - conv_us_to_clocks = work >> 32; 345 347 346 348 /* 347 349 * Carrier period in clocks, approach good up to 32GHz clock, ··· 353 357 pulse_width = period * duty_cycle / 100; 354 358 space_width = period - pulse_width; 355 359 dprintk("in init_timing_params, freq=%d, duty_cycle=%d, " 356 - "clk/jiffy=%ld, pulse=%ld, space=%ld, " 357 - "conv_us_to_clocks=%ld\n", 360 + "clk/jiffy=%ld, pulse=%ld, space=%ld\n", 358 361 freq, duty_cycle, __this_cpu_read(cpu_info.loops_per_jiffy), 359 - pulse_width, space_width, conv_us_to_clocks); 362 + pulse_width, space_width); 360 363 return 0; 361 364 } 362 365 #else /* ! USE_RDTSC */ ··· 426 431 return ret; 427 432 } 428 433 429 - #ifdef USE_RDTSC 430 - /* Version that uses Pentium rdtsc instruction to measure clocks */ 431 - 432 - /* 433 - * This version does sub-microsecond timing using rdtsc instruction, 434 - * and does away with the fudged LIRC_SERIAL_TRANSMITTER_LATENCY 435 - * Implicitly i586 architecture... - Steve 436 - */ 437 - 438 - static long send_pulse_homebrew_softcarrier(unsigned long length) 439 - { 440 - int flag; 441 - unsigned long target, start, now; 442 - 443 - /* Get going quick as we can */ 444 - rdtscl(start); 445 - on(); 446 - /* Convert length from microseconds to clocks */ 447 - length *= conv_us_to_clocks; 448 - /* And loop till time is up - flipping at right intervals */ 449 - now = start; 450 - target = pulse_width; 451 - flag = 1; 452 - /* 453 - * FIXME: This looks like a hard busy wait, without even an occasional, 454 - * polite, cpu_relax() call. There's got to be a better way? 
455 - * 456 - * The i2c code has the result of a lot of bit-banging work, I wonder if 457 - * there's something there which could be helpful here. 458 - */ 459 - while ((now - start) < length) { 460 - /* Delay till flip time */ 461 - do { 462 - rdtscl(now); 463 - } while ((now - start) < target); 464 - 465 - /* flip */ 466 - if (flag) { 467 - rdtscl(now); 468 - off(); 469 - target += space_width; 470 - } else { 471 - rdtscl(now); on(); 472 - target += pulse_width; 473 - } 474 - flag = !flag; 475 - } 476 - rdtscl(now); 477 - return ((now - start) - length) / conv_us_to_clocks; 478 - } 479 - #else /* ! USE_RDTSC */ 480 434 /* Version using udelay() */ 481 435 482 436 /* 483 437 * here we use fixed point arithmetic, with 8 484 438 * fractional bits. that gets us within 0.1% or so of the right average 485 439 * frequency, albeit with some jitter in pulse length - Steve 440 + * 441 + * This should use ndelay instead. 486 442 */ 487 443 488 444 /* To match 8 fractional bits used for pulse/space length */ ··· 466 520 } 467 521 return (actual-length) >> 8; 468 522 } 469 - #endif /* USE_RDTSC */ 470 523 471 524 static long send_pulse_homebrew(unsigned long length) 472 525 {
+2 -2
drivers/thermal/intel_powerclamp.c
··· 340 340 341 341 /* check result for the last window */ 342 342 msr_now = pkg_state_counter(); 343 - rdtscll(tsc_now); 343 + tsc_now = rdtsc(); 344 344 345 345 /* calculate pkg cstate vs tsc ratio */ 346 346 if (!msr_last || !tsc_last) ··· 482 482 u64 val64; 483 483 484 484 msr_now = pkg_state_counter(); 485 - rdtscll(tsc_now); 485 + tsc_now = rdtsc(); 486 486 jiffies_now = jiffies; 487 487 488 488 /* calculate pkg cstate vs tsc ratio */
+15
include/linux/context_tracking.h
··· 49 49 } 50 50 } 51 51 52 + 53 + /** 54 + * ct_state() - return the current context tracking state if known 55 + * 56 + * Returns the current cpu's context tracking state if context tracking 57 + * is enabled. If context tracking is disabled, returns 58 + * CONTEXT_DISABLED. This should be used primarily for debugging. 59 + */ 60 + static inline enum ctx_state ct_state(void) 61 + { 62 + return context_tracking_is_enabled() ? 63 + this_cpu_read(context_tracking.state) : CONTEXT_DISABLED; 64 + } 52 65 #else 53 66 static inline void user_enter(void) { } 54 67 static inline void user_exit(void) { } 55 68 static inline enum ctx_state exception_enter(void) { return 0; } 56 69 static inline void exception_exit(enum ctx_state prev_ctx) { } 70 + static inline enum ctx_state ct_state(void) { return CONTEXT_DISABLED; } 57 71 #endif /* !CONFIG_CONTEXT_TRACKING */ 58 72 73 + #define CT_WARN_ON(cond) WARN_ON(context_tracking_is_enabled() && (cond)) 59 74 60 75 #ifdef CONFIG_CONTEXT_TRACKING_FORCE 61 76 extern void context_tracking_init(void);
+1
include/linux/context_tracking_state.h
··· 14 14 bool active; 15 15 int recursion; 16 16 enum ctx_state { 17 + CONTEXT_DISABLED = -1, /* returned by ct_state() if unknown */ 17 18 CONTEXT_KERNEL = 0, 18 19 CONTEXT_USER, 19 20 CONTEXT_GUEST,
+15 -15
include/linux/spinlock.h
··· 286 286 * Map the spin_lock functions to the raw variants for PREEMPT_RT=n 287 287 */ 288 288 289 - static inline raw_spinlock_t *spinlock_check(spinlock_t *lock) 289 + static __always_inline raw_spinlock_t *spinlock_check(spinlock_t *lock) 290 290 { 291 291 return &lock->rlock; 292 292 } ··· 297 297 raw_spin_lock_init(&(_lock)->rlock); \ 298 298 } while (0) 299 299 300 - static inline void spin_lock(spinlock_t *lock) 300 + static __always_inline void spin_lock(spinlock_t *lock) 301 301 { 302 302 raw_spin_lock(&lock->rlock); 303 303 } 304 304 305 - static inline void spin_lock_bh(spinlock_t *lock) 305 + static __always_inline void spin_lock_bh(spinlock_t *lock) 306 306 { 307 307 raw_spin_lock_bh(&lock->rlock); 308 308 } 309 309 310 - static inline int spin_trylock(spinlock_t *lock) 310 + static __always_inline int spin_trylock(spinlock_t *lock) 311 311 { 312 312 return raw_spin_trylock(&lock->rlock); 313 313 } ··· 327 327 raw_spin_lock_nest_lock(spinlock_check(lock), nest_lock); \ 328 328 } while (0) 329 329 330 - static inline void spin_lock_irq(spinlock_t *lock) 330 + static __always_inline void spin_lock_irq(spinlock_t *lock) 331 331 { 332 332 raw_spin_lock_irq(&lock->rlock); 333 333 } ··· 342 342 raw_spin_lock_irqsave_nested(spinlock_check(lock), flags, subclass); \ 343 343 } while (0) 344 344 345 - static inline void spin_unlock(spinlock_t *lock) 345 + static __always_inline void spin_unlock(spinlock_t *lock) 346 346 { 347 347 raw_spin_unlock(&lock->rlock); 348 348 } 349 349 350 - static inline void spin_unlock_bh(spinlock_t *lock) 350 + static __always_inline void spin_unlock_bh(spinlock_t *lock) 351 351 { 352 352 raw_spin_unlock_bh(&lock->rlock); 353 353 } 354 354 355 - static inline void spin_unlock_irq(spinlock_t *lock) 355 + static __always_inline void spin_unlock_irq(spinlock_t *lock) 356 356 { 357 357 raw_spin_unlock_irq(&lock->rlock); 358 358 } 359 359 360 - static inline void spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) 360 + 
static __always_inline void spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) 361 361 { 362 362 raw_spin_unlock_irqrestore(&lock->rlock, flags); 363 363 } 364 364 365 - static inline int spin_trylock_bh(spinlock_t *lock) 365 + static __always_inline int spin_trylock_bh(spinlock_t *lock) 366 366 { 367 367 return raw_spin_trylock_bh(&lock->rlock); 368 368 } 369 369 370 - static inline int spin_trylock_irq(spinlock_t *lock) 370 + static __always_inline int spin_trylock_irq(spinlock_t *lock) 371 371 { 372 372 return raw_spin_trylock_irq(&lock->rlock); 373 373 } ··· 377 377 raw_spin_trylock_irqsave(spinlock_check(lock), flags); \ 378 378 }) 379 379 380 - static inline void spin_unlock_wait(spinlock_t *lock) 380 + static __always_inline void spin_unlock_wait(spinlock_t *lock) 381 381 { 382 382 raw_spin_unlock_wait(&lock->rlock); 383 383 } 384 384 385 - static inline int spin_is_locked(spinlock_t *lock) 385 + static __always_inline int spin_is_locked(spinlock_t *lock) 386 386 { 387 387 return raw_spin_is_locked(&lock->rlock); 388 388 } 389 389 390 - static inline int spin_is_contended(spinlock_t *lock) 390 + static __always_inline int spin_is_contended(spinlock_t *lock) 391 391 { 392 392 return raw_spin_is_contended(&lock->rlock); 393 393 } 394 394 395 - static inline int spin_can_lock(spinlock_t *lock) 395 + static __always_inline int spin_can_lock(spinlock_t *lock) 396 396 { 397 397 return raw_spin_can_lock(&lock->rlock); 398 398 }
+2
kernel/notifier.c
··· 544 544 .signr = sig, 545 545 546 546 }; 547 + RCU_LOCKDEP_WARN(!rcu_is_watching(), 548 + "notify_die called but RCU thinks we're quiescent"); 547 549 return atomic_notifier_call_chain(&die_chain, val, &args); 548 550 } 549 551 NOKPROBE_SYMBOL(notify_die);
+1
kernel/sys_ni.c
··· 140 140 cond_syscall(sys_ssetmask); 141 141 cond_syscall(sys_vm86old); 142 142 cond_syscall(sys_vm86); 143 + cond_syscall(sys_modify_ldt); 143 144 cond_syscall(sys_ipc); 144 145 cond_syscall(compat_sys_ipc); 145 146 cond_syscall(compat_sys_sysctl);
+2 -2
tools/power/cpupower/debug/kernel/cpufreq-test_tsc.c
··· 81 81 82 82 printk(KERN_DEBUG "start--> \n"); 83 83 then = read_pmtmr(); 84 - rdtscll(then_tsc); 84 + then_tsc = rdtsc(); 85 85 for (i=0;i<20;i++) { 86 86 mdelay(100); 87 87 now = read_pmtmr(); 88 - rdtscll(now_tsc); 88 + now_tsc = rdtsc(); 89 89 diff = (now - then) & 0xFFFFFF; 90 90 diff_tsc = now_tsc - then_tsc; 91 91 printk(KERN_DEBUG "t1: %08u t2: %08u diff_pmtmr: %08u diff_tsc: %016llu\n", then, now, diff, diff_tsc);
+2 -2
tools/testing/selftests/x86/Makefile
··· 4 4 5 5 .PHONY: all all_32 all_64 warn_32bit_failure clean 6 6 7 - TARGETS_C_BOTHBITS := sigreturn single_step_syscall sysret_ss_attrs 8 - TARGETS_C_32BIT_ONLY := entry_from_vm86 7 + TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs ldt_gdt syscall_nt 8 + TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault sigreturn 9 9 10 10 TARGETS_C_32BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_32BIT_ONLY) 11 11 BINARIES_32 := $(TARGETS_C_32BIT_ALL:%=%_32)
+128 -9
tools/testing/selftests/x86/entry_from_vm86.c
··· 28 28 static unsigned long load_addr = 0x10000; 29 29 static int nerrs = 0; 30 30 31 + static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), 32 + int flags) 33 + { 34 + struct sigaction sa; 35 + memset(&sa, 0, sizeof(sa)); 36 + sa.sa_sigaction = handler; 37 + sa.sa_flags = SA_SIGINFO | flags; 38 + sigemptyset(&sa.sa_mask); 39 + if (sigaction(sig, &sa, 0)) 40 + err(1, "sigaction"); 41 + } 42 + 43 + static void clearhandler(int sig) 44 + { 45 + struct sigaction sa; 46 + memset(&sa, 0, sizeof(sa)); 47 + sa.sa_handler = SIG_DFL; 48 + sigemptyset(&sa.sa_mask); 49 + if (sigaction(sig, &sa, 0)) 50 + err(1, "sigaction"); 51 + } 52 + 53 + static sig_atomic_t got_signal; 54 + 55 + static void sighandler(int sig, siginfo_t *info, void *ctx_void) 56 + { 57 + ucontext_t *ctx = (ucontext_t*)ctx_void; 58 + 59 + if (ctx->uc_mcontext.gregs[REG_EFL] & X86_EFLAGS_VM || 60 + (ctx->uc_mcontext.gregs[REG_CS] & 3) != 3) { 61 + printf("[FAIL]\tSignal frame should not reflect vm86 mode\n"); 62 + nerrs++; 63 + } 64 + 65 + const char *signame; 66 + if (sig == SIGSEGV) 67 + signame = "SIGSEGV"; 68 + else if (sig == SIGILL) 69 + signame = "SIGILL"; 70 + else 71 + signame = "unexpected signal"; 72 + 73 + printf("[INFO]\t%s: FLAGS = 0x%lx, CS = 0x%hx\n", signame, 74 + (unsigned long)ctx->uc_mcontext.gregs[REG_EFL], 75 + (unsigned short)ctx->uc_mcontext.gregs[REG_CS]); 76 + 77 + got_signal = 1; 78 + } 79 + 31 80 asm ( 32 81 ".pushsection .rodata\n\t" 33 82 ".type vmcode_bound, @object\n\t" ··· 87 38 "int3\n\t" 88 39 "vmcode_sysenter:\n\t" 89 40 "sysenter\n\t" 41 + "vmcode_syscall:\n\t" 42 + "syscall\n\t" 43 + "vmcode_sti:\n\t" 44 + "sti\n\t" 45 + "vmcode_int3:\n\t" 46 + "int3\n\t" 47 + "vmcode_int80:\n\t" 48 + "int $0x80\n\t" 90 49 ".size vmcode, . 
- vmcode\n\t" 91 50 "end_vmcode:\n\t" 92 51 ".code32\n\t" ··· 102 45 ); 103 46 104 47 extern unsigned char vmcode[], end_vmcode[]; 105 - extern unsigned char vmcode_bound[], vmcode_sysenter[]; 48 + extern unsigned char vmcode_bound[], vmcode_sysenter[], vmcode_syscall[], 49 + vmcode_sti[], vmcode_int3[], vmcode_int80[]; 106 50 107 - static void do_test(struct vm86plus_struct *v86, unsigned long eip, 51 + /* Returns false if the test was skipped. */ 52 + static bool do_test(struct vm86plus_struct *v86, unsigned long eip, 53 + unsigned int rettype, unsigned int retarg, 108 54 const char *text) 109 55 { 110 56 long ret; ··· 118 58 119 59 if (ret == -1 && errno == ENOSYS) { 120 60 printf("[SKIP]\tvm86 not supported\n"); 121 - return; 61 + return false; 122 62 } 123 63 124 64 if (VM86_TYPE(ret) == VM86_INTx) { ··· 133 73 else 134 74 sprintf(trapname, "%d", trapno); 135 75 136 - printf("[OK]\tExited vm86 mode due to #%s\n", trapname); 76 + printf("[INFO]\tExited vm86 mode due to #%s\n", trapname); 137 77 } else if (VM86_TYPE(ret) == VM86_UNKNOWN) { 138 - printf("[OK]\tExited vm86 mode due to unhandled GP fault\n"); 78 + printf("[INFO]\tExited vm86 mode due to unhandled GP fault\n"); 79 + } else if (VM86_TYPE(ret) == VM86_TRAP) { 80 + printf("[INFO]\tExited vm86 mode due to a trap (arg=%ld)\n", 81 + VM86_ARG(ret)); 82 + } else if (VM86_TYPE(ret) == VM86_SIGNAL) { 83 + printf("[INFO]\tExited vm86 mode due to a signal\n"); 84 + } else if (VM86_TYPE(ret) == VM86_STI) { 85 + printf("[INFO]\tExited vm86 mode due to STI\n"); 139 86 } else { 140 - printf("[OK]\tExited vm86 mode due to type %ld, arg %ld\n", 87 + printf("[INFO]\tExited vm86 mode due to type %ld, arg %ld\n", 141 88 VM86_TYPE(ret), VM86_ARG(ret)); 142 89 } 90 + 91 + if (rettype == -1 || 92 + (VM86_TYPE(ret) == rettype && VM86_ARG(ret) == retarg)) { 93 + printf("[OK]\tReturned correctly\n"); 94 + } else { 95 + printf("[FAIL]\tIncorrect return reason\n"); 96 + nerrs++; 97 + } 98 + 99 + return true; 143 100 } 144 101 
145 102 int main(void) ··· 182 105 assert((v86.regs.cs & 3) == 0); /* Looks like RPL = 0 */ 183 106 184 107 /* #BR -- should deliver SIG??? */ 185 - do_test(&v86, vmcode_bound - vmcode, "#BR"); 108 + do_test(&v86, vmcode_bound - vmcode, VM86_INTx, 5, "#BR"); 186 109 187 - /* SYSENTER -- should cause #GP or #UD depending on CPU */ 188 - do_test(&v86, vmcode_sysenter - vmcode, "SYSENTER"); 110 + /* 111 + * SYSENTER -- should cause #GP or #UD depending on CPU. 112 + * Expected return type -1 means that we shouldn't validate 113 + * the vm86 return value. This will avoid problems on non-SEP 114 + * CPUs. 115 + */ 116 + sethandler(SIGILL, sighandler, 0); 117 + do_test(&v86, vmcode_sysenter - vmcode, -1, 0, "SYSENTER"); 118 + clearhandler(SIGILL); 119 + 120 + /* 121 + * SYSCALL would be a disaster in VM86 mode. Fortunately, 122 + * there is no kernel that both enables SYSCALL and sets 123 + * EFER.SCE, so it's #UD on all systems. But vm86 is 124 + * buggy (or has a "feature"), so the SIGILL will actually 125 + * be delivered. 
126 + */ 127 + sethandler(SIGILL, sighandler, 0); 128 + do_test(&v86, vmcode_syscall - vmcode, VM86_SIGNAL, 0, "SYSCALL"); 129 + clearhandler(SIGILL); 130 + 131 + /* STI with VIP set */ 132 + v86.regs.eflags |= X86_EFLAGS_VIP; 133 + v86.regs.eflags &= ~X86_EFLAGS_IF; 134 + do_test(&v86, vmcode_sti - vmcode, VM86_STI, 0, "STI with VIP set"); 135 + 136 + /* INT3 -- should cause #BP */ 137 + do_test(&v86, vmcode_int3 - vmcode, VM86_TRAP, 3, "INT3"); 138 + 139 + /* INT80 -- should exit with "INTx 0x80" */ 140 + v86.regs.eax = (unsigned int)-1; 141 + do_test(&v86, vmcode_int80 - vmcode, VM86_INTx, 0x80, "int80"); 142 + 143 + /* Execute a null pointer */ 144 + v86.regs.cs = 0; 145 + v86.regs.ss = 0; 146 + sethandler(SIGSEGV, sighandler, 0); 147 + got_signal = 0; 148 + if (do_test(&v86, 0, VM86_SIGNAL, 0, "Execute null pointer") && 149 + !got_signal) { 150 + printf("[FAIL]\tDid not receive SIGSEGV\n"); 151 + nerrs++; 152 + } 153 + clearhandler(SIGSEGV); 189 154 190 155 return (nerrs == 0 ? 0 : 1); 191 156 }
+576
tools/testing/selftests/x86/ldt_gdt.c
··· 1 + /* 2 + * ldt_gdt.c - Test cases for LDT and GDT access 3 + * Copyright (c) 2015 Andrew Lutomirski 4 + */ 5 + 6 + #define _GNU_SOURCE 7 + #include <err.h> 8 + #include <stdio.h> 9 + #include <stdint.h> 10 + #include <signal.h> 11 + #include <setjmp.h> 12 + #include <stdlib.h> 13 + #include <string.h> 14 + #include <errno.h> 15 + #include <unistd.h> 16 + #include <sys/syscall.h> 17 + #include <asm/ldt.h> 18 + #include <sys/types.h> 19 + #include <sys/wait.h> 20 + #include <stdbool.h> 21 + #include <pthread.h> 22 + #include <sched.h> 23 + #include <linux/futex.h> 24 + 25 + #define AR_ACCESSED (1<<8) 26 + 27 + #define AR_TYPE_RODATA (0 * (1<<9)) 28 + #define AR_TYPE_RWDATA (1 * (1<<9)) 29 + #define AR_TYPE_RODATA_EXPDOWN (2 * (1<<9)) 30 + #define AR_TYPE_RWDATA_EXPDOWN (3 * (1<<9)) 31 + #define AR_TYPE_XOCODE (4 * (1<<9)) 32 + #define AR_TYPE_XRCODE (5 * (1<<9)) 33 + #define AR_TYPE_XOCODE_CONF (6 * (1<<9)) 34 + #define AR_TYPE_XRCODE_CONF (7 * (1<<9)) 35 + 36 + #define AR_DPL3 (3 * (1<<13)) 37 + 38 + #define AR_S (1 << 12) 39 + #define AR_P (1 << 15) 40 + #define AR_AVL (1 << 20) 41 + #define AR_L (1 << 21) 42 + #define AR_DB (1 << 22) 43 + #define AR_G (1 << 23) 44 + 45 + static int nerrs; 46 + 47 + static void check_invalid_segment(uint16_t index, int ldt) 48 + { 49 + uint32_t has_limit = 0, has_ar = 0, limit, ar; 50 + uint32_t selector = (index << 3) | (ldt << 2) | 3; 51 + 52 + asm ("lsl %[selector], %[limit]\n\t" 53 + "jnz 1f\n\t" 54 + "movl $1, %[has_limit]\n\t" 55 + "1:" 56 + : [limit] "=r" (limit), [has_limit] "+rm" (has_limit) 57 + : [selector] "r" (selector)); 58 + asm ("larl %[selector], %[ar]\n\t" 59 + "jnz 1f\n\t" 60 + "movl $1, %[has_ar]\n\t" 61 + "1:" 62 + : [ar] "=r" (ar), [has_ar] "+rm" (has_ar) 63 + : [selector] "r" (selector)); 64 + 65 + if (has_limit || has_ar) { 66 + printf("[FAIL]\t%s entry %hu is valid but should be invalid\n", 67 + (ldt ? 
"LDT" : "GDT"), index); 68 + nerrs++; 69 + } else { 70 + printf("[OK]\t%s entry %hu is invalid\n", 71 + (ldt ? "LDT" : "GDT"), index); 72 + } 73 + } 74 + 75 + static void check_valid_segment(uint16_t index, int ldt, 76 + uint32_t expected_ar, uint32_t expected_limit, 77 + bool verbose) 78 + { 79 + uint32_t has_limit = 0, has_ar = 0, limit, ar; 80 + uint32_t selector = (index << 3) | (ldt << 2) | 3; 81 + 82 + asm ("lsl %[selector], %[limit]\n\t" 83 + "jnz 1f\n\t" 84 + "movl $1, %[has_limit]\n\t" 85 + "1:" 86 + : [limit] "=r" (limit), [has_limit] "+rm" (has_limit) 87 + : [selector] "r" (selector)); 88 + asm ("larl %[selector], %[ar]\n\t" 89 + "jnz 1f\n\t" 90 + "movl $1, %[has_ar]\n\t" 91 + "1:" 92 + : [ar] "=r" (ar), [has_ar] "+rm" (has_ar) 93 + : [selector] "r" (selector)); 94 + 95 + if (!has_limit || !has_ar) { 96 + printf("[FAIL]\t%s entry %hu is invalid but should be valid\n", 97 + (ldt ? "LDT" : "GDT"), index); 98 + nerrs++; 99 + return; 100 + } 101 + 102 + if (ar != expected_ar) { 103 + printf("[FAIL]\t%s entry %hu has AR 0x%08X but expected 0x%08X\n", 104 + (ldt ? "LDT" : "GDT"), index, ar, expected_ar); 105 + nerrs++; 106 + } else if (limit != expected_limit) { 107 + printf("[FAIL]\t%s entry %hu has limit 0x%08X but expected 0x%08X\n", 108 + (ldt ? "LDT" : "GDT"), index, limit, expected_limit); 109 + nerrs++; 110 + } else if (verbose) { 111 + printf("[OK]\t%s entry %hu has AR 0x%08X and limit 0x%08X\n", 112 + (ldt ? "LDT" : "GDT"), index, ar, limit); 113 + } 114 + } 115 + 116 + static bool install_valid_mode(const struct user_desc *desc, uint32_t ar, 117 + bool oldmode) 118 + { 119 + int ret = syscall(SYS_modify_ldt, oldmode ? 
1 : 0x11, 120 + desc, sizeof(*desc)); 121 + if (ret < -1) 122 + errno = -ret; 123 + if (ret == 0) { 124 + uint32_t limit = desc->limit; 125 + if (desc->limit_in_pages) 126 + limit = (limit << 12) + 4095; 127 + check_valid_segment(desc->entry_number, 1, ar, limit, true); 128 + return true; 129 + } else if (errno == ENOSYS) { 130 + printf("[OK]\tmodify_ldt returned -ENOSYS\n"); 131 + return false; 132 + } else { 133 + if (desc->seg_32bit) { 134 + printf("[FAIL]\tUnexpected modify_ldt failure %d\n", 135 + errno); 136 + nerrs++; 137 + return false; 138 + } else { 139 + printf("[OK]\tmodify_ldt rejected 16 bit segment\n"); 140 + return false; 141 + } 142 + } 143 + } 144 + 145 + static bool install_valid(const struct user_desc *desc, uint32_t ar) 146 + { 147 + return install_valid_mode(desc, ar, false); 148 + } 149 + 150 + static void install_invalid(const struct user_desc *desc, bool oldmode) 151 + { 152 + int ret = syscall(SYS_modify_ldt, oldmode ? 1 : 0x11, 153 + desc, sizeof(*desc)); 154 + if (ret < -1) 155 + errno = -ret; 156 + if (ret == 0) { 157 + check_invalid_segment(desc->entry_number, 1); 158 + } else if (errno == ENOSYS) { 159 + printf("[OK]\tmodify_ldt returned -ENOSYS\n"); 160 + } else { 161 + if (desc->seg_32bit) { 162 + printf("[FAIL]\tUnexpected modify_ldt failure %d\n", 163 + errno); 164 + nerrs++; 165 + } else { 166 + printf("[OK]\tmodify_ldt rejected 16 bit segment\n"); 167 + } 168 + } 169 + } 170 + 171 + static int safe_modify_ldt(int func, struct user_desc *ptr, 172 + unsigned long bytecount) 173 + { 174 + int ret = syscall(SYS_modify_ldt, 0x11, ptr, bytecount); 175 + if (ret < -1) 176 + errno = -ret; 177 + return ret; 178 + } 179 + 180 + static void fail_install(struct user_desc *desc) 181 + { 182 + if (safe_modify_ldt(0x11, desc, sizeof(*desc)) == 0) { 183 + printf("[FAIL]\tmodify_ldt accepted a bad descriptor\n"); 184 + nerrs++; 185 + } else if (errno == ENOSYS) { 186 + printf("[OK]\tmodify_ldt returned -ENOSYS\n"); 187 + } else { 188 + 
printf("[OK]\tmodify_ldt failure %d\n", errno); 189 + } 190 + } 191 + 192 + static void do_simple_tests(void) 193 + { 194 + struct user_desc desc = { 195 + .entry_number = 0, 196 + .base_addr = 0, 197 + .limit = 10, 198 + .seg_32bit = 1, 199 + .contents = 2, /* Code, not conforming */ 200 + .read_exec_only = 0, 201 + .limit_in_pages = 0, 202 + .seg_not_present = 0, 203 + .useable = 0 204 + }; 205 + install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE | AR_S | AR_P | AR_DB); 206 + 207 + desc.limit_in_pages = 1; 208 + install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE | 209 + AR_S | AR_P | AR_DB | AR_G); 210 + 211 + check_invalid_segment(1, 1); 212 + 213 + desc.entry_number = 2; 214 + install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE | 215 + AR_S | AR_P | AR_DB | AR_G); 216 + 217 + check_invalid_segment(1, 1); 218 + 219 + desc.base_addr = 0xf0000000; 220 + install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE | 221 + AR_S | AR_P | AR_DB | AR_G); 222 + 223 + desc.useable = 1; 224 + install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE | 225 + AR_S | AR_P | AR_DB | AR_G | AR_AVL); 226 + 227 + desc.seg_not_present = 1; 228 + install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE | 229 + AR_S | AR_DB | AR_G | AR_AVL); 230 + 231 + desc.seg_32bit = 0; 232 + install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE | 233 + AR_S | AR_G | AR_AVL); 234 + 235 + desc.seg_32bit = 1; 236 + desc.contents = 0; 237 + install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA | 238 + AR_S | AR_DB | AR_G | AR_AVL); 239 + 240 + desc.read_exec_only = 1; 241 + install_valid(&desc, AR_DPL3 | AR_TYPE_RODATA | 242 + AR_S | AR_DB | AR_G | AR_AVL); 243 + 244 + desc.contents = 1; 245 + install_valid(&desc, AR_DPL3 | AR_TYPE_RODATA_EXPDOWN | 246 + AR_S | AR_DB | AR_G | AR_AVL); 247 + 248 + desc.read_exec_only = 0; 249 + desc.limit_in_pages = 0; 250 + install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA_EXPDOWN | 251 + AR_S | AR_DB | AR_AVL); 252 + 253 + desc.contents = 3; 254 + install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE_CONF | 255 + AR_S | AR_DB | AR_AVL); 256 + 
257 + desc.read_exec_only = 1; 258 + install_valid(&desc, AR_DPL3 | AR_TYPE_XOCODE_CONF | 259 + AR_S | AR_DB | AR_AVL); 260 + 261 + desc.read_exec_only = 0; 262 + desc.contents = 2; 263 + install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE | 264 + AR_S | AR_DB | AR_AVL); 265 + 266 + desc.read_exec_only = 1; 267 + 268 + #ifdef __x86_64__ 269 + desc.lm = 1; 270 + install_valid(&desc, AR_DPL3 | AR_TYPE_XOCODE | 271 + AR_S | AR_DB | AR_AVL); 272 + desc.lm = 0; 273 + #endif 274 + 275 + bool entry1_okay = install_valid(&desc, AR_DPL3 | AR_TYPE_XOCODE | 276 + AR_S | AR_DB | AR_AVL); 277 + 278 + if (entry1_okay) { 279 + printf("[RUN]\tTest fork\n"); 280 + pid_t child = fork(); 281 + if (child == 0) { 282 + nerrs = 0; 283 + check_valid_segment(desc.entry_number, 1, 284 + AR_DPL3 | AR_TYPE_XOCODE | 285 + AR_S | AR_DB | AR_AVL, desc.limit, 286 + true); 287 + check_invalid_segment(1, 1); 288 + exit(nerrs ? 1 : 0); 289 + } else { 290 + int status; 291 + if (waitpid(child, &status, 0) != child || 292 + !WIFEXITED(status)) { 293 + printf("[FAIL]\tChild died\n"); 294 + nerrs++; 295 + } else if (WEXITSTATUS(status) != 0) { 296 + printf("[FAIL]\tChild failed\n"); 297 + nerrs++; 298 + } else { 299 + printf("[OK]\tChild succeeded\n"); 300 + } 301 + } 302 + 303 + printf("[RUN]\tTest size\n"); 304 + int i; 305 + for (i = 0; i < 8192; i++) { 306 + desc.entry_number = i; 307 + desc.limit = i; 308 + if (safe_modify_ldt(0x11, &desc, sizeof(desc)) != 0) { 309 + printf("[FAIL]\tFailed to install entry %d\n", i); 310 + nerrs++; 311 + break; 312 + } 313 + } 314 + for (int j = 0; j < i; j++) { 315 + check_valid_segment(j, 1, AR_DPL3 | AR_TYPE_XOCODE | 316 + AR_S | AR_DB | AR_AVL, j, false); 317 + } 318 + printf("[DONE]\tSize test\n"); 319 + } else { 320 + printf("[SKIP]\tSkipping fork and size tests because we have no LDT\n"); 321 + } 322 + 323 + /* Test entry_number too high. 
*/ 324 + desc.entry_number = 8192; 325 + fail_install(&desc); 326 + 327 + /* Test deletion and actions mistakeable for deletion. */ 328 + memset(&desc, 0, sizeof(desc)); 329 + install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA | AR_S | AR_P); 330 + 331 + desc.seg_not_present = 1; 332 + install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA | AR_S); 333 + 334 + desc.seg_not_present = 0; 335 + desc.read_exec_only = 1; 336 + install_valid(&desc, AR_DPL3 | AR_TYPE_RODATA | AR_S | AR_P); 337 + 338 + desc.read_exec_only = 0; 339 + desc.seg_not_present = 1; 340 + install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA | AR_S); 341 + 342 + desc.read_exec_only = 1; 343 + desc.limit = 1; 344 + install_valid(&desc, AR_DPL3 | AR_TYPE_RODATA | AR_S); 345 + 346 + desc.limit = 0; 347 + desc.base_addr = 1; 348 + install_valid(&desc, AR_DPL3 | AR_TYPE_RODATA | AR_S); 349 + 350 + desc.base_addr = 0; 351 + install_invalid(&desc, false); 352 + 353 + desc.seg_not_present = 0; 354 + desc.read_exec_only = 0; 355 + desc.seg_32bit = 1; 356 + install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA | AR_S | AR_P | AR_DB); 357 + install_invalid(&desc, true); 358 + } 359 + 360 + /* 361 + * 0: thread is idle 362 + * 1: thread armed 363 + * 2: thread should clear LDT entry 0 364 + * 3: thread should exit 365 + */ 366 + static volatile unsigned int ftx; 367 + 368 + static void *threadproc(void *ctx) 369 + { 370 + cpu_set_t cpuset; 371 + CPU_ZERO(&cpuset); 372 + CPU_SET(1, &cpuset); 373 + if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0) 374 + err(1, "sched_setaffinity to CPU 1"); /* should never fail */ 375 + 376 + while (1) { 377 + syscall(SYS_futex, &ftx, FUTEX_WAIT, 0, NULL, NULL, 0); 378 + while (ftx != 2) { 379 + if (ftx >= 3) 380 + return NULL; 381 + } 382 + 383 + /* clear LDT entry 0 */ 384 + const struct user_desc desc = {}; 385 + if (syscall(SYS_modify_ldt, 1, &desc, sizeof(desc)) != 0) 386 + err(1, "modify_ldt"); 387 + 388 + /* If ftx == 2, set it to zero. If ftx == 100, quit. 
*/ 389 + unsigned int x = -2; 390 + asm volatile ("lock xaddl %[x], %[ftx]" : 391 + [x] "+r" (x), [ftx] "+m" (ftx)); 392 + if (x != 2) 393 + return NULL; 394 + } 395 + } 396 + 397 + static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), 398 + int flags) 399 + { 400 + struct sigaction sa; 401 + memset(&sa, 0, sizeof(sa)); 402 + sa.sa_sigaction = handler; 403 + sa.sa_flags = SA_SIGINFO | flags; 404 + sigemptyset(&sa.sa_mask); 405 + if (sigaction(sig, &sa, 0)) 406 + err(1, "sigaction"); 407 + 408 + } 409 + 410 + static jmp_buf jmpbuf; 411 + 412 + static void sigsegv(int sig, siginfo_t *info, void *ctx_void) 413 + { 414 + siglongjmp(jmpbuf, 1); 415 + } 416 + 417 + static void do_multicpu_tests(void) 418 + { 419 + cpu_set_t cpuset; 420 + pthread_t thread; 421 + int failures = 0, iters = 5, i; 422 + unsigned short orig_ss; 423 + 424 + CPU_ZERO(&cpuset); 425 + CPU_SET(1, &cpuset); 426 + if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0) { 427 + printf("[SKIP]\tCannot set affinity to CPU 1\n"); 428 + return; 429 + } 430 + 431 + CPU_ZERO(&cpuset); 432 + CPU_SET(0, &cpuset); 433 + if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0) { 434 + printf("[SKIP]\tCannot set affinity to CPU 0\n"); 435 + return; 436 + } 437 + 438 + sethandler(SIGSEGV, sigsegv, 0); 439 + #ifdef __i386__ 440 + /* True 32-bit kernels send SIGILL instead of SIGSEGV on IRET faults. */ 441 + sethandler(SIGILL, sigsegv, 0); 442 + #endif 443 + 444 + printf("[RUN]\tCross-CPU LDT invalidation\n"); 445 + 446 + if (pthread_create(&thread, 0, threadproc, 0) != 0) 447 + err(1, "pthread_create"); 448 + 449 + asm volatile ("mov %%ss, %0" : "=rm" (orig_ss)); 450 + 451 + for (i = 0; i < 5; i++) { 452 + if (sigsetjmp(jmpbuf, 1) != 0) 453 + continue; 454 + 455 + /* Make sure the thread is ready after the last test. 
*/ 456 + while (ftx != 0) 457 + ; 458 + 459 + struct user_desc desc = { 460 + .entry_number = 0, 461 + .base_addr = 0, 462 + .limit = 0xfffff, 463 + .seg_32bit = 1, 464 + .contents = 0, /* Data */ 465 + .read_exec_only = 0, 466 + .limit_in_pages = 1, 467 + .seg_not_present = 0, 468 + .useable = 0 469 + }; 470 + 471 + if (safe_modify_ldt(0x11, &desc, sizeof(desc)) != 0) { 472 + if (errno != ENOSYS) 473 + err(1, "modify_ldt"); 474 + printf("[SKIP]\tmodify_ldt unavailable\n"); 475 + break; 476 + } 477 + 478 + /* Arm the thread. */ 479 + ftx = 1; 480 + syscall(SYS_futex, &ftx, FUTEX_WAKE, 0, NULL, NULL, 0); 481 + 482 + asm volatile ("mov %0, %%ss" : : "r" (0x7)); 483 + 484 + /* Go! */ 485 + ftx = 2; 486 + 487 + while (ftx != 0) 488 + ; 489 + 490 + /* 491 + * On success, modify_ldt will segfault us synchronously, 492 + * and we'll escape via siglongjmp. 493 + */ 494 + 495 + failures++; 496 + asm volatile ("mov %0, %%ss" : : "rm" (orig_ss)); 497 + }; 498 + 499 + ftx = 100; /* Kill the thread. */ 500 + syscall(SYS_futex, &ftx, FUTEX_WAKE, 0, NULL, NULL, 0); 501 + 502 + if (pthread_join(thread, NULL) != 0) 503 + err(1, "pthread_join"); 504 + 505 + if (failures) { 506 + printf("[FAIL]\t%d of %d iterations failed\n", failures, iters); 507 + nerrs++; 508 + } else { 509 + printf("[OK]\tAll %d iterations succeeded\n", iters); 510 + } 511 + } 512 + 513 + static int finish_exec_test(void) 514 + { 515 + /* 516 + * In a sensible world, this would be check_invalid_segment(0, 1); 517 + * For better or for worse, though, the LDT is inherited across exec. 518 + * We can probably change this safely, but for now we test it. 519 + */ 520 + check_valid_segment(0, 1, 521 + AR_DPL3 | AR_TYPE_XRCODE | AR_S | AR_P | AR_DB, 522 + 42, true); 523 + 524 + return nerrs ? 
1 : 0; 525 + } 526 + 527 + static void do_exec_test(void) 528 + { 529 + printf("[RUN]\tTest exec\n"); 530 + 531 + struct user_desc desc = { 532 + .entry_number = 0, 533 + .base_addr = 0, 534 + .limit = 42, 535 + .seg_32bit = 1, 536 + .contents = 2, /* Code, not conforming */ 537 + .read_exec_only = 0, 538 + .limit_in_pages = 0, 539 + .seg_not_present = 0, 540 + .useable = 0 541 + }; 542 + install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE | AR_S | AR_P | AR_DB); 543 + 544 + pid_t child = fork(); 545 + if (child == 0) { 546 + execl("/proc/self/exe", "ldt_gdt_test_exec", NULL); 547 + printf("[FAIL]\tCould not exec self\n"); 548 + exit(1); /* exec failed */ 549 + } else { 550 + int status; 551 + if (waitpid(child, &status, 0) != child || 552 + !WIFEXITED(status)) { 553 + printf("[FAIL]\tChild died\n"); 554 + nerrs++; 555 + } else if (WEXITSTATUS(status) != 0) { 556 + printf("[FAIL]\tChild failed\n"); 557 + nerrs++; 558 + } else { 559 + printf("[OK]\tChild succeeded\n"); 560 + } 561 + } 562 + } 563 + 564 + int main(int argc, char **argv) 565 + { 566 + if (argc == 1 && !strcmp(argv[0], "ldt_gdt_test_exec")) 567 + return finish_exec_test(); 568 + 569 + do_simple_tests(); 570 + 571 + do_multicpu_tests(); 572 + 573 + do_exec_test(); 574 + 575 + return nerrs ? 1 : 0; 576 + }
+130
tools/testing/selftests/x86/syscall_arg_fault.c
··· 1 + /* 2 + * syscall_arg_fault.c - tests faults 32-bit fast syscall stack args 3 + * Copyright (c) 2015 Andrew Lutomirski 4 + * 5 + * This program is free software; you can redistribute it and/or modify 6 + * it under the terms and conditions of the GNU General Public License, 7 + * version 2, as published by the Free Software Foundation. 8 + * 9 + * This program is distributed in the hope it will be useful, but 10 + * WITHOUT ANY WARRANTY; without even the implied warranty of 11 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 + * General Public License for more details. 13 + */ 14 + 15 + #define _GNU_SOURCE 16 + 17 + #include <stdlib.h> 18 + #include <stdio.h> 19 + #include <string.h> 20 + #include <sys/signal.h> 21 + #include <sys/ucontext.h> 22 + #include <err.h> 23 + #include <setjmp.h> 24 + #include <errno.h> 25 + 26 + /* Our sigaltstack scratch space. */ 27 + static unsigned char altstack_data[SIGSTKSZ]; 28 + 29 + static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), 30 + int flags) 31 + { 32 + struct sigaction sa; 33 + memset(&sa, 0, sizeof(sa)); 34 + sa.sa_sigaction = handler; 35 + sa.sa_flags = SA_SIGINFO | flags; 36 + sigemptyset(&sa.sa_mask); 37 + if (sigaction(sig, &sa, 0)) 38 + err(1, "sigaction"); 39 + } 40 + 41 + static volatile sig_atomic_t sig_traps; 42 + static sigjmp_buf jmpbuf; 43 + 44 + static volatile sig_atomic_t n_errs; 45 + 46 + static void sigsegv(int sig, siginfo_t *info, void *ctx_void) 47 + { 48 + ucontext_t *ctx = (ucontext_t*)ctx_void; 49 + 50 + if (ctx->uc_mcontext.gregs[REG_EAX] != -EFAULT) { 51 + printf("[FAIL]\tAX had the wrong value: 0x%x\n", 52 + ctx->uc_mcontext.gregs[REG_EAX]); 53 + n_errs++; 54 + } else { 55 + printf("[OK]\tSeems okay\n"); 56 + } 57 + 58 + siglongjmp(jmpbuf, 1); 59 + } 60 + 61 + static void sigill(int sig, siginfo_t *info, void *ctx_void) 62 + { 63 + printf("[SKIP]\tIllegal instruction\n"); 64 + siglongjmp(jmpbuf, 1); 65 + } 66 + 67 + int main() 68 + { 69 + 
stack_t stack = { 70 + .ss_sp = altstack_data, 71 + .ss_size = SIGSTKSZ, 72 + }; 73 + if (sigaltstack(&stack, NULL) != 0) 74 + err(1, "sigaltstack"); 75 + 76 + sethandler(SIGSEGV, sigsegv, SA_ONSTACK); 77 + sethandler(SIGILL, sigill, SA_ONSTACK); 78 + 79 + /* 80 + * Exercise another nasty special case. The 32-bit SYSCALL 81 + * and SYSENTER instructions (even in compat mode) each 82 + * clobber one register. A Linux system call has a syscall 83 + * number and six arguments, and the user stack pointer 84 + * needs to live in some register on return. That means 85 + * that we need eight registers, but SYSCALL and SYSENTER 86 + * only preserve seven registers. As a result, one argument 87 + * ends up on the stack. The stack is user memory, which 88 + * means that the kernel can fail to read it. 89 + * 90 + * The 32-bit fast system calls don't have a defined ABI: 91 + * we're supposed to invoke them through the vDSO. So we'll 92 + * fudge it: we set all regs to invalid pointer values and 93 + * invoke the entry instruction. The return will fail no 94 + * matter what, and we completely lose our program state, 95 + * but we can fix it up with a signal handler. 
96 + */ 97 + 98 + printf("[RUN]\tSYSENTER with invalid state\n"); 99 + if (sigsetjmp(jmpbuf, 1) == 0) { 100 + asm volatile ( 101 + "movl $-1, %%eax\n\t" 102 + "movl $-1, %%ebx\n\t" 103 + "movl $-1, %%ecx\n\t" 104 + "movl $-1, %%edx\n\t" 105 + "movl $-1, %%esi\n\t" 106 + "movl $-1, %%edi\n\t" 107 + "movl $-1, %%ebp\n\t" 108 + "movl $-1, %%esp\n\t" 109 + "sysenter" 110 + : : : "memory", "flags"); 111 + } 112 + 113 + printf("[RUN]\tSYSCALL with invalid state\n"); 114 + if (sigsetjmp(jmpbuf, 1) == 0) { 115 + asm volatile ( 116 + "movl $-1, %%eax\n\t" 117 + "movl $-1, %%ebx\n\t" 118 + "movl $-1, %%ecx\n\t" 119 + "movl $-1, %%edx\n\t" 120 + "movl $-1, %%esi\n\t" 121 + "movl $-1, %%edi\n\t" 122 + "movl $-1, %%ebp\n\t" 123 + "movl $-1, %%esp\n\t" 124 + "syscall\n\t" 125 + "pushl $0" /* make sure we segfault cleanly */ 126 + : : : "memory", "flags"); 127 + } 128 + 129 + return 0; 130 + }
+54
tools/testing/selftests/x86/syscall_nt.c
··· 1 + /* 2 + * syscall_nt.c - checks syscalls with NT set 3 + * Copyright (c) 2014-2015 Andrew Lutomirski 4 + * 5 + * This program is free software; you can redistribute it and/or modify 6 + * it under the terms and conditions of the GNU General Public License, 7 + * version 2, as published by the Free Software Foundation. 8 + * 9 + * This program is distributed in the hope it will be useful, but 10 + * WITHOUT ANY WARRANTY; without even the implied warranty of 11 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 + * General Public License for more details. 13 + * 14 + * Some obscure user-space code requires the ability to make system calls 15 + * with FLAGS.NT set. Make sure it works. 16 + */ 17 + 18 + #include <stdio.h> 19 + #include <unistd.h> 20 + #include <sys/syscall.h> 21 + #include <asm/processor-flags.h> 22 + 23 + #ifdef __x86_64__ 24 + # define WIDTH "q" 25 + #else 26 + # define WIDTH "l" 27 + #endif 28 + 29 + static unsigned long get_eflags(void) 30 + { 31 + unsigned long eflags; 32 + asm volatile ("pushf" WIDTH "\n\tpop" WIDTH " %0" : "=rm" (eflags)); 33 + return eflags; 34 + } 35 + 36 + static void set_eflags(unsigned long eflags) 37 + { 38 + asm volatile ("push" WIDTH " %0\n\tpopf" WIDTH 39 + : : "rm" (eflags) : "flags"); 40 + } 41 + 42 + int main() 43 + { 44 + printf("[RUN]\tSet NT and issue a syscall\n"); 45 + set_eflags(get_eflags() | X86_EFLAGS_NT); 46 + syscall(SYS_getpid); 47 + if (get_eflags() & X86_EFLAGS_NT) { 48 + printf("[OK]\tThe syscall worked and NT is still set\n"); 49 + return 0; 50 + } else { 51 + printf("[FAIL]\tThe syscall worked but NT was cleared\n"); 52 + return 1; 53 + } 54 + }