Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'x86/asm' into locking/core

Upcoming changes to static keys are interacting/conflicting with the following
pending TSC commits in tip:x86/asm:

4ea1636b04db x86/asm/tsc: Rename native_read_tsc() to rdtsc()
...

So merge it into the locking tree to have a smoother resolution.

Signed-off-by: Ingo Molnar <mingo@kernel.org>

+2075 -1260
+2 -1
arch/um/include/shared/kern_util.h
··· 22 22 extern unsigned long alloc_stack(int order, int atomic); 23 23 extern void free_stack(unsigned long stack, int order); 24 24 25 - extern int do_signal(void); 25 + struct pt_regs; 26 + extern void do_signal(struct pt_regs *regs); 26 27 extern void interrupt_end(void); 27 28 extern void relay_signal(int sig, struct siginfo *si, struct uml_pt_regs *regs); 28 29
+4 -2
arch/um/kernel/process.c
··· 90 90 91 91 void interrupt_end(void) 92 92 { 93 + struct pt_regs *regs = &current->thread.regs; 94 + 93 95 if (need_resched()) 94 96 schedule(); 95 97 if (test_thread_flag(TIF_SIGPENDING)) 96 - do_signal(); 98 + do_signal(regs); 97 99 if (test_and_clear_thread_flag(TIF_NOTIFY_RESUME)) 98 - tracehook_notify_resume(&current->thread.regs); 100 + tracehook_notify_resume(regs); 99 101 } 100 102 101 103 void exit_thread(void)
+1 -7
arch/um/kernel/signal.c
··· 64 64 signal_setup_done(err, ksig, singlestep); 65 65 } 66 66 67 - static int kern_do_signal(struct pt_regs *regs) 67 + void do_signal(struct pt_regs *regs) 68 68 { 69 69 struct ksignal ksig; 70 70 int handled_sig = 0; ··· 110 110 */ 111 111 if (!handled_sig) 112 112 restore_saved_sigmask(); 113 - return handled_sig; 114 - } 115 - 116 - int do_signal(void) 117 - { 118 - return kern_do_signal(&current->thread.regs); 119 113 }
+1 -1
arch/um/kernel/tlb.c
··· 291 291 /* We are under mmap_sem, release it such that current can terminate */ 292 292 up_write(&current->mm->mmap_sem); 293 293 force_sig(SIGKILL, current); 294 - do_signal(); 294 + do_signal(&current->thread.regs); 295 295 } 296 296 } 297 297
+1 -1
arch/um/kernel/trap.c
··· 173 173 void fatal_sigsegv(void) 174 174 { 175 175 force_sigsegv(SIGSEGV, current); 176 - do_signal(); 176 + do_signal(&current->thread.regs); 177 177 /* 178 178 * This is to tell gcc that we're not returning - do_signal 179 179 * can, in general, return, but in this case, it's not, since
+49 -11
arch/x86/Kconfig
··· 133 133 select HAVE_PERF_USER_STACK_DUMP 134 134 select HAVE_REGS_AND_STACK_ACCESS_API 135 135 select HAVE_SYSCALL_TRACEPOINTS 136 - select HAVE_UID16 if X86_32 136 + select HAVE_UID16 if X86_32 || IA32_EMULATION 137 137 select HAVE_UNSTABLE_SCHED_CLOCK 138 138 select HAVE_USER_RETURN_NOTIFIER 139 139 select IRQ_FORCED_THREADING ··· 1002 1002 def_bool y 1003 1003 depends on X86_MCE_INTEL 1004 1004 1005 - config VM86 1006 - bool "Enable VM86 support" if EXPERT 1007 - default y 1005 + config X86_LEGACY_VM86 1006 + bool "Legacy VM86 support (obsolete)" 1007 + default n 1008 1008 depends on X86_32 1009 1009 ---help--- 1010 - This option is required by programs like DOSEMU to run 1011 - 16-bit real mode legacy code on x86 processors. It also may 1012 - be needed by software like XFree86 to initialize some video 1013 - cards via BIOS. Disabling this option saves about 6K. 1010 + This option allows user programs to put the CPU into V8086 1011 + mode, which is an 80286-era approximation of 16-bit real mode. 1012 + 1013 + Some very old versions of X and/or vbetool require this option 1014 + for user mode setting. Similarly, DOSEMU will use it if 1015 + available to accelerate real mode DOS programs. However, any 1016 + recent version of DOSEMU, X, or vbetool should be fully 1017 + functional even without kernel VM86 support, as they will all 1018 + fall back to (pretty well performing) software emulation. 1019 + 1020 + Anything that works on a 64-bit kernel is unlikely to need 1021 + this option, as 64-bit kernels don't, and can't, support V8086 1022 + mode. This option is also unrelated to 16-bit protected mode 1023 + and is not needed to run most 16-bit programs under Wine. 1024 + 1025 + Enabling this option adds considerable attack surface to the 1026 + kernel and slows down system calls and exception handling. 1027 + 1028 + Unless you use very old userspace or need the last drop of 1029 + performance in your real mode DOS games and can't use KVM, 1030 + say N here. 
1031 + 1032 + config VM86 1033 + bool 1034 + default X86_LEGACY_VM86 1014 1035 1015 1036 config X86_16BIT 1016 1037 bool "Enable support for 16-bit segments" if EXPERT 1017 1038 default y 1039 + depends on MODIFY_LDT_SYSCALL 1018 1040 ---help--- 1019 1041 This option is required by programs like Wine to run 16-bit 1020 1042 protected mode legacy code on x86 processors. Disabling ··· 1531 1509 1532 1510 config MATH_EMULATION 1533 1511 bool 1512 + depends on MODIFY_LDT_SYSCALL 1534 1513 prompt "Math emulation" if X86_32 1535 1514 ---help--- 1536 1515 Linux can emulate a math coprocessor (used for floating point ··· 2076 2053 This is used to work around broken boot loaders. This should 2077 2054 be set to 'N' under normal conditions. 2078 2055 2056 + config MODIFY_LDT_SYSCALL 2057 + bool "Enable the LDT (local descriptor table)" if EXPERT 2058 + default y 2059 + ---help--- 2060 + Linux can allow user programs to install a per-process x86 2061 + Local Descriptor Table (LDT) using the modify_ldt(2) system 2062 + call. This is required to run 16-bit or segmented code such as 2063 + DOSEMU or some Wine programs. It is also used by some very old 2064 + threading libraries. 2065 + 2066 + Enabling this feature adds a small amount of overhead to 2067 + context switches and increases the low-level kernel attack 2068 + surface. Disabling it removes the modify_ldt(2) system call. 2069 + 2070 + Saying 'N' here may make sense for embedded or server kernels. 2071 + 2079 2072 source "kernel/livepatch/Kconfig" 2080 2073 2081 2074 endmenu ··· 2561 2522 depends on X86_64 2562 2523 select BINFMT_ELF 2563 2524 select COMPAT_BINFMT_ELF 2564 - select HAVE_UID16 2525 + select ARCH_WANT_OLD_COMPAT_IPC 2565 2526 ---help--- 2566 2527 Include code to run legacy 32-bit programs under a 2567 2528 64-bit kernel. 
You should likely turn this on, unless you're ··· 2575 2536 2576 2537 config X86_X32 2577 2538 bool "x32 ABI for 64-bit mode" 2578 - depends on X86_64 && IA32_EMULATION 2539 + depends on X86_64 2579 2540 ---help--- 2580 2541 Include code to run binaries for the x32 native 32-bit ABI 2581 2542 for 64-bit processors. An x32 process gets access to the ··· 2589 2550 config COMPAT 2590 2551 def_bool y 2591 2552 depends on IA32_EMULATION || X86_X32 2592 - select ARCH_WANT_OLD_COMPAT_IPC 2593 2553 2594 2554 if COMPAT 2595 2555 config COMPAT_FOR_U64_ALIGNMENT
+10 -3
arch/x86/Makefile
··· 39 39 LDFLAGS_vmlinux := --emit-relocs 40 40 endif 41 41 42 + # 43 + # Prevent GCC from generating any FP code by mistake. 44 + # 45 + # This must happen before we try the -mpreferred-stack-boundary, see: 46 + # 47 + # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53383 48 + # 49 + KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow 50 + KBUILD_CFLAGS += $(call cc-option,-mno-avx,) 51 + 42 52 ifeq ($(CONFIG_X86_32),y) 43 53 BITS := 32 44 54 UTS_MACHINE := i386 ··· 177 167 KBUILD_CFLAGS += -Wno-sign-compare 178 168 # 179 169 KBUILD_CFLAGS += -fno-asynchronous-unwind-tables 180 - # prevent gcc from generating any FP code by mistake 181 - KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow 182 - KBUILD_CFLAGS += $(call cc-option,-mno-avx,) 183 170 184 171 KBUILD_CFLAGS += $(mflags-y) 185 172 KBUILD_AFLAGS += $(mflags-y)
+1 -1
arch/x86/boot/compressed/aslr.c
··· 82 82 83 83 if (has_cpuflag(X86_FEATURE_TSC)) { 84 84 debug_putstr(" RDTSC"); 85 - rdtscll(raw); 85 + raw = rdtsc(); 86 86 87 87 random ^= raw; 88 88 use_i8254 = false;
+1
arch/x86/entry/Makefile
··· 2 2 # Makefile for the x86 low level entry code 3 3 # 4 4 obj-y := entry_$(BITS).o thunk_$(BITS).o syscall_$(BITS).o 5 + obj-y += common.o 5 6 6 7 obj-y += vdso/ 7 8 obj-y += vsyscall/
-9
arch/x86/entry/calling.h
··· 135 135 movq %rbp, 4*8+\offset(%rsp) 136 136 movq %rbx, 5*8+\offset(%rsp) 137 137 .endm 138 - .macro SAVE_EXTRA_REGS_RBP offset=0 139 - movq %rbp, 4*8+\offset(%rsp) 140 - .endm 141 138 142 139 .macro RESTORE_EXTRA_REGS offset=0 143 140 movq 0*8+\offset(%rsp), %r15 ··· 189 192 .endm 190 193 .macro RESTORE_C_REGS_EXCEPT_RCX_R11 191 194 RESTORE_C_REGS_HELPER 1,0,0,1,1 192 - .endm 193 - .macro RESTORE_RSI_RDI 194 - RESTORE_C_REGS_HELPER 0,0,0,0,0 195 - .endm 196 - .macro RESTORE_RSI_RDI_RDX 197 - RESTORE_C_REGS_HELPER 0,0,0,0,1 198 195 .endm 199 196 200 197 .macro REMOVE_PT_GPREGS_FROM_STACK addskip=0
+375
arch/x86/entry/common.c
··· 1 + /* 2 + * common.c - C code for kernel entry and exit 3 + * Copyright (c) 2015 Andrew Lutomirski 4 + * GPL v2 5 + * 6 + * Based on asm and ptrace code by many authors. The code here originated 7 + * in ptrace.c and signal.c. 8 + */ 9 + 10 + #include <linux/kernel.h> 11 + #include <linux/sched.h> 12 + #include <linux/mm.h> 13 + #include <linux/smp.h> 14 + #include <linux/errno.h> 15 + #include <linux/ptrace.h> 16 + #include <linux/tracehook.h> 17 + #include <linux/audit.h> 18 + #include <linux/seccomp.h> 19 + #include <linux/signal.h> 20 + #include <linux/export.h> 21 + #include <linux/context_tracking.h> 22 + #include <linux/user-return-notifier.h> 23 + #include <linux/uprobes.h> 24 + 25 + #include <asm/desc.h> 26 + #include <asm/traps.h> 27 + 28 + #define CREATE_TRACE_POINTS 29 + #include <trace/events/syscalls.h> 30 + 31 + #ifdef CONFIG_CONTEXT_TRACKING 32 + /* Called on entry from user mode with IRQs off. */ 33 + __visible void enter_from_user_mode(void) 34 + { 35 + CT_WARN_ON(ct_state() != CONTEXT_USER); 36 + user_exit(); 37 + } 38 + #endif 39 + 40 + static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch) 41 + { 42 + #ifdef CONFIG_X86_64 43 + if (arch == AUDIT_ARCH_X86_64) { 44 + audit_syscall_entry(regs->orig_ax, regs->di, 45 + regs->si, regs->dx, regs->r10); 46 + } else 47 + #endif 48 + { 49 + audit_syscall_entry(regs->orig_ax, regs->bx, 50 + regs->cx, regs->dx, regs->si); 51 + } 52 + } 53 + 54 + /* 55 + * We can return 0 to resume the syscall or anything else to go to phase 56 + * 2. If we resume the syscall, we need to put something appropriate in 57 + * regs->orig_ax. 58 + * 59 + * NB: We don't have full pt_regs here, but regs->orig_ax and regs->ax 60 + * are fully functional. 
61 + * 62 + * For phase 2's benefit, our return value is: 63 + * 0: resume the syscall 64 + * 1: go to phase 2; no seccomp phase 2 needed 65 + * anything else: go to phase 2; pass return value to seccomp 66 + */ 67 + unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch) 68 + { 69 + unsigned long ret = 0; 70 + u32 work; 71 + 72 + BUG_ON(regs != task_pt_regs(current)); 73 + 74 + work = ACCESS_ONCE(current_thread_info()->flags) & 75 + _TIF_WORK_SYSCALL_ENTRY; 76 + 77 + #ifdef CONFIG_CONTEXT_TRACKING 78 + /* 79 + * If TIF_NOHZ is set, we are required to call user_exit() before 80 + * doing anything that could touch RCU. 81 + */ 82 + if (work & _TIF_NOHZ) { 83 + enter_from_user_mode(); 84 + work &= ~_TIF_NOHZ; 85 + } 86 + #endif 87 + 88 + #ifdef CONFIG_SECCOMP 89 + /* 90 + * Do seccomp first -- it should minimize exposure of other 91 + * code, and keeping seccomp fast is probably more valuable 92 + * than the rest of this. 93 + */ 94 + if (work & _TIF_SECCOMP) { 95 + struct seccomp_data sd; 96 + 97 + sd.arch = arch; 98 + sd.nr = regs->orig_ax; 99 + sd.instruction_pointer = regs->ip; 100 + #ifdef CONFIG_X86_64 101 + if (arch == AUDIT_ARCH_X86_64) { 102 + sd.args[0] = regs->di; 103 + sd.args[1] = regs->si; 104 + sd.args[2] = regs->dx; 105 + sd.args[3] = regs->r10; 106 + sd.args[4] = regs->r8; 107 + sd.args[5] = regs->r9; 108 + } else 109 + #endif 110 + { 111 + sd.args[0] = regs->bx; 112 + sd.args[1] = regs->cx; 113 + sd.args[2] = regs->dx; 114 + sd.args[3] = regs->si; 115 + sd.args[4] = regs->di; 116 + sd.args[5] = regs->bp; 117 + } 118 + 119 + BUILD_BUG_ON(SECCOMP_PHASE1_OK != 0); 120 + BUILD_BUG_ON(SECCOMP_PHASE1_SKIP != 1); 121 + 122 + ret = seccomp_phase1(&sd); 123 + if (ret == SECCOMP_PHASE1_SKIP) { 124 + regs->orig_ax = -1; 125 + ret = 0; 126 + } else if (ret != SECCOMP_PHASE1_OK) { 127 + return ret; /* Go directly to phase 2 */ 128 + } 129 + 130 + work &= ~_TIF_SECCOMP; 131 + } 132 + #endif 133 + 134 + /* Do our best to finish without phase 2. 
*/ 135 + if (work == 0) 136 + return ret; /* seccomp and/or nohz only (ret == 0 here) */ 137 + 138 + #ifdef CONFIG_AUDITSYSCALL 139 + if (work == _TIF_SYSCALL_AUDIT) { 140 + /* 141 + * If there is no more work to be done except auditing, 142 + * then audit in phase 1. Phase 2 always audits, so, if 143 + * we audit here, then we can't go on to phase 2. 144 + */ 145 + do_audit_syscall_entry(regs, arch); 146 + return 0; 147 + } 148 + #endif 149 + 150 + return 1; /* Something is enabled that we can't handle in phase 1 */ 151 + } 152 + 153 + /* Returns the syscall nr to run (which should match regs->orig_ax). */ 154 + long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch, 155 + unsigned long phase1_result) 156 + { 157 + long ret = 0; 158 + u32 work = ACCESS_ONCE(current_thread_info()->flags) & 159 + _TIF_WORK_SYSCALL_ENTRY; 160 + 161 + BUG_ON(regs != task_pt_regs(current)); 162 + 163 + /* 164 + * If we stepped into a sysenter/syscall insn, it trapped in 165 + * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP. 166 + * If user-mode had set TF itself, then it's still clear from 167 + * do_debug() and we need to set it again to restore the user 168 + * state. If we entered on the slow path, TF was already set. 169 + */ 170 + if (work & _TIF_SINGLESTEP) 171 + regs->flags |= X86_EFLAGS_TF; 172 + 173 + #ifdef CONFIG_SECCOMP 174 + /* 175 + * Call seccomp_phase2 before running the other hooks so that 176 + * they can see any changes made by a seccomp tracer. 177 + */ 178 + if (phase1_result > 1 && seccomp_phase2(phase1_result)) { 179 + /* seccomp failures shouldn't expose any additional code. 
*/ 180 + return -1; 181 + } 182 + #endif 183 + 184 + if (unlikely(work & _TIF_SYSCALL_EMU)) 185 + ret = -1L; 186 + 187 + if ((ret || test_thread_flag(TIF_SYSCALL_TRACE)) && 188 + tracehook_report_syscall_entry(regs)) 189 + ret = -1L; 190 + 191 + if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) 192 + trace_sys_enter(regs, regs->orig_ax); 193 + 194 + do_audit_syscall_entry(regs, arch); 195 + 196 + return ret ?: regs->orig_ax; 197 + } 198 + 199 + long syscall_trace_enter(struct pt_regs *regs) 200 + { 201 + u32 arch = is_ia32_task() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64; 202 + unsigned long phase1_result = syscall_trace_enter_phase1(regs, arch); 203 + 204 + if (phase1_result == 0) 205 + return regs->orig_ax; 206 + else 207 + return syscall_trace_enter_phase2(regs, arch, phase1_result); 208 + } 209 + 210 + /* Deprecated. */ 211 + void syscall_trace_leave(struct pt_regs *regs) 212 + { 213 + bool step; 214 + 215 + /* 216 + * We may come here right after calling schedule_user() 217 + * or do_notify_resume(), in which case we can be in RCU 218 + * user mode. 219 + */ 220 + user_exit(); 221 + 222 + audit_syscall_exit(regs); 223 + 224 + if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) 225 + trace_sys_exit(regs, regs->ax); 226 + 227 + /* 228 + * If TIF_SYSCALL_EMU is set, we only get here because of 229 + * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP). 230 + * We already reported this syscall instruction in 231 + * syscall_trace_enter(). 
232 + */ 233 + step = unlikely(test_thread_flag(TIF_SINGLESTEP)) && 234 + !test_thread_flag(TIF_SYSCALL_EMU); 235 + if (step || test_thread_flag(TIF_SYSCALL_TRACE)) 236 + tracehook_report_syscall_exit(regs, step); 237 + 238 + user_enter(); 239 + } 240 + 241 + static struct thread_info *pt_regs_to_thread_info(struct pt_regs *regs) 242 + { 243 + unsigned long top_of_stack = 244 + (unsigned long)(regs + 1) + TOP_OF_KERNEL_STACK_PADDING; 245 + return (struct thread_info *)(top_of_stack - THREAD_SIZE); 246 + } 247 + 248 + /* Called with IRQs disabled. */ 249 + __visible void prepare_exit_to_usermode(struct pt_regs *regs) 250 + { 251 + if (WARN_ON(!irqs_disabled())) 252 + local_irq_disable(); 253 + 254 + /* 255 + * In order to return to user mode, we need to have IRQs off with 256 + * none of _TIF_SIGPENDING, _TIF_NOTIFY_RESUME, _TIF_USER_RETURN_NOTIFY, 257 + * _TIF_UPROBE, or _TIF_NEED_RESCHED set. Several of these flags 258 + * can be set at any time on preemptable kernels if we have IRQs on, 259 + * so we need to loop. Disabling preemption wouldn't help: doing the 260 + * work to clear some of the flags can sleep. 261 + */ 262 + while (true) { 263 + u32 cached_flags = 264 + READ_ONCE(pt_regs_to_thread_info(regs)->flags); 265 + 266 + if (!(cached_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | 267 + _TIF_UPROBE | _TIF_NEED_RESCHED | 268 + _TIF_USER_RETURN_NOTIFY))) 269 + break; 270 + 271 + /* We have work to do. 
*/ 272 + local_irq_enable(); 273 + 274 + if (cached_flags & _TIF_NEED_RESCHED) 275 + schedule(); 276 + 277 + if (cached_flags & _TIF_UPROBE) 278 + uprobe_notify_resume(regs); 279 + 280 + /* deal with pending signal delivery */ 281 + if (cached_flags & _TIF_SIGPENDING) 282 + do_signal(regs); 283 + 284 + if (cached_flags & _TIF_NOTIFY_RESUME) { 285 + clear_thread_flag(TIF_NOTIFY_RESUME); 286 + tracehook_notify_resume(regs); 287 + } 288 + 289 + if (cached_flags & _TIF_USER_RETURN_NOTIFY) 290 + fire_user_return_notifiers(); 291 + 292 + /* Disable IRQs and retry */ 293 + local_irq_disable(); 294 + } 295 + 296 + user_enter(); 297 + } 298 + 299 + /* 300 + * Called with IRQs on and fully valid regs. Returns with IRQs off in a 301 + * state such that we can immediately switch to user mode. 302 + */ 303 + __visible void syscall_return_slowpath(struct pt_regs *regs) 304 + { 305 + struct thread_info *ti = pt_regs_to_thread_info(regs); 306 + u32 cached_flags = READ_ONCE(ti->flags); 307 + bool step; 308 + 309 + CT_WARN_ON(ct_state() != CONTEXT_KERNEL); 310 + 311 + if (WARN(irqs_disabled(), "syscall %ld left IRQs disabled", 312 + regs->orig_ax)) 313 + local_irq_enable(); 314 + 315 + /* 316 + * First do one-time work. If these work items are enabled, we 317 + * want to run them exactly once per syscall exit with IRQs on. 318 + */ 319 + if (cached_flags & (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | 320 + _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT)) { 321 + audit_syscall_exit(regs); 322 + 323 + if (cached_flags & _TIF_SYSCALL_TRACEPOINT) 324 + trace_sys_exit(regs, regs->ax); 325 + 326 + /* 327 + * If TIF_SYSCALL_EMU is set, we only get here because of 328 + * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP). 329 + * We already reported this syscall instruction in 330 + * syscall_trace_enter(). 
331 + */ 332 + step = unlikely( 333 + (cached_flags & (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU)) 334 + == _TIF_SINGLESTEP); 335 + if (step || cached_flags & _TIF_SYSCALL_TRACE) 336 + tracehook_report_syscall_exit(regs, step); 337 + } 338 + 339 + #ifdef CONFIG_COMPAT 340 + /* 341 + * Compat syscalls set TS_COMPAT. Make sure we clear it before 342 + * returning to user mode. 343 + */ 344 + ti->status &= ~TS_COMPAT; 345 + #endif 346 + 347 + local_irq_disable(); 348 + prepare_exit_to_usermode(regs); 349 + } 350 + 351 + /* 352 + * Deprecated notification of userspace execution resumption 353 + * - triggered by the TIF_WORK_MASK flags 354 + */ 355 + __visible void 356 + do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) 357 + { 358 + user_exit(); 359 + 360 + if (thread_info_flags & _TIF_UPROBE) 361 + uprobe_notify_resume(regs); 362 + 363 + /* deal with pending signal delivery */ 364 + if (thread_info_flags & _TIF_SIGPENDING) 365 + do_signal(regs); 366 + 367 + if (thread_info_flags & _TIF_NOTIFY_RESUME) { 368 + clear_thread_flag(TIF_NOTIFY_RESUME); 369 + tracehook_notify_resume(regs); 370 + } 371 + if (thread_info_flags & _TIF_USER_RETURN_NOTIFY) 372 + fire_user_return_notifiers(); 373 + 374 + user_enter(); 375 + }
+1 -23
arch/x86/entry/entry_32.S
··· 525 525 526 526 work_notifysig: # deal with pending signals and 527 527 # notify-resume requests 528 - #ifdef CONFIG_VM86 529 - testl $X86_EFLAGS_VM, PT_EFLAGS(%esp) 530 - movl %esp, %eax 531 - jnz work_notifysig_v86 # returning to kernel-space or 532 - # vm86-space 533 - 1: 534 - #else 535 - movl %esp, %eax 536 - #endif 537 528 TRACE_IRQS_ON 538 529 ENABLE_INTERRUPTS(CLBR_NONE) 539 - movb PT_CS(%esp), %bl 540 - andb $SEGMENT_RPL_MASK, %bl 541 - cmpb $USER_RPL, %bl 542 - jb resume_kernel 530 + movl %esp, %eax 543 531 xorl %edx, %edx 544 532 call do_notify_resume 545 533 jmp resume_userspace 546 - 547 - #ifdef CONFIG_VM86 548 - ALIGN 549 - work_notifysig_v86: 550 - pushl %ecx # save ti_flags for do_notify_resume 551 - call save_v86_state # %eax contains pt_regs pointer 552 - popl %ecx 553 - movl %eax, %esp 554 - jmp 1b 555 - #endif 556 534 END(work_pending) 557 535 558 536 # perform syscall exit tracing
+53 -144
arch/x86/entry/entry_64.S
··· 33 33 #include <asm/paravirt.h> 34 34 #include <asm/percpu.h> 35 35 #include <asm/asm.h> 36 - #include <asm/context_tracking.h> 37 36 #include <asm/smap.h> 38 37 #include <asm/pgtable_types.h> 39 38 #include <linux/err.h> ··· 228 229 */ 229 230 USERGS_SYSRET64 230 231 232 + GLOBAL(int_ret_from_sys_call_irqs_off) 233 + TRACE_IRQS_ON 234 + ENABLE_INTERRUPTS(CLBR_NONE) 235 + jmp int_ret_from_sys_call 236 + 231 237 /* Do syscall entry tracing */ 232 238 tracesys: 233 239 movq %rsp, %rdi ··· 276 272 * Has correct iret frame. 277 273 */ 278 274 GLOBAL(int_ret_from_sys_call) 279 - DISABLE_INTERRUPTS(CLBR_NONE) 280 - int_ret_from_sys_call_irqs_off: /* jumps come here from the irqs-off SYSRET path */ 281 - TRACE_IRQS_OFF 282 - movl $_TIF_ALLWORK_MASK, %edi 283 - /* edi: mask to check */ 284 - GLOBAL(int_with_check) 285 - LOCKDEP_SYS_EXIT_IRQ 286 - GET_THREAD_INFO(%rcx) 287 - movl TI_flags(%rcx), %edx 288 - andl %edi, %edx 289 - jnz int_careful 290 - andl $~TS_COMPAT, TI_status(%rcx) 291 - jmp syscall_return 292 - 293 - /* 294 - * Either reschedule or signal or syscall exit tracking needed. 295 - * First do a reschedule test. 
296 - * edx: work, edi: workmask 297 - */ 298 - int_careful: 299 - bt $TIF_NEED_RESCHED, %edx 300 - jnc int_very_careful 301 - TRACE_IRQS_ON 302 - ENABLE_INTERRUPTS(CLBR_NONE) 303 - pushq %rdi 304 - SCHEDULE_USER 305 - popq %rdi 306 - DISABLE_INTERRUPTS(CLBR_NONE) 307 - TRACE_IRQS_OFF 308 - jmp int_with_check 309 - 310 - /* handle signals and tracing -- both require a full pt_regs */ 311 - int_very_careful: 312 - TRACE_IRQS_ON 313 - ENABLE_INTERRUPTS(CLBR_NONE) 314 275 SAVE_EXTRA_REGS 315 - /* Check for syscall exit trace */ 316 - testl $_TIF_WORK_SYSCALL_EXIT, %edx 317 - jz int_signal 318 - pushq %rdi 319 - leaq 8(%rsp), %rdi /* &ptregs -> arg1 */ 320 - call syscall_trace_leave 321 - popq %rdi 322 - andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU), %edi 323 - jmp int_restore_rest 324 - 325 - int_signal: 326 - testl $_TIF_DO_NOTIFY_MASK, %edx 327 - jz 1f 328 - movq %rsp, %rdi /* &ptregs -> arg1 */ 329 - xorl %esi, %esi /* oldset -> arg2 */ 330 - call do_notify_resume 331 - 1: movl $_TIF_WORK_MASK, %edi 332 - int_restore_rest: 276 + movq %rsp, %rdi 277 + call syscall_return_slowpath /* returns with IRQs disabled */ 333 278 RESTORE_EXTRA_REGS 334 - DISABLE_INTERRUPTS(CLBR_NONE) 335 - TRACE_IRQS_OFF 336 - jmp int_with_check 337 - 338 - syscall_return: 339 - /* The IRETQ could re-enable interrupts: */ 340 - DISABLE_INTERRUPTS(CLBR_ANY) 341 - TRACE_IRQS_IRETQ 279 + TRACE_IRQS_IRETQ /* we're about to change IF */ 342 280 343 281 /* 344 282 * Try to use SYSRET instead of IRET if we're returning to ··· 501 555 /* 0(%rsp): ~(interrupt number) */ 502 556 .macro interrupt func 503 557 cld 504 - /* 505 - * Since nothing in interrupt handling code touches r12...r15 members 506 - * of "struct pt_regs", and since interrupts can nest, we can save 507 - * four stack slots and simultaneously provide 508 - * an unwind-friendly stack layout by saving "truncated" pt_regs 509 - * exactly up to rbp slot, without these members. 
510 - */ 511 - ALLOC_PT_GPREGS_ON_STACK -RBP 512 - SAVE_C_REGS -RBP 513 - /* this goes to 0(%rsp) for unwinder, not for saving the value: */ 514 - SAVE_EXTRA_REGS_RBP -RBP 558 + ALLOC_PT_GPREGS_ON_STACK 559 + SAVE_C_REGS 560 + SAVE_EXTRA_REGS 515 561 516 - leaq -RBP(%rsp), %rdi /* arg1 for \func (pointer to pt_regs) */ 517 - 518 - testb $3, CS-RBP(%rsp) 562 + testb $3, CS(%rsp) 519 563 jz 1f 564 + 565 + /* 566 + * IRQ from user mode. Switch to kernel gsbase and inform context 567 + * tracking that we're in kernel mode. 568 + */ 520 569 SWAPGS 570 + #ifdef CONFIG_CONTEXT_TRACKING 571 + call enter_from_user_mode 572 + #endif 573 + 521 574 1: 522 575 /* 523 576 * Save previous stack pointer, optionally switch to interrupt stack. ··· 525 580 * a little cheaper to use a separate counter in the PDA (short of 526 581 * moving irq_enter into assembly, which would be too much work) 527 582 */ 528 - movq %rsp, %rsi 583 + movq %rsp, %rdi 529 584 incl PER_CPU_VAR(irq_count) 530 585 cmovzq PER_CPU_VAR(irq_stack_ptr), %rsp 531 - pushq %rsi 586 + pushq %rdi 532 587 /* We entered an interrupt context - irqs are off: */ 533 588 TRACE_IRQS_OFF 534 589 535 - call \func 590 + call \func /* rdi points to pt_regs */ 536 591 .endm 537 592 538 593 /* ··· 551 606 decl PER_CPU_VAR(irq_count) 552 607 553 608 /* Restore saved previous stack */ 554 - popq %rsi 555 - /* return code expects complete pt_regs - adjust rsp accordingly: */ 556 - leaq -RBP(%rsi), %rsp 609 + popq %rsp 557 610 558 611 testb $3, CS(%rsp) 559 612 jz retint_kernel 613 + 560 614 /* Interrupt came from user space */ 561 - retint_user: 562 - GET_THREAD_INFO(%rcx) 563 - 564 - /* %rcx: thread info. Interrupts are off. 
*/ 565 - retint_with_reschedule: 566 - movl $_TIF_WORK_MASK, %edi 567 - retint_check: 568 615 LOCKDEP_SYS_EXIT_IRQ 569 - movl TI_flags(%rcx), %edx 570 - andl %edi, %edx 571 - jnz retint_careful 572 - 573 - retint_swapgs: /* return to user-space */ 574 - /* 575 - * The iretq could re-enable interrupts: 576 - */ 577 - DISABLE_INTERRUPTS(CLBR_ANY) 616 + GLOBAL(retint_user) 617 + mov %rsp,%rdi 618 + call prepare_exit_to_usermode 578 619 TRACE_IRQS_IRETQ 579 - 580 620 SWAPGS 581 - jmp restore_c_regs_and_iret 621 + jmp restore_regs_and_iret 582 622 583 623 /* Returning to kernel space */ 584 624 retint_kernel: ··· 587 657 * At this label, code paths which return to kernel and to user, 588 658 * which come from interrupts/exception and from syscalls, merge. 589 659 */ 660 + restore_regs_and_iret: 661 + RESTORE_EXTRA_REGS 590 662 restore_c_regs_and_iret: 591 663 RESTORE_C_REGS 592 664 REMOVE_PT_GPREGS_FROM_STACK 8 ··· 639 707 popq %rax 640 708 jmp native_irq_return_iret 641 709 #endif 642 - 643 - /* edi: workmask, edx: work */ 644 - retint_careful: 645 - bt $TIF_NEED_RESCHED, %edx 646 - jnc retint_signal 647 - TRACE_IRQS_ON 648 - ENABLE_INTERRUPTS(CLBR_NONE) 649 - pushq %rdi 650 - SCHEDULE_USER 651 - popq %rdi 652 - GET_THREAD_INFO(%rcx) 653 - DISABLE_INTERRUPTS(CLBR_NONE) 654 - TRACE_IRQS_OFF 655 - jmp retint_check 656 - 657 - retint_signal: 658 - testl $_TIF_DO_NOTIFY_MASK, %edx 659 - jz retint_swapgs 660 - TRACE_IRQS_ON 661 - ENABLE_INTERRUPTS(CLBR_NONE) 662 - SAVE_EXTRA_REGS 663 - movq $-1, ORIG_RAX(%rsp) 664 - xorl %esi, %esi /* oldset */ 665 - movq %rsp, %rdi /* &pt_regs */ 666 - call do_notify_resume 667 - RESTORE_EXTRA_REGS 668 - DISABLE_INTERRUPTS(CLBR_NONE) 669 - TRACE_IRQS_OFF 670 - GET_THREAD_INFO(%rcx) 671 - jmp retint_with_reschedule 672 - 673 710 END(common_interrupt) 674 711 675 712 /* ··· 1044 1143 SAVE_EXTRA_REGS 8 1045 1144 xorl %ebx, %ebx 1046 1145 testb $3, CS+8(%rsp) 1047 - jz error_kernelspace 1146 + jz .Lerror_kernelspace 1048 1147 1049 - /* We 
entered from user mode */ 1148 + .Lerror_entry_from_usermode_swapgs: 1149 + /* 1150 + * We entered from user mode or we're pretending to have entered 1151 + * from user mode due to an IRET fault. 1152 + */ 1050 1153 SWAPGS 1051 1154 1052 - error_entry_done: 1155 + .Lerror_entry_from_usermode_after_swapgs: 1156 + #ifdef CONFIG_CONTEXT_TRACKING 1157 + call enter_from_user_mode 1158 + #endif 1159 + 1160 + .Lerror_entry_done: 1161 + 1053 1162 TRACE_IRQS_OFF 1054 1163 ret 1055 1164 ··· 1069 1158 * truncated RIP for IRET exceptions returning to compat mode. Check 1070 1159 * for these here too. 1071 1160 */ 1072 - error_kernelspace: 1161 + .Lerror_kernelspace: 1073 1162 incl %ebx 1074 1163 leaq native_irq_return_iret(%rip), %rcx 1075 1164 cmpq %rcx, RIP+8(%rsp) 1076 - je error_bad_iret 1165 + je .Lerror_bad_iret 1077 1166 movl %ecx, %eax /* zero extend */ 1078 1167 cmpq %rax, RIP+8(%rsp) 1079 - je bstep_iret 1168 + je .Lbstep_iret 1080 1169 cmpq $gs_change, RIP+8(%rsp) 1081 - jne error_entry_done 1170 + jne .Lerror_entry_done 1082 1171 1083 1172 /* 1084 1173 * hack: gs_change can fail with user gsbase. If this happens, fix up 1085 1174 * gsbase and proceed. We'll fix up the exception and land in 1086 1175 * gs_change's error handler with kernel gsbase. 1087 1176 */ 1088 - SWAPGS 1089 - jmp error_entry_done 1177 + jmp .Lerror_entry_from_usermode_swapgs 1090 1178 1091 - bstep_iret: 1179 + .Lbstep_iret: 1092 1180 /* Fix truncated RIP */ 1093 1181 movq %rcx, RIP+8(%rsp) 1094 1182 /* fall through */ 1095 1183 1096 - error_bad_iret: 1184 + .Lerror_bad_iret: 1097 1185 /* 1098 1186 * We came from an IRET to user mode, so we have user gsbase. 
1099 1187 * Switch to kernel gsbase: ··· 1108 1198 call fixup_bad_iret 1109 1199 mov %rax, %rsp 1110 1200 decl %ebx 1111 - jmp error_entry_done 1201 + jmp .Lerror_entry_from_usermode_after_swapgs 1112 1202 END(error_entry) 1113 1203 1114 1204 ··· 1119 1209 */ 1120 1210 ENTRY(error_exit) 1121 1211 movl %ebx, %eax 1122 - RESTORE_EXTRA_REGS 1123 1212 DISABLE_INTERRUPTS(CLBR_NONE) 1124 1213 TRACE_IRQS_OFF 1125 1214 testl %eax, %eax
+52 -9
arch/x86/entry/entry_64_compat.S
··· 22 22 #define __AUDIT_ARCH_LE 0x40000000 23 23 24 24 #ifndef CONFIG_AUDITSYSCALL 25 - # define sysexit_audit ia32_ret_from_sys_call 26 - # define sysretl_audit ia32_ret_from_sys_call 25 + # define sysexit_audit ia32_ret_from_sys_call_irqs_off 26 + # define sysretl_audit ia32_ret_from_sys_call_irqs_off 27 27 #endif 28 28 29 29 .section .entry.text, "ax" ··· 140 140 */ 141 141 andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) 142 142 movl RIP(%rsp), %ecx /* User %eip */ 143 - RESTORE_RSI_RDI 143 + movl RSI(%rsp), %esi 144 + movl RDI(%rsp), %edi 144 145 xorl %edx, %edx /* Do not leak kernel information */ 145 146 xorq %r8, %r8 146 147 xorq %r9, %r9 ··· 209 208 .endm 210 209 211 210 .macro auditsys_exit exit 212 - testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) 213 - jnz ia32_ret_from_sys_call 214 211 TRACE_IRQS_ON 215 212 ENABLE_INTERRUPTS(CLBR_NONE) 213 + testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) 214 + jnz ia32_ret_from_sys_call 216 215 movl %eax, %esi /* second arg, syscall return value */ 217 216 cmpl $-MAX_ERRNO, %eax /* is it an error ? */ 218 217 jbe 1f ··· 231 230 movq %rax, R10(%rsp) 232 231 movq %rax, R9(%rsp) 233 232 movq %rax, R8(%rsp) 234 - jmp int_with_check 233 + jmp int_ret_from_sys_call_irqs_off 235 234 .endm 236 235 237 236 sysenter_auditsys: ··· 366 365 367 366 sysretl_from_sys_call: 368 367 andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) 369 - RESTORE_RSI_RDI_RDX 368 + movl RDX(%rsp), %edx 369 + movl RSI(%rsp), %esi 370 + movl RDI(%rsp), %edi 370 371 movl RIP(%rsp), %ecx 371 372 movl EFLAGS(%rsp), %r11d 372 373 xorq %r10, %r10 ··· 432 429 END(entry_SYSCALL_compat) 433 430 434 431 ia32_badarg: 435 - ASM_CLAC 436 - movq $-EFAULT, RAX(%rsp) 432 + /* 433 + * So far, we've entered kernel mode, set AC, turned on IRQs, and 434 + * saved C regs except r8-r11. 
We haven't done any of the other 435 + * standard entry work, though. We want to bail, but we shouldn't 436 + * treat this as a syscall entry since we don't even know what the 437 + * args are. Instead, treat this as a non-syscall entry, finish 438 + * the entry work, and immediately exit after setting AX = -EFAULT. 439 + * 440 + * We're really just being polite here. Killing the task outright 441 + * would be a reasonable action, too. Given that the only valid 442 + * way to have gotten here is through the vDSO, and we already know 443 + * that the stack pointer is bad, the task isn't going to survive 444 + * for long no matter what we do. 445 + */ 446 + 447 + ASM_CLAC /* undo STAC */ 448 + movq $-EFAULT, RAX(%rsp) /* return -EFAULT if possible */ 449 + 450 + /* Fill in the rest of pt_regs */ 451 + xorl %eax, %eax 452 + movq %rax, R11(%rsp) 453 + movq %rax, R10(%rsp) 454 + movq %rax, R9(%rsp) 455 + movq %rax, R8(%rsp) 456 + SAVE_EXTRA_REGS 457 + 458 + /* Turn IRQs back off. */ 459 + DISABLE_INTERRUPTS(CLBR_NONE) 460 + TRACE_IRQS_OFF 461 + 462 + /* Now finish entering normal kernel mode. */ 463 + #ifdef CONFIG_CONTEXT_TRACKING 464 + call enter_from_user_mode 465 + #endif 466 + 467 + /* And exit again. */ 468 + jmp retint_user 469 + 470 + ia32_ret_from_sys_call_irqs_off: 471 + TRACE_IRQS_ON 472 + ENABLE_INTERRUPTS(CLBR_NONE) 473 + 437 474 ia32_ret_from_sys_call: 438 475 xorl %eax, %eax /* Do not leak kernel information */ 439 476 movq %rax, R11(%rsp)
+15
arch/x86/entry/syscalls/syscall_32.tbl
··· 365 365 356 i386 memfd_create sys_memfd_create 366 366 357 i386 bpf sys_bpf 367 367 358 i386 execveat sys_execveat stub32_execveat 368 + 359 i386 socket sys_socket 369 + 360 i386 socketpair sys_socketpair 370 + 361 i386 bind sys_bind 371 + 362 i386 connect sys_connect 372 + 363 i386 listen sys_listen 373 + 364 i386 accept4 sys_accept4 374 + 365 i386 getsockopt sys_getsockopt compat_sys_getsockopt 375 + 366 i386 setsockopt sys_setsockopt compat_sys_setsockopt 376 + 367 i386 getsockname sys_getsockname 377 + 368 i386 getpeername sys_getpeername 378 + 369 i386 sendto sys_sendto 379 + 370 i386 sendmsg sys_sendmsg compat_sys_sendmsg 380 + 371 i386 recvfrom sys_recvfrom compat_sys_recvfrom 381 + 372 i386 recvmsg sys_recvmsg compat_sys_recvmsg 382 + 373 i386 shutdown sys_shutdown
+3 -3
arch/x86/entry/vdso/Makefile
··· 8 8 VDSO64-$(CONFIG_X86_64) := y 9 9 VDSOX32-$(CONFIG_X86_X32_ABI) := y 10 10 VDSO32-$(CONFIG_X86_32) := y 11 - VDSO32-$(CONFIG_COMPAT) := y 11 + VDSO32-$(CONFIG_IA32_EMULATION) := y 12 12 13 13 # files to link into the vdso 14 14 vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o ··· 20 20 vdso_img-$(VDSO64-y) += 64 21 21 vdso_img-$(VDSOX32-y) += x32 22 22 vdso_img-$(VDSO32-y) += 32-int80 23 - vdso_img-$(CONFIG_COMPAT) += 32-syscall 23 + vdso_img-$(CONFIG_IA32_EMULATION) += 32-syscall 24 24 vdso_img-$(VDSO32-y) += 32-sysenter 25 25 26 26 obj-$(VDSO32-y) += vdso32-setup.o ··· 126 126 # Build multiple 32-bit vDSO images to choose from at boot time. 127 127 # 128 128 vdso32.so-$(VDSO32-y) += int80 129 - vdso32.so-$(CONFIG_COMPAT) += syscall 129 + vdso32.so-$(CONFIG_IA32_EMULATION) += syscall 130 130 vdso32.so-$(VDSO32-y) += sysenter 131 131 132 132 vdso32-images = $(vdso32.so-y:%=vdso32-%.so)
+2 -14
arch/x86/entry/vdso/vclock_gettime.c
··· 175 175 176 176 notrace static cycle_t vread_tsc(void) 177 177 { 178 - cycle_t ret; 179 - u64 last; 180 - 181 - /* 182 - * Empirically, a fence (of type that depends on the CPU) 183 - * before rdtsc is enough to ensure that rdtsc is ordered 184 - * with respect to loads. The various CPU manuals are unclear 185 - * as to whether rdtsc can be reordered with later loads, 186 - * but no one has ever seen it happen. 187 - */ 188 - rdtsc_barrier(); 189 - ret = (cycle_t)__native_read_tsc(); 190 - 191 - last = gtod->cycle_last; 178 + cycle_t ret = (cycle_t)rdtsc_ordered(); 179 + u64 last = gtod->cycle_last; 192 180 193 181 if (likely(ret >= last)) 194 182 return ret;
+5 -2
arch/x86/entry/vdso/vma.c
··· 177 177 return ret; 178 178 } 179 179 180 - #if defined(CONFIG_X86_32) || defined(CONFIG_COMPAT) 180 + #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) 181 181 static int load_vdso32(void) 182 182 { 183 183 int ret; ··· 219 219 return map_vdso(&vdso_image_x32, true); 220 220 } 221 221 #endif 222 - 222 + #ifdef CONFIG_IA32_EMULATION 223 223 return load_vdso32(); 224 + #else 225 + return 0; 226 + #endif 224 227 } 225 228 #endif 226 229 #else
+1 -1
arch/x86/entry/vsyscall/vsyscall_64.c
··· 290 290 291 291 struct vm_area_struct *get_gate_vma(struct mm_struct *mm) 292 292 { 293 - #ifdef CONFIG_IA32_EMULATION 293 + #ifdef CONFIG_COMPAT 294 294 if (!mm || mm->context.ia32_compat) 295 295 return NULL; 296 296 #endif
-93
arch/x86/ia32/ia32_signal.c
··· 34 34 #include <asm/sys_ia32.h> 35 35 #include <asm/smap.h> 36 36 37 - int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from) 38 - { 39 - int err = 0; 40 - bool ia32 = test_thread_flag(TIF_IA32); 41 - 42 - if (!access_ok(VERIFY_WRITE, to, sizeof(compat_siginfo_t))) 43 - return -EFAULT; 44 - 45 - put_user_try { 46 - /* If you change siginfo_t structure, please make sure that 47 - this code is fixed accordingly. 48 - It should never copy any pad contained in the structure 49 - to avoid security leaks, but must copy the generic 50 - 3 ints plus the relevant union member. */ 51 - put_user_ex(from->si_signo, &to->si_signo); 52 - put_user_ex(from->si_errno, &to->si_errno); 53 - put_user_ex((short)from->si_code, &to->si_code); 54 - 55 - if (from->si_code < 0) { 56 - put_user_ex(from->si_pid, &to->si_pid); 57 - put_user_ex(from->si_uid, &to->si_uid); 58 - put_user_ex(ptr_to_compat(from->si_ptr), &to->si_ptr); 59 - } else { 60 - /* 61 - * First 32bits of unions are always present: 62 - * si_pid === si_band === si_tid === si_addr(LS half) 63 - */ 64 - put_user_ex(from->_sifields._pad[0], 65 - &to->_sifields._pad[0]); 66 - switch (from->si_code >> 16) { 67 - case __SI_FAULT >> 16: 68 - break; 69 - case __SI_SYS >> 16: 70 - put_user_ex(from->si_syscall, &to->si_syscall); 71 - put_user_ex(from->si_arch, &to->si_arch); 72 - break; 73 - case __SI_CHLD >> 16: 74 - if (ia32) { 75 - put_user_ex(from->si_utime, &to->si_utime); 76 - put_user_ex(from->si_stime, &to->si_stime); 77 - } else { 78 - put_user_ex(from->si_utime, &to->_sifields._sigchld_x32._utime); 79 - put_user_ex(from->si_stime, &to->_sifields._sigchld_x32._stime); 80 - } 81 - put_user_ex(from->si_status, &to->si_status); 82 - /* FALL THROUGH */ 83 - default: 84 - case __SI_KILL >> 16: 85 - put_user_ex(from->si_uid, &to->si_uid); 86 - break; 87 - case __SI_POLL >> 16: 88 - put_user_ex(from->si_fd, &to->si_fd); 89 - break; 90 - case __SI_TIMER >> 16: 91 - put_user_ex(from->si_overrun, &to->si_overrun); 92 - put_user_ex(ptr_to_compat(from->si_ptr), 93 - &to->si_ptr); 94 - break; 95 - /* This is not generated by the kernel as of now. */ 96 - case __SI_RT >> 16: 97 - case __SI_MESGQ >> 16: 98 - put_user_ex(from->si_uid, &to->si_uid); 99 - put_user_ex(from->si_int, &to->si_int); 100 - break; 101 - } 102 - } 103 - } put_user_catch(err); 104 - 105 - return err; 106 - } 107 - 108 - int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from) 109 - { 110 - int err = 0; 111 - u32 ptr32; 112 - 113 - if (!access_ok(VERIFY_READ, from, sizeof(compat_siginfo_t))) 114 - return -EFAULT; 115 - 116 - get_user_try { 117 - get_user_ex(to->si_signo, &from->si_signo); 118 - get_user_ex(to->si_errno, &from->si_errno); 119 - get_user_ex(to->si_code, &from->si_code); 120 - 121 - get_user_ex(to->si_pid, &from->si_pid); 122 - get_user_ex(to->si_uid, &from->si_uid); 123 - get_user_ex(ptr32, &from->si_ptr); 124 - to->si_ptr = compat_ptr(ptr32); 125 - } get_user_catch(err); 126 - 127 - return err; 128 - } 129 - 130 37 /* 131 38 * Do a signal return; undo the signal stack. 132 39 */
-11
arch/x86/include/asm/barrier.h
··· 91 91 #define smp_mb__before_atomic() barrier() 92 92 #define smp_mb__after_atomic() barrier() 93 93 94 - /* 95 - * Stop RDTSC speculation. This is needed when you need to use RDTSC 96 - * (or get_cycles or vread that possibly accesses the TSC) in a defined 97 - * code region. 98 - */ 99 - static __always_inline void rdtsc_barrier(void) 100 - { 101 - alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC, 102 - "lfence", X86_FEATURE_LFENCE_RDTSC); 103 - } 104 - 105 94 #endif /* _ASM_X86_BARRIER_H */
-10
arch/x86/include/asm/context_tracking.h
··· 1 - #ifndef _ASM_X86_CONTEXT_TRACKING_H 2 - #define _ASM_X86_CONTEXT_TRACKING_H 3 - 4 - #ifdef CONFIG_CONTEXT_TRACKING 5 - # define SCHEDULE_USER call schedule_user 6 - #else 7 - # define SCHEDULE_USER call schedule 8 - #endif 9 - 10 - #endif
+6 -11
arch/x86/include/asm/elf.h
··· 78 78 #ifdef CONFIG_X86_64 79 79 extern unsigned int vdso64_enabled; 80 80 #endif 81 - #if defined(CONFIG_X86_32) || defined(CONFIG_COMPAT) 81 + #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) 82 82 extern unsigned int vdso32_enabled; 83 83 #endif 84 84 ··· 187 187 #define COMPAT_ELF_PLAT_INIT(regs, load_addr) \ 188 188 elf_common_init(&current->thread, regs, __USER_DS) 189 189 190 - void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp); 191 - #define compat_start_thread start_thread_ia32 190 + void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp); 191 + #define compat_start_thread compat_start_thread 192 192 193 193 void set_personality_ia32(bool); 194 194 #define COMPAT_SET_PERSONALITY(ex) \ ··· 344 344 */ 345 345 static inline int mmap_is_ia32(void) 346 346 { 347 - #ifdef CONFIG_X86_32 348 - return 1; 349 - #endif 350 - #ifdef CONFIG_IA32_EMULATION 351 - if (test_thread_flag(TIF_ADDR32)) 352 - return 1; 353 - #endif 354 - return 0; 347 + return config_enabled(CONFIG_X86_32) || 348 + (config_enabled(CONFIG_COMPAT) && 349 + test_thread_flag(TIF_ADDR32)); 355 350 } 356 351 357 352 /* Do not change the values. See get_align_mask() */
-9
arch/x86/include/asm/ia32.h
··· 22 22 compat_sigset_t uc_sigmask; /* mask last for extensibility */ 23 23 }; 24 24 25 - struct ucontext_x32 { 26 - unsigned int uc_flags; 27 - unsigned int uc_link; 28 - compat_stack_t uc_stack; 29 - unsigned int uc__pad0; /* needed for alignment */ 30 - struct sigcontext uc_mcontext; /* the 64-bit sigcontext type */ 31 - compat_sigset_t uc_sigmask; /* mask last for extensibility */ 32 - }; 33 - 34 25 /* This matches struct stat64 in glibc2.2, hence the absolutely 35 26 * insane amounts of padding around dev_t's. 36 27 */
-10
arch/x86/include/asm/irq_vectors.h
··· 117 117 118 118 #define FPU_IRQ 13 119 119 120 - #define FIRST_VM86_IRQ 3 121 - #define LAST_VM86_IRQ 15 122 - 123 - #ifndef __ASSEMBLY__ 124 - static inline int invalid_vm86_irq(int irq) 125 - { 126 - return irq < FIRST_VM86_IRQ || irq > LAST_VM86_IRQ; 127 - } 128 - #endif 129 - 130 120 /* 131 121 * Size the maximum number of interrupts. 132 122 *
+1 -5
arch/x86/include/asm/math_emu.h
··· 2 2 #define _ASM_X86_MATH_EMU_H 3 3 4 4 #include <asm/ptrace.h> 5 - #include <asm/vm86.h> 6 5 7 6 /* This structure matches the layout of the data saved to the stack 8 7 following a device-not-present interrupt, part of it saved ··· 9 10 */ 10 11 struct math_emu_info { 11 12 long ___orig_eip; 12 - union { 13 - struct pt_regs *regs; 14 - struct kernel_vm86_regs *vm86; 15 - }; 13 + struct pt_regs *regs; 16 14 }; 17 15 #endif /* _ASM_X86_MATH_EMU_H */
+2
arch/x86/include/asm/mmu.h
··· 9 9 * we put the segment information here. 10 10 */ 11 11 typedef struct { 12 + #ifdef CONFIG_MODIFY_LDT_SYSCALL 12 13 struct ldt_struct *ldt; 14 + #endif 13 15 14 16 #ifdef CONFIG_X86_64 15 17 /* True if mm supports a task running in 32 bit compatibility mode. */
+21 -7
arch/x86/include/asm/mmu_context.h
··· 33 33 static inline void load_mm_cr4(struct mm_struct *mm) {} 34 34 #endif 35 35 36 + #ifdef CONFIG_MODIFY_LDT_SYSCALL 36 37 /* 37 38 * ldt_structs can be allocated, used, and freed, but they are never 38 39 * modified while live. ··· 49 48 int size; 50 49 }; 51 50 51 + /* 52 + * Used for LDT copy/destruction. 53 + */ 54 + int init_new_context(struct task_struct *tsk, struct mm_struct *mm); 55 + void destroy_context(struct mm_struct *mm); 56 + #else /* CONFIG_MODIFY_LDT_SYSCALL */ 57 + static inline int init_new_context(struct task_struct *tsk, 58 + struct mm_struct *mm) 59 + { 60 + return 0; 61 + } 62 + static inline void destroy_context(struct mm_struct *mm) {} 63 + #endif 64 + 52 65 static inline void load_mm_ldt(struct mm_struct *mm) 53 66 { 67 + #ifdef CONFIG_MODIFY_LDT_SYSCALL 54 68 struct ldt_struct *ldt; 55 69 56 70 /* lockless_dereference synchronizes with smp_store_release */ ··· 89 73 set_ldt(ldt->entries, ldt->size); 90 74 else 91 75 clear_LDT(); 76 + #else 77 + clear_LDT(); 78 + #endif 92 79 93 80 DEBUG_LOCKS_WARN_ON(preemptible()); 94 81 } 95 - 96 - /* 97 - * Used for LDT copy/destruction. 98 - */ 99 - int init_new_context(struct task_struct *tsk, struct mm_struct *mm); 100 - void destroy_context(struct mm_struct *mm); 101 - 102 82 103 83 static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) 104 84 { ··· 126 114 /* Load per-mm CR4 state */ 127 115 load_mm_cr4(next); 128 116 117 + #ifdef CONFIG_MODIFY_LDT_SYSCALL 129 118 /* 130 119 * Load the LDT, if the LDT is different. 131 120 * ··· 141 128 */ 142 129 if (unlikely(prev->context.ldt != next->context.ldt)) 143 130 load_mm_ldt(next); 131 + #endif 144 132 } 145 133 #ifdef CONFIG_SMP 146 134 else {
+39 -22
arch/x86/include/asm/msr.h
··· 47 47 * it means rax *or* rdx. 48 48 */ 49 49 #ifdef CONFIG_X86_64 50 - #define DECLARE_ARGS(val, low, high) unsigned low, high 51 - #define EAX_EDX_VAL(val, low, high) ((low) | ((u64)(high) << 32)) 52 - #define EAX_EDX_ARGS(val, low, high) "a" (low), "d" (high) 50 + /* Using 64-bit values saves one instruction clearing the high half of low */ 51 + #define DECLARE_ARGS(val, low, high) unsigned long low, high 52 + #define EAX_EDX_VAL(val, low, high) ((low) | (high) << 32) 53 53 #define EAX_EDX_RET(val, low, high) "=a" (low), "=d" (high) 54 54 #else 55 55 #define DECLARE_ARGS(val, low, high) unsigned long long val 56 56 #define EAX_EDX_VAL(val, low, high) (val) 57 - #define EAX_EDX_ARGS(val, low, high) "A" (val) 58 57 #define EAX_EDX_RET(val, low, high) "=A" (val) 59 58 #endif 60 59 ··· 105 106 return err; 106 107 } 107 108 108 - extern unsigned long long native_read_tsc(void); 109 - 110 109 extern int rdmsr_safe_regs(u32 regs[8]); 111 110 extern int wrmsr_safe_regs(u32 regs[8]); 112 111 113 - static __always_inline unsigned long long __native_read_tsc(void) 112 + /** 113 + * rdtsc() - returns the current TSC without ordering constraints 114 + * 115 + * rdtsc() returns the result of RDTSC as a 64-bit integer. The 116 + * only ordering constraint it supplies is the ordering implied by 117 + * "asm volatile": it will put the RDTSC in the place you expect. The 118 + * CPU can and will speculatively execute that RDTSC, though, so the 119 + * results can be non-monotonic if compared on different CPUs. 120 + */ 121 + static __always_inline unsigned long long rdtsc(void) 114 122 { 115 123 DECLARE_ARGS(val, low, high); 116 124 117 125 asm volatile("rdtsc" : EAX_EDX_RET(val, low, high)); 118 126 119 127 return EAX_EDX_VAL(val, low, high); 128 + } 129 + 130 + /** 131 + * rdtsc_ordered() - read the current TSC in program order 132 + * 133 + * rdtsc_ordered() returns the result of RDTSC as a 64-bit integer. 134 + * It is ordered like a load to a global in-memory counter. It should 135 + * be impossible to observe non-monotonic rdtsc_unordered() behavior 136 + * across multiple CPUs as long as the TSC is synced. 137 + */ 138 + static __always_inline unsigned long long rdtsc_ordered(void) 139 + { 140 + /* 141 + * The RDTSC instruction is not ordered relative to memory 142 + * access. The Intel SDM and the AMD APM are both vague on this 143 + * point, but empirically an RDTSC instruction can be 144 + * speculatively executed before prior loads. An RDTSC 145 + * immediately after an appropriate barrier appears to be 146 + * ordered as a normal load, that is, it provides the same 147 + * ordering guarantees as reading from a global memory location 148 + * that some other imaginary CPU is updating continuously with a 149 + * time stamp. 150 + */ 151 + alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC, 152 + "lfence", X86_FEATURE_LFENCE_RDTSC); 153 + return rdtsc(); 120 154 } 121 155 122 156 static inline unsigned long long native_read_pmc(int counter) ··· 212 180 return err; 213 181 } 214 182 215 - #define rdtscl(low) \ 216 - ((low) = (u32)__native_read_tsc()) 217 - 218 - #define rdtscll(val) \ 219 - ((val) = __native_read_tsc()) 220 - 221 183 #define rdpmc(counter, low, high) \ 222 184 do { \ 223 185 u64 _l = native_read_pmc((counter)); \ ··· 220 194 } while (0) 221 195 222 196 #define rdpmcl(counter, val) ((val) = native_read_pmc(counter)) 223 - 224 - #define rdtscp(low, high, aux) \ 225 - do { \ 226 - unsigned long long _val = native_read_tscp(&(aux)); \ 227 - (low) = (u32)_val; \ 228 - (high) = (u32)(_val >> 32); \ 229 - } while (0) 230 - 231 - #define rdtscpll(val, aux) (val) = native_read_tscp(&(aux)) 232 197 233 198 #endif /* !CONFIG_PARAVIRT */ 234 199
-34
arch/x86/include/asm/paravirt.h
··· 174 174 return err; 175 175 } 176 176 177 - static inline u64 paravirt_read_tsc(void) 178 - { 179 - return PVOP_CALL0(u64, pv_cpu_ops.read_tsc); 180 - } 181 - 182 - #define rdtscl(low) \ 183 - do { \ 184 - u64 _l = paravirt_read_tsc(); \ 185 - low = (int)_l; \ 186 - } while (0) 187 - 188 - #define rdtscll(val) (val = paravirt_read_tsc()) 189 - 190 177 static inline unsigned long long paravirt_sched_clock(void) 191 178 { 192 179 return PVOP_CALL0(unsigned long long, pv_time_ops.sched_clock); ··· 201 214 } while (0) 202 215 203 216 #define rdpmcl(counter, val) ((val) = paravirt_read_pmc(counter)) 204 - 205 - static inline unsigned long long paravirt_rdtscp(unsigned int *aux) 206 - { 207 - return PVOP_CALL1(u64, pv_cpu_ops.read_tscp, aux); 208 - } 209 - 210 - #define rdtscp(low, high, aux) \ 211 - do { \ 212 - int __aux; \ 213 - unsigned long __val = paravirt_rdtscp(&__aux); \ 214 - (low) = (u32)__val; \ 215 - (high) = (u32)(__val >> 32); \ 216 - (aux) = __aux; \ 217 - } while (0) 218 - 219 - #define rdtscpll(val, aux) \ 220 - do { \ 221 - unsigned long __aux; \ 222 - val = paravirt_rdtscp(&__aux); \ 223 - (aux) = __aux; \ 224 - } while (0) 225 217 226 218 static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries) 227 219 {
-2
arch/x86/include/asm/paravirt_types.h
··· 156 156 u64 (*read_msr)(unsigned int msr, int *err); 157 157 int (*write_msr)(unsigned int msr, unsigned low, unsigned high); 158 158 159 - u64 (*read_tsc)(void); 160 159 u64 (*read_pmc)(int counter); 161 - unsigned long long (*read_tscp)(unsigned int *aux); 162 160 163 161 #ifdef CONFIG_X86_32 164 162 /*
+3 -10
arch/x86/include/asm/processor.h
··· 6 6 /* Forward declaration, a strange C thing */ 7 7 struct task_struct; 8 8 struct mm_struct; 9 + struct vm86; 9 10 10 - #include <asm/vm86.h> 11 11 #include <asm/math_emu.h> 12 12 #include <asm/segment.h> 13 13 #include <asm/types.h> ··· 400 400 unsigned long cr2; 401 401 unsigned long trap_nr; 402 402 unsigned long error_code; 403 - #ifdef CONFIG_X86_32 403 + #ifdef CONFIG_VM86 404 404 /* Virtual 86 mode info */ 405 - struct vm86_struct __user *vm86_info; 406 - unsigned long screen_bitmap; 407 - unsigned long v86flags; 408 - unsigned long v86mask; 409 - unsigned long saved_sp0; 410 - unsigned int saved_fs; 411 - unsigned int saved_gs; 405 + struct vm86 *vm86; 412 406 #endif 413 407 /* IO permissions: */ 414 408 unsigned long *io_bitmap_ptr; ··· 714 720 715 721 #define INIT_THREAD { \ 716 722 .sp0 = TOP_OF_INIT_STACK, \ 717 - .vm86_info = NULL, \ 718 723 .sysenter_cs = __KERNEL_CS, \ 719 724 .io_bitmap_ptr = NULL, \ 720 725 }
+2 -8
arch/x86/include/asm/pvclock.h
··· 62 62 static __always_inline 63 63 u64 pvclock_get_nsec_offset(const struct pvclock_vcpu_time_info *src) 64 64 { 65 - u64 delta = __native_read_tsc() - src->tsc_timestamp; 65 + u64 delta = rdtsc_ordered() - src->tsc_timestamp; 66 66 return pvclock_scale_delta(delta, src->tsc_to_system_mul, 67 67 src->tsc_shift); 68 68 } ··· 76 76 u8 ret_flags; 77 77 78 78 version = src->version; 79 - /* Note: emulated platforms which do not advertise SSE2 support 80 - * result in kvmclock not using the necessary RDTSC barriers. 81 - * Without barriers, it is possible that RDTSC instruction reads from 82 - * the time stamp counter outside rdtsc_barrier protected section 83 - * below, resulting in violation of monotonicity. 84 - */ 85 - rdtsc_barrier(); 79 + 86 80 offset = pvclock_get_nsec_offset(src); 87 81 ret = src->system_time + offset; 88 82 ret_flags = src->flags;
+10
arch/x86/include/asm/sigframe.h
··· 4 4 #include <asm/sigcontext.h> 5 5 #include <asm/siginfo.h> 6 6 #include <asm/ucontext.h> 7 + #include <linux/compat.h> 7 8 8 9 #ifdef CONFIG_X86_32 9 10 #define sigframe_ia32 sigframe ··· 69 68 }; 70 69 71 70 #ifdef CONFIG_X86_X32_ABI 71 + 72 + struct ucontext_x32 { 73 + unsigned int uc_flags; 74 + unsigned int uc_link; 75 + compat_stack_t uc_stack; 76 + unsigned int uc__pad0; /* needed for alignment */ 77 + struct sigcontext uc_mcontext; /* the 64-bit sigcontext type */ 78 + compat_sigset_t uc_sigmask; /* mask last for extensibility */ 79 + }; 72 80 73 81 struct rt_sigframe_x32 { 74 82 u64 pretcode;
+1
arch/x86/include/asm/signal.h
··· 30 30 #endif /* __ASSEMBLY__ */ 31 31 #include <uapi/asm/signal.h> 32 32 #ifndef __ASSEMBLY__ 33 + extern void do_signal(struct pt_regs *regs); 33 34 extern void do_notify_resume(struct pt_regs *, void *, __u32); 34 35 35 36 #define __ARCH_HAS_SA_RESTORER
+1 -1
arch/x86/include/asm/stackprotector.h
··· 72 72 * on during the bootup the random pool has true entropy too. 73 73 */ 74 74 get_random_bytes(&canary, sizeof(canary)); 75 - tsc = __native_read_tsc(); 75 + tsc = rdtsc(); 76 76 canary += tsc + (tsc << 32UL); 77 77 78 78 current->stack_canary = canary;
+1
arch/x86/include/asm/syscalls.h
··· 37 37 asmlinkage unsigned long sys_sigreturn(void); 38 38 39 39 /* kernel/vm86_32.c */ 40 + struct vm86_struct; 40 41 asmlinkage long sys_vm86old(struct vm86_struct __user *); 41 42 asmlinkage long sys_vm86(unsigned long, unsigned long); 42 43
+7 -4
arch/x86/include/asm/thread_info.h
··· 27 27 * Without this offset, that can result in a page fault. (We are 28 28 * careful that, in this case, the value we read doesn't matter.) 29 29 * 30 - * In vm86 mode, the hardware frame is much longer still, but we neither 31 - * access the extra members from NMI context, nor do we write such a 32 - * frame at sp0 at all. 30 + * In vm86 mode, the hardware frame is much longer still, so add 16 31 + * bytes to make room for the real-mode segments. 33 32 * 34 33 * x86_64 has a fixed-length stack frame. 35 34 */ 36 35 #ifdef CONFIG_X86_32 37 - # define TOP_OF_KERNEL_STACK_PADDING 8 36 + # ifdef CONFIG_VM86 37 + # define TOP_OF_KERNEL_STACK_PADDING 16 38 + # else 39 + # define TOP_OF_KERNEL_STACK_PADDING 8 40 + # endif 38 41 #else 39 42 # define TOP_OF_KERNEL_STACK_PADDING 0 40 43 #endif
+2 -2
arch/x86/include/asm/traps.h
··· 112 112 asmlinkage void smp_deferred_error_interrupt(void); 113 113 #endif 114 114 115 - extern enum ctx_state ist_enter(struct pt_regs *regs); 116 - extern void ist_exit(struct pt_regs *regs, enum ctx_state prev_state); 115 + extern void ist_enter(struct pt_regs *regs); 116 + extern void ist_exit(struct pt_regs *regs); 117 117 extern void ist_begin_non_atomic(struct pt_regs *regs); 118 118 extern void ist_end_non_atomic(void); 119 119
+1 -17
arch/x86/include/asm/tsc.h
··· 21 21 22 22 static inline cycles_t get_cycles(void) 23 23 { 24 - unsigned long long ret = 0; 25 - 26 24 #ifndef CONFIG_X86_TSC 27 25 if (!cpu_has_tsc) 28 26 return 0; 29 27 #endif 30 - rdtscll(ret); 31 28 32 - return ret; 33 - } 34 - 35 - static __always_inline cycles_t vget_cycles(void) 36 - { 37 - /* 38 - * We only do VDSOs on TSC capable CPUs, so this shouldn't 39 - * access boot_cpu_data (which is not VDSO-safe): 40 - */ 41 - #ifndef CONFIG_X86_TSC 42 - if (!cpu_has_tsc) 43 - return 0; 44 - #endif 45 - return (cycles_t)__native_read_tsc(); 29 + return rdtsc(); 46 30 } 47 31 48 32 extern void tsc_init(void);
+33 -24
arch/x86/include/asm/vm86.h
··· 1 1 #ifndef _ASM_X86_VM86_H 2 2 #define _ASM_X86_VM86_H 3 3 4 - 5 4 #include <asm/ptrace.h> 6 5 #include <uapi/asm/vm86.h> 7 6 ··· 27 28 unsigned short gs, __gsh; 28 29 }; 29 30 30 - struct kernel_vm86_struct { 31 - struct kernel_vm86_regs regs; 32 - /* 33 - * the below part remains on the kernel stack while we are in VM86 mode. 34 - * 'tss.esp0' then contains the address of VM86_TSS_ESP0 below, and when we 35 - * get forced back from VM86, the CPU and "SAVE_ALL" will restore the above 36 - * 'struct kernel_vm86_regs' with the then actual values. 37 - * Therefore, pt_regs in fact points to a complete 'kernel_vm86_struct' 38 - * in kernelspace, hence we need not reget the data from userspace. 39 - */ 40 - #define VM86_TSS_ESP0 flags 31 + struct vm86 { 32 + struct vm86plus_struct __user *user_vm86; 33 + struct pt_regs regs32; 34 + unsigned long veflags; 35 + unsigned long veflags_mask; 36 + unsigned long saved_sp0; 37 + 41 38 unsigned long flags; 42 39 unsigned long screen_bitmap; 43 40 unsigned long cpu_type; 44 41 struct revectored_struct int_revectored; 45 42 struct revectored_struct int21_revectored; 46 43 struct vm86plus_info_struct vm86plus; 47 - struct pt_regs *regs32; /* here we save the pointer to the old regs */ 48 - /* 49 - * The below is not part of the structure, but the stack layout continues 50 - * this way. In front of 'return-eip' may be some data, depending on 51 - * compilation, so we don't rely on this and save the pointer to 'oldregs' 52 - * in 'regs32' above. 
53 - * However, with GCC-2.7.2 and the current CFLAGS you see exactly this: 54 - 55 - long return-eip; from call to vm86() 56 - struct pt_regs oldregs; user space registers as saved by syscall 57 - */ 58 44 }; 59 45 60 46 #ifdef CONFIG_VM86 61 47 62 48 void handle_vm86_fault(struct kernel_vm86_regs *, long); 63 49 int handle_vm86_trap(struct kernel_vm86_regs *, long, int); 64 - struct pt_regs *save_v86_state(struct kernel_vm86_regs *); 50 + void save_v86_state(struct kernel_vm86_regs *, int); 65 51 66 52 struct task_struct; 53 + 54 + #define free_vm86(t) do { \ 55 + struct thread_struct *__t = (t); \ 56 + if (__t->vm86 != NULL) { \ 57 + kfree(__t->vm86); \ 58 + __t->vm86 = NULL; \ 59 + } \ 60 + } while (0) 61 + 62 + /* 63 + * Support for VM86 programs to request interrupts for 64 + * real mode hardware drivers: 65 + */ 66 + #define FIRST_VM86_IRQ 3 67 + #define LAST_VM86_IRQ 15 68 + 69 + static inline int invalid_vm86_irq(int irq) 70 + { 71 + return irq < FIRST_VM86_IRQ || irq > LAST_VM86_IRQ; 72 + } 73 + 67 74 void release_vm86_irqs(struct task_struct *); 68 75 69 76 #else ··· 81 76 { 82 77 return 0; 83 78 } 79 + 80 + static inline void save_v86_state(struct kernel_vm86_regs *a, int b) { } 81 + 82 + #define free_vm86(t) do { } while(0) 84 83 85 84 #endif /* CONFIG_VM86 */ 86 85
+3 -1
arch/x86/kernel/Makefile
··· 23 23 CFLAGS_irq.o := -I$(src)/../include/asm/trace 24 24 25 25 obj-y := process_$(BITS).o signal.o 26 + obj-$(CONFIG_COMPAT) += signal_compat.o 26 27 obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o 27 - obj-y += time.o ioport.o ldt.o dumpstack.o nmi.o 28 + obj-y += time.o ioport.o dumpstack.o nmi.o 29 + obj-$(CONFIG_MODIFY_LDT_SYSCALL) += ldt.o 28 30 obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o 29 31 obj-$(CONFIG_IRQ_WORK) += irq_work.o 30 32 obj-y += probe_roms.o
+4 -4
arch/x86/kernel/apb_timer.c
··· 263 263 264 264 /* Verify whether apbt counter works */ 265 265 t1 = dw_apb_clocksource_read(clocksource_apbt); 266 - rdtscll(start); 266 + start = rdtsc(); 267 267 268 268 /* 269 269 * We don't know the TSC frequency yet, but waiting for ··· 273 273 */ 274 274 do { 275 275 rep_nop(); 276 - rdtscll(now); 276 + now = rdtsc(); 277 277 } while ((now - start) < 200000UL); 278 278 279 279 /* APBT is the only always on clocksource, it has to work! */ ··· 390 390 old = dw_apb_clocksource_read(clocksource_apbt); 391 391 old += loop; 392 392 393 - t1 = __native_read_tsc(); 393 + t1 = rdtsc(); 394 394 395 395 do { 396 396 new = dw_apb_clocksource_read(clocksource_apbt); 397 397 } while (new < old); 398 398 399 - t2 = __native_read_tsc(); 399 + t2 = rdtsc(); 400 400 401 401 shift = 5; 402 402 if (unlikely(loop >> shift == 0)) {
+4 -4
arch/x86/kernel/apic/apic.c
··· 457 457 { 458 458 u64 tsc; 459 459 460 - rdtscll(tsc); 460 + tsc = rdtsc(); 461 461 wrmsrl(MSR_IA32_TSC_DEADLINE, tsc + (((u64) delta) * TSC_DIVISOR)); 462 462 return 0; 463 463 } ··· 592 592 unsigned long pm = acpi_pm_read_early(); 593 593 594 594 if (cpu_has_tsc) 595 - rdtscll(tsc); 595 + tsc = rdtsc(); 596 596 597 597 switch (lapic_cal_loops++) { 598 598 case 0: ··· 1209 1209 long long max_loops = cpu_khz ? cpu_khz : 1000000; 1210 1210 1211 1211 if (cpu_has_tsc) 1212 - rdtscll(tsc); 1212 + tsc = rdtsc(); 1213 1213 1214 1214 if (disable_apic) { 1215 1215 disable_ioapic_support(); ··· 1293 1293 } 1294 1294 if (queued) { 1295 1295 if (cpu_has_tsc && cpu_khz) { 1296 - rdtscll(ntsc); 1296 + ntsc = rdtsc(); 1297 1297 max_loops = (cpu_khz << 10) - (ntsc - tsc); 1298 1298 } else 1299 1299 max_loops--;
+3 -3
arch/x86/kernel/cpu/amd.c
··· 114 114 const int K6_BUG_LOOP = 1000000; 115 115 int n; 116 116 void (*f_vide)(void); 117 - unsigned long d, d2; 117 + u64 d, d2; 118 118 119 119 printk(KERN_INFO "AMD K6 stepping B detected - "); 120 120 ··· 125 125 126 126 n = K6_BUG_LOOP; 127 127 f_vide = vide; 128 - rdtscl(d); 128 + d = rdtsc(); 129 129 while (n--) 130 130 f_vide(); 131 - rdtscl(d2); 131 + d2 = rdtsc(); 132 132 d = d2-d; 133 133 134 134 if (d > 20*K6_BUG_LOOP)
+4 -5
arch/x86/kernel/cpu/mcheck/mce.c
··· 125 125 { 126 126 memset(m, 0, sizeof(struct mce)); 127 127 m->cpu = m->extcpu = smp_processor_id(); 128 - rdtscll(m->tsc); 128 + m->tsc = rdtsc(); 129 129 /* We hope get_seconds stays lockless */ 130 130 m->time = get_seconds(); 131 131 m->cpuvendor = boot_cpu_data.x86_vendor; ··· 1029 1029 { 1030 1030 struct mca_config *cfg = &mca_cfg; 1031 1031 struct mce m, *final; 1032 - enum ctx_state prev_state; 1033 1032 int i; 1034 1033 int worst = 0; 1035 1034 int severity; ··· 1054 1055 int flags = MF_ACTION_REQUIRED; 1055 1056 int lmce = 0; 1056 1057 1057 - prev_state = ist_enter(regs); 1058 + ist_enter(regs); 1058 1059 1059 1060 this_cpu_inc(mce_exception_count); 1060 1061 ··· 1226 1227 local_irq_disable(); 1227 1228 ist_end_non_atomic(); 1228 1229 done: 1229 - ist_exit(regs, prev_state); 1230 + ist_exit(regs); 1230 1231 } 1231 1232 EXPORT_SYMBOL_GPL(do_machine_check); 1232 1233 ··· 1783 1784 { 1784 1785 unsigned long *cpu_tsc = (unsigned long *)data; 1785 1786 1786 - rdtscll(cpu_tsc[smp_processor_id()]); 1787 + cpu_tsc[smp_processor_id()] = rdtsc(); 1787 1788 } 1788 1789 1789 1790 static int mce_apei_read_done;
+2 -3
arch/x86/kernel/cpu/mcheck/p5.c
··· 19 19 /* Machine check handler for Pentium class Intel CPUs: */ 20 20 static void pentium_machine_check(struct pt_regs *regs, long error_code) 21 21 { 22 - enum ctx_state prev_state; 23 22 u32 loaddr, hi, lotype; 24 23 25 - prev_state = ist_enter(regs); 24 + ist_enter(regs); 26 25 27 26 rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi); 28 27 rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi); ··· 38 39 39 40 add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); 40 41 41 - ist_exit(regs, prev_state); 42 + ist_exit(regs); 42 43 } 43 44 44 45 /* Set up machine check reporting for processors with Intel style MCE: */
+2 -2
arch/x86/kernel/cpu/mcheck/winchip.c
··· 15 15 /* Machine check handler for WinChip C6: */ 16 16 static void winchip_machine_check(struct pt_regs *regs, long error_code) 17 17 { 18 - enum ctx_state prev_state = ist_enter(regs); 18 + ist_enter(regs); 19 19 20 20 printk(KERN_EMERG "CPU0: Machine Check Exception.\n"); 21 21 add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); 22 22 23 - ist_exit(regs, prev_state); 23 + ist_exit(regs); 24 24 } 25 25 26 26 /* Set up machine check reporting on the Winchip C6 series */
+5 -1
arch/x86/kernel/cpu/perf_event.c
··· 2179 2179 int idx = segment >> 3; 2180 2180 2181 2181 if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) { 2182 + #ifdef CONFIG_MODIFY_LDT_SYSCALL 2182 2183 struct ldt_struct *ldt; 2183 2184 2184 2185 if (idx > LDT_ENTRIES) ··· 2191 2190 return 0; 2192 2191 2193 2192 desc = &ldt->entries[idx]; 2193 + #else 2194 + return 0; 2195 + #endif 2194 2196 } else { 2195 2197 if (idx > GDT_ENTRIES) 2196 2198 return 0; ··· 2204 2200 return get_desc_base(desc); 2205 2201 } 2206 2202 2207 - #ifdef CONFIG_COMPAT 2203 + #ifdef CONFIG_IA32_EMULATION 2208 2204 2209 2205 #include <asm/compat.h> 2210 2206
+1 -1
arch/x86/kernel/espfix_64.c
··· 110 110 */ 111 111 if (!arch_get_random_long(&rand)) { 112 112 /* The constant is an arbitrary large prime */ 113 - rdtscll(rand); 113 + rand = rdtsc(); 114 114 rand *= 0xc345c6b72fd16123UL; 115 115 } 116 116
+2 -2
arch/x86/kernel/hpet.c
··· 735 735 736 736 /* Verify whether hpet counter works */ 737 737 t1 = hpet_readl(HPET_COUNTER); 738 - rdtscll(start); 738 + start = rdtsc(); 739 739 740 740 /* 741 741 * We don't know the TSC frequency yet, but waiting for ··· 745 745 */ 746 746 do { 747 747 rep_nop(); 748 - rdtscll(now); 748 + now = rdtsc(); 749 749 } while ((now - start) < 200000UL); 750 750 751 751 if (t1 == hpet_readl(HPET_COUNTER)) {
+15
arch/x86/kernel/irq.c
··· 216 216 unsigned vector = ~regs->orig_ax; 217 217 unsigned irq; 218 218 219 + /* 220 + * NB: Unlike exception entries, IRQ entries do not reliably 221 + * handle context tracking in the low-level entry code. This is 222 + * because syscall entries execute briefly with IRQs on before 223 + * updating context tracking state, so we can take an IRQ from 224 + * kernel mode with CONTEXT_USER. The low-level entry code only 225 + * updates the context if we came from user mode, so we won't 226 + * switch to CONTEXT_KERNEL. We'll fix that once the syscall 227 + * code is cleaned up enough that we can cleanly defer enabling 228 + * IRQs. 229 + */ 230 + 219 231 entering_irq(); 232 + 233 + /* entering_irq() tells RCU that we're not quiescent. Check it. */ 234 + rcu_lockdep_assert(rcu_is_watching(), "IRQ failed to wake up RCU"); 220 235 221 236 irq = __this_cpu_read(vector_irq[vector]); 222 237
+5 -5
arch/x86/kernel/nmi.c
··· 110 110 a->handler, whole_msecs, decimal_msecs); 111 111 } 112 112 113 - static int nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b) 113 + static int nmi_handle(unsigned int type, struct pt_regs *regs) 114 114 { 115 115 struct nmi_desc *desc = nmi_to_desc(type); 116 116 struct nmiaction *a; ··· 213 213 pci_serr_error(unsigned char reason, struct pt_regs *regs) 214 214 { 215 215 /* check to see if anyone registered against these types of errors */ 216 - if (nmi_handle(NMI_SERR, regs, false)) 216 + if (nmi_handle(NMI_SERR, regs)) 217 217 return; 218 218 219 219 pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n", ··· 247 247 unsigned long i; 248 248 249 249 /* check to see if anyone registered against these types of errors */ 250 - if (nmi_handle(NMI_IO_CHECK, regs, false)) 250 + if (nmi_handle(NMI_IO_CHECK, regs)) 251 251 return; 252 252 253 253 pr_emerg( ··· 284 284 * as only the first one is ever run (unless it can actually determine 285 285 * if it caused the NMI) 286 286 */ 287 - handled = nmi_handle(NMI_UNKNOWN, regs, false); 287 + handled = nmi_handle(NMI_UNKNOWN, regs); 288 288 if (handled) { 289 289 __this_cpu_add(nmi_stats.unknown, handled); 290 290 return; ··· 332 332 333 333 __this_cpu_write(last_nmi_rip, regs->ip); 334 334 335 - handled = nmi_handle(NMI_LOCAL, regs, b2b); 335 + handled = nmi_handle(NMI_LOCAL, regs); 336 336 __this_cpu_add(nmi_stats.normal, handled); 337 337 if (handled) { 338 338 /*
-2
arch/x86/kernel/paravirt.c
··· 351 351 .wbinvd = native_wbinvd, 352 352 .read_msr = native_read_msr_safe, 353 353 .write_msr = native_write_msr_safe, 354 - .read_tsc = native_read_tsc, 355 354 .read_pmc = native_read_pmc, 356 - .read_tscp = native_read_tscp, 357 355 .load_tr_desc = native_load_tr_desc, 358 356 .set_ldt = native_set_ldt, 359 357 .load_gdt = native_load_gdt,
-2
arch/x86/kernel/paravirt_patch_32.c
··· 10 10 DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3"); 11 11 DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax"); 12 12 DEF_NATIVE(pv_cpu_ops, clts, "clts"); 13 - DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc"); 14 13 15 14 #if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS) 16 15 DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%eax)"); ··· 51 52 PATCH_SITE(pv_mmu_ops, read_cr3); 52 53 PATCH_SITE(pv_mmu_ops, write_cr3); 53 54 PATCH_SITE(pv_cpu_ops, clts); 54 - PATCH_SITE(pv_cpu_ops, read_tsc); 55 55 #if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS) 56 56 case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock): 57 57 if (pv_is_native_spin_unlock()) {
+3
arch/x86/kernel/process.c
··· 29 29 #include <asm/debugreg.h> 30 30 #include <asm/nmi.h> 31 31 #include <asm/tlbflush.h> 32 + #include <asm/vm86.h> 32 33 33 34 /* 34 35 * per-CPU TSS segments. Threads are completely 'soft' on Linux, ··· 110 109 put_cpu(); 111 110 kfree(bp); 112 111 } 112 + 113 + free_vm86(t); 113 114 114 115 fpu__drop(fpu); 115 116 }
+1
arch/x86/kernel/process_32.c
··· 53 53 #include <asm/syscalls.h> 54 54 #include <asm/debugreg.h> 55 55 #include <asm/switch_to.h> 56 + #include <asm/vm86.h> 56 57 57 58 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); 58 59 asmlinkage void ret_from_kernel_thread(void) __asm__("ret_from_kernel_thread");
+4 -2
arch/x86/kernel/process_64.c
··· 121 121 void release_thread(struct task_struct *dead_task) 122 122 { 123 123 if (dead_task->mm) { 124 + #ifdef CONFIG_MODIFY_LDT_SYSCALL 124 125 if (dead_task->mm->context.ldt) { 125 126 pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n", 126 127 dead_task->comm, ··· 129 128 dead_task->mm->context.ldt->size); 130 129 BUG(); 131 130 } 131 + #endif 132 132 } 133 133 } 134 134 ··· 250 248 __USER_CS, __USER_DS, 0); 251 249 } 252 250 253 - #ifdef CONFIG_IA32_EMULATION 254 - void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp) 251 + #ifdef CONFIG_COMPAT 252 + void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp) 255 253 { 256 254 start_thread_common(regs, new_ip, new_sp, 257 255 test_thread_flag(TIF_X32)
+75 -265
arch/x86/kernel/ptrace.c
··· 37 37 #include <asm/proto.h> 38 38 #include <asm/hw_breakpoint.h> 39 39 #include <asm/traps.h> 40 + #include <asm/syscall.h> 40 41 41 42 #include "tls.h" 42 - 43 - #define CREATE_TRACE_POINTS 44 - #include <trace/events/syscalls.h> 45 43 46 44 enum x86_regset { 47 45 REGSET_GENERAL, ··· 1121 1123 return ret; 1122 1124 } 1123 1125 1126 + static long ia32_arch_ptrace(struct task_struct *child, compat_long_t request, 1127 + compat_ulong_t caddr, compat_ulong_t cdata) 1128 + { 1129 + unsigned long addr = caddr; 1130 + unsigned long data = cdata; 1131 + void __user *datap = compat_ptr(data); 1132 + int ret; 1133 + __u32 val; 1134 + 1135 + switch (request) { 1136 + case PTRACE_PEEKUSR: 1137 + ret = getreg32(child, addr, &val); 1138 + if (ret == 0) 1139 + ret = put_user(val, (__u32 __user *)datap); 1140 + break; 1141 + 1142 + case PTRACE_POKEUSR: 1143 + ret = putreg32(child, addr, data); 1144 + break; 1145 + 1146 + case PTRACE_GETREGS: /* Get all gp regs from the child. */ 1147 + return copy_regset_to_user(child, &user_x86_32_view, 1148 + REGSET_GENERAL, 1149 + 0, sizeof(struct user_regs_struct32), 1150 + datap); 1151 + 1152 + case PTRACE_SETREGS: /* Set all gp regs in the child. */ 1153 + return copy_regset_from_user(child, &user_x86_32_view, 1154 + REGSET_GENERAL, 0, 1155 + sizeof(struct user_regs_struct32), 1156 + datap); 1157 + 1158 + case PTRACE_GETFPREGS: /* Get the child FPU state. */ 1159 + return copy_regset_to_user(child, &user_x86_32_view, 1160 + REGSET_FP, 0, 1161 + sizeof(struct user_i387_ia32_struct), 1162 + datap); 1163 + 1164 + case PTRACE_SETFPREGS: /* Set the child FPU state. */ 1165 + return copy_regset_from_user( 1166 + child, &user_x86_32_view, REGSET_FP, 1167 + 0, sizeof(struct user_i387_ia32_struct), datap); 1168 + 1169 + case PTRACE_GETFPXREGS: /* Get the child extended FPU state. 
*/ 1170 + return copy_regset_to_user(child, &user_x86_32_view, 1171 + REGSET_XFP, 0, 1172 + sizeof(struct user32_fxsr_struct), 1173 + datap); 1174 + 1175 + case PTRACE_SETFPXREGS: /* Set the child extended FPU state. */ 1176 + return copy_regset_from_user(child, &user_x86_32_view, 1177 + REGSET_XFP, 0, 1178 + sizeof(struct user32_fxsr_struct), 1179 + datap); 1180 + 1181 + case PTRACE_GET_THREAD_AREA: 1182 + case PTRACE_SET_THREAD_AREA: 1183 + return arch_ptrace(child, request, addr, data); 1184 + 1185 + default: 1186 + return compat_ptrace_request(child, request, addr, data); 1187 + } 1188 + 1189 + return ret; 1190 + } 1191 + #endif /* CONFIG_IA32_EMULATION */ 1192 + 1124 1193 #ifdef CONFIG_X86_X32_ABI 1125 1194 static long x32_arch_ptrace(struct task_struct *child, 1126 1195 compat_long_t request, compat_ulong_t caddr, ··· 1276 1211 } 1277 1212 #endif 1278 1213 1214 + #ifdef CONFIG_COMPAT 1279 1215 long compat_arch_ptrace(struct task_struct *child, compat_long_t request, 1280 1216 compat_ulong_t caddr, compat_ulong_t cdata) 1281 1217 { 1282 - unsigned long addr = caddr; 1283 - unsigned long data = cdata; 1284 - void __user *datap = compat_ptr(data); 1285 - int ret; 1286 - __u32 val; 1287 - 1288 1218 #ifdef CONFIG_X86_X32_ABI 1289 1219 if (!is_ia32_task()) 1290 1220 return x32_arch_ptrace(child, request, caddr, cdata); 1291 1221 #endif 1292 - 1293 - switch (request) { 1294 - case PTRACE_PEEKUSR: 1295 - ret = getreg32(child, addr, &val); 1296 - if (ret == 0) 1297 - ret = put_user(val, (__u32 __user *)datap); 1298 - break; 1299 - 1300 - case PTRACE_POKEUSR: 1301 - ret = putreg32(child, addr, data); 1302 - break; 1303 - 1304 - case PTRACE_GETREGS: /* Get all gp regs from the child. */ 1305 - return copy_regset_to_user(child, &user_x86_32_view, 1306 - REGSET_GENERAL, 1307 - 0, sizeof(struct user_regs_struct32), 1308 - datap); 1309 - 1310 - case PTRACE_SETREGS: /* Set all gp regs in the child. 
*/ 1311 - return copy_regset_from_user(child, &user_x86_32_view, 1312 - REGSET_GENERAL, 0, 1313 - sizeof(struct user_regs_struct32), 1314 - datap); 1315 - 1316 - case PTRACE_GETFPREGS: /* Get the child FPU state. */ 1317 - return copy_regset_to_user(child, &user_x86_32_view, 1318 - REGSET_FP, 0, 1319 - sizeof(struct user_i387_ia32_struct), 1320 - datap); 1321 - 1322 - case PTRACE_SETFPREGS: /* Set the child FPU state. */ 1323 - return copy_regset_from_user( 1324 - child, &user_x86_32_view, REGSET_FP, 1325 - 0, sizeof(struct user_i387_ia32_struct), datap); 1326 - 1327 - case PTRACE_GETFPXREGS: /* Get the child extended FPU state. */ 1328 - return copy_regset_to_user(child, &user_x86_32_view, 1329 - REGSET_XFP, 0, 1330 - sizeof(struct user32_fxsr_struct), 1331 - datap); 1332 - 1333 - case PTRACE_SETFPXREGS: /* Set the child extended FPU state. */ 1334 - return copy_regset_from_user(child, &user_x86_32_view, 1335 - REGSET_XFP, 0, 1336 - sizeof(struct user32_fxsr_struct), 1337 - datap); 1338 - 1339 - case PTRACE_GET_THREAD_AREA: 1340 - case PTRACE_SET_THREAD_AREA: 1341 - return arch_ptrace(child, request, addr, data); 1342 - 1343 - default: 1344 - return compat_ptrace_request(child, request, addr, data); 1345 - } 1346 - 1347 - return ret; 1222 + #ifdef CONFIG_IA32_EMULATION 1223 + return ia32_arch_ptrace(child, request, caddr, cdata); 1224 + #else 1225 + return 0; 1226 + #endif 1348 1227 } 1349 - 1350 - #endif /* CONFIG_IA32_EMULATION */ 1228 + #endif /* CONFIG_COMPAT */ 1351 1229 1352 1230 #ifdef CONFIG_X86_64 1353 1231 ··· 1441 1433 fill_sigtrap_info(tsk, regs, error_code, si_code, &info); 1442 1434 /* Send us the fake SIGTRAP */ 1443 1435 force_sig_info(SIGTRAP, &info, tsk); 1444 - } 1445 - 1446 - static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch) 1447 - { 1448 - #ifdef CONFIG_X86_64 1449 - if (arch == AUDIT_ARCH_X86_64) { 1450 - audit_syscall_entry(regs->orig_ax, regs->di, 1451 - regs->si, regs->dx, regs->r10); 1452 - } else 1453 - #endif 1454 - { 
1455 - audit_syscall_entry(regs->orig_ax, regs->bx, 1456 - regs->cx, regs->dx, regs->si); 1457 - } 1458 - } 1459 - 1460 - /* 1461 - * We can return 0 to resume the syscall or anything else to go to phase 1462 - * 2. If we resume the syscall, we need to put something appropriate in 1463 - * regs->orig_ax. 1464 - * 1465 - * NB: We don't have full pt_regs here, but regs->orig_ax and regs->ax 1466 - * are fully functional. 1467 - * 1468 - * For phase 2's benefit, our return value is: 1469 - * 0: resume the syscall 1470 - * 1: go to phase 2; no seccomp phase 2 needed 1471 - * anything else: go to phase 2; pass return value to seccomp 1472 - */ 1473 - unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch) 1474 - { 1475 - unsigned long ret = 0; 1476 - u32 work; 1477 - 1478 - BUG_ON(regs != task_pt_regs(current)); 1479 - 1480 - work = ACCESS_ONCE(current_thread_info()->flags) & 1481 - _TIF_WORK_SYSCALL_ENTRY; 1482 - 1483 - /* 1484 - * If TIF_NOHZ is set, we are required to call user_exit() before 1485 - * doing anything that could touch RCU. 1486 - */ 1487 - if (work & _TIF_NOHZ) { 1488 - user_exit(); 1489 - work &= ~_TIF_NOHZ; 1490 - } 1491 - 1492 - #ifdef CONFIG_SECCOMP 1493 - /* 1494 - * Do seccomp first -- it should minimize exposure of other 1495 - * code, and keeping seccomp fast is probably more valuable 1496 - * than the rest of this. 
1497 - */ 1498 - if (work & _TIF_SECCOMP) { 1499 - struct seccomp_data sd; 1500 - 1501 - sd.arch = arch; 1502 - sd.nr = regs->orig_ax; 1503 - sd.instruction_pointer = regs->ip; 1504 - #ifdef CONFIG_X86_64 1505 - if (arch == AUDIT_ARCH_X86_64) { 1506 - sd.args[0] = regs->di; 1507 - sd.args[1] = regs->si; 1508 - sd.args[2] = regs->dx; 1509 - sd.args[3] = regs->r10; 1510 - sd.args[4] = regs->r8; 1511 - sd.args[5] = regs->r9; 1512 - } else 1513 - #endif 1514 - { 1515 - sd.args[0] = regs->bx; 1516 - sd.args[1] = regs->cx; 1517 - sd.args[2] = regs->dx; 1518 - sd.args[3] = regs->si; 1519 - sd.args[4] = regs->di; 1520 - sd.args[5] = regs->bp; 1521 - } 1522 - 1523 - BUILD_BUG_ON(SECCOMP_PHASE1_OK != 0); 1524 - BUILD_BUG_ON(SECCOMP_PHASE1_SKIP != 1); 1525 - 1526 - ret = seccomp_phase1(&sd); 1527 - if (ret == SECCOMP_PHASE1_SKIP) { 1528 - regs->orig_ax = -1; 1529 - ret = 0; 1530 - } else if (ret != SECCOMP_PHASE1_OK) { 1531 - return ret; /* Go directly to phase 2 */ 1532 - } 1533 - 1534 - work &= ~_TIF_SECCOMP; 1535 - } 1536 - #endif 1537 - 1538 - /* Do our best to finish without phase 2. */ 1539 - if (work == 0) 1540 - return ret; /* seccomp and/or nohz only (ret == 0 here) */ 1541 - 1542 - #ifdef CONFIG_AUDITSYSCALL 1543 - if (work == _TIF_SYSCALL_AUDIT) { 1544 - /* 1545 - * If there is no more work to be done except auditing, 1546 - * then audit in phase 1. Phase 2 always audits, so, if 1547 - * we audit here, then we can't go on to phase 2. 1548 - */ 1549 - do_audit_syscall_entry(regs, arch); 1550 - return 0; 1551 - } 1552 - #endif 1553 - 1554 - return 1; /* Something is enabled that we can't handle in phase 1 */ 1555 - } 1556 - 1557 - /* Returns the syscall nr to run (which should match regs->orig_ax). 
*/ 1558 - long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch, 1559 - unsigned long phase1_result) 1560 - { 1561 - long ret = 0; 1562 - u32 work = ACCESS_ONCE(current_thread_info()->flags) & 1563 - _TIF_WORK_SYSCALL_ENTRY; 1564 - 1565 - BUG_ON(regs != task_pt_regs(current)); 1566 - 1567 - /* 1568 - * If we stepped into a sysenter/syscall insn, it trapped in 1569 - * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP. 1570 - * If user-mode had set TF itself, then it's still clear from 1571 - * do_debug() and we need to set it again to restore the user 1572 - * state. If we entered on the slow path, TF was already set. 1573 - */ 1574 - if (work & _TIF_SINGLESTEP) 1575 - regs->flags |= X86_EFLAGS_TF; 1576 - 1577 - #ifdef CONFIG_SECCOMP 1578 - /* 1579 - * Call seccomp_phase2 before running the other hooks so that 1580 - * they can see any changes made by a seccomp tracer. 1581 - */ 1582 - if (phase1_result > 1 && seccomp_phase2(phase1_result)) { 1583 - /* seccomp failures shouldn't expose any additional code. */ 1584 - return -1; 1585 - } 1586 - #endif 1587 - 1588 - if (unlikely(work & _TIF_SYSCALL_EMU)) 1589 - ret = -1L; 1590 - 1591 - if ((ret || test_thread_flag(TIF_SYSCALL_TRACE)) && 1592 - tracehook_report_syscall_entry(regs)) 1593 - ret = -1L; 1594 - 1595 - if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) 1596 - trace_sys_enter(regs, regs->orig_ax); 1597 - 1598 - do_audit_syscall_entry(regs, arch); 1599 - 1600 - return ret ?: regs->orig_ax; 1601 - } 1602 - 1603 - long syscall_trace_enter(struct pt_regs *regs) 1604 - { 1605 - u32 arch = is_ia32_task() ? 
AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64; 1606 - unsigned long phase1_result = syscall_trace_enter_phase1(regs, arch); 1607 - 1608 - if (phase1_result == 0) 1609 - return regs->orig_ax; 1610 - else 1611 - return syscall_trace_enter_phase2(regs, arch, phase1_result); 1612 - } 1613 - 1614 - void syscall_trace_leave(struct pt_regs *regs) 1615 - { 1616 - bool step; 1617 - 1618 - /* 1619 - * We may come here right after calling schedule_user() 1620 - * or do_notify_resume(), in which case we can be in RCU 1621 - * user mode. 1622 - */ 1623 - user_exit(); 1624 - 1625 - audit_syscall_exit(regs); 1626 - 1627 - if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) 1628 - trace_sys_exit(regs, regs->ax); 1629 - 1630 - /* 1631 - * If TIF_SYSCALL_EMU is set, we only get here because of 1632 - * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP). 1633 - * We already reported this syscall instruction in 1634 - * syscall_trace_enter(). 1635 - */ 1636 - step = unlikely(test_thread_flag(TIF_SINGLESTEP)) && 1637 - !test_thread_flag(TIF_SYSCALL_EMU); 1638 - if (step || test_thread_flag(TIF_SYSCALL_TRACE)) 1639 - tracehook_report_syscall_exit(regs, step); 1640 - 1641 - user_enter(); 1642 1436 }
+5 -28
arch/x86/kernel/signal.c
··· 31 31 #include <asm/vdso.h> 32 32 #include <asm/mce.h> 33 33 #include <asm/sighandling.h> 34 + #include <asm/vm86.h> 34 35 35 36 #ifdef CONFIG_X86_64 36 37 #include <asm/proto.h> 37 38 #include <asm/ia32_unistd.h> 38 - #include <asm/sys_ia32.h> 39 39 #endif /* CONFIG_X86_64 */ 40 40 41 41 #include <asm/syscall.h> ··· 636 636 bool stepping, failed; 637 637 struct fpu *fpu = &current->thread.fpu; 638 638 639 + if (v8086_mode(regs)) 640 + save_v86_state((struct kernel_vm86_regs *) regs, VM86_SIGNAL); 641 + 639 642 /* Are we from a system call? */ 640 643 if (syscall_get_nr(current, regs) >= 0) { 641 644 /* If so, check system call restarting.. */ ··· 704 701 * want to handle. Thus you cannot kill init even with a SIGKILL even by 705 702 * mistake. 706 703 */ 707 - static void do_signal(struct pt_regs *regs) 704 + void do_signal(struct pt_regs *regs) 708 705 { 709 706 struct ksignal ksig; 710 707 ··· 737 734 * back. 738 735 */ 739 736 restore_saved_sigmask(); 740 - } 741 - 742 - /* 743 - * notification of userspace execution resumption 744 - * - triggered by the TIF_WORK_MASK flags 745 - */ 746 - __visible void 747 - do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) 748 - { 749 - user_exit(); 750 - 751 - if (thread_info_flags & _TIF_UPROBE) 752 - uprobe_notify_resume(regs); 753 - 754 - /* deal with pending signal delivery */ 755 - if (thread_info_flags & _TIF_SIGPENDING) 756 - do_signal(regs); 757 - 758 - if (thread_info_flags & _TIF_NOTIFY_RESUME) { 759 - clear_thread_flag(TIF_NOTIFY_RESUME); 760 - tracehook_notify_resume(regs); 761 - } 762 - if (thread_info_flags & _TIF_USER_RETURN_NOTIFY) 763 - fire_user_return_notifiers(); 764 - 765 - user_enter(); 766 737 } 767 738 768 739 void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
+95
arch/x86/kernel/signal_compat.c
··· 1 + #include <linux/compat.h> 2 + #include <linux/uaccess.h> 3 + 4 + int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from) 5 + { 6 + int err = 0; 7 + bool ia32 = test_thread_flag(TIF_IA32); 8 + 9 + if (!access_ok(VERIFY_WRITE, to, sizeof(compat_siginfo_t))) 10 + return -EFAULT; 11 + 12 + put_user_try { 13 + /* If you change siginfo_t structure, please make sure that 14 + this code is fixed accordingly. 15 + It should never copy any pad contained in the structure 16 + to avoid security leaks, but must copy the generic 17 + 3 ints plus the relevant union member. */ 18 + put_user_ex(from->si_signo, &to->si_signo); 19 + put_user_ex(from->si_errno, &to->si_errno); 20 + put_user_ex((short)from->si_code, &to->si_code); 21 + 22 + if (from->si_code < 0) { 23 + put_user_ex(from->si_pid, &to->si_pid); 24 + put_user_ex(from->si_uid, &to->si_uid); 25 + put_user_ex(ptr_to_compat(from->si_ptr), &to->si_ptr); 26 + } else { 27 + /* 28 + * First 32bits of unions are always present: 29 + * si_pid === si_band === si_tid === si_addr(LS half) 30 + */ 31 + put_user_ex(from->_sifields._pad[0], 32 + &to->_sifields._pad[0]); 33 + switch (from->si_code >> 16) { 34 + case __SI_FAULT >> 16: 35 + break; 36 + case __SI_SYS >> 16: 37 + put_user_ex(from->si_syscall, &to->si_syscall); 38 + put_user_ex(from->si_arch, &to->si_arch); 39 + break; 40 + case __SI_CHLD >> 16: 41 + if (ia32) { 42 + put_user_ex(from->si_utime, &to->si_utime); 43 + put_user_ex(from->si_stime, &to->si_stime); 44 + } else { 45 + put_user_ex(from->si_utime, &to->_sifields._sigchld_x32._utime); 46 + put_user_ex(from->si_stime, &to->_sifields._sigchld_x32._stime); 47 + } 48 + put_user_ex(from->si_status, &to->si_status); 49 + /* FALL THROUGH */ 50 + default: 51 + case __SI_KILL >> 16: 52 + put_user_ex(from->si_uid, &to->si_uid); 53 + break; 54 + case __SI_POLL >> 16: 55 + put_user_ex(from->si_fd, &to->si_fd); 56 + break; 57 + case __SI_TIMER >> 16: 58 + put_user_ex(from->si_overrun, &to->si_overrun); 
59 + put_user_ex(ptr_to_compat(from->si_ptr), 60 + &to->si_ptr); 61 + break; 62 + /* This is not generated by the kernel as of now. */ 63 + case __SI_RT >> 16: 64 + case __SI_MESGQ >> 16: 65 + put_user_ex(from->si_uid, &to->si_uid); 66 + put_user_ex(from->si_int, &to->si_int); 67 + break; 68 + } 69 + } 70 + } put_user_catch(err); 71 + 72 + return err; 73 + } 74 + 75 + int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from) 76 + { 77 + int err = 0; 78 + u32 ptr32; 79 + 80 + if (!access_ok(VERIFY_READ, from, sizeof(compat_siginfo_t))) 81 + return -EFAULT; 82 + 83 + get_user_try { 84 + get_user_ex(to->si_signo, &from->si_signo); 85 + get_user_ex(to->si_errno, &from->si_errno); 86 + get_user_ex(to->si_code, &from->si_code); 87 + 88 + get_user_ex(to->si_pid, &from->si_pid); 89 + get_user_ex(to->si_uid, &from->si_uid); 90 + get_user_ex(ptr32, &from->si_ptr); 91 + to->si_ptr = compat_ptr(ptr32); 92 + } get_user_catch(err); 93 + 94 + return err; 95 + }
+2
arch/x86/kernel/step.c
··· 18 18 return addr; 19 19 } 20 20 21 + #ifdef CONFIG_MODIFY_LDT_SYSCALL 21 22 /* 22 23 * We'll assume that the code segments in the GDT 23 24 * are all zero-based. That is largely true: the ··· 46 45 } 47 46 mutex_unlock(&child->mm->context.lock); 48 47 } 48 + #endif 49 49 50 50 return addr; 51 51 }
+1 -6
arch/x86/kernel/trace_clock.c
··· 12 12 */ 13 13 u64 notrace trace_clock_x86_tsc(void) 14 14 { 15 - u64 ret; 16 - 17 - rdtsc_barrier(); 18 - rdtscll(ret); 19 - 20 - return ret; 15 + return rdtsc_ordered(); 21 16 }
+29 -59
arch/x86/kernel/traps.c
··· 62 62 #include <asm/fpu/xstate.h> 63 63 #include <asm/trace/mpx.h> 64 64 #include <asm/mpx.h> 65 + #include <asm/vm86.h> 65 66 66 67 #ifdef CONFIG_X86_64 67 68 #include <asm/x86_init.h> ··· 109 108 preempt_count_dec(); 110 109 } 111 110 112 - enum ctx_state ist_enter(struct pt_regs *regs) 111 + void ist_enter(struct pt_regs *regs) 113 112 { 114 - enum ctx_state prev_state; 115 - 116 113 if (user_mode(regs)) { 117 - /* Other than that, we're just an exception. */ 118 - prev_state = exception_enter(); 114 + CT_WARN_ON(ct_state() != CONTEXT_KERNEL); 119 115 } else { 120 116 /* 121 117 * We might have interrupted pretty much anything. In ··· 121 123 * but we need to notify RCU. 122 124 */ 123 125 rcu_nmi_enter(); 124 - prev_state = CONTEXT_KERNEL; /* the value is irrelevant. */ 125 126 } 126 127 127 128 /* 128 - * We are atomic because we're on the IST stack (or we're on x86_32, 129 - * in which case we still shouldn't schedule). 130 - * 131 - * This must be after exception_enter(), because exception_enter() 132 - * won't do anything if in_interrupt() returns true. 129 + * We are atomic because we're on the IST stack; or we're on 130 + * x86_32, in which case we still shouldn't schedule; or we're 131 + * on x86_64 and entered from user mode, in which case we're 132 + * still atomic unless ist_begin_non_atomic is called. 133 133 */ 134 134 preempt_count_add(HARDIRQ_OFFSET); 135 135 136 136 /* This code is a bit fragile. Test it. */ 137 137 rcu_lockdep_assert(rcu_is_watching(), "ist_enter didn't work"); 138 - 139 - return prev_state; 140 138 } 141 139 142 - void ist_exit(struct pt_regs *regs, enum ctx_state prev_state) 140 + void ist_exit(struct pt_regs *regs) 143 141 { 144 - /* Must be before exception_exit. */ 145 142 preempt_count_sub(HARDIRQ_OFFSET); 146 143 147 - if (user_mode(regs)) 148 - return exception_exit(prev_state); 149 - else 144 + if (!user_mode(regs)) 150 145 rcu_nmi_exit(); 151 146 } 152 147 ··· 153 162 * a double fault, it can be safe to schedule. 
ist_begin_non_atomic() 154 163 * begins a non-atomic section within an ist_enter()/ist_exit() region. 155 164 * Callers are responsible for enabling interrupts themselves inside 156 - * the non-atomic section, and callers must call is_end_non_atomic() 165 + * the non-atomic section, and callers must call ist_end_non_atomic() 157 166 * before ist_exit(). 158 167 */ 159 168 void ist_begin_non_atomic(struct pt_regs *regs) ··· 280 289 static void do_error_trap(struct pt_regs *regs, long error_code, char *str, 281 290 unsigned long trapnr, int signr) 282 291 { 283 - enum ctx_state prev_state = exception_enter(); 284 292 siginfo_t info; 293 + 294 + CT_WARN_ON(ct_state() != CONTEXT_KERNEL); 285 295 286 296 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) != 287 297 NOTIFY_STOP) { ··· 290 298 do_trap(trapnr, signr, str, regs, error_code, 291 299 fill_trap_info(regs, signr, trapnr, &info)); 292 300 } 293 - 294 - exception_exit(prev_state); 295 301 } 296 302 297 303 #define DO_ERROR(trapnr, signr, str, name) \ ··· 341 351 } 342 352 #endif 343 353 344 - ist_enter(regs); /* Discard prev_state because we won't return. 
*/ 354 + ist_enter(regs); 345 355 notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV); 346 356 347 357 tsk->thread.error_code = error_code; ··· 361 371 362 372 dotraplinkage void do_bounds(struct pt_regs *regs, long error_code) 363 373 { 364 - enum ctx_state prev_state; 365 374 const struct bndcsr *bndcsr; 366 375 siginfo_t *info; 367 376 368 - prev_state = exception_enter(); 377 + CT_WARN_ON(ct_state() != CONTEXT_KERNEL); 369 378 if (notify_die(DIE_TRAP, "bounds", regs, error_code, 370 379 X86_TRAP_BR, SIGSEGV) == NOTIFY_STOP) 371 - goto exit; 380 + return; 372 381 conditional_sti(regs); 373 382 374 383 if (!user_mode(regs)) ··· 424 435 die("bounds", regs, error_code); 425 436 } 426 437 427 - exit: 428 - exception_exit(prev_state); 429 438 return; 439 + 430 440 exit_trap: 431 441 /* 432 442 * This path out is for all the cases where we could not ··· 435 447 * time.. 436 448 */ 437 449 do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, NULL); 438 - exception_exit(prev_state); 439 450 } 440 451 441 452 dotraplinkage void 442 453 do_general_protection(struct pt_regs *regs, long error_code) 443 454 { 444 455 struct task_struct *tsk; 445 - enum ctx_state prev_state; 446 456 447 - prev_state = exception_enter(); 457 + CT_WARN_ON(ct_state() != CONTEXT_KERNEL); 448 458 conditional_sti(regs); 449 459 450 460 if (v8086_mode(regs)) { 451 461 local_irq_enable(); 452 462 handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); 453 - goto exit; 463 + return; 454 464 } 455 465 456 466 tsk = current; 457 467 if (!user_mode(regs)) { 458 468 if (fixup_exception(regs)) 459 - goto exit; 469 + return; 460 470 461 471 tsk->thread.error_code = error_code; 462 472 tsk->thread.trap_nr = X86_TRAP_GP; 463 473 if (notify_die(DIE_GPF, "general protection fault", regs, error_code, 464 474 X86_TRAP_GP, SIGSEGV) != NOTIFY_STOP) 465 475 die("general protection fault", regs, error_code); 466 - goto exit; 476 + return; 467 477 } 468 478 469 479 tsk->thread.error_code = 
error_code; ··· 477 491 } 478 492 479 493 force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk); 480 - exit: 481 - exception_exit(prev_state); 482 494 } 483 495 NOKPROBE_SYMBOL(do_general_protection); 484 496 485 497 /* May run on IST stack. */ 486 498 dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) 487 499 { 488 - enum ctx_state prev_state; 489 - 490 500 #ifdef CONFIG_DYNAMIC_FTRACE 491 501 /* 492 502 * ftrace must be first, everything else may cause a recursive crash. ··· 495 513 if (poke_int3_handler(regs)) 496 514 return; 497 515 498 - prev_state = ist_enter(regs); 516 + ist_enter(regs); 517 + CT_WARN_ON(ct_state() != CONTEXT_KERNEL); 499 518 #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP 500 519 if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, 501 520 SIGTRAP) == NOTIFY_STOP) ··· 522 539 preempt_conditional_cli(regs); 523 540 debug_stack_usage_dec(); 524 541 exit: 525 - ist_exit(regs, prev_state); 542 + ist_exit(regs); 526 543 } 527 544 NOKPROBE_SYMBOL(do_int3); 528 545 ··· 598 615 dotraplinkage void do_debug(struct pt_regs *regs, long error_code) 599 616 { 600 617 struct task_struct *tsk = current; 601 - enum ctx_state prev_state; 602 618 int user_icebp = 0; 603 619 unsigned long dr6; 604 620 int si_code; 605 621 606 - prev_state = ist_enter(regs); 622 + ist_enter(regs); 607 623 608 624 get_debugreg(dr6, 6); 609 625 ··· 677 695 debug_stack_usage_dec(); 678 696 679 697 exit: 680 - ist_exit(regs, prev_state); 698 + ist_exit(regs); 681 699 } 682 700 NOKPROBE_SYMBOL(do_debug); 683 701 ··· 729 747 730 748 dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code) 731 749 { 732 - enum ctx_state prev_state; 733 - 734 - prev_state = exception_enter(); 750 + CT_WARN_ON(ct_state() != CONTEXT_KERNEL); 735 751 math_error(regs, error_code, X86_TRAP_MF); 736 - exception_exit(prev_state); 737 752 } 738 753 739 754 dotraplinkage void 740 755 do_simd_coprocessor_error(struct pt_regs *regs, long error_code) 741 756 { 742 - enum ctx_state 
prev_state; 743 - 744 - prev_state = exception_enter(); 757 + CT_WARN_ON(ct_state() != CONTEXT_KERNEL); 745 758 math_error(regs, error_code, X86_TRAP_XF); 746 - exception_exit(prev_state); 747 759 } 748 760 749 761 dotraplinkage void ··· 749 773 dotraplinkage void 750 774 do_device_not_available(struct pt_regs *regs, long error_code) 751 775 { 752 - enum ctx_state prev_state; 753 - 754 - prev_state = exception_enter(); 776 + CT_WARN_ON(ct_state() != CONTEXT_KERNEL); 755 777 BUG_ON(use_eager_fpu()); 756 778 757 779 #ifdef CONFIG_MATH_EMULATION ··· 760 786 761 787 info.regs = regs; 762 788 math_emulate(&info); 763 - exception_exit(prev_state); 764 789 return; 765 790 } 766 791 #endif ··· 767 794 #ifdef CONFIG_X86_32 768 795 conditional_sti(regs); 769 796 #endif 770 - exception_exit(prev_state); 771 797 } 772 798 NOKPROBE_SYMBOL(do_device_not_available); 773 799 ··· 774 802 dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) 775 803 { 776 804 siginfo_t info; 777 - enum ctx_state prev_state; 778 805 779 - prev_state = exception_enter(); 806 + CT_WARN_ON(ct_state() != CONTEXT_KERNEL); 780 807 local_irq_enable(); 781 808 782 809 info.si_signo = SIGILL; ··· 787 816 do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, error_code, 788 817 &info); 789 818 } 790 - exception_exit(prev_state); 791 819 } 792 820 #endif 793 821
+3 -9
arch/x86/kernel/tsc.c
··· 248 248 249 249 data = cyc2ns_write_begin(cpu); 250 250 251 - rdtscll(tsc_now); 251 + tsc_now = rdtsc(); 252 252 ns_now = cycles_2_ns(tsc_now); 253 253 254 254 /* ··· 290 290 } 291 291 292 292 /* read the Time Stamp Counter: */ 293 - rdtscll(tsc_now); 293 + tsc_now = rdtsc(); 294 294 295 295 /* return the value in ns */ 296 296 return cycles_2_ns(tsc_now); ··· 307 307 unsigned long long 308 308 sched_clock(void) __attribute__((alias("native_sched_clock"))); 309 309 #endif 310 - 311 - unsigned long long native_read_tsc(void) 312 - { 313 - return __native_read_tsc(); 314 - } 315 - EXPORT_SYMBOL(native_read_tsc); 316 310 317 311 int check_tsc_unstable(void) 318 312 { ··· 970 976 */ 971 977 static cycle_t read_tsc(struct clocksource *cs) 972 978 { 973 - return (cycle_t)get_cycles(); 979 + return (cycle_t)rdtsc_ordered(); 974 980 } 975 981 976 982 /*
+6 -8
arch/x86/kernel/tsc_sync.c
··· 39 39 static int nr_warps; 40 40 41 41 /* 42 - * TSC-warp measurement loop running on both CPUs: 42 + * TSC-warp measurement loop running on both CPUs. This is not called 43 + * if there is no TSC. 43 44 */ 44 45 static void check_tsc_warp(unsigned int timeout) 45 46 { 46 47 cycles_t start, now, prev, end; 47 48 int i; 48 49 49 - rdtsc_barrier(); 50 - start = get_cycles(); 51 - rdtsc_barrier(); 50 + start = rdtsc_ordered(); 52 51 /* 53 52 * The measurement runs for 'timeout' msecs: 54 53 */ ··· 62 63 */ 63 64 arch_spin_lock(&sync_lock); 64 65 prev = last_tsc; 65 - rdtsc_barrier(); 66 - now = get_cycles(); 67 - rdtsc_barrier(); 66 + now = rdtsc_ordered(); 68 67 last_tsc = now; 69 68 arch_spin_unlock(&sync_lock); 70 69 ··· 123 126 124 127 /* 125 128 * No need to check if we already know that the TSC is not 126 - * synchronized: 129 + * synchronized or if we have no TSC. 127 130 */ 128 131 if (unsynchronized_tsc()) 129 132 return; ··· 187 190 { 188 191 int cpus = 2; 189 192 193 + /* Also aborts if there is no TSC. */ 190 194 if (unsynchronized_tsc() || tsc_clocksource_reliable) 191 195 return; 192 196
+183 -190
arch/x86/kernel/vm86_32.c
··· 44 44 #include <linux/ptrace.h> 45 45 #include <linux/audit.h> 46 46 #include <linux/stddef.h> 47 + #include <linux/slab.h> 47 48 48 49 #include <asm/uaccess.h> 49 50 #include <asm/io.h> 50 51 #include <asm/tlbflush.h> 51 52 #include <asm/irq.h> 53 + #include <asm/traps.h> 54 + #include <asm/vm86.h> 52 55 53 56 /* 54 57 * Known problems: ··· 69 66 */ 70 67 71 68 72 - #define KVM86 ((struct kernel_vm86_struct *)regs) 73 - #define VMPI KVM86->vm86plus 74 - 75 - 76 69 /* 77 70 * 8- and 16-bit register defines.. 78 71 */ ··· 80 81 /* 81 82 * virtual flags (16 and 32-bit versions) 82 83 */ 83 - #define VFLAGS (*(unsigned short *)&(current->thread.v86flags)) 84 - #define VEFLAGS (current->thread.v86flags) 84 + #define VFLAGS (*(unsigned short *)&(current->thread.vm86->veflags)) 85 + #define VEFLAGS (current->thread.vm86->veflags) 85 86 86 87 #define set_flags(X, new, mask) \ 87 88 ((X) = ((X) & ~(mask)) | ((new) & (mask))) ··· 89 90 #define SAFE_MASK (0xDD5) 90 91 #define RETURN_MASK (0xDFF) 91 92 92 - /* convert kernel_vm86_regs to vm86_regs */ 93 - static int copy_vm86_regs_to_user(struct vm86_regs __user *user, 94 - const struct kernel_vm86_regs *regs) 95 - { 96 - int ret = 0; 97 - 98 - /* 99 - * kernel_vm86_regs is missing gs, so copy everything up to 100 - * (but not including) orig_eax, and then rest including orig_eax. 
101 - */ 102 - ret += copy_to_user(user, regs, offsetof(struct kernel_vm86_regs, pt.orig_ax)); 103 - ret += copy_to_user(&user->orig_eax, &regs->pt.orig_ax, 104 - sizeof(struct kernel_vm86_regs) - 105 - offsetof(struct kernel_vm86_regs, pt.orig_ax)); 106 - 107 - return ret; 108 - } 109 - 110 - /* convert vm86_regs to kernel_vm86_regs */ 111 - static int copy_vm86_regs_from_user(struct kernel_vm86_regs *regs, 112 - const struct vm86_regs __user *user, 113 - unsigned extra) 114 - { 115 - int ret = 0; 116 - 117 - /* copy ax-fs inclusive */ 118 - ret += copy_from_user(regs, user, offsetof(struct kernel_vm86_regs, pt.orig_ax)); 119 - /* copy orig_ax-__gsh+extra */ 120 - ret += copy_from_user(&regs->pt.orig_ax, &user->orig_eax, 121 - sizeof(struct kernel_vm86_regs) - 122 - offsetof(struct kernel_vm86_regs, pt.orig_ax) + 123 - extra); 124 - return ret; 125 - } 126 - 127 - struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs) 93 + void save_v86_state(struct kernel_vm86_regs *regs, int retval) 128 94 { 129 95 struct tss_struct *tss; 130 - struct pt_regs *ret; 131 - unsigned long tmp; 96 + struct task_struct *tsk = current; 97 + struct vm86plus_struct __user *user; 98 + struct vm86 *vm86 = current->thread.vm86; 99 + long err = 0; 132 100 133 101 /* 134 102 * This gets called from entry.S with interrupts disabled, but ··· 104 138 */ 105 139 local_irq_enable(); 106 140 107 - if (!current->thread.vm86_info) { 108 - pr_alert("no vm86_info: BAD\n"); 141 + if (!vm86 || !vm86->user_vm86) { 142 + pr_alert("no user_vm86: BAD\n"); 109 143 do_exit(SIGSEGV); 110 144 } 111 - set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | current->thread.v86mask); 112 - tmp = copy_vm86_regs_to_user(&current->thread.vm86_info->regs, regs); 113 - tmp += put_user(current->thread.screen_bitmap, &current->thread.vm86_info->screen_bitmap); 114 - if (tmp) { 115 - pr_alert("could not access userspace vm86_info\n"); 145 + set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | vm86->veflags_mask); 146 + 
user = vm86->user_vm86; 147 + 148 + if (!access_ok(VERIFY_WRITE, user, vm86->vm86plus.is_vm86pus ? 149 + sizeof(struct vm86plus_struct) : 150 + sizeof(struct vm86_struct))) { 151 + pr_alert("could not access userspace vm86 info\n"); 152 + do_exit(SIGSEGV); 153 + } 154 + 155 + put_user_try { 156 + put_user_ex(regs->pt.bx, &user->regs.ebx); 157 + put_user_ex(regs->pt.cx, &user->regs.ecx); 158 + put_user_ex(regs->pt.dx, &user->regs.edx); 159 + put_user_ex(regs->pt.si, &user->regs.esi); 160 + put_user_ex(regs->pt.di, &user->regs.edi); 161 + put_user_ex(regs->pt.bp, &user->regs.ebp); 162 + put_user_ex(regs->pt.ax, &user->regs.eax); 163 + put_user_ex(regs->pt.ip, &user->regs.eip); 164 + put_user_ex(regs->pt.cs, &user->regs.cs); 165 + put_user_ex(regs->pt.flags, &user->regs.eflags); 166 + put_user_ex(regs->pt.sp, &user->regs.esp); 167 + put_user_ex(regs->pt.ss, &user->regs.ss); 168 + put_user_ex(regs->es, &user->regs.es); 169 + put_user_ex(regs->ds, &user->regs.ds); 170 + put_user_ex(regs->fs, &user->regs.fs); 171 + put_user_ex(regs->gs, &user->regs.gs); 172 + 173 + put_user_ex(vm86->screen_bitmap, &user->screen_bitmap); 174 + } put_user_catch(err); 175 + if (err) { 176 + pr_alert("could not access userspace vm86 info\n"); 116 177 do_exit(SIGSEGV); 117 178 } 118 179 119 180 tss = &per_cpu(cpu_tss, get_cpu()); 120 - current->thread.sp0 = current->thread.saved_sp0; 121 - current->thread.sysenter_cs = __KERNEL_CS; 122 - load_sp0(tss, &current->thread); 123 - current->thread.saved_sp0 = 0; 181 + tsk->thread.sp0 = vm86->saved_sp0; 182 + tsk->thread.sysenter_cs = __KERNEL_CS; 183 + load_sp0(tss, &tsk->thread); 184 + vm86->saved_sp0 = 0; 124 185 put_cpu(); 125 186 126 - ret = KVM86->regs32; 187 + memcpy(&regs->pt, &vm86->regs32, sizeof(struct pt_regs)); 127 188 128 - ret->fs = current->thread.saved_fs; 129 - set_user_gs(ret, current->thread.saved_gs); 189 + lazy_load_gs(vm86->regs32.gs); 130 190 131 - return ret; 191 + regs->pt.ax = retval; 132 192 } 133 193 134 194 static void 
mark_screen_rdonly(struct mm_struct *mm) ··· 192 200 193 201 194 202 static int do_vm86_irq_handling(int subfunction, int irqnumber); 195 - static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk); 203 + static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus); 196 204 197 - SYSCALL_DEFINE1(vm86old, struct vm86_struct __user *, v86) 205 + SYSCALL_DEFINE1(vm86old, struct vm86_struct __user *, user_vm86) 198 206 { 199 - struct kernel_vm86_struct info; /* declare this _on top_, 200 - * this avoids wasting of stack space. 201 - * This remains on the stack until we 202 - * return to 32 bit user space. 203 - */ 204 - struct task_struct *tsk = current; 205 - int tmp; 206 - 207 - if (tsk->thread.saved_sp0) 208 - return -EPERM; 209 - tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs, 210 - offsetof(struct kernel_vm86_struct, vm86plus) - 211 - sizeof(info.regs)); 212 - if (tmp) 213 - return -EFAULT; 214 - memset(&info.vm86plus, 0, (int)&info.regs32 - (int)&info.vm86plus); 215 - info.regs32 = current_pt_regs(); 216 - tsk->thread.vm86_info = v86; 217 - do_sys_vm86(&info, tsk); 218 - return 0; /* we never return here */ 207 + return do_sys_vm86((struct vm86plus_struct __user *) user_vm86, false); 219 208 } 220 209 221 210 222 211 SYSCALL_DEFINE2(vm86, unsigned long, cmd, unsigned long, arg) 223 212 { 224 - struct kernel_vm86_struct info; /* declare this _on top_, 225 - * this avoids wasting of stack space. 226 - * This remains on the stack until we 227 - * return to 32 bit user space. 
228 - */ 229 - struct task_struct *tsk; 230 - int tmp; 231 - struct vm86plus_struct __user *v86; 232 - 233 - tsk = current; 234 213 switch (cmd) { 235 214 case VM86_REQUEST_IRQ: 236 215 case VM86_FREE_IRQ: ··· 219 256 } 220 257 221 258 /* we come here only for functions VM86_ENTER, VM86_ENTER_NO_BYPASS */ 222 - if (tsk->thread.saved_sp0) 223 - return -EPERM; 224 - v86 = (struct vm86plus_struct __user *)arg; 225 - tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs, 226 - offsetof(struct kernel_vm86_struct, regs32) - 227 - sizeof(info.regs)); 228 - if (tmp) 229 - return -EFAULT; 230 - info.regs32 = current_pt_regs(); 231 - info.vm86plus.is_vm86pus = 1; 232 - tsk->thread.vm86_info = (struct vm86_struct __user *)v86; 233 - do_sys_vm86(&info, tsk); 234 - return 0; /* we never return here */ 259 + return do_sys_vm86((struct vm86plus_struct __user *) arg, true); 235 260 } 236 261 237 262 238 - static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk) 263 + static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) 239 264 { 240 265 struct tss_struct *tss; 241 - /* 242 - * make sure the vm86() system call doesn't try to do anything silly 243 - */ 244 - info->regs.pt.ds = 0; 245 - info->regs.pt.es = 0; 246 - info->regs.pt.fs = 0; 247 - #ifndef CONFIG_X86_32_LAZY_GS 248 - info->regs.pt.gs = 0; 249 - #endif 266 + struct task_struct *tsk = current; 267 + struct vm86 *vm86 = tsk->thread.vm86; 268 + struct kernel_vm86_regs vm86regs; 269 + struct pt_regs *regs = current_pt_regs(); 270 + unsigned long err = 0; 271 + 272 + if (!vm86) { 273 + if (!(vm86 = kzalloc(sizeof(*vm86), GFP_KERNEL))) 274 + return -ENOMEM; 275 + tsk->thread.vm86 = vm86; 276 + } 277 + if (vm86->saved_sp0) 278 + return -EPERM; 279 + 280 + if (!access_ok(VERIFY_READ, user_vm86, plus ? 
281 + sizeof(struct vm86_struct) : 282 + sizeof(struct vm86plus_struct))) 283 + return -EFAULT; 284 + 285 + memset(&vm86regs, 0, sizeof(vm86regs)); 286 + get_user_try { 287 + unsigned short seg; 288 + get_user_ex(vm86regs.pt.bx, &user_vm86->regs.ebx); 289 + get_user_ex(vm86regs.pt.cx, &user_vm86->regs.ecx); 290 + get_user_ex(vm86regs.pt.dx, &user_vm86->regs.edx); 291 + get_user_ex(vm86regs.pt.si, &user_vm86->regs.esi); 292 + get_user_ex(vm86regs.pt.di, &user_vm86->regs.edi); 293 + get_user_ex(vm86regs.pt.bp, &user_vm86->regs.ebp); 294 + get_user_ex(vm86regs.pt.ax, &user_vm86->regs.eax); 295 + get_user_ex(vm86regs.pt.ip, &user_vm86->regs.eip); 296 + get_user_ex(seg, &user_vm86->regs.cs); 297 + vm86regs.pt.cs = seg; 298 + get_user_ex(vm86regs.pt.flags, &user_vm86->regs.eflags); 299 + get_user_ex(vm86regs.pt.sp, &user_vm86->regs.esp); 300 + get_user_ex(seg, &user_vm86->regs.ss); 301 + vm86regs.pt.ss = seg; 302 + get_user_ex(vm86regs.es, &user_vm86->regs.es); 303 + get_user_ex(vm86regs.ds, &user_vm86->regs.ds); 304 + get_user_ex(vm86regs.fs, &user_vm86->regs.fs); 305 + get_user_ex(vm86regs.gs, &user_vm86->regs.gs); 306 + 307 + get_user_ex(vm86->flags, &user_vm86->flags); 308 + get_user_ex(vm86->screen_bitmap, &user_vm86->screen_bitmap); 309 + get_user_ex(vm86->cpu_type, &user_vm86->cpu_type); 310 + } get_user_catch(err); 311 + if (err) 312 + return err; 313 + 314 + if (copy_from_user(&vm86->int_revectored, 315 + &user_vm86->int_revectored, 316 + sizeof(struct revectored_struct))) 317 + return -EFAULT; 318 + if (copy_from_user(&vm86->int21_revectored, 319 + &user_vm86->int21_revectored, 320 + sizeof(struct revectored_struct))) 321 + return -EFAULT; 322 + if (plus) { 323 + if (copy_from_user(&vm86->vm86plus, &user_vm86->vm86plus, 324 + sizeof(struct vm86plus_info_struct))) 325 + return -EFAULT; 326 + vm86->vm86plus.is_vm86pus = 1; 327 + } else 328 + memset(&vm86->vm86plus, 0, 329 + sizeof(struct vm86plus_info_struct)); 330 + 331 + memcpy(&vm86->regs32, regs, 
sizeof(struct pt_regs)); 332 + vm86->user_vm86 = user_vm86; 250 333 251 334 /* 252 335 * The flags register is also special: we cannot trust that the user 253 336 * has set it up safely, so this makes sure interrupt etc flags are 254 337 * inherited from protected mode. 255 338 */ 256 - VEFLAGS = info->regs.pt.flags; 257 - info->regs.pt.flags &= SAFE_MASK; 258 - info->regs.pt.flags |= info->regs32->flags & ~SAFE_MASK; 259 - info->regs.pt.flags |= X86_VM_MASK; 339 + VEFLAGS = vm86regs.pt.flags; 340 + vm86regs.pt.flags &= SAFE_MASK; 341 + vm86regs.pt.flags |= regs->flags & ~SAFE_MASK; 342 + vm86regs.pt.flags |= X86_VM_MASK; 260 343 261 - switch (info->cpu_type) { 344 + vm86regs.pt.orig_ax = regs->orig_ax; 345 + 346 + switch (vm86->cpu_type) { 262 347 case CPU_286: 263 - tsk->thread.v86mask = 0; 348 + vm86->veflags_mask = 0; 264 349 break; 265 350 case CPU_386: 266 - tsk->thread.v86mask = X86_EFLAGS_NT | X86_EFLAGS_IOPL; 351 + vm86->veflags_mask = X86_EFLAGS_NT | X86_EFLAGS_IOPL; 267 352 break; 268 353 case CPU_486: 269 - tsk->thread.v86mask = X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL; 354 + vm86->veflags_mask = X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL; 270 355 break; 271 356 default: 272 - tsk->thread.v86mask = X86_EFLAGS_ID | X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL; 357 + vm86->veflags_mask = X86_EFLAGS_ID | X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL; 273 358 break; 274 359 } 275 360 276 361 /* 277 - * Save old state, set default return value (%ax) to 0 (VM86_SIGNAL) 362 + * Save old state 278 363 */ 279 - info->regs32->ax = VM86_SIGNAL; 280 - tsk->thread.saved_sp0 = tsk->thread.sp0; 281 - tsk->thread.saved_fs = info->regs32->fs; 282 - tsk->thread.saved_gs = get_user_gs(info->regs32); 364 + vm86->saved_sp0 = tsk->thread.sp0; 365 + lazy_save_gs(vm86->regs32.gs); 283 366 284 367 tss = &per_cpu(cpu_tss, get_cpu()); 285 - tsk->thread.sp0 = (unsigned long) &info->VM86_TSS_ESP0; 368 + /* make room for real-mode segments */ 369 + tsk->thread.sp0 
+= 16; 286 370 if (cpu_has_sep) 287 371 tsk->thread.sysenter_cs = 0; 288 372 load_sp0(tss, &tsk->thread); 289 373 put_cpu(); 290 374 291 - tsk->thread.screen_bitmap = info->screen_bitmap; 292 - if (info->flags & VM86_SCREEN_BITMAP) 375 + if (vm86->flags & VM86_SCREEN_BITMAP) 293 376 mark_screen_rdonly(tsk->mm); 294 377 295 - /*call __audit_syscall_exit since we do not exit via the normal paths */ 296 - #ifdef CONFIG_AUDITSYSCALL 297 - if (unlikely(current->audit_context)) 298 - __audit_syscall_exit(1, 0); 299 - #endif 300 - 301 - __asm__ __volatile__( 302 - "movl %0,%%esp\n\t" 303 - "movl %1,%%ebp\n\t" 304 - #ifdef CONFIG_X86_32_LAZY_GS 305 - "mov %2, %%gs\n\t" 306 - #endif 307 - "jmp resume_userspace" 308 - : /* no outputs */ 309 - :"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0)); 310 - /* we never return here */ 311 - } 312 - 313 - static inline void return_to_32bit(struct kernel_vm86_regs *regs16, int retval) 314 - { 315 - struct pt_regs *regs32; 316 - 317 - regs32 = save_v86_state(regs16); 318 - regs32->ax = retval; 319 - __asm__ __volatile__("movl %0,%%esp\n\t" 320 - "movl %1,%%ebp\n\t" 321 - "jmp resume_userspace" 322 - : : "r" (regs32), "r" (current_thread_info())); 378 + memcpy((struct kernel_vm86_regs *)regs, &vm86regs, sizeof(vm86regs)); 379 + force_iret(); 380 + return regs->ax; 323 381 } 324 382 325 383 static inline void set_IF(struct kernel_vm86_regs *regs) 326 384 { 327 385 VEFLAGS |= X86_EFLAGS_VIF; 328 - if (VEFLAGS & X86_EFLAGS_VIP) 329 - return_to_32bit(regs, VM86_STI); 330 386 } 331 387 332 388 static inline void clear_IF(struct kernel_vm86_regs *regs) ··· 377 395 378 396 static inline void set_vflags_long(unsigned long flags, struct kernel_vm86_regs *regs) 379 397 { 380 - set_flags(VEFLAGS, flags, current->thread.v86mask); 398 + set_flags(VEFLAGS, flags, current->thread.vm86->veflags_mask); 381 399 set_flags(regs->pt.flags, flags, SAFE_MASK); 382 400 if (flags & X86_EFLAGS_IF) 383 401 set_IF(regs); ··· 387 405 388 406 static inline 
void set_vflags_short(unsigned short flags, struct kernel_vm86_regs *regs) 389 407 { 390 - set_flags(VFLAGS, flags, current->thread.v86mask); 408 + set_flags(VFLAGS, flags, current->thread.vm86->veflags_mask); 391 409 set_flags(regs->pt.flags, flags, SAFE_MASK); 392 410 if (flags & X86_EFLAGS_IF) 393 411 set_IF(regs); ··· 402 420 if (VEFLAGS & X86_EFLAGS_VIF) 403 421 flags |= X86_EFLAGS_IF; 404 422 flags |= X86_EFLAGS_IOPL; 405 - return flags | (VEFLAGS & current->thread.v86mask); 423 + return flags | (VEFLAGS & current->thread.vm86->veflags_mask); 406 424 } 407 425 408 426 static inline int is_revectored(int nr, struct revectored_struct *bitmap) ··· 500 518 { 501 519 unsigned long __user *intr_ptr; 502 520 unsigned long segoffs; 521 + struct vm86 *vm86 = current->thread.vm86; 503 522 504 523 if (regs->pt.cs == BIOSSEG) 505 524 goto cannot_handle; 506 - if (is_revectored(i, &KVM86->int_revectored)) 525 + if (is_revectored(i, &vm86->int_revectored)) 507 526 goto cannot_handle; 508 - if (i == 0x21 && is_revectored(AH(regs), &KVM86->int21_revectored)) 527 + if (i == 0x21 && is_revectored(AH(regs), &vm86->int21_revectored)) 509 528 goto cannot_handle; 510 529 intr_ptr = (unsigned long __user *) (i << 2); 511 530 if (get_user(segoffs, intr_ptr)) ··· 525 542 return; 526 543 527 544 cannot_handle: 528 - return_to_32bit(regs, VM86_INTx + (i << 8)); 545 + save_v86_state(regs, VM86_INTx + (i << 8)); 529 546 } 530 547 531 548 int handle_vm86_trap(struct kernel_vm86_regs *regs, long error_code, int trapno) 532 549 { 533 - if (VMPI.is_vm86pus) { 550 + struct vm86 *vm86 = current->thread.vm86; 551 + 552 + if (vm86->vm86plus.is_vm86pus) { 534 553 if ((trapno == 3) || (trapno == 1)) { 535 - KVM86->regs32->ax = VM86_TRAP + (trapno << 8); 536 - /* setting this flag forces the code in entry_32.S to 537 - the path where we call save_v86_state() and change 538 - the stack pointer to KVM86->regs32 */ 539 - set_thread_flag(TIF_NOTIFY_RESUME); 554 + save_v86_state(regs, VM86_TRAP + 
(trapno << 8)); 540 555 return 0; 541 556 } 542 557 do_int(regs, trapno, (unsigned char __user *) (regs->pt.ss << 4), SP(regs)); ··· 555 574 unsigned char __user *ssp; 556 575 unsigned short ip, sp, orig_flags; 557 576 int data32, pref_done; 577 + struct vm86plus_info_struct *vmpi = &current->thread.vm86->vm86plus; 558 578 559 579 #define CHECK_IF_IN_TRAP \ 560 - if (VMPI.vm86dbg_active && VMPI.vm86dbg_TFpendig) \ 580 + if (vmpi->vm86dbg_active && vmpi->vm86dbg_TFpendig) \ 561 581 newflags |= X86_EFLAGS_TF 562 - #define VM86_FAULT_RETURN do { \ 563 - if (VMPI.force_return_for_pic && (VEFLAGS & (X86_EFLAGS_IF | X86_EFLAGS_VIF))) \ 564 - return_to_32bit(regs, VM86_PICRETURN); \ 565 - if (orig_flags & X86_EFLAGS_TF) \ 566 - handle_vm86_trap(regs, 0, 1); \ 567 - return; } while (0) 568 582 569 583 orig_flags = *(unsigned short *)&regs->pt.flags; 570 584 ··· 598 622 SP(regs) -= 2; 599 623 } 600 624 IP(regs) = ip; 601 - VM86_FAULT_RETURN; 625 + goto vm86_fault_return; 602 626 603 627 /* popf */ 604 628 case 0x9d: ··· 618 642 else 619 643 set_vflags_short(newflags, regs); 620 644 621 - VM86_FAULT_RETURN; 645 + goto check_vip; 622 646 } 623 647 624 648 /* int xx */ 625 649 case 0xcd: { 626 650 int intno = popb(csp, ip, simulate_sigsegv); 627 651 IP(regs) = ip; 628 - if (VMPI.vm86dbg_active) { 629 - if ((1 << (intno & 7)) & VMPI.vm86dbg_intxxtab[intno >> 3]) 630 - return_to_32bit(regs, VM86_INTx + (intno << 8)); 652 + if (vmpi->vm86dbg_active) { 653 + if ((1 << (intno & 7)) & vmpi->vm86dbg_intxxtab[intno >> 3]) { 654 + save_v86_state(regs, VM86_INTx + (intno << 8)); 655 + return; 656 + } 631 657 } 632 658 do_int(regs, intno, ssp, sp); 633 659 return; ··· 660 682 } else { 661 683 set_vflags_short(newflags, regs); 662 684 } 663 - VM86_FAULT_RETURN; 685 + goto check_vip; 664 686 } 665 687 666 688 /* cli */ 667 689 case 0xfa: 668 690 IP(regs) = ip; 669 691 clear_IF(regs); 670 - VM86_FAULT_RETURN; 692 + goto vm86_fault_return; 671 693 672 694 /* sti */ 673 695 /* ··· 679 701 
case 0xfb: 680 702 IP(regs) = ip; 681 703 set_IF(regs); 682 - VM86_FAULT_RETURN; 704 + goto check_vip; 683 705 684 706 default: 685 - return_to_32bit(regs, VM86_UNKNOWN); 707 + save_v86_state(regs, VM86_UNKNOWN); 686 708 } 687 709 710 + return; 711 + 712 + check_vip: 713 + if (VEFLAGS & X86_EFLAGS_VIP) { 714 + save_v86_state(regs, VM86_STI); 715 + return; 716 + } 717 + 718 + vm86_fault_return: 719 + if (vmpi->force_return_for_pic && (VEFLAGS & (X86_EFLAGS_IF | X86_EFLAGS_VIF))) { 720 + save_v86_state(regs, VM86_PICRETURN); 721 + return; 722 + } 723 + if (orig_flags & X86_EFLAGS_TF) 724 + handle_vm86_trap(regs, 0, X86_TRAP_DB); 688 725 return; 689 726 690 727 simulate_sigsegv: ··· 713 720 * should be a mixture of the two, but how do we 714 721 * get the information? [KD] 715 722 */ 716 - return_to_32bit(regs, VM86_UNKNOWN); 723 + save_v86_state(regs, VM86_UNKNOWN); 717 724 } 718 725 719 726 /* ---------------- vm86 special IRQ passing stuff ----------------- */
+2 -2
arch/x86/kvm/lapic.c
··· 1172 1172 1173 1173 tsc_deadline = apic->lapic_timer.expired_tscdeadline; 1174 1174 apic->lapic_timer.expired_tscdeadline = 0; 1175 - guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc()); 1175 + guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, rdtsc()); 1176 1176 trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline); 1177 1177 1178 1178 /* __delay is delay_tsc whenever the hardware has TSC, thus always. */ ··· 1240 1240 local_irq_save(flags); 1241 1241 1242 1242 now = apic->lapic_timer.timer.base->get_time(); 1243 - guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc()); 1243 + guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, rdtsc()); 1244 1244 if (likely(tscdeadline > guest_tsc)) { 1245 1245 ns = (tscdeadline - guest_tsc) * 1000000ULL; 1246 1246 do_div(ns, this_tsc_khz);
+2 -2
arch/x86/kvm/svm.c
··· 1139 1139 { 1140 1140 u64 tsc; 1141 1141 1142 - tsc = svm_scale_tsc(vcpu, native_read_tsc()); 1142 + tsc = svm_scale_tsc(vcpu, rdtsc()); 1143 1143 1144 1144 return target_tsc - tsc; 1145 1145 } ··· 3172 3172 switch (msr_info->index) { 3173 3173 case MSR_IA32_TSC: { 3174 3174 msr_info->data = svm->vmcb->control.tsc_offset + 3175 - svm_scale_tsc(vcpu, native_read_tsc()); 3175 + svm_scale_tsc(vcpu, rdtsc()); 3176 3176 3177 3177 break; 3178 3178 }
+2 -2
arch/x86/kvm/vmx.c
··· 2236 2236 { 2237 2237 u64 host_tsc, tsc_offset; 2238 2238 2239 - rdtscll(host_tsc); 2239 + host_tsc = rdtsc(); 2240 2240 tsc_offset = vmcs_read64(TSC_OFFSET); 2241 2241 return host_tsc + tsc_offset; 2242 2242 } ··· 2317 2317 2318 2318 static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) 2319 2319 { 2320 - return target_tsc - native_read_tsc(); 2320 + return target_tsc - rdtsc(); 2321 2321 } 2322 2322 2323 2323 static bool guest_cpuid_has_vmx(struct kvm_vcpu *vcpu)
+7 -19
arch/x86/kvm/x86.c
··· 1444 1444 1445 1445 static cycle_t read_tsc(void) 1446 1446 { 1447 - cycle_t ret; 1448 - u64 last; 1449 - 1450 - /* 1451 - * Empirically, a fence (of type that depends on the CPU) 1452 - * before rdtsc is enough to ensure that rdtsc is ordered 1453 - * with respect to loads. The various CPU manuals are unclear 1454 - * as to whether rdtsc can be reordered with later loads, 1455 - * but no one has ever seen it happen. 1456 - */ 1457 - rdtsc_barrier(); 1458 - ret = (cycle_t)vget_cycles(); 1459 - 1460 - last = pvclock_gtod_data.clock.cycle_last; 1447 + cycle_t ret = (cycle_t)rdtsc_ordered(); 1448 + u64 last = pvclock_gtod_data.clock.cycle_last; 1461 1449 1462 1450 if (likely(ret >= last)) 1463 1451 return ret; ··· 1634 1646 return 1; 1635 1647 } 1636 1648 if (!use_master_clock) { 1637 - host_tsc = native_read_tsc(); 1649 + host_tsc = rdtsc(); 1638 1650 kernel_ns = get_kernel_ns(); 1639 1651 } 1640 1652 ··· 2798 2810 2799 2811 if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) { 2800 2812 s64 tsc_delta = !vcpu->arch.last_host_tsc ? 
0 : 2801 - native_read_tsc() - vcpu->arch.last_host_tsc; 2813 + rdtsc() - vcpu->arch.last_host_tsc; 2802 2814 if (tsc_delta < 0) 2803 2815 mark_tsc_unstable("KVM discovered backwards TSC"); 2804 2816 if (check_tsc_unstable()) { ··· 2826 2838 { 2827 2839 kvm_x86_ops->vcpu_put(vcpu); 2828 2840 kvm_put_guest_fpu(vcpu); 2829 - vcpu->arch.last_host_tsc = native_read_tsc(); 2841 + vcpu->arch.last_host_tsc = rdtsc(); 2830 2842 } 2831 2843 2832 2844 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, ··· 6610 6622 hw_breakpoint_restore(); 6611 6623 6612 6624 vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, 6613 - native_read_tsc()); 6625 + rdtsc()); 6614 6626 6615 6627 vcpu->mode = OUTSIDE_GUEST_MODE; 6616 6628 smp_wmb(); ··· 7419 7431 if (ret != 0) 7420 7432 return ret; 7421 7433 7422 - local_tsc = native_read_tsc(); 7434 + local_tsc = rdtsc(); 7423 7435 stable = !check_tsc_unstable(); 7424 7436 list_for_each_entry(kvm, &vm_list, vm_list) { 7425 7437 kvm_for_each_vcpu(i, vcpu, kvm) {
+5 -8
arch/x86/lib/delay.c
··· 49 49 /* TSC based delay: */ 50 50 static void delay_tsc(unsigned long __loops) 51 51 { 52 - u32 bclock, now, loops = __loops; 52 + u64 bclock, now, loops = __loops; 53 53 int cpu; 54 54 55 55 preempt_disable(); 56 56 cpu = smp_processor_id(); 57 - rdtsc_barrier(); 58 - rdtscl(bclock); 57 + bclock = rdtsc_ordered(); 59 58 for (;;) { 60 - rdtsc_barrier(); 61 - rdtscl(now); 59 + now = rdtsc_ordered(); 62 60 if ((now - bclock) >= loops) 63 61 break; 64 62 ··· 77 79 if (unlikely(cpu != smp_processor_id())) { 78 80 loops -= (now - bclock); 79 81 cpu = smp_processor_id(); 80 - rdtsc_barrier(); 81 - rdtscl(bclock); 82 + bclock = rdtsc_ordered(); 82 83 } 83 84 } 84 85 preempt_enable(); ··· 97 100 int read_current_timer(unsigned long *timer_val) 98 101 { 99 102 if (delay_fn == delay_tsc) { 100 - rdtscll(*timer_val); 103 + *timer_val = rdtsc(); 101 104 return 0; 102 105 } 103 106 return -1;
+1
arch/x86/math-emu/get_address.c
··· 21 21 22 22 #include <asm/uaccess.h> 23 23 #include <asm/desc.h> 24 + #include <asm/vm86.h> 24 25 25 26 #include "fpu_system.h" 26 27 #include "exception.h"
+5 -2
arch/x86/mm/fault.c
··· 20 20 #include <asm/kmemcheck.h> /* kmemcheck_*(), ... */ 21 21 #include <asm/fixmap.h> /* VSYSCALL_ADDR */ 22 22 #include <asm/vsyscall.h> /* emulate_vsyscall */ 23 + #include <asm/vm86.h> /* struct vm86 */ 23 24 24 25 #define CREATE_TRACE_POINTS 25 26 #include <asm/trace/exceptions.h> ··· 302 301 check_v8086_mode(struct pt_regs *regs, unsigned long address, 303 302 struct task_struct *tsk) 304 303 { 304 + #ifdef CONFIG_VM86 305 305 unsigned long bit; 306 306 307 - if (!v8086_mode(regs)) 307 + if (!v8086_mode(regs) || !tsk->thread.vm86) 308 308 return; 309 309 310 310 bit = (address - 0xA0000) >> PAGE_SHIFT; 311 311 if (bit < 32) 312 - tsk->thread.screen_bitmap |= 1 << bit; 312 + tsk->thread.vm86->screen_bitmap |= 1 << bit; 313 + #endif 313 314 } 314 315 315 316 static bool low_pfn(unsigned long pfn)
-13
arch/x86/um/asm/barrier.h
··· 45 45 #define read_barrier_depends() do { } while (0) 46 46 #define smp_read_barrier_depends() do { } while (0) 47 47 48 - /* 49 - * Stop RDTSC speculation. This is needed when you need to use RDTSC 50 - * (or get_cycles or vread that possibly accesses the TSC) in a defined 51 - * code region. 52 - * 53 - * (Could use an alternative three way for this if there was one.) 54 - */ 55 - static inline void rdtsc_barrier(void) 56 - { 57 - alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC, 58 - "lfence", X86_FEATURE_LFENCE_RDTSC); 59 - } 60 - 61 48 #endif
-3
arch/x86/xen/enlighten.c
··· 1215 1215 .read_msr = xen_read_msr_safe, 1216 1216 .write_msr = xen_write_msr_safe, 1217 1217 1218 - .read_tsc = native_read_tsc, 1219 1218 .read_pmc = native_read_pmc, 1220 - 1221 - .read_tscp = native_read_tscp, 1222 1219 1223 1220 .iret = xen_iret, 1224 1221 #ifdef CONFIG_X86_64
+1 -1
drivers/cpufreq/intel_pstate.c
··· 766 766 local_irq_save(flags); 767 767 rdmsrl(MSR_IA32_APERF, aperf); 768 768 rdmsrl(MSR_IA32_MPERF, mperf); 769 - tsc = native_read_tsc(); 769 + tsc = rdtsc(); 770 770 local_irq_restore(flags); 771 771 772 772 cpu->last_sample_time = cpu->sample.time;
+2 -2
drivers/input/gameport/gameport.c
··· 149 149 150 150 for(i = 0; i < 50; i++) { 151 151 local_irq_save(flags); 152 - rdtscl(t1); 152 + t1 = rdtsc(); 153 153 for (t = 0; t < 50; t++) gameport_read(gameport); 154 - rdtscl(t2); 154 + t2 = rdtsc(); 155 155 local_irq_restore(flags); 156 156 udelay(i * 10); 157 157 if (t2 - t1 < tx) tx = t2 - t1;
+2 -2
drivers/input/joystick/analog.c
··· 143 143 144 144 #include <linux/i8253.h> 145 145 146 - #define GET_TIME(x) do { if (cpu_has_tsc) rdtscl(x); else x = get_time_pit(); } while (0) 146 + #define GET_TIME(x) do { if (cpu_has_tsc) x = (unsigned int)rdtsc(); else x = get_time_pit(); } while (0) 147 147 #define DELTA(x,y) (cpu_has_tsc ? ((y) - (x)) : ((x) - (y) + ((x) < (y) ? PIT_TICK_RATE / HZ : 0))) 148 148 #define TIME_NAME (cpu_has_tsc?"TSC":"PIT") 149 149 static unsigned int get_time_pit(void) ··· 160 160 return count; 161 161 } 162 162 #elif defined(__x86_64__) 163 - #define GET_TIME(x) rdtscl(x) 163 + #define GET_TIME(x) do { x = (unsigned int)rdtsc(); } while (0) 164 164 #define DELTA(x,y) ((y)-(x)) 165 165 #define TIME_NAME "TSC" 166 166 #elif defined(__alpha__) || defined(CONFIG_MN10300) || defined(CONFIG_ARM) || defined(CONFIG_ARM64) || defined(CONFIG_TILE)
+1 -1
drivers/net/hamradio/baycom_epp.c
··· 638 638 #define GETTICK(x) \ 639 639 ({ \ 640 640 if (cpu_has_tsc) \ 641 - rdtscl(x); \ 641 + x = (unsigned int)rdtsc(); \ 642 642 }) 643 643 #else /* __i386__ */ 644 644 #define GETTICK(x)
+3
drivers/scsi/dpt_i2o.c
··· 1924 1924 #endif 1925 1925 1926 1926 #if defined __i386__ 1927 + 1928 + #include <uapi/asm/vm86.h> 1929 + 1927 1930 static void adpt_i386_info(sysInfo_S* si) 1928 1931 { 1929 1932 // This is all the info we need for now
+4 -59
drivers/staging/media/lirc/lirc_serial.c
··· 327 327 * time 328 328 */ 329 329 330 - /* So send_pulse can quickly convert microseconds to clocks */ 331 - static unsigned long conv_us_to_clocks; 332 - 333 330 static int init_timing_params(unsigned int new_duty_cycle, 334 331 unsigned int new_freq) 335 332 { ··· 341 344 /* How many clocks in a microsecond?, avoiding long long divide */ 342 345 work = loops_per_sec; 343 346 work *= 4295; /* 4295 = 2^32 / 1e6 */ 344 - conv_us_to_clocks = work >> 32; 345 347 346 348 /* 347 349 * Carrier period in clocks, approach good up to 32GHz clock, ··· 353 357 pulse_width = period * duty_cycle / 100; 354 358 space_width = period - pulse_width; 355 359 dprintk("in init_timing_params, freq=%d, duty_cycle=%d, " 356 - "clk/jiffy=%ld, pulse=%ld, space=%ld, " 357 - "conv_us_to_clocks=%ld\n", 360 + "clk/jiffy=%ld, pulse=%ld, space=%ld\n", 358 361 freq, duty_cycle, __this_cpu_read(cpu_info.loops_per_jiffy), 359 - pulse_width, space_width, conv_us_to_clocks); 362 + pulse_width, space_width); 360 363 return 0; 361 364 } 362 365 #else /* ! USE_RDTSC */ ··· 426 431 return ret; 427 432 } 428 433 429 - #ifdef USE_RDTSC 430 - /* Version that uses Pentium rdtsc instruction to measure clocks */ 431 - 432 - /* 433 - * This version does sub-microsecond timing using rdtsc instruction, 434 - * and does away with the fudged LIRC_SERIAL_TRANSMITTER_LATENCY 435 - * Implicitly i586 architecture... - Steve 436 - */ 437 - 438 - static long send_pulse_homebrew_softcarrier(unsigned long length) 439 - { 440 - int flag; 441 - unsigned long target, start, now; 442 - 443 - /* Get going quick as we can */ 444 - rdtscl(start); 445 - on(); 446 - /* Convert length from microseconds to clocks */ 447 - length *= conv_us_to_clocks; 448 - /* And loop till time is up - flipping at right intervals */ 449 - now = start; 450 - target = pulse_width; 451 - flag = 1; 452 - /* 453 - * FIXME: This looks like a hard busy wait, without even an occasional, 454 - * polite, cpu_relax() call. There's got to be a better way? 
455 - * 456 - * The i2c code has the result of a lot of bit-banging work, I wonder if 457 - * there's something there which could be helpful here. 458 - */ 459 - while ((now - start) < length) { 460 - /* Delay till flip time */ 461 - do { 462 - rdtscl(now); 463 - } while ((now - start) < target); 464 - 465 - /* flip */ 466 - if (flag) { 467 - rdtscl(now); 468 - off(); 469 - target += space_width; 470 - } else { 471 - rdtscl(now); on(); 472 - target += pulse_width; 473 - } 474 - flag = !flag; 475 - } 476 - rdtscl(now); 477 - return ((now - start) - length) / conv_us_to_clocks; 478 - } 479 - #else /* ! USE_RDTSC */ 480 434 /* Version using udelay() */ 481 435 482 436 /* 483 437 * here we use fixed point arithmetic, with 8 484 438 * fractional bits. that gets us within 0.1% or so of the right average 485 439 * frequency, albeit with some jitter in pulse length - Steve 440 + * 441 + * This should use ndelay instead. 486 442 */ 487 443 488 444 /* To match 8 fractional bits used for pulse/space length */ ··· 466 520 } 467 521 return (actual-length) >> 8; 468 522 } 469 - #endif /* USE_RDTSC */ 470 523 471 524 static long send_pulse_homebrew(unsigned long length) 472 525 {
+2 -2
drivers/thermal/intel_powerclamp.c
··· 340 340 341 341 /* check result for the last window */ 342 342 msr_now = pkg_state_counter(); 343 - rdtscll(tsc_now); 343 + tsc_now = rdtsc(); 344 344 345 345 /* calculate pkg cstate vs tsc ratio */ 346 346 if (!msr_last || !tsc_last) ··· 482 482 u64 val64; 483 483 484 484 msr_now = pkg_state_counter(); 485 - rdtscll(tsc_now); 485 + tsc_now = rdtsc(); 486 486 jiffies_now = jiffies; 487 487 488 488 /* calculate pkg cstate vs tsc ratio */
+15
include/linux/context_tracking.h
··· 49 49 } 50 50 } 51 51 52 + 53 + /** 54 + * ct_state() - return the current context tracking state if known 55 + * 56 + * Returns the current cpu's context tracking state if context tracking 57 + * is enabled. If context tracking is disabled, returns 58 + * CONTEXT_DISABLED. This should be used primarily for debugging. 59 + */ 60 + static inline enum ctx_state ct_state(void) 61 + { 62 + return context_tracking_is_enabled() ? 63 + this_cpu_read(context_tracking.state) : CONTEXT_DISABLED; 64 + } 52 65 #else 53 66 static inline void user_enter(void) { } 54 67 static inline void user_exit(void) { } 55 68 static inline enum ctx_state exception_enter(void) { return 0; } 56 69 static inline void exception_exit(enum ctx_state prev_ctx) { } 70 + static inline enum ctx_state ct_state(void) { return CONTEXT_DISABLED; } 57 71 #endif /* !CONFIG_CONTEXT_TRACKING */ 58 72 73 + #define CT_WARN_ON(cond) WARN_ON(context_tracking_is_enabled() && (cond)) 59 74 60 75 #ifdef CONFIG_CONTEXT_TRACKING_FORCE 61 76 extern void context_tracking_init(void);
+1
include/linux/context_tracking_state.h
··· 14 14 bool active; 15 15 int recursion; 16 16 enum ctx_state { 17 + CONTEXT_DISABLED = -1, /* returned by ct_state() if unknown */ 17 18 CONTEXT_KERNEL = 0, 18 19 CONTEXT_USER, 19 20 CONTEXT_GUEST,
+15 -15
include/linux/spinlock.h
··· 296 296 * Map the spin_lock functions to the raw variants for PREEMPT_RT=n 297 297 */ 298 298 299 - static inline raw_spinlock_t *spinlock_check(spinlock_t *lock) 299 + static __always_inline raw_spinlock_t *spinlock_check(spinlock_t *lock) 300 300 { 301 301 return &lock->rlock; 302 302 } ··· 307 307 raw_spin_lock_init(&(_lock)->rlock); \ 308 308 } while (0) 309 309 310 - static inline void spin_lock(spinlock_t *lock) 310 + static __always_inline void spin_lock(spinlock_t *lock) 311 311 { 312 312 raw_spin_lock(&lock->rlock); 313 313 } 314 314 315 - static inline void spin_lock_bh(spinlock_t *lock) 315 + static __always_inline void spin_lock_bh(spinlock_t *lock) 316 316 { 317 317 raw_spin_lock_bh(&lock->rlock); 318 318 } 319 319 320 - static inline int spin_trylock(spinlock_t *lock) 320 + static __always_inline int spin_trylock(spinlock_t *lock) 321 321 { 322 322 return raw_spin_trylock(&lock->rlock); 323 323 } ··· 337 337 raw_spin_lock_nest_lock(spinlock_check(lock), nest_lock); \ 338 338 } while (0) 339 339 340 - static inline void spin_lock_irq(spinlock_t *lock) 340 + static __always_inline void spin_lock_irq(spinlock_t *lock) 341 341 { 342 342 raw_spin_lock_irq(&lock->rlock); 343 343 } ··· 352 352 raw_spin_lock_irqsave_nested(spinlock_check(lock), flags, subclass); \ 353 353 } while (0) 354 354 355 - static inline void spin_unlock(spinlock_t *lock) 355 + static __always_inline void spin_unlock(spinlock_t *lock) 356 356 { 357 357 raw_spin_unlock(&lock->rlock); 358 358 } 359 359 360 - static inline void spin_unlock_bh(spinlock_t *lock) 360 + static __always_inline void spin_unlock_bh(spinlock_t *lock) 361 361 { 362 362 raw_spin_unlock_bh(&lock->rlock); 363 363 } 364 364 365 - static inline void spin_unlock_irq(spinlock_t *lock) 365 + static __always_inline void spin_unlock_irq(spinlock_t *lock) 366 366 { 367 367 raw_spin_unlock_irq(&lock->rlock); 368 368 } 369 369 370 - static inline void spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) 370 + 
static __always_inline void spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) 371 371 { 372 372 raw_spin_unlock_irqrestore(&lock->rlock, flags); 373 373 } 374 374 375 - static inline int spin_trylock_bh(spinlock_t *lock) 375 + static __always_inline int spin_trylock_bh(spinlock_t *lock) 376 376 { 377 377 return raw_spin_trylock_bh(&lock->rlock); 378 378 } 379 379 380 - static inline int spin_trylock_irq(spinlock_t *lock) 380 + static __always_inline int spin_trylock_irq(spinlock_t *lock) 381 381 { 382 382 return raw_spin_trylock_irq(&lock->rlock); 383 383 } ··· 387 387 raw_spin_trylock_irqsave(spinlock_check(lock), flags); \ 388 388 }) 389 389 390 - static inline void spin_unlock_wait(spinlock_t *lock) 390 + static __always_inline void spin_unlock_wait(spinlock_t *lock) 391 391 { 392 392 raw_spin_unlock_wait(&lock->rlock); 393 393 } 394 394 395 - static inline int spin_is_locked(spinlock_t *lock) 395 + static __always_inline int spin_is_locked(spinlock_t *lock) 396 396 { 397 397 return raw_spin_is_locked(&lock->rlock); 398 398 } 399 399 400 - static inline int spin_is_contended(spinlock_t *lock) 400 + static __always_inline int spin_is_contended(spinlock_t *lock) 401 401 { 402 402 return raw_spin_is_contended(&lock->rlock); 403 403 } 404 404 405 - static inline int spin_can_lock(spinlock_t *lock) 405 + static __always_inline int spin_can_lock(spinlock_t *lock) 406 406 { 407 407 return raw_spin_can_lock(&lock->rlock); 408 408 }
+2
kernel/notifier.c
··· 544 544 .signr = sig, 545 545 546 546 }; 547 + rcu_lockdep_assert(rcu_is_watching(), 548 + "notify_die called but RCU thinks we're quiescent"); 547 549 return atomic_notifier_call_chain(&die_chain, val, &args); 548 550 } 549 551 NOKPROBE_SYMBOL(notify_die);
+1
kernel/sys_ni.c
··· 140 140 cond_syscall(sys_ssetmask); 141 141 cond_syscall(sys_vm86old); 142 142 cond_syscall(sys_vm86); 143 + cond_syscall(sys_modify_ldt); 143 144 cond_syscall(sys_ipc); 144 145 cond_syscall(compat_sys_ipc); 145 146 cond_syscall(compat_sys_sysctl);
+2 -2
tools/power/cpupower/debug/kernel/cpufreq-test_tsc.c
··· 81 81 82 82 printk(KERN_DEBUG "start--> \n"); 83 83 then = read_pmtmr(); 84 - rdtscll(then_tsc); 84 + then_tsc = rdtsc(); 85 85 for (i=0;i<20;i++) { 86 86 mdelay(100); 87 87 now = read_pmtmr(); 88 - rdtscll(now_tsc); 88 + now_tsc = rdtsc(); 89 89 diff = (now - then) & 0xFFFFFF; 90 90 diff_tsc = now_tsc - then_tsc; 91 91 printk(KERN_DEBUG "t1: %08u t2: %08u diff_pmtmr: %08u diff_tsc: %016llu\n", then, now, diff, diff_tsc);
+2 -2
tools/testing/selftests/x86/Makefile
··· 4 4 5 5 .PHONY: all all_32 all_64 warn_32bit_failure clean 6 6 7 - TARGETS_C_BOTHBITS := sigreturn single_step_syscall sysret_ss_attrs 8 - TARGETS_C_32BIT_ONLY := entry_from_vm86 7 + TARGETS_C_BOTHBITS := sigreturn single_step_syscall sysret_ss_attrs ldt_gdt 8 + TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault 9 9 10 10 TARGETS_C_32BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_32BIT_ONLY) 11 11 BINARIES_32 := $(TARGETS_C_32BIT_ALL:%=%_32)
+128 -9
tools/testing/selftests/x86/entry_from_vm86.c
··· 28 28 static unsigned long load_addr = 0x10000; 29 29 static int nerrs = 0; 30 30 31 + static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), 32 + int flags) 33 + { 34 + struct sigaction sa; 35 + memset(&sa, 0, sizeof(sa)); 36 + sa.sa_sigaction = handler; 37 + sa.sa_flags = SA_SIGINFO | flags; 38 + sigemptyset(&sa.sa_mask); 39 + if (sigaction(sig, &sa, 0)) 40 + err(1, "sigaction"); 41 + } 42 + 43 + static void clearhandler(int sig) 44 + { 45 + struct sigaction sa; 46 + memset(&sa, 0, sizeof(sa)); 47 + sa.sa_handler = SIG_DFL; 48 + sigemptyset(&sa.sa_mask); 49 + if (sigaction(sig, &sa, 0)) 50 + err(1, "sigaction"); 51 + } 52 + 53 + static sig_atomic_t got_signal; 54 + 55 + static void sighandler(int sig, siginfo_t *info, void *ctx_void) 56 + { 57 + ucontext_t *ctx = (ucontext_t*)ctx_void; 58 + 59 + if (ctx->uc_mcontext.gregs[REG_EFL] & X86_EFLAGS_VM || 60 + (ctx->uc_mcontext.gregs[REG_CS] & 3) != 3) { 61 + printf("[FAIL]\tSignal frame should not reflect vm86 mode\n"); 62 + nerrs++; 63 + } 64 + 65 + const char *signame; 66 + if (sig == SIGSEGV) 67 + signame = "SIGSEGV"; 68 + else if (sig == SIGILL) 69 + signame = "SIGILL"; 70 + else 71 + signame = "unexpected signal"; 72 + 73 + printf("[INFO]\t%s: FLAGS = 0x%lx, CS = 0x%hx\n", signame, 74 + (unsigned long)ctx->uc_mcontext.gregs[REG_EFL], 75 + (unsigned short)ctx->uc_mcontext.gregs[REG_CS]); 76 + 77 + got_signal = 1; 78 + } 79 + 31 80 asm ( 32 81 ".pushsection .rodata\n\t" 33 82 ".type vmcode_bound, @object\n\t" ··· 87 38 "int3\n\t" 88 39 "vmcode_sysenter:\n\t" 89 40 "sysenter\n\t" 41 + "vmcode_syscall:\n\t" 42 + "syscall\n\t" 43 + "vmcode_sti:\n\t" 44 + "sti\n\t" 45 + "vmcode_int3:\n\t" 46 + "int3\n\t" 47 + "vmcode_int80:\n\t" 48 + "int $0x80\n\t" 90 49 ".size vmcode, . 
- vmcode\n\t" 91 50 "end_vmcode:\n\t" 92 51 ".code32\n\t" ··· 102 45 ); 103 46 104 47 extern unsigned char vmcode[], end_vmcode[]; 105 - extern unsigned char vmcode_bound[], vmcode_sysenter[]; 48 + extern unsigned char vmcode_bound[], vmcode_sysenter[], vmcode_syscall[], 49 + vmcode_sti[], vmcode_int3[], vmcode_int80[]; 106 50 107 - static void do_test(struct vm86plus_struct *v86, unsigned long eip, 51 + /* Returns false if the test was skipped. */ 52 + static bool do_test(struct vm86plus_struct *v86, unsigned long eip, 53 + unsigned int rettype, unsigned int retarg, 108 54 const char *text) 109 55 { 110 56 long ret; ··· 118 58 119 59 if (ret == -1 && errno == ENOSYS) { 120 60 printf("[SKIP]\tvm86 not supported\n"); 121 - return; 61 + return false; 122 62 } 123 63 124 64 if (VM86_TYPE(ret) == VM86_INTx) { ··· 133 73 else 134 74 sprintf(trapname, "%d", trapno); 135 75 136 - printf("[OK]\tExited vm86 mode due to #%s\n", trapname); 76 + printf("[INFO]\tExited vm86 mode due to #%s\n", trapname); 137 77 } else if (VM86_TYPE(ret) == VM86_UNKNOWN) { 138 - printf("[OK]\tExited vm86 mode due to unhandled GP fault\n"); 78 + printf("[INFO]\tExited vm86 mode due to unhandled GP fault\n"); 79 + } else if (VM86_TYPE(ret) == VM86_TRAP) { 80 + printf("[INFO]\tExited vm86 mode due to a trap (arg=%ld)\n", 81 + VM86_ARG(ret)); 82 + } else if (VM86_TYPE(ret) == VM86_SIGNAL) { 83 + printf("[INFO]\tExited vm86 mode due to a signal\n"); 84 + } else if (VM86_TYPE(ret) == VM86_STI) { 85 + printf("[INFO]\tExited vm86 mode due to STI\n"); 139 86 } else { 140 - printf("[OK]\tExited vm86 mode due to type %ld, arg %ld\n", 87 + printf("[INFO]\tExited vm86 mode due to type %ld, arg %ld\n", 141 88 VM86_TYPE(ret), VM86_ARG(ret)); 142 89 } 90 + 91 + if (rettype == -1 || 92 + (VM86_TYPE(ret) == rettype && VM86_ARG(ret) == retarg)) { 93 + printf("[OK]\tReturned correctly\n"); 94 + } else { 95 + printf("[FAIL]\tIncorrect return reason\n"); 96 + nerrs++; 97 + } 98 + 99 + return true; 143 100 } 144 101 
145 102 int main(void) ··· 182 105 assert((v86.regs.cs & 3) == 0); /* Looks like RPL = 0 */ 183 106 184 107 /* #BR -- should deliver SIG??? */ 185 - do_test(&v86, vmcode_bound - vmcode, "#BR"); 108 + do_test(&v86, vmcode_bound - vmcode, VM86_INTx, 5, "#BR"); 186 109 187 - /* SYSENTER -- should cause #GP or #UD depending on CPU */ 188 - do_test(&v86, vmcode_sysenter - vmcode, "SYSENTER"); 110 + /* 111 + * SYSENTER -- should cause #GP or #UD depending on CPU. 112 + * Expected return type -1 means that we shouldn't validate 113 + * the vm86 return value. This will avoid problems on non-SEP 114 + * CPUs. 115 + */ 116 + sethandler(SIGILL, sighandler, 0); 117 + do_test(&v86, vmcode_sysenter - vmcode, -1, 0, "SYSENTER"); 118 + clearhandler(SIGILL); 119 + 120 + /* 121 + * SYSCALL would be a disaster in VM86 mode. Fortunately, 122 + * there is no kernel that both enables SYSCALL and sets 123 + * EFER.SCE, so it's #UD on all systems. But vm86 is 124 + * buggy (or has a "feature"), so the SIGILL will actually 125 + * be delivered. 
126 + */ 127 + sethandler(SIGILL, sighandler, 0); 128 + do_test(&v86, vmcode_syscall - vmcode, VM86_SIGNAL, 0, "SYSCALL"); 129 + clearhandler(SIGILL); 130 + 131 + /* STI with VIP set */ 132 + v86.regs.eflags |= X86_EFLAGS_VIP; 133 + v86.regs.eflags &= ~X86_EFLAGS_IF; 134 + do_test(&v86, vmcode_sti - vmcode, VM86_STI, 0, "STI with VIP set"); 135 + 136 + /* INT3 -- should cause #BP */ 137 + do_test(&v86, vmcode_int3 - vmcode, VM86_TRAP, 3, "INT3"); 138 + 139 + /* INT80 -- should exit with "INTx 0x80" */ 140 + v86.regs.eax = (unsigned int)-1; 141 + do_test(&v86, vmcode_int80 - vmcode, VM86_INTx, 0x80, "int80"); 142 + 143 + /* Execute a null pointer */ 144 + v86.regs.cs = 0; 145 + v86.regs.ss = 0; 146 + sethandler(SIGSEGV, sighandler, 0); 147 + got_signal = 0; 148 + if (do_test(&v86, 0, VM86_SIGNAL, 0, "Execute null pointer") && 149 + !got_signal) { 150 + printf("[FAIL]\tDid not receive SIGSEGV\n"); 151 + nerrs++; 152 + } 153 + clearhandler(SIGSEGV); 189 154 190 155 return (nerrs == 0 ? 0 : 1); 191 156 }
+576
tools/testing/selftests/x86/ldt_gdt.c
··· 1 + /* 2 + * ldt_gdt.c - Test cases for LDT and GDT access 3 + * Copyright (c) 2015 Andrew Lutomirski 4 + */ 5 + 6 + #define _GNU_SOURCE 7 + #include <err.h> 8 + #include <stdio.h> 9 + #include <stdint.h> 10 + #include <signal.h> 11 + #include <setjmp.h> 12 + #include <stdlib.h> 13 + #include <string.h> 14 + #include <errno.h> 15 + #include <unistd.h> 16 + #include <sys/syscall.h> 17 + #include <asm/ldt.h> 18 + #include <sys/types.h> 19 + #include <sys/wait.h> 20 + #include <stdbool.h> 21 + #include <pthread.h> 22 + #include <sched.h> 23 + #include <linux/futex.h> 24 + 25 + #define AR_ACCESSED (1<<8) 26 + 27 + #define AR_TYPE_RODATA (0 * (1<<9)) 28 + #define AR_TYPE_RWDATA (1 * (1<<9)) 29 + #define AR_TYPE_RODATA_EXPDOWN (2 * (1<<9)) 30 + #define AR_TYPE_RWDATA_EXPDOWN (3 * (1<<9)) 31 + #define AR_TYPE_XOCODE (4 * (1<<9)) 32 + #define AR_TYPE_XRCODE (5 * (1<<9)) 33 + #define AR_TYPE_XOCODE_CONF (6 * (1<<9)) 34 + #define AR_TYPE_XRCODE_CONF (7 * (1<<9)) 35 + 36 + #define AR_DPL3 (3 * (1<<13)) 37 + 38 + #define AR_S (1 << 12) 39 + #define AR_P (1 << 15) 40 + #define AR_AVL (1 << 20) 41 + #define AR_L (1 << 21) 42 + #define AR_DB (1 << 22) 43 + #define AR_G (1 << 23) 44 + 45 + static int nerrs; 46 + 47 + static void check_invalid_segment(uint16_t index, int ldt) 48 + { 49 + uint32_t has_limit = 0, has_ar = 0, limit, ar; 50 + uint32_t selector = (index << 3) | (ldt << 2) | 3; 51 + 52 + asm ("lsl %[selector], %[limit]\n\t" 53 + "jnz 1f\n\t" 54 + "movl $1, %[has_limit]\n\t" 55 + "1:" 56 + : [limit] "=r" (limit), [has_limit] "+rm" (has_limit) 57 + : [selector] "r" (selector)); 58 + asm ("larl %[selector], %[ar]\n\t" 59 + "jnz 1f\n\t" 60 + "movl $1, %[has_ar]\n\t" 61 + "1:" 62 + : [ar] "=r" (ar), [has_ar] "+rm" (has_ar) 63 + : [selector] "r" (selector)); 64 + 65 + if (has_limit || has_ar) { 66 + printf("[FAIL]\t%s entry %hu is valid but should be invalid\n", 67 + (ldt ? 
"LDT" : "GDT"), index); 68 + nerrs++; 69 + } else { 70 + printf("[OK]\t%s entry %hu is invalid\n", 71 + (ldt ? "LDT" : "GDT"), index); 72 + } 73 + } 74 + 75 + static void check_valid_segment(uint16_t index, int ldt, 76 + uint32_t expected_ar, uint32_t expected_limit, 77 + bool verbose) 78 + { 79 + uint32_t has_limit = 0, has_ar = 0, limit, ar; 80 + uint32_t selector = (index << 3) | (ldt << 2) | 3; 81 + 82 + asm ("lsl %[selector], %[limit]\n\t" 83 + "jnz 1f\n\t" 84 + "movl $1, %[has_limit]\n\t" 85 + "1:" 86 + : [limit] "=r" (limit), [has_limit] "+rm" (has_limit) 87 + : [selector] "r" (selector)); 88 + asm ("larl %[selector], %[ar]\n\t" 89 + "jnz 1f\n\t" 90 + "movl $1, %[has_ar]\n\t" 91 + "1:" 92 + : [ar] "=r" (ar), [has_ar] "+rm" (has_ar) 93 + : [selector] "r" (selector)); 94 + 95 + if (!has_limit || !has_ar) { 96 + printf("[FAIL]\t%s entry %hu is invalid but should be valid\n", 97 + (ldt ? "LDT" : "GDT"), index); 98 + nerrs++; 99 + return; 100 + } 101 + 102 + if (ar != expected_ar) { 103 + printf("[FAIL]\t%s entry %hu has AR 0x%08X but expected 0x%08X\n", 104 + (ldt ? "LDT" : "GDT"), index, ar, expected_ar); 105 + nerrs++; 106 + } else if (limit != expected_limit) { 107 + printf("[FAIL]\t%s entry %hu has limit 0x%08X but expected 0x%08X\n", 108 + (ldt ? "LDT" : "GDT"), index, limit, expected_limit); 109 + nerrs++; 110 + } else if (verbose) { 111 + printf("[OK]\t%s entry %hu has AR 0x%08X and limit 0x%08X\n", 112 + (ldt ? "LDT" : "GDT"), index, ar, limit); 113 + } 114 + } 115 + 116 + static bool install_valid_mode(const struct user_desc *desc, uint32_t ar, 117 + bool oldmode) 118 + { 119 + int ret = syscall(SYS_modify_ldt, oldmode ? 
1 : 0x11, 120 + desc, sizeof(*desc)); 121 + if (ret < -1) 122 + errno = -ret; 123 + if (ret == 0) { 124 + uint32_t limit = desc->limit; 125 + if (desc->limit_in_pages) 126 + limit = (limit << 12) + 4095; 127 + check_valid_segment(desc->entry_number, 1, ar, limit, true); 128 + return true; 129 + } else if (errno == ENOSYS) { 130 + printf("[OK]\tmodify_ldt returned -ENOSYS\n"); 131 + return false; 132 + } else { 133 + if (desc->seg_32bit) { 134 + printf("[FAIL]\tUnexpected modify_ldt failure %d\n", 135 + errno); 136 + nerrs++; 137 + return false; 138 + } else { 139 + printf("[OK]\tmodify_ldt rejected 16 bit segment\n"); 140 + return false; 141 + } 142 + } 143 + } 144 + 145 + static bool install_valid(const struct user_desc *desc, uint32_t ar) 146 + { 147 + return install_valid_mode(desc, ar, false); 148 + } 149 + 150 + static void install_invalid(const struct user_desc *desc, bool oldmode) 151 + { 152 + int ret = syscall(SYS_modify_ldt, oldmode ? 1 : 0x11, 153 + desc, sizeof(*desc)); 154 + if (ret < -1) 155 + errno = -ret; 156 + if (ret == 0) { 157 + check_invalid_segment(desc->entry_number, 1); 158 + } else if (errno == ENOSYS) { 159 + printf("[OK]\tmodify_ldt returned -ENOSYS\n"); 160 + } else { 161 + if (desc->seg_32bit) { 162 + printf("[FAIL]\tUnexpected modify_ldt failure %d\n", 163 + errno); 164 + nerrs++; 165 + } else { 166 + printf("[OK]\tmodify_ldt rejected 16 bit segment\n"); 167 + } 168 + } 169 + } 170 + 171 + static int safe_modify_ldt(int func, struct user_desc *ptr, 172 + unsigned long bytecount) 173 + { 174 + int ret = syscall(SYS_modify_ldt, 0x11, ptr, bytecount); 175 + if (ret < -1) 176 + errno = -ret; 177 + return ret; 178 + } 179 + 180 + static void fail_install(struct user_desc *desc) 181 + { 182 + if (safe_modify_ldt(0x11, desc, sizeof(*desc)) == 0) { 183 + printf("[FAIL]\tmodify_ldt accepted a bad descriptor\n"); 184 + nerrs++; 185 + } else if (errno == ENOSYS) { 186 + printf("[OK]\tmodify_ldt returned -ENOSYS\n"); 187 + } else { 188 + 
printf("[OK]\tmodify_ldt failure %d\n", errno); 189 + } 190 + } 191 + 192 + static void do_simple_tests(void) 193 + { 194 + struct user_desc desc = { 195 + .entry_number = 0, 196 + .base_addr = 0, 197 + .limit = 10, 198 + .seg_32bit = 1, 199 + .contents = 2, /* Code, not conforming */ 200 + .read_exec_only = 0, 201 + .limit_in_pages = 0, 202 + .seg_not_present = 0, 203 + .useable = 0 204 + }; 205 + install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE | AR_S | AR_P | AR_DB); 206 + 207 + desc.limit_in_pages = 1; 208 + install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE | 209 + AR_S | AR_P | AR_DB | AR_G); 210 + 211 + check_invalid_segment(1, 1); 212 + 213 + desc.entry_number = 2; 214 + install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE | 215 + AR_S | AR_P | AR_DB | AR_G); 216 + 217 + check_invalid_segment(1, 1); 218 + 219 + desc.base_addr = 0xf0000000; 220 + install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE | 221 + AR_S | AR_P | AR_DB | AR_G); 222 + 223 + desc.useable = 1; 224 + install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE | 225 + AR_S | AR_P | AR_DB | AR_G | AR_AVL); 226 + 227 + desc.seg_not_present = 1; 228 + install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE | 229 + AR_S | AR_DB | AR_G | AR_AVL); 230 + 231 + desc.seg_32bit = 0; 232 + install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE | 233 + AR_S | AR_G | AR_AVL); 234 + 235 + desc.seg_32bit = 1; 236 + desc.contents = 0; 237 + install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA | 238 + AR_S | AR_DB | AR_G | AR_AVL); 239 + 240 + desc.read_exec_only = 1; 241 + install_valid(&desc, AR_DPL3 | AR_TYPE_RODATA | 242 + AR_S | AR_DB | AR_G | AR_AVL); 243 + 244 + desc.contents = 1; 245 + install_valid(&desc, AR_DPL3 | AR_TYPE_RODATA_EXPDOWN | 246 + AR_S | AR_DB | AR_G | AR_AVL); 247 + 248 + desc.read_exec_only = 0; 249 + desc.limit_in_pages = 0; 250 + install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA_EXPDOWN | 251 + AR_S | AR_DB | AR_AVL); 252 + 253 + desc.contents = 3; 254 + install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE_CONF | 255 + AR_S | AR_DB | AR_AVL); 256 + 
257 + desc.read_exec_only = 1; 258 + install_valid(&desc, AR_DPL3 | AR_TYPE_XOCODE_CONF | 259 + AR_S | AR_DB | AR_AVL); 260 + 261 + desc.read_exec_only = 0; 262 + desc.contents = 2; 263 + install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE | 264 + AR_S | AR_DB | AR_AVL); 265 + 266 + desc.read_exec_only = 1; 267 + 268 + #ifdef __x86_64__ 269 + desc.lm = 1; 270 + install_valid(&desc, AR_DPL3 | AR_TYPE_XOCODE | 271 + AR_S | AR_DB | AR_AVL); 272 + desc.lm = 0; 273 + #endif 274 + 275 + bool entry1_okay = install_valid(&desc, AR_DPL3 | AR_TYPE_XOCODE | 276 + AR_S | AR_DB | AR_AVL); 277 + 278 + if (entry1_okay) { 279 + printf("[RUN]\tTest fork\n"); 280 + pid_t child = fork(); 281 + if (child == 0) { 282 + nerrs = 0; 283 + check_valid_segment(desc.entry_number, 1, 284 + AR_DPL3 | AR_TYPE_XOCODE | 285 + AR_S | AR_DB | AR_AVL, desc.limit, 286 + true); 287 + check_invalid_segment(1, 1); 288 + exit(nerrs ? 1 : 0); 289 + } else { 290 + int status; 291 + if (waitpid(child, &status, 0) != child || 292 + !WIFEXITED(status)) { 293 + printf("[FAIL]\tChild died\n"); 294 + nerrs++; 295 + } else if (WEXITSTATUS(status) != 0) { 296 + printf("[FAIL]\tChild failed\n"); 297 + nerrs++; 298 + } else { 299 + printf("[OK]\tChild succeeded\n"); 300 + } 301 + } 302 + 303 + printf("[RUN]\tTest size\n"); 304 + int i; 305 + for (i = 0; i < 8192; i++) { 306 + desc.entry_number = i; 307 + desc.limit = i; 308 + if (safe_modify_ldt(0x11, &desc, sizeof(desc)) != 0) { 309 + printf("[FAIL]\tFailed to install entry %d\n", i); 310 + nerrs++; 311 + break; 312 + } 313 + } 314 + for (int j = 0; j < i; j++) { 315 + check_valid_segment(j, 1, AR_DPL3 | AR_TYPE_XOCODE | 316 + AR_S | AR_DB | AR_AVL, j, false); 317 + } 318 + printf("[DONE]\tSize test\n"); 319 + } else { 320 + printf("[SKIP]\tSkipping fork and size tests because we have no LDT\n"); 321 + } 322 + 323 + /* Test entry_number too high. 
*/ 324 + desc.entry_number = 8192; 325 + fail_install(&desc); 326 + 327 + /* Test deletion and actions mistakeable for deletion. */ 328 + memset(&desc, 0, sizeof(desc)); 329 + install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA | AR_S | AR_P); 330 + 331 + desc.seg_not_present = 1; 332 + install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA | AR_S); 333 + 334 + desc.seg_not_present = 0; 335 + desc.read_exec_only = 1; 336 + install_valid(&desc, AR_DPL3 | AR_TYPE_RODATA | AR_S | AR_P); 337 + 338 + desc.read_exec_only = 0; 339 + desc.seg_not_present = 1; 340 + install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA | AR_S); 341 + 342 + desc.read_exec_only = 1; 343 + desc.limit = 1; 344 + install_valid(&desc, AR_DPL3 | AR_TYPE_RODATA | AR_S); 345 + 346 + desc.limit = 0; 347 + desc.base_addr = 1; 348 + install_valid(&desc, AR_DPL3 | AR_TYPE_RODATA | AR_S); 349 + 350 + desc.base_addr = 0; 351 + install_invalid(&desc, false); 352 + 353 + desc.seg_not_present = 0; 354 + desc.read_exec_only = 0; 355 + desc.seg_32bit = 1; 356 + install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA | AR_S | AR_P | AR_DB); 357 + install_invalid(&desc, true); 358 + } 359 + 360 + /* 361 + * 0: thread is idle 362 + * 1: thread armed 363 + * 2: thread should clear LDT entry 0 364 + * 3: thread should exit 365 + */ 366 + static volatile unsigned int ftx; 367 + 368 + static void *threadproc(void *ctx) 369 + { 370 + cpu_set_t cpuset; 371 + CPU_ZERO(&cpuset); 372 + CPU_SET(1, &cpuset); 373 + if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0) 374 + err(1, "sched_setaffinity to CPU 1"); /* should never fail */ 375 + 376 + while (1) { 377 + syscall(SYS_futex, &ftx, FUTEX_WAIT, 0, NULL, NULL, 0); 378 + while (ftx != 2) { 379 + if (ftx >= 3) 380 + return NULL; 381 + } 382 + 383 + /* clear LDT entry 0 */ 384 + const struct user_desc desc = {}; 385 + if (syscall(SYS_modify_ldt, 1, &desc, sizeof(desc)) != 0) 386 + err(1, "modify_ldt"); 387 + 388 + /* If ftx == 2, set it to zero. If ftx == 100, quit. 
*/ 389 + unsigned int x = -2; 390 + asm volatile ("lock xaddl %[x], %[ftx]" : 391 + [x] "+r" (x), [ftx] "+m" (ftx)); 392 + if (x != 2) 393 + return NULL; 394 + } 395 + } 396 + 397 + static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), 398 + int flags) 399 + { 400 + struct sigaction sa; 401 + memset(&sa, 0, sizeof(sa)); 402 + sa.sa_sigaction = handler; 403 + sa.sa_flags = SA_SIGINFO | flags; 404 + sigemptyset(&sa.sa_mask); 405 + if (sigaction(sig, &sa, 0)) 406 + err(1, "sigaction"); 407 + 408 + } 409 + 410 + static jmp_buf jmpbuf; 411 + 412 + static void sigsegv(int sig, siginfo_t *info, void *ctx_void) 413 + { 414 + siglongjmp(jmpbuf, 1); 415 + } 416 + 417 + static void do_multicpu_tests(void) 418 + { 419 + cpu_set_t cpuset; 420 + pthread_t thread; 421 + int failures = 0, iters = 5, i; 422 + unsigned short orig_ss; 423 + 424 + CPU_ZERO(&cpuset); 425 + CPU_SET(1, &cpuset); 426 + if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0) { 427 + printf("[SKIP]\tCannot set affinity to CPU 1\n"); 428 + return; 429 + } 430 + 431 + CPU_ZERO(&cpuset); 432 + CPU_SET(0, &cpuset); 433 + if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0) { 434 + printf("[SKIP]\tCannot set affinity to CPU 0\n"); 435 + return; 436 + } 437 + 438 + sethandler(SIGSEGV, sigsegv, 0); 439 + #ifdef __i386__ 440 + /* True 32-bit kernels send SIGILL instead of SIGSEGV on IRET faults. */ 441 + sethandler(SIGILL, sigsegv, 0); 442 + #endif 443 + 444 + printf("[RUN]\tCross-CPU LDT invalidation\n"); 445 + 446 + if (pthread_create(&thread, 0, threadproc, 0) != 0) 447 + err(1, "pthread_create"); 448 + 449 + asm volatile ("mov %%ss, %0" : "=rm" (orig_ss)); 450 + 451 + for (i = 0; i < 5; i++) { 452 + if (sigsetjmp(jmpbuf, 1) != 0) 453 + continue; 454 + 455 + /* Make sure the thread is ready after the last test. 
*/ 456 + while (ftx != 0) 457 + ; 458 + 459 + struct user_desc desc = { 460 + .entry_number = 0, 461 + .base_addr = 0, 462 + .limit = 0xfffff, 463 + .seg_32bit = 1, 464 + .contents = 0, /* Data */ 465 + .read_exec_only = 0, 466 + .limit_in_pages = 1, 467 + .seg_not_present = 0, 468 + .useable = 0 469 + }; 470 + 471 + if (safe_modify_ldt(0x11, &desc, sizeof(desc)) != 0) { 472 + if (errno != ENOSYS) 473 + err(1, "modify_ldt"); 474 + printf("[SKIP]\tmodify_ldt unavailable\n"); 475 + break; 476 + } 477 + 478 + /* Arm the thread. */ 479 + ftx = 1; 480 + syscall(SYS_futex, &ftx, FUTEX_WAKE, 0, NULL, NULL, 0); 481 + 482 + asm volatile ("mov %0, %%ss" : : "r" (0x7)); 483 + 484 + /* Go! */ 485 + ftx = 2; 486 + 487 + while (ftx != 0) 488 + ; 489 + 490 + /* 491 + * On success, modify_ldt will segfault us synchronously, 492 + * and we'll escape via siglongjmp. 493 + */ 494 + 495 + failures++; 496 + asm volatile ("mov %0, %%ss" : : "rm" (orig_ss)); 497 + }; 498 + 499 + ftx = 100; /* Kill the thread. */ 500 + syscall(SYS_futex, &ftx, FUTEX_WAKE, 0, NULL, NULL, 0); 501 + 502 + if (pthread_join(thread, NULL) != 0) 503 + err(1, "pthread_join"); 504 + 505 + if (failures) { 506 + printf("[FAIL]\t%d of %d iterations failed\n", failures, iters); 507 + nerrs++; 508 + } else { 509 + printf("[OK]\tAll %d iterations succeeded\n", iters); 510 + } 511 + } 512 + 513 + static int finish_exec_test(void) 514 + { 515 + /* 516 + * In a sensible world, this would be check_invalid_segment(0, 1); 517 + * For better or for worse, though, the LDT is inherited across exec. 518 + * We can probably change this safely, but for now we test it. 519 + */ 520 + check_valid_segment(0, 1, 521 + AR_DPL3 | AR_TYPE_XRCODE | AR_S | AR_P | AR_DB, 522 + 42, true); 523 + 524 + return nerrs ? 
1 : 0; 525 + } 526 + 527 + static void do_exec_test(void) 528 + { 529 + printf("[RUN]\tTest exec\n"); 530 + 531 + struct user_desc desc = { 532 + .entry_number = 0, 533 + .base_addr = 0, 534 + .limit = 42, 535 + .seg_32bit = 1, 536 + .contents = 2, /* Code, not conforming */ 537 + .read_exec_only = 0, 538 + .limit_in_pages = 0, 539 + .seg_not_present = 0, 540 + .useable = 0 541 + }; 542 + install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE | AR_S | AR_P | AR_DB); 543 + 544 + pid_t child = fork(); 545 + if (child == 0) { 546 + execl("/proc/self/exe", "ldt_gdt_test_exec", NULL); 547 + printf("[FAIL]\tCould not exec self\n"); 548 + exit(1); /* exec failed */ 549 + } else { 550 + int status; 551 + if (waitpid(child, &status, 0) != child || 552 + !WIFEXITED(status)) { 553 + printf("[FAIL]\tChild died\n"); 554 + nerrs++; 555 + } else if (WEXITSTATUS(status) != 0) { 556 + printf("[FAIL]\tChild failed\n"); 557 + nerrs++; 558 + } else { 559 + printf("[OK]\tChild succeeded\n"); 560 + } 561 + } 562 + } 563 + 564 + int main(int argc, char **argv) 565 + { 566 + if (argc == 1 && !strcmp(argv[0], "ldt_gdt_test_exec")) 567 + return finish_exec_test(); 568 + 569 + do_simple_tests(); 570 + 571 + do_multicpu_tests(); 572 + 573 + do_exec_test(); 574 + 575 + return nerrs ? 1 : 0; 576 + }
+130
tools/testing/selftests/x86/syscall_arg_fault.c
··· 1 + /* 2 + * syscall_arg_fault.c - tests faults 32-bit fast syscall stack args 3 + * Copyright (c) 2015 Andrew Lutomirski 4 + * 5 + * This program is free software; you can redistribute it and/or modify 6 + * it under the terms and conditions of the GNU General Public License, 7 + * version 2, as published by the Free Software Foundation. 8 + * 9 + * This program is distributed in the hope it will be useful, but 10 + * WITHOUT ANY WARRANTY; without even the implied warranty of 11 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 + * General Public License for more details. 13 + */ 14 + 15 + #define _GNU_SOURCE 16 + 17 + #include <stdlib.h> 18 + #include <stdio.h> 19 + #include <string.h> 20 + #include <sys/signal.h> 21 + #include <sys/ucontext.h> 22 + #include <err.h> 23 + #include <setjmp.h> 24 + #include <errno.h> 25 + 26 + /* Our sigaltstack scratch space. */ 27 + static unsigned char altstack_data[SIGSTKSZ]; 28 + 29 + static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), 30 + int flags) 31 + { 32 + struct sigaction sa; 33 + memset(&sa, 0, sizeof(sa)); 34 + sa.sa_sigaction = handler; 35 + sa.sa_flags = SA_SIGINFO | flags; 36 + sigemptyset(&sa.sa_mask); 37 + if (sigaction(sig, &sa, 0)) 38 + err(1, "sigaction"); 39 + } 40 + 41 + static volatile sig_atomic_t sig_traps; 42 + static sigjmp_buf jmpbuf; 43 + 44 + static volatile sig_atomic_t n_errs; 45 + 46 + static void sigsegv(int sig, siginfo_t *info, void *ctx_void) 47 + { 48 + ucontext_t *ctx = (ucontext_t*)ctx_void; 49 + 50 + if (ctx->uc_mcontext.gregs[REG_EAX] != -EFAULT) { 51 + printf("[FAIL]\tAX had the wrong value: 0x%x\n", 52 + ctx->uc_mcontext.gregs[REG_EAX]); 53 + n_errs++; 54 + } else { 55 + printf("[OK]\tSeems okay\n"); 56 + } 57 + 58 + siglongjmp(jmpbuf, 1); 59 + } 60 + 61 + static void sigill(int sig, siginfo_t *info, void *ctx_void) 62 + { 63 + printf("[SKIP]\tIllegal instruction\n"); 64 + siglongjmp(jmpbuf, 1); 65 + } 66 + 67 + int main() 68 + { 69 + 
stack_t stack = { 70 + .ss_sp = altstack_data, 71 + .ss_size = SIGSTKSZ, 72 + }; 73 + if (sigaltstack(&stack, NULL) != 0) 74 + err(1, "sigaltstack"); 75 + 76 + sethandler(SIGSEGV, sigsegv, SA_ONSTACK); 77 + sethandler(SIGILL, sigill, SA_ONSTACK); 78 + 79 + /* 80 + * Exercise another nasty special case. The 32-bit SYSCALL 81 + * and SYSENTER instructions (even in compat mode) each 82 + * clobber one register. A Linux system call has a syscall 83 + * number and six arguments, and the user stack pointer 84 + * needs to live in some register on return. That means 85 + * that we need eight registers, but SYSCALL and SYSENTER 86 + * only preserve seven registers. As a result, one argument 87 + * ends up on the stack. The stack is user memory, which 88 + * means that the kernel can fail to read it. 89 + * 90 + * The 32-bit fast system calls don't have a defined ABI: 91 + * we're supposed to invoke them through the vDSO. So we'll 92 + * fudge it: we set all regs to invalid pointer values and 93 + * invoke the entry instruction. The return will fail no 94 + * matter what, and we completely lose our program state, 95 + * but we can fix it up with a signal handler. 
96 + */ 97 + 98 + printf("[RUN]\tSYSENTER with invalid state\n"); 99 + if (sigsetjmp(jmpbuf, 1) == 0) { 100 + asm volatile ( 101 + "movl $-1, %%eax\n\t" 102 + "movl $-1, %%ebx\n\t" 103 + "movl $-1, %%ecx\n\t" 104 + "movl $-1, %%edx\n\t" 105 + "movl $-1, %%esi\n\t" 106 + "movl $-1, %%edi\n\t" 107 + "movl $-1, %%ebp\n\t" 108 + "movl $-1, %%esp\n\t" 109 + "sysenter" 110 + : : : "memory", "flags"); 111 + } 112 + 113 + printf("[RUN]\tSYSCALL with invalid state\n"); 114 + if (sigsetjmp(jmpbuf, 1) == 0) { 115 + asm volatile ( 116 + "movl $-1, %%eax\n\t" 117 + "movl $-1, %%ebx\n\t" 118 + "movl $-1, %%ecx\n\t" 119 + "movl $-1, %%edx\n\t" 120 + "movl $-1, %%esi\n\t" 121 + "movl $-1, %%edi\n\t" 122 + "movl $-1, %%ebp\n\t" 123 + "movl $-1, %%esp\n\t" 124 + "syscall\n\t" 125 + "pushl $0" /* make sure we segfault cleanly */ 126 + : : : "memory", "flags"); 127 + } 128 + 129 + return 0; 130 + }