Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'x86-vdso-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip

* 'x86-vdso-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip:
x86-64: Rework vsyscall emulation and add vsyscall= parameter
x86-64: Wire up getcpu syscall
x86: Remove unnecessary compile flag tweaks for vsyscall code
x86-64: Add vsyscall:emulate_vsyscall trace event
x86-64: Add user_64bit_mode paravirt op
x86-64, xen: Enable the vvar mapping
x86-64: Work around gold bug 13023
x86-64: Move the "user" vsyscall segment out of the data segment.
x86-64: Pad vDSO to a page boundary

+194 -115
+21
Documentation/kernel-parameters.txt
··· 2680 2680 vmpoff= [KNL,S390] Perform z/VM CP command after power off. 2681 2681 Format: <command> 2682 2682 2683 + vsyscall= [X86-64] 2684 + Controls the behavior of vsyscalls (i.e. calls to 2685 + fixed addresses of 0xffffffffff600x00 from legacy 2686 + code). Most statically-linked binaries and older 2687 + versions of glibc use these calls. Because these 2688 + functions are at fixed addresses, they make nice 2689 + targets for exploits that can control RIP. 2690 + 2691 + emulate [default] Vsyscalls turn into traps and are 2692 + emulated reasonably safely. 2693 + 2694 + native Vsyscalls are native syscall instructions. 2695 + This is a little bit faster than trapping 2696 + and makes a few dynamic recompilers work 2697 + better than they would in emulation mode. 2698 + It also makes exploits much easier to write. 2699 + 2700 + none Vsyscalls don't work at all. This makes 2701 + them quite hard to use for exploits but 2702 + might break your system. 2703 + 2683 2704 vt.cur_default= [VT] Default cursor shape. 2684 2705 Format: 0xCCBBAA, where AA, BB, and CC are the same as 2685 2706 the parameters of the <Esc>[?A;B;Cc escape sequence;
+2 -2
arch/x86/include/asm/desc.h
··· 27 27 28 28 desc->base2 = (info->base_addr & 0xff000000) >> 24; 29 29 /* 30 - * Don't allow setting of the lm bit. It is useless anyway 31 - * because 64bit system calls require __USER_CS: 30 + * Don't allow setting of the lm bit. It would confuse 31 + * user_64bit_mode and would get overridden by sysret anyway. 32 32 */ 33 33 desc->l = 0; 34 34 }
-4
arch/x86/include/asm/irq_vectors.h
··· 17 17 * Vectors 0 ... 31 : system traps and exceptions - hardcoded events 18 18 * Vectors 32 ... 127 : device interrupts 19 19 * Vector 128 : legacy int80 syscall interface 20 - * Vector 204 : legacy x86_64 vsyscall emulation 21 20 * Vectors 129 ... INVALIDATE_TLB_VECTOR_START-1 except 204 : device interrupts 22 21 * Vectors INVALIDATE_TLB_VECTOR_START ... 255 : special interrupts 23 22 * ··· 49 50 #define IA32_SYSCALL_VECTOR 0x80 50 51 #ifdef CONFIG_X86_32 51 52 # define SYSCALL_VECTOR 0x80 52 - #endif 53 - #ifdef CONFIG_X86_64 54 - # define VSYSCALL_EMU_VECTOR 0xcc 55 53 #endif 56 54 57 55 /*
+6
arch/x86/include/asm/paravirt_types.h
··· 41 41 42 42 #include <asm/desc_defs.h> 43 43 #include <asm/kmap_types.h> 44 + #include <asm/pgtable_types.h> 44 45 45 46 struct page; 46 47 struct thread_struct; ··· 64 63 struct pv_info { 65 64 unsigned int kernel_rpl; 66 65 int shared_kernel_pmd; 66 + 67 + #ifdef CONFIG_X86_64 68 + u16 extra_user_64bit_cs; /* __USER_CS if none */ 69 + #endif 70 + 67 71 int paravirt_enabled; 68 72 const char *name; 69 73 };
+19
arch/x86/include/asm/ptrace.h
··· 131 131 #ifdef __KERNEL__ 132 132 133 133 #include <linux/init.h> 134 + #ifdef CONFIG_PARAVIRT 135 + #include <asm/paravirt_types.h> 136 + #endif 134 137 135 138 struct cpuinfo_x86; 136 139 struct task_struct; ··· 189 186 return 0; /* No V86 mode support in long mode */ 190 187 #endif 191 188 } 189 + 190 + #ifdef CONFIG_X86_64 191 + static inline bool user_64bit_mode(struct pt_regs *regs) 192 + { 193 + #ifndef CONFIG_PARAVIRT 194 + /* 195 + * On non-paravirt systems, this is the only long mode CPL 3 196 + * selector. We do not allow long mode selectors in the LDT. 197 + */ 198 + return regs->cs == __USER_CS; 199 + #else 200 + /* Headers are too twisted for this to go in paravirt.h. */ 201 + return regs->cs == __USER_CS || regs->cs == pv_info.extra_user_64bit_cs; 202 + #endif 203 + } 204 + #endif 192 205 193 206 /* 194 207 * X86_32 CPUs don't save ss and esp if the CPU is already in kernel mode
-2
arch/x86/include/asm/traps.h
··· 40 40 asmlinkage void machine_check(void); 41 41 #endif /* CONFIG_X86_MCE */ 42 42 asmlinkage void simd_coprocessor_error(void); 43 - asmlinkage void emulate_vsyscall(void); 44 43 45 44 dotraplinkage void do_divide_error(struct pt_regs *, long); 46 45 dotraplinkage void do_debug(struct pt_regs *, long); ··· 66 67 dotraplinkage void do_machine_check(struct pt_regs *, long); 67 68 #endif 68 69 dotraplinkage void do_simd_coprocessor_error(struct pt_regs *, long); 69 - dotraplinkage void do_emulate_vsyscall(struct pt_regs *, long); 70 70 #ifdef CONFIG_X86_32 71 71 dotraplinkage void do_iret_error(struct pt_regs *, long); 72 72 #endif
+2
arch/x86/include/asm/unistd_64.h
··· 681 681 __SYSCALL(__NR_sendmmsg, sys_sendmmsg) 682 682 #define __NR_setns 308 683 683 __SYSCALL(__NR_setns, sys_setns) 684 + #define __NR_getcpu 309 685 + __SYSCALL(__NR_getcpu, sys_getcpu) 684 686 685 687 #ifndef __NO_STUBS 686 688 #define __ARCH_WANT_OLD_READDIR
+6
arch/x86/include/asm/vsyscall.h
··· 27 27 28 28 extern void map_vsyscall(void); 29 29 30 + /* 31 + * Called on instruction fetch fault in vsyscall page. 32 + * Returns true if handled. 33 + */ 34 + extern bool emulate_vsyscall(struct pt_regs *regs, unsigned long address); 35 + 30 36 #endif /* __KERNEL__ */ 31 37 32 38 #endif /* _ASM_X86_VSYSCALL_H */
-13
arch/x86/kernel/Makefile
··· 17 17 CFLAGS_REMOVE_early_printk.o = -pg 18 18 endif 19 19 20 - # 21 - # vsyscalls (which work on the user stack) should have 22 - # no stack-protector checks: 23 - # 24 - nostackp := $(call cc-option, -fno-stack-protector) 25 - CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp) 26 - CFLAGS_hpet.o := $(nostackp) 27 - CFLAGS_paravirt.o := $(nostackp) 28 - GCOV_PROFILE_vsyscall_64.o := n 29 - GCOV_PROFILE_hpet.o := n 30 - GCOV_PROFILE_tsc.o := n 31 - GCOV_PROFILE_paravirt.o := n 32 - 33 20 obj-y := process_$(BITS).o signal.o entry_$(BITS).o 34 21 obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o 35 22 obj-y += time.o ioport.o ldt.o dumpstack.o
-1
arch/x86/kernel/entry_64.S
··· 1111 1111 zeroentry coprocessor_error do_coprocessor_error 1112 1112 errorentry alignment_check do_alignment_check 1113 1113 zeroentry simd_coprocessor_error do_simd_coprocessor_error 1114 - zeroentry emulate_vsyscall do_emulate_vsyscall 1115 1114 1116 1115 1117 1116 /* Reload gs selector with exception handling */
+4
arch/x86/kernel/paravirt.c
··· 307 307 .paravirt_enabled = 0, 308 308 .kernel_rpl = 0, 309 309 .shared_kernel_pmd = 1, /* Only used when CONFIG_X86_PAE is set */ 310 + 311 + #ifdef CONFIG_X86_64 312 + .extra_user_64bit_cs = __USER_CS, 313 + #endif 310 314 }; 311 315 312 316 struct pv_init_ops pv_init_ops = {
+1 -1
arch/x86/kernel/step.c
··· 74 74 75 75 #ifdef CONFIG_X86_64 76 76 case 0x40 ... 0x4f: 77 - if (regs->cs != __USER_CS) 77 + if (!user_64bit_mode(regs)) 78 78 /* 32-bit mode: register increment */ 79 79 return 0; 80 80 /* 64-bit mode: REX prefix */
-6
arch/x86/kernel/traps.c
··· 872 872 set_bit(SYSCALL_VECTOR, used_vectors); 873 873 #endif 874 874 875 - #ifdef CONFIG_X86_64 876 - BUG_ON(test_bit(VSYSCALL_EMU_VECTOR, used_vectors)); 877 - set_system_intr_gate(VSYSCALL_EMU_VECTOR, &emulate_vsyscall); 878 - set_bit(VSYSCALL_EMU_VECTOR, used_vectors); 879 - #endif 880 - 881 875 /* 882 876 * Should be a barrier for any external CPU state: 883 877 */
+6 -35
arch/x86/kernel/vmlinux.lds.S
··· 71 71 text PT_LOAD FLAGS(5); /* R_E */ 72 72 data PT_LOAD FLAGS(6); /* RW_ */ 73 73 #ifdef CONFIG_X86_64 74 - user PT_LOAD FLAGS(5); /* R_E */ 75 74 #ifdef CONFIG_SMP 76 75 percpu PT_LOAD FLAGS(6); /* RW_ */ 77 76 #endif ··· 153 154 154 155 #ifdef CONFIG_X86_64 155 156 156 - #define VSYSCALL_ADDR (-10*1024*1024) 157 - 158 - #define VLOAD_OFFSET (VSYSCALL_ADDR - __vsyscall_0 + LOAD_OFFSET) 159 - #define VLOAD(x) (ADDR(x) - VLOAD_OFFSET) 160 - 161 - #define VVIRT_OFFSET (VSYSCALL_ADDR - __vsyscall_0) 162 - #define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) 163 - 164 - . = ALIGN(4096); 165 - __vsyscall_0 = .; 166 - 167 - . = VSYSCALL_ADDR; 168 - .vsyscall : AT(VLOAD(.vsyscall)) { 169 - *(.vsyscall_0) 170 - 171 - . = 1024; 172 - *(.vsyscall_1) 173 - 174 - . = 2048; 175 - *(.vsyscall_2) 176 - 177 - . = 4096; /* Pad the whole page. */ 178 - } :user =0xcc 179 - . = ALIGN(__vsyscall_0 + PAGE_SIZE, PAGE_SIZE); 180 - 181 - #undef VSYSCALL_ADDR 182 - #undef VLOAD_OFFSET 183 - #undef VLOAD 184 - #undef VVIRT_OFFSET 185 - #undef VVIRT 186 - 157 + . = ALIGN(PAGE_SIZE); 187 158 __vvar_page = .; 188 159 189 160 .vvar : AT(ADDR(.vvar) - LOAD_OFFSET) { 161 + /* work around gold bug 13023 */ 162 + __vvar_beginning_hack = .; 190 163 191 - /* Place all vvars at the offsets in asm/vvar.h. */ 192 - #define EMIT_VVAR(name, offset) \ 193 - . = offset; \ 164 + /* Place all vvars at the offsets in asm/vvar.h. */ 165 + #define EMIT_VVAR(name, offset) \ 166 + . = __vvar_beginning_hack + offset; \ 194 167 *(.vvar_ ## name) 195 168 #define __VVAR_KERNEL_LDS 196 169 #include <asm/vvar.h>
+54 -36
arch/x86/kernel/vsyscall_64.c
··· 18 18 * use the vDSO. 19 19 */ 20 20 21 - /* Disable profiling for userspace code: */ 22 - #define DISABLE_BRANCH_PROFILING 23 - 24 21 #include <linux/time.h> 25 22 #include <linux/init.h> 26 23 #include <linux/kernel.h> ··· 47 50 #include <asm/vgtod.h> 48 51 #include <asm/traps.h> 49 52 53 + #define CREATE_TRACE_POINTS 54 + #include "vsyscall_trace.h" 55 + 50 56 DEFINE_VVAR(int, vgetcpu_mode); 51 57 DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) = 52 58 { 53 59 .lock = __SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock), 54 60 }; 61 + 62 + static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE; 63 + 64 + static int __init vsyscall_setup(char *str) 65 + { 66 + if (str) { 67 + if (!strcmp("emulate", str)) 68 + vsyscall_mode = EMULATE; 69 + else if (!strcmp("native", str)) 70 + vsyscall_mode = NATIVE; 71 + else if (!strcmp("none", str)) 72 + vsyscall_mode = NONE; 73 + else 74 + return -EINVAL; 75 + 76 + return 0; 77 + } 78 + 79 + return -EINVAL; 80 + } 81 + early_param("vsyscall", vsyscall_setup); 55 82 56 83 void update_vsyscall_tz(void) 57 84 { ··· 121 100 122 101 printk("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n", 123 102 level, tsk->comm, task_pid_nr(tsk), 124 - message, regs->ip - 2, regs->cs, 103 + message, regs->ip, regs->cs, 125 104 regs->sp, regs->ax, regs->si, regs->di); 126 105 } 127 106 ··· 139 118 return nr; 140 119 } 141 120 142 - void dotraplinkage do_emulate_vsyscall(struct pt_regs *regs, long error_code) 121 + bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) 143 122 { 144 123 struct task_struct *tsk; 145 124 unsigned long caller; 146 125 int vsyscall_nr; 147 126 long ret; 148 127 149 - local_irq_enable(); 150 - 151 128 /* 152 - * Real 64-bit user mode code has cs == __USER_CS. Anything else 153 - * is bogus. 129 + * No point in checking CS -- the only way to get here is a user mode 130 + * trap to a high address, which means that we're in 64-bit user code. 
154 131 */ 155 - if (regs->cs != __USER_CS) { 156 - /* 157 - * If we trapped from kernel mode, we might as well OOPS now 158 - * instead of returning to some random address and OOPSing 159 - * then. 160 - */ 161 - BUG_ON(!user_mode(regs)); 162 132 163 - /* Compat mode and non-compat 32-bit CS should both segfault. */ 164 - warn_bad_vsyscall(KERN_WARNING, regs, 165 - "illegal int 0xcc from 32-bit mode"); 166 - goto sigsegv; 133 + WARN_ON_ONCE(address != regs->ip); 134 + 135 + if (vsyscall_mode == NONE) { 136 + warn_bad_vsyscall(KERN_INFO, regs, 137 + "vsyscall attempted with vsyscall=none"); 138 + return false; 167 139 } 168 140 169 - /* 170 - * x86-ism here: regs->ip points to the instruction after the int 0xcc, 171 - * and int 0xcc is two bytes long. 172 - */ 173 - vsyscall_nr = addr_to_vsyscall_nr(regs->ip - 2); 141 + vsyscall_nr = addr_to_vsyscall_nr(address); 142 + 143 + trace_emulate_vsyscall(vsyscall_nr); 144 + 174 145 if (vsyscall_nr < 0) { 175 146 warn_bad_vsyscall(KERN_WARNING, regs, 176 - "illegal int 0xcc (exploit attempt?)"); 147 + "misaligned vsyscall (exploit attempt or buggy program) -- look up the vsyscall kernel parameter if you need a workaround"); 177 148 goto sigsegv; 178 149 } 179 150 180 151 if (get_user(caller, (unsigned long __user *)regs->sp) != 0) { 181 - warn_bad_vsyscall(KERN_WARNING, regs, "int 0xcc with bad stack (exploit attempt?)"); 152 + warn_bad_vsyscall(KERN_WARNING, regs, 153 + "vsyscall with bad stack (exploit attempt?)"); 182 154 goto sigsegv; 183 155 } 184 156 ··· 216 202 regs->ip = caller; 217 203 regs->sp += 8; 218 204 219 - local_irq_disable(); 220 - return; 205 + return true; 221 206 222 207 sigsegv: 223 - regs->ip -= 2; /* The faulting instruction should be the int 0xcc. 
*/ 224 208 force_sig(SIGSEGV, current); 225 - local_irq_disable(); 209 + return true; 226 210 } 227 211 228 212 /* ··· 268 256 269 257 void __init map_vsyscall(void) 270 258 { 271 - extern char __vsyscall_0; 272 - unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0); 259 + extern char __vsyscall_page; 260 + unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page); 273 261 extern char __vvar_page; 274 262 unsigned long physaddr_vvar_page = __pa_symbol(&__vvar_page); 275 263 276 - /* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */ 277 - __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL); 264 + __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_vsyscall, 265 + vsyscall_mode == NATIVE 266 + ? PAGE_KERNEL_VSYSCALL 267 + : PAGE_KERNEL_VVAR); 268 + BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_FIRST_PAGE) != 269 + (unsigned long)VSYSCALL_START); 270 + 278 271 __set_fixmap(VVAR_PAGE, physaddr_vvar_page, PAGE_KERNEL_VVAR); 279 - BUILD_BUG_ON((unsigned long)__fix_to_virt(VVAR_PAGE) != (unsigned long)VVAR_ADDRESS); 272 + BUILD_BUG_ON((unsigned long)__fix_to_virt(VVAR_PAGE) != 273 + (unsigned long)VVAR_ADDRESS); 280 274 } 281 275 282 276 static int __init vsyscall_init(void)
+23 -13
arch/x86/kernel/vsyscall_emu_64.S
··· 7 7 */ 8 8 9 9 #include <linux/linkage.h> 10 + 10 11 #include <asm/irq_vectors.h> 12 + #include <asm/page_types.h> 13 + #include <asm/unistd_64.h> 11 14 12 - /* The unused parts of the page are filled with 0xcc by the linker script. */ 15 + __PAGE_ALIGNED_DATA 16 + .globl __vsyscall_page 17 + .balign PAGE_SIZE, 0xcc 18 + .type __vsyscall_page, @object 19 + __vsyscall_page: 13 20 14 - .section .vsyscall_0, "a" 15 - ENTRY(vsyscall_0) 16 - int $VSYSCALL_EMU_VECTOR 17 - END(vsyscall_0) 21 + mov $__NR_gettimeofday, %rax 22 + syscall 23 + ret 18 24 19 - .section .vsyscall_1, "a" 20 - ENTRY(vsyscall_1) 21 - int $VSYSCALL_EMU_VECTOR 22 - END(vsyscall_1) 25 + .balign 1024, 0xcc 26 + mov $__NR_time, %rax 27 + syscall 28 + ret 23 29 24 - .section .vsyscall_2, "a" 25 - ENTRY(vsyscall_2) 26 - int $VSYSCALL_EMU_VECTOR 27 - END(vsyscall_2) 30 + .balign 1024, 0xcc 31 + mov $__NR_getcpu, %rax 32 + syscall 33 + ret 34 + 35 + .balign 4096, 0xcc 36 + 37 + .size __vsyscall_page, 4096
+29
arch/x86/kernel/vsyscall_trace.h
··· 1 + #undef TRACE_SYSTEM 2 + #define TRACE_SYSTEM vsyscall 3 + 4 + #if !defined(__VSYSCALL_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) 5 + #define __VSYSCALL_TRACE_H 6 + 7 + #include <linux/tracepoint.h> 8 + 9 + TRACE_EVENT(emulate_vsyscall, 10 + 11 + TP_PROTO(int nr), 12 + 13 + TP_ARGS(nr), 14 + 15 + TP_STRUCT__entry(__field(int, nr)), 16 + 17 + TP_fast_assign( 18 + __entry->nr = nr; 19 + ), 20 + 21 + TP_printk("nr = %d", __entry->nr) 22 + ); 23 + 24 + #endif 25 + 26 + #undef TRACE_INCLUDE_PATH 27 + #define TRACE_INCLUDE_PATH ../../arch/x86/kernel 28 + #define TRACE_INCLUDE_FILE vsyscall_trace 29 + #include <trace/define_trace.h>
+13 -1
arch/x86/mm/fault.c
··· 105 105 * but for now it's good enough to assume that long 106 106 * mode only uses well known segments or kernel. 107 107 */ 108 - return (!user_mode(regs)) || (regs->cs == __USER_CS); 108 + return (!user_mode(regs) || user_64bit_mode(regs)); 109 109 #endif 110 110 case 0x60: 111 111 /* 0x64 thru 0x67 are valid prefixes in all modes. */ ··· 719 719 720 720 if (is_errata100(regs, address)) 721 721 return; 722 + 723 + #ifdef CONFIG_X86_64 724 + /* 725 + * Instruction fetch faults in the vsyscall page might need 726 + * emulation. 727 + */ 728 + if (unlikely((error_code & PF_INSTR) && 729 + ((address & ~0xfff) == VSYSCALL_START))) { 730 + if (emulate_vsyscall(regs, address)) 731 + return; 732 + } 733 + #endif 722 734 723 735 if (unlikely(show_unhandled_signals)) 724 736 show_signal_msg(regs, error_code, address, tsk);
+1
arch/x86/vdso/vdso.S
··· 9 9 vdso_start: 10 10 .incbin "arch/x86/vdso/vdso.so" 11 11 vdso_end: 12 + .align PAGE_SIZE /* extra data here leaks to userspace. */ 12 13 13 14 .previous 14 15
+4
arch/x86/xen/enlighten.c
··· 951 951 .paravirt_enabled = 1, 952 952 .shared_kernel_pmd = 0, 953 953 954 + #ifdef CONFIG_X86_64 955 + .extra_user_64bit_cs = FLAT_USER_CS64, 956 + #endif 957 + 954 958 .name = "Xen", 955 959 }; 956 960
+3 -1
arch/x86/xen/mmu.c
··· 1916 1916 # endif 1917 1917 #else 1918 1918 case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE: 1919 + case VVAR_PAGE: 1919 1920 #endif 1920 1921 case FIX_TEXT_POKE0: 1921 1922 case FIX_TEXT_POKE1: ··· 1957 1956 #ifdef CONFIG_X86_64 1958 1957 /* Replicate changes to map the vsyscall page into the user 1959 1958 pagetable vsyscall mapping. */ 1960 - if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) { 1959 + if ((idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) || 1960 + idx == VVAR_PAGE) { 1961 1961 unsigned long vaddr = __fix_to_virt(idx); 1962 1962 set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte); 1963 1963 }