Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

x86-64: Emulate legacy vsyscalls

There's a fair amount of code in the vsyscall page. It contains
a syscall instruction (in the gettimeofday fallback) and who
knows what will happen if an exploit jumps into the middle of
some other code.

Reduce the risk by replacing the vsyscalls with short magic
incantations that cause the kernel to emulate the real
vsyscalls. These incantations are useless if entered in the
middle.

This causes vsyscalls to be a little more expensive than real
syscalls. Fortunately sensible programs don't use them.
The only exception is time() which is still called by glibc
through the vsyscall - but calling time() millions of times
per second is not sensible. glibc has this fixed in the
development tree.

This patch is not perfect: the vread_tsc and vread_hpet
functions are still at a fixed address. Fixing that might
involve making alternative patching work in the vDSO.

Signed-off-by: Andy Lutomirski <luto@mit.edu>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Jesper Juhl <jj@chaosbits.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Arjan van de Ven <arjan@infradead.org>
Cc: Jan Beulich <JBeulich@novell.com>
Cc: richard -rw- weinberger <richard.weinberger@gmail.com>
Cc: Mikael Pettersson <mikpe@it.uu.se>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Louis Rilling <Louis.Rilling@kerlabs.com>
Cc: Valdis.Kletnieks@vt.edu
Cc: pageexec@freemail.hu
Link: http://lkml.kernel.org/r/e64e1b3c64858820d12c48fa739efbd1485e79d5.1307292171.git.luto@mit.edu
[ Removed the CONFIG option - it's simpler to just do it unconditionally. Tidied up the code as well. ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>

Authored by Andy Lutomirski, committed by Ingo Molnar
5cec93c2 5dfcea62

+203 -154
+5 -1
arch/x86/include/asm/irq_vectors.h
··· 17 17 * Vectors 0 ... 31 : system traps and exceptions - hardcoded events 18 18 * Vectors 32 ... 127 : device interrupts 19 19 * Vector 128 : legacy int80 syscall interface 20 - * Vectors 129 ... INVALIDATE_TLB_VECTOR_START-1 : device interrupts 20 + * Vector 204 : legacy x86_64 vsyscall emulation 21 + * Vectors 129 ... INVALIDATE_TLB_VECTOR_START-1 except 204 : device interrupts 21 22 * Vectors INVALIDATE_TLB_VECTOR_START ... 255 : special interrupts 22 23 * 23 24 * 64-bit x86 has per CPU IDT tables, 32-bit has one shared IDT table. ··· 50 49 #define IA32_SYSCALL_VECTOR 0x80 51 50 #ifdef CONFIG_X86_32 52 51 # define SYSCALL_VECTOR 0x80 52 + #endif 53 + #ifdef CONFIG_X86_64 54 + # define VSYSCALL_EMU_VECTOR 0xcc 53 55 #endif 54 56 55 57 /*
+4
arch/x86/include/asm/traps.h
··· 1 1 #ifndef _ASM_X86_TRAPS_H 2 2 #define _ASM_X86_TRAPS_H 3 3 4 + #include <linux/kprobes.h> 5 + 4 6 #include <asm/debugreg.h> 5 7 #include <asm/siginfo.h> /* TRAP_TRACE, ... */ 6 8 ··· 40 38 asmlinkage void machine_check(void); 41 39 #endif /* CONFIG_X86_MCE */ 42 40 asmlinkage void simd_coprocessor_error(void); 41 + asmlinkage void emulate_vsyscall(void); 43 42 44 43 dotraplinkage void do_divide_error(struct pt_regs *, long); 45 44 dotraplinkage void do_debug(struct pt_regs *, long); ··· 67 64 dotraplinkage void do_machine_check(struct pt_regs *, long); 68 65 #endif 69 66 dotraplinkage void do_simd_coprocessor_error(struct pt_regs *, long); 67 + dotraplinkage void do_emulate_vsyscall(struct pt_regs *, long); 70 68 #ifdef CONFIG_X86_32 71 69 dotraplinkage void do_iret_error(struct pt_regs *, long); 72 70 #endif
+12
arch/x86/include/asm/vsyscall.h
··· 31 31 32 32 extern void map_vsyscall(void); 33 33 34 + /* Emulation */ 35 + 36 + static inline bool is_vsyscall_entry(unsigned long addr) 37 + { 38 + return (addr & ~0xC00UL) == VSYSCALL_START; 39 + } 40 + 41 + static inline int vsyscall_entry_nr(unsigned long addr) 42 + { 43 + return (addr & 0xC00UL) >> 10; 44 + } 45 + 34 46 #endif /* __KERNEL__ */ 35 47 36 48 #endif /* _ASM_X86_VSYSCALL_H */
+1
arch/x86/kernel/Makefile
··· 44 44 obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o 45 45 obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o 46 46 obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o vread_tsc_64.o 47 + obj-$(CONFIG_X86_64) += vsyscall_emu_64.o 47 48 obj-y += bootflag.o e820.o 48 49 obj-y += pci-dma.o quirks.o topology.o kdebugfs.o 49 50 obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o
+2
arch/x86/kernel/entry_64.S
··· 1123 1123 zeroentry coprocessor_error do_coprocessor_error 1124 1124 errorentry alignment_check do_alignment_check 1125 1125 zeroentry simd_coprocessor_error do_simd_coprocessor_error 1126 + zeroentry emulate_vsyscall do_emulate_vsyscall 1127 + 1126 1128 1127 1129 /* Reload gs selector with exception handling */ 1128 1130 /* edi: new selector */
+6
arch/x86/kernel/traps.c
··· 872 872 set_bit(SYSCALL_VECTOR, used_vectors); 873 873 #endif 874 874 875 + #ifdef CONFIG_X86_64 876 + BUG_ON(test_bit(VSYSCALL_EMU_VECTOR, used_vectors)); 877 + set_system_intr_gate(VSYSCALL_EMU_VECTOR, &emulate_vsyscall); 878 + set_bit(VSYSCALL_EMU_VECTOR, used_vectors); 879 + #endif 880 + 875 881 /* 876 882 * Should be a barrier for any external CPU state: 877 883 */
+136 -153
arch/x86/kernel/vsyscall_64.c
··· 2 2 * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE 3 3 * Copyright 2003 Andi Kleen, SuSE Labs. 4 4 * 5 + * [ NOTE: this mechanism is now deprecated in favor of the vDSO. ] 6 + * 5 7 * Thanks to hpa@transmeta.com for some useful hint. 6 8 * Special thanks to Ingo Molnar for his early experience with 7 9 * a different vsyscall implementation for Linux/IA32 and for the name. ··· 13 11 * vsyscalls. One vsyscall can reserve more than 1 slot to avoid 14 12 * jumping out of line if necessary. We cannot add more with this 15 13 * mechanism because older kernels won't return -ENOSYS. 16 - * If we want more than four we need a vDSO. 17 14 * 18 - * Note: the concept clashes with user mode linux. If you use UML and 19 - * want per guest time just set the kernel.vsyscall64 sysctl to 0. 15 + * Note: the concept clashes with user mode linux. UML users should 16 + * use the vDSO. 20 17 */ 21 18 22 19 /* Disable profiling for userspace code: */ ··· 33 32 #include <linux/cpu.h> 34 33 #include <linux/smp.h> 35 34 #include <linux/notifier.h> 35 + #include <linux/syscalls.h> 36 + #include <linux/ratelimit.h> 36 37 37 38 #include <asm/vsyscall.h> 38 39 #include <asm/pgtable.h> ··· 47 44 #include <asm/desc.h> 48 45 #include <asm/topology.h> 49 46 #include <asm/vgtod.h> 50 - 51 - #define __vsyscall(nr) \ 52 - __attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace 53 - #define __syscall_clobber "r11","cx","memory" 47 + #include <asm/traps.h> 54 48 55 49 DEFINE_VVAR(int, vgetcpu_mode); 56 50 DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) = ··· 71 71 unsigned long flags; 72 72 73 73 write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags); 74 + 74 75 /* copy vsyscall data */ 75 - vsyscall_gtod_data.clock.vread = clock->vread; 76 - vsyscall_gtod_data.clock.cycle_last = clock->cycle_last; 77 - vsyscall_gtod_data.clock.mask = clock->mask; 78 - vsyscall_gtod_data.clock.mult = mult; 79 - vsyscall_gtod_data.clock.shift = clock->shift; 80 - 
vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; 81 - vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; 82 - vsyscall_gtod_data.wall_to_monotonic = *wtm; 83 - vsyscall_gtod_data.wall_time_coarse = __current_kernel_time(); 76 + vsyscall_gtod_data.clock.vread = clock->vread; 77 + vsyscall_gtod_data.clock.cycle_last = clock->cycle_last; 78 + vsyscall_gtod_data.clock.mask = clock->mask; 79 + vsyscall_gtod_data.clock.mult = mult; 80 + vsyscall_gtod_data.clock.shift = clock->shift; 81 + vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; 82 + vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; 83 + vsyscall_gtod_data.wall_to_monotonic = *wtm; 84 + vsyscall_gtod_data.wall_time_coarse = __current_kernel_time(); 85 + 84 86 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); 85 87 } 86 88 87 - /* RED-PEN may want to readd seq locking, but then the variable should be 88 - * write-once. 89 + static void warn_bad_vsyscall(const char *level, struct pt_regs *regs, 90 + const char *message) 91 + { 92 + static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); 93 + struct task_struct *tsk; 94 + 95 + if (!show_unhandled_signals || !__ratelimit(&rs)) 96 + return; 97 + 98 + tsk = current; 99 + 100 + printk("%s%s[%d] %s ip:%lx sp:%lx ax:%lx si:%lx di:%lx\n", 101 + level, tsk->comm, task_pid_nr(tsk), 102 + message, regs->ip - 2, regs->sp, regs->ax, regs->si, regs->di); 103 + } 104 + 105 + void dotraplinkage do_emulate_vsyscall(struct pt_regs *regs, long error_code) 106 + { 107 + const char *vsyscall_name; 108 + struct task_struct *tsk; 109 + unsigned long caller; 110 + int vsyscall_nr; 111 + long ret; 112 + 113 + /* Kernel code must never get here. */ 114 + BUG_ON(!user_mode(regs)); 115 + 116 + local_irq_enable(); 117 + 118 + /* 119 + * x86-ism here: regs->ip points to the instruction after the int 0xcc, 120 + * and int 0xcc is two bytes long. 
121 + */ 122 + if (!is_vsyscall_entry(regs->ip - 2)) { 123 + warn_bad_vsyscall(KERN_WARNING, regs, "illegal int 0xcc (exploit attempt?)"); 124 + goto sigsegv; 125 + } 126 + vsyscall_nr = vsyscall_entry_nr(regs->ip - 2); 127 + 128 + if (get_user(caller, (unsigned long __user *)regs->sp) != 0) { 129 + warn_bad_vsyscall(KERN_WARNING, regs, "int 0xcc with bad stack (exploit attempt?)"); 130 + goto sigsegv; 131 + } 132 + 133 + tsk = current; 134 + if (seccomp_mode(&tsk->seccomp)) 135 + do_exit(SIGKILL); 136 + 137 + switch (vsyscall_nr) { 138 + case 0: 139 + vsyscall_name = "gettimeofday"; 140 + ret = sys_gettimeofday( 141 + (struct timeval __user *)regs->di, 142 + (struct timezone __user *)regs->si); 143 + break; 144 + 145 + case 1: 146 + vsyscall_name = "time"; 147 + ret = sys_time((time_t __user *)regs->di); 148 + break; 149 + 150 + case 2: 151 + vsyscall_name = "getcpu"; 152 + ret = sys_getcpu((unsigned __user *)regs->di, 153 + (unsigned __user *)regs->si, 154 + 0); 155 + break; 156 + 157 + default: 158 + /* 159 + * If we get here, then vsyscall_nr indicates that int 0xcc 160 + * happened at an address in the vsyscall page that doesn't 161 + * contain int 0xcc. That can't happen. 162 + */ 163 + BUG(); 164 + } 165 + 166 + if (ret == -EFAULT) { 167 + /* 168 + * Bad news -- userspace fed a bad pointer to a vsyscall. 169 + * 170 + * With a real vsyscall, that would have caused SIGSEGV. 171 + * To make writing reliable exploits using the emulated 172 + * vsyscalls harder, generate SIGSEGV here as well. 173 + */ 174 + warn_bad_vsyscall(KERN_INFO, regs, 175 + "vsyscall fault (exploit attempt?)"); 176 + goto sigsegv; 177 + } 178 + 179 + regs->ax = ret; 180 + 181 + /* Emulate a ret instruction. */ 182 + regs->ip = caller; 183 + regs->sp += 8; 184 + 185 + local_irq_disable(); 186 + return; 187 + 188 + sigsegv: 189 + regs->ip -= 2; /* The faulting instruction should be the int 0xcc. 
*/ 190 + force_sig(SIGSEGV, current); 191 + } 192 + 193 + /* 194 + * Assume __initcall executes before all user space. Hopefully kmod 195 + * doesn't violate that. We'll find out if it does. 89 196 */ 90 - static __always_inline void do_get_tz(struct timezone * tz) 91 - { 92 - *tz = VVAR(vsyscall_gtod_data).sys_tz; 93 - } 94 - 95 - static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz) 96 - { 97 - int ret; 98 - asm volatile("syscall" 99 - : "=a" (ret) 100 - : "0" (__NR_gettimeofday),"D" (tv),"S" (tz) 101 - : __syscall_clobber ); 102 - return ret; 103 - } 104 - 105 - static __always_inline void do_vgettimeofday(struct timeval * tv) 106 - { 107 - cycle_t now, base, mask, cycle_delta; 108 - unsigned seq; 109 - unsigned long mult, shift, nsec; 110 - cycle_t (*vread)(void); 111 - do { 112 - seq = read_seqbegin(&VVAR(vsyscall_gtod_data).lock); 113 - 114 - vread = VVAR(vsyscall_gtod_data).clock.vread; 115 - if (unlikely(!vread)) { 116 - gettimeofday(tv,NULL); 117 - return; 118 - } 119 - 120 - now = vread(); 121 - base = VVAR(vsyscall_gtod_data).clock.cycle_last; 122 - mask = VVAR(vsyscall_gtod_data).clock.mask; 123 - mult = VVAR(vsyscall_gtod_data).clock.mult; 124 - shift = VVAR(vsyscall_gtod_data).clock.shift; 125 - 126 - tv->tv_sec = VVAR(vsyscall_gtod_data).wall_time_sec; 127 - nsec = VVAR(vsyscall_gtod_data).wall_time_nsec; 128 - } while (read_seqretry(&VVAR(vsyscall_gtod_data).lock, seq)); 129 - 130 - /* calculate interval: */ 131 - cycle_delta = (now - base) & mask; 132 - /* convert to nsecs: */ 133 - nsec += (cycle_delta * mult) >> shift; 134 - 135 - while (nsec >= NSEC_PER_SEC) { 136 - tv->tv_sec += 1; 137 - nsec -= NSEC_PER_SEC; 138 - } 139 - tv->tv_usec = nsec / NSEC_PER_USEC; 140 - } 141 - 142 - int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz) 143 - { 144 - if (tv) 145 - do_vgettimeofday(tv); 146 - if (tz) 147 - do_get_tz(tz); 148 - return 0; 149 - } 150 - 151 - /* This will break when the xtime seconds get 
inaccurate, but that is 152 - * unlikely */ 153 - time_t __vsyscall(1) vtime(time_t *t) 154 - { 155 - unsigned seq; 156 - time_t result; 157 - 158 - do { 159 - seq = read_seqbegin(&VVAR(vsyscall_gtod_data).lock); 160 - 161 - result = VVAR(vsyscall_gtod_data).wall_time_sec; 162 - 163 - } while (read_seqretry(&VVAR(vsyscall_gtod_data).lock, seq)); 164 - 165 - if (t) 166 - *t = result; 167 - return result; 168 - } 169 - 170 - /* Fast way to get current CPU and node. 171 - This helps to do per node and per CPU caches in user space. 172 - The result is not guaranteed without CPU affinity, but usually 173 - works out because the scheduler tries to keep a thread on the same 174 - CPU. 175 - 176 - tcache must point to a two element sized long array. 177 - All arguments can be NULL. */ 178 - long __vsyscall(2) 179 - vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache) 180 - { 181 - unsigned int p; 182 - unsigned long j = 0; 183 - 184 - /* Fast cache - only recompute value once per jiffies and avoid 185 - relatively costly rdtscp/cpuid otherwise. 186 - This works because the scheduler usually keeps the process 187 - on the same CPU and this syscall doesn't guarantee its 188 - results anyways. 189 - We do this here because otherwise user space would do it on 190 - its own in a likely inferior way (no access to jiffies). 191 - If you don't like it pass NULL. */ 192 - if (tcache && tcache->blob[0] == (j = VVAR(jiffies))) { 193 - p = tcache->blob[1]; 194 - } else if (VVAR(vgetcpu_mode) == VGETCPU_RDTSCP) { 195 - /* Load per CPU data from RDTSCP */ 196 - native_read_tscp(&p); 197 - } else { 198 - /* Load per CPU data from GDT */ 199 - asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG)); 200 - } 201 - if (tcache) { 202 - tcache->blob[0] = j; 203 - tcache->blob[1] = p; 204 - } 205 - if (cpu) 206 - *cpu = p & 0xfff; 207 - if (node) 208 - *node = p >> 12; 209 - return 0; 210 - } 211 - 212 - /* Assume __initcall executes before all user space. 
Hopefully kmod 213 - doesn't violate that. We'll find out if it does. */ 214 197 static void __cpuinit vsyscall_set_cpu(int cpu) 215 198 { 216 199 unsigned long d; ··· 204 221 if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP)) 205 222 write_rdtscp_aux((node << 12) | cpu); 206 223 207 - /* Store cpu number in limit so that it can be loaded quickly 208 - in user space in vgetcpu. 209 - 12 bits for the CPU and 8 bits for the node. */ 224 + /* 225 + * Store cpu number in limit so that it can be loaded quickly 226 + * in user space in vgetcpu. (12 bits for the CPU and 8 bits for the node) 227 + */ 210 228 d = 0x0f40000000000ULL; 211 229 d |= cpu; 212 230 d |= (node & 0xf) << 12; 213 231 d |= (node >> 4) << 48; 232 + 214 233 write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S); 215 234 } 216 235 ··· 226 241 cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg) 227 242 { 228 243 long cpu = (long)arg; 244 + 229 245 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) 230 246 smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 1); 247 + 231 248 return NOTIFY_DONE; 232 249 } 233 250 ··· 243 256 /* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. 
*/ 244 257 __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL); 245 258 __set_fixmap(VVAR_PAGE, physaddr_vvar_page, PAGE_KERNEL_VVAR); 246 - BUILD_BUG_ON((unsigned long)__fix_to_virt(VVAR_PAGE) != 247 - (unsigned long)VVAR_ADDRESS); 259 + BUILD_BUG_ON((unsigned long)__fix_to_virt(VVAR_PAGE) != (unsigned long)VVAR_ADDRESS); 248 260 } 249 261 250 262 static int __init vsyscall_init(void) 251 263 { 252 - BUG_ON(((unsigned long) &vgettimeofday != 253 - VSYSCALL_ADDR(__NR_vgettimeofday))); 254 - BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime)); 255 - BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE))); 256 - BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu)); 264 + BUG_ON(VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)); 265 + 257 266 on_each_cpu(cpu_vsyscall_init, NULL, 1); 258 267 /* notifier priority > KVM */ 259 268 hotcpu_notifier(cpu_vsyscall_notifier, 30); 269 + 260 270 return 0; 261 271 } 262 - 263 272 __initcall(vsyscall_init);
+27
arch/x86/kernel/vsyscall_emu_64.S
··· 1 + /* 2 + * vsyscall_emu_64.S: Vsyscall emulation page 3 + * 4 + * Copyright (c) 2011 Andy Lutomirski 5 + * 6 + * Subject to the GNU General Public License, version 2 7 + */ 8 + 9 + #include <linux/linkage.h> 10 + #include <asm/irq_vectors.h> 11 + 12 + /* The unused parts of the page are filled with 0xcc by the linker script. */ 13 + 14 + .section .vsyscall_0, "a" 15 + ENTRY(vsyscall_0) 16 + int $VSYSCALL_EMU_VECTOR 17 + END(vsyscall_0) 18 + 19 + .section .vsyscall_1, "a" 20 + ENTRY(vsyscall_1) 21 + int $VSYSCALL_EMU_VECTOR 22 + END(vsyscall_1) 23 + 24 + .section .vsyscall_2, "a" 25 + ENTRY(vsyscall_2) 26 + int $VSYSCALL_EMU_VECTOR 27 + END(vsyscall_2)
+10
include/linux/seccomp.h
··· 19 19 extern long prctl_get_seccomp(void); 20 20 extern long prctl_set_seccomp(unsigned long); 21 21 22 + static inline int seccomp_mode(seccomp_t *s) 23 + { 24 + return s->mode; 25 + } 26 + 22 27 #else /* CONFIG_SECCOMP */ 23 28 24 29 #include <linux/errno.h> ··· 40 35 static inline long prctl_set_seccomp(unsigned long arg2) 41 36 { 42 37 return -EINVAL; 38 + } 39 + 40 + static inline int seccomp_mode(seccomp_t *s) 41 + { 42 + return 0; 43 43 } 44 44 45 45 #endif /* CONFIG_SECCOMP */