
x86-64, espfix: Don't leak bits 31:16 of %esp returning to 16-bit stack

The IRET instruction, when returning to a 16-bit segment, only
restores the bottom 16 bits of the user space stack pointer. This
causes some 16-bit software to break, but it also leaks kernel state
to user space. We have a software workaround for that ("espfix") for
the 32-bit kernel, but it relies on a nonzero stack segment base which
is not available in 64-bit mode.
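
To make the leak concrete, here is a toy user-space model (not kernel
code; the addresses are invented) of what IRET to a 16-bit stack
segment does to the stack pointer:

  #include <stdint.h>
  #include <stdio.h>

  /* IRET to a 16-bit SS loads only the low 16 bits of the stack
     pointer; bits 31:16 keep whatever the kernel left there. */
  static uint32_t esp_after_iret16(uint32_t kernel_esp, uint16_t user_sp)
  {
          return (kernel_esp & 0xffff0000U) | user_sp;
  }

  int main(void)
  {
          uint32_t kernel_esp = 0x8a43f100U; /* made-up kernel stack value */
          uint16_t user_sp    = 0x7ffe;      /* what userspace expects back */

          printf("user sees %%esp = 0x%08x\n",
                 esp_after_iret16(kernel_esp, user_sp));
          /* prints 0x8a437ffe: bits 31:16 of the kernel stack leak */
          return 0;
  }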

In checkin:

b3b42ac2cbae x86-64, modify_ldt: Ban 16-bit segments on 64-bit kernels

we "solved" this by forbidding 16-bit segments on 64-bit kernels, with
the logic that 16-bit support is crippled on 64-bit kernels anyway (no
V86 support), but it turns out that people are doing stuff like
running old Win16 binaries under Wine and expect it to work.

This patch works around the problem by creating percpu "ministacks", each of which
is mapped 2^16 times 64K apart. When we detect that the return SS is
on the LDT, we copy the IRET frame to the ministack and use the
relevant alias to return to userspace. The ministacks are mapped
readonly, so if IRET faults we promote #GP to #DF which is an IST
vector and thus has its own stack; we then do the fixup in the #DF
handler.
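
The address arithmetic is the heart of the trick; here is a plain-C
sketch (using the constants from this patch; espfix_bottom() and
espfix_alias() are illustrative names, not kernel symbols):

  #include <stdint.h>

  /* From the patch: the espfix region occupies PGD slot -2. */
  #define ESPFIX_BASE_ADDR 0xffffff0000000000UL

  /* Spread a compact per-cpu stack offset across the alias space:
     the low 16 bits stay put, everything above moves up by 16 bits,
     leaving bits 31:16 zero (mirrors espfix_base_addr()). */
  static uint64_t espfix_bottom(uint64_t compact)
  {
          return ESPFIX_BASE_ADDR +
                 ((compact & 0xffffUL) | ((compact & ~0xffffUL) << 16));
  }

  /* Pick the alias whose bits 31:16 equal the user's; this is what
     the andl/orq pair in entry_64.S computes.  The 32-bit AND also
     discards bits 63:32 of the user RSP.  After IRET truncates %esp,
     the "leaked" bits 31:16 are the user's own. */
  static uint64_t espfix_alias(uint64_t stack_bottom, uint64_t user_rsp)
  {
          return stack_bottom | (user_rsp & 0xffff0000UL);
  }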

(Making #GP an IST exception would make the msr_safe functions unsafe
in NMI/MC context, and quite possibly have other effects.)

Special thanks to:

- Andy Lutomirski, for the suggestion of using very small stack slots
  and copying (as opposed to mapping) the IRET frame there, and for the
  suggestion to mark them readonly and let the fault promote to #DF.
- Konrad Wilk for paravirt fixup and testing.
- Borislav Petkov for testing help and useful comments.

Reported-by: Brian Gerst <brgerst@gmail.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Link: http://lkml.kernel.org/r/1398816946-3351-1-git-send-email-hpa@linux.intel.com
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Andrew Lutomirski <amluto@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Dirk Hohndel <dirk@hohndel.org>
Cc: Arjan van de Ven <arjan.van.de.ven@intel.com>
Cc: comex <comexk@gmail.com>
Cc: Alexander van Heukelum <heukelum@fastmail.fm>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: <stable@vger.kernel.org> # consider after upstream merge

+328 -25
+2
Documentation/x86/x86_64/mm.txt
···
 ffffe90000000000 - ffffe9ffffffffff (=40 bits) hole
 ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB)
 ... unused hole ...
+ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
+... unused hole ...
 ffffffff80000000 - ffffffffa0000000 (=512 MB) kernel text mapping, from phys 0
 ffffffffa0000000 - ffffffffff5fffff (=1525 MB) module mapping space
 ffffffffff600000 - ffffffffffdfffff (=8 MB) vsyscalls
+2
arch/x86/include/asm/pgtable_64_types.h
···
 #define MODULES_VADDR    (__START_KERNEL_map + KERNEL_IMAGE_SIZE)
 #define MODULES_END      _AC(0xffffffffff000000, UL)
 #define MODULES_LEN      (MODULES_END - MODULES_VADDR)
+#define ESPFIX_PGD_ENTRY _AC(-2, UL)
+#define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << PGDIR_SHIFT)

 #define EARLY_DYNAMIC_PAGE_TABLES	64

+3
arch/x86/include/asm/setup.h
···
 static inline void x86_ce4100_early_setup(void) { }
 #endif

+extern void init_espfix_bsp(void);
+extern void init_espfix_ap(void);
+
 #ifndef _SETUP

 /*
+1
arch/x86/kernel/Makefile
···
 obj-y			+= syscall_$(BITS).o vsyscall_gtod.o
 obj-$(CONFIG_X86_64)	+= vsyscall_64.o
 obj-$(CONFIG_X86_64)	+= vsyscall_emu_64.o
+obj-$(CONFIG_X86_64)	+= espfix_64.o
 obj-$(CONFIG_SYSFS)	+= ksysfs.o
 obj-y			+= bootflag.o e820.o
 obj-y			+= pci-dma.o quirks.o topology.o kdebugfs.o
+69 -4
arch/x86/kernel/entry_64.S
···
 #include <asm/asm.h>
 #include <asm/context_tracking.h>
 #include <asm/smap.h>
+#include <asm/pgtable_types.h>
 #include <linux/err.h>

 /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */
···
 	RESTORE_ARGS 1,8,1

 irq_return:
+	/*
+	 * Are we returning to a stack segment from the LDT?  Note: in
+	 * 64-bit mode SS:RSP on the exception stack is always valid.
+	 */
+	testb $4,(SS-RIP)(%rsp)
+	jnz irq_return_ldt
+
+irq_return_iret:
 	INTERRUPT_RETURN
-	_ASM_EXTABLE(irq_return, bad_iret)
+	_ASM_EXTABLE(irq_return_iret, bad_iret)

 #ifdef CONFIG_PARAVIRT
 ENTRY(native_iret)
 	iretq
 	_ASM_EXTABLE(native_iret, bad_iret)
 #endif
+
+irq_return_ldt:
+	pushq_cfi %rax
+	pushq_cfi %rdi
+	SWAPGS
+	movq PER_CPU_VAR(espfix_waddr),%rdi
+	movq %rax,(0*8)(%rdi)	/* RAX */
+	movq (2*8)(%rsp),%rax	/* RIP */
+	movq %rax,(1*8)(%rdi)
+	movq (3*8)(%rsp),%rax	/* CS */
+	movq %rax,(2*8)(%rdi)
+	movq (4*8)(%rsp),%rax	/* RFLAGS */
+	movq %rax,(3*8)(%rdi)
+	movq (6*8)(%rsp),%rax	/* SS */
+	movq %rax,(5*8)(%rdi)
+	movq (5*8)(%rsp),%rax	/* RSP */
+	movq %rax,(4*8)(%rdi)
+	andl $0xffff0000,%eax
+	popq_cfi %rdi
+	orq PER_CPU_VAR(espfix_stack),%rax
+	SWAPGS
+	movq %rax,%rsp
+	popq_cfi %rax
+	jmp irq_return_iret

 .section .fixup,"ax"
 bad_iret:
···
 	call preempt_schedule_irq
 	jmp exit_intr
 #endif
-
 	CFI_ENDPROC
 END(common_interrupt)
+
+	/*
+	 * If IRET takes a fault on the espfix stack, then we
+	 * end up promoting it to a doublefault.  In that case,
+	 * modify the stack to make it look like we just entered
+	 * the #GP handler from user space, similar to bad_iret.
+	 */
+	ALIGN
+__do_double_fault:
+	XCPT_FRAME 1 RDI+8
+	movq RSP(%rdi),%rax		/* Trap on the espfix stack? */
+	sarq $PGDIR_SHIFT,%rax
+	cmpl $ESPFIX_PGD_ENTRY,%eax
+	jne do_double_fault		/* No, just deliver the fault */
+	cmpl $__KERNEL_CS,CS(%rdi)
+	jne do_double_fault
+	movq RIP(%rdi),%rax
+	cmpq $irq_return_iret,%rax
+#ifdef CONFIG_PARAVIRT
+	je 1f
+	cmpq $native_iret,%rax
+#endif
+	jne do_double_fault		/* This shouldn't happen... */
+1:
+	movq PER_CPU_VAR(kernel_stack),%rax
+	subq $(6*8-KERNEL_STACK_OFFSET),%rax	/* Reset to original stack */
+	movq %rax,RSP(%rdi)
+	movq $0,(%rax)			/* Missing (lost) #GP error code */
+	movq $general_protection,RIP(%rdi)
+	retq
+	CFI_ENDPROC
+END(__do_double_fault)
+
 /*
  * End of kprobes section
  */
···
 zeroentry bounds do_bounds
 zeroentry invalid_op do_invalid_op
 zeroentry device_not_available do_device_not_available
-paranoiderrorentry double_fault do_double_fault
+paranoiderrorentry double_fault __do_double_fault
 zeroentry coprocessor_segment_overrun do_coprocessor_segment_overrun
 errorentry invalid_TSS do_invalid_TSS
 errorentry segment_not_present do_segment_not_present
···
  */
 error_kernelspace:
 	incl %ebx
-	leaq irq_return(%rip),%rcx
+	leaq irq_return_iret(%rip),%rcx
 	cmpq %rcx,RIP+8(%rsp)
 	je error_swapgs
 	movl %ecx,%eax			/* zero extend */
+208
arch/x86/kernel/espfix_64.c
···
+/* ----------------------------------------------------------------------- *
+ *
+ *   Copyright 2014 Intel Corporation; author: H. Peter Anvin
+ *
+ *   This program is free software; you can redistribute it and/or modify it
+ *   under the terms and conditions of the GNU General Public License,
+ *   version 2, as published by the Free Software Foundation.
+ *
+ *   This program is distributed in the hope it will be useful, but WITHOUT
+ *   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ *   FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ *   more details.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * The IRET instruction, when returning to a 16-bit segment, only
+ * restores the bottom 16 bits of the user space stack pointer.  This
+ * causes some 16-bit software to break, but it also leaks kernel state
+ * to user space.
+ *
+ * This works around this by creating percpu "ministacks", each of which
+ * is mapped 2^16 times 64K apart.  When we detect that the return SS is
+ * on the LDT, we copy the IRET frame to the ministack and use the
+ * relevant alias to return to userspace.  The ministacks are mapped
+ * readonly, so if IRET faults we promote #GP to #DF which is an IST
+ * vector and thus has its own stack; we then do the fixup in the #DF
+ * handler.
+ *
+ * This file sets up the ministacks and the related page tables.  The
+ * actual ministack invocation is in entry_64.S.
+ */
+
+#include <linux/init.h>
+#include <linux/init_task.h>
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <linux/gfp.h>
+#include <linux/random.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/setup.h>
+
+/*
+ * Note: we only need 6*8 = 48 bytes for the espfix stack, but round
+ * it up to a cache line to avoid unnecessary sharing.
+ */
+#define ESPFIX_STACK_SIZE	(8*8UL)
+#define ESPFIX_STACKS_PER_PAGE	(PAGE_SIZE/ESPFIX_STACK_SIZE)
+
+/* There is address space for how many espfix pages? */
+#define ESPFIX_PAGE_SPACE	(1UL << (PGDIR_SHIFT-PAGE_SHIFT-16))
+
+#define ESPFIX_MAX_CPUS		(ESPFIX_STACKS_PER_PAGE * ESPFIX_PAGE_SPACE)
+#if CONFIG_NR_CPUS > ESPFIX_MAX_CPUS
+# error "Need more than one PGD for the ESPFIX hack"
+#endif
+
+#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO)
+
+/* This contains the *bottom* address of the espfix stack */
+DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack);
+DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr);
+
+/* Initialization mutex - should this be a spinlock? */
+static DEFINE_MUTEX(espfix_init_mutex);
+
+/* Page allocation bitmap - each page serves ESPFIX_STACKS_PER_PAGE CPUs */
+#define ESPFIX_MAX_PAGES  DIV_ROUND_UP(CONFIG_NR_CPUS, ESPFIX_STACKS_PER_PAGE)
+static void *espfix_pages[ESPFIX_MAX_PAGES];
+
+static __page_aligned_bss pud_t espfix_pud_page[PTRS_PER_PUD]
+	__aligned(PAGE_SIZE);
+
+static unsigned int page_random, slot_random;
+
+/*
+ * This returns the bottom address of the espfix stack for a specific CPU.
+ * The math allows for a non-power-of-two ESPFIX_STACK_SIZE, in which case
+ * we have to account for some amount of padding at the end of each page.
+ */
+static inline unsigned long espfix_base_addr(unsigned int cpu)
+{
+	unsigned long page, slot;
+	unsigned long addr;
+
+	page = (cpu / ESPFIX_STACKS_PER_PAGE) ^ page_random;
+	slot = (cpu + slot_random) % ESPFIX_STACKS_PER_PAGE;
+	addr = (page << PAGE_SHIFT) + (slot * ESPFIX_STACK_SIZE);
+	addr = (addr & 0xffffUL) | ((addr & ~0xffffUL) << 16);
+	addr += ESPFIX_BASE_ADDR;
+	return addr;
+}
+
+#define PTE_STRIDE        (65536/PAGE_SIZE)
+#define ESPFIX_PTE_CLONES (PTRS_PER_PTE/PTE_STRIDE)
+#define ESPFIX_PMD_CLONES PTRS_PER_PMD
+#define ESPFIX_PUD_CLONES (65536/(ESPFIX_PTE_CLONES*ESPFIX_PMD_CLONES))
+
+#define PGTABLE_PROT	  ((_KERNPG_TABLE & ~_PAGE_RW) | _PAGE_NX)
+
+static void init_espfix_random(void)
+{
+	unsigned long rand;
+
+	/*
+	 * This is run before the entropy pools are initialized,
+	 * but this is hopefully better than nothing.
+	 */
+	if (!arch_get_random_long(&rand)) {
+		/* The constant is an arbitrary large prime */
+		rdtscll(rand);
+		rand *= 0xc345c6b72fd16123UL;
+	}
+
+	slot_random = rand % ESPFIX_STACKS_PER_PAGE;
+	page_random = (rand / ESPFIX_STACKS_PER_PAGE)
+		& (ESPFIX_PAGE_SPACE - 1);
+}
+
+void __init init_espfix_bsp(void)
+{
+	pgd_t *pgd_p;
+	pteval_t ptemask;
+
+	ptemask = __supported_pte_mask;
+
+	/* Install the espfix pud into the kernel page directory */
+	pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)];
+	pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page);
+
+	/* Randomize the locations */
+	init_espfix_random();
+
+	/* The rest is the same as for any other processor */
+	init_espfix_ap();
+}
+
+void init_espfix_ap(void)
+{
+	unsigned int cpu, page;
+	unsigned long addr;
+	pud_t pud, *pud_p;
+	pmd_t pmd, *pmd_p;
+	pte_t pte, *pte_p;
+	int n;
+	void *stack_page;
+	pteval_t ptemask;
+
+	/* We only have to do this once... */
+	if (likely(this_cpu_read(espfix_stack)))
+		return;		/* Already initialized */
+
+	cpu = smp_processor_id();
+	addr = espfix_base_addr(cpu);
+	page = cpu/ESPFIX_STACKS_PER_PAGE;
+
+	/* Did another CPU already set this up? */
+	stack_page = ACCESS_ONCE(espfix_pages[page]);
+	if (likely(stack_page))
+		goto done;
+
+	mutex_lock(&espfix_init_mutex);
+
+	/* Did we race on the lock? */
+	stack_page = ACCESS_ONCE(espfix_pages[page]);
+	if (stack_page)
+		goto unlock_done;
+
+	ptemask = __supported_pte_mask;
+
+	pud_p = &espfix_pud_page[pud_index(addr)];
+	pud = *pud_p;
+	if (!pud_present(pud)) {
+		pmd_p = (pmd_t *)__get_free_page(PGALLOC_GFP);
+		pud = __pud(__pa(pmd_p) | (PGTABLE_PROT & ptemask));
+		paravirt_alloc_pud(&init_mm, __pa(pmd_p) >> PAGE_SHIFT);
+		for (n = 0; n < ESPFIX_PUD_CLONES; n++)
+			set_pud(&pud_p[n], pud);
+	}
+
+	pmd_p = pmd_offset(&pud, addr);
+	pmd = *pmd_p;
+	if (!pmd_present(pmd)) {
+		pte_p = (pte_t *)__get_free_page(PGALLOC_GFP);
+		pmd = __pmd(__pa(pte_p) | (PGTABLE_PROT & ptemask));
+		paravirt_alloc_pmd(&init_mm, __pa(pte_p) >> PAGE_SHIFT);
+		for (n = 0; n < ESPFIX_PMD_CLONES; n++)
+			set_pmd(&pmd_p[n], pmd);
+	}
+
+	pte_p = pte_offset_kernel(&pmd, addr);
+	stack_page = (void *)__get_free_page(GFP_KERNEL);
+	pte = __pte(__pa(stack_page) | (__PAGE_KERNEL_RO & ptemask));
+	paravirt_alloc_pte(&init_mm, __pa(stack_page) >> PAGE_SHIFT);
+	for (n = 0; n < ESPFIX_PTE_CLONES; n++)
+		set_pte(&pte_p[n*PTE_STRIDE], pte);
+
+	/* Job is done for this CPU and any CPU which shares this page */
+	ACCESS_ONCE(espfix_pages[page]) = stack_page;
+
+unlock_done:
+	mutex_unlock(&espfix_init_mutex);
+done:
+	this_cpu_write(espfix_stack, addr);
+	this_cpu_write(espfix_waddr, (unsigned long)stack_page
+		       + (addr & ~PAGE_MASK));
+}
-11
arch/x86/kernel/ldt.c
···
 	}
 }

-	/*
-	 * On x86-64 we do not support 16-bit segments due to
-	 * IRET leaking the high bits of the kernel stack address.
-	 */
-#ifdef CONFIG_X86_64
-	if (!ldt_info.seg_32bit) {
-		error = -EINVAL;
-		goto out_unlock;
-	}
-#endif
-
 	fill_ldt(&ldt, &ldt_info);
 	if (oldmode)
 		ldt.avl = 0;
+7
arch/x86/kernel/smpboot.c
···
 	check_tsc_sync_target();

 	/*
+	 * Enable the espfix hack for this CPU
+	 */
+#ifdef CONFIG_X86_64
+	init_espfix_ap();
+#endif
+
+	/*
 	 * We need to hold vector_lock so there the set of online cpus
 	 * does not change while we are assigning vectors to cpus.  Holding
 	 * this lock ensures we don't half assign or remove an irq from a cpu.
+32 -10
arch/x86/mm/dump_pagetables.c
···
 	unsigned long start_address;
 	unsigned long current_address;
 	const struct addr_marker *marker;
+	unsigned long lines;
 	bool to_dmesg;
 };

 struct addr_marker {
 	unsigned long start_address;
 	const char *name;
+	unsigned long max_lines;
 };

 /* indices for address_markers; keep sync'd w/ address_markers below */
···
 	LOW_KERNEL_NR,
 	VMALLOC_START_NR,
 	VMEMMAP_START_NR,
+	ESPFIX_START_NR,
 	HIGH_KERNEL_NR,
 	MODULES_VADDR_NR,
 	MODULES_END_NR,
···
 	{ PAGE_OFFSET,		"Low Kernel Mapping" },
 	{ VMALLOC_START,	"vmalloc() Area" },
 	{ VMEMMAP_START,	"Vmemmap" },
+	{ ESPFIX_BASE_ADDR,	"ESPfix Area", 16 },
 	{ __START_KERNEL_map,	"High Kernel Mapping" },
 	{ MODULES_VADDR,	"Modules" },
 	{ MODULES_END,		"End Modules" },
···
 		      pgprot_t new_prot, int level)
 {
 	pgprotval_t prot, cur;
-	static const char units[] = "KMGTPE";
+	static const char units[] = "BKMGTPE";

 	/*
 	 * If we have a "break" in the series, we need to flush the state that
···
 		st->current_prot = new_prot;
 		st->level = level;
 		st->marker = address_markers;
+		st->lines = 0;
 		pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n",
 				   st->marker->name);
 	} else if (prot != cur || level != st->level ||
···
 		/*
 		 * Now print the actual finished series
 		 */
-		pt_dump_seq_printf(m, st->to_dmesg, "0x%0*lx-0x%0*lx   ",
-				   width, st->start_address,
-				   width, st->current_address);
+		if (!st->marker->max_lines ||
+		    st->lines < st->marker->max_lines) {
+			pt_dump_seq_printf(m, st->to_dmesg,
+					   "0x%0*lx-0x%0*lx   ",
+					   width, st->start_address,
+					   width, st->current_address);

-		delta = (st->current_address - st->start_address) >> 10;
-		while (!(delta & 1023) && unit[1]) {
-			delta >>= 10;
-			unit++;
+			delta = st->current_address - st->start_address;
+			while (!(delta & 1023) && unit[1]) {
+				delta >>= 10;
+				unit++;
+			}
+			pt_dump_cont_printf(m, st->to_dmesg, "%9lu%c ",
+					    delta, *unit);
+			printk_prot(m, st->current_prot, st->level,
+				    st->to_dmesg);
 		}
-		pt_dump_cont_printf(m, st->to_dmesg, "%9lu%c ", delta, *unit);
-		printk_prot(m, st->current_prot, st->level, st->to_dmesg);
+		st->lines++;

 		/*
 		 * We print markers for special areas of address space,
···
 		 * This helps in the interpretation.
 		 */
 		if (st->current_address >= st->marker[1].start_address) {
+			if (st->marker->max_lines &&
+			    st->lines > st->marker->max_lines) {
+				unsigned long nskip =
+					st->lines - st->marker->max_lines;
+				pt_dump_seq_printf(m, st->to_dmesg,
+						   "... %lu entr%s skipped ... \n",
+						   nskip,
+						   nskip == 1 ? "y" : "ies");
+			}
 			st->marker++;
+			st->lines = 0;
 			pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n",
 					   st->marker->name);
 		}
+4
init/main.c
···
 	if (efi_enabled(EFI_RUNTIME_SERVICES))
 		efi_enter_virtual_mode();
 #endif
+#ifdef CONFIG_X86_64
+	/* Should be run before the first non-init thread is created */
+	init_espfix_bsp();
+#endif
 	thread_info_cache_init();
 	cred_init();
 	fork_init(totalram_pages);