Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'x86-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 fixes from Ingo Molnar:
"A laundry list of fixes:

- fix reboot breakage on some PCID-enabled systems

- fix crashes/hangs on some PCID-enabled systems

- fix microcode loading on certain older CPUs

- various unwinder fixes

- extend an APIC quirk to more hardware systems and disable APIC
related warning on virtualized systems

- various Hyper-V fixes

- a macro definition robustness fix

- remove jprobes IRQ disabling

- various mem-encryption fixes"

* 'x86-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
x86/microcode: Do the family check first
x86/mm: Flush more aggressively in lazy TLB mode
x86/apic: Update TSC_DEADLINE quirk with additional SKX stepping
x86/apic: Silence "FW_BUG TSC_DEADLINE disabled due to Errata" on hypervisors
x86/mm: Disable various instrumentations of mm/mem_encrypt.c and mm/tlb.c
x86/hyperv: Fix hypercalls with extended CPU ranges for TLB flushing
x86/hyperv: Don't use percpu areas for pcpu_flush/pcpu_flush_ex structures
x86/hyperv: Clear vCPU banks between calls to avoid flushing unneeded vCPUs
x86/unwind: Disable unwinder warnings on 32-bit
x86/unwind: Align stack pointer in unwinder dump
x86/unwind: Use MSB for frame pointer encoding on 32-bit
x86/unwind: Fix dereference of untrusted pointer
x86/alternatives: Fix alt_max_short macro to really be a max()
x86/mm/64: Fix reboot interaction with CR4.PCIDE
kprobes/x86: Remove IRQ disabling from jprobe handlers
kprobes/x86: Set up frame pointer in kprobe trampoline

+284 -88
+2 -2
arch/x86/entry/entry_32.S
··· 176 /* 177 * This is a sneaky trick to help the unwinder find pt_regs on the stack. The 178 * frame pointer is replaced with an encoded pointer to pt_regs. The encoding 179 - * is just setting the LSB, which makes it an invalid stack address and is also 180 * a signal to the unwinder that it's a pt_regs pointer in disguise. 181 * 182 * NOTE: This macro must be used *after* SAVE_ALL because it corrupts the ··· 185 .macro ENCODE_FRAME_POINTER 186 #ifdef CONFIG_FRAME_POINTER 187 mov %esp, %ebp 188 - orl $0x1, %ebp 189 #endif 190 .endm 191
··· 176 /* 177 * This is a sneaky trick to help the unwinder find pt_regs on the stack. The 178 * frame pointer is replaced with an encoded pointer to pt_regs. The encoding 179 + * is just clearing the MSB, which makes it an invalid stack address and is also 180 * a signal to the unwinder that it's a pt_regs pointer in disguise. 181 * 182 * NOTE: This macro must be used *after* SAVE_ALL because it corrupts the ··· 185 .macro ENCODE_FRAME_POINTER 186 #ifdef CONFIG_FRAME_POINTER 187 mov %esp, %ebp 188 + andl $0x7fffffff, %ebp 189 #endif 190 .endm 191
+5
arch/x86/hyperv/hv_init.c
··· 85 u32 *hv_vp_index; 86 EXPORT_SYMBOL_GPL(hv_vp_index); 87 88 static int hv_cpu_init(unsigned int cpu) 89 { 90 u64 msr_vp_index; ··· 94 hv_get_vp_index(msr_vp_index); 95 96 hv_vp_index[smp_processor_id()] = msr_vp_index; 97 98 return 0; 99 }
··· 85 u32 *hv_vp_index; 86 EXPORT_SYMBOL_GPL(hv_vp_index); 87 88 + u32 hv_max_vp_index; 89 + 90 static int hv_cpu_init(unsigned int cpu) 91 { 92 u64 msr_vp_index; ··· 92 hv_get_vp_index(msr_vp_index); 93 94 hv_vp_index[smp_processor_id()] = msr_vp_index; 95 + 96 + if (msr_vp_index > hv_max_vp_index) 97 + hv_max_vp_index = msr_vp_index; 98 99 return 0; 100 }
+43 -14
arch/x86/hyperv/mmu.c
··· 36 /* Each gva in gva_list encodes up to 4096 pages to flush */ 37 #define HV_TLB_FLUSH_UNIT (4096 * PAGE_SIZE) 38 39 - static struct hv_flush_pcpu __percpu *pcpu_flush; 40 41 - static struct hv_flush_pcpu_ex __percpu *pcpu_flush_ex; 42 43 /* 44 * Fills in gva_list starting from offset. Returns the number of items added. ··· 76 { 77 int cpu, vcpu, vcpu_bank, vcpu_offset, nr_bank = 1; 78 79 /* 80 * Some banks may end up being empty but this is acceptable. 81 */ ··· 95 vcpu = hv_cpu_number_to_vp_number(cpu); 96 vcpu_bank = vcpu / 64; 97 vcpu_offset = vcpu % 64; 98 - 99 - /* valid_bank_mask can represent up to 64 banks */ 100 - if (vcpu_bank >= 64) 101 - return 0; 102 - 103 __set_bit(vcpu_offset, (unsigned long *) 104 &flush->hv_vp_set.bank_contents[vcpu_bank]); 105 if (vcpu_bank >= nr_bank) ··· 109 const struct flush_tlb_info *info) 110 { 111 int cpu, vcpu, gva_n, max_gvas; 112 struct hv_flush_pcpu *flush; 113 u64 status = U64_MAX; 114 unsigned long flags; ··· 124 125 local_irq_save(flags); 126 127 - flush = this_cpu_ptr(pcpu_flush); 128 129 if (info->mm) { 130 flush->address_space = virt_to_phys(info->mm->pgd); ··· 191 const struct flush_tlb_info *info) 192 { 193 int nr_bank = 0, max_gvas, gva_n; 194 struct hv_flush_pcpu_ex *flush; 195 u64 status = U64_MAX; 196 unsigned long flags; ··· 206 207 local_irq_save(flags); 208 209 - flush = this_cpu_ptr(pcpu_flush_ex); 210 211 if (info->mm) { 212 flush->address_space = virt_to_phys(info->mm->pgd); ··· 251 flush->flags |= HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY; 252 status = hv_do_rep_hypercall( 253 HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX, 254 - 0, nr_bank + 2, flush, NULL); 255 } else if (info->end && 256 ((info->end - info->start)/HV_TLB_FLUSH_UNIT) > max_gvas) { 257 status = hv_do_rep_hypercall( 258 HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX, 259 - 0, nr_bank + 2, flush, NULL); 260 } else { 261 gva_n = fill_gva_list(flush->gva_list, nr_bank, 262 info->start, info->end); 263 status = hv_do_rep_hypercall( 264 
HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX, 265 - gva_n, nr_bank + 2, flush, NULL); 266 } 267 268 local_irq_restore(flags); ··· 295 return; 296 297 if (!(ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED)) 298 - pcpu_flush = __alloc_percpu(PAGE_SIZE, PAGE_SIZE); 299 else 300 - pcpu_flush_ex = __alloc_percpu(PAGE_SIZE, PAGE_SIZE); 301 }
··· 36 /* Each gva in gva_list encodes up to 4096 pages to flush */ 37 #define HV_TLB_FLUSH_UNIT (4096 * PAGE_SIZE) 38 39 + static struct hv_flush_pcpu __percpu **pcpu_flush; 40 41 + static struct hv_flush_pcpu_ex __percpu **pcpu_flush_ex; 42 43 /* 44 * Fills in gva_list starting from offset. Returns the number of items added. ··· 76 { 77 int cpu, vcpu, vcpu_bank, vcpu_offset, nr_bank = 1; 78 79 + /* valid_bank_mask can represent up to 64 banks */ 80 + if (hv_max_vp_index / 64 >= 64) 81 + return 0; 82 + 83 + /* 84 + * Clear all banks up to the maximum possible bank as hv_flush_pcpu_ex 85 + * structs are not cleared between calls, we risk flushing unneeded 86 + * vCPUs otherwise. 87 + */ 88 + for (vcpu_bank = 0; vcpu_bank <= hv_max_vp_index / 64; vcpu_bank++) 89 + flush->hv_vp_set.bank_contents[vcpu_bank] = 0; 90 + 91 /* 92 * Some banks may end up being empty but this is acceptable. 93 */ ··· 83 vcpu = hv_cpu_number_to_vp_number(cpu); 84 vcpu_bank = vcpu / 64; 85 vcpu_offset = vcpu % 64; 86 __set_bit(vcpu_offset, (unsigned long *) 87 &flush->hv_vp_set.bank_contents[vcpu_bank]); 88 if (vcpu_bank >= nr_bank) ··· 102 const struct flush_tlb_info *info) 103 { 104 int cpu, vcpu, gva_n, max_gvas; 105 + struct hv_flush_pcpu **flush_pcpu; 106 struct hv_flush_pcpu *flush; 107 u64 status = U64_MAX; 108 unsigned long flags; ··· 116 117 local_irq_save(flags); 118 119 + flush_pcpu = this_cpu_ptr(pcpu_flush); 120 + 121 + if (unlikely(!*flush_pcpu)) 122 + *flush_pcpu = page_address(alloc_page(GFP_ATOMIC)); 123 + 124 + flush = *flush_pcpu; 125 + 126 + if (unlikely(!flush)) { 127 + local_irq_restore(flags); 128 + goto do_native; 129 + } 130 131 if (info->mm) { 132 flush->address_space = virt_to_phys(info->mm->pgd); ··· 173 const struct flush_tlb_info *info) 174 { 175 int nr_bank = 0, max_gvas, gva_n; 176 + struct hv_flush_pcpu_ex **flush_pcpu; 177 struct hv_flush_pcpu_ex *flush; 178 u64 status = U64_MAX; 179 unsigned long flags; ··· 187 188 local_irq_save(flags); 189 190 + flush_pcpu 
= this_cpu_ptr(pcpu_flush_ex); 191 + 192 + if (unlikely(!*flush_pcpu)) 193 + *flush_pcpu = page_address(alloc_page(GFP_ATOMIC)); 194 + 195 + flush = *flush_pcpu; 196 + 197 + if (unlikely(!flush)) { 198 + local_irq_restore(flags); 199 + goto do_native; 200 + } 201 202 if (info->mm) { 203 flush->address_space = virt_to_phys(info->mm->pgd); ··· 222 flush->flags |= HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY; 223 status = hv_do_rep_hypercall( 224 HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX, 225 + 0, nr_bank, flush, NULL); 226 } else if (info->end && 227 ((info->end - info->start)/HV_TLB_FLUSH_UNIT) > max_gvas) { 228 status = hv_do_rep_hypercall( 229 HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX, 230 + 0, nr_bank, flush, NULL); 231 } else { 232 gva_n = fill_gva_list(flush->gva_list, nr_bank, 233 info->start, info->end); 234 status = hv_do_rep_hypercall( 235 HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX, 236 + gva_n, nr_bank, flush, NULL); 237 } 238 239 local_irq_restore(flags); ··· 266 return; 267 268 if (!(ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED)) 269 + pcpu_flush = alloc_percpu(struct hv_flush_pcpu *); 270 else 271 + pcpu_flush_ex = alloc_percpu(struct hv_flush_pcpu_ex *); 272 }
+3 -1
arch/x86/include/asm/alternative-asm.h
··· 62 #define new_len2 145f-144f 63 64 /* 65 - * max without conditionals. Idea adapted from: 66 * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax 67 */ 68 #define alt_max_short(a, b) ((a) ^ (((a) ^ (b)) & -(-((a) < (b))))) 69
··· 62 #define new_len2 145f-144f 63 64 /* 65 + * gas compatible max based on the idea from: 66 * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax 67 + * 68 + * The additional "-" is needed because gas uses a "true" value of -1. 69 */ 70 #define alt_max_short(a, b) ((a) ^ (((a) ^ (b)) & -(-((a) < (b))))) 71
+3 -3
arch/x86/include/asm/alternative.h
··· 103 alt_end_marker ":\n" 104 105 /* 106 - * max without conditionals. Idea adapted from: 107 * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax 108 * 109 - * The additional "-" is needed because gas works with s32s. 110 */ 111 - #define alt_max_short(a, b) "((" a ") ^ (((" a ") ^ (" b ")) & -(-((" a ") - (" b ")))))" 112 113 /* 114 * Pad the second replacement alternative with additional NOPs if it is
··· 103 alt_end_marker ":\n" 104 105 /* 106 + * gas compatible max based on the idea from: 107 * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax 108 * 109 + * The additional "-" is needed because gas uses a "true" value of -1. 110 */ 111 + #define alt_max_short(a, b) "((" a ") ^ (((" a ") ^ (" b ")) & -(-((" a ") < (" b ")))))" 112 113 /* 114 * Pad the second replacement alternative with additional NOPs if it is
+1 -7
arch/x86/include/asm/mmu_context.h
··· 126 DEBUG_LOCKS_WARN_ON(preemptible()); 127 } 128 129 - static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) 130 - { 131 - int cpu = smp_processor_id(); 132 - 133 - if (cpumask_test_cpu(cpu, mm_cpumask(mm))) 134 - cpumask_clear_cpu(cpu, mm_cpumask(mm)); 135 - } 136 137 static inline int init_new_context(struct task_struct *tsk, 138 struct mm_struct *mm)
··· 126 DEBUG_LOCKS_WARN_ON(preemptible()); 127 } 128 129 + void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk); 130 131 static inline int init_new_context(struct task_struct *tsk, 132 struct mm_struct *mm)
+1
arch/x86/include/asm/mshyperv.h
··· 289 * to this information. 290 */ 291 extern u32 *hv_vp_index; 292 293 /** 294 * hv_cpu_number_to_vp_number() - Map CPU to VP.
··· 289 * to this information. 290 */ 291 extern u32 *hv_vp_index; 292 + extern u32 hv_max_vp_index; 293 294 /** 295 * hv_cpu_number_to_vp_number() - Map CPU to VP.
+24
arch/x86/include/asm/tlbflush.h
··· 83 #endif 84 85 /* 86 * 6 because 6 should be plenty and struct tlb_state will fit in 87 * two cache lines. 88 */ ··· 110 struct mm_struct *loaded_mm; 111 u16 loaded_mm_asid; 112 u16 next_asid; 113 114 /* 115 * Access to this CR4 shadow and to H/W CR4 is protected by
··· 83 #endif 84 85 /* 86 + * If tlb_use_lazy_mode is true, then we try to avoid switching CR3 to point 87 + * to init_mm when we switch to a kernel thread (e.g. the idle thread). If 88 + * it's false, then we immediately switch CR3 when entering a kernel thread. 89 + */ 90 + DECLARE_STATIC_KEY_TRUE(tlb_use_lazy_mode); 91 + 92 + /* 93 * 6 because 6 should be plenty and struct tlb_state will fit in 94 * two cache lines. 95 */ ··· 103 struct mm_struct *loaded_mm; 104 u16 loaded_mm_asid; 105 u16 next_asid; 106 + 107 + /* 108 + * We can be in one of several states: 109 + * 110 + * - Actively using an mm. Our CPU's bit will be set in 111 + * mm_cpumask(loaded_mm) and is_lazy == false; 112 + * 113 + * - Not using a real mm. loaded_mm == &init_mm. Our CPU's bit 114 + * will not be set in mm_cpumask(&init_mm) and is_lazy == false. 115 + * 116 + * - Lazily using a real mm. loaded_mm != &init_mm, our bit 117 + * is set in mm_cpumask(loaded_mm), but is_lazy == true. 118 + * We're heuristically guessing that the CR3 load we 119 + * skipped more than makes up for the overhead added by 120 + * lazy mode. 121 + */ 122 + bool is_lazy; 123 124 /* 125 * Access to this CR4 shadow and to H/W CR4 is protected by
+13 -2
arch/x86/kernel/apic/apic.c
··· 573 return ~0U; 574 } 575 576 static const struct x86_cpu_id deadline_match[] = { 577 DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_HASWELL_X, hsx_deadline_rev), 578 DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_BROADWELL_X, 0x0b000020), 579 DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_BROADWELL_XEON_D, bdx_deadline_rev), 580 - DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_SKYLAKE_X, 0x02000014), 581 582 DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_CORE, 0x22), 583 DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_ULT, 0x20), ··· 610 const struct x86_cpu_id *m; 611 u32 rev; 612 613 - if (!boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) 614 return; 615 616 m = x86_match_cpu(deadline_match);
··· 573 return ~0U; 574 } 575 576 + static u32 skx_deadline_rev(void) 577 + { 578 + switch (boot_cpu_data.x86_mask) { 579 + case 0x03: return 0x01000136; 580 + case 0x04: return 0x02000014; 581 + } 582 + 583 + return ~0U; 584 + } 585 + 586 static const struct x86_cpu_id deadline_match[] = { 587 DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_HASWELL_X, hsx_deadline_rev), 588 DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_BROADWELL_X, 0x0b000020), 589 DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_BROADWELL_XEON_D, bdx_deadline_rev), 590 + DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_SKYLAKE_X, skx_deadline_rev), 591 592 DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_CORE, 0x22), 593 DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_ULT, 0x20), ··· 600 const struct x86_cpu_id *m; 601 u32 rev; 602 603 + if (!boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER) || 604 + boot_cpu_has(X86_FEATURE_HYPERVISOR)) 605 return; 606 607 m = x86_match_cpu(deadline_match);
+18 -9
arch/x86/kernel/cpu/microcode/core.c
··· 122 bool *res = &dis_ucode_ldr; 123 #endif 124 125 - if (!have_cpuid_p()) 126 - return *res; 127 - 128 /* 129 * CPUID(1).ECX[31]: reserved for hypervisor use. This is still not 130 * completely accurate as xen pv guests don't see that CPUID bit set but ··· 163 void __init load_ucode_bsp(void) 164 { 165 unsigned int cpuid_1_eax; 166 167 - if (check_loader_disabled_bsp()) 168 return; 169 170 cpuid_1_eax = native_cpuid_eax(1); 171 172 switch (x86_cpuid_vendor()) { 173 case X86_VENDOR_INTEL: 174 - if (x86_family(cpuid_1_eax) >= 6) 175 - load_ucode_intel_bsp(); 176 break; 177 case X86_VENDOR_AMD: 178 - if (x86_family(cpuid_1_eax) >= 0x10) 179 - load_ucode_amd_bsp(cpuid_1_eax); 180 break; 181 default: 182 - break; 183 } 184 } 185 186 static bool check_loader_disabled_ap(void)
··· 122 bool *res = &dis_ucode_ldr; 123 #endif 124 125 /* 126 * CPUID(1).ECX[31]: reserved for hypervisor use. This is still not 127 * completely accurate as xen pv guests don't see that CPUID bit set but ··· 166 void __init load_ucode_bsp(void) 167 { 168 unsigned int cpuid_1_eax; 169 + bool intel = true; 170 171 + if (!have_cpuid_p()) 172 return; 173 174 cpuid_1_eax = native_cpuid_eax(1); 175 176 switch (x86_cpuid_vendor()) { 177 case X86_VENDOR_INTEL: 178 + if (x86_family(cpuid_1_eax) < 6) 179 + return; 180 break; 181 + 182 case X86_VENDOR_AMD: 183 + if (x86_family(cpuid_1_eax) < 0x10) 184 + return; 185 + intel = false; 186 break; 187 + 188 default: 189 + return; 190 } 191 + 192 + if (check_loader_disabled_bsp()) 193 + return; 194 + 195 + if (intel) 196 + load_ucode_intel_bsp(); 197 + else 198 + load_ucode_amd_bsp(cpuid_1_eax); 199 } 200 201 static bool check_loader_disabled_ap(void)
+11 -2
arch/x86/kernel/kprobes/common.h
··· 3 4 /* Kprobes and Optprobes common header */ 5 6 #ifdef CONFIG_X86_64 7 #define SAVE_REGS_STRING \ 8 /* Skip cs, ip, orig_ax. */ \ ··· 26 " pushq %r10\n" \ 27 " pushq %r11\n" \ 28 " pushq %rbx\n" \ 29 - " pushq %rbp\n" \ 30 " pushq %r12\n" \ 31 " pushq %r13\n" \ 32 " pushq %r14\n" \ ··· 57 " pushl %es\n" \ 58 " pushl %ds\n" \ 59 " pushl %eax\n" \ 60 - " pushl %ebp\n" \ 61 " pushl %edi\n" \ 62 " pushl %esi\n" \ 63 " pushl %edx\n" \
··· 3 4 /* Kprobes and Optprobes common header */ 5 6 + #include <asm/asm.h> 7 + 8 + #ifdef CONFIG_FRAME_POINTER 9 + # define SAVE_RBP_STRING " push %" _ASM_BP "\n" \ 10 + " mov %" _ASM_SP ", %" _ASM_BP "\n" 11 + #else 12 + # define SAVE_RBP_STRING " push %" _ASM_BP "\n" 13 + #endif 14 + 15 #ifdef CONFIG_X86_64 16 #define SAVE_REGS_STRING \ 17 /* Skip cs, ip, orig_ax. */ \ ··· 17 " pushq %r10\n" \ 18 " pushq %r11\n" \ 19 " pushq %rbx\n" \ 20 + SAVE_RBP_STRING \ 21 " pushq %r12\n" \ 22 " pushq %r13\n" \ 23 " pushq %r14\n" \ ··· 48 " pushl %es\n" \ 49 " pushl %ds\n" \ 50 " pushl %eax\n" \ 51 + SAVE_RBP_STRING \ 52 " pushl %edi\n" \ 53 " pushl %esi\n" \ 54 " pushl %edx\n" \
-2
arch/x86/kernel/kprobes/core.c
··· 1080 * raw stack chunk with redzones: 1081 */ 1082 __memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr, MIN_STACK_SIZE(addr)); 1083 - regs->flags &= ~X86_EFLAGS_IF; 1084 - trace_hardirqs_off(); 1085 regs->ip = (unsigned long)(jp->entry); 1086 1087 /*
··· 1080 * raw stack chunk with redzones: 1081 */ 1082 __memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr, MIN_STACK_SIZE(addr)); 1083 regs->ip = (unsigned long)(jp->entry); 1084 1085 /*
+4
arch/x86/kernel/reboot.c
··· 105 load_cr3(initial_page_table); 106 #else 107 write_cr3(real_mode_header->trampoline_pgd); 108 #endif 109 110 /* Jump to the identity-mapped low memory code */
··· 105 load_cr3(initial_page_table); 106 #else 107 write_cr3(real_mode_header->trampoline_pgd); 108 + 109 + /* Exiting long mode will fail if CR4.PCIDE is set. */ 110 + if (static_cpu_has(X86_FEATURE_PCID)) 111 + cr4_clear_bits(X86_CR4_PCIDE); 112 #endif 113 114 /* Jump to the identity-mapped low memory code */
+36 -2
arch/x86/kernel/unwind_frame.c
··· 44 state->stack_info.type, state->stack_info.next_sp, 45 state->stack_mask, state->graph_idx); 46 47 - for (sp = state->orig_sp; sp; sp = PTR_ALIGN(stack_info.next_sp, sizeof(long))) { 48 if (get_stack_info(sp, state->task, &stack_info, &visit_mask)) 49 break; 50 ··· 175 * This determines if the frame pointer actually contains an encoded pointer to 176 * pt_regs on the stack. See ENCODE_FRAME_POINTER. 177 */ 178 static struct pt_regs *decode_frame_pointer(unsigned long *bp) 179 { 180 unsigned long regs = (unsigned long)bp; ··· 185 186 return (struct pt_regs *)(regs & ~0x1); 187 } 188 189 static bool update_stack_state(struct unwind_state *state, 190 unsigned long *next_bp) ··· 221 regs = decode_frame_pointer(next_bp); 222 if (regs) { 223 frame = (unsigned long *)regs; 224 - len = regs_size(regs); 225 state->got_irq = true; 226 } else { 227 frame = next_bp; ··· 243 /* Make sure it only unwinds up and doesn't overlap the prev frame: */ 244 if (state->orig_sp && state->stack_info.type == prev_type && 245 frame < prev_frame_end) 246 return false; 247 248 /* Move state to the next frame: */ ··· 353 if (state->regs && 354 state->regs->sp >= (unsigned long)last_aligned_frame(state) && 355 state->regs->sp < (unsigned long)task_pt_regs(state->task)) 356 goto the_end; 357 358 if (state->regs) {
··· 44 state->stack_info.type, state->stack_info.next_sp, 45 state->stack_mask, state->graph_idx); 46 47 + for (sp = PTR_ALIGN(state->orig_sp, sizeof(long)); sp; 48 + sp = PTR_ALIGN(stack_info.next_sp, sizeof(long))) { 49 if (get_stack_info(sp, state->task, &stack_info, &visit_mask)) 50 break; 51 ··· 174 * This determines if the frame pointer actually contains an encoded pointer to 175 * pt_regs on the stack. See ENCODE_FRAME_POINTER. 176 */ 177 + #ifdef CONFIG_X86_64 178 static struct pt_regs *decode_frame_pointer(unsigned long *bp) 179 { 180 unsigned long regs = (unsigned long)bp; ··· 183 184 return (struct pt_regs *)(regs & ~0x1); 185 } 186 + #else 187 + static struct pt_regs *decode_frame_pointer(unsigned long *bp) 188 + { 189 + unsigned long regs = (unsigned long)bp; 190 + 191 + if (regs & 0x80000000) 192 + return NULL; 193 + 194 + return (struct pt_regs *)(regs | 0x80000000); 195 + } 196 + #endif 197 + 198 + #ifdef CONFIG_X86_32 199 + #define KERNEL_REGS_SIZE (sizeof(struct pt_regs) - 2*sizeof(long)) 200 + #else 201 + #define KERNEL_REGS_SIZE (sizeof(struct pt_regs)) 202 + #endif 203 204 static bool update_stack_state(struct unwind_state *state, 205 unsigned long *next_bp) ··· 202 regs = decode_frame_pointer(next_bp); 203 if (regs) { 204 frame = (unsigned long *)regs; 205 + len = KERNEL_REGS_SIZE; 206 state->got_irq = true; 207 } else { 208 frame = next_bp; ··· 224 /* Make sure it only unwinds up and doesn't overlap the prev frame: */ 225 if (state->orig_sp && state->stack_info.type == prev_type && 226 frame < prev_frame_end) 227 + return false; 228 + 229 + /* 230 + * On 32-bit with user mode regs, make sure the last two regs are safe 231 + * to access: 232 + */ 233 + if (IS_ENABLED(CONFIG_X86_32) && regs && user_mode(regs) && 234 + !on_stack(info, frame, len + 2*sizeof(long))) 235 return false; 236 237 /* Move state to the next frame: */ ··· 326 if (state->regs && 327 state->regs->sp >= (unsigned long)last_aligned_frame(state) && 328 state->regs->sp < 
(unsigned long)task_pt_regs(state->task)) 329 + goto the_end; 330 + 331 + /* 332 + * There are some known frame pointer issues on 32-bit. Disable 333 + * unwinder warnings on 32-bit until it gets objtool support. 334 + */ 335 + if (IS_ENABLED(CONFIG_X86_32)) 336 goto the_end; 337 338 if (state->regs) {
+9 -2
arch/x86/mm/Makefile
··· 1 - # Kernel does not boot with instrumentation of tlb.c. 2 - KCOV_INSTRUMENT_tlb.o := n 3 4 obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ 5 pat.o pgtable.o physaddr.o setup_nx.o tlb.o
··· 1 + # Kernel does not boot with instrumentation of tlb.c and mem_encrypt.c 2 + KCOV_INSTRUMENT_tlb.o := n 3 + KCOV_INSTRUMENT_mem_encrypt.o := n 4 + 5 + KASAN_SANITIZE_mem_encrypt.o := n 6 + 7 + ifdef CONFIG_FUNCTION_TRACER 8 + CFLAGS_REMOVE_mem_encrypt.o = -pg 9 + endif 10 11 obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ 12 pat.o pgtable.o physaddr.o setup_nx.o tlb.o
+111 -42
arch/x86/mm/tlb.c
··· 30 31 atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1); 32 33 static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen, 34 u16 *new_asid, bool *need_flush) 35 { ··· 82 return; 83 84 /* Warn if we're not lazy. */ 85 - WARN_ON(cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm))); 86 87 switch_mm(NULL, &init_mm, NULL); 88 } ··· 144 __flush_tlb_all(); 145 } 146 #endif 147 148 if (real_prev == next) { 149 VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) != 150 next->context.ctx_id); 151 152 - if (cpumask_test_cpu(cpu, mm_cpumask(next))) { 153 - /* 154 - * There's nothing to do: we weren't lazy, and we 155 - * aren't changing our mm. We don't need to flush 156 - * anything, nor do we need to update CR3, CR4, or 157 - * LDTR. 158 - */ 159 - return; 160 - } 161 - 162 - /* Resume remote flushes and then read tlb_gen. */ 163 - cpumask_set_cpu(cpu, mm_cpumask(next)); 164 - next_tlb_gen = atomic64_read(&next->context.tlb_gen); 165 - 166 - if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) < 167 - next_tlb_gen) { 168 - /* 169 - * Ideally, we'd have a flush_tlb() variant that 170 - * takes the known CR3 value as input. This would 171 - * be faster on Xen PV and on hypothetical CPUs 172 - * on which INVPCID is fast. 173 - */ 174 - this_cpu_write(cpu_tlbstate.ctxs[prev_asid].tlb_gen, 175 - next_tlb_gen); 176 - write_cr3(build_cr3(next, prev_asid)); 177 - trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 178 - TLB_FLUSH_ALL); 179 - } 180 - 181 /* 182 - * We just exited lazy mode, which means that CR4 and/or LDTR 183 - * may be stale. (Changes to the required CR4 and LDTR states 184 - * are not reflected in tlb_gen.) 
185 */ 186 } else { 187 u16 new_asid; 188 bool need_flush; ··· 180 } 181 182 /* Stop remote flushes for the previous mm */ 183 - if (cpumask_test_cpu(cpu, mm_cpumask(real_prev))) 184 - cpumask_clear_cpu(cpu, mm_cpumask(real_prev)); 185 - 186 - VM_WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next))); 187 188 /* 189 * Start remote flushes and then read tlb_gen. ··· 210 211 load_mm_cr4(next); 212 switch_ldt(real_prev, next); 213 } 214 215 /* ··· 314 /* This code cannot presently handle being reentered. */ 315 VM_WARN_ON(!irqs_disabled()); 316 317 VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) != 318 loaded_mm->context.ctx_id); 319 320 - if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm))) { 321 /* 322 - * We're in lazy mode -- don't flush. We can get here on 323 - * remote flushes due to races and on local flushes if a 324 - * kernel thread coincidentally flushes the mm it's lazily 325 - * still using. 326 */ 327 return; 328 } 329 ··· 626 return 0; 627 } 628 late_initcall(create_tlb_single_page_flush_ceiling);
··· 30 31 atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1); 32 33 + DEFINE_STATIC_KEY_TRUE(tlb_use_lazy_mode); 34 + 35 static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen, 36 u16 *new_asid, bool *need_flush) 37 { ··· 80 return; 81 82 /* Warn if we're not lazy. */ 83 + WARN_ON(!this_cpu_read(cpu_tlbstate.is_lazy)); 84 85 switch_mm(NULL, &init_mm, NULL); 86 } ··· 142 __flush_tlb_all(); 143 } 144 #endif 145 + this_cpu_write(cpu_tlbstate.is_lazy, false); 146 147 if (real_prev == next) { 148 VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) != 149 next->context.ctx_id); 150 151 /* 152 + * We don't currently support having a real mm loaded without 153 + * our cpu set in mm_cpumask(). We have all the bookkeeping 154 + * in place to figure out whether we would need to flush 155 + * if our cpu were cleared in mm_cpumask(), but we don't 156 + * currently use it. 157 */ 158 + if (WARN_ON_ONCE(real_prev != &init_mm && 159 + !cpumask_test_cpu(cpu, mm_cpumask(next)))) 160 + cpumask_set_cpu(cpu, mm_cpumask(next)); 161 + 162 + return; 163 } else { 164 u16 new_asid; 165 bool need_flush; ··· 199 } 200 201 /* Stop remote flushes for the previous mm */ 202 + VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) && 203 + real_prev != &init_mm); 204 + cpumask_clear_cpu(cpu, mm_cpumask(real_prev)); 205 206 /* 207 * Start remote flushes and then read tlb_gen. ··· 230 231 load_mm_cr4(next); 232 switch_ldt(real_prev, next); 233 + } 234 + 235 + /* 236 + * enter_lazy_tlb() is a hint from the scheduler that we are entering a 237 + * kernel thread or other context without an mm. Acceptable implementations 238 + * include doing nothing whatsoever, switching to init_mm, or various clever 239 + * lazy tricks to try to minimize TLB flushes. 240 + * 241 + * The scheduler reserves the right to call enter_lazy_tlb() several times 242 + * in a row. It will notify us that we're going back to a real mm by 243 + * calling switch_mm_irqs_off(). 
244 + */ 245 + void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) 246 + { 247 + if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm) 248 + return; 249 + 250 + if (static_branch_unlikely(&tlb_use_lazy_mode)) { 251 + /* 252 + * There's a significant optimization that may be possible 253 + * here. We have accurate enough TLB flush tracking that we 254 + * don't need to maintain coherence of TLB per se when we're 255 + * lazy. We do, however, need to maintain coherence of 256 + * paging-structure caches. We could, in principle, leave our 257 + * old mm loaded and only switch to init_mm when 258 + * tlb_remove_page() happens. 259 + */ 260 + this_cpu_write(cpu_tlbstate.is_lazy, true); 261 + } else { 262 + switch_mm(NULL, &init_mm, NULL); 263 + } 264 } 265 266 /* ··· 303 /* This code cannot presently handle being reentered. */ 304 VM_WARN_ON(!irqs_disabled()); 305 306 + if (unlikely(loaded_mm == &init_mm)) 307 + return; 308 + 309 VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) != 310 loaded_mm->context.ctx_id); 311 312 + if (this_cpu_read(cpu_tlbstate.is_lazy)) { 313 /* 314 + * We're in lazy mode. We need to at least flush our 315 + * paging-structure cache to avoid speculatively reading 316 + * garbage into our TLB. Since switching to init_mm is barely 317 + * slower than a minimal flush, just switch to init_mm. 318 */ 319 + switch_mm_irqs_off(NULL, &init_mm, NULL); 320 return; 321 } 322 ··· 611 return 0; 612 } 613 late_initcall(create_tlb_single_page_flush_ceiling); 614 + 615 + static ssize_t tlblazy_read_file(struct file *file, char __user *user_buf, 616 + size_t count, loff_t *ppos) 617 + { 618 + char buf[2]; 619 + 620 + buf[0] = static_branch_likely(&tlb_use_lazy_mode) ? 
'1' : '0'; 621 + buf[1] = '\n'; 622 + 623 + return simple_read_from_buffer(user_buf, count, ppos, buf, 2); 624 + } 625 + 626 + static ssize_t tlblazy_write_file(struct file *file, 627 + const char __user *user_buf, size_t count, loff_t *ppos) 628 + { 629 + bool val; 630 + 631 + if (kstrtobool_from_user(user_buf, count, &val)) 632 + return -EINVAL; 633 + 634 + if (val) 635 + static_branch_enable(&tlb_use_lazy_mode); 636 + else 637 + static_branch_disable(&tlb_use_lazy_mode); 638 + 639 + return count; 640 + } 641 + 642 + static const struct file_operations fops_tlblazy = { 643 + .read = tlblazy_read_file, 644 + .write = tlblazy_write_file, 645 + .llseek = default_llseek, 646 + }; 647 + 648 + static int __init init_tlb_use_lazy_mode(void) 649 + { 650 + if (boot_cpu_has(X86_FEATURE_PCID)) { 651 + /* 652 + * Heuristic: with PCID on, switching to and from 653 + * init_mm is reasonably fast, but remote flush IPIs 654 + * as expensive as ever, so turn off lazy TLB mode. 655 + * 656 + * We can't do this in setup_pcid() because static keys 657 + * haven't been initialized yet, and it would blow up 658 + * badly. 659 + */ 660 + static_branch_disable(&tlb_use_lazy_mode); 661 + } 662 + 663 + debugfs_create_file("tlb_use_lazy_mode", S_IRUSR | S_IWUSR, 664 + arch_debugfs_dir, NULL, &fops_tlblazy); 665 + return 0; 666 + } 667 + late_initcall(init_tlb_use_lazy_mode);