Merge branch 'x86-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 fixes from Ingo Molnar:
"A landry list of fixes:

 - fix reboot breakage on some PCID-enabled systems

- fix crashes/hangs on some PCID-enabled systems

- fix microcode loading on certain older CPUs

- various unwinder fixes

 - extend an APIC quirk to more hardware systems and disable an
   APIC-related warning on virtualized systems

- various Hyper-V fixes

- a macro definition robustness fix

- remove jprobes IRQ disabling

- various mem-encryption fixes"

* 'x86-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
x86/microcode: Do the family check first
x86/mm: Flush more aggressively in lazy TLB mode
x86/apic: Update TSC_DEADLINE quirk with additional SKX stepping
x86/apic: Silence "FW_BUG TSC_DEADLINE disabled due to Errata" on hypervisors
x86/mm: Disable various instrumentations of mm/mem_encrypt.c and mm/tlb.c
x86/hyperv: Fix hypercalls with extended CPU ranges for TLB flushing
x86/hyperv: Don't use percpu areas for pcpu_flush/pcpu_flush_ex structures
x86/hyperv: Clear vCPU banks between calls to avoid flushing unneeded vCPUs
x86/unwind: Disable unwinder warnings on 32-bit
x86/unwind: Align stack pointer in unwinder dump
x86/unwind: Use MSB for frame pointer encoding on 32-bit
x86/unwind: Fix dereference of untrusted pointer
x86/alternatives: Fix alt_max_short macro to really be a max()
x86/mm/64: Fix reboot interaction with CR4.PCIDE
kprobes/x86: Remove IRQ disabling from jprobe handlers
kprobes/x86: Set up frame pointer in kprobe trampoline

Changed files: +284 -88

arch/x86/entry/entry_32.S (+2 -2)

···
 /*
  * This is a sneaky trick to help the unwinder find pt_regs on the stack. The
  * frame pointer is replaced with an encoded pointer to pt_regs. The encoding
- * is just setting the LSB, which makes it an invalid stack address and is also
+ * is just clearing the MSB, which makes it an invalid stack address and is also
  * a signal to the unwinder that it's a pt_regs pointer in disguise.
  *
  * NOTE: This macro must be used *after* SAVE_ALL because it corrupts the
···
 .macro ENCODE_FRAME_POINTER
 #ifdef CONFIG_FRAME_POINTER
         mov %esp, %ebp
-        orl $0x1, %ebp
+        andl $0x7fffffff, %ebp
 #endif
 .endm
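
As an aside, the new 32-bit encoding is easy to sanity-check outside the kernel. The sketch below is a user-space illustration only; the function names and the sample address are made up, and it assumes 32-bit kernel stack addresses have the top bit set, which is what makes the cleared MSB an "invalid stack address".

/*
 * Minimal user-space sketch of the 32-bit encoding described above:
 * ENCODE_FRAME_POINTER clears the MSB (a kernel stack address has it
 * set), and the unwinder restores it to recover the pt_regs pointer.
 * Illustrative only, not kernel code.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t encode_frame_pointer(uint32_t regs_addr)
{
        return regs_addr & 0x7fffffff;  /* andl $0x7fffffff, %ebp */
}

static uint32_t decode_frame_pointer(uint32_t bp)
{
        if (bp & 0x80000000)
                return 0;               /* a real frame pointer, not pt_regs */
        return bp | 0x80000000;         /* recover the original address */
}

int main(void)
{
        uint32_t pt_regs_addr = 0xc1234560;     /* hypothetical kernel address */
        uint32_t bp = encode_frame_pointer(pt_regs_addr);

        assert(decode_frame_pointer(bp) == pt_regs_addr);
        printf("encoded %#x -> %#x -> decoded %#x\n",
               (unsigned int)pt_regs_addr, (unsigned int)bp,
               (unsigned int)decode_frame_pointer(bp));
        return 0;
}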

arch/x86/hyperv/hv_init.c (+5)

···
 u32 *hv_vp_index;
 EXPORT_SYMBOL_GPL(hv_vp_index);
 
+u32 hv_max_vp_index;
+
 static int hv_cpu_init(unsigned int cpu)
 {
         u64 msr_vp_index;
···
         hv_get_vp_index(msr_vp_index);
 
         hv_vp_index[smp_processor_id()] = msr_vp_index;
+
+        if (msr_vp_index > hv_max_vp_index)
+                hv_max_vp_index = msr_vp_index;
 
         return 0;
 }

arch/x86/hyperv/mmu.c (+43 -14)

···
 /* Each gva in gva_list encodes up to 4096 pages to flush */
 #define HV_TLB_FLUSH_UNIT (4096 * PAGE_SIZE)
 
-static struct hv_flush_pcpu __percpu *pcpu_flush;
+static struct hv_flush_pcpu __percpu **pcpu_flush;
 
-static struct hv_flush_pcpu_ex __percpu *pcpu_flush_ex;
+static struct hv_flush_pcpu_ex __percpu **pcpu_flush_ex;
 
 /*
  * Fills in gva_list starting from offset. Returns the number of items added.
···
 {
         int cpu, vcpu, vcpu_bank, vcpu_offset, nr_bank = 1;
 
+        /* valid_bank_mask can represent up to 64 banks */
+        if (hv_max_vp_index / 64 >= 64)
+                return 0;
+
+        /*
+         * Clear all banks up to the maximum possible bank as hv_flush_pcpu_ex
+         * structs are not cleared between calls, we risk flushing unneeded
+         * vCPUs otherwise.
+         */
+        for (vcpu_bank = 0; vcpu_bank <= hv_max_vp_index / 64; vcpu_bank++)
+                flush->hv_vp_set.bank_contents[vcpu_bank] = 0;
+
         /*
          * Some banks may end up being empty but this is acceptable.
          */
···
                 vcpu = hv_cpu_number_to_vp_number(cpu);
                 vcpu_bank = vcpu / 64;
                 vcpu_offset = vcpu % 64;
-
-                /* valid_bank_mask can represent up to 64 banks */
-                if (vcpu_bank >= 64)
-                        return 0;
-
                 __set_bit(vcpu_offset, (unsigned long *)
                           &flush->hv_vp_set.bank_contents[vcpu_bank]);
                 if (vcpu_bank >= nr_bank)
···
                              const struct flush_tlb_info *info)
 {
         int cpu, vcpu, gva_n, max_gvas;
+        struct hv_flush_pcpu **flush_pcpu;
         struct hv_flush_pcpu *flush;
         u64 status = U64_MAX;
         unsigned long flags;
···
 
         local_irq_save(flags);
 
-        flush = this_cpu_ptr(pcpu_flush);
+        flush_pcpu = this_cpu_ptr(pcpu_flush);
+
+        if (unlikely(!*flush_pcpu))
+                *flush_pcpu = page_address(alloc_page(GFP_ATOMIC));
+
+        flush = *flush_pcpu;
+
+        if (unlikely(!flush)) {
+                local_irq_restore(flags);
+                goto do_native;
+        }
 
         if (info->mm) {
                 flush->address_space = virt_to_phys(info->mm->pgd);
···
                                 const struct flush_tlb_info *info)
 {
         int nr_bank = 0, max_gvas, gva_n;
+        struct hv_flush_pcpu_ex **flush_pcpu;
         struct hv_flush_pcpu_ex *flush;
         u64 status = U64_MAX;
         unsigned long flags;
···
 
         local_irq_save(flags);
 
-        flush = this_cpu_ptr(pcpu_flush_ex);
+        flush_pcpu = this_cpu_ptr(pcpu_flush_ex);
+
+        if (unlikely(!*flush_pcpu))
+                *flush_pcpu = page_address(alloc_page(GFP_ATOMIC));
+
+        flush = *flush_pcpu;
+
+        if (unlikely(!flush)) {
+                local_irq_restore(flags);
+                goto do_native;
+        }
 
         if (info->mm) {
                 flush->address_space = virt_to_phys(info->mm->pgd);
···
                 flush->flags |= HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY;
                 status = hv_do_rep_hypercall(
                         HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX,
-                        0, nr_bank + 2, flush, NULL);
+                        0, nr_bank, flush, NULL);
         } else if (info->end &&
                    ((info->end - info->start)/HV_TLB_FLUSH_UNIT) > max_gvas) {
                 status = hv_do_rep_hypercall(
                         HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX,
-                        0, nr_bank + 2, flush, NULL);
+                        0, nr_bank, flush, NULL);
         } else {
                 gva_n = fill_gva_list(flush->gva_list, nr_bank,
                                       info->start, info->end);
                 status = hv_do_rep_hypercall(
                         HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX,
-                        gva_n, nr_bank + 2, flush, NULL);
+                        gva_n, nr_bank, flush, NULL);
         }
 
         local_irq_restore(flags);
···
                 return;
 
         if (!(ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED))
-                pcpu_flush = __alloc_percpu(PAGE_SIZE, PAGE_SIZE);
+                pcpu_flush = alloc_percpu(struct hv_flush_pcpu *);
         else
-                pcpu_flush_ex = __alloc_percpu(PAGE_SIZE, PAGE_SIZE);
+                pcpu_flush_ex = alloc_percpu(struct hv_flush_pcpu_ex *);
 }
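
To make the "clear vCPU banks between calls" reasoning concrete, here is a small user-space sketch. The names and sizes are simplified assumptions, not the kernel's hv_flush_pcpu_ex layout: a bit left over in bank_contents[] from a previous flush would otherwise cause an unrelated vCPU to be flushed again.

/*
 * Illustrative sketch of the stale-bank problem fixed above.  Bank
 * layout mirrors the hypercall ABI: vCPU n lives in bank n / 64,
 * bit n % 64.  Simplified, not the kernel's data structures.
 */
#include <stdint.h>
#include <stdio.h>

#define MAX_BANKS 64

static uint64_t bank_contents[MAX_BANKS];  /* stands in for hv_vp_set.bank_contents */

static void mark_vcpu(unsigned int vcpu)
{
        bank_contents[vcpu / 64] |= 1ULL << (vcpu % 64);
}

int main(void)
{
        unsigned int hv_max_vp_index = 130;  /* assumed highest VP number seen */
        unsigned int bank;

        mark_vcpu(67);  /* leftover bit from a previous flush request */

        /* The fix: clear every bank that could be valid before reuse. */
        for (bank = 0; bank <= hv_max_vp_index / 64; bank++)
                bank_contents[bank] = 0;

        mark_vcpu(5);   /* the only vCPU this call actually wants to flush */

        for (bank = 0; bank <= hv_max_vp_index / 64; bank++)
                printf("bank %u: %#llx\n", bank,
                       (unsigned long long)bank_contents[bank]);
        return 0;
}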

arch/x86/include/asm/alternative-asm.h (+3 -1)

···
 #define new_len2 145f-144f
 
 /*
- * max without conditionals. Idea adapted from:
+ * gas compatible max based on the idea from:
  * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax
+ *
+ * The additional "-" is needed because gas uses a "true" value of -1.
  */
 #define alt_max_short(a, b) ((a) ^ (((a) ^ (b)) & -(-((a) < (b)))))

arch/x86/include/asm/alternative.h (+3 -3)

···
         alt_end_marker ":\n"
 
 /*
- * max without conditionals. Idea adapted from:
+ * gas compatible max based on the idea from:
  * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax
  *
- * The additional "-" is needed because gas works with s32s.
+ * The additional "-" is needed because gas uses a "true" value of -1.
  */
-#define alt_max_short(a, b) "((" a ") ^ (((" a ") ^ (" b ")) & -(-((" a ") - (" b ")))))"
+#define alt_max_short(a, b) "((" a ") ^ (((" a ") ^ (" b ")) & -(-((" a ") < (" b ")))))"
 
 /*
  * Pad the second replacement alternative with additional NOPs if it is
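
The alt_max_short() comments above are terse, so here is a stand-alone C sketch of the same branchless max identity. In C a true comparison yields 1, so a single negation builds the all-ones mask; gas yields -1 for true, which is why the assembler versions need the extra "-". This is illustrative only and not part of the patch.

/* Sketch of the branchless max used by alt_max_short(). */
#include <assert.h>
#include <stdio.h>

static unsigned int branchless_max(unsigned int a, unsigned int b)
{
        /* If a < b, the mask is all ones and a ^ (a ^ b) == b; else it is a. */
        return a ^ ((a ^ b) & -(unsigned int)(a < b));
}

int main(void)
{
        unsigned int a, b;

        for (a = 0; a < 200; a++)
                for (b = 0; b < 200; b++)
                        assert(branchless_max(a, b) == (a > b ? a : b));

        printf("branchless_max(3, 7) = %u\n", branchless_max(3, 7));
        return 0;
}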

arch/x86/include/asm/mmu_context.h (+1 -7)

···
         DEBUG_LOCKS_WARN_ON(preemptible());
 }
 
-static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
-{
-        int cpu = smp_processor_id();
-
-        if (cpumask_test_cpu(cpu, mm_cpumask(mm)))
-                cpumask_clear_cpu(cpu, mm_cpumask(mm));
-}
+void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk);
 
 static inline int init_new_context(struct task_struct *tsk,
                                    struct mm_struct *mm)

arch/x86/include/asm/mshyperv.h (+1)

···
  * to this information.
  */
 extern u32 *hv_vp_index;
+extern u32 hv_max_vp_index;
 
 /**
  * hv_cpu_number_to_vp_number() - Map CPU to VP.

arch/x86/include/asm/tlbflush.h (+24)

···
 #endif
 
 /*
+ * If tlb_use_lazy_mode is true, then we try to avoid switching CR3 to point
+ * to init_mm when we switch to a kernel thread (e.g. the idle thread).  If
+ * it's false, then we immediately switch CR3 when entering a kernel thread.
+ */
+DECLARE_STATIC_KEY_TRUE(tlb_use_lazy_mode);
+
+/*
  * 6 because 6 should be plenty and struct tlb_state will fit in
  * two cache lines.
  */
···
         struct mm_struct *loaded_mm;
         u16 loaded_mm_asid;
         u16 next_asid;
+
+        /*
+         * We can be in one of several states:
+         *
+         *  - Actively using an mm.  Our CPU's bit will be set in
+         *    mm_cpumask(loaded_mm) and is_lazy == false;
+         *
+         *  - Not using a real mm.  loaded_mm == &init_mm.  Our CPU's bit
+         *    will not be set in mm_cpumask(&init_mm) and is_lazy == false.
+         *
+         *  - Lazily using a real mm.  loaded_mm != &init_mm, our bit
+         *    is set in mm_cpumask(loaded_mm), but is_lazy == true.
+         *    We're heuristically guessing that the CR3 load we
+         *    skipped more than makes up for the overhead added by
+         *    lazy mode.
+         */
+        bool is_lazy;
 
         /*
          * Access to this CR4 shadow and to H/W CR4 is protected by

arch/x86/kernel/apic/apic.c (+13 -2)

···
         return ~0U;
 }
 
+static u32 skx_deadline_rev(void)
+{
+        switch (boot_cpu_data.x86_mask) {
+        case 0x03: return 0x01000136;
+        case 0x04: return 0x02000014;
+        }
+
+        return ~0U;
+}
+
 static const struct x86_cpu_id deadline_match[] = {
         DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_HASWELL_X, hsx_deadline_rev),
         DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_BROADWELL_X, 0x0b000020),
         DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_BROADWELL_XEON_D, bdx_deadline_rev),
-        DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_SKYLAKE_X, 0x02000014),
+        DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_SKYLAKE_X, skx_deadline_rev),
 
         DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_CORE, 0x22),
         DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_ULT, 0x20),
···
         const struct x86_cpu_id *m;
         u32 rev;
 
-        if (!boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER))
+        if (!boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER) ||
+            boot_cpu_has(X86_FEATURE_HYPERVISOR))
                 return;
 
         m = x86_match_cpu(deadline_match);

arch/x86/kernel/cpu/microcode/core.c (+18 -9)

···
         bool *res = &dis_ucode_ldr;
 #endif
 
-        if (!have_cpuid_p())
-                return *res;
-
         /*
          * CPUID(1).ECX[31]: reserved for hypervisor use. This is still not
          * completely accurate as xen pv guests don't see that CPUID bit set but
···
 void __init load_ucode_bsp(void)
 {
         unsigned int cpuid_1_eax;
+        bool intel = true;
 
-        if (check_loader_disabled_bsp())
+        if (!have_cpuid_p())
                 return;
 
         cpuid_1_eax = native_cpuid_eax(1);
 
         switch (x86_cpuid_vendor()) {
         case X86_VENDOR_INTEL:
-                if (x86_family(cpuid_1_eax) >= 6)
-                        load_ucode_intel_bsp();
+                if (x86_family(cpuid_1_eax) < 6)
+                        return;
                 break;
+
         case X86_VENDOR_AMD:
-                if (x86_family(cpuid_1_eax) >= 0x10)
-                        load_ucode_amd_bsp(cpuid_1_eax);
+                if (x86_family(cpuid_1_eax) < 0x10)
+                        return;
+                intel = false;
                 break;
+
         default:
-                break;
+                return;
         }
+
+        if (check_loader_disabled_bsp())
+                return;
+
+        if (intel)
+                load_ucode_intel_bsp();
+        else
+                load_ucode_amd_bsp(cpuid_1_eax);
 }
 
 static bool check_loader_disabled_ap(void)

arch/x86/kernel/kprobes/common.h (+11 -2)

···
 
 /* Kprobes and Optprobes common header */
 
+#include <asm/asm.h>
+
+#ifdef CONFIG_FRAME_POINTER
+# define SAVE_RBP_STRING "        push %" _ASM_BP "\n" \
+                         "        mov  %" _ASM_SP ", %" _ASM_BP "\n"
+#else
+# define SAVE_RBP_STRING "        push %" _ASM_BP "\n"
+#endif
+
 #ifdef CONFIG_X86_64
 #define SAVE_REGS_STRING                        \
         /* Skip cs, ip, orig_ax. */             \
···
         "        pushq %r10\n"                  \
         "        pushq %r11\n"                  \
         "        pushq %rbx\n"                  \
-        "        pushq %rbp\n"                  \
+        SAVE_RBP_STRING                         \
         "        pushq %r12\n"                  \
         "        pushq %r13\n"                  \
         "        pushq %r14\n"                  \
···
         "        pushl %es\n"                   \
         "        pushl %ds\n"                   \
         "        pushl %eax\n"                  \
-        "        pushl %ebp\n"                  \
+        SAVE_RBP_STRING                         \
         "        pushl %edi\n"                  \
         "        pushl %esi\n"                  \
         "        pushl %edx\n"                  \

arch/x86/kernel/kprobes/core.c (-2)

···
          * raw stack chunk with redzones:
          */
         __memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr, MIN_STACK_SIZE(addr));
-        regs->flags &= ~X86_EFLAGS_IF;
-        trace_hardirqs_off();
         regs->ip = (unsigned long)(jp->entry);
 
         /*

arch/x86/kernel/reboot.c (+4)

···
         load_cr3(initial_page_table);
 #else
         write_cr3(real_mode_header->trampoline_pgd);
+
+        /* Exiting long mode will fail if CR4.PCIDE is set. */
+        if (static_cpu_has(X86_FEATURE_PCID))
+                cr4_clear_bits(X86_CR4_PCIDE);
 #endif
 
         /* Jump to the identity-mapped low memory code */

arch/x86/kernel/unwind_frame.c (+36 -2)

···
                         state->stack_info.type, state->stack_info.next_sp,
                         state->stack_mask, state->graph_idx);
 
-        for (sp = state->orig_sp; sp; sp = PTR_ALIGN(stack_info.next_sp, sizeof(long))) {
+        for (sp = PTR_ALIGN(state->orig_sp, sizeof(long)); sp;
+             sp = PTR_ALIGN(stack_info.next_sp, sizeof(long))) {
                 if (get_stack_info(sp, state->task, &stack_info, &visit_mask))
                         break;
 
···
  * This determines if the frame pointer actually contains an encoded pointer to
  * pt_regs on the stack. See ENCODE_FRAME_POINTER.
  */
+#ifdef CONFIG_X86_64
 static struct pt_regs *decode_frame_pointer(unsigned long *bp)
 {
         unsigned long regs = (unsigned long)bp;
···
 
         return (struct pt_regs *)(regs & ~0x1);
 }
+#else
+static struct pt_regs *decode_frame_pointer(unsigned long *bp)
+{
+        unsigned long regs = (unsigned long)bp;
+
+        if (regs & 0x80000000)
+                return NULL;
+
+        return (struct pt_regs *)(regs | 0x80000000);
+}
+#endif
+
+#ifdef CONFIG_X86_32
+#define KERNEL_REGS_SIZE (sizeof(struct pt_regs) - 2*sizeof(long))
+#else
+#define KERNEL_REGS_SIZE (sizeof(struct pt_regs))
+#endif
 
 static bool update_stack_state(struct unwind_state *state,
                                unsigned long *next_bp)
···
         regs = decode_frame_pointer(next_bp);
         if (regs) {
                 frame = (unsigned long *)regs;
-                len = regs_size(regs);
+                len = KERNEL_REGS_SIZE;
                 state->got_irq = true;
         } else {
                 frame = next_bp;
···
         /* Make sure it only unwinds up and doesn't overlap the prev frame: */
         if (state->orig_sp && state->stack_info.type == prev_type &&
             frame < prev_frame_end)
+                return false;
+
+        /*
+         * On 32-bit with user mode regs, make sure the last two regs are safe
+         * to access:
+         */
+        if (IS_ENABLED(CONFIG_X86_32) && regs && user_mode(regs) &&
+            !on_stack(info, frame, len + 2*sizeof(long)))
                 return false;
 
         /* Move state to the next frame: */
···
         if (state->regs &&
             state->regs->sp >= (unsigned long)last_aligned_frame(state) &&
             state->regs->sp < (unsigned long)task_pt_regs(state->task))
+                goto the_end;
+
+        /*
+         * There are some known frame pointer issues on 32-bit.  Disable
+         * unwinder warnings on 32-bit until it gets objtool support.
+         */
+        if (IS_ENABLED(CONFIG_X86_32))
                 goto the_end;
 
         if (state->regs) {

arch/x86/mm/Makefile (+9 -2)

···
-# Kernel does not boot with instrumentation of tlb.c.
-KCOV_INSTRUMENT_tlb.o := n
+# Kernel does not boot with instrumentation of tlb.c and mem_encrypt.c
+KCOV_INSTRUMENT_tlb.o           := n
+KCOV_INSTRUMENT_mem_encrypt.o   := n
+
+KASAN_SANITIZE_mem_encrypt.o    := n
+
+ifdef CONFIG_FUNCTION_TRACER
+CFLAGS_REMOVE_mem_encrypt.o     = -pg
+endif
 
 obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
          pat.o pgtable.o physaddr.o setup_nx.o tlb.o

arch/x86/mm/tlb.c (+111 -42)

···
 
 atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);
 
+DEFINE_STATIC_KEY_TRUE(tlb_use_lazy_mode);
+
 static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
                             u16 *new_asid, bool *need_flush)
 {
···
                 return;
 
         /* Warn if we're not lazy. */
-        WARN_ON(cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm)));
+        WARN_ON(!this_cpu_read(cpu_tlbstate.is_lazy));
 
         switch_mm(NULL, &init_mm, NULL);
 }
···
                 __flush_tlb_all();
         }
 #endif
+        this_cpu_write(cpu_tlbstate.is_lazy, false);
 
         if (real_prev == next) {
                 VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
                           next->context.ctx_id);
 
-                if (cpumask_test_cpu(cpu, mm_cpumask(next))) {
-                        /*
-                         * There's nothing to do: we weren't lazy, and we
-                         * aren't changing our mm.  We don't need to flush
-                         * anything, nor do we need to update CR3, CR4, or
-                         * LDTR.
-                         */
-                        return;
-                }
-
-                /* Resume remote flushes and then read tlb_gen. */
-                cpumask_set_cpu(cpu, mm_cpumask(next));
-                next_tlb_gen = atomic64_read(&next->context.tlb_gen);
-
-                if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) <
-                    next_tlb_gen) {
-                        /*
-                         * Ideally, we'd have a flush_tlb() variant that
-                         * takes the known CR3 value as input.  This would
-                         * be faster on Xen PV and on hypothetical CPUs
-                         * on which INVPCID is fast.
-                         */
-                        this_cpu_write(cpu_tlbstate.ctxs[prev_asid].tlb_gen,
-                                       next_tlb_gen);
-                        write_cr3(build_cr3(next, prev_asid));
-                        trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH,
-                                        TLB_FLUSH_ALL);
-                }
-
                 /*
-                 * We just exited lazy mode, which means that CR4 and/or LDTR
-                 * may be stale.  (Changes to the required CR4 and LDTR states
-                 * are not reflected in tlb_gen.)
+                 * We don't currently support having a real mm loaded without
+                 * our cpu set in mm_cpumask().  We have all the bookkeeping
+                 * in place to figure out whether we would need to flush
+                 * if our cpu were cleared in mm_cpumask(), but we don't
+                 * currently use it.
                  */
+                if (WARN_ON_ONCE(real_prev != &init_mm &&
+                                 !cpumask_test_cpu(cpu, mm_cpumask(next))))
+                        cpumask_set_cpu(cpu, mm_cpumask(next));
+
+                return;
         } else {
                 u16 new_asid;
                 bool need_flush;
···
                 }
 
                 /* Stop remote flushes for the previous mm */
-                if (cpumask_test_cpu(cpu, mm_cpumask(real_prev)))
-                        cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
-
-                VM_WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next)));
+                VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) &&
+                                real_prev != &init_mm);
+                cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
 
                 /*
                  * Start remote flushes and then read tlb_gen.
···
 
         load_mm_cr4(next);
         switch_ldt(real_prev, next);
+}
+
+/*
+ * enter_lazy_tlb() is a hint from the scheduler that we are entering a
+ * kernel thread or other context without an mm.  Acceptable implementations
+ * include doing nothing whatsoever, switching to init_mm, or various clever
+ * lazy tricks to try to minimize TLB flushes.
+ *
+ * The scheduler reserves the right to call enter_lazy_tlb() several times
+ * in a row.  It will notify us that we're going back to a real mm by
+ * calling switch_mm_irqs_off().
+ */
+void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
+{
+        if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
+                return;
+
+        if (static_branch_unlikely(&tlb_use_lazy_mode)) {
+                /*
+                 * There's a significant optimization that may be possible
+                 * here.  We have accurate enough TLB flush tracking that we
+                 * don't need to maintain coherence of TLB per se when we're
+                 * lazy.  We do, however, need to maintain coherence of
+                 * paging-structure caches.  We could, in principle, leave our
+                 * old mm loaded and only switch to init_mm when
+                 * tlb_remove_page() happens.
+                 */
+                this_cpu_write(cpu_tlbstate.is_lazy, true);
+        } else {
+                switch_mm(NULL, &init_mm, NULL);
+        }
 }
 
 /*
···
         /* This code cannot presently handle being reentered. */
         VM_WARN_ON(!irqs_disabled());
 
+        if (unlikely(loaded_mm == &init_mm))
+                return;
+
         VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) !=
                    loaded_mm->context.ctx_id);
 
-        if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm))) {
+        if (this_cpu_read(cpu_tlbstate.is_lazy)) {
                 /*
-                 * We're in lazy mode -- don't flush.  We can get here on
-                 * remote flushes due to races and on local flushes if a
-                 * kernel thread coincidentally flushes the mm it's lazily
-                 * still using.
+                 * We're in lazy mode.  We need to at least flush our
+                 * paging-structure cache to avoid speculatively reading
+                 * garbage into our TLB.  Since switching to init_mm is barely
+                 * slower than a minimal flush, just switch to init_mm.
                  */
+                switch_mm_irqs_off(NULL, &init_mm, NULL);
                 return;
         }
 
···
         return 0;
 }
 late_initcall(create_tlb_single_page_flush_ceiling);
+
+static ssize_t tlblazy_read_file(struct file *file, char __user *user_buf,
+                                 size_t count, loff_t *ppos)
+{
+        char buf[2];
+
+        buf[0] = static_branch_likely(&tlb_use_lazy_mode) ? '1' : '0';
+        buf[1] = '\n';
+
+        return simple_read_from_buffer(user_buf, count, ppos, buf, 2);
+}
+
+static ssize_t tlblazy_write_file(struct file *file,
+                 const char __user *user_buf, size_t count, loff_t *ppos)
+{
+        bool val;
+
+        if (kstrtobool_from_user(user_buf, count, &val))
+                return -EINVAL;
+
+        if (val)
+                static_branch_enable(&tlb_use_lazy_mode);
+        else
+                static_branch_disable(&tlb_use_lazy_mode);
+
+        return count;
+}
+
+static const struct file_operations fops_tlblazy = {
+        .read = tlblazy_read_file,
+        .write = tlblazy_write_file,
+        .llseek = default_llseek,
+};
+
+static int __init init_tlb_use_lazy_mode(void)
+{
+        if (boot_cpu_has(X86_FEATURE_PCID)) {
+                /*
+                 * Heuristic: with PCID on, switching to and from
+                 * init_mm is reasonably fast, but remote flush IPIs
+                 * as expensive as ever, so turn off lazy TLB mode.
+                 *
+                 * We can't do this in setup_pcid() because static keys
+                 * haven't been initialized yet, and it would blow up
+                 * badly.
+                 */
+                static_branch_disable(&tlb_use_lazy_mode);
+        }
+
+        debugfs_create_file("tlb_use_lazy_mode", S_IRUSR | S_IWUSR,
+                            arch_debugfs_dir, NULL, &fops_tlblazy);
+        return 0;
+}
+late_initcall(init_tlb_use_lazy_mode);
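
For completeness, the new tlb_use_lazy_mode knob can be exercised from user space roughly as follows. This is only a usage sketch: it assumes debugfs is mounted at /sys/kernel/debug (arch_debugfs_dir is the x86 directory there) and that the process runs as root.

/* Read and toggle the tlb_use_lazy_mode debugfs knob added above. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

#define KNOB "/sys/kernel/debug/x86/tlb_use_lazy_mode"

int main(void)
{
        char buf[4] = { 0 };
        int fd = open(KNOB, O_RDWR);

        if (fd < 0) {
                perror("open " KNOB);
                return 1;
        }

        if (read(fd, buf, sizeof(buf) - 1) > 0)
                printf("tlb_use_lazy_mode = %s", buf);

        /* Turn lazy TLB mode off; write "1" to turn it back on. */
        if (pwrite(fd, "0\n", 2, 0) != 2)
                perror("write " KNOB);

        close(fd);
        return 0;
}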