x86/mm/64: Initialize CR4.PCIDE early

cpu_init() is weird: it's called rather late (after early
identification and after most MMU state is initialized) on the boot
CPU but is called extremely early (before identification) on secondary
CPUs. It's called just late enough on the boot CPU that its CR4 value
isn't propagated to mmu_cr4_features.

Even if we put CR4.PCIDE into mmu_cr4_features, we'd hit two
problems. First, we'd crash in the trampoline code. That's
fixable, and I tried that. It turns out that mmu_cr4_features is
totally ignored by secondary_startup_64(), though, so even with the
trampoline code fixed, it wouldn't help.

This means that we don't currently have CR4.PCIDE reliably initialized
before we start playing with cpu_tlbstate. This is very fragile and
tends to cause boot failures if I make even small changes to the TLB
handling code.

Make it more robust: initialize CR4.PCIDE earlier on the boot CPU
and propagate it to secondary CPUs in start_secondary().

( Yes, this is ugly. I think we should have improved mmu_cr4_features
to actually control CR4 during secondary bootup, but that would be
fairly intrusive at this stage. )

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reported-by: Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>
Tested-by: Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Fixes: 660da7c9228f ("x86/mm: Enable CR4.PCIDE on supported systems")
Signed-off-by: Ingo Molnar <mingo@kernel.org>

authored by

Andy Lutomirski and committed by
Ingo Molnar
c7ad5ad2 f34902c5

+50 -46
+7 -42
arch/x86/kernel/cpu/common.c
··· 169 __setup("nompx", x86_mpx_setup); 170 171 #ifdef CONFIG_X86_64 172 - static int __init x86_pcid_setup(char *s) 173 { 174 - /* require an exact match without trailing characters */ 175 - if (strlen(s)) 176 - return 0; 177 178 /* do not emit a message if the feature is not present */ 179 if (!boot_cpu_has(X86_FEATURE_PCID)) 180 - return 1; 181 182 setup_clear_cpu_cap(X86_FEATURE_PCID); 183 pr_info("nopcid: PCID feature disabled\n"); 184 - return 1; 185 } 186 - __setup("nopcid", x86_pcid_setup); 187 #endif 188 189 static int __init x86_noinvpcid_setup(char *s) ··· 326 #else 327 cr4_clear_bits(X86_CR4_SMAP); 328 #endif 329 - } 330 - } 331 - 332 - static void setup_pcid(struct cpuinfo_x86 *c) 333 - { 334 - if (cpu_has(c, X86_FEATURE_PCID)) { 335 - if (cpu_has(c, X86_FEATURE_PGE)) { 336 - /* 337 - * We'd like to use cr4_set_bits_and_update_boot(), 338 - * but we can't. CR4.PCIDE is special and can only 339 - * be set in long mode, and the early CPU init code 340 - * doesn't know this and would try to restore CR4.PCIDE 341 - * prior to entering long mode. 342 - * 343 - * Instead, we rely on the fact that hotplug, resume, 344 - * etc all fully restore CR4 before they write anything 345 - * that could have nonzero PCID bits to CR3. CR4.PCIDE 346 - * has no effect on the page tables themselves, so we 347 - * don't need it to be restored early. 348 - */ 349 - cr4_set_bits(X86_CR4_PCIDE); 350 - } else { 351 - /* 352 - * flush_tlb_all(), as currently implemented, won't 353 - * work if PCID is on but PGE is not. Since that 354 - * combination doesn't exist on real hardware, there's 355 - * no reason to try to fully support it, but it's 356 - * polite to avoid corrupting data if we're on 357 - * an improperly configured VM. 
358 - */ 359 - clear_cpu_cap(c, X86_FEATURE_PCID); 360 - } 361 } 362 } 363 ··· 1142 /* Set up SMEP/SMAP */ 1143 setup_smep(c); 1144 setup_smap(c); 1145 - 1146 - /* Set up PCID */ 1147 - setup_pcid(c); 1148 1149 /* 1150 * The vendor-specific functions might have changed features.
··· 169 __setup("nompx", x86_mpx_setup); 170 171 #ifdef CONFIG_X86_64 172 + static int __init x86_nopcid_setup(char *s) 173 { 174 + /* nopcid doesn't accept parameters */ 175 + if (s) 176 + return -EINVAL; 177 178 /* do not emit a message if the feature is not present */ 179 if (!boot_cpu_has(X86_FEATURE_PCID)) 180 + return 0; 181 182 setup_clear_cpu_cap(X86_FEATURE_PCID); 183 pr_info("nopcid: PCID feature disabled\n"); 184 + return 0; 185 } 186 + early_param("nopcid", x86_nopcid_setup); 187 #endif 188 189 static int __init x86_noinvpcid_setup(char *s) ··· 326 #else 327 cr4_clear_bits(X86_CR4_SMAP); 328 #endif 329 } 330 } 331 ··· 1174 /* Set up SMEP/SMAP */ 1175 setup_smep(c); 1176 setup_smap(c); 1177 1178 /* 1179 * The vendor-specific functions might have changed features.
+4 -1
arch/x86/kernel/setup.c
··· 1178 * with the current CR4 value. This may not be necessary, but 1179 * auditing all the early-boot CR4 manipulation would be needed to 1180 * rule it out. 1181 */ 1182 - mmu_cr4_features = __read_cr4(); 1183 1184 memblock_set_current_limit(get_max_mapped()); 1185
··· 1178 * with the current CR4 value. This may not be necessary, but 1179 * auditing all the early-boot CR4 manipulation would be needed to 1180 * rule it out. 1181 + * 1182 + * Mask off features that don't work outside long mode (just 1183 + * PCIDE for now). 1184 */ 1185 + mmu_cr4_features = __read_cr4() & ~X86_CR4_PCIDE; 1186 1187 memblock_set_current_limit(get_max_mapped()); 1188
+5 -3
arch/x86/kernel/smpboot.c
··· 226 static void notrace start_secondary(void *unused) 227 { 228 /* 229 - * Don't put *anything* before cpu_init(), SMP booting is too 230 - * fragile that we want to limit the things done here to the 231 - * most necessary things. 232 */ 233 cpu_init(); 234 x86_cpuinit.early_percpu_clock_init(); 235 preempt_disable();
··· 226 static void notrace start_secondary(void *unused) 227 { 228 /* 229 + * Don't put *anything* except direct CPU state initialization 230 + * before cpu_init(), SMP booting is too fragile that we want to 231 + * limit the things done here to the most necessary things. 232 */ 233 + if (boot_cpu_has(X86_FEATURE_PCID)) 234 + __write_cr4(__read_cr4() | X86_CR4_PCIDE); 235 cpu_init(); 236 x86_cpuinit.early_percpu_clock_init(); 237 preempt_disable();
+34
arch/x86/mm/init.c
··· 19 #include <asm/microcode.h> 20 #include <asm/kaslr.h> 21 #include <asm/hypervisor.h> 22 23 /* 24 * We need to define the tracepoints somewhere, and tlb.c ··· 192 } else { 193 direct_gbpages = 0; 194 } 195 } 196 197 #ifdef CONFIG_X86_32 ··· 625 unsigned long end; 626 627 probe_page_size_mask(); 628 629 #ifdef CONFIG_X86_64 630 end = max_pfn << PAGE_SHIFT;
··· 19 #include <asm/microcode.h> 20 #include <asm/kaslr.h> 21 #include <asm/hypervisor.h> 22 + #include <asm/cpufeature.h> 23 24 /* 25 * We need to define the tracepoints somewhere, and tlb.c ··· 191 } else { 192 direct_gbpages = 0; 193 } 194 + } 195 + 196 + static void setup_pcid(void) 197 + { 198 + #ifdef CONFIG_X86_64 199 + if (boot_cpu_has(X86_FEATURE_PCID)) { 200 + if (boot_cpu_has(X86_FEATURE_PGE)) { 201 + /* 202 + * This can't be cr4_set_bits_and_update_boot() -- 203 + * the trampoline code can't handle CR4.PCIDE and 204 + * it wouldn't do any good anyway. Despite the name, 205 + * cr4_set_bits_and_update_boot() doesn't actually 206 + * cause the bits in question to remain set all the 207 + * way through the secondary boot asm. 208 + * 209 + * Instead, we brute-force it and set CR4.PCIDE 210 + * manually in start_secondary(). 211 + */ 212 + cr4_set_bits(X86_CR4_PCIDE); 213 + } else { 214 + /* 215 + * flush_tlb_all(), as currently implemented, won't 216 + * work if PCID is on but PGE is not. Since that 217 + * combination doesn't exist on real hardware, there's 218 + * no reason to try to fully support it, but it's 219 + * polite to avoid corrupting data if we're on 220 + * an improperly configured VM. 221 + */ 222 + setup_clear_cpu_cap(X86_FEATURE_PCID); 223 + } 224 + } 225 + #endif 226 } 227 228 #ifdef CONFIG_X86_32 ··· 592 unsigned long end; 593 594 probe_page_size_mask(); 595 + setup_pcid(); 596 597 #ifdef CONFIG_X86_64 598 end = max_pfn << PAGE_SHIFT;