Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

x86/mm: Split read_cr3() into read_cr3_pa() and __read_cr3()

The kernel has several code paths that read CR3. Most of them assume that
CR3 contains the PGD's physical address, whereas some of them awkwardly
use PHYSICAL_PAGE_MASK to mask off low bits.

Add explicit mask macros for CR3 and convert all of the CR3 readers.
This will keep them from breaking when PCID is enabled.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: xen-devel <xen-devel@lists.xen.org>
Link: http://lkml.kernel.org/r/883f8fb121f4616c1c1427ad87350bb2f5ffeca1.1497288170.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>

Authored by Andy Lutomirski and committed by Ingo Molnar.
6c690ee1 3f365cf3

+79 -29
+1 -1
arch/x86/boot/compressed/pagetable.c
··· 92 92 * and we must append to the existing area instead of entirely 93 93 * overwriting it. 94 94 */ 95 - level4p = read_cr3(); 95 + level4p = read_cr3_pa(); 96 96 if (level4p == (unsigned long)_pgtable) { 97 97 debug_putstr("booted via startup_32()\n"); 98 98 pgt_data.pgt_buf = _pgtable + BOOT_INIT_PGT_SIZE;
+1 -1
arch/x86/include/asm/efi.h
··· 74 74 __kernel_fpu_begin(); \ 75 75 \ 76 76 if (efi_scratch.use_pgd) { \ 77 - efi_scratch.prev_cr3 = read_cr3(); \ 77 + efi_scratch.prev_cr3 = __read_cr3(); \ 78 78 write_cr3((unsigned long)efi_scratch.efi_pgt); \ 79 79 __flush_tlb_all(); \ 80 80 } \
+2 -2
arch/x86/include/asm/mmu_context.h
··· 269 269 270 270 /* 271 271 * This can be used from process context to figure out what the value of 272 - * CR3 is without needing to do a (slow) read_cr3(). 272 + * CR3 is without needing to do a (slow) __read_cr3(). 273 273 * 274 274 * It's intended to be used for code like KVM that sneakily changes CR3 275 275 * and needs to restore it. It needs to be used very carefully. ··· 281 281 /* For now, be very restrictive about when this can be called. */ 282 282 VM_WARN_ON(in_nmi() || !in_atomic()); 283 283 284 - VM_BUG_ON(cr3 != read_cr3()); 284 + VM_BUG_ON(cr3 != __read_cr3()); 285 285 return cr3; 286 286 } 287 287
+1 -1
arch/x86/include/asm/paravirt.h
··· 61 61 PVOP_VCALL1(pv_mmu_ops.write_cr2, x); 62 62 } 63 63 64 - static inline unsigned long read_cr3(void) 64 + static inline unsigned long __read_cr3(void) 65 65 { 66 66 return PVOP_CALL0(unsigned long, pv_mmu_ops.read_cr3); 67 67 }
+36
arch/x86/include/asm/processor-flags.h
··· 8 8 #else 9 9 #define X86_VM_MASK 0 /* No VM86 support */ 10 10 #endif 11 + 12 + /* 13 + * CR3's layout varies depending on several things. 14 + * 15 + * If CR4.PCIDE is set (64-bit only), then CR3[11:0] is the address space ID. 16 + * If PAE is enabled, then CR3[11:5] is part of the PDPT address 17 + * (i.e. it's 32-byte aligned, not page-aligned) and CR3[4:0] is ignored. 18 + * Otherwise (non-PAE, non-PCID), CR3[3] is PWT, CR3[4] is PCD, and 19 + * CR3[2:0] and CR3[11:5] are ignored. 20 + * 21 + * In all cases, Linux puts zeros in the low ignored bits and in PWT and PCD. 22 + * 23 + * CR3[63] is always read as zero. If CR4.PCIDE is set, then CR3[63] may be 24 + * written as 1 to prevent the write to CR3 from flushing the TLB. 25 + * 26 + * On systems with SME, one bit (in a variable position!) is stolen to indicate 27 + * that the top-level paging structure is encrypted. 28 + * 29 + * All of the remaining bits indicate the physical address of the top-level 30 + * paging structure. 31 + * 32 + * CR3_ADDR_MASK is the mask used by read_cr3_pa(). 33 + */ 34 + #ifdef CONFIG_X86_64 35 + /* Mask off the address space ID bits. */ 36 + #define CR3_ADDR_MASK 0x7FFFFFFFFFFFF000ull 37 + #define CR3_PCID_MASK 0xFFFull 38 + #else 39 + /* 40 + * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save 41 + * a tiny bit of code size by setting all the bits. 42 + */ 43 + #define CR3_ADDR_MASK 0xFFFFFFFFull 44 + #define CR3_PCID_MASK 0ull 45 + #endif 46 + 11 47 #endif /* _ASM_X86_PROCESSOR_FLAGS_H */
+8
arch/x86/include/asm/processor.h
··· 231 231 native_cpuid_reg(ecx) 232 232 native_cpuid_reg(edx) 233 233 234 + /* 235 + * Friendlier CR3 helpers. 236 + */ 237 + static inline unsigned long read_cr3_pa(void) 238 + { 239 + return __read_cr3() & CR3_ADDR_MASK; 240 + } 241 + 234 242 static inline void load_cr3(pgd_t *pgdir) 235 243 { 236 244 write_cr3(__pa(pgdir));
+7 -3
arch/x86/include/asm/special_insns.h
··· 39 39 asm volatile("mov %0,%%cr2": : "r" (val), "m" (__force_order)); 40 40 } 41 41 42 - static inline unsigned long native_read_cr3(void) 42 + static inline unsigned long __native_read_cr3(void) 43 43 { 44 44 unsigned long val; 45 45 asm volatile("mov %%cr3,%0\n\t" : "=r" (val), "=m" (__force_order)); ··· 159 159 native_write_cr2(x); 160 160 } 161 161 162 - static inline unsigned long read_cr3(void) 162 + /* 163 + * Careful! CR3 contains more than just an address. You probably want 164 + * read_cr3_pa() instead. 165 + */ 166 + static inline unsigned long __read_cr3(void) 163 167 { 164 - return native_read_cr3(); 168 + return __native_read_cr3(); 165 169 } 166 170 167 171 static inline void write_cr3(unsigned long x)
+2 -2
arch/x86/include/asm/tlbflush.h
··· 156 156 * back: 157 157 */ 158 158 preempt_disable(); 159 - native_write_cr3(native_read_cr3()); 159 + native_write_cr3(__native_read_cr3()); 160 160 preempt_enable(); 161 161 } 162 162 ··· 264 264 this_cpu_write(cpu_tlbstate.state, 0); 265 265 this_cpu_write(cpu_tlbstate.loaded_mm, &init_mm); 266 266 267 - WARN_ON(read_cr3() != __pa_symbol(swapper_pg_dir)); 267 + WARN_ON(read_cr3_pa() != __pa_symbol(swapper_pg_dir)); 268 268 } 269 269 270 270 static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch,
+2 -1
arch/x86/kernel/head64.c
··· 55 55 pmdval_t pmd, *pmd_p; 56 56 57 57 /* Invalid address or early pgt is done ? */ 58 - if (physaddr >= MAXMEM || read_cr3() != __pa_nodebug(early_level4_pgt)) 58 + if (physaddr >= MAXMEM || 59 + read_cr3_pa() != __pa_nodebug(early_level4_pgt)) 59 60 return -1; 60 61 61 62 again:
+1 -1
arch/x86/kernel/paravirt.c
··· 391 391 392 392 .read_cr2 = native_read_cr2, 393 393 .write_cr2 = native_write_cr2, 394 - .read_cr3 = native_read_cr3, 394 + .read_cr3 = __native_read_cr3, 395 395 .write_cr3 = native_write_cr3, 396 396 397 397 .flush_tlb_user = native_flush_tlb,
+1 -1
arch/x86/kernel/process_32.c
··· 92 92 93 93 cr0 = read_cr0(); 94 94 cr2 = read_cr2(); 95 - cr3 = read_cr3(); 95 + cr3 = __read_cr3(); 96 96 cr4 = __read_cr4(); 97 97 printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", 98 98 cr0, cr2, cr3, cr4);
+1 -1
arch/x86/kernel/process_64.c
··· 104 104 105 105 cr0 = read_cr0(); 106 106 cr2 = read_cr2(); 107 - cr3 = read_cr3(); 107 + cr3 = __read_cr3(); 108 108 cr4 = __read_cr4(); 109 109 110 110 printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
+1 -1
arch/x86/kvm/vmx.c
··· 5024 5024 * Save the most likely value for this task's CR3 in the VMCS. 5025 5025 * We can't use __get_current_cr3_fast() because we're not atomic. 5026 5026 */ 5027 - cr3 = read_cr3(); 5027 + cr3 = __read_cr3(); 5028 5028 vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */ 5029 5029 vmx->host_state.vmcs_host_cr3 = cr3; 5030 5030
+5 -5
arch/x86/mm/fault.c
··· 346 346 * Do _not_ use "current" here. We might be inside 347 347 * an interrupt in the middle of a task switch.. 348 348 */ 349 - pgd_paddr = read_cr3(); 349 + pgd_paddr = read_cr3_pa(); 350 350 pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); 351 351 if (!pmd_k) 352 352 return -1; ··· 388 388 389 389 static void dump_pagetable(unsigned long address) 390 390 { 391 - pgd_t *base = __va(read_cr3()); 391 + pgd_t *base = __va(read_cr3_pa()); 392 392 pgd_t *pgd = &base[pgd_index(address)]; 393 393 p4d_t *p4d; 394 394 pud_t *pud; ··· 451 451 * happen within a race in page table update. In the later 452 452 * case just flush: 453 453 */ 454 - pgd = (pgd_t *)__va(read_cr3()) + pgd_index(address); 454 + pgd = (pgd_t *)__va(read_cr3_pa()) + pgd_index(address); 455 455 pgd_ref = pgd_offset_k(address); 456 456 if (pgd_none(*pgd_ref)) 457 457 return -1; ··· 555 555 556 556 static void dump_pagetable(unsigned long address) 557 557 { 558 - pgd_t *base = __va(read_cr3() & PHYSICAL_PAGE_MASK); 558 + pgd_t *base = __va(read_cr3_pa()); 559 559 pgd_t *pgd = base + pgd_index(address); 560 560 p4d_t *p4d; 561 561 pud_t *pud; ··· 700 700 pgd_t *pgd; 701 701 pte_t *pte; 702 702 703 - pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK); 703 + pgd = __va(read_cr3_pa()); 704 704 pgd += pgd_index(address); 705 705 706 706 pte = lookup_address_in_pgd(pgd, address, &level);
+1 -1
arch/x86/mm/ioremap.c
··· 424 424 static inline pmd_t * __init early_ioremap_pmd(unsigned long addr) 425 425 { 426 426 /* Don't assume we're using swapper_pg_dir at this point */ 427 - pgd_t *base = __va(read_cr3()); 427 + pgd_t *base = __va(read_cr3_pa()); 428 428 pgd_t *pgd = &base[pgd_index(addr)]; 429 429 p4d_t *p4d = p4d_offset(pgd, addr); 430 430 pud_t *pud = pud_offset(p4d, addr);
+2 -2
arch/x86/platform/efi/efi_64.c
··· 80 80 int n_pgds, i, j; 81 81 82 82 if (!efi_enabled(EFI_OLD_MEMMAP)) { 83 - save_pgd = (pgd_t *)read_cr3(); 83 + save_pgd = (pgd_t *)__read_cr3(); 84 84 write_cr3((unsigned long)efi_scratch.efi_pgt); 85 85 goto out; 86 86 } ··· 646 646 efi_sync_low_kernel_mappings(); 647 647 local_irq_save(flags); 648 648 649 - efi_scratch.prev_cr3 = read_cr3(); 649 + efi_scratch.prev_cr3 = __read_cr3(); 650 650 write_cr3((unsigned long)efi_scratch.efi_pgt); 651 651 __flush_tlb_all(); 652 652
+1 -1
arch/x86/platform/olpc/olpc-xo1-pm.c
··· 77 77 78 78 asmlinkage __visible int xo1_do_sleep(u8 sleep_state) 79 79 { 80 - void *pgd_addr = __va(read_cr3()); 80 + void *pgd_addr = __va(read_cr3_pa()); 81 81 82 82 /* Program wakeup mask (using dword access to CS5536_PM1_EN) */ 83 83 outl(wakeup_mask << 16, acpi_base + CS5536_PM1_STS);
+1 -1
arch/x86/power/cpu.c
··· 129 129 */ 130 130 ctxt->cr0 = read_cr0(); 131 131 ctxt->cr2 = read_cr2(); 132 - ctxt->cr3 = read_cr3(); 132 + ctxt->cr3 = __read_cr3(); 133 133 ctxt->cr4 = __read_cr4(); 134 134 #ifdef CONFIG_X86_64 135 135 ctxt->cr8 = read_cr8();
+2 -1
arch/x86/power/hibernate_64.c
··· 150 150 memcpy((void *)relocated_restore_code, &core_restore_code, PAGE_SIZE); 151 151 152 152 /* Make the page containing the relocated code executable */ 153 - pgd = (pgd_t *)__va(read_cr3()) + pgd_index(relocated_restore_code); 153 + pgd = (pgd_t *)__va(read_cr3_pa()) + 154 + pgd_index(relocated_restore_code); 154 155 p4d = p4d_offset(pgd, relocated_restore_code); 155 156 if (p4d_large(*p4d)) { 156 157 set_p4d(p4d, __p4d(p4d_val(*p4d) & ~_PAGE_NX));
+3 -3
arch/x86/xen/mmu_pv.c
··· 2017 2017 pmd_t pmd; 2018 2018 pte_t pte; 2019 2019 2020 - pa = read_cr3(); 2020 + pa = read_cr3_pa(); 2021 2021 pgd = native_make_pgd(xen_read_phys_ulong(pa + pgd_index(vaddr) * 2022 2022 sizeof(pgd))); 2023 2023 if (!pgd_present(pgd)) ··· 2097 2097 pt_phys = pmd_phys + PFN_PHYS(n_pmd); 2098 2098 p2m_pfn = PFN_DOWN(pt_phys) + n_pt; 2099 2099 2100 - pgd = __va(read_cr3()); 2100 + pgd = __va(read_cr3_pa()); 2101 2101 new_p2m = (unsigned long *)(2 * PGDIR_SIZE); 2102 2102 idx_p4d = 0; 2103 2103 save_pud = n_pud; ··· 2204 2204 { 2205 2205 unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir)); 2206 2206 2207 - BUG_ON(read_cr3() != __pa(initial_page_table)); 2207 + BUG_ON(read_cr3_pa() != __pa(initial_page_table)); 2208 2208 BUG_ON(cr3 != __pa(swapper_pg_dir)); 2209 2209 2210 2210 /*