x86/pti: Fix !PCID and sanitize defines

The switch to the user space page tables in the low level ASM code
unconditionally sets bit 12 and bit 11 of CR3. Bit 12 switches the base
address of the page directory to the user part; bit 11 switches the
PCID to the PCID associated with the user page tables.

This fails on a machine which lacks PCID support because bit 11 is set in
CR3. Bit 11 is reserved when PCID is inactive.

While the Intel SDM claims that the reserved bits are ignored when PCID is
disabled, the AMD APM states that they should be cleared.

This went unnoticed as the AMD APM was not checked when the code was
developed and reviewed, and test systems with Intel CPUs never failed to
boot. The report is against a CentOS 6 host where the guest fails to boot,
so it's not yet clear whether this is a virt issue or can happen on real
hardware too, but that's irrelevant as the AMD APM clearly asks for clearing
the reserved bits.

Make sure that on non-PCID machines bit 11 is not set by the page table
switching code.

Andy suggested renaming the related bits and masks so that they clearly
describe what they should be used for, which is done as well for clarity.

That split could have been done with alternatives, but the macro hell is
horrible and ugly. This can be done on top if someone cares to remove the
extra orq. For now it's a straightforward fix.

Fixes: 6fd166aae78c ("x86/mm: Use/Fix PCID to optimize user/kernel switches")
Reported-by: Laura Abbott <labbott@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: stable <stable@vger.kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Willy Tarreau <w@1wt.eu>
Cc: David Woodhouse <dwmw@amazon.co.uk>
Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801140009150.2371@nanos

+23 -21
+19 -17
arch/x86/entry/calling.h
··· 198 * PAGE_TABLE_ISOLATION PGDs are 8k. Flip bit 12 to switch between the two 199 * halves: 200 */ 201 - #define PTI_SWITCH_PGTABLES_MASK (1<<PAGE_SHIFT) 202 - #define PTI_SWITCH_MASK (PTI_SWITCH_PGTABLES_MASK|(1<<X86_CR3_PTI_SWITCH_BIT)) 203 204 .macro SET_NOFLUSH_BIT reg:req 205 bts $X86_CR3_PCID_NOFLUSH_BIT, \reg ··· 211 .macro ADJUST_KERNEL_CR3 reg:req 212 ALTERNATIVE "", "SET_NOFLUSH_BIT \reg", X86_FEATURE_PCID 213 /* Clear PCID and "PAGE_TABLE_ISOLATION bit", point CR3 at kernel pagetables: */ 214 - andq $(~PTI_SWITCH_MASK), \reg 215 .endm 216 217 .macro SWITCH_TO_KERNEL_CR3 scratch_reg:req ··· 242 /* Flush needed, clear the bit */ 243 btr \scratch_reg, THIS_CPU_user_pcid_flush_mask 244 movq \scratch_reg2, \scratch_reg 245 - jmp .Lwrcr3_\@ 246 247 .Lnoflush_\@: 248 movq \scratch_reg2, \scratch_reg 249 SET_NOFLUSH_BIT \scratch_reg 250 251 .Lwrcr3_\@: 252 - /* Flip the PGD and ASID to the user version */ 253 - orq $(PTI_SWITCH_MASK), \scratch_reg 254 mov \scratch_reg, %cr3 255 .Lend_\@: 256 .endm ··· 270 movq %cr3, \scratch_reg 271 movq \scratch_reg, \save_reg 272 /* 273 - * Is the "switch mask" all zero? That means that both of 274 - * these are zero: 275 - * 276 - * 1. The user/kernel PCID bit, and 277 - * 2. The user/kernel "bit" that points CR3 to the 278 - * bottom half of the 8k PGD 279 - * 280 - * That indicates a kernel CR3 value, not a user CR3. 281 */ 282 - testq $(PTI_SWITCH_MASK), \scratch_reg 283 - jz .Ldone_\@ 284 285 ADJUST_KERNEL_CR3 \scratch_reg 286 movq \scratch_reg, %cr3 ··· 292 * KERNEL pages can always resume with NOFLUSH as we do 293 * explicit flushes. 294 */ 295 - bt $X86_CR3_PTI_SWITCH_BIT, \save_reg 296 jnc .Lnoflush_\@ 297 298 /*
··· 198 * PAGE_TABLE_ISOLATION PGDs are 8k. Flip bit 12 to switch between the two 199 * halves: 200 */ 201 + #define PTI_USER_PGTABLE_BIT PAGE_SHIFT 202 + #define PTI_USER_PGTABLE_MASK (1 << PTI_USER_PGTABLE_BIT) 203 + #define PTI_USER_PCID_BIT X86_CR3_PTI_PCID_USER_BIT 204 + #define PTI_USER_PCID_MASK (1 << PTI_USER_PCID_BIT) 205 + #define PTI_USER_PGTABLE_AND_PCID_MASK (PTI_USER_PCID_MASK | PTI_USER_PGTABLE_MASK) 206 207 .macro SET_NOFLUSH_BIT reg:req 208 bts $X86_CR3_PCID_NOFLUSH_BIT, \reg ··· 208 .macro ADJUST_KERNEL_CR3 reg:req 209 ALTERNATIVE "", "SET_NOFLUSH_BIT \reg", X86_FEATURE_PCID 210 /* Clear PCID and "PAGE_TABLE_ISOLATION bit", point CR3 at kernel pagetables: */ 211 + andq $(~PTI_USER_PGTABLE_AND_PCID_MASK), \reg 212 .endm 213 214 .macro SWITCH_TO_KERNEL_CR3 scratch_reg:req ··· 239 /* Flush needed, clear the bit */ 240 btr \scratch_reg, THIS_CPU_user_pcid_flush_mask 241 movq \scratch_reg2, \scratch_reg 242 + jmp .Lwrcr3_pcid_\@ 243 244 .Lnoflush_\@: 245 movq \scratch_reg2, \scratch_reg 246 SET_NOFLUSH_BIT \scratch_reg 247 248 + .Lwrcr3_pcid_\@: 249 + /* Flip the ASID to the user version */ 250 + orq $(PTI_USER_PCID_MASK), \scratch_reg 251 + 252 .Lwrcr3_\@: 253 + /* Flip the PGD to the user version */ 254 + orq $(PTI_USER_PGTABLE_MASK), \scratch_reg 255 mov \scratch_reg, %cr3 256 .Lend_\@: 257 .endm ··· 263 movq %cr3, \scratch_reg 264 movq \scratch_reg, \save_reg 265 /* 266 + * Test the user pagetable bit. If set, then the user page tables 267 + * are active. If clear CR3 already has the kernel page table 268 + * active. 269 */ 270 + bt $PTI_USER_PGTABLE_BIT, \scratch_reg 271 + jnc .Ldone_\@ 272 273 ADJUST_KERNEL_CR3 \scratch_reg 274 movq \scratch_reg, %cr3 ··· 290 * KERNEL pages can always resume with NOFLUSH as we do 291 * explicit flushes. 292 */ 293 + bt $PTI_USER_PGTABLE_BIT, \save_reg 294 jnc .Lnoflush_\@ 295 296 /*
+1 -1
arch/x86/include/asm/processor-flags.h
··· 40 #define CR3_NOFLUSH BIT_ULL(63) 41 42 #ifdef CONFIG_PAGE_TABLE_ISOLATION 43 - # define X86_CR3_PTI_SWITCH_BIT 11 44 #endif 45 46 #else
··· 40 #define CR3_NOFLUSH BIT_ULL(63) 41 42 #ifdef CONFIG_PAGE_TABLE_ISOLATION 43 + # define X86_CR3_PTI_PCID_USER_BIT 11 44 #endif 45 46 #else
+3 -3
arch/x86/include/asm/tlbflush.h
··· 81 * Make sure that the dynamic ASID space does not confict with the 82 * bit we are using to switch between user and kernel ASIDs. 83 */ 84 - BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_SWITCH_BIT)); 85 86 /* 87 * The ASID being passed in here should have respected the 88 * MAX_ASID_AVAILABLE and thus never have the switch bit set. 89 */ 90 - VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_SWITCH_BIT)); 91 #endif 92 /* 93 * The dynamically-assigned ASIDs that get passed in are small ··· 112 { 113 u16 ret = kern_pcid(asid); 114 #ifdef CONFIG_PAGE_TABLE_ISOLATION 115 - ret |= 1 << X86_CR3_PTI_SWITCH_BIT; 116 #endif 117 return ret; 118 }
··· 81 * Make sure that the dynamic ASID space does not confict with the 82 * bit we are using to switch between user and kernel ASIDs. 83 */ 84 + BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_PCID_USER_BIT)); 85 86 /* 87 * The ASID being passed in here should have respected the 88 * MAX_ASID_AVAILABLE and thus never have the switch bit set. 89 */ 90 + VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_PCID_USER_BIT)); 91 #endif 92 /* 93 * The dynamically-assigned ASIDs that get passed in are small ··· 112 { 113 u16 ret = kern_pcid(asid); 114 #ifdef CONFIG_PAGE_TABLE_ISOLATION 115 + ret |= 1 << X86_CR3_PTI_PCID_USER_BIT; 116 #endif 117 return ret; 118 }