Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

KVM: MMU: fix SMAP virtualization

KVM may turn a user page into a kernel page when the kernel writes to a
readonly user page if CR0.WP = 1. This shadow page entry will be reused
after SMAP is enabled, so the kernel is allowed to access this user page.

Fix it by setting SMAP && !CR0.WP in the shadow page's role and resetting
the MMU once CR4.SMAP is updated.

Signed-off-by: Xiao Guangrong <guangrong.xiao@linux.intel.com>
Cc: stable@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>

authored by

Xiao Guangrong and committed by
Paolo Bonzini
0be0226f 89876115

+30 -15
+14 -4
Documentation/virtual/kvm/mmu.txt
··· 169 169 Contains the value of cr4.smep && !cr0.wp for which the page is valid 170 170 (pages for which this is true are different from other pages; see the 171 171 treatment of cr0.wp=0 below). 172 + role.smap_andnot_wp: 173 + Contains the value of cr4.smap && !cr0.wp for which the page is valid 174 + (pages for which this is true are different from other pages; see the 175 + treatment of cr0.wp=0 below). 172 176 gfn: 173 177 Either the guest page table containing the translations shadowed by this 174 178 page, or the base page frame for linear translations. See role.direct. ··· 348 344 349 345 (user write faults generate a #PF) 350 346 351 - In the first case there is an additional complication if CR4.SMEP is 352 - enabled: since we've turned the page into a kernel page, the kernel may now 353 - execute it. We handle this by also setting spte.nx. If we get a user 354 - fetch or read fault, we'll change spte.u=1 and spte.nx=gpte.nx back. 347 + In the first case there are two additional complications: 348 + - if CR4.SMEP is enabled: since we've turned the page into a kernel page, 349 + the kernel may now execute it. We handle this by also setting spte.nx. 350 + If we get a user fetch or read fault, we'll change spte.u=1 and 351 + spte.nx=gpte.nx back. 352 + - if CR4.SMAP is disabled: since the page has been changed to a kernel 353 + page, it can not be reused when CR4.SMAP is enabled. We set 354 + CR4.SMAP && !CR0.WP into shadow page's role to avoid this case. Note, 355 + here we do not care the case that CR4.SMAP is enabled since KVM will 356 + directly inject #PF to guest due to failed permission check. 355 357 356 358 To prevent an spte that was converted into a kernel page with cr0.wp=0 357 359 from being written by the kernel after cr0.wp has changed to 1, we make
+1
arch/x86/include/asm/kvm_host.h
··· 207 207 unsigned nxe:1; 208 208 unsigned cr0_wp:1; 209 209 unsigned smep_andnot_wp:1; 210 + unsigned smap_andnot_wp:1; 210 211 }; 211 212 }; 212 213
+12 -4
arch/x86/kvm/mmu.c
··· 3736 3736 } 3737 3737 } 3738 3738 3739 - void update_permission_bitmask(struct kvm_vcpu *vcpu, 3740 - struct kvm_mmu *mmu, bool ept) 3739 + static void update_permission_bitmask(struct kvm_vcpu *vcpu, 3740 + struct kvm_mmu *mmu, bool ept) 3741 3741 { 3742 3742 unsigned bit, byte, pfec; 3743 3743 u8 map; ··· 3918 3918 void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu) 3919 3919 { 3920 3920 bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP); 3921 + bool smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP); 3921 3922 struct kvm_mmu *context = &vcpu->arch.mmu; 3922 3923 3923 3924 MMU_WARN_ON(VALID_PAGE(context->root_hpa)); ··· 3937 3936 context->base_role.cr0_wp = is_write_protection(vcpu); 3938 3937 context->base_role.smep_andnot_wp 3939 3938 = smep && !is_write_protection(vcpu); 3939 + context->base_role.smap_andnot_wp 3940 + = smap && !is_write_protection(vcpu); 3940 3941 } 3941 3942 EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); 3942 3943 ··· 4210 4207 const u8 *new, int bytes) 4211 4208 { 4212 4209 gfn_t gfn = gpa >> PAGE_SHIFT; 4213 - union kvm_mmu_page_role mask = { .word = 0 }; 4214 4210 struct kvm_mmu_page *sp; 4215 4211 LIST_HEAD(invalid_list); 4216 4212 u64 entry, gentry, *spte; 4217 4213 int npte; 4218 4214 bool remote_flush, local_flush, zap_page; 4215 + union kvm_mmu_page_role mask = (union kvm_mmu_page_role) { 4216 + .cr0_wp = 1, 4217 + .cr4_pae = 1, 4218 + .nxe = 1, 4219 + .smep_andnot_wp = 1, 4220 + .smap_andnot_wp = 1, 4221 + }; 4219 4222 4220 4223 /* 4221 4224 * If we don't have indirect shadow pages, it means no page is ··· 4247 4238 ++vcpu->kvm->stat.mmu_pte_write; 4248 4239 kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE); 4249 4240 4250 - mask.cr0_wp = mask.cr4_pae = mask.nxe = mask.smep_andnot_wp = 1; 4251 4241 for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) { 4252 4242 if (detect_write_misaligned(sp, gpa, bytes) || 4253 4243 detect_write_flooding(sp)) {
-2
arch/x86/kvm/mmu.h
··· 71 71 int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct); 72 72 void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu); 73 73 void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly); 74 - void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, 75 - bool ept); 76 74 77 75 static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) 78 76 {
+3 -5
arch/x86/kvm/x86.c
··· 702 702 int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 703 703 { 704 704 unsigned long old_cr4 = kvm_read_cr4(vcpu); 705 - unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | 706 - X86_CR4_PAE | X86_CR4_SMEP; 705 + unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE | 706 + X86_CR4_SMEP | X86_CR4_SMAP; 707 + 707 708 if (cr4 & CR4_RESERVED_BITS) 708 709 return 1; 709 710 ··· 744 743 if (((cr4 ^ old_cr4) & pdptr_bits) || 745 744 (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE))) 746 745 kvm_mmu_reset_context(vcpu); 747 - 748 - if ((cr4 ^ old_cr4) & X86_CR4_SMAP) 749 - update_permission_bitmask(vcpu, vcpu->arch.walk_mmu, false); 750 746 751 747 if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE) 752 748 kvm_update_cpuid(vcpu);