Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'kvm-nvmx-and-vm-teardown' into HEAD

The immediate issue being fixed here is an nVMX bug where KVM fails to
detect that, after nested VM-Exit, L1 has a pending IRQ (or NMI).
However, checking for a pending interrupt accesses the legacy PIC, and
x86's kvm_arch_destroy_vm() currently frees the PIC before destroying
vCPUs, i.e. checking for IRQs during the forced nested VM-Exit results
in a NULL pointer deref; that's a prerequisite for the nVMX fix.

The remaining patches attempt to bring a bit of sanity to x86's VM
teardown code, which has accumulated a lot of cruft over the years. E.g.
KVM currently unloads each vCPU's MMUs in a separate operation from
destroying vCPUs, all because when guest SMP support was added, KVM had a
kludgy MMU teardown flow that broke when a VM had more than one vCPU.
And that oddity lived on, for 18 years...

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>

+22 -36
-2
arch/arm64/include/asm/kvm_host.h
··· 1375 1375 return cpus_have_final_cap(ARM64_SPECTRE_V3A); 1376 1376 } 1377 1377 1378 - static inline void kvm_arch_sync_events(struct kvm *kvm) {} 1379 - 1380 1378 void kvm_init_host_debug_data(void); 1381 1379 void kvm_vcpu_load_debug(struct kvm_vcpu *vcpu); 1382 1380 void kvm_vcpu_put_debug(struct kvm_vcpu *vcpu);
-1
arch/loongarch/include/asm/kvm_host.h
··· 326 326 327 327 /* Misc */ 328 328 static inline void kvm_arch_hardware_unsetup(void) {} 329 - static inline void kvm_arch_sync_events(struct kvm *kvm) {} 330 329 static inline void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) {} 331 330 static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} 332 331 static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {}
-1
arch/mips/include/asm/kvm_host.h
··· 886 886 extern int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, 887 887 struct kvm_mips_interrupt *irq); 888 888 889 - static inline void kvm_arch_sync_events(struct kvm *kvm) {} 890 889 static inline void kvm_arch_free_memslot(struct kvm *kvm, 891 890 struct kvm_memory_slot *slot) {} 892 891 static inline void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) {}
-1
arch/powerpc/include/asm/kvm_host.h
··· 902 902 #define __KVM_HAVE_ARCH_WQP 903 903 #define __KVM_HAVE_CREATE_DEVICE 904 904 905 - static inline void kvm_arch_sync_events(struct kvm *kvm) {} 906 905 static inline void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) {} 907 906 static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {} 908 907 static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {}
-2
arch/riscv/include/asm/kvm_host.h
··· 301 301 return IS_ENABLED(CONFIG_GUEST_PERF_EVENTS) && !!vcpu; 302 302 } 303 303 304 - static inline void kvm_arch_sync_events(struct kvm *kvm) {} 305 - 306 304 #define KVM_RISCV_GSTAGE_TLB_MIN_ORDER 12 307 305 308 306 void kvm_riscv_local_hfence_gvma_vmid_gpa(unsigned long vmid,
-1
arch/s390/include/asm/kvm_host.h
··· 1056 1056 extern int kvm_s390_gisc_register(struct kvm *kvm, u32 gisc); 1057 1057 extern int kvm_s390_gisc_unregister(struct kvm *kvm, u32 gisc); 1058 1058 1059 - static inline void kvm_arch_sync_events(struct kvm *kvm) {} 1060 1059 static inline void kvm_arch_free_memslot(struct kvm *kvm, 1061 1060 struct kvm_memory_slot *slot) {} 1062 1061 static inline void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) {}
+14 -26
arch/x86/kvm/x86.c
··· 12369 12369 { 12370 12370 int idx; 12371 12371 12372 + kvm_clear_async_pf_completion_queue(vcpu); 12373 + kvm_mmu_unload(vcpu); 12374 + 12372 12375 kvmclock_reset(vcpu); 12373 12376 12374 12377 kvm_x86_call(vcpu_free)(vcpu); ··· 12765 12762 return ret; 12766 12763 } 12767 12764 12768 - static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) 12769 - { 12770 - vcpu_load(vcpu); 12771 - kvm_mmu_unload(vcpu); 12772 - vcpu_put(vcpu); 12773 - } 12774 - 12775 - static void kvm_unload_vcpu_mmus(struct kvm *kvm) 12776 - { 12777 - unsigned long i; 12778 - struct kvm_vcpu *vcpu; 12779 - 12780 - kvm_for_each_vcpu(i, vcpu, kvm) { 12781 - kvm_clear_async_pf_completion_queue(vcpu); 12782 - kvm_unload_vcpu_mmu(vcpu); 12783 - } 12784 - } 12785 - 12786 - void kvm_arch_sync_events(struct kvm *kvm) 12787 - { 12788 - cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work); 12789 - cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work); 12790 - kvm_free_pit(kvm); 12791 - } 12792 - 12793 12765 /** 12794 12766 * __x86_set_memory_region: Setup KVM internal memory slot 12795 12767 * ··· 12843 12865 12844 12866 void kvm_arch_pre_destroy_vm(struct kvm *kvm) 12845 12867 { 12868 + /* 12869 + * Stop all background workers and kthreads before destroying vCPUs, as 12870 + * iterating over vCPUs in a different task while vCPUs are being freed 12871 + * is unsafe, i.e. will lead to use-after-free. The PIT also needs to 12872 + * be stopped before IRQ routing is freed. 
12873 + */ 12874 + cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work); 12875 + cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work); 12876 + 12877 + kvm_free_pit(kvm); 12878 + 12846 12879 kvm_mmu_pre_destroy_vm(kvm); 12847 12880 } 12848 12881 ··· 12873 12884 __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0); 12874 12885 mutex_unlock(&kvm->slots_lock); 12875 12886 } 12876 - kvm_unload_vcpu_mmus(kvm); 12877 12887 kvm_destroy_vcpus(kvm); 12878 12888 kvm_x86_call(vm_destroy)(kvm); 12879 12889 kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1));
-1
include/linux/kvm_host.h
··· 1747 1747 1748 1748 int kvm_arch_init_vm(struct kvm *kvm, unsigned long type); 1749 1749 void kvm_arch_destroy_vm(struct kvm *kvm); 1750 - void kvm_arch_sync_events(struct kvm *kvm); 1751 1750 1752 1751 int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu); 1753 1752
+8 -1
virt/kvm/kvm_main.c
··· 489 489 kvm_for_each_vcpu(i, vcpu, kvm) { 490 490 kvm_vcpu_destroy(vcpu); 491 491 xa_erase(&kvm->vcpu_array, i); 492 + 493 + /* 494 + * Assert that the vCPU isn't visible in any way, to ensure KVM 495 + * doesn't trigger a use-after-free if destroying vCPUs results 496 + * in VM-wide request, e.g. to flush remote TLBs when tearing 497 + * down MMUs, or to mark the VM dead if a KVM_BUG_ON() fires. 498 + */ 499 + WARN_ON_ONCE(xa_load(&kvm->vcpu_array, i) || kvm_get_vcpu(kvm, i)); 492 500 } 493 501 494 502 atomic_set(&kvm->online_vcpus, 0); ··· 1271 1263 kvm_destroy_pm_notifier(kvm); 1272 1264 kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm); 1273 1265 kvm_destroy_vm_debugfs(kvm); 1274 - kvm_arch_sync_events(kvm); 1275 1266 mutex_lock(&kvm_lock); 1276 1267 list_del(&kvm->vm_list); 1277 1268 mutex_unlock(&kvm_lock);