Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull kvm fixes from Paolo Bonzini:
"PPC:
- Hide KVM_CAP_IRQFD_RESAMPLE if XIVE is enabled

s390:
- Fix handling of external interrupts in protected guests

x86:
- Resample the pending state of IOAPIC interrupts when unmasking them

- Fix usage of Hyper-V "enlightened TLB" on AMD

- Small fixes to real mode exceptions

- Suppress pending MMIO write exits if emulator detects exception

Documentation:
- Fix rST syntax"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm:
docs: kvm: x86: Fix broken field list
KVM: PPC: Make KVM_CAP_IRQFD_RESAMPLE platform dependent
KVM: s390: pv: fix external interruption loop not always detected
KVM: nVMX: Do not report error code when synthesizing VM-Exit from Real Mode
KVM: x86: Clear "has_error_code", not "error_code", for RM exception injection
KVM: x86: Suppress pending MMIO write exits if emulator detects exception
KVM: x86/ioapic: Resample the pending state of an IRQ when unmasking
KVM: irqfd: Make resampler_list an RCU list
KVM: SVM: Flush Hyper-V TLB when required

+192 -29
+2 -2
Documentation/virt/kvm/api.rst
··· 8296 8296 8.35 KVM_CAP_PMU_CAPABILITY 8297 8297 --------------------------- 8298 8298 8299 - :Capability KVM_CAP_PMU_CAPABILITY 8299 + :Capability: KVM_CAP_PMU_CAPABILITY 8300 8300 :Architectures: x86 8301 8301 :Type: vm 8302 8302 :Parameters: arg[0] is bitmask of PMU virtualization capabilities. 8303 - :Returns 0 on success, -EINVAL when arg[0] contains invalid bits 8303 + :Returns: 0 on success, -EINVAL when arg[0] contains invalid bits 8304 8304 8305 8305 This capability alters PMU virtualization in KVM. 8306 8306
+1
arch/arm64/kvm/arm.c
··· 220 220 case KVM_CAP_VCPU_ATTRIBUTES: 221 221 case KVM_CAP_PTP_KVM: 222 222 case KVM_CAP_ARM_SYSTEM_SUSPEND: 223 + case KVM_CAP_IRQFD_RESAMPLE: 223 224 r = 1; 224 225 break; 225 226 case KVM_CAP_SET_GUEST_DEBUG2:
+6
arch/powerpc/kvm/powerpc.c
··· 576 576 break; 577 577 #endif 578 578 579 + #ifdef CONFIG_HAVE_KVM_IRQFD 580 + case KVM_CAP_IRQFD_RESAMPLE: 581 + r = !xive_enabled(); 582 + break; 583 + #endif 584 + 579 585 case KVM_CAP_PPC_ALLOC_HTAB: 580 586 r = hv_enabled; 581 587 break;
+24 -8
arch/s390/kvm/intercept.c
··· 271 271 * handle_external_interrupt - used for external interruption interceptions 272 272 * @vcpu: virtual cpu 273 273 * 274 - * This interception only occurs if the CPUSTAT_EXT_INT bit was set, or if 275 - * the new PSW does not have external interrupts disabled. In the first case, 276 - * we've got to deliver the interrupt manually, and in the second case, we 277 - * drop to userspace to handle the situation there. 274 + * This interception occurs if: 275 + * - the CPUSTAT_EXT_INT bit was already set when the external interrupt 276 + * occurred. In this case, the interrupt needs to be injected manually to 277 + * preserve interrupt priority. 278 + * - the external new PSW has external interrupts enabled, which will cause an 279 + * interruption loop. We drop to userspace in this case. 280 + * 281 + * The latter case can be detected by inspecting the external mask bit in the 282 + * external new psw. 283 + * 284 + * Under PV, only the latter case can occur, since interrupt priorities are 285 + * handled in the ultravisor. 278 286 */ 279 287 static int handle_external_interrupt(struct kvm_vcpu *vcpu) 280 288 { ··· 293 285 294 286 vcpu->stat.exit_external_interrupt++; 295 287 296 - rc = read_guest_lc(vcpu, __LC_EXT_NEW_PSW, &newpsw, sizeof(psw_t)); 297 - if (rc) 298 - return rc; 299 - /* We can not handle clock comparator or timer interrupt with bad PSW */ 288 + if (kvm_s390_pv_cpu_is_protected(vcpu)) { 289 + newpsw = vcpu->arch.sie_block->gpsw; 290 + } else { 291 + rc = read_guest_lc(vcpu, __LC_EXT_NEW_PSW, &newpsw, sizeof(psw_t)); 292 + if (rc) 293 + return rc; 294 + } 295 + 296 + /* 297 + * Clock comparator or timer interrupt with external interrupt enabled 298 + * will cause interrupt loop. Drop to userspace. 299 + */ 300 300 if ((eic == EXT_IRQ_CLK_COMP || eic == EXT_IRQ_CPU_TIMER) && 301 301 (newpsw.mask & PSW_MASK_EXT)) 302 302 return -EOPNOTSUPP;
+1
arch/s390/kvm/kvm-s390.c
··· 573 573 case KVM_CAP_S390_VCPU_RESETS: 574 574 case KVM_CAP_SET_GUEST_DEBUG: 575 575 case KVM_CAP_S390_DIAG318: 576 + case KVM_CAP_IRQFD_RESAMPLE: 576 577 r = 1; 577 578 break; 578 579 case KVM_CAP_SET_GUEST_DEBUG2:
+33 -3
arch/x86/kvm/ioapic.c
··· 368 368 mask_after = e->fields.mask; 369 369 if (mask_before != mask_after) 370 370 kvm_fire_mask_notifiers(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index, mask_after); 371 - if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG 372 - && ioapic->irr & (1 << index)) 373 - ioapic_service(ioapic, index, false); 371 + if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG && 372 + ioapic->irr & (1 << index) && !e->fields.mask && !e->fields.remote_irr) { 373 + /* 374 + * Pending status in irr may be outdated: the IRQ line may have 375 + * already been deasserted by a device while the IRQ was masked. 376 + * This occurs, for instance, if the interrupt is handled in a 377 + * Linux guest as a oneshot interrupt (IRQF_ONESHOT). In this 378 + * case the guest acknowledges the interrupt to the device in 379 + * its threaded irq handler, i.e. after the EOI but before 380 + * unmasking, so at the time of unmasking the IRQ line is 381 + * already down but our pending irr bit is still set. In such 382 + * cases, injecting this pending interrupt to the guest is 383 + * buggy: the guest will receive an extra unwanted interrupt. 384 + * 385 + * So we need to check here if the IRQ is actually still pending. 386 + * As we are generally not able to probe the IRQ line status 387 + * directly, we do it through irqfd resampler. Namely, we clear 388 + * the pending status and notify the resampler that this interrupt 389 + * is done, without actually injecting it into the guest. If the 390 + * IRQ line is actually already deasserted, we are done. If it is 391 + * still asserted, a new interrupt will be shortly triggered 392 + * through irqfd and injected into the guest. 393 + * 394 + * If, however, it's not possible to resample (no irqfd resampler 395 + * registered for this irq), then unconditionally inject this 396 + * pending interrupt into the guest, so the guest will not miss 397 + * an interrupt, although may get an extra unwanted interrupt. 398 + */ 399 + if (kvm_notify_irqfd_resampler(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index)) 400 + ioapic->irr &= ~(1 << index); 401 + else 402 + ioapic_service(ioapic, index, false); 403 + } 374 404 if (e->fields.delivery_mode == APIC_DM_FIXED) { 375 405 struct kvm_lapic_irq irq; 376 406
+5
arch/x86/kvm/kvm_onhyperv.h
··· 12 12 int hv_remote_flush_tlb(struct kvm *kvm); 13 13 void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp); 14 14 #else /* !CONFIG_HYPERV */ 15 + static inline int hv_remote_flush_tlb(struct kvm *kvm) 16 + { 17 + return -EOPNOTSUPP; 18 + } 19 + 15 20 static inline void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp) 16 21 { 17 22 }
+34 -3
arch/x86/kvm/svm/svm.c
··· 3729 3729 svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); 3730 3730 } 3731 3731 3732 - static void svm_flush_tlb_current(struct kvm_vcpu *vcpu) 3732 + static void svm_flush_tlb_asid(struct kvm_vcpu *vcpu) 3733 3733 { 3734 3734 struct vcpu_svm *svm = to_svm(vcpu); 3735 3735 ··· 3751 3751 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID; 3752 3752 else 3753 3753 svm->current_vmcb->asid_generation--; 3754 + } 3755 + 3756 + static void svm_flush_tlb_current(struct kvm_vcpu *vcpu) 3757 + { 3758 + hpa_t root_tdp = vcpu->arch.mmu->root.hpa; 3759 + 3760 + /* 3761 + * When running on Hyper-V with EnlightenedNptTlb enabled, explicitly 3762 + * flush the NPT mappings via hypercall as flushing the ASID only 3763 + * affects virtual to physical mappings, it does not invalidate guest 3764 + * physical to host physical mappings. 3765 + */ 3766 + if (svm_hv_is_enlightened_tlb_enabled(vcpu) && VALID_PAGE(root_tdp)) 3767 + hyperv_flush_guest_mapping(root_tdp); 3768 + 3769 + svm_flush_tlb_asid(vcpu); 3770 + } 3771 + 3772 + static void svm_flush_tlb_all(struct kvm_vcpu *vcpu) 3773 + { 3774 + /* 3775 + * When running on Hyper-V with EnlightenedNptTlb enabled, remote TLB 3776 + * flushes should be routed to hv_remote_flush_tlb() without requesting 3777 + * a "regular" remote flush. Reaching this point means either there's 3778 + * a KVM bug or a prior hv_remote_flush_tlb() call failed, both of 3779 + * which might be fatal to the guest. Yell, but try to recover. 3780 + */ 3781 + if (WARN_ON_ONCE(svm_hv_is_enlightened_tlb_enabled(vcpu))) 3782 + hv_remote_flush_tlb(vcpu->kvm); 3783 + 3784 + svm_flush_tlb_asid(vcpu); 3754 3785 } 3755 3786 3756 3787 static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva) ··· 4776 4745 .set_rflags = svm_set_rflags, 4777 4746 .get_if_flag = svm_get_if_flag, 4778 4747 4779 - .flush_tlb_all = svm_flush_tlb_current, 4748 + .flush_tlb_all = svm_flush_tlb_all, 4780 4749 .flush_tlb_current = svm_flush_tlb_current, 4781 4750 .flush_tlb_gva = svm_flush_tlb_gva, 4782 - .flush_tlb_guest = svm_flush_tlb_current, 4751 + .flush_tlb_guest = svm_flush_tlb_asid, 4783 4752 4784 4753 .vcpu_pre_run = svm_vcpu_pre_run, 4785 4754 .vcpu_run = svm_vcpu_run,
+15
arch/x86/kvm/svm/svm_onhyperv.h
··· 6 6 #ifndef __ARCH_X86_KVM_SVM_ONHYPERV_H__ 7 7 #define __ARCH_X86_KVM_SVM_ONHYPERV_H__ 8 8 9 + #include <asm/mshyperv.h> 10 + 9 11 #if IS_ENABLED(CONFIG_HYPERV) 10 12 11 13 #include "kvm_onhyperv.h" ··· 16 14 static struct kvm_x86_ops svm_x86_ops; 17 15 18 16 int svm_hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu); 17 + 18 + static inline bool svm_hv_is_enlightened_tlb_enabled(struct kvm_vcpu *vcpu) 19 + { 20 + struct hv_vmcb_enlightenments *hve = &to_svm(vcpu)->vmcb->control.hv_enlightenments; 21 + 22 + return ms_hyperv.nested_features & HV_X64_NESTED_ENLIGHTENED_TLB && 23 + !!hve->hv_enlightenments_control.enlightened_npt_tlb; 24 + } 19 25 20 26 static inline void svm_hv_init_vmcb(struct vmcb *vmcb) 21 27 { ··· 89 79 } 90 80 } 91 81 #else 82 + 83 + static inline bool svm_hv_is_enlightened_tlb_enabled(struct kvm_vcpu *vcpu) 84 + { 85 + return false; 86 + } 92 87 93 88 static inline void svm_hv_init_vmcb(struct vmcb *vmcb) 94 89 {
+6 -1
arch/x86/kvm/vmx/nested.c
··· 3868 3868 exit_qual = 0; 3869 3869 } 3870 3870 3871 - if (ex->has_error_code) { 3871 + /* 3872 + * Unlike AMD's Paged Real Mode, which reports an error code on #PF 3873 + * VM-Exits even if the CPU is in Real Mode, Intel VMX never sets the 3874 + * "has error code" flags on VM-Exit if the CPU is in Real Mode. 3875 + */ 3876 + if (ex->has_error_code && is_protmode(vcpu)) { 3872 3877 /* 3873 3878 * Intel CPUs do not generate error codes with bits 31:16 set, 3874 3879 * and more importantly VMX disallows setting bits 31:16 in the
+12 -2
arch/x86/kvm/x86.c
··· 4432 4432 case KVM_CAP_VAPIC: 4433 4433 case KVM_CAP_ENABLE_CAP: 4434 4434 case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES: 4435 + case KVM_CAP_IRQFD_RESAMPLE: 4435 4436 r = 1; 4436 4437 break; 4437 4438 case KVM_CAP_EXIT_HYPERCALL: ··· 8904 8903 } 8905 8904 8906 8905 if (ctxt->have_exception) { 8906 + WARN_ON_ONCE(vcpu->mmio_needed && !vcpu->mmio_is_write); 8907 + vcpu->mmio_needed = false; 8907 8908 r = 1; 8908 8909 inject_emulated_exception(vcpu); 8909 8910 } else if (vcpu->arch.pio.count) { ··· 9909 9906 9910 9907 static void kvm_inject_exception(struct kvm_vcpu *vcpu) 9911 9908 { 9909 + /* 9910 + * Suppress the error code if the vCPU is in Real Mode, as Real Mode 9911 + * exceptions don't report error codes. The presence of an error code 9912 + * is carried with the exception and only stripped when the exception 9913 + * is injected as intercepted #PF VM-Exits for AMD's Paged Real Mode do 9914 + * report an error code despite the CPU being in Real Mode. 9915 + */ 9916 + vcpu->arch.exception.has_error_code &= is_protmode(vcpu); 9917 + 9912 9918 trace_kvm_inj_exception(vcpu->arch.exception.vector, 9913 9919 vcpu->arch.exception.has_error_code, 9914 9920 vcpu->arch.exception.error_code, 9915 9921 vcpu->arch.exception.injected); 9916 9922 9917 - if (vcpu->arch.exception.error_code && !is_protmode(vcpu)) 9918 - vcpu->arch.exception.error_code = false; 9919 9923 static_call(kvm_x86_inject_exception)(vcpu); 9920 9924 } 9921 9925
+11
include/linux/kvm_host.h
··· 755 755 struct { 756 756 spinlock_t lock; 757 757 struct list_head items; 758 + /* resampler_list update side is protected by resampler_lock. */ 758 759 struct list_head resampler_list; 759 760 struct mutex resampler_lock; 760 761 } irqfds; ··· 1987 1986 #ifdef CONFIG_HAVE_KVM_IRQFD 1988 1987 int kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args); 1989 1988 void kvm_irqfd_release(struct kvm *kvm); 1989 + bool kvm_notify_irqfd_resampler(struct kvm *kvm, 1990 + unsigned int irqchip, 1991 + unsigned int pin); 1990 1992 void kvm_irq_routing_update(struct kvm *); 1991 1993 #else 1992 1994 static inline int kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args) ··· 1998 1994 } 1999 1995 2000 1996 static inline void kvm_irqfd_release(struct kvm *kvm) {} 1997 + 1998 + static inline bool kvm_notify_irqfd_resampler(struct kvm *kvm, 1999 + unsigned int irqchip, 2000 + unsigned int pin) 2001 + { 2002 + return false; 2003 + } 2001 2004 #endif 2002 2005 2003 2006 #else
+1 -1
include/linux/kvm_irqfd.h
··· 31 31 /* 32 32 * Entry in list of kvm->irqfd.resampler_list. Use for sharing 33 33 * resamplers among irqfds on the same gsi. 34 - * Accessed and modified under kvm->irqfds.resampler_lock 34 + * RCU list modified under kvm->irqfds.resampler_lock 35 35 */ 36 36 struct list_head link; 37 37 };
+41 -8
virt/kvm/eventfd.c
··· 55 55 irqfd->gsi, 1, false); 56 56 } 57 57 58 + static void irqfd_resampler_notify(struct kvm_kernel_irqfd_resampler *resampler) 59 + { 60 + struct kvm_kernel_irqfd *irqfd; 61 + 62 + list_for_each_entry_srcu(irqfd, &resampler->list, resampler_link, 63 + srcu_read_lock_held(&resampler->kvm->irq_srcu)) 64 + eventfd_signal(irqfd->resamplefd, 1); 65 + } 66 + 58 67 /* 59 68 * Since resampler irqfds share an IRQ source ID, we de-assert once 60 69 * then notify all of the resampler irqfds using this GSI. We can't ··· 74 65 { 75 66 struct kvm_kernel_irqfd_resampler *resampler; 76 67 struct kvm *kvm; 77 - struct kvm_kernel_irqfd *irqfd; 78 68 int idx; 79 69 80 70 resampler = container_of(kian, ··· 84 76 resampler->notifier.gsi, 0, false); 85 77 86 78 idx = srcu_read_lock(&kvm->irq_srcu); 87 - 88 - list_for_each_entry_srcu(irqfd, &resampler->list, resampler_link, 89 - srcu_read_lock_held(&kvm->irq_srcu)) 90 - eventfd_signal(irqfd->resamplefd, 1); 91 - 79 + irqfd_resampler_notify(resampler); 92 80 srcu_read_unlock(&kvm->irq_srcu, idx); 93 81 } 94 82 ··· 100 96 synchronize_srcu(&kvm->irq_srcu); 101 97 102 98 if (list_empty(&resampler->list)) { 103 - list_del(&resampler->link); 99 + list_del_rcu(&resampler->link); 104 100 kvm_unregister_irq_ack_notifier(kvm, &resampler->notifier); 101 + /* 102 + * synchronize_srcu(&kvm->irq_srcu) already called 103 + * in kvm_unregister_irq_ack_notifier(). 104 + */ 105 105 kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, 106 106 resampler->notifier.gsi, 0, false); 107 107 kfree(resampler); ··· 377 369 resampler->notifier.irq_acked = irqfd_resampler_ack; 378 370 INIT_LIST_HEAD(&resampler->link); 379 371 380 - list_add(&resampler->link, &kvm->irqfds.resampler_list); 372 + list_add_rcu(&resampler->link, &kvm->irqfds.resampler_list); 381 373 kvm_register_irq_ack_notifier(kvm, 382 374 &resampler->notifier); 383 375 irqfd->resampler = resampler; ··· 650 642 } 651 643 652 644 spin_unlock_irq(&kvm->irqfds.lock); 645 + } 646 + 647 + bool kvm_notify_irqfd_resampler(struct kvm *kvm, 648 + unsigned int irqchip, 649 + unsigned int pin) 650 + { 651 + struct kvm_kernel_irqfd_resampler *resampler; 652 + int gsi, idx; 653 + 654 + idx = srcu_read_lock(&kvm->irq_srcu); 655 + gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin); 656 + if (gsi != -1) { 657 + list_for_each_entry_srcu(resampler, 658 + &kvm->irqfds.resampler_list, link, 659 + srcu_read_lock_held(&kvm->irq_srcu)) { 660 + if (resampler->notifier.gsi == gsi) { 661 + irqfd_resampler_notify(resampler); 662 + srcu_read_unlock(&kvm->irq_srcu, idx); 663 + return true; 664 + } 665 + } 666 + } 667 + srcu_read_unlock(&kvm->irq_srcu, idx); 668 + 669 + return false; 653 670 } 654 671 655 672 /*
-1
virt/kvm/kvm_main.c
··· 4479 4479 #endif 4480 4480 #ifdef CONFIG_HAVE_KVM_IRQFD 4481 4481 case KVM_CAP_IRQFD: 4482 - case KVM_CAP_IRQFD_RESAMPLE: 4483 4482 #endif 4484 4483 case KVM_CAP_IOEVENTFD_ANY_LENGTH: 4485 4484 case KVM_CAP_CHECK_EXTENSION_VM: