Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull KVM fixes from Paolo Bonzini:

- Fix a bug where AVIC is incorrectly inhibited when running with
x2AVIC disabled via module param (or on a system without x2AVIC)

- Fix a dangling device posted IRQs bug by explicitly checking if the
irqfd is still active (on the list) when handling an eventfd signal,
instead of zeroing the irqfd's routing information when the irqfd is
deassigned.

Zeroing the irqfd's routing info causes arm64 and x86 to not
disable posting for the IRQ (kvm_arch_irq_bypass_del_producer() looks
for an MSI), incorrectly leaving the IRQ in posted mode (and leading
to use-after-free and memory leaks on AMD in particular).

This is both the most pressing and the scariest fix, but it's been in -next
for a while.

- Disable FORTIFY_SOURCE for KVM selftests to prevent the compiler from
generating calls to the checked versions of memset() and friends,
which leads to unexpected page faults in guest code due to, e.g.,
__memset_chk@plt not being resolved.

- Explicitly configure the supported XSS capabilities from within
{svm,vmx}_set_cpu_caps() to fix a bug where VMX will compute the
reference VMCS configuration with SHSTK and IBT enabled, but then
compute each CPU's local config with SHSTK and IBT disabled if not all
CET xfeatures are enabled, e.g. if the kernel is built with
X86_KERNEL_IBT=n.

The mismatch in features results in differing nVMX settings, and
ultimately causes kvm-intel.ko to refuse to load with nested=1.

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm:
KVM: x86: Explicitly configure supported XSS from {svm,vmx}_set_cpu_caps()
KVM: selftests: Add -U_FORTIFY_SOURCE to avoid some unpredictable test failures
KVM: x86: Assert that non-MSI doesn't have bypass vCPU when deleting producer
KVM: Don't clobber irqfd routing type when deassigning irqfd
KVM: SVM: Check vCPU ID against max x2AVIC ID if and only if x2AVIC is enabled

+52 -36
+2 -1
arch/x86/kvm/irq.c
··· 514 514 */ 515 515 spin_lock_irq(&kvm->irqfds.lock); 516 516 517 - if (irqfd->irq_entry.type == KVM_IRQ_ROUTING_MSI) { 517 + if (irqfd->irq_entry.type == KVM_IRQ_ROUTING_MSI || 518 + WARN_ON_ONCE(irqfd->irq_bypass_vcpu)) { 518 519 ret = kvm_pi_update_irte(irqfd, NULL); 519 520 if (ret) 520 521 pr_info("irq bypass consumer (eventfd %p) unregistration fails: %d\n",
+2 -2
arch/x86/kvm/svm/avic.c
··· 376 376 377 377 static int avic_init_backing_page(struct kvm_vcpu *vcpu) 378 378 { 379 + u32 max_id = x2avic_enabled ? x2avic_max_physical_id : AVIC_MAX_PHYSICAL_ID; 379 380 struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm); 380 381 struct vcpu_svm *svm = to_svm(vcpu); 381 382 u32 id = vcpu->vcpu_id; ··· 389 388 * avic_vcpu_load() expects to be called if and only if the vCPU has 390 389 * fully initialized AVIC. 391 390 */ 392 - if ((!x2avic_enabled && id > AVIC_MAX_PHYSICAL_ID) || 393 - (id > x2avic_max_physical_id)) { 391 + if (id > max_id) { 394 392 kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_PHYSICAL_ID_TOO_BIG); 395 393 vcpu->arch.apic->apicv_active = false; 396 394 return 0;
+2
arch/x86/kvm/svm/svm.c
··· 5284 5284 */ 5285 5285 kvm_cpu_cap_clear(X86_FEATURE_BUS_LOCK_DETECT); 5286 5286 kvm_cpu_cap_clear(X86_FEATURE_MSR_IMM); 5287 + 5288 + kvm_setup_xss_caps(); 5287 5289 } 5288 5290 5289 5291 static __init int svm_hardware_setup(void)
+2
arch/x86/kvm/vmx/vmx.c
··· 8051 8051 kvm_cpu_cap_clear(X86_FEATURE_SHSTK); 8052 8052 kvm_cpu_cap_clear(X86_FEATURE_IBT); 8053 8053 } 8054 + 8055 + kvm_setup_xss_caps(); 8054 8056 } 8055 8057 8056 8058 static bool vmx_is_io_intercepted(struct kvm_vcpu *vcpu,
+17 -13
arch/x86/kvm/x86.c
··· 9953 9953 }; 9954 9954 #endif 9955 9955 9956 + void kvm_setup_xss_caps(void) 9957 + { 9958 + if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES)) 9959 + kvm_caps.supported_xss = 0; 9960 + 9961 + if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) && 9962 + !kvm_cpu_cap_has(X86_FEATURE_IBT)) 9963 + kvm_caps.supported_xss &= ~XFEATURE_MASK_CET_ALL; 9964 + 9965 + if ((kvm_caps.supported_xss & XFEATURE_MASK_CET_ALL) != XFEATURE_MASK_CET_ALL) { 9966 + kvm_cpu_cap_clear(X86_FEATURE_SHSTK); 9967 + kvm_cpu_cap_clear(X86_FEATURE_IBT); 9968 + kvm_caps.supported_xss &= ~XFEATURE_MASK_CET_ALL; 9969 + } 9970 + } 9971 + EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_setup_xss_caps); 9972 + 9956 9973 static inline void kvm_ops_update(struct kvm_x86_init_ops *ops) 9957 9974 { 9958 9975 memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops)); ··· 10141 10124 /* KVM always ignores guest PAT for shadow paging. */ 10142 10125 if (!tdp_enabled) 10143 10126 kvm_caps.supported_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT; 10144 - 10145 - if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES)) 10146 - kvm_caps.supported_xss = 0; 10147 - 10148 - if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) && 10149 - !kvm_cpu_cap_has(X86_FEATURE_IBT)) 10150 - kvm_caps.supported_xss &= ~XFEATURE_MASK_CET_ALL; 10151 - 10152 - if ((kvm_caps.supported_xss & XFEATURE_MASK_CET_ALL) != XFEATURE_MASK_CET_ALL) { 10153 - kvm_cpu_cap_clear(X86_FEATURE_SHSTK); 10154 - kvm_cpu_cap_clear(X86_FEATURE_IBT); 10155 - kvm_caps.supported_xss &= ~XFEATURE_MASK_CET_ALL; 10156 - } 10157 10127 10158 10128 if (kvm_caps.has_tsc_control) { 10159 10129 /*
+2
arch/x86/kvm/x86.h
··· 471 471 472 472 extern bool enable_pmu; 473 473 474 + void kvm_setup_xss_caps(void); 475 + 474 476 /* 475 477 * Get a filtered version of KVM's supported XCR0 that strips out dynamic 476 478 * features for which the current process doesn't (yet) have permission to use.
+1
tools/testing/selftests/kvm/Makefile.kvm
··· 251 251 LINUX_TOOL_ARCH_INCLUDE = $(top_srcdir)/tools/arch/$(ARCH)/include 252 252 CFLAGS += -Wall -Wstrict-prototypes -Wuninitialized -O2 -g -std=gnu99 \ 253 253 -Wno-gnu-variable-sized-type-not-at-end -MD -MP -DCONFIG_64BIT \ 254 + -U_FORTIFY_SOURCE \ 254 255 -fno-builtin-memcmp -fno-builtin-memcpy \ 255 256 -fno-builtin-memset -fno-builtin-strnlen \ 256 257 -fno-stack-protector -fno-PIE -fno-strict-aliasing \
+24 -20
virt/kvm/eventfd.c
··· 157 157 } 158 158 159 159 160 - /* assumes kvm->irqfds.lock is held */ 161 - static bool 162 - irqfd_is_active(struct kvm_kernel_irqfd *irqfd) 160 + static bool irqfd_is_active(struct kvm_kernel_irqfd *irqfd) 163 161 { 162 + /* 163 + * Assert that either irqfds.lock or SRCU is held, as irqfds.lock must 164 + * be held to prevent false positives (on the irqfd being active), and 165 + * while false negatives are impossible as irqfds are never added back 166 + * to the list once they're deactivated, the caller must at least hold 167 + * SRCU to guard against routing changes if the irqfd is deactivated. 168 + */ 169 + lockdep_assert_once(lockdep_is_held(&irqfd->kvm->irqfds.lock) || 170 + srcu_read_lock_held(&irqfd->kvm->irq_srcu)); 171 + 164 172 return list_empty(&irqfd->list) ? false : true; 165 173 } 166 174 167 175 /* 168 176 * Mark the irqfd as inactive and schedule it for removal 169 - * 170 - * assumes kvm->irqfds.lock is held 171 177 */ 172 - static void 173 - irqfd_deactivate(struct kvm_kernel_irqfd *irqfd) 178 + static void irqfd_deactivate(struct kvm_kernel_irqfd *irqfd) 174 179 { 180 + lockdep_assert_held(&irqfd->kvm->irqfds.lock); 181 + 175 182 BUG_ON(!irqfd_is_active(irqfd)); 176 183 177 184 list_del_init(&irqfd->list); ··· 224 217 seq = read_seqcount_begin(&irqfd->irq_entry_sc); 225 218 irq = irqfd->irq_entry; 226 219 } while (read_seqcount_retry(&irqfd->irq_entry_sc, seq)); 227 - /* An event has been signaled, inject an interrupt */ 228 - if (kvm_arch_set_irq_inatomic(&irq, kvm, 220 + 221 + /* 222 + * An event has been signaled, inject an interrupt unless the 223 + * irqfd is being deassigned (isn't active), in which case the 224 + * routing information may be stale (once the irqfd is removed 225 + * from the list, it will stop receiving routing updates). 
226 + */ 227 + if (unlikely(!irqfd_is_active(irqfd)) || 228 + kvm_arch_set_irq_inatomic(&irq, kvm, 229 229 KVM_USERSPACE_IRQ_SOURCE_ID, 1, 230 230 false) == -EWOULDBLOCK) 231 231 schedule_work(&irqfd->inject); ··· 599 585 spin_lock_irq(&kvm->irqfds.lock); 600 586 601 587 list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) { 602 - if (irqfd->eventfd == eventfd && irqfd->gsi == args->gsi) { 603 - /* 604 - * This clearing of irq_entry.type is needed for when 605 - * another thread calls kvm_irq_routing_update before 606 - * we flush workqueue below (we synchronize with 607 - * kvm_irq_routing_update using irqfds.lock). 608 - */ 609 - write_seqcount_begin(&irqfd->irq_entry_sc); 610 - irqfd->irq_entry.type = 0; 611 - write_seqcount_end(&irqfd->irq_entry_sc); 588 + if (irqfd->eventfd == eventfd && irqfd->gsi == args->gsi) 612 589 irqfd_deactivate(irqfd); 613 - } 614 590 } 615 591 616 592 spin_unlock_irq(&kvm->irqfds.lock);