Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull kvm fixes from Paolo Bonzini:

- Fix for compilation of selftests on non-x86 architectures

- Fix for kvm_run->if_flag on SEV-ES

- Fix for page table use-after-free if yielding during exit_mm()

- Improve behavior when userspace starts a nested guest with invalid
state

- Fix missed wakeup with assigned devices but no VT-d posted interrupts

- Do not tell userspace to save/restore an unsupported PMU MSR

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm:
KVM: VMX: Wake vCPU when delivering posted IRQ even if vCPU == this vCPU
KVM: selftests: Add test to verify TRIPLE_FAULT on invalid L2 guest state
KVM: VMX: Fix stale docs for kvm-intel.emulate_invalid_guest_state
KVM: nVMX: Synthesize TRIPLE_FAULT for L2 if emulation is required
KVM: VMX: Always clear vmx->fail on emulation_required
selftests: KVM: Fix non-x86 compiling
KVM: x86: Always set kvm_run->if_flag
KVM: x86/mmu: Don't advance iterator after restart due to yielding
KVM: x86: remove PMU FIXED_CTR3 from msrs_to_save_all

+195 -55
+6 -2
Documentation/admin-guide/kernel-parameters.txt
··· 2413 2413 Default is 1 (enabled) 2414 2414 2415 2415 kvm-intel.emulate_invalid_guest_state= 2416 - [KVM,Intel] Enable emulation of invalid guest states 2417 - Default is 0 (disabled) 2416 + [KVM,Intel] Disable emulation of invalid guest state. 2417 + Ignored if kvm-intel.enable_unrestricted_guest=1, as 2418 + guest state is never invalid for unrestricted guests. 2419 + This param doesn't apply to nested guests (L2), as KVM 2420 + never emulates invalid L2 guest state. 2421 + Default is 1 (enabled) 2418 2422 2419 2423 kvm-intel.flexpriority= 2420 2424 [KVM,Intel] Disable FlexPriority feature (TPR shadow).
+1
arch/x86/include/asm/kvm-x86-ops.h
··· 47 47 KVM_X86_OP(cache_reg) 48 48 KVM_X86_OP(get_rflags) 49 49 KVM_X86_OP(set_rflags) 50 + KVM_X86_OP(get_if_flag) 50 51 KVM_X86_OP(tlb_flush_all) 51 52 KVM_X86_OP(tlb_flush_current) 52 53 KVM_X86_OP_NULL(tlb_remote_flush)
+1
arch/x86/include/asm/kvm_host.h
··· 1349 1349 void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg); 1350 1350 unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); 1351 1351 void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); 1352 + bool (*get_if_flag)(struct kvm_vcpu *vcpu); 1352 1353 1353 1354 void (*tlb_flush_all)(struct kvm_vcpu *vcpu); 1354 1355 void (*tlb_flush_current)(struct kvm_vcpu *vcpu);
+6
arch/x86/kvm/mmu/tdp_iter.c
··· 26 26 */ 27 27 void tdp_iter_restart(struct tdp_iter *iter) 28 28 { 29 + iter->yielded = false; 29 30 iter->yielded_gfn = iter->next_last_level_gfn; 30 31 iter->level = iter->root_level; 31 32 ··· 161 160 */ 162 161 void tdp_iter_next(struct tdp_iter *iter) 163 162 { 163 + if (iter->yielded) { 164 + tdp_iter_restart(iter); 165 + return; 166 + } 167 + 164 168 if (try_step_down(iter)) 165 169 return; 166 170
+6
arch/x86/kvm/mmu/tdp_iter.h
··· 45 45 * iterator walks off the end of the paging structure. 46 46 */ 47 47 bool valid; 48 + /* 49 + * True if KVM dropped mmu_lock and yielded in the middle of a walk, in 50 + * which case tdp_iter_next() needs to restart the walk at the root 51 + * level instead of advancing to the next entry. 52 + */ 53 + bool yielded; 48 54 }; 49 55 50 56 /*
+16 -13
arch/x86/kvm/mmu/tdp_mmu.c
··· 502 502 struct tdp_iter *iter, 503 503 u64 new_spte) 504 504 { 505 + WARN_ON_ONCE(iter->yielded); 506 + 505 507 lockdep_assert_held_read(&kvm->mmu_lock); 506 508 507 509 /* ··· 577 575 u64 new_spte, bool record_acc_track, 578 576 bool record_dirty_log) 579 577 { 578 + WARN_ON_ONCE(iter->yielded); 579 + 580 580 lockdep_assert_held_write(&kvm->mmu_lock); 581 581 582 582 /* ··· 644 640 * If this function should yield and flush is set, it will perform a remote 645 641 * TLB flush before yielding. 646 642 * 647 - * If this function yields, it will also reset the tdp_iter's walk over the 648 - * paging structure and the calling function should skip to the next 649 - * iteration to allow the iterator to continue its traversal from the 650 - * paging structure root. 643 + * If this function yields, iter->yielded is set and the caller must skip to 644 + * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk 645 + * over the paging structures to allow the iterator to continue its traversal 646 + * from the paging structure root. 651 647 * 652 - * Return true if this function yielded and the iterator's traversal was reset. 653 - * Return false if a yield was not needed. 648 + * Returns true if this function yielded. 654 649 */ 655 - static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm, 656 - struct tdp_iter *iter, bool flush, 657 - bool shared) 650 + static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm, 651 + struct tdp_iter *iter, 652 + bool flush, bool shared) 658 653 { 654 + WARN_ON(iter->yielded); 655 + 659 656 /* Ensure forward progress has been made before yielding. */ 660 657 if (iter->next_last_level_gfn == iter->yielded_gfn) 661 658 return false; ··· 676 671 677 672 WARN_ON(iter->gfn > iter->next_last_level_gfn); 678 673 679 - tdp_iter_restart(iter); 680 - 681 - return true; 674 + iter->yielded = true; 682 675 } 683 676 684 - return false; 677 + return iter->yielded; 685 678 } 686 679 687 680 /*
+12 -9
arch/x86/kvm/svm/svm.c
··· 1585 1585 to_svm(vcpu)->vmcb->save.rflags = rflags; 1586 1586 } 1587 1587 1588 + static bool svm_get_if_flag(struct kvm_vcpu *vcpu) 1589 + { 1590 + struct vmcb *vmcb = to_svm(vcpu)->vmcb; 1591 + 1592 + return sev_es_guest(vcpu->kvm) 1593 + ? vmcb->control.int_state & SVM_GUEST_INTERRUPT_MASK 1594 + : kvm_get_rflags(vcpu) & X86_EFLAGS_IF; 1595 + } 1596 + 1588 1597 static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) 1589 1598 { 1590 1599 switch (reg) { ··· 3577 3568 if (!gif_set(svm)) 3578 3569 return true; 3579 3570 3580 - if (sev_es_guest(vcpu->kvm)) { 3581 - /* 3582 - * SEV-ES guests to not expose RFLAGS. Use the VMCB interrupt mask 3583 - * bit to determine the state of the IF flag. 3584 - */ 3585 - if (!(vmcb->control.int_state & SVM_GUEST_INTERRUPT_MASK)) 3586 - return true; 3587 - } else if (is_guest_mode(vcpu)) { 3571 + if (is_guest_mode(vcpu)) { 3588 3572 /* As long as interrupts are being delivered... */ 3589 3573 if ((svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK) 3590 3574 ? !(svm->vmcb01.ptr->save.rflags & X86_EFLAGS_IF) ··· 3588 3586 if (nested_exit_on_intr(svm)) 3589 3587 return false; 3590 3588 } else { 3591 - if (!(kvm_get_rflags(vcpu) & X86_EFLAGS_IF)) 3589 + if (!svm_get_if_flag(vcpu)) 3592 3590 return true; 3593 3591 } 3594 3592 ··· 4623 4621 .cache_reg = svm_cache_reg, 4624 4622 .get_rflags = svm_get_rflags, 4625 4623 .set_rflags = svm_set_rflags, 4624 + .get_if_flag = svm_get_if_flag, 4626 4625 4627 4626 .tlb_flush_all = svm_flush_tlb, 4628 4627 .tlb_flush_current = svm_flush_tlb,
+32 -13
arch/x86/kvm/vmx/vmx.c
··· 1363 1363 vmx->emulation_required = vmx_emulation_required(vcpu); 1364 1364 } 1365 1365 1366 + static bool vmx_get_if_flag(struct kvm_vcpu *vcpu) 1367 + { 1368 + return vmx_get_rflags(vcpu) & X86_EFLAGS_IF; 1369 + } 1370 + 1366 1371 u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu) 1367 1372 { 1368 1373 u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); ··· 3964 3959 if (pi_test_and_set_on(&vmx->pi_desc)) 3965 3960 return 0; 3966 3961 3967 - if (vcpu != kvm_get_running_vcpu() && 3968 - !kvm_vcpu_trigger_posted_interrupt(vcpu, false)) 3962 + if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false)) 3969 3963 kvm_vcpu_kick(vcpu); 3970 3964 3971 3965 return 0; ··· 5881 5877 vmx_flush_pml_buffer(vcpu); 5882 5878 5883 5879 /* 5884 - * We should never reach this point with a pending nested VM-Enter, and 5885 - * more specifically emulation of L2 due to invalid guest state (see 5886 - * below) should never happen as that means we incorrectly allowed a 5887 - * nested VM-Enter with an invalid vmcs12. 5880 + * KVM should never reach this point with a pending nested VM-Enter. 5881 + * More specifically, short-circuiting VM-Entry to emulate L2 due to 5882 + * invalid guest state should never happen as that means KVM knowingly 5883 + * allowed a nested VM-Enter with an invalid vmcs12. More below. 5888 5884 */ 5889 5885 if (KVM_BUG_ON(vmx->nested.nested_run_pending, vcpu->kvm)) 5890 5886 return -EIO; 5891 - 5892 - /* If guest state is invalid, start emulating */ 5893 - if (vmx->emulation_required) 5894 - return handle_invalid_guest_state(vcpu); 5895 5887 5896 5888 if (is_guest_mode(vcpu)) { 5897 5889 /* ··· 5910 5910 */ 5911 5911 nested_mark_vmcs12_pages_dirty(vcpu); 5912 5912 5913 + /* 5914 + * Synthesize a triple fault if L2 state is invalid. In normal 5915 + * operation, nested VM-Enter rejects any attempt to enter L2 5916 + * with invalid state. However, those checks are skipped if 5917 + * state is being stuffed via RSM or KVM_SET_NESTED_STATE. If 5918 + * L2 state is invalid, it means either L1 modified SMRAM state 5919 + * or userspace provided bad state. Synthesize TRIPLE_FAULT as 5920 + * doing so is architecturally allowed in the RSM case, and is 5921 + * the least awful solution for the userspace case without 5922 + * risking false positives. 5923 + */ 5924 + if (vmx->emulation_required) { 5925 + nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0); 5926 + return 1; 5927 + } 5928 + 5913 5929 if (nested_vmx_reflect_vmexit(vcpu)) 5914 5930 return 1; 5915 5931 } 5932 + 5933 + /* If guest state is invalid, start emulating. L2 is handled above. */ 5934 + if (vmx->emulation_required) 5935 + return handle_invalid_guest_state(vcpu); 5916 5936 5917 5937 if (exit_reason.failed_vmentry) { 5918 5938 dump_vmcs(vcpu); ··· 6628 6608 * consistency check VM-Exit due to invalid guest state and bail. 6629 6609 */ 6630 6610 if (unlikely(vmx->emulation_required)) { 6631 - 6632 - /* We don't emulate invalid state of a nested guest */ 6633 - vmx->fail = is_guest_mode(vcpu); 6611 + vmx->fail = 0; 6634 6612 6635 6613 vmx->exit_reason.full = EXIT_REASON_INVALID_STATE; 6636 6614 vmx->exit_reason.failed_vmentry = 1; ··· 7597 7579 .cache_reg = vmx_cache_reg, 7598 7580 .get_rflags = vmx_get_rflags, 7599 7581 .set_rflags = vmx_set_rflags, 7582 + .get_if_flag = vmx_get_if_flag, 7600 7583 7601 7584 .tlb_flush_all = vmx_flush_tlb_all, 7602 7585 .tlb_flush_current = vmx_flush_tlb_current,
+2 -9
arch/x86/kvm/x86.c
··· 1331 1331 MSR_IA32_UMWAIT_CONTROL, 1332 1332 1333 1333 MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1, 1334 - MSR_ARCH_PERFMON_FIXED_CTR0 + 2, MSR_ARCH_PERFMON_FIXED_CTR0 + 3, 1334 + MSR_ARCH_PERFMON_FIXED_CTR0 + 2, 1335 1335 MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS, 1336 1336 MSR_CORE_PERF_GLOBAL_CTRL, MSR_CORE_PERF_GLOBAL_OVF_CTRL, 1337 1337 MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1, ··· 9001 9001 { 9002 9002 struct kvm_run *kvm_run = vcpu->run; 9003 9003 9004 - /* 9005 - * if_flag is obsolete and useless, so do not bother 9006 - * setting it for SEV-ES guests. Userspace can just 9007 - * use kvm_run->ready_for_interrupt_injection. 9008 - */ 9009 - kvm_run->if_flag = !vcpu->arch.guest_state_protected 9010 - && (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0; 9011 - 9004 + kvm_run->if_flag = static_call(kvm_x86_get_if_flag)(vcpu); 9012 9005 kvm_run->cr8 = kvm_get_cr8(vcpu); 9013 9006 kvm_run->apic_base = kvm_get_apic_base(vcpu); 9014 9007
+1
tools/testing/selftests/kvm/.gitignore
··· 35 35 /x86_64/vmx_apic_access_test 36 36 /x86_64/vmx_close_while_nested_test 37 37 /x86_64/vmx_dirty_log_test 38 + /x86_64/vmx_invalid_nested_guest_state 38 39 /x86_64/vmx_preemption_timer_test 39 40 /x86_64/vmx_set_nested_state_test 40 41 /x86_64/vmx_tsc_adjust_test
+1
tools/testing/selftests/kvm/Makefile
··· 64 64 TEST_GEN_PROGS_x86_64 += x86_64/vmx_apic_access_test 65 65 TEST_GEN_PROGS_x86_64 += x86_64/vmx_close_while_nested_test 66 66 TEST_GEN_PROGS_x86_64 += x86_64/vmx_dirty_log_test 67 + TEST_GEN_PROGS_x86_64 += x86_64/vmx_invalid_nested_guest_state 67 68 TEST_GEN_PROGS_x86_64 += x86_64/vmx_set_nested_state_test 68 69 TEST_GEN_PROGS_x86_64 += x86_64/vmx_tsc_adjust_test 69 70 TEST_GEN_PROGS_x86_64 += x86_64/vmx_nested_tsc_scaling_test
+1 -9
tools/testing/selftests/kvm/include/kvm_util.h
··· 71 71 72 72 #endif 73 73 74 - #if defined(__x86_64__) 75 - unsigned long vm_compute_max_gfn(struct kvm_vm *vm); 76 - #else 77 - static inline unsigned long vm_compute_max_gfn(struct kvm_vm *vm) 78 - { 79 - return ((1ULL << vm->pa_bits) >> vm->page_shift) - 1; 80 - } 81 - #endif 82 - 83 74 #define MIN_PAGE_SIZE (1U << MIN_PAGE_SHIFT) 84 75 #define PTES_PER_MIN_PAGE ptes_per_page(MIN_PAGE_SIZE) 85 76 ··· 321 330 322 331 unsigned int vm_get_page_size(struct kvm_vm *vm); 323 332 unsigned int vm_get_page_shift(struct kvm_vm *vm); 333 + unsigned long vm_compute_max_gfn(struct kvm_vm *vm); 324 334 uint64_t vm_get_max_gfn(struct kvm_vm *vm); 325 335 int vm_get_fd(struct kvm_vm *vm); 326 336
+5
tools/testing/selftests/kvm/lib/kvm_util.c
··· 2328 2328 return vm->page_shift; 2329 2329 } 2330 2330 2331 + unsigned long __attribute__((weak)) vm_compute_max_gfn(struct kvm_vm *vm) 2332 + { 2333 + return ((1ULL << vm->pa_bits) >> vm->page_shift) - 1; 2334 + } 2335 + 2331 2336 uint64_t vm_get_max_gfn(struct kvm_vm *vm) 2332 2337 { 2333 2338 return vm->max_gfn;
+105
tools/testing/selftests/kvm/x86_64/vmx_invalid_nested_guest_state.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + #include "test_util.h" 3 + #include "kvm_util.h" 4 + #include "processor.h" 5 + #include "vmx.h" 6 + 7 + #include <string.h> 8 + #include <sys/ioctl.h> 9 + 10 + #include "kselftest.h" 11 + 12 + #define VCPU_ID 0 13 + #define ARBITRARY_IO_PORT 0x2000 14 + 15 + static struct kvm_vm *vm; 16 + 17 + static void l2_guest_code(void) 18 + { 19 + /* 20 + * Generate an exit to L0 userspace, i.e. main(), via I/O to an 21 + * arbitrary port. 22 + */ 23 + asm volatile("inb %%dx, %%al" 24 + : : [port] "d" (ARBITRARY_IO_PORT) : "rax"); 25 + } 26 + 27 + static void l1_guest_code(struct vmx_pages *vmx_pages) 28 + { 29 + #define L2_GUEST_STACK_SIZE 64 30 + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; 31 + 32 + GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); 33 + GUEST_ASSERT(load_vmcs(vmx_pages)); 34 + 35 + /* Prepare the VMCS for L2 execution. */ 36 + prepare_vmcs(vmx_pages, l2_guest_code, 37 + &l2_guest_stack[L2_GUEST_STACK_SIZE]); 38 + 39 + /* 40 + * L2 must be run without unrestricted guest, verify that the selftests 41 + * library hasn't enabled it. Because KVM selftests jump directly to 42 + * 64-bit mode, unrestricted guest support isn't required. 43 + */ 44 + GUEST_ASSERT(!(vmreadz(CPU_BASED_VM_EXEC_CONTROL) & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) || 45 + !(vmreadz(SECONDARY_VM_EXEC_CONTROL) & SECONDARY_EXEC_UNRESTRICTED_GUEST)); 46 + 47 + GUEST_ASSERT(!vmlaunch()); 48 + 49 + /* L2 should triple fault after main() stuffs invalid guest state. */ 50 + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_TRIPLE_FAULT); 51 + GUEST_DONE(); 52 + } 53 + 54 + int main(int argc, char *argv[]) 55 + { 56 + vm_vaddr_t vmx_pages_gva; 57 + struct kvm_sregs sregs; 58 + struct kvm_run *run; 59 + struct ucall uc; 60 + 61 + nested_vmx_check_supported(); 62 + 63 + vm = vm_create_default(VCPU_ID, 0, (void *) l1_guest_code); 64 + 65 + /* Allocate VMX pages and shared descriptors (vmx_pages). */ 66 + vcpu_alloc_vmx(vm, &vmx_pages_gva); 67 + vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva); 68 + 69 + vcpu_run(vm, VCPU_ID); 70 + 71 + run = vcpu_state(vm, VCPU_ID); 72 + 73 + /* 74 + * The first exit to L0 userspace should be an I/O access from L2. 75 + * Running L1 should launch L2 without triggering an exit to userspace. 76 + */ 77 + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, 78 + "Expected KVM_EXIT_IO, got: %u (%s)\n", 79 + run->exit_reason, exit_reason_str(run->exit_reason)); 80 + 81 + TEST_ASSERT(run->io.port == ARBITRARY_IO_PORT, 82 + "Expected IN from port %d from L2, got port %d", 83 + ARBITRARY_IO_PORT, run->io.port); 84 + 85 + /* 86 + * Stuff invalid guest state for L2 by making TR unusuable. The next 87 + * KVM_RUN should induce a TRIPLE_FAULT in L2 as KVM doesn't support 88 + * emulating invalid guest state for L2. 89 + */ 90 + memset(&sregs, 0, sizeof(sregs)); 91 + vcpu_sregs_get(vm, VCPU_ID, &sregs); 92 + sregs.tr.unusable = 1; 93 + vcpu_sregs_set(vm, VCPU_ID, &sregs); 94 + 95 + vcpu_run(vm, VCPU_ID); 96 + 97 + switch (get_ucall(vm, VCPU_ID, &uc)) { 98 + case UCALL_DONE: 99 + break; 100 + case UCALL_ABORT: 101 + TEST_FAIL("%s", (const char *)uc.args[0]); 102 + default: 103 + TEST_FAIL("Unexpected ucall: %lu", uc.cmd); 104 + } 105 + }