Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull kvm fixes from Paolo Bonzini:
"A bit late... I got sidetracked by back-from-vacation routines and
conferences. But most of these patches are already a few weeks old, and
things look calmer on the mailing list than this pull request
would suggest.

x86:

- missing TLB flush

- nested virtualization fixes for SMM (secure boot on nested
hypervisor) and other nested SVM fixes

- syscall fuzzing fixes

- live migration fix for AMD SEV

- mirror VMs now work for SEV-ES too

- fixes for reset

- possible out-of-bounds access in IOAPIC emulation

- fix enlightened VMCS on Windows 2022

ARM:

- Add missing FORCE target when building the EL2 object

- Fix a PMU probe regression on some platforms

Generic:

- KCSAN fixes

selftests:

- random fixes, mostly for clang compilation"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (43 commits)
selftests: KVM: Explicitly use movq to read xmm registers
selftests: KVM: Call ucall_init when setting up in rseq_test
KVM: Remove tlbs_dirty
KVM: X86: Synchronize the shadow pagetable before link it
KVM: X86: Fix missed remote tlb flush in rmap_write_protect()
KVM: x86: nSVM: don't copy virt_ext from vmcb12
KVM: x86: nSVM: test eax for 4K alignment for GP errata workaround
KVM: x86: selftests: test simultaneous uses of V_IRQ from L1 and L0
KVM: x86: nSVM: restore int_vector in svm_clear_vintr
kvm: x86: Add AMD PMU MSRs to msrs_to_save_all[]
KVM: x86: nVMX: re-evaluate emulation_required on nested VM exit
KVM: x86: nVMX: don't fail nested VM entry on invalid guest state if !from_vmentry
KVM: x86: VMX: synthesize invalid VM exit when emulating invalid guest state
KVM: x86: nSVM: refactor svm_leave_smm and smm_enter_smm
KVM: x86: SVM: call KVM_REQ_GET_NESTED_STATE_PAGES on exit from SMM mode
KVM: x86: reset pdptrs_from_userspace when exiting smm
KVM: x86: nSVM: restore the L1 host state prior to resuming nested guest on SMM exit
KVM: nVMX: Filter out all unsupported controls when eVMCS was activated
KVM: KVM: Use cpumask_available() to check for NULL cpumask when kicking vCPUs
KVM: Clean up benign vcpu->cpu data races when kicking vCPUs
...

+556 -269
+1 -1
arch/arm64/kvm/hyp/nvhe/Makefile
··· 54 54 # runtime. Because the hypervisor is part of the kernel binary, relocations 55 55 # produce a kernel VA. We enumerate relocations targeting hyp at build time 56 56 # and convert the kernel VAs at those positions to hyp VAs. 57 - $(obj)/hyp-reloc.S: $(obj)/kvm_nvhe.tmp.o $(obj)/gen-hyprel 57 + $(obj)/hyp-reloc.S: $(obj)/kvm_nvhe.tmp.o $(obj)/gen-hyprel FORCE 58 58 $(call if_changed,hyprel) 59 59 60 60 # 5) Compile hyp-reloc.S and link it into the existing partially linked object.
-3
arch/arm64/kvm/perf.c
··· 50 50 51 51 int kvm_perf_init(void) 52 52 { 53 - if (kvm_pmu_probe_pmuver() != ID_AA64DFR0_PMUVER_IMP_DEF && !is_protected_kvm_enabled()) 54 - static_branch_enable(&kvm_arm_pmu_available); 55 - 56 53 return perf_register_guest_info_callbacks(&kvm_guest_cbs); 57 54 } 58 55
+8 -1
arch/arm64/kvm/pmu-emul.c
··· 740 740 kvm_pmu_create_perf_event(vcpu, select_idx); 741 741 } 742 742 743 - int kvm_pmu_probe_pmuver(void) 743 + void kvm_host_pmu_init(struct arm_pmu *pmu) 744 + { 745 + if (pmu->pmuver != 0 && pmu->pmuver != ID_AA64DFR0_PMUVER_IMP_DEF && 746 + !kvm_arm_support_pmu_v3() && !is_protected_kvm_enabled()) 747 + static_branch_enable(&kvm_arm_pmu_available); 748 + } 749 + 750 + static int kvm_pmu_probe_pmuver(void) 744 751 { 745 752 struct perf_event_attr attr = { }; 746 753 struct perf_event *event;
+2 -2
arch/s390/kvm/interrupt.c
··· 419 419 static void __set_cpu_idle(struct kvm_vcpu *vcpu) 420 420 { 421 421 kvm_s390_set_cpuflags(vcpu, CPUSTAT_WAIT); 422 - set_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.idle_mask); 422 + set_bit(vcpu->vcpu_idx, vcpu->kvm->arch.idle_mask); 423 423 } 424 424 425 425 static void __unset_cpu_idle(struct kvm_vcpu *vcpu) 426 426 { 427 427 kvm_s390_clear_cpuflags(vcpu, CPUSTAT_WAIT); 428 - clear_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.idle_mask); 428 + clear_bit(vcpu->vcpu_idx, vcpu->kvm->arch.idle_mask); 429 429 } 430 430 431 431 static void __reset_intercept_indicators(struct kvm_vcpu *vcpu)
+1 -1
arch/s390/kvm/kvm-s390.c
··· 4066 4066 kvm_s390_patch_guest_per_regs(vcpu); 4067 4067 } 4068 4068 4069 - clear_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.gisa_int.kicked_mask); 4069 + clear_bit(vcpu->vcpu_idx, vcpu->kvm->arch.gisa_int.kicked_mask); 4070 4070 4071 4071 vcpu->arch.sie_block->icptcode = 0; 4072 4072 cpuflags = atomic_read(&vcpu->arch.sie_block->cpuflags);
+1 -1
arch/s390/kvm/kvm-s390.h
··· 79 79 80 80 static inline int is_vcpu_idle(struct kvm_vcpu *vcpu) 81 81 { 82 - return test_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.idle_mask); 82 + return test_bit(vcpu->vcpu_idx, vcpu->kvm->arch.idle_mask); 83 83 } 84 84 85 85 static inline int kvm_is_ucontrol(struct kvm *kvm)
+1 -1
arch/x86/include/asm/kvm_page_track.h
··· 46 46 struct kvm_page_track_notifier_node *node); 47 47 }; 48 48 49 - void kvm_page_track_init(struct kvm *kvm); 49 + int kvm_page_track_init(struct kvm *kvm); 50 50 void kvm_page_track_cleanup(struct kvm *kvm); 51 51 52 52 void kvm_page_track_free_memslot(struct kvm_memory_slot *slot);
+1 -1
arch/x86/kvm/emulate.c
··· 4206 4206 u64 cr4 = ctxt->ops->get_cr(ctxt, 4); 4207 4207 4208 4208 if (cr4 & X86_CR4_TSD && ctxt->ops->cpl(ctxt)) 4209 - return emulate_ud(ctxt); 4209 + return emulate_gp(ctxt, 0); 4210 4210 4211 4211 return X86EMUL_CONTINUE; 4212 4212 }
+3 -4
arch/x86/kvm/hyperv.c
··· 939 939 for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++) 940 940 stimer_init(&hv_vcpu->stimer[i], i); 941 941 942 - hv_vcpu->vp_index = kvm_vcpu_get_idx(vcpu); 942 + hv_vcpu->vp_index = vcpu->vcpu_idx; 943 943 944 944 return 0; 945 945 } ··· 1444 1444 switch (msr) { 1445 1445 case HV_X64_MSR_VP_INDEX: { 1446 1446 struct kvm_hv *hv = to_kvm_hv(vcpu->kvm); 1447 - int vcpu_idx = kvm_vcpu_get_idx(vcpu); 1448 1447 u32 new_vp_index = (u32)data; 1449 1448 1450 1449 if (!host || new_vp_index >= KVM_MAX_VCPUS) ··· 1458 1459 * VP index is changing, adjust num_mismatched_vp_indexes if 1459 1460 * it now matches or no longer matches vcpu_idx. 1460 1461 */ 1461 - if (hv_vcpu->vp_index == vcpu_idx) 1462 + if (hv_vcpu->vp_index == vcpu->vcpu_idx) 1462 1463 atomic_inc(&hv->num_mismatched_vp_indexes); 1463 - else if (new_vp_index == vcpu_idx) 1464 + else if (new_vp_index == vcpu->vcpu_idx) 1464 1465 atomic_dec(&hv->num_mismatched_vp_indexes); 1465 1466 1466 1467 hv_vcpu->vp_index = new_vp_index;
+1 -1
arch/x86/kvm/hyperv.h
··· 83 83 { 84 84 struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); 85 85 86 - return hv_vcpu ? hv_vcpu->vp_index : kvm_vcpu_get_idx(vcpu); 86 + return hv_vcpu ? hv_vcpu->vp_index : vcpu->vcpu_idx; 87 87 } 88 88 89 89 int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host);
+5 -5
arch/x86/kvm/ioapic.c
··· 319 319 unsigned index; 320 320 bool mask_before, mask_after; 321 321 union kvm_ioapic_redirect_entry *e; 322 - unsigned long vcpu_bitmap; 323 322 int old_remote_irr, old_delivery_status, old_dest_id, old_dest_mode; 323 + DECLARE_BITMAP(vcpu_bitmap, KVM_MAX_VCPUS); 324 324 325 325 switch (ioapic->ioregsel) { 326 326 case IOAPIC_REG_VERSION: ··· 384 384 irq.shorthand = APIC_DEST_NOSHORT; 385 385 irq.dest_id = e->fields.dest_id; 386 386 irq.msi_redir_hint = false; 387 - bitmap_zero(&vcpu_bitmap, 16); 387 + bitmap_zero(vcpu_bitmap, KVM_MAX_VCPUS); 388 388 kvm_bitmap_or_dest_vcpus(ioapic->kvm, &irq, 389 - &vcpu_bitmap); 389 + vcpu_bitmap); 390 390 if (old_dest_mode != e->fields.dest_mode || 391 391 old_dest_id != e->fields.dest_id) { 392 392 /* ··· 399 399 kvm_lapic_irq_dest_mode( 400 400 !!e->fields.dest_mode); 401 401 kvm_bitmap_or_dest_vcpus(ioapic->kvm, &irq, 402 - &vcpu_bitmap); 402 + vcpu_bitmap); 403 403 } 404 404 kvm_make_scan_ioapic_request_mask(ioapic->kvm, 405 - &vcpu_bitmap); 405 + vcpu_bitmap); 406 406 } else { 407 407 kvm_make_scan_ioapic_request(ioapic->kvm); 408 408 }
+10 -7
arch/x86/kvm/mmu/mmu.c
··· 2027 2027 } while (!sp->unsync_children); 2028 2028 } 2029 2029 2030 - static void mmu_sync_children(struct kvm_vcpu *vcpu, 2031 - struct kvm_mmu_page *parent) 2030 + static int mmu_sync_children(struct kvm_vcpu *vcpu, 2031 + struct kvm_mmu_page *parent, bool can_yield) 2032 2032 { 2033 2033 int i; 2034 2034 struct kvm_mmu_page *sp; ··· 2055 2055 } 2056 2056 if (need_resched() || rwlock_needbreak(&vcpu->kvm->mmu_lock)) { 2057 2057 kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush); 2058 + if (!can_yield) { 2059 + kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); 2060 + return -EINTR; 2061 + } 2062 + 2058 2063 cond_resched_rwlock_write(&vcpu->kvm->mmu_lock); 2059 2064 flush = false; 2060 2065 } 2061 2066 } 2062 2067 2063 2068 kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush); 2069 + return 0; 2064 2070 } 2065 2071 2066 2072 static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp) ··· 2151 2145 WARN_ON(!list_empty(&invalid_list)); 2152 2146 kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); 2153 2147 } 2154 - 2155 - if (sp->unsync_children) 2156 - kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); 2157 2148 2158 2149 __clear_sp_write_flooding_count(sp); 2159 2150 ··· 3687 3684 write_lock(&vcpu->kvm->mmu_lock); 3688 3685 kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); 3689 3686 3690 - mmu_sync_children(vcpu, sp); 3687 + mmu_sync_children(vcpu, sp, true); 3691 3688 3692 3689 kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); 3693 3690 write_unlock(&vcpu->kvm->mmu_lock); ··· 3703 3700 if (IS_VALID_PAE_ROOT(root)) { 3704 3701 root &= PT64_BASE_ADDR_MASK; 3705 3702 sp = to_shadow_page(root); 3706 - mmu_sync_children(vcpu, sp); 3703 + mmu_sync_children(vcpu, sp, true); 3707 3704 } 3708 3705 } 3709 3706
+2 -2
arch/x86/kvm/mmu/page_track.c
··· 164 164 cleanup_srcu_struct(&head->track_srcu); 165 165 } 166 166 167 - void kvm_page_track_init(struct kvm *kvm) 167 + int kvm_page_track_init(struct kvm *kvm) 168 168 { 169 169 struct kvm_page_track_notifier_head *head; 170 170 171 171 head = &kvm->arch.track_notifier_head; 172 - init_srcu_struct(&head->track_srcu); 173 172 INIT_HLIST_HEAD(&head->track_notifier_list); 173 + return init_srcu_struct(&head->track_srcu); 174 174 } 175 175 176 176 /*
+23 -23
arch/x86/kvm/mmu/paging_tmpl.h
··· 707 707 if (!is_shadow_present_pte(*it.sptep)) { 708 708 table_gfn = gw->table_gfn[it.level - 2]; 709 709 access = gw->pt_access[it.level - 2]; 710 - sp = kvm_mmu_get_page(vcpu, table_gfn, addr, it.level-1, 711 - false, access); 710 + sp = kvm_mmu_get_page(vcpu, table_gfn, addr, 711 + it.level-1, false, access); 712 + /* 713 + * We must synchronize the pagetable before linking it 714 + * because the guest doesn't need to flush tlb when 715 + * the gpte is changed from non-present to present. 716 + * Otherwise, the guest may use the wrong mapping. 717 + * 718 + * For PG_LEVEL_4K, kvm_mmu_get_page() has already 719 + * synchronized it transiently via kvm_sync_page(). 720 + * 721 + * For higher level pagetable, we synchronize it via 722 + * the slower mmu_sync_children(). If it needs to 723 + * break, some progress has been made; return 724 + * RET_PF_RETRY and retry on the next #PF. 725 + * KVM_REQ_MMU_SYNC is not necessary but it 726 + * expedites the process. 727 + */ 728 + if (sp->unsync_children && 729 + mmu_sync_children(vcpu, sp, false)) 730 + return RET_PF_RETRY; 712 731 } 713 732 714 733 /* ··· 1066 1047 * Using the cached information from sp->gfns is safe because: 1067 1048 * - The spte has a reference to the struct page, so the pfn for a given gfn 1068 1049 * can't change unless all sptes pointing to it are nuked first. 1069 - * 1070 - * Note: 1071 - * We should flush all tlbs if spte is dropped even though guest is 1072 - * responsible for it. Since if we don't, kvm_mmu_notifier_invalidate_page 1073 - * and kvm_mmu_notifier_invalidate_range_start detect the mapping page isn't 1074 - * used by guest then tlbs are not flushed, so guest is allowed to access the 1075 - * freed pages. 1076 - * And we increase kvm->tlbs_dirty to delay tlbs flush in this case. 
1077 1050 */ 1078 1051 static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) 1079 1052 { ··· 1118 1107 return 0; 1119 1108 1120 1109 if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) { 1121 - /* 1122 - * Update spte before increasing tlbs_dirty to make 1123 - * sure no tlb flush is lost after spte is zapped; see 1124 - * the comments in kvm_flush_remote_tlbs(). 1125 - */ 1126 - smp_wmb(); 1127 - vcpu->kvm->tlbs_dirty++; 1110 + set_spte_ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH; 1128 1111 continue; 1129 1112 } 1130 1113 ··· 1133 1128 1134 1129 if (gfn != sp->gfns[i]) { 1135 1130 drop_spte(vcpu->kvm, &sp->spt[i]); 1136 - /* 1137 - * The same as above where we are doing 1138 - * prefetch_invalid_gpte(). 1139 - */ 1140 - smp_wmb(); 1141 - vcpu->kvm->tlbs_dirty++; 1131 + set_spte_ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH; 1142 1132 continue; 1143 1133 } 1144 1134
+6 -4
arch/x86/kvm/svm/nested.c
··· 545 545 (svm->nested.ctl.int_ctl & int_ctl_vmcb12_bits) | 546 546 (svm->vmcb01.ptr->control.int_ctl & int_ctl_vmcb01_bits); 547 547 548 - svm->vmcb->control.virt_ext = svm->nested.ctl.virt_ext; 549 548 svm->vmcb->control.int_vector = svm->nested.ctl.int_vector; 550 549 svm->vmcb->control.int_state = svm->nested.ctl.int_state; 551 550 svm->vmcb->control.event_inj = svm->nested.ctl.event_inj; ··· 578 579 } 579 580 580 581 int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa, 581 - struct vmcb *vmcb12) 582 + struct vmcb *vmcb12, bool from_vmrun) 582 583 { 583 584 struct vcpu_svm *svm = to_svm(vcpu); 584 585 int ret; ··· 608 609 nested_vmcb02_prepare_save(svm, vmcb12); 609 610 610 611 ret = nested_svm_load_cr3(&svm->vcpu, vmcb12->save.cr3, 611 - nested_npt_enabled(svm), true); 612 + nested_npt_enabled(svm), from_vmrun); 612 613 if (ret) 613 614 return ret; 614 615 615 616 if (!npt_enabled) 616 617 vcpu->arch.mmu->inject_page_fault = svm_inject_page_fault_nested; 618 + 619 + if (!from_vmrun) 620 + kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); 617 621 618 622 svm_set_gif(svm, true); 619 623 ··· 683 681 684 682 svm->nested.nested_run_pending = 1; 685 683 686 - if (enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12)) 684 + if (enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12, true)) 687 685 goto out_exit_err; 688 686 689 687 if (nested_svm_vmrun_msrpm(svm))
+62 -30
arch/x86/kvm/svm/sev.c
··· 595 595 return 0; 596 596 } 597 597 598 + static int __sev_launch_update_vmsa(struct kvm *kvm, struct kvm_vcpu *vcpu, 599 + int *error) 600 + { 601 + struct sev_data_launch_update_vmsa vmsa; 602 + struct vcpu_svm *svm = to_svm(vcpu); 603 + int ret; 604 + 605 + /* Perform some pre-encryption checks against the VMSA */ 606 + ret = sev_es_sync_vmsa(svm); 607 + if (ret) 608 + return ret; 609 + 610 + /* 611 + * The LAUNCH_UPDATE_VMSA command will perform in-place encryption of 612 + * the VMSA memory content (i.e it will write the same memory region 613 + * with the guest's key), so invalidate it first. 614 + */ 615 + clflush_cache_range(svm->vmsa, PAGE_SIZE); 616 + 617 + vmsa.reserved = 0; 618 + vmsa.handle = to_kvm_svm(kvm)->sev_info.handle; 619 + vmsa.address = __sme_pa(svm->vmsa); 620 + vmsa.len = PAGE_SIZE; 621 + return sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_VMSA, &vmsa, error); 622 + } 623 + 598 624 static int sev_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp) 599 625 { 600 - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; 601 - struct sev_data_launch_update_vmsa vmsa; 602 626 struct kvm_vcpu *vcpu; 603 627 int i, ret; 604 628 605 629 if (!sev_es_guest(kvm)) 606 630 return -ENOTTY; 607 631 608 - vmsa.reserved = 0; 609 - 610 632 kvm_for_each_vcpu(i, vcpu, kvm) { 611 - struct vcpu_svm *svm = to_svm(vcpu); 612 - 613 - /* Perform some pre-encryption checks against the VMSA */ 614 - ret = sev_es_sync_vmsa(svm); 633 + ret = mutex_lock_killable(&vcpu->mutex); 615 634 if (ret) 616 635 return ret; 617 636 618 - /* 619 - * The LAUNCH_UPDATE_VMSA command will perform in-place 620 - * encryption of the VMSA memory content (i.e it will write 621 - * the same memory region with the guest's key), so invalidate 622 - * it first. 
623 - */ 624 - clflush_cache_range(svm->vmsa, PAGE_SIZE); 637 + ret = __sev_launch_update_vmsa(kvm, vcpu, &argp->error); 625 638 626 - vmsa.handle = sev->handle; 627 - vmsa.address = __sme_pa(svm->vmsa); 628 - vmsa.len = PAGE_SIZE; 629 - ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_VMSA, &vmsa, 630 - &argp->error); 639 + mutex_unlock(&vcpu->mutex); 631 640 if (ret) 632 641 return ret; 633 - 634 - svm->vcpu.arch.guest_state_protected = true; 635 642 } 636 643 637 644 return 0; ··· 1404 1397 1405 1398 /* Bind ASID to this guest */ 1406 1399 ret = sev_bind_asid(kvm, start.handle, error); 1407 - if (ret) 1400 + if (ret) { 1401 + sev_decommission(start.handle); 1408 1402 goto e_free_session; 1403 + } 1409 1404 1410 1405 params.handle = start.handle; 1411 1406 if (copy_to_user((void __user *)(uintptr_t)argp->data, ··· 1473 1464 1474 1465 /* Pin guest memory */ 1475 1466 guest_page = sev_pin_memory(kvm, params.guest_uaddr & PAGE_MASK, 1476 - PAGE_SIZE, &n, 0); 1467 + PAGE_SIZE, &n, 1); 1477 1468 if (IS_ERR(guest_page)) { 1478 1469 ret = PTR_ERR(guest_page); 1479 1470 goto e_free_trans; ··· 1510 1501 return sev_issue_cmd(kvm, SEV_CMD_RECEIVE_FINISH, &data, &argp->error); 1511 1502 } 1512 1503 1504 + static bool cmd_allowed_from_miror(u32 cmd_id) 1505 + { 1506 + /* 1507 + * Allow mirrors VM to call KVM_SEV_LAUNCH_UPDATE_VMSA to enable SEV-ES 1508 + * active mirror VMs. Also allow the debugging and status commands. 
1509 + */ 1510 + if (cmd_id == KVM_SEV_LAUNCH_UPDATE_VMSA || 1511 + cmd_id == KVM_SEV_GUEST_STATUS || cmd_id == KVM_SEV_DBG_DECRYPT || 1512 + cmd_id == KVM_SEV_DBG_ENCRYPT) 1513 + return true; 1514 + 1515 + return false; 1516 + } 1517 + 1513 1518 int svm_mem_enc_op(struct kvm *kvm, void __user *argp) 1514 1519 { 1515 1520 struct kvm_sev_cmd sev_cmd; ··· 1540 1517 1541 1518 mutex_lock(&kvm->lock); 1542 1519 1543 - /* enc_context_owner handles all memory enc operations */ 1544 - if (is_mirroring_enc_context(kvm)) { 1520 + /* Only the enc_context_owner handles some memory enc operations. */ 1521 + if (is_mirroring_enc_context(kvm) && 1522 + !cmd_allowed_from_miror(sev_cmd.id)) { 1545 1523 r = -EINVAL; 1546 1524 goto out; 1547 1525 } ··· 1739 1715 { 1740 1716 struct file *source_kvm_file; 1741 1717 struct kvm *source_kvm; 1742 - struct kvm_sev_info *mirror_sev; 1743 - unsigned int asid; 1718 + struct kvm_sev_info source_sev, *mirror_sev; 1744 1719 int ret; 1745 1720 1746 1721 source_kvm_file = fget(source_fd); ··· 1762 1739 goto e_source_unlock; 1763 1740 } 1764 1741 1765 - asid = to_kvm_svm(source_kvm)->sev_info.asid; 1742 + memcpy(&source_sev, &to_kvm_svm(source_kvm)->sev_info, 1743 + sizeof(source_sev)); 1766 1744 1767 1745 /* 1768 1746 * The mirror kvm holds an enc_context_owner ref so its asid can't ··· 1783 1759 /* Set enc_context_owner and copy its encryption context over */ 1784 1760 mirror_sev = &to_kvm_svm(kvm)->sev_info; 1785 1761 mirror_sev->enc_context_owner = source_kvm; 1786 - mirror_sev->asid = asid; 1787 1762 mirror_sev->active = true; 1763 + mirror_sev->asid = source_sev.asid; 1764 + mirror_sev->fd = source_sev.fd; 1765 + mirror_sev->es_active = source_sev.es_active; 1766 + mirror_sev->handle = source_sev.handle; 1767 + /* 1768 + * Do not copy ap_jump_table. Since the mirror does not share the same 1769 + * KVM contexts as the original, and they may have different 1770 + * memory-views. 
1771 + */ 1788 1772 1789 1773 mutex_unlock(&kvm->lock); 1790 1774 return 0;
+74 -63
arch/x86/kvm/svm/svm.c
··· 1566 1566 1567 1567 svm->vmcb->control.int_ctl |= svm->nested.ctl.int_ctl & 1568 1568 V_IRQ_INJECTION_BITS_MASK; 1569 + 1570 + svm->vmcb->control.int_vector = svm->nested.ctl.int_vector; 1569 1571 } 1570 1572 1571 1573 vmcb_mark_dirty(svm->vmcb, VMCB_INTR); ··· 2222 2220 2223 2221 /* Both #GP cases have zero error_code */ 2224 2222 if (error_code) 2223 + goto reinject; 2224 + 2225 + /* All SVM instructions expect page aligned RAX */ 2226 + if (svm->vmcb->save.rax & ~PAGE_MASK) 2225 2227 goto reinject; 2226 2228 2227 2229 /* Decode the instruction for usage later */ ··· 4291 4285 struct kvm_host_map map_save; 4292 4286 int ret; 4293 4287 4294 - if (is_guest_mode(vcpu)) { 4295 - /* FED8h - SVM Guest */ 4296 - put_smstate(u64, smstate, 0x7ed8, 1); 4297 - /* FEE0h - SVM Guest VMCB Physical Address */ 4298 - put_smstate(u64, smstate, 0x7ee0, svm->nested.vmcb12_gpa); 4288 + if (!is_guest_mode(vcpu)) 4289 + return 0; 4299 4290 4300 - svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; 4301 - svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; 4302 - svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP]; 4291 + /* FED8h - SVM Guest */ 4292 + put_smstate(u64, smstate, 0x7ed8, 1); 4293 + /* FEE0h - SVM Guest VMCB Physical Address */ 4294 + put_smstate(u64, smstate, 0x7ee0, svm->nested.vmcb12_gpa); 4303 4295 4304 - ret = nested_svm_vmexit(svm); 4305 - if (ret) 4306 - return ret; 4296 + svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; 4297 + svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; 4298 + svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP]; 4307 4299 4308 - /* 4309 - * KVM uses VMCB01 to store L1 host state while L2 runs but 4310 - * VMCB01 is going to be used during SMM and thus the state will 4311 - * be lost. Temporary save non-VMLOAD/VMSAVE state to the host save 4312 - * area pointed to by MSR_VM_HSAVE_PA. 
APM guarantees that the 4313 - * format of the area is identical to guest save area offsetted 4314 - * by 0x400 (matches the offset of 'struct vmcb_save_area' 4315 - * within 'struct vmcb'). Note: HSAVE area may also be used by 4316 - * L1 hypervisor to save additional host context (e.g. KVM does 4317 - * that, see svm_prepare_guest_switch()) which must be 4318 - * preserved. 4319 - */ 4320 - if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), 4321 - &map_save) == -EINVAL) 4322 - return 1; 4300 + ret = nested_svm_vmexit(svm); 4301 + if (ret) 4302 + return ret; 4323 4303 4324 - BUILD_BUG_ON(offsetof(struct vmcb, save) != 0x400); 4304 + /* 4305 + * KVM uses VMCB01 to store L1 host state while L2 runs but 4306 + * VMCB01 is going to be used during SMM and thus the state will 4307 + * be lost. Temporary save non-VMLOAD/VMSAVE state to the host save 4308 + * area pointed to by MSR_VM_HSAVE_PA. APM guarantees that the 4309 + * format of the area is identical to guest save area offsetted 4310 + * by 0x400 (matches the offset of 'struct vmcb_save_area' 4311 + * within 'struct vmcb'). Note: HSAVE area may also be used by 4312 + * L1 hypervisor to save additional host context (e.g. KVM does 4313 + * that, see svm_prepare_guest_switch()) which must be 4314 + * preserved. 
4315 + */ 4316 + if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), 4317 + &map_save) == -EINVAL) 4318 + return 1; 4325 4319 4326 - svm_copy_vmrun_state(map_save.hva + 0x400, 4327 - &svm->vmcb01.ptr->save); 4320 + BUILD_BUG_ON(offsetof(struct vmcb, save) != 0x400); 4328 4321 4329 - kvm_vcpu_unmap(vcpu, &map_save, true); 4330 - } 4322 + svm_copy_vmrun_state(map_save.hva + 0x400, 4323 + &svm->vmcb01.ptr->save); 4324 + 4325 + kvm_vcpu_unmap(vcpu, &map_save, true); 4331 4326 return 0; 4332 4327 } 4333 4328 ··· 4336 4329 { 4337 4330 struct vcpu_svm *svm = to_svm(vcpu); 4338 4331 struct kvm_host_map map, map_save; 4339 - int ret = 0; 4332 + u64 saved_efer, vmcb12_gpa; 4333 + struct vmcb *vmcb12; 4334 + int ret; 4340 4335 4341 - if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) { 4342 - u64 saved_efer = GET_SMSTATE(u64, smstate, 0x7ed0); 4343 - u64 guest = GET_SMSTATE(u64, smstate, 0x7ed8); 4344 - u64 vmcb12_gpa = GET_SMSTATE(u64, smstate, 0x7ee0); 4345 - struct vmcb *vmcb12; 4336 + if (!guest_cpuid_has(vcpu, X86_FEATURE_LM)) 4337 + return 0; 4346 4338 4347 - if (guest) { 4348 - if (!guest_cpuid_has(vcpu, X86_FEATURE_SVM)) 4349 - return 1; 4339 + /* Non-zero if SMI arrived while vCPU was in guest mode. 
*/ 4340 + if (!GET_SMSTATE(u64, smstate, 0x7ed8)) 4341 + return 0; 4350 4342 4351 - if (!(saved_efer & EFER_SVME)) 4352 - return 1; 4343 + if (!guest_cpuid_has(vcpu, X86_FEATURE_SVM)) 4344 + return 1; 4353 4345 4354 - if (kvm_vcpu_map(vcpu, 4355 - gpa_to_gfn(vmcb12_gpa), &map) == -EINVAL) 4356 - return 1; 4346 + saved_efer = GET_SMSTATE(u64, smstate, 0x7ed0); 4347 + if (!(saved_efer & EFER_SVME)) 4348 + return 1; 4357 4349 4358 - if (svm_allocate_nested(svm)) 4359 - return 1; 4350 + vmcb12_gpa = GET_SMSTATE(u64, smstate, 0x7ee0); 4351 + if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcb12_gpa), &map) == -EINVAL) 4352 + return 1; 4360 4353 4361 - vmcb12 = map.hva; 4354 + ret = 1; 4355 + if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save) == -EINVAL) 4356 + goto unmap_map; 4362 4357 4363 - nested_load_control_from_vmcb12(svm, &vmcb12->control); 4358 + if (svm_allocate_nested(svm)) 4359 + goto unmap_save; 4364 4360 4365 - ret = enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12); 4366 - kvm_vcpu_unmap(vcpu, &map, true); 4361 + /* 4362 + * Restore L1 host state from L1 HSAVE area as VMCB01 was 4363 + * used during SMM (see svm_enter_smm()) 4364 + */ 4367 4365 4368 - /* 4369 - * Restore L1 host state from L1 HSAVE area as VMCB01 was 4370 - * used during SMM (see svm_enter_smm()) 4371 - */ 4372 - if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), 4373 - &map_save) == -EINVAL) 4374 - return 1; 4366 + svm_copy_vmrun_state(&svm->vmcb01.ptr->save, map_save.hva + 0x400); 4375 4367 4376 - svm_copy_vmrun_state(&svm->vmcb01.ptr->save, 4377 - map_save.hva + 0x400); 4368 + /* 4369 + * Enter the nested guest now 4370 + */ 4378 4371 4379 - kvm_vcpu_unmap(vcpu, &map_save, true); 4380 - } 4381 - } 4372 + vmcb12 = map.hva; 4373 + nested_load_control_from_vmcb12(svm, &vmcb12->control); 4374 + ret = enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12, false); 4382 4375 4376 + unmap_save: 4377 + kvm_vcpu_unmap(vcpu, &map_save, true); 4378 + unmap_map: 4379 + kvm_vcpu_unmap(vcpu, &map, 
true); 4383 4380 return ret; 4384 4381 } 4385 4382
+2 -1
arch/x86/kvm/svm/svm.h
··· 459 459 return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_NMI); 460 460 } 461 461 462 - int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb_gpa, struct vmcb *vmcb12); 462 + int enter_svm_guest_mode(struct kvm_vcpu *vcpu, 463 + u64 vmcb_gpa, struct vmcb *vmcb12, bool from_vmrun); 463 464 void svm_leave_nested(struct vcpu_svm *svm); 464 465 void svm_free_nested(struct vcpu_svm *svm); 465 466 int svm_allocate_nested(struct vcpu_svm *svm);
+9 -3
arch/x86/kvm/vmx/evmcs.c
··· 353 353 switch (msr_index) { 354 354 case MSR_IA32_VMX_EXIT_CTLS: 355 355 case MSR_IA32_VMX_TRUE_EXIT_CTLS: 356 - ctl_high &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; 356 + ctl_high &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL; 357 357 break; 358 358 case MSR_IA32_VMX_ENTRY_CTLS: 359 359 case MSR_IA32_VMX_TRUE_ENTRY_CTLS: 360 - ctl_high &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; 360 + ctl_high &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL; 361 361 break; 362 362 case MSR_IA32_VMX_PROCBASED_CTLS2: 363 - ctl_high &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 363 + ctl_high &= ~EVMCS1_UNSUPPORTED_2NDEXEC; 364 + break; 365 + case MSR_IA32_VMX_PINBASED_CTLS: 366 + ctl_high &= ~EVMCS1_UNSUPPORTED_PINCTRL; 367 + break; 368 + case MSR_IA32_VMX_VMFUNC: 369 + ctl_low &= ~EVMCS1_UNSUPPORTED_VMFUNC; 364 370 break; 365 371 } 366 372
+15 -9
arch/x86/kvm/vmx/nested.c
··· 2583 2583 * Guest state is invalid and unrestricted guest is disabled, 2584 2584 * which means L1 attempted VMEntry to L2 with invalid state. 2585 2585 * Fail the VMEntry. 2586 + * 2587 + * However when force loading the guest state (SMM exit or 2588 + * loading nested state after migration, it is possible to 2589 + * have invalid guest state now, which will be later fixed by 2590 + * restoring L2 register state 2586 2591 */ 2587 - if (CC(!vmx_guest_state_valid(vcpu))) { 2592 + if (CC(from_vmentry && !vmx_guest_state_valid(vcpu))) { 2588 2593 *entry_failure_code = ENTRY_FAIL_DEFAULT; 2589 2594 return -EINVAL; 2590 2595 } ··· 4356 4351 if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, 4357 4352 vmcs12->vm_exit_msr_load_count)) 4358 4353 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); 4354 + 4355 + to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu); 4359 4356 } 4360 4357 4361 4358 static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx) ··· 4906 4899 return -ENOMEM; 4907 4900 } 4908 4901 4909 - /* 4910 - * Emulate the VMXON instruction. 4911 - * Currently, we just remember that VMX is active, and do not save or even 4912 - * inspect the argument to VMXON (the so-called "VMXON pointer") because we 4913 - * do not currently need to store anything in that guest-allocated memory 4914 - * region. Consequently, VMCLEAR and VMPTRLD also do not verify that the their 4915 - * argument is different from the VMXON pointer (which the spec says they do). 4916 - */ 4902 + /* Emulate the VMXON instruction. */ 4917 4903 static int handle_vmon(struct kvm_vcpu *vcpu) 4918 4904 { 4919 4905 int ret; ··· 5902 5902 return true; 5903 5903 case EXIT_REASON_VMFUNC: 5904 5904 /* VM functions are emulated through L2->L0 vmexits. */ 5905 + return true; 5906 + case EXIT_REASON_BUS_LOCK: 5907 + /* 5908 + * At present, bus lock VM exit is never exposed to L1. 5909 + * Handle L2's bus locks in L0 directly. 
5910 + */ 5905 5911 return true; 5906 5912 default: 5907 5913 break;
+26 -11
arch/x86/kvm/vmx/vmx.c
··· 1323 1323 vmx_prepare_switch_to_host(to_vmx(vcpu)); 1324 1324 } 1325 1325 1326 - static bool emulation_required(struct kvm_vcpu *vcpu) 1326 + bool vmx_emulation_required(struct kvm_vcpu *vcpu) 1327 1327 { 1328 1328 return emulate_invalid_guest_state && !vmx_guest_state_valid(vcpu); 1329 1329 } ··· 1367 1367 vmcs_writel(GUEST_RFLAGS, rflags); 1368 1368 1369 1369 if ((old_rflags ^ vmx->rflags) & X86_EFLAGS_VM) 1370 - vmx->emulation_required = emulation_required(vcpu); 1370 + vmx->emulation_required = vmx_emulation_required(vcpu); 1371 1371 } 1372 1372 1373 1373 u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu) ··· 1837 1837 &msr_info->data)) 1838 1838 return 1; 1839 1839 /* 1840 - * Enlightened VMCS v1 doesn't have certain fields, but buggy 1841 - * Hyper-V versions are still trying to use corresponding 1842 - * features when they are exposed. Filter out the essential 1843 - * minimum. 1840 + * Enlightened VMCS v1 doesn't have certain VMCS fields but 1841 + * instead of just ignoring the features, different Hyper-V 1842 + * versions are either trying to use them and fail or do some 1843 + * sanity checking and refuse to boot. Filter all unsupported 1844 + * features out. 
1844 1845 */ 1845 1846 if (!msr_info->host_initiated && 1846 1847 vmx->nested.enlightened_vmcs_enabled) ··· 3078 3077 } 3079 3078 3080 3079 /* depends on vcpu->arch.cr0 to be set to a new value */ 3081 - vmx->emulation_required = emulation_required(vcpu); 3080 + vmx->emulation_required = vmx_emulation_required(vcpu); 3082 3081 } 3083 3082 3084 3083 static int vmx_get_max_tdp_level(void) ··· 3331 3330 { 3332 3331 __vmx_set_segment(vcpu, var, seg); 3333 3332 3334 - to_vmx(vcpu)->emulation_required = emulation_required(vcpu); 3333 + to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu); 3335 3334 } 3336 3335 3337 3336 static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) ··· 6622 6621 vmx->loaded_vmcs->soft_vnmi_blocked)) 6623 6622 vmx->loaded_vmcs->entry_time = ktime_get(); 6624 6623 6625 - /* Don't enter VMX if guest state is invalid, let the exit handler 6626 - start emulation until we arrive back to a valid state */ 6627 - if (vmx->emulation_required) 6624 + /* 6625 + * Don't enter VMX if guest state is invalid, let the exit handler 6626 + * start emulation until we arrive back to a valid state. Synthesize a 6627 + * consistency check VM-Exit due to invalid guest state and bail. 6628 + */ 6629 + if (unlikely(vmx->emulation_required)) { 6630 + 6631 + /* We don't emulate invalid state of a nested guest */ 6632 + vmx->fail = is_guest_mode(vcpu); 6633 + 6634 + vmx->exit_reason.full = EXIT_REASON_INVALID_STATE; 6635 + vmx->exit_reason.failed_vmentry = 1; 6636 + kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1); 6637 + vmx->exit_qualification = ENTRY_FAIL_DEFAULT; 6638 + kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2); 6639 + vmx->exit_intr_info = 0; 6628 6640 return EXIT_FASTPATH_NONE; 6641 + } 6629 6642 6630 6643 trace_kvm_entry(vcpu); 6631 6644
+1 -4
arch/x86/kvm/vmx/vmx.h
··· 248 248 * only loaded into hardware when necessary, e.g. SYSCALL #UDs outside 249 249 * of 64-bit mode or if EFER.SCE=1, thus the SYSCALL MSRs don't need to 250 250 * be loaded into hardware if those conditions aren't met. 251 - * nr_active_uret_msrs tracks the number of MSRs that need to be loaded 252 - * into hardware when running the guest. guest_uret_msrs[] is resorted 253 - * whenever the number of "active" uret MSRs is modified. 254 251 */ 255 252 struct vmx_uret_msr guest_uret_msrs[MAX_NR_USER_RETURN_MSRS]; 256 - int nr_active_uret_msrs; 257 253 bool guest_uret_msrs_loaded; 258 254 #ifdef CONFIG_X86_64 259 255 u64 msr_host_kernel_gs_base; ··· 355 359 void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel, 356 360 unsigned long fs_base, unsigned long gs_base); 357 361 int vmx_get_cpl(struct kvm_vcpu *vcpu); 362 + bool vmx_emulation_required(struct kvm_vcpu *vcpu); 358 363 unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu); 359 364 void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); 360 365 u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu);
+26 -2
arch/x86/kvm/x86.c
··· 1332 1332 MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13, 1333 1333 MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15, 1334 1334 MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17, 1335 + 1336 + MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3, 1337 + MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3, 1338 + MSR_F15H_PERF_CTL0, MSR_F15H_PERF_CTL1, MSR_F15H_PERF_CTL2, 1339 + MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5, 1340 + MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2, 1341 + MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5, 1335 1342 }; 1336 1343 1337 1344 static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_all)]; ··· 2976 2969 offsetof(struct compat_vcpu_info, time)); 2977 2970 if (vcpu->xen.vcpu_time_info_set) 2978 2971 kvm_setup_pvclock_page(v, &vcpu->xen.vcpu_time_info_cache, 0); 2979 - if (v == kvm_get_vcpu(v->kvm, 0)) 2972 + if (!v->vcpu_idx) 2980 2973 kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock); 2981 2974 return 0; 2982 2975 } ··· 7665 7658 7666 7659 /* Process a latched INIT or SMI, if any. 
*/ 7667 7660 kvm_make_request(KVM_REQ_EVENT, vcpu); 7661 + 7662 + /* 7663 + * Even if KVM_SET_SREGS2 loaded PDPTRs out of band, 7664 + * on SMM exit we still need to reload them from 7665 + * guest memory 7666 + */ 7667 + vcpu->arch.pdptrs_from_userspace = false; 7668 7668 } 7669 7669 7670 7670 kvm_mmu_reset_context(vcpu); ··· 10666 10652 int r; 10667 10653 10668 10654 vcpu->arch.last_vmentry_cpu = -1; 10655 + vcpu->arch.regs_avail = ~0; 10656 + vcpu->arch.regs_dirty = ~0; 10669 10657 10670 10658 if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu)) 10671 10659 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; ··· 10908 10892 10909 10893 kvm_set_rflags(vcpu, X86_EFLAGS_FIXED); 10910 10894 kvm_rip_write(vcpu, 0xfff0); 10895 + 10896 + vcpu->arch.cr3 = 0; 10897 + kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3); 10911 10898 10912 10899 /* 10913 10900 * CR0.CD/NW are set on RESET, preserved on INIT. Note, some versions ··· 11158 11139 11159 11140 int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) 11160 11141 { 11142 + int ret; 11143 + 11161 11144 if (type) 11162 11145 return -EINVAL; 11146 + 11147 + ret = kvm_page_track_init(kvm); 11148 + if (ret) 11149 + return ret; 11163 11150 11164 11151 INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list); 11165 11152 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); ··· 11199 11174 11200 11175 kvm_apicv_init(kvm); 11201 11176 kvm_hv_init_vm(kvm); 11202 - kvm_page_track_init(kvm); 11203 11177 kvm_mmu_init_vm(kvm); 11204 11178 kvm_xen_init_vm(kvm); 11205 11179
+2
drivers/perf/arm_pmu.c
··· 952 952 pmu->name, pmu->num_events, 953 953 has_nmi ? ", using NMIs" : ""); 954 954 955 + kvm_host_pmu_init(pmu); 956 + 955 957 return 0; 956 958 957 959 out_destroy:
-3
include/kvm/arm_pmu.h
··· 61 61 int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu, 62 62 struct kvm_device_attr *attr); 63 63 int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu); 64 - int kvm_pmu_probe_pmuver(void); 65 64 #else 66 65 struct kvm_pmu { 67 66 }; ··· 116 117 { 117 118 return 0; 118 119 } 119 - 120 - static inline int kvm_pmu_probe_pmuver(void) { return 0xf; } 121 120 122 121 #endif 123 122
-6
include/linux/kvm_host.h
··· 608 608 unsigned long mmu_notifier_range_start; 609 609 unsigned long mmu_notifier_range_end; 610 610 #endif 611 - long tlbs_dirty; 612 611 struct list_head devices; 613 612 u64 manual_dirty_log_protect; 614 613 struct dentry *debugfs_dentry; ··· 718 719 if (vcpu->vcpu_id == id) 719 720 return vcpu; 720 721 return NULL; 721 - } 722 - 723 - static inline int kvm_vcpu_get_idx(struct kvm_vcpu *vcpu) 724 - { 725 - return vcpu->vcpu_idx; 726 722 } 727 723 728 724 #define kvm_for_each_memslot(memslot, slots) \
+6
include/linux/perf/arm_pmu.h
··· 163 163 static inline int arm_pmu_acpi_probe(armpmu_init_fn init_fn) { return 0; } 164 164 #endif 165 165 166 + #ifdef CONFIG_KVM 167 + void kvm_host_pmu_init(struct arm_pmu *pmu); 168 + #else 169 + #define kvm_host_pmu_init(x) do { } while(0) 170 + #endif 171 + 166 172 /* Internal functions only for core arm_pmu code */ 167 173 struct arm_pmu *armpmu_alloc(void); 168 174 struct arm_pmu *armpmu_alloc_atomic(void);
+1
tools/testing/selftests/kvm/.gitignore
··· 24 24 /x86_64/smm_test 25 25 /x86_64/state_test 26 26 /x86_64/svm_vmcall_test 27 + /x86_64/svm_int_ctl_test 27 28 /x86_64/sync_regs_test 28 29 /x86_64/tsc_msrs_test 29 30 /x86_64/userspace_msr_exit_test
+1
tools/testing/selftests/kvm/Makefile
··· 56 56 TEST_GEN_PROGS_x86_64 += x86_64/state_test 57 57 TEST_GEN_PROGS_x86_64 += x86_64/vmx_preemption_timer_test 58 58 TEST_GEN_PROGS_x86_64 += x86_64/svm_vmcall_test 59 + TEST_GEN_PROGS_x86_64 += x86_64/svm_int_ctl_test 59 60 TEST_GEN_PROGS_x86_64 += x86_64/sync_regs_test 60 61 TEST_GEN_PROGS_x86_64 += x86_64/userspace_msr_exit_test 61 62 TEST_GEN_PROGS_x86_64 += x86_64/vmx_apic_access_test
+2 -4
tools/testing/selftests/kvm/access_tracking_perf_test.c
··· 371 371 printf(" -v: specify the number of vCPUs to run.\n"); 372 372 printf(" -o: Overlap guest memory accesses instead of partitioning\n" 373 373 " them into a separate region of memory for each vCPU.\n"); 374 - printf(" -s: specify the type of memory that should be used to\n" 375 - " back the guest data region.\n\n"); 376 - backing_src_help(); 374 + backing_src_help("-s"); 377 375 puts(""); 378 376 exit(0); 379 377 } ··· 379 381 int main(int argc, char *argv[]) 380 382 { 381 383 struct test_params params = { 382 - .backing_src = VM_MEM_SRC_ANONYMOUS, 384 + .backing_src = DEFAULT_VM_MEM_SRC, 383 385 .vcpu_memory_bytes = DEFAULT_PER_VCPU_MEM_SIZE, 384 386 .vcpus = 1, 385 387 };
+7 -8
tools/testing/selftests/kvm/demand_paging_test.c
··· 179 179 return NULL; 180 180 } 181 181 182 - if (!pollfd[0].revents & POLLIN) 182 + if (!(pollfd[0].revents & POLLIN)) 183 183 continue; 184 184 185 185 r = read(uffd, &msg, sizeof(msg)); ··· 416 416 { 417 417 puts(""); 418 418 printf("usage: %s [-h] [-m vm_mode] [-u uffd_mode] [-d uffd_delay_usec]\n" 419 - " [-b memory] [-t type] [-v vcpus] [-o]\n", name); 419 + " [-b memory] [-s type] [-v vcpus] [-o]\n", name); 420 420 guest_modes_help(); 421 421 printf(" -u: use userfaultfd to handle vCPU page faults. Mode is a\n" 422 422 " UFFD registration mode: 'MISSING' or 'MINOR'.\n"); ··· 426 426 printf(" -b: specify the size of the memory region which should be\n" 427 427 " demand paged by each vCPU. e.g. 10M or 3G.\n" 428 428 " Default: 1G\n"); 429 - printf(" -t: The type of backing memory to use. Default: anonymous\n"); 430 - backing_src_help(); 429 + backing_src_help("-s"); 431 430 printf(" -v: specify the number of vCPUs to run.\n"); 432 431 printf(" -o: Overlap guest memory accesses instead of partitioning\n" 433 432 " them into a separate region of memory for each vCPU.\n"); ··· 438 439 { 439 440 int max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS); 440 441 struct test_params p = { 441 - .src_type = VM_MEM_SRC_ANONYMOUS, 442 + .src_type = DEFAULT_VM_MEM_SRC, 442 443 .partition_vcpu_memory_access = true, 443 444 }; 444 445 int opt; 445 446 446 447 guest_modes_append_default(); 447 448 448 - while ((opt = getopt(argc, argv, "hm:u:d:b:t:v:o")) != -1) { 449 + while ((opt = getopt(argc, argv, "hm:u:d:b:s:v:o")) != -1) { 449 450 switch (opt) { 450 451 case 'm': 451 452 guest_modes_cmdline(optarg); ··· 464 465 case 'b': 465 466 guest_percpu_mem_size = parse_size(optarg); 466 467 break; 467 - case 't': 468 + case 's': 468 469 p.src_type = parse_backing_src_type(optarg); 469 470 break; 470 471 case 'v': ··· 484 485 485 486 if (p.uffd_mode == UFFDIO_REGISTER_MODE_MINOR && 486 487 !backing_src_is_shared(p.src_type)) { 487 - TEST_FAIL("userfaultfd MINOR mode requires shared 
memory; pick a different -t"); 488 + TEST_FAIL("userfaultfd MINOR mode requires shared memory; pick a different -s"); 488 489 } 489 490 490 491 for_each_guest_mode(run_test, &p);
+42 -20
tools/testing/selftests/kvm/dirty_log_perf_test.c
··· 118 118 toggle_dirty_logging(vm, slots, false); 119 119 } 120 120 121 - static void get_dirty_log(struct kvm_vm *vm, int slots, unsigned long *bitmap, 122 - uint64_t nr_pages) 121 + static void get_dirty_log(struct kvm_vm *vm, unsigned long *bitmaps[], int slots) 123 122 { 124 - uint64_t slot_pages = nr_pages / slots; 125 123 int i; 126 124 127 125 for (i = 0; i < slots; i++) { 128 126 int slot = PERF_TEST_MEM_SLOT_INDEX + i; 129 - unsigned long *slot_bitmap = bitmap + i * slot_pages; 130 127 131 - kvm_vm_get_dirty_log(vm, slot, slot_bitmap); 128 + kvm_vm_get_dirty_log(vm, slot, bitmaps[i]); 132 129 } 133 130 } 134 131 135 - static void clear_dirty_log(struct kvm_vm *vm, int slots, unsigned long *bitmap, 136 - uint64_t nr_pages) 132 + static void clear_dirty_log(struct kvm_vm *vm, unsigned long *bitmaps[], 133 + int slots, uint64_t pages_per_slot) 137 134 { 138 - uint64_t slot_pages = nr_pages / slots; 139 135 int i; 140 136 141 137 for (i = 0; i < slots; i++) { 142 138 int slot = PERF_TEST_MEM_SLOT_INDEX + i; 143 - unsigned long *slot_bitmap = bitmap + i * slot_pages; 144 139 145 - kvm_vm_clear_dirty_log(vm, slot, slot_bitmap, 0, slot_pages); 140 + kvm_vm_clear_dirty_log(vm, slot, bitmaps[i], 0, pages_per_slot); 146 141 } 142 + } 143 + 144 + static unsigned long **alloc_bitmaps(int slots, uint64_t pages_per_slot) 145 + { 146 + unsigned long **bitmaps; 147 + int i; 148 + 149 + bitmaps = malloc(slots * sizeof(bitmaps[0])); 150 + TEST_ASSERT(bitmaps, "Failed to allocate bitmaps array."); 151 + 152 + for (i = 0; i < slots; i++) { 153 + bitmaps[i] = bitmap_zalloc(pages_per_slot); 154 + TEST_ASSERT(bitmaps[i], "Failed to allocate slot bitmap."); 155 + } 156 + 157 + return bitmaps; 158 + } 159 + 160 + static void free_bitmaps(unsigned long *bitmaps[], int slots) 161 + { 162 + int i; 163 + 164 + for (i = 0; i < slots; i++) 165 + free(bitmaps[i]); 166 + 167 + free(bitmaps); 147 168 } 148 169 149 170 static void run_test(enum vm_guest_mode mode, void *arg) ··· 172 151 
struct test_params *p = arg; 173 152 pthread_t *vcpu_threads; 174 153 struct kvm_vm *vm; 175 - unsigned long *bmap; 154 + unsigned long **bitmaps; 176 155 uint64_t guest_num_pages; 177 156 uint64_t host_num_pages; 157 + uint64_t pages_per_slot; 178 158 int vcpu_id; 179 159 struct timespec start; 180 160 struct timespec ts_diff; ··· 193 171 guest_num_pages = (nr_vcpus * guest_percpu_mem_size) >> vm_get_page_shift(vm); 194 172 guest_num_pages = vm_adjust_num_guest_pages(mode, guest_num_pages); 195 173 host_num_pages = vm_num_host_pages(mode, guest_num_pages); 196 - bmap = bitmap_zalloc(host_num_pages); 174 + pages_per_slot = host_num_pages / p->slots; 175 + 176 + bitmaps = alloc_bitmaps(p->slots, pages_per_slot); 197 177 198 178 if (dirty_log_manual_caps) { 199 179 cap.cap = KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2; ··· 263 239 iteration, ts_diff.tv_sec, ts_diff.tv_nsec); 264 240 265 241 clock_gettime(CLOCK_MONOTONIC, &start); 266 - get_dirty_log(vm, p->slots, bmap, host_num_pages); 242 + get_dirty_log(vm, bitmaps, p->slots); 267 243 ts_diff = timespec_elapsed(start); 268 244 get_dirty_log_total = timespec_add(get_dirty_log_total, 269 245 ts_diff); ··· 272 248 273 249 if (dirty_log_manual_caps) { 274 250 clock_gettime(CLOCK_MONOTONIC, &start); 275 - clear_dirty_log(vm, p->slots, bmap, host_num_pages); 251 + clear_dirty_log(vm, bitmaps, p->slots, pages_per_slot); 276 252 ts_diff = timespec_elapsed(start); 277 253 clear_dirty_log_total = timespec_add(clear_dirty_log_total, 278 254 ts_diff); ··· 305 281 clear_dirty_log_total.tv_nsec, avg.tv_sec, avg.tv_nsec); 306 282 } 307 283 308 - free(bmap); 284 + free_bitmaps(bitmaps, p->slots); 309 285 free(vcpu_threads); 310 286 perf_test_destroy_vm(vm); 311 287 } ··· 332 308 printf(" -v: specify the number of vCPUs to run.\n"); 333 309 printf(" -o: Overlap guest memory accesses instead of partitioning\n" 334 310 " them into a separate region of memory for each vCPU.\n"); 335 - printf(" -s: specify the type of memory that should be used 
to\n" 336 - " back the guest data region.\n\n"); 311 + backing_src_help("-s"); 337 312 printf(" -x: Split the memory region into this number of memslots.\n" 338 - " (default: 1)"); 339 - backing_src_help(); 313 + " (default: 1)\n"); 340 314 puts(""); 341 315 exit(0); 342 316 } ··· 346 324 .iterations = TEST_HOST_LOOP_N, 347 325 .wr_fract = 1, 348 326 .partition_vcpu_memory_access = true, 349 - .backing_src = VM_MEM_SRC_ANONYMOUS, 327 + .backing_src = DEFAULT_VM_MEM_SRC, 350 328 .slots = 1, 351 329 }; 352 330 int opt;
+3 -1
tools/testing/selftests/kvm/include/test_util.h
··· 90 90 NUM_SRC_TYPES, 91 91 }; 92 92 93 + #define DEFAULT_VM_MEM_SRC VM_MEM_SRC_ANONYMOUS 94 + 93 95 struct vm_mem_backing_src_alias { 94 96 const char *name; 95 97 uint32_t flag; ··· 104 102 size_t get_def_hugetlb_pagesz(void); 105 103 const struct vm_mem_backing_src_alias *vm_mem_backing_src_alias(uint32_t i); 106 104 size_t get_backing_src_pagesz(uint32_t i); 107 - void backing_src_help(void); 105 + void backing_src_help(const char *flag); 108 106 enum vm_mem_backing_src_type parse_backing_src_type(const char *type_name); 109 107 long get_run_delay(void); 110 108
+17 -17
tools/testing/selftests/kvm/include/x86_64/processor.h
··· 312 312 } 313 313 } 314 314 315 - typedef unsigned long v1di __attribute__ ((vector_size (8))); 315 + #define GET_XMM(__xmm) \ 316 + ({ \ 317 + unsigned long __val; \ 318 + asm volatile("movq %%"#__xmm", %0" : "=r"(__val) : : #__xmm); \ 319 + __val; \ 320 + }) 321 + 316 322 static inline unsigned long get_xmm(int n) 317 323 { 318 324 assert(n >= 0 && n <= 7); 319 325 320 - register v1di xmm0 __asm__("%xmm0"); 321 - register v1di xmm1 __asm__("%xmm1"); 322 - register v1di xmm2 __asm__("%xmm2"); 323 - register v1di xmm3 __asm__("%xmm3"); 324 - register v1di xmm4 __asm__("%xmm4"); 325 - register v1di xmm5 __asm__("%xmm5"); 326 - register v1di xmm6 __asm__("%xmm6"); 327 - register v1di xmm7 __asm__("%xmm7"); 328 326 switch (n) { 329 327 case 0: 330 - return (unsigned long)xmm0; 328 + return GET_XMM(xmm0); 331 329 case 1: 332 - return (unsigned long)xmm1; 330 + return GET_XMM(xmm1); 333 331 case 2: 334 - return (unsigned long)xmm2; 332 + return GET_XMM(xmm2); 335 333 case 3: 336 - return (unsigned long)xmm3; 334 + return GET_XMM(xmm3); 337 335 case 4: 338 - return (unsigned long)xmm4; 336 + return GET_XMM(xmm4); 339 337 case 5: 340 - return (unsigned long)xmm5; 338 + return GET_XMM(xmm5); 341 339 case 6: 342 - return (unsigned long)xmm6; 340 + return GET_XMM(xmm6); 343 341 case 7: 344 - return (unsigned long)xmm7; 342 + return GET_XMM(xmm7); 345 343 } 344 + 345 + /* never reached */ 346 346 return 0; 347 347 } 348 348
+2 -5
tools/testing/selftests/kvm/kvm_page_table_test.c
··· 456 456 " (default: 1G)\n"); 457 457 printf(" -v: specify the number of vCPUs to run\n" 458 458 " (default: 1)\n"); 459 - printf(" -s: specify the type of memory that should be used to\n" 460 - " back the guest data region.\n" 461 - " (default: anonymous)\n\n"); 462 - backing_src_help(); 459 + backing_src_help("-s"); 463 460 puts(""); 464 461 } 465 462 ··· 465 468 int max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS); 466 469 struct test_params p = { 467 470 .test_mem_size = DEFAULT_TEST_MEM_SIZE, 468 - .src_type = VM_MEM_SRC_ANONYMOUS, 471 + .src_type = DEFAULT_VM_MEM_SRC, 469 472 }; 470 473 int opt; 471 474
+13 -4
tools/testing/selftests/kvm/lib/test_util.c
··· 283 283 } 284 284 } 285 285 286 - void backing_src_help(void) 286 + static void print_available_backing_src_types(const char *prefix) 287 287 { 288 288 int i; 289 289 290 - printf("Available backing src types:\n"); 290 + printf("%sAvailable backing src types:\n", prefix); 291 + 291 292 for (i = 0; i < NUM_SRC_TYPES; i++) 292 - printf("\t%s\n", vm_mem_backing_src_alias(i)->name); 293 + printf("%s %s\n", prefix, vm_mem_backing_src_alias(i)->name); 294 + } 295 + 296 + void backing_src_help(const char *flag) 297 + { 298 + printf(" %s: specify the type of memory that should be used to\n" 299 + " back the guest data region. (default: %s)\n", 300 + flag, vm_mem_backing_src_alias(DEFAULT_VM_MEM_SRC)->name); 301 + print_available_backing_src_types(" "); 293 302 } 294 303 295 304 enum vm_mem_backing_src_type parse_backing_src_type(const char *type_name) ··· 309 300 if (!strcmp(type_name, vm_mem_backing_src_alias(i)->name)) 310 301 return i; 311 302 312 - backing_src_help(); 303 + print_available_backing_src_types(""); 313 304 TEST_FAIL("Unknown backing src type: %s", type_name); 314 305 return -1; 315 306 }
+1
tools/testing/selftests/kvm/rseq_test.c
··· 180 180 * CPU affinity. 181 181 */ 182 182 vm = vm_create_default(VCPU_ID, 0, guest_code); 183 + ucall_init(vm, NULL); 183 184 184 185 pthread_create(&migration_thread, NULL, migration_worker, 0); 185 186
+2 -2
tools/testing/selftests/kvm/steal_time.c
··· 116 116 uint64_t st_time; 117 117 }; 118 118 119 - static int64_t smccc(uint32_t func, uint32_t arg) 119 + static int64_t smccc(uint32_t func, uint64_t arg) 120 120 { 121 121 unsigned long ret; 122 122 123 123 asm volatile( 124 - "mov x0, %1\n" 124 + "mov w0, %w1\n" 125 125 "mov x1, %2\n" 126 126 "hvc #0\n" 127 127 "mov %0, x0\n"
+128
tools/testing/selftests/kvm/x86_64/svm_int_ctl_test.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + /* 3 + * svm_int_ctl_test 4 + * 5 + * Copyright (C) 2021, Red Hat, Inc. 6 + * 7 + * Nested SVM testing: test simultaneous use of V_IRQ from L1 and L0. 8 + */ 9 + 10 + #include "test_util.h" 11 + #include "kvm_util.h" 12 + #include "processor.h" 13 + #include "svm_util.h" 14 + #include "apic.h" 15 + 16 + #define VCPU_ID 0 17 + 18 + static struct kvm_vm *vm; 19 + 20 + bool vintr_irq_called; 21 + bool intr_irq_called; 22 + 23 + #define VINTR_IRQ_NUMBER 0x20 24 + #define INTR_IRQ_NUMBER 0x30 25 + 26 + static void vintr_irq_handler(struct ex_regs *regs) 27 + { 28 + vintr_irq_called = true; 29 + } 30 + 31 + static void intr_irq_handler(struct ex_regs *regs) 32 + { 33 + x2apic_write_reg(APIC_EOI, 0x00); 34 + intr_irq_called = true; 35 + } 36 + 37 + static void l2_guest_code(struct svm_test_data *svm) 38 + { 39 + /* This code raises interrupt INTR_IRQ_NUMBER in the L1's LAPIC, 40 + * and since L1 didn't enable virtual interrupt masking, 41 + * L2 should receive it and not L1. 42 + * 43 + * L2 also has virtual interrupt 'VINTR_IRQ_NUMBER' pending in V_IRQ 44 + * so it should also receive it after the following 'sti'. 45 + */ 46 + x2apic_write_reg(APIC_ICR, 47 + APIC_DEST_SELF | APIC_INT_ASSERT | INTR_IRQ_NUMBER); 48 + 49 + __asm__ __volatile__( 50 + "sti\n" 51 + "nop\n" 52 + ); 53 + 54 + GUEST_ASSERT(vintr_irq_called); 55 + GUEST_ASSERT(intr_irq_called); 56 + 57 + __asm__ __volatile__( 58 + "vmcall\n" 59 + ); 60 + } 61 + 62 + static void l1_guest_code(struct svm_test_data *svm) 63 + { 64 + #define L2_GUEST_STACK_SIZE 64 65 + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; 66 + struct vmcb *vmcb = svm->vmcb; 67 + 68 + x2apic_enable(); 69 + 70 + /* Prepare for L2 execution. 
*/ 71 + generic_svm_setup(svm, l2_guest_code, 72 + &l2_guest_stack[L2_GUEST_STACK_SIZE]); 73 + 74 + /* No virtual interrupt masking */ 75 + vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK; 76 + 77 + /* No intercepts for real and virtual interrupts */ 78 + vmcb->control.intercept &= ~(1ULL << INTERCEPT_INTR | INTERCEPT_VINTR); 79 + 80 + /* Make a virtual interrupt VINTR_IRQ_NUMBER pending */ 81 + vmcb->control.int_ctl |= V_IRQ_MASK | (0x1 << V_INTR_PRIO_SHIFT); 82 + vmcb->control.int_vector = VINTR_IRQ_NUMBER; 83 + 84 + run_guest(vmcb, svm->vmcb_gpa); 85 + GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_VMMCALL); 86 + GUEST_DONE(); 87 + } 88 + 89 + int main(int argc, char *argv[]) 90 + { 91 + vm_vaddr_t svm_gva; 92 + 93 + nested_svm_check_supported(); 94 + 95 + vm = vm_create_default(VCPU_ID, 0, (void *) l1_guest_code); 96 + 97 + vm_init_descriptor_tables(vm); 98 + vcpu_init_descriptor_tables(vm, VCPU_ID); 99 + 100 + vm_install_exception_handler(vm, VINTR_IRQ_NUMBER, vintr_irq_handler); 101 + vm_install_exception_handler(vm, INTR_IRQ_NUMBER, intr_irq_handler); 102 + 103 + vcpu_alloc_svm(vm, &svm_gva); 104 + vcpu_args_set(vm, VCPU_ID, 1, svm_gva); 105 + 106 + struct kvm_run *run = vcpu_state(vm, VCPU_ID); 107 + struct ucall uc; 108 + 109 + vcpu_run(vm, VCPU_ID); 110 + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, 111 + "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n", 112 + run->exit_reason, 113 + exit_reason_str(run->exit_reason)); 114 + 115 + switch (get_ucall(vm, VCPU_ID, &uc)) { 116 + case UCALL_ABORT: 117 + TEST_FAIL("%s", (const char *)uc.args[0]); 118 + break; 119 + /* NOT REACHED */ 120 + case UCALL_DONE: 121 + goto done; 122 + default: 123 + TEST_FAIL("Unknown ucall 0x%lx.", uc.cmd); 124 + } 125 + done: 126 + kvm_vm_free(vm); 127 + return 0; 128 + }
+49 -19
virt/kvm/kvm_main.c
··· 235 235 { 236 236 } 237 237 238 - static inline bool kvm_kick_many_cpus(const struct cpumask *cpus, bool wait) 238 + static inline bool kvm_kick_many_cpus(cpumask_var_t tmp, bool wait) 239 239 { 240 - if (unlikely(!cpus)) 240 + const struct cpumask *cpus; 241 + 242 + if (likely(cpumask_available(tmp))) 243 + cpus = tmp; 244 + else 241 245 cpus = cpu_online_mask; 242 246 243 247 if (cpumask_empty(cpus)) ··· 267 263 continue; 268 264 269 265 kvm_make_request(req, vcpu); 270 - cpu = vcpu->cpu; 271 266 272 267 if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu)) 273 268 continue; 274 269 275 - if (tmp != NULL && cpu != -1 && cpu != me && 276 - kvm_request_needs_ipi(vcpu, req)) 277 - __cpumask_set_cpu(cpu, tmp); 270 + /* 271 + * tmp can be "unavailable" if cpumasks are allocated off stack 272 + * as allocation of the mask is deliberately not fatal and is 273 + * handled by falling back to kicking all online CPUs. 274 + */ 275 + if (!cpumask_available(tmp)) 276 + continue; 277 + 278 + /* 279 + * Note, the vCPU could get migrated to a different pCPU at any 280 + * point after kvm_request_needs_ipi(), which could result in 281 + * sending an IPI to the previous pCPU. But, that's ok because 282 + * the purpose of the IPI is to ensure the vCPU returns to 283 + * OUTSIDE_GUEST_MODE, which is satisfied if the vCPU migrates. 284 + * Entering READING_SHADOW_PAGE_TABLES after this point is also 285 + * ok, as the requirement is only that KVM wait for vCPUs that 286 + * were reading SPTEs _before_ any changes were finalized. See 287 + * kvm_vcpu_kick() for more details on handling requests. 
288 + */ 289 + if (kvm_request_needs_ipi(vcpu, req)) { 290 + cpu = READ_ONCE(vcpu->cpu); 291 + if (cpu != -1 && cpu != me) 292 + __cpumask_set_cpu(cpu, tmp); 293 + } 278 294 } 279 295 280 296 called = kvm_kick_many_cpus(tmp, !!(req & KVM_REQUEST_WAIT)); ··· 326 302 #ifndef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL 327 303 void kvm_flush_remote_tlbs(struct kvm *kvm) 328 304 { 329 - /* 330 - * Read tlbs_dirty before setting KVM_REQ_TLB_FLUSH in 331 - * kvm_make_all_cpus_request. 332 - */ 333 - long dirty_count = smp_load_acquire(&kvm->tlbs_dirty); 334 - 335 305 ++kvm->stat.generic.remote_tlb_flush_requests; 306 + 336 307 /* 337 308 * We want to publish modifications to the page tables before reading 338 309 * mode. Pairs with a memory barrier in arch-specific code. ··· 342 323 if (!kvm_arch_flush_remote_tlb(kvm) 343 324 || kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) 344 325 ++kvm->stat.generic.remote_tlb_flush; 345 - cmpxchg(&kvm->tlbs_dirty, dirty_count, 0); 346 326 } 347 327 EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs); 348 328 #endif ··· 546 528 } 547 529 } 548 530 549 - if (range->flush_on_ret && (ret || kvm->tlbs_dirty)) 531 + if (range->flush_on_ret && ret) 550 532 kvm_flush_remote_tlbs(kvm); 551 533 552 534 if (locked) ··· 3152 3134 3153 3135 static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu) 3154 3136 { 3155 - unsigned int old, val, shrink; 3137 + unsigned int old, val, shrink, grow_start; 3156 3138 3157 3139 old = val = vcpu->halt_poll_ns; 3158 3140 shrink = READ_ONCE(halt_poll_ns_shrink); 3141 + grow_start = READ_ONCE(halt_poll_ns_grow_start); 3159 3142 if (shrink == 0) 3160 3143 val = 0; 3161 3144 else 3162 3145 val /= shrink; 3146 + 3147 + if (val < grow_start) 3148 + val = 0; 3163 3149 3164 3150 vcpu->halt_poll_ns = val; 3165 3151 trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old); ··· 3312 3290 */ 3313 3291 void kvm_vcpu_kick(struct kvm_vcpu *vcpu) 3314 3292 { 3315 - int me; 3316 - int cpu = vcpu->cpu; 3293 + int me, cpu; 3317 3294 3318 3295 if 
(kvm_vcpu_wake_up(vcpu)) 3319 3296 return; 3320 3297 3298 + /* 3299 + * Note, the vCPU could get migrated to a different pCPU at any point 3300 + * after kvm_arch_vcpu_should_kick(), which could result in sending an 3301 + * IPI to the previous pCPU. But, that's ok because the purpose of the 3302 + * IPI is to force the vCPU to leave IN_GUEST_MODE, and migrating the 3303 + * vCPU also requires it to leave IN_GUEST_MODE. 3304 + */ 3321 3305 me = get_cpu(); 3322 - if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) 3323 - if (kvm_arch_vcpu_should_kick(vcpu)) 3306 + if (kvm_arch_vcpu_should_kick(vcpu)) { 3307 + cpu = READ_ONCE(vcpu->cpu); 3308 + if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) 3324 3309 smp_send_reschedule(cpu); 3310 + } 3325 3311 put_cpu(); 3326 3312 } 3327 3313 EXPORT_SYMBOL_GPL(kvm_vcpu_kick);