Merge tag 'kvm-4.13-2' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull more KVM updates from Radim Krčmář:
"Second batch of KVM updates for v4.13

Common:
- add uevents for VM creation/destruction
- annotate and properly access RCU-protected objects

s390:
- fix the definition of the KVM_S390_GET_CMMA_BITS ioctl added in the first v4.13 merge

x86:
- enable the Virtual VMLOAD VMSAVE feature in SVM
- support paravirtual asynchronous page faults while nested
- add Hyper-V userspace interfaces for better migration
- improve master clock corner cases
- extend internal error reporting after EPT misconfig
- correct single-stepping of emulated instructions in SVM
- handle MCE during VM entry
- fix nVMX VM entry checks and nVMX VMCS shadowing"

* tag 'kvm-4.13-2' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (28 commits)
kvm: x86: hyperv: make VP_INDEX managed by userspace
KVM: async_pf: Let guest support delivery of async_pf from guest mode
KVM: async_pf: Force a nested vmexit if the injected #PF is async_pf
KVM: async_pf: Add L1 guest async_pf #PF vmexit handler
KVM: x86: Simplify kvm_x86_ops->queue_exception parameter list
kvm: x86: hyperv: add KVM_CAP_HYPERV_SYNIC2
KVM: x86: make backwards_tsc_observed a per-VM variable
KVM: trigger uevents when creating or destroying a VM
KVM: SVM: Enable Virtual VMLOAD VMSAVE feature
KVM: SVM: Add Virtual VMLOAD VMSAVE feature definition
KVM: SVM: Rename lbr_ctl field in the vmcb control area
KVM: SVM: Prepare for new bit definition in lbr_ctl
KVM: SVM: handle singlestep exception when skipping emulated instructions
KVM: x86: take slots_lock in kvm_free_pit
KVM: s390: Fix KVM_S390_GET_CMMA_BITS ioctl definition
kvm: vmx: Properly handle machine check during VM-entry
KVM: x86: update master clock before computing kvmclock_offset
kvm: nVMX: Shadow "high" parts of shadowed 64-bit VMCS fields
kvm: nVMX: Fix nested_vmx_check_msr_bitmap_controls
kvm: nVMX: Validate the I/O bitmaps on nested VM-entry
...

+470 -207
+18
Documentation/virtual/kvm/api.txt
··· 4329 4329 virtual SMT modes that can be set using KVM_CAP_PPC_SMT. If bit N 4330 4330 (counting from the right) is set, then a virtual SMT mode of 2^N is 4331 4331 available. 4332 + 4333 + 8.11 KVM_CAP_HYPERV_SYNIC2 4334 + 4335 + Architectures: x86 4336 + 4337 + This capability enables a newer version of Hyper-V Synthetic interrupt 4338 + controller (SynIC). The only difference with KVM_CAP_HYPERV_SYNIC is that KVM 4339 + doesn't clear SynIC message and event flags pages when they are enabled by 4340 + writing to the respective MSRs. 4341 + 4342 + 8.12 KVM_CAP_HYPERV_VP_INDEX 4343 + 4344 + Architectures: x86 4345 + 4346 + This capability indicates that userspace can load HV_X64_MSR_VP_INDEX msr. Its 4347 + value is used to denote the target vcpu for a SynIC interrupt. For 4348 + compatibilty, KVM initializes this msr to KVM's internal vcpu index. When this 4349 + capability is absent, userspace can still query this msr's value.
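For reference, a minimal userspace sketch of how the two capabilities above might be exercised; vcpu_fd and the helper names are assumptions, not part of this series.

    #include <linux/kvm.h>
    #include <sys/ioctl.h>

    #define HV_X64_MSR_VP_INDEX 0x40000002

    /* Sketch: enable SynIC v2 so KVM leaves the message/event pages alone. */
    static int enable_synic2(int vcpu_fd)
    {
        struct kvm_enable_cap cap = { .cap = KVM_CAP_HYPERV_SYNIC2 };

        return ioctl(vcpu_fd, KVM_ENABLE_CAP, &cap);
    }

    /* Sketch: restore the VP index on the destination during migration,
     * which is the host-initiated write that KVM_CAP_HYPERV_VP_INDEX
     * advertises. */
    static int set_vp_index(int vcpu_fd, __u32 vp_index)
    {
        struct {
            struct kvm_msrs hdr;
            struct kvm_msr_entry entry;
        } msrs = {
            .hdr.nmsrs   = 1,
            .entry.index = HV_X64_MSR_VP_INDEX,
            .entry.data  = vp_index,
        };

        return ioctl(vcpu_fd, KVM_SET_MSRS, &msrs) == 1 ? 0 : -1;
    }

Guest-initiated writes of the MSR are still rejected; only host-initiated writes like the one above are accepted, matching the hyperv.c change further down.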
+3 -2
Documentation/virtual/kvm/msr.txt
··· 166 166 MSR_KVM_ASYNC_PF_EN: 0x4b564d02 167 167 data: Bits 63-6 hold 64-byte aligned physical address of a 168 168 64 byte memory area which must be in guest RAM and must be 169 - zeroed. Bits 5-2 are reserved and should be zero. Bit 0 is 1 169 + zeroed. Bits 5-3 are reserved and should be zero. Bit 0 is 1 170 170 when asynchronous page faults are enabled on the vcpu 0 when 171 171 disabled. Bit 1 is 1 if asynchronous page faults can be injected 172 - when vcpu is in cpl == 0. 172 + when vcpu is in cpl == 0. Bit 2 is 1 if asynchronous page faults 173 + are delivered to L1 as #PF vmexits. 173 174 174 175 First 4 byte of 64 byte memory location will be written to by 175 176 the hypervisor at the time of asynchronous page fault (APF)
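As a rough guest-side illustration of the updated bit layout; the helper is hypothetical, and the authoritative user is the arch/x86/kernel/kvm.c hunk further down, which probes the new bit with wrmsr_safe() and falls back when the host rejects it.

    #include <stdint.h>

    #define MSR_KVM_ASYNC_PF_EN                 0x4b564d02
    #define KVM_ASYNC_PF_ENABLED                (1 << 0)
    #define KVM_ASYNC_PF_SEND_ALWAYS            (1 << 1)
    #define KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT  (1 << 2)  /* new bit 2 */

    /* Sketch: build the value a guest writes to MSR_KVM_ASYNC_PF_EN.
     * 'pa' must be the 64-byte aligned physical address of the APF area. */
    static uint64_t async_pf_msr_value(uint64_t pa, int want_l1_pf_vmexit)
    {
        uint64_t val = pa | KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_SEND_ALWAYS;

        if (want_l1_pf_vmexit)  /* deliver async #PF to L1 as a #PF vmexit */
            val |= KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;
        return val;
    }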
+1
arch/x86/include/asm/cpufeatures.h
··· 286 286 #define X86_FEATURE_PAUSEFILTER (15*32+10) /* filtered pause intercept */ 287 287 #define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */ 288 288 #define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */ 289 + #define X86_FEATURE_VIRTUAL_VMLOAD_VMSAVE (15*32+15) /* Virtual VMLOAD VMSAVE */ 289 290 290 291 /* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx), word 16 */ 291 292 #define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/
+1
arch/x86/include/asm/kvm_emulate.h
··· 23 23 u16 error_code; 24 24 bool nested_page_fault; 25 25 u64 address; /* cr2 or nested page fault gpa */ 26 + u8 async_page_fault; 26 27 }; 27 28 28 29 /*
+8 -3
arch/x86/include/asm/kvm_host.h
··· 462 462 DECLARE_BITMAP(auto_eoi_bitmap, 256); 463 463 DECLARE_BITMAP(vec_bitmap, 256); 464 464 bool active; 465 + bool dont_zero_synic_pages; 465 466 }; 466 467 467 468 /* Hyper-V per vcpu emulation context */ 468 469 struct kvm_vcpu_hv { 470 + u32 vp_index; 469 471 u64 hv_vapic; 470 472 s64 runtime_offset; 471 473 struct kvm_vcpu_hv_synic synic; ··· 551 549 bool reinject; 552 550 u8 nr; 553 551 u32 error_code; 552 + u8 nested_apf; 554 553 } exception; 555 554 556 555 struct kvm_queued_interrupt { ··· 652 649 u64 msr_val; 653 650 u32 id; 654 651 bool send_user_only; 652 + u32 host_apf_reason; 653 + unsigned long nested_apf_token; 654 + bool delivery_as_pf_vmexit; 655 655 } apf; 656 656 657 657 /* OSVW MSRs (AMD only) */ ··· 809 803 int audit_point; 810 804 #endif 811 805 806 + bool backwards_tsc_observed; 812 807 bool boot_vcpu_runs_old_kvmclock; 813 808 u32 bsp_vcpu_id; 814 809 ··· 959 952 unsigned char *hypercall_addr); 960 953 void (*set_irq)(struct kvm_vcpu *vcpu); 961 954 void (*set_nmi)(struct kvm_vcpu *vcpu); 962 - void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr, 963 - bool has_error_code, u32 error_code, 964 - bool reinject); 955 + void (*queue_exception)(struct kvm_vcpu *vcpu); 965 956 void (*cancel_injection)(struct kvm_vcpu *vcpu); 966 957 int (*interrupt_allowed)(struct kvm_vcpu *vcpu); 967 958 int (*nmi_allowed)(struct kvm_vcpu *vcpu);
+4 -1
arch/x86/include/asm/svm.h
··· 83 83 u32 event_inj; 84 84 u32 event_inj_err; 85 85 u64 nested_cr3; 86 - u64 lbr_ctl; 86 + u64 virt_ext; 87 87 u32 clean; 88 88 u32 reserved_5; 89 89 u64 next_rip; ··· 118 118 119 119 #define AVIC_ENABLE_SHIFT 31 120 120 #define AVIC_ENABLE_MASK (1 << AVIC_ENABLE_SHIFT) 121 + 122 + #define LBR_CTL_ENABLE_MASK BIT_ULL(0) 123 + #define VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK BIT_ULL(1) 121 124 122 125 #define SVM_INTERRUPT_SHADOW_MASK 1 123 126
+1
arch/x86/include/uapi/asm/kvm_para.h
··· 67 67 68 68 #define KVM_ASYNC_PF_ENABLED (1 << 0) 69 69 #define KVM_ASYNC_PF_SEND_ALWAYS (1 << 1) 70 + #define KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT (1 << 2) 70 71 71 72 /* Operations for KVM_HC_MMU_OP */ 72 73 #define KVM_MMU_OP_WRITE_PTE 1
+6 -1
arch/x86/kernel/kvm.c
··· 330 330 #ifdef CONFIG_PREEMPT 331 331 pa |= KVM_ASYNC_PF_SEND_ALWAYS; 332 332 #endif 333 - wrmsrl(MSR_KVM_ASYNC_PF_EN, pa | KVM_ASYNC_PF_ENABLED); 333 + pa |= KVM_ASYNC_PF_ENABLED; 334 + 335 + /* Async page fault support for L1 hypervisor is optional */ 336 + if (wrmsr_safe(MSR_KVM_ASYNC_PF_EN, 337 + (pa | KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT) & 0xffffffff, pa >> 32) < 0) 338 + wrmsrl(MSR_KVM_ASYNC_PF_EN, pa); 334 339 __this_cpu_write(apf_reason.enabled, 1); 335 340 printk(KERN_INFO"KVM setup async PF for cpu %d\n", 336 341 smp_processor_id());
+44 -23
arch/x86/kvm/hyperv.c
··· 106 106 return 0; 107 107 } 108 108 109 - static struct kvm_vcpu_hv_synic *synic_get(struct kvm *kvm, u32 vcpu_id) 109 + static struct kvm_vcpu *get_vcpu_by_vpidx(struct kvm *kvm, u32 vpidx) 110 + { 111 + struct kvm_vcpu *vcpu = NULL; 112 + int i; 113 + 114 + if (vpidx < KVM_MAX_VCPUS) 115 + vcpu = kvm_get_vcpu(kvm, vpidx); 116 + if (vcpu && vcpu_to_hv_vcpu(vcpu)->vp_index == vpidx) 117 + return vcpu; 118 + kvm_for_each_vcpu(i, vcpu, kvm) 119 + if (vcpu_to_hv_vcpu(vcpu)->vp_index == vpidx) 120 + return vcpu; 121 + return NULL; 122 + } 123 + 124 + static struct kvm_vcpu_hv_synic *synic_get(struct kvm *kvm, u32 vpidx) 110 125 { 111 126 struct kvm_vcpu *vcpu; 112 127 struct kvm_vcpu_hv_synic *synic; 113 128 114 - if (vcpu_id >= atomic_read(&kvm->online_vcpus)) 115 - return NULL; 116 - vcpu = kvm_get_vcpu(kvm, vcpu_id); 129 + vcpu = get_vcpu_by_vpidx(kvm, vpidx); 117 130 if (!vcpu) 118 131 return NULL; 119 132 synic = vcpu_to_synic(vcpu); ··· 234 221 synic->version = data; 235 222 break; 236 223 case HV_X64_MSR_SIEFP: 237 - if (data & HV_SYNIC_SIEFP_ENABLE) 224 + if ((data & HV_SYNIC_SIEFP_ENABLE) && !host && 225 + !synic->dont_zero_synic_pages) 238 226 if (kvm_clear_guest(vcpu->kvm, 239 227 data & PAGE_MASK, PAGE_SIZE)) { 240 228 ret = 1; ··· 246 232 synic_exit(synic, msr); 247 233 break; 248 234 case HV_X64_MSR_SIMP: 249 - if (data & HV_SYNIC_SIMP_ENABLE) 235 + if ((data & HV_SYNIC_SIMP_ENABLE) && !host && 236 + !synic->dont_zero_synic_pages) 250 237 if (kvm_clear_guest(vcpu->kvm, 251 238 data & PAGE_MASK, PAGE_SIZE)) { 252 239 ret = 1; ··· 333 318 return ret; 334 319 } 335 320 336 - int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vcpu_id, u32 sint) 321 + int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vpidx, u32 sint) 337 322 { 338 323 struct kvm_vcpu_hv_synic *synic; 339 324 340 - synic = synic_get(kvm, vcpu_id); 325 + synic = synic_get(kvm, vpidx); 341 326 if (!synic) 342 327 return -EINVAL; 343 328 ··· 356 341 kvm_hv_notify_acked_sint(vcpu, i); 357 342 } 358 343 359 - static int kvm_hv_set_sint_gsi(struct kvm *kvm, u32 vcpu_id, u32 sint, int gsi) 344 + static int kvm_hv_set_sint_gsi(struct kvm *kvm, u32 vpidx, u32 sint, int gsi) 360 345 { 361 346 struct kvm_vcpu_hv_synic *synic; 362 347 363 - synic = synic_get(kvm, vcpu_id); 348 + synic = synic_get(kvm, vpidx); 364 349 if (!synic) 365 350 return -EINVAL; 366 351 ··· 702 687 stimer_init(&hv_vcpu->stimer[i], i); 703 688 } 704 689 705 - int kvm_hv_activate_synic(struct kvm_vcpu *vcpu) 690 + void kvm_hv_vcpu_postcreate(struct kvm_vcpu *vcpu) 706 691 { 692 + struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu); 693 + 694 + hv_vcpu->vp_index = kvm_vcpu_get_idx(vcpu); 695 + } 696 + 697 + int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages) 698 + { 699 + struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu); 700 + 707 701 /* 708 702 * Hyper-V SynIC auto EOI SINT's are 709 703 * not compatible with APICV, so deactivate APICV 710 704 */ 711 705 kvm_vcpu_deactivate_apicv(vcpu); 712 - vcpu_to_synic(vcpu)->active = true; 706 + synic->active = true; 707 + synic->dont_zero_synic_pages = dont_zero_synic_pages; 713 708 return 0; 714 709 } 715 710 ··· 1003 978 struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv; 1004 979 1005 980 switch (msr) { 981 + case HV_X64_MSR_VP_INDEX: 982 + if (!host) 983 + return 1; 984 + hv->vp_index = (u32)data; 985 + break; 1006 986 case HV_X64_MSR_APIC_ASSIST_PAGE: { 1007 987 u64 gfn; 1008 988 unsigned long addr; ··· 1119 1089 struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv; 1120 1090 1121 1091 switch (msr) { 1122 
- case HV_X64_MSR_VP_INDEX: { 1123 - int r; 1124 - struct kvm_vcpu *v; 1125 - 1126 - kvm_for_each_vcpu(r, v, vcpu->kvm) { 1127 - if (v == vcpu) { 1128 - data = r; 1129 - break; 1130 - } 1131 - } 1092 + case HV_X64_MSR_VP_INDEX: 1093 + data = hv->vp_index; 1132 1094 break; 1133 - } 1134 1095 case HV_X64_MSR_EOI: 1135 1096 return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata); 1136 1097 case HV_X64_MSR_ICR:
+2 -1
arch/x86/kvm/hyperv.h
··· 56 56 void kvm_hv_irq_routing_update(struct kvm *kvm); 57 57 int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vcpu_id, u32 sint); 58 58 void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector); 59 - int kvm_hv_activate_synic(struct kvm_vcpu *vcpu); 59 + int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages); 60 60 61 61 void kvm_hv_vcpu_init(struct kvm_vcpu *vcpu); 62 + void kvm_hv_vcpu_postcreate(struct kvm_vcpu *vcpu); 62 63 void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu); 63 64 64 65 static inline struct kvm_vcpu_hv_stimer *vcpu_to_stimer(struct kvm_vcpu *vcpu,
+2
arch/x86/kvm/i8254.c
··· 724 724 struct kvm_pit *pit = kvm->arch.vpit; 725 725 726 726 if (pit) { 727 + mutex_lock(&kvm->slots_lock); 727 728 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->dev); 728 729 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->speaker_dev); 730 + mutex_unlock(&kvm->slots_lock); 729 731 kvm_pit_set_reinject(pit, false); 730 732 hrtimer_cancel(&pit->pit_state.timer); 731 733 kthread_destroy_worker(pit->worker);
+34 -1
arch/x86/kvm/mmu.c
··· 46 46 #include <asm/io.h> 47 47 #include <asm/vmx.h> 48 48 #include <asm/kvm_page_track.h> 49 + #include "trace.h" 49 50 50 51 /* 51 52 * When setting this variable to true it enables Two-Dimensional-Paging ··· 3749 3748 kvm_event_needs_reinjection(vcpu))) 3750 3749 return false; 3751 3750 3752 - if (is_guest_mode(vcpu)) 3751 + if (!vcpu->arch.apf.delivery_as_pf_vmexit && is_guest_mode(vcpu)) 3753 3752 return false; 3754 3753 3755 3754 return kvm_x86_ops->interrupt_allowed(vcpu); ··· 3780 3779 *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, write, writable); 3781 3780 return false; 3782 3781 } 3782 + 3783 + int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, 3784 + u64 fault_address, char *insn, int insn_len, 3785 + bool need_unprotect) 3786 + { 3787 + int r = 1; 3788 + 3789 + switch (vcpu->arch.apf.host_apf_reason) { 3790 + default: 3791 + trace_kvm_page_fault(fault_address, error_code); 3792 + 3793 + if (need_unprotect && kvm_event_needs_reinjection(vcpu)) 3794 + kvm_mmu_unprotect_page_virt(vcpu, fault_address); 3795 + r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn, 3796 + insn_len); 3797 + break; 3798 + case KVM_PV_REASON_PAGE_NOT_PRESENT: 3799 + vcpu->arch.apf.host_apf_reason = 0; 3800 + local_irq_disable(); 3801 + kvm_async_pf_task_wait(fault_address); 3802 + local_irq_enable(); 3803 + break; 3804 + case KVM_PV_REASON_PAGE_READY: 3805 + vcpu->arch.apf.host_apf_reason = 0; 3806 + local_irq_disable(); 3807 + kvm_async_pf_task_wake(fault_address); 3808 + local_irq_enable(); 3809 + break; 3810 + } 3811 + return r; 3812 + } 3813 + EXPORT_SYMBOL_GPL(kvm_handle_page_fault); 3783 3814 3784 3815 static bool 3785 3816 check_hugepage_cache_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int level)
+3
arch/x86/kvm/mmu.h
··· 77 77 void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, 78 78 bool accessed_dirty); 79 79 bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu); 80 + int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, 81 + u64 fault_address, char *insn, int insn_len, 82 + bool need_unprotect); 80 83 81 84 static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) 82 85 {
+82 -69
arch/x86/kvm/svm.c
··· 194 194 195 195 unsigned int3_injected; 196 196 unsigned long int3_rip; 197 - u32 apf_reason; 198 197 199 198 /* cached guest cpuid flags for faster access */ 200 199 bool nrips_enabled : 1; ··· 275 276 #ifdef CONFIG_X86_LOCAL_APIC 276 277 module_param(avic, int, S_IRUGO); 277 278 #endif 279 + 280 + /* enable/disable Virtual VMLOAD VMSAVE */ 281 + static int vls = true; 282 + module_param(vls, int, 0444); 278 283 279 284 /* AVIC VM ID bit masks and lock */ 280 285 static DECLARE_BITMAP(avic_vm_id_bitmap, AVIC_VM_ID_NR); ··· 636 633 svm_set_interrupt_shadow(vcpu, 0); 637 634 } 638 635 639 - static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, 640 - bool has_error_code, u32 error_code, 641 - bool reinject) 636 + static void svm_queue_exception(struct kvm_vcpu *vcpu) 642 637 { 643 638 struct vcpu_svm *svm = to_svm(vcpu); 639 + unsigned nr = vcpu->arch.exception.nr; 640 + bool has_error_code = vcpu->arch.exception.has_error_code; 641 + bool reinject = vcpu->arch.exception.reinject; 642 + u32 error_code = vcpu->arch.exception.error_code; 644 643 645 644 /* 646 645 * If we are within a nested VM we'd better #VMEXIT and let the guest ··· 952 947 { 953 948 u32 *msrpm = svm->msrpm; 954 949 955 - svm->vmcb->control.lbr_ctl = 1; 950 + svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK; 956 951 set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1); 957 952 set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1); 958 953 set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 1, 1); ··· 963 958 { 964 959 u32 *msrpm = svm->msrpm; 965 960 966 - svm->vmcb->control.lbr_ctl = 0; 961 + svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK; 967 962 set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0); 968 963 set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0); 969 964 set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 0, 0); ··· 1095 1090 pr_info("AVIC enabled\n"); 1096 1091 1097 1092 amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier); 1093 + } 1094 + } 1095 + 1096 + if (vls) { 1097 + if (!npt_enabled || 1098 + !boot_cpu_has(X86_FEATURE_VIRTUAL_VMLOAD_VMSAVE) || 1099 + !IS_ENABLED(CONFIG_X86_64)) { 1100 + vls = false; 1101 + } else { 1102 + pr_info("Virtual VMLOAD VMSAVE supported\n"); 1098 1103 } 1099 1104 } 1100 1105 ··· 1294 1279 1295 1280 if (avic) 1296 1281 avic_init_vmcb(svm); 1282 + 1283 + /* 1284 + * If hardware supports Virtual VMLOAD VMSAVE then enable it 1285 + * in VMCB and clear intercepts to avoid #VMEXIT. 
1286 + */ 1287 + if (vls) { 1288 + clr_intercept(svm, INTERCEPT_VMLOAD); 1289 + clr_intercept(svm, INTERCEPT_VMSAVE); 1290 + svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; 1291 + } 1297 1292 1298 1293 mark_all_dirty(svm->vmcb); 1299 1294 ··· 2121 2096 static int pf_interception(struct vcpu_svm *svm) 2122 2097 { 2123 2098 u64 fault_address = svm->vmcb->control.exit_info_2; 2124 - u64 error_code; 2125 - int r = 1; 2099 + u64 error_code = svm->vmcb->control.exit_info_1; 2126 2100 2127 - switch (svm->apf_reason) { 2128 - default: 2129 - error_code = svm->vmcb->control.exit_info_1; 2130 - 2131 - trace_kvm_page_fault(fault_address, error_code); 2132 - if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu)) 2133 - kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); 2134 - r = kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code, 2101 + return kvm_handle_page_fault(&svm->vcpu, error_code, fault_address, 2135 2102 svm->vmcb->control.insn_bytes, 2136 - svm->vmcb->control.insn_len); 2137 - break; 2138 - case KVM_PV_REASON_PAGE_NOT_PRESENT: 2139 - svm->apf_reason = 0; 2140 - local_irq_disable(); 2141 - kvm_async_pf_task_wait(fault_address); 2142 - local_irq_enable(); 2143 - break; 2144 - case KVM_PV_REASON_PAGE_READY: 2145 - svm->apf_reason = 0; 2146 - local_irq_disable(); 2147 - kvm_async_pf_task_wake(fault_address); 2148 - local_irq_enable(); 2149 - break; 2150 - } 2151 - return r; 2103 + svm->vmcb->control.insn_len, !npt_enabled); 2152 2104 } 2153 2105 2154 2106 static int db_interception(struct vcpu_svm *svm) ··· 2269 2267 { 2270 2268 struct kvm_vcpu *vcpu = &svm->vcpu; 2271 2269 u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */ 2272 - int size, in, string; 2270 + int size, in, string, ret; 2273 2271 unsigned port; 2274 2272 2275 2273 ++svm->vcpu.stat.io_exits; ··· 2281 2279 port = io_info >> 16; 2282 2280 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; 2283 2281 svm->next_rip = svm->vmcb->control.exit_info_2; 2284 - skip_emulated_instruction(&svm->vcpu); 2282 + ret = kvm_skip_emulated_instruction(&svm->vcpu); 2285 2283 2286 - return in ? kvm_fast_pio_in(vcpu, size, port) 2287 - : kvm_fast_pio_out(vcpu, size, port); 2284 + /* 2285 + * TODO: we might be squashing a KVM_GUESTDBG_SINGLESTEP-triggered 2286 + * KVM_EXIT_DEBUG here. 
2287 + */ 2288 + if (in) 2289 + return kvm_fast_pio_in(vcpu, size, port) && ret; 2290 + else 2291 + return kvm_fast_pio_out(vcpu, size, port) && ret; 2288 2292 } 2289 2293 2290 2294 static int nmi_interception(struct vcpu_svm *svm) ··· 2423 2415 if (!is_guest_mode(&svm->vcpu)) 2424 2416 return 0; 2425 2417 2418 + vmexit = nested_svm_intercept(svm); 2419 + if (vmexit != NESTED_EXIT_DONE) 2420 + return 0; 2421 + 2426 2422 svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr; 2427 2423 svm->vmcb->control.exit_code_hi = 0; 2428 2424 svm->vmcb->control.exit_info_1 = error_code; 2429 - svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2; 2425 + if (svm->vcpu.arch.exception.nested_apf) 2426 + svm->vmcb->control.exit_info_2 = svm->vcpu.arch.apf.nested_apf_token; 2427 + else 2428 + svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2; 2430 2429 2431 - vmexit = nested_svm_intercept(svm); 2432 - if (vmexit == NESTED_EXIT_DONE) 2433 - svm->nested.exit_required = true; 2434 - 2430 + svm->nested.exit_required = true; 2435 2431 return vmexit; 2436 2432 } 2437 2433 ··· 2610 2598 break; 2611 2599 case SVM_EXIT_EXCP_BASE + PF_VECTOR: 2612 2600 /* When we're shadowing, trap PFs, but not async PF */ 2613 - if (!npt_enabled && svm->apf_reason == 0) 2601 + if (!npt_enabled && svm->vcpu.arch.apf.host_apf_reason == 0) 2614 2602 return NESTED_EXIT_HOST; 2615 2603 break; 2616 2604 default: ··· 2657 2645 } 2658 2646 /* async page fault always cause vmexit */ 2659 2647 else if ((exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) && 2660 - svm->apf_reason != 0) 2648 + svm->vcpu.arch.exception.nested_apf != 0) 2661 2649 vmexit = NESTED_EXIT_DONE; 2662 2650 break; 2663 2651 } ··· 2714 2702 dst->event_inj = from->event_inj; 2715 2703 dst->event_inj_err = from->event_inj_err; 2716 2704 dst->nested_cr3 = from->nested_cr3; 2717 - dst->lbr_ctl = from->lbr_ctl; 2705 + dst->virt_ext = from->virt_ext; 2718 2706 } 2719 2707 2720 2708 static int nested_svm_vmexit(struct vcpu_svm *svm) ··· 3020 3008 /* We don't want to see VMMCALLs from a nested guest */ 3021 3009 clr_intercept(svm, INTERCEPT_VMMCALL); 3022 3010 3023 - svm->vmcb->control.lbr_ctl = nested_vmcb->control.lbr_ctl; 3011 + svm->vmcb->control.virt_ext = nested_vmcb->control.virt_ext; 3024 3012 svm->vmcb->control.int_vector = nested_vmcb->control.int_vector; 3025 3013 svm->vmcb->control.int_state = nested_vmcb->control.int_state; 3026 3014 svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset; ··· 3067 3055 { 3068 3056 struct vmcb *nested_vmcb; 3069 3057 struct page *page; 3058 + int ret; 3070 3059 3071 3060 if (nested_svm_check_permissions(svm)) 3072 3061 return 1; ··· 3077 3064 return 1; 3078 3065 3079 3066 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 3080 - skip_emulated_instruction(&svm->vcpu); 3067 + ret = kvm_skip_emulated_instruction(&svm->vcpu); 3081 3068 3082 3069 nested_svm_vmloadsave(nested_vmcb, svm->vmcb); 3083 3070 nested_svm_unmap(page); 3084 3071 3085 - return 1; 3072 + return ret; 3086 3073 } 3087 3074 3088 3075 static int vmsave_interception(struct vcpu_svm *svm) 3089 3076 { 3090 3077 struct vmcb *nested_vmcb; 3091 3078 struct page *page; 3079 + int ret; 3092 3080 3093 3081 if (nested_svm_check_permissions(svm)) 3094 3082 return 1; ··· 3099 3085 return 1; 3100 3086 3101 3087 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 3102 - skip_emulated_instruction(&svm->vcpu); 3088 + ret = kvm_skip_emulated_instruction(&svm->vcpu); 3103 3089 3104 3090 nested_svm_vmloadsave(svm->vmcb, nested_vmcb); 3105 3091 nested_svm_unmap(page); 3106 3092 3107 - return 1; 
3093 + return ret; 3108 3094 } 3109 3095 3110 3096 static int vmrun_interception(struct vcpu_svm *svm) ··· 3137 3123 3138 3124 static int stgi_interception(struct vcpu_svm *svm) 3139 3125 { 3126 + int ret; 3127 + 3140 3128 if (nested_svm_check_permissions(svm)) 3141 3129 return 1; 3142 3130 3143 3131 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 3144 - skip_emulated_instruction(&svm->vcpu); 3132 + ret = kvm_skip_emulated_instruction(&svm->vcpu); 3145 3133 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); 3146 3134 3147 3135 enable_gif(svm); 3148 3136 3149 - return 1; 3137 + return ret; 3150 3138 } 3151 3139 3152 3140 static int clgi_interception(struct vcpu_svm *svm) 3153 3141 { 3142 + int ret; 3143 + 3154 3144 if (nested_svm_check_permissions(svm)) 3155 3145 return 1; 3156 3146 3157 3147 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 3158 - skip_emulated_instruction(&svm->vcpu); 3148 + ret = kvm_skip_emulated_instruction(&svm->vcpu); 3159 3149 3160 3150 disable_gif(svm); 3161 3151 ··· 3170 3152 mark_dirty(svm->vmcb, VMCB_INTR); 3171 3153 } 3172 3154 3173 - return 1; 3155 + return ret; 3174 3156 } 3175 3157 3176 3158 static int invlpga_interception(struct vcpu_svm *svm) ··· 3184 3166 kvm_mmu_invlpg(vcpu, kvm_register_read(&svm->vcpu, VCPU_REGS_RAX)); 3185 3167 3186 3168 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 3187 - skip_emulated_instruction(&svm->vcpu); 3188 - return 1; 3169 + return kvm_skip_emulated_instruction(&svm->vcpu); 3189 3170 } 3190 3171 3191 3172 static int skinit_interception(struct vcpu_svm *svm) ··· 3207 3190 3208 3191 if (kvm_set_xcr(&svm->vcpu, index, new_bv) == 0) { 3209 3192 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 3210 - skip_emulated_instruction(&svm->vcpu); 3193 + return kvm_skip_emulated_instruction(&svm->vcpu); 3211 3194 } 3212 3195 3213 3196 return 1; ··· 3303 3286 return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE; 3304 3287 3305 3288 kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1); 3306 - skip_emulated_instruction(&svm->vcpu); 3307 - return 1; 3289 + return kvm_skip_emulated_instruction(&svm->vcpu); 3308 3290 } 3309 3291 3310 3292 static int emulate_on_interception(struct vcpu_svm *svm) ··· 3453 3437 kvm_register_write(&svm->vcpu, reg, val); 3454 3438 } 3455 3439 3456 - skip_emulated_instruction(&svm->vcpu); 3457 - 3458 - return 1; 3440 + return kvm_skip_emulated_instruction(&svm->vcpu); 3459 3441 } 3460 3442 3461 3443 static int cr8_write_interception(struct vcpu_svm *svm) ··· 3576 3562 if (svm_get_msr(&svm->vcpu, &msr_info)) { 3577 3563 trace_kvm_msr_read_ex(ecx); 3578 3564 kvm_inject_gp(&svm->vcpu, 0); 3565 + return 1; 3579 3566 } else { 3580 3567 trace_kvm_msr_read(ecx, msr_info.data); 3581 3568 ··· 3585 3570 kvm_register_write(&svm->vcpu, VCPU_REGS_RDX, 3586 3571 msr_info.data >> 32); 3587 3572 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; 3588 - skip_emulated_instruction(&svm->vcpu); 3573 + return kvm_skip_emulated_instruction(&svm->vcpu); 3589 3574 } 3590 - return 1; 3591 3575 } 3592 3576 3593 3577 static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data) ··· 3712 3698 if (kvm_set_msr(&svm->vcpu, &msr)) { 3713 3699 trace_kvm_msr_write_ex(ecx, data); 3714 3700 kvm_inject_gp(&svm->vcpu, 0); 3701 + return 1; 3715 3702 } else { 3716 3703 trace_kvm_msr_write(ecx, data); 3717 - skip_emulated_instruction(&svm->vcpu); 3704 + return kvm_skip_emulated_instruction(&svm->vcpu); 3718 3705 } 3719 - return 1; 3720 3706 } 3721 3707 3722 3708 static int msr_interception(struct vcpu_svm *svm) ··· 3745 3731 3746 3732 static int nop_interception(struct 
vcpu_svm *svm) 3747 3733 { 3748 - skip_emulated_instruction(&(svm->vcpu)); 3749 - return 1; 3734 + return kvm_skip_emulated_instruction(&(svm->vcpu)); 3750 3735 } 3751 3736 3752 3737 static int monitor_interception(struct vcpu_svm *svm) ··· 4130 4117 pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar); 4131 4118 pr_err("%-20s%08x\n", "event_inj:", control->event_inj); 4132 4119 pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err); 4133 - pr_err("%-20s%lld\n", "lbr_ctl:", control->lbr_ctl); 4120 + pr_err("%-20s%lld\n", "virt_ext:", control->virt_ext); 4134 4121 pr_err("%-20s%016llx\n", "next_rip:", control->next_rip); 4135 4122 pr_err("%-20s%016llx\n", "avic_backing_page:", control->avic_backing_page); 4136 4123 pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id); ··· 4978 4965 4979 4966 /* if exit due to PF check for async PF */ 4980 4967 if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) 4981 - svm->apf_reason = kvm_read_and_reset_pf_reason(); 4968 + svm->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason(); 4982 4969 4983 4970 if (npt_enabled) { 4984 4971 vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR);
+102 -60
arch/x86/kvm/vmx.c
··· 2422 2422 * KVM wants to inject page-faults which it got to the guest. This function 2423 2423 * checks whether in a nested guest, we need to inject them to L1 or L2. 2424 2424 */ 2425 - static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned nr) 2425 + static int nested_vmx_check_exception(struct kvm_vcpu *vcpu) 2426 2426 { 2427 2427 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 2428 + unsigned int nr = vcpu->arch.exception.nr; 2428 2429 2429 - if (!(vmcs12->exception_bitmap & (1u << nr))) 2430 + if (!((vmcs12->exception_bitmap & (1u << nr)) || 2431 + (nr == PF_VECTOR && vcpu->arch.exception.nested_apf))) 2430 2432 return 0; 2433 + 2434 + if (vcpu->arch.exception.nested_apf) { 2435 + vmcs_write32(VM_EXIT_INTR_ERROR_CODE, vcpu->arch.exception.error_code); 2436 + nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, 2437 + PF_VECTOR | INTR_TYPE_HARD_EXCEPTION | 2438 + INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK, 2439 + vcpu->arch.apf.nested_apf_token); 2440 + return 1; 2441 + } 2431 2442 2432 2443 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, 2433 2444 vmcs_read32(VM_EXIT_INTR_INFO), ··· 2446 2435 return 1; 2447 2436 } 2448 2437 2449 - static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, 2450 - bool has_error_code, u32 error_code, 2451 - bool reinject) 2438 + static void vmx_queue_exception(struct kvm_vcpu *vcpu) 2452 2439 { 2453 2440 struct vcpu_vmx *vmx = to_vmx(vcpu); 2441 + unsigned nr = vcpu->arch.exception.nr; 2442 + bool has_error_code = vcpu->arch.exception.has_error_code; 2443 + bool reinject = vcpu->arch.exception.reinject; 2444 + u32 error_code = vcpu->arch.exception.error_code; 2454 2445 u32 intr_info = nr | INTR_INFO_VALID_MASK; 2455 2446 2456 2447 if (!reinject && is_guest_mode(vcpu) && 2457 - nested_vmx_check_exception(vcpu, nr)) 2448 + nested_vmx_check_exception(vcpu)) 2458 2449 return; 2459 2450 2460 2451 if (has_error_code) { ··· 3777 3764 } 3778 3765 } 3779 3766 3767 + enum vmcs_field_type { 3768 + VMCS_FIELD_TYPE_U16 = 0, 3769 + VMCS_FIELD_TYPE_U64 = 1, 3770 + VMCS_FIELD_TYPE_U32 = 2, 3771 + VMCS_FIELD_TYPE_NATURAL_WIDTH = 3 3772 + }; 3773 + 3774 + static inline int vmcs_field_type(unsigned long field) 3775 + { 3776 + if (0x1 & field) /* the *_HIGH fields are all 32 bit */ 3777 + return VMCS_FIELD_TYPE_U32; 3778 + return (field >> 13) & 0x3 ; 3779 + } 3780 + 3781 + static inline int vmcs_field_readonly(unsigned long field) 3782 + { 3783 + return (((field >> 10) & 0x3) == 1); 3784 + } 3785 + 3780 3786 static void init_vmcs_shadow_fields(void) 3781 3787 { 3782 3788 int i, j; ··· 3821 3789 3822 3790 /* shadowed fields guest access without vmexit */ 3823 3791 for (i = 0; i < max_shadow_read_write_fields; i++) { 3824 - clear_bit(shadow_read_write_fields[i], 3825 - vmx_vmwrite_bitmap); 3826 - clear_bit(shadow_read_write_fields[i], 3827 - vmx_vmread_bitmap); 3792 + unsigned long field = shadow_read_write_fields[i]; 3793 + 3794 + clear_bit(field, vmx_vmwrite_bitmap); 3795 + clear_bit(field, vmx_vmread_bitmap); 3796 + if (vmcs_field_type(field) == VMCS_FIELD_TYPE_U64) { 3797 + clear_bit(field + 1, vmx_vmwrite_bitmap); 3798 + clear_bit(field + 1, vmx_vmread_bitmap); 3799 + } 3828 3800 } 3829 - for (i = 0; i < max_shadow_read_only_fields; i++) 3830 - clear_bit(shadow_read_only_fields[i], 3831 - vmx_vmread_bitmap); 3801 + for (i = 0; i < max_shadow_read_only_fields; i++) { 3802 + unsigned long field = shadow_read_only_fields[i]; 3803 + 3804 + clear_bit(field, vmx_vmread_bitmap); 3805 + if (vmcs_field_type(field) == VMCS_FIELD_TYPE_U64) 3806 + 
clear_bit(field + 1, vmx_vmread_bitmap); 3807 + } 3832 3808 } 3833 3809 3834 3810 static __init int alloc_kvm_area(void) ··· 4672 4632 */ 4673 4633 4674 4634 return true; 4635 + } 4636 + 4637 + static bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa) 4638 + { 4639 + return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu)); 4675 4640 } 4676 4641 4677 4642 static int init_rmode_tss(struct kvm *kvm) ··· 5709 5664 } 5710 5665 5711 5666 if (is_page_fault(intr_info)) { 5712 - /* EPT won't cause page fault directly */ 5713 - BUG_ON(enable_ept); 5714 5667 cr2 = vmcs_readl(EXIT_QUALIFICATION); 5715 - trace_kvm_page_fault(cr2, error_code); 5716 - 5717 - if (kvm_event_needs_reinjection(vcpu)) 5718 - kvm_mmu_unprotect_page_virt(vcpu, cr2); 5719 - return kvm_mmu_page_fault(vcpu, cr2, error_code, NULL, 0); 5668 + /* EPT won't cause page fault directly */ 5669 + WARN_ON_ONCE(!vcpu->arch.apf.host_apf_reason && enable_ept); 5670 + return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0, 5671 + true); 5720 5672 } 5721 5673 5722 5674 ex_no = intr_info & INTR_INFO_VECTOR_MASK; ··· 7256 7214 return nested_vmx_run(vcpu, false); 7257 7215 } 7258 7216 7259 - enum vmcs_field_type { 7260 - VMCS_FIELD_TYPE_U16 = 0, 7261 - VMCS_FIELD_TYPE_U64 = 1, 7262 - VMCS_FIELD_TYPE_U32 = 2, 7263 - VMCS_FIELD_TYPE_NATURAL_WIDTH = 3 7264 - }; 7265 - 7266 - static inline int vmcs_field_type(unsigned long field) 7267 - { 7268 - if (0x1 & field) /* the *_HIGH fields are all 32 bit */ 7269 - return VMCS_FIELD_TYPE_U32; 7270 - return (field >> 13) & 0x3 ; 7271 - } 7272 - 7273 - static inline int vmcs_field_readonly(unsigned long field) 7274 - { 7275 - return (((field >> 10) & 0x3) == 1); 7276 - } 7277 - 7278 7217 /* 7279 7218 * Read a vmcs12 field. Since these can have varying lengths and we return 7280 7219 * one type, we chose the biggest type (u64) and zero-extend the return value ··· 8037 8014 if (is_nmi(intr_info)) 8038 8015 return false; 8039 8016 else if (is_page_fault(intr_info)) 8040 - return enable_ept; 8017 + return !vmx->vcpu.arch.apf.host_apf_reason && enable_ept; 8041 8018 else if (is_no_device(intr_info) && 8042 8019 !(vmcs12->guest_cr0 & X86_CR0_TS)) 8043 8020 return false; ··· 8441 8418 exit_reason != EXIT_REASON_TASK_SWITCH)) { 8442 8419 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 8443 8420 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV; 8444 - vcpu->run->internal.ndata = 2; 8421 + vcpu->run->internal.ndata = 3; 8445 8422 vcpu->run->internal.data[0] = vectoring_info; 8446 8423 vcpu->run->internal.data[1] = exit_reason; 8424 + vcpu->run->internal.data[2] = vcpu->arch.exit_qualification; 8425 + if (exit_reason == EXIT_REASON_EPT_MISCONFIG) { 8426 + vcpu->run->internal.ndata++; 8427 + vcpu->run->internal.data[3] = 8428 + vmcs_read64(GUEST_PHYSICAL_ADDRESS); 8429 + } 8447 8430 return 0; 8448 8431 } 8449 8432 ··· 8640 8611 8641 8612 static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) 8642 8613 { 8643 - u32 exit_intr_info; 8614 + u32 exit_intr_info = 0; 8615 + u16 basic_exit_reason = (u16)vmx->exit_reason; 8644 8616 8645 - if (!(vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY 8646 - || vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI)) 8617 + if (!(basic_exit_reason == EXIT_REASON_MCE_DURING_VMENTRY 8618 + || basic_exit_reason == EXIT_REASON_EXCEPTION_NMI)) 8647 8619 return; 8648 8620 8649 - vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 8650 - exit_intr_info = vmx->exit_intr_info; 8621 + if (!(vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) 8622 + exit_intr_info = 
vmcs_read32(VM_EXIT_INTR_INFO); 8623 + vmx->exit_intr_info = exit_intr_info; 8624 + 8625 + /* if exit due to PF check for async PF */ 8626 + if (is_page_fault(exit_intr_info)) 8627 + vmx->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason(); 8651 8628 8652 8629 /* Handle machine checks before interrupts are enabled */ 8653 - if (is_machine_check(exit_intr_info)) 8630 + if (basic_exit_reason == EXIT_REASON_MCE_DURING_VMENTRY || 8631 + is_machine_check(exit_intr_info)) 8654 8632 kvm_machine_check(); 8655 8633 8656 8634 /* We need to handle NMIs before interrupts are enabled */ ··· 9625 9589 ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL); 9626 9590 } 9627 9591 9592 + static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu, 9593 + struct vmcs12 *vmcs12) 9594 + { 9595 + if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) 9596 + return 0; 9597 + 9598 + if (!page_address_valid(vcpu, vmcs12->io_bitmap_a) || 9599 + !page_address_valid(vcpu, vmcs12->io_bitmap_b)) 9600 + return -EINVAL; 9601 + 9602 + return 0; 9603 + } 9604 + 9628 9605 static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu, 9629 9606 struct vmcs12 *vmcs12) 9630 9607 { 9631 - int maxphyaddr; 9632 - u64 addr; 9633 - 9634 9608 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) 9635 9609 return 0; 9636 9610 9637 - if (vmcs12_read_any(vcpu, MSR_BITMAP, &addr)) { 9638 - WARN_ON(1); 9639 - return -EINVAL; 9640 - } 9641 - maxphyaddr = cpuid_maxphyaddr(vcpu); 9642 - 9643 - if (!PAGE_ALIGNED(vmcs12->msr_bitmap) || 9644 - ((addr + PAGE_SIZE) >> maxphyaddr)) 9611 + if (!page_address_valid(vcpu, vmcs12->msr_bitmap)) 9645 9612 return -EINVAL; 9646 9613 9647 9614 return 0; ··· 10332 10293 vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) 10333 10294 return VMXERR_ENTRY_INVALID_CONTROL_FIELD; 10334 10295 10296 + if (nested_vmx_check_io_bitmap_controls(vcpu, vmcs12)) 10297 + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; 10298 + 10335 10299 if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12)) 10336 10300 return VMXERR_ENTRY_INVALID_CONTROL_FIELD; 10337 10301 ··· 10470 10428 EXIT_REASON_MSR_LOAD_FAIL, msr_entry_idx); 10471 10429 return 1; 10472 10430 } 10473 - 10474 - vmcs12->launch_state = 1; 10475 10431 10476 10432 /* 10477 10433 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point ··· 10844 10804 vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 10845 10805 10846 10806 if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) { 10807 + vmcs12->launch_state = 1; 10808 + 10847 10809 /* vm_entry_intr_info_field is cleared on exit. Emulate this 10848 10810 * instead of reading the real value. */ 10849 10811 vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK;
+30 -13
arch/x86/kvm/x86.c
··· 134 134 static bool __read_mostly vector_hashing = true; 135 135 module_param(vector_hashing, bool, S_IRUGO); 136 136 137 - static bool __read_mostly backwards_tsc_observed = false; 138 - 139 137 #define KVM_NR_SHARED_MSRS 16 140 138 141 139 struct kvm_shared_msrs_global { ··· 450 452 void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) 451 453 { 452 454 ++vcpu->stat.pf_guest; 453 - vcpu->arch.cr2 = fault->address; 455 + vcpu->arch.exception.nested_apf = 456 + is_guest_mode(vcpu) && fault->async_page_fault; 457 + if (vcpu->arch.exception.nested_apf) 458 + vcpu->arch.apf.nested_apf_token = fault->address; 459 + else 460 + vcpu->arch.cr2 = fault->address; 454 461 kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code); 455 462 } 456 463 EXPORT_SYMBOL_GPL(kvm_inject_page_fault); ··· 1722 1719 &ka->master_cycle_now); 1723 1720 1724 1721 ka->use_master_clock = host_tsc_clocksource && vcpus_matched 1725 - && !backwards_tsc_observed 1722 + && !ka->backwards_tsc_observed 1726 1723 && !ka->boot_vcpu_runs_old_kvmclock; 1727 1724 1728 1725 if (ka->use_master_clock) ··· 2063 2060 { 2064 2061 gpa_t gpa = data & ~0x3f; 2065 2062 2066 - /* Bits 2:5 are reserved, Should be zero */ 2067 - if (data & 0x3c) 2063 + /* Bits 3:5 are reserved, Should be zero */ 2064 + if (data & 0x38) 2068 2065 return 1; 2069 2066 2070 2067 vcpu->arch.apf.msr_val = data; ··· 2080 2077 return 1; 2081 2078 2082 2079 vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS); 2080 + vcpu->arch.apf.delivery_as_pf_vmexit = data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT; 2083 2081 kvm_async_pf_wakeup_all(vcpu); 2084 2082 return 0; 2085 2083 } ··· 2665 2661 case KVM_CAP_HYPERV_VAPIC: 2666 2662 case KVM_CAP_HYPERV_SPIN: 2667 2663 case KVM_CAP_HYPERV_SYNIC: 2664 + case KVM_CAP_HYPERV_SYNIC2: 2665 + case KVM_CAP_HYPERV_VP_INDEX: 2668 2666 case KVM_CAP_PCI_SEGMENT: 2669 2667 case KVM_CAP_DEBUGREGS: 2670 2668 case KVM_CAP_X86_ROBUST_SINGLESTEP: ··· 3390 3384 return -EINVAL; 3391 3385 3392 3386 switch (cap->cap) { 3387 + case KVM_CAP_HYPERV_SYNIC2: 3388 + if (cap->args[0]) 3389 + return -EINVAL; 3393 3390 case KVM_CAP_HYPERV_SYNIC: 3394 3391 if (!irqchip_in_kernel(vcpu->kvm)) 3395 3392 return -EINVAL; 3396 - return kvm_hv_activate_synic(vcpu); 3393 + return kvm_hv_activate_synic(vcpu, cap->cap == 3394 + KVM_CAP_HYPERV_SYNIC2); 3397 3395 default: 3398 3396 return -EINVAL; 3399 3397 } ··· 4198 4188 goto out; 4199 4189 4200 4190 r = 0; 4191 + /* 4192 + * TODO: userspace has to take care of races with VCPU_RUN, so 4193 + * kvm_gen_update_masterclock() can be cut down to locked 4194 + * pvclock_update_vm_gtod_copy(). 
4195 + */ 4196 + kvm_gen_update_masterclock(kvm); 4201 4197 now_ns = get_kvmclock_ns(kvm); 4202 4198 kvm->arch.kvmclock_offset += user_ns.clock - now_ns; 4203 - kvm_gen_update_masterclock(kvm); 4199 + kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE); 4204 4200 break; 4205 4201 } 4206 4202 case KVM_GET_CLOCK: { ··· 6363 6347 kvm_update_dr7(vcpu); 6364 6348 } 6365 6349 6366 - kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr, 6367 - vcpu->arch.exception.has_error_code, 6368 - vcpu->arch.exception.error_code, 6369 - vcpu->arch.exception.reinject); 6350 + kvm_x86_ops->queue_exception(vcpu); 6370 6351 return 0; 6371 6352 } 6372 6353 ··· 7689 7676 struct msr_data msr; 7690 7677 struct kvm *kvm = vcpu->kvm; 7691 7678 7679 + kvm_hv_vcpu_postcreate(vcpu); 7680 + 7692 7681 if (vcpu_load(vcpu)) 7693 7682 return; 7694 7683 msr.data = 0x0; ··· 7844 7829 */ 7845 7830 if (backwards_tsc) { 7846 7831 u64 delta_cyc = max_tsc - local_tsc; 7847 - backwards_tsc_observed = true; 7848 7832 list_for_each_entry(kvm, &vm_list, vm_list) { 7833 + kvm->arch.backwards_tsc_observed = true; 7849 7834 kvm_for_each_vcpu(i, vcpu, kvm) { 7850 7835 vcpu->arch.tsc_offset_adjustment += delta_cyc; 7851 7836 vcpu->arch.last_host_tsc = local_tsc; ··· 8591 8576 fault.error_code = 0; 8592 8577 fault.nested_page_fault = false; 8593 8578 fault.address = work->arch.token; 8579 + fault.async_page_fault = true; 8594 8580 kvm_inject_page_fault(vcpu, &fault); 8595 8581 } 8596 8582 } ··· 8614 8598 fault.error_code = 0; 8615 8599 fault.nested_page_fault = false; 8616 8600 fault.address = work->arch.token; 8601 + fault.async_page_fault = true; 8617 8602 kvm_inject_page_fault(vcpu, &fault); 8618 8603 } 8619 8604 vcpu->arch.apf.halted = false;
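A hedged sketch of the migration flow that the KVM_SET_CLOCK reordering above serves; vm_fd is assumed to be an open VM file descriptor and error handling is omitted.

    #include <linux/kvm.h>
    #include <sys/ioctl.h>

    /* Sketch: capture kvmclock on the source, restore it on the destination.
     * With this change the master clock is refreshed before the new
     * kvmclock_offset is computed, so the restored value is not skewed. */
    static int save_kvmclock(int vm_fd, struct kvm_clock_data *data)
    {
        return ioctl(vm_fd, KVM_GET_CLOCK, data);
    }

    static int restore_kvmclock(int vm_fd, struct kvm_clock_data *data)
    {
        data->flags = 0;        /* KVM_SET_CLOCK accepts no flags */
        return ioctl(vm_fd, KVM_SET_CLOCK, data);
    }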
+11 -6
include/linux/kvm_host.h
··· 234 234 235 235 int guest_fpu_loaded, guest_xcr0_loaded; 236 236 struct swait_queue_head wq; 237 - struct pid *pid; 237 + struct pid __rcu *pid; 238 238 int sigset_active; 239 239 sigset_t sigset; 240 240 struct kvm_vcpu_stat stat; ··· 390 390 spinlock_t mmu_lock; 391 391 struct mutex slots_lock; 392 392 struct mm_struct *mm; /* userspace tied to this vm */ 393 - struct kvm_memslots *memslots[KVM_ADDRESS_SPACE_NUM]; 393 + struct kvm_memslots __rcu *memslots[KVM_ADDRESS_SPACE_NUM]; 394 394 struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; 395 395 396 396 /* ··· 404 404 int last_boosted_vcpu; 405 405 struct list_head vm_list; 406 406 struct mutex lock; 407 - struct kvm_io_bus *buses[KVM_NR_BUSES]; 407 + struct kvm_io_bus __rcu *buses[KVM_NR_BUSES]; 408 408 #ifdef CONFIG_HAVE_KVM_EVENTFD 409 409 struct { 410 410 spinlock_t lock; ··· 472 472 ## __VA_ARGS__) 473 473 #define vcpu_err(vcpu, fmt, ...) \ 474 474 kvm_err("vcpu%i " fmt, (vcpu)->vcpu_id, ## __VA_ARGS__) 475 + 476 + static inline struct kvm_io_bus *kvm_get_bus(struct kvm *kvm, enum kvm_bus idx) 477 + { 478 + return srcu_dereference_check(kvm->buses[idx], &kvm->srcu, 479 + lockdep_is_held(&kvm->slots_lock)); 480 + } 475 481 476 482 static inline struct kvm_vcpu *kvm_get_vcpu(struct kvm *kvm, int i) 477 483 { ··· 568 562 569 563 static inline struct kvm_memslots *__kvm_memslots(struct kvm *kvm, int as_id) 570 564 { 571 - return rcu_dereference_check(kvm->memslots[as_id], 572 - srcu_read_lock_held(&kvm->srcu) 573 - || lockdep_is_held(&kvm->slots_lock)); 565 + return srcu_dereference_check(kvm->memslots[as_id], &kvm->srcu, 566 + lockdep_is_held(&kvm->slots_lock)); 574 567 } 575 568 576 569 static inline struct kvm_memslots *kvm_memslots(struct kvm *kvm)
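To make the new __rcu annotations concrete, an illustrative pair of access patterns they enforce; the two helpers are hypothetical, while kvm_get_bus(), kvm->srcu and kvm->slots_lock are from the hunk above.

    /* Sketch: readers dereference kvm->buses inside an SRCU read-side
     * section, as kvm_io_bus_read()/write() do; the caller must call
     * srcu_read_unlock(&kvm->srcu, *srcu_idx) when done. */
    static struct kvm_io_bus *bus_for_read(struct kvm *kvm, enum kvm_bus idx,
                                           int *srcu_idx)
    {
        *srcu_idx = srcu_read_lock(&kvm->srcu);
        return srcu_dereference(kvm->buses[idx], &kvm->srcu);
    }

    /* Sketch: the update side holds slots_lock and uses kvm_get_bus(),
     * whose srcu_dereference_check() satisfies lockdep without an SRCU
     * read-side section. */
    static void bus_update(struct kvm *kvm, enum kvm_bus idx)
    {
        struct kvm_io_bus *bus;

        mutex_lock(&kvm->slots_lock);
        bus = kvm_get_bus(kvm, idx);
        /* ... publish a replacement with rcu_assign_pointer() and wait
         * with synchronize_srcu(&kvm->srcu) before freeing 'bus' ... */
        mutex_unlock(&kvm->slots_lock);
    }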
+3 -1
include/uapi/linux/kvm.h
··· 927 927 #define KVM_CAP_S390_CMMA_MIGRATION 145 928 928 #define KVM_CAP_PPC_FWNMI 146 929 929 #define KVM_CAP_PPC_SMT_POSSIBLE 147 930 + #define KVM_CAP_HYPERV_SYNIC2 148 931 + #define KVM_CAP_HYPERV_VP_INDEX 149 930 932 931 933 #ifdef KVM_CAP_IRQ_ROUTING 932 934 ··· 1353 1351 /* Available with KVM_CAP_X86_SMM */ 1354 1352 #define KVM_SMI _IO(KVMIO, 0xb7) 1355 1353 /* Available with KVM_CAP_S390_CMMA_MIGRATION */ 1356 - #define KVM_S390_GET_CMMA_BITS _IOW(KVMIO, 0xb8, struct kvm_s390_cmma_log) 1354 + #define KVM_S390_GET_CMMA_BITS _IOWR(KVMIO, 0xb8, struct kvm_s390_cmma_log) 1357 1355 #define KVM_S390_SET_CMMA_BITS _IOW(KVMIO, 0xb9, struct kvm_s390_cmma_log) 1358 1356 1359 1357 #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0)
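The _IOW to _IOWR change matters because the direction bits are encoded in the ioctl number and KVM_S390_GET_CMMA_BITS both reads and fills in its argument. A hedged userspace sketch; vm_fd and the values buffer are assumptions.

    #include <linux/kvm.h>
    #include <stdint.h>
    #include <sys/ioctl.h>

    /* Sketch: fetch CMMA values starting at start_gfn. The kernel consumes
     * start_gfn/count/flags and writes back count, remaining and the values
     * buffer; that is why the ioctl must be declared _IOWR. */
    static int get_cmma_bits(int vm_fd, __u64 start_gfn, __u32 count,
                             __u8 *values)
    {
        struct kvm_s390_cmma_log log = {
            .start_gfn = start_gfn,
            .count     = count,
            .flags     = 0,
            .values    = (__u64)(uintptr_t)values,
        };

        return ioctl(vm_fd, KVM_S390_GET_CMMA_BITS, &log);
    }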
+5 -3
virt/kvm/eventfd.c
··· 825 825 if (ret < 0) 826 826 goto unlock_fail; 827 827 828 - kvm->buses[bus_idx]->ioeventfd_count++; 828 + kvm_get_bus(kvm, bus_idx)->ioeventfd_count++; 829 829 list_add_tail(&p->list, &kvm->ioeventfds); 830 830 831 831 mutex_unlock(&kvm->slots_lock); ··· 848 848 { 849 849 struct _ioeventfd *p, *tmp; 850 850 struct eventfd_ctx *eventfd; 851 + struct kvm_io_bus *bus; 851 852 int ret = -ENOENT; 852 853 853 854 eventfd = eventfd_ctx_fdget(args->fd); ··· 871 870 continue; 872 871 873 872 kvm_io_bus_unregister_dev(kvm, bus_idx, &p->dev); 874 - if (kvm->buses[bus_idx]) 875 - kvm->buses[bus_idx]->ioeventfd_count--; 873 + bus = kvm_get_bus(kvm, bus_idx); 874 + if (bus) 875 + bus->ioeventfd_count--; 876 876 ioeventfd_release(p); 877 877 ret = 0; 878 878 break;
+1 -1
virt/kvm/irqchip.c
··· 230 230 } 231 231 232 232 mutex_lock(&kvm->irq_lock); 233 - old = kvm->irq_routing; 233 + old = rcu_dereference_protected(kvm->irq_routing, 1); 234 234 rcu_assign_pointer(kvm->irq_routing, new); 235 235 kvm_irq_routing_update(kvm); 236 236 kvm_arch_irq_routing_update(kvm);
+109 -22
virt/kvm/kvm_main.c
··· 130 130 131 131 static bool largepages_enabled = true; 132 132 133 + #define KVM_EVENT_CREATE_VM 0 134 + #define KVM_EVENT_DESTROY_VM 1 135 + static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm); 136 + static unsigned long long kvm_createvm_count; 137 + static unsigned long long kvm_active_vms; 138 + 133 139 bool kvm_is_reserved_pfn(kvm_pfn_t pfn) 134 140 { 135 141 if (pfn_valid(pfn)) ··· 193 187 { 194 188 } 195 189 190 + static inline bool kvm_kick_many_cpus(const struct cpumask *cpus, bool wait) 191 + { 192 + if (unlikely(!cpus)) 193 + cpus = cpu_online_mask; 194 + 195 + if (cpumask_empty(cpus)) 196 + return false; 197 + 198 + smp_call_function_many(cpus, ack_flush, NULL, wait); 199 + return true; 200 + } 201 + 196 202 bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req) 197 203 { 198 204 int i, cpu, me; 199 205 cpumask_var_t cpus; 200 - bool called = true; 201 - bool wait = req & KVM_REQUEST_WAIT; 206 + bool called; 202 207 struct kvm_vcpu *vcpu; 203 208 204 209 zalloc_cpumask_var(&cpus, GFP_ATOMIC); ··· 224 207 225 208 if (cpus != NULL && cpu != -1 && cpu != me && 226 209 kvm_request_needs_ipi(vcpu, req)) 227 - cpumask_set_cpu(cpu, cpus); 210 + __cpumask_set_cpu(cpu, cpus); 228 211 } 229 - if (unlikely(cpus == NULL)) 230 - smp_call_function_many(cpu_online_mask, ack_flush, NULL, wait); 231 - else if (!cpumask_empty(cpus)) 232 - smp_call_function_many(cpus, ack_flush, NULL, wait); 233 - else 234 - called = false; 212 + called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT)); 235 213 put_cpu(); 236 214 free_cpumask_var(cpus); 237 215 return called; ··· 305 293 306 294 void kvm_vcpu_uninit(struct kvm_vcpu *vcpu) 307 295 { 308 - put_pid(vcpu->pid); 296 + /* 297 + * no need for rcu_read_lock as VCPU_RUN is the only place that 298 + * will change the vcpu->pid pointer and on uninit all file 299 + * descriptors are already gone. 
300 + */ 301 + put_pid(rcu_dereference_protected(vcpu->pid, 1)); 309 302 kvm_arch_vcpu_uninit(vcpu); 310 303 free_page((unsigned long)vcpu->run); 311 304 } ··· 691 674 if (init_srcu_struct(&kvm->irq_srcu)) 692 675 goto out_err_no_irq_srcu; 693 676 for (i = 0; i < KVM_NR_BUSES; i++) { 694 - kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus), 695 - GFP_KERNEL); 677 + rcu_assign_pointer(kvm->buses[i], 678 + kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL)); 696 679 if (!kvm->buses[i]) 697 680 goto out_err; 698 681 } ··· 717 700 hardware_disable_all(); 718 701 out_err_no_disable: 719 702 for (i = 0; i < KVM_NR_BUSES; i++) 720 - kfree(kvm->buses[i]); 703 + kfree(rcu_access_pointer(kvm->buses[i])); 721 704 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) 722 - kvm_free_memslots(kvm, kvm->memslots[i]); 705 + kvm_free_memslots(kvm, 706 + rcu_dereference_protected(kvm->memslots[i], 1)); 723 707 kvm_arch_free_vm(kvm); 724 708 mmdrop(current->mm); 725 709 return ERR_PTR(r); ··· 746 728 int i; 747 729 struct mm_struct *mm = kvm->mm; 748 730 731 + kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm); 749 732 kvm_destroy_vm_debugfs(kvm); 750 733 kvm_arch_sync_events(kvm); 751 734 spin_lock(&kvm_lock); ··· 754 735 spin_unlock(&kvm_lock); 755 736 kvm_free_irq_routing(kvm); 756 737 for (i = 0; i < KVM_NR_BUSES; i++) { 757 - if (kvm->buses[i]) 758 - kvm_io_bus_destroy(kvm->buses[i]); 738 + struct kvm_io_bus *bus; 739 + 740 + bus = rcu_dereference_protected(kvm->buses[i], 1); 741 + if (bus) 742 + kvm_io_bus_destroy(bus); 759 743 kvm->buses[i] = NULL; 760 744 } 761 745 kvm_coalesced_mmio_free(kvm); ··· 770 748 kvm_arch_destroy_vm(kvm); 771 749 kvm_destroy_devices(kvm); 772 750 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) 773 - kvm_free_memslots(kvm, kvm->memslots[i]); 751 + kvm_free_memslots(kvm, 752 + rcu_dereference_protected(kvm->memslots[i], 1)); 774 753 cleanup_srcu_struct(&kvm->irq_srcu); 775 754 cleanup_srcu_struct(&kvm->srcu); 776 755 kvm_arch_free_vm(kvm); ··· 2574 2551 if (r) 2575 2552 return r; 2576 2553 switch (ioctl) { 2577 - case KVM_RUN: 2554 + case KVM_RUN: { 2555 + struct pid *oldpid; 2578 2556 r = -EINVAL; 2579 2557 if (arg) 2580 2558 goto out; 2581 - if (unlikely(vcpu->pid != current->pids[PIDTYPE_PID].pid)) { 2559 + oldpid = rcu_access_pointer(vcpu->pid); 2560 + if (unlikely(oldpid != current->pids[PIDTYPE_PID].pid)) { 2582 2561 /* The thread running this VCPU changed. 
*/ 2583 - struct pid *oldpid = vcpu->pid; 2584 2562 struct pid *newpid = get_task_pid(current, PIDTYPE_PID); 2585 2563 2586 2564 rcu_assign_pointer(vcpu->pid, newpid); ··· 2592 2568 r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run); 2593 2569 trace_kvm_userspace_exit(vcpu->run->exit_reason, r); 2594 2570 break; 2571 + } 2595 2572 case KVM_GET_REGS: { 2596 2573 struct kvm_regs *kvm_regs; 2597 2574 ··· 3227 3202 fput(file); 3228 3203 return -ENOMEM; 3229 3204 } 3205 + kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm); 3230 3206 3231 3207 fd_install(r, file); 3232 3208 return r; ··· 3589 3563 { 3590 3564 struct kvm_io_bus *new_bus, *bus; 3591 3565 3592 - bus = kvm->buses[bus_idx]; 3566 + bus = kvm_get_bus(kvm, bus_idx); 3593 3567 if (!bus) 3594 3568 return -ENOMEM; 3595 3569 ··· 3618 3592 int i; 3619 3593 struct kvm_io_bus *new_bus, *bus; 3620 3594 3621 - bus = kvm->buses[bus_idx]; 3595 + bus = kvm_get_bus(kvm, bus_idx); 3622 3596 if (!bus) 3623 3597 return; 3624 3598 ··· 3879 3853 [KVM_STAT_VCPU] = &vcpu_stat_fops, 3880 3854 [KVM_STAT_VM] = &vm_stat_fops, 3881 3855 }; 3856 + 3857 + static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm) 3858 + { 3859 + struct kobj_uevent_env *env; 3860 + char *tmp, *pathbuf = NULL; 3861 + unsigned long long created, active; 3862 + 3863 + if (!kvm_dev.this_device || !kvm) 3864 + return; 3865 + 3866 + spin_lock(&kvm_lock); 3867 + if (type == KVM_EVENT_CREATE_VM) { 3868 + kvm_createvm_count++; 3869 + kvm_active_vms++; 3870 + } else if (type == KVM_EVENT_DESTROY_VM) { 3871 + kvm_active_vms--; 3872 + } 3873 + created = kvm_createvm_count; 3874 + active = kvm_active_vms; 3875 + spin_unlock(&kvm_lock); 3876 + 3877 + env = kzalloc(sizeof(*env), GFP_KERNEL); 3878 + if (!env) 3879 + return; 3880 + 3881 + add_uevent_var(env, "CREATED=%llu", created); 3882 + add_uevent_var(env, "COUNT=%llu", active); 3883 + 3884 + if (type == KVM_EVENT_CREATE_VM) 3885 + add_uevent_var(env, "EVENT=create"); 3886 + else if (type == KVM_EVENT_DESTROY_VM) 3887 + add_uevent_var(env, "EVENT=destroy"); 3888 + 3889 + if (kvm->debugfs_dentry) { 3890 + char p[ITOA_MAX_LEN]; 3891 + 3892 + snprintf(p, sizeof(p), "%s", kvm->debugfs_dentry->d_name.name); 3893 + tmp = strchrnul(p + 1, '-'); 3894 + *tmp = '\0'; 3895 + add_uevent_var(env, "PID=%s", p); 3896 + pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); 3897 + if (pathbuf) { 3898 + /* sizeof counts the final '\0' */ 3899 + int len = sizeof("STATS_PATH=") - 1; 3900 + const char *pvar = "STATS_PATH="; 3901 + 3902 + tmp = dentry_path_raw(kvm->debugfs_dentry, 3903 + pathbuf + len, 3904 + PATH_MAX - len); 3905 + if (!IS_ERR(tmp)) { 3906 + memcpy(tmp - len, pvar, len); 3907 + env->envp[env->envp_idx++] = tmp - len; 3908 + } 3909 + } 3910 + } 3911 + /* no need for checks, since we are adding at most only 5 keys */ 3912 + env->envp[env->envp_idx++] = NULL; 3913 + kobject_uevent_env(&kvm_dev.this_device->kobj, KOBJ_CHANGE, env->envp); 3914 + kfree(env); 3915 + kfree(pathbuf); 3916 + } 3882 3917 3883 3918 static int kvm_init_debug(void) 3884 3919 {