Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull KVM fixes from Paolo Bonzini:
"More fixes for ARM and x86"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm:
KVM: LAPIC: Advancing the timer expiration on guest initiated write
KVM: x86/mmu: Skip !MMU-present SPTEs when removing SP in exclusive mode
KVM: kvmclock: Fix vCPUs > 64 can't be online/hotpluged
kvm: x86: annotate RCU pointers
KVM: arm64: Fix exclusive limit for IPA size
KVM: arm64: Reject VM creation when the default IPA size is unsupported
KVM: arm64: Ensure I-cache isolation between vcpus of a same VM
KVM: arm64: Don't use cbz/adr with external symbols
KVM: arm64: Fix range alignment when walking page tables
KVM: arm64: Workaround firmware wrongly advertising GICv2-on-v3 compatibility
KVM: arm64: Rename __vgic_v3_get_ich_vtr_el2() to __vgic_v3_get_gic_config()
KVM: arm64: Don't access PMSELR_EL0/PMUSERENR_EL0 when no PMU is available
KVM: arm64: Turn kvm_arm_support_pmu_v3() into a static key
KVM: arm64: Fix nVHE hyp panic host context restore
KVM: arm64: Avoid corrupting vCPU context register in guest exit
KVM: arm64: nvhe: Save the SPE context early
kvm: x86: use NULL instead of using plain integer as pointer
KVM: SVM: Connect 'npt' module param to KVM's internal 'npt_enabled'
KVM: x86: Ensure deadline timer has truly expired before posting its IRQ

+197 -84
+3
Documentation/virt/kvm/api.rst
··· 182 182 be retrieved using KVM_CAP_ARM_VM_IPA_SIZE of the KVM_CHECK_EXTENSION 183 183 ioctl() at run-time. 184 184 185 + Creation of the VM will fail if the requested IPA size (whether it is 186 + implicit or explicit) is unsupported on the host. 187 + 185 188 Please note that configuring the IPA size does not affect the capability 186 189 exposed by the guest CPUs in ID_AA64MMFR0_EL1[PARange]. It only affects 187 190 size of the address translated by the stage2 level (guest physical to
+4 -4
arch/arm64/include/asm/kvm_asm.h
··· 47 47 #define __KVM_HOST_SMCCC_FUNC___kvm_flush_vm_context 2 48 48 #define __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid_ipa 3 49 49 #define __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid 4 50 - #define __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_local_vmid 5 50 + #define __KVM_HOST_SMCCC_FUNC___kvm_flush_cpu_context 5 51 51 #define __KVM_HOST_SMCCC_FUNC___kvm_timer_set_cntvoff 6 52 52 #define __KVM_HOST_SMCCC_FUNC___kvm_enable_ssbs 7 53 - #define __KVM_HOST_SMCCC_FUNC___vgic_v3_get_ich_vtr_el2 8 53 + #define __KVM_HOST_SMCCC_FUNC___vgic_v3_get_gic_config 8 54 54 #define __KVM_HOST_SMCCC_FUNC___vgic_v3_read_vmcr 9 55 55 #define __KVM_HOST_SMCCC_FUNC___vgic_v3_write_vmcr 10 56 56 #define __KVM_HOST_SMCCC_FUNC___vgic_v3_init_lrs 11 ··· 183 183 #define __bp_harden_hyp_vecs CHOOSE_HYP_SYM(__bp_harden_hyp_vecs) 184 184 185 185 extern void __kvm_flush_vm_context(void); 186 + extern void __kvm_flush_cpu_context(struct kvm_s2_mmu *mmu); 186 187 extern void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, phys_addr_t ipa, 187 188 int level); 188 189 extern void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu); 189 - extern void __kvm_tlb_flush_local_vmid(struct kvm_s2_mmu *mmu); 190 190 191 191 extern void __kvm_timer_set_cntvoff(u64 cntvoff); 192 192 193 193 extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu); 194 194 195 - extern u64 __vgic_v3_get_ich_vtr_el2(void); 195 + extern u64 __vgic_v3_get_gic_config(void); 196 196 extern u64 __vgic_v3_read_vmcr(void); 197 197 extern void __vgic_v3_write_vmcr(u32 vmcr); 198 198 extern void __vgic_v3_init_lrs(void);
+7 -1
arch/arm64/include/asm/kvm_hyp.h
··· 83 83 void __debug_switch_to_guest(struct kvm_vcpu *vcpu); 84 84 void __debug_switch_to_host(struct kvm_vcpu *vcpu); 85 85 86 + #ifdef __KVM_NVHE_HYPERVISOR__ 87 + void __debug_save_host_buffers_nvhe(struct kvm_vcpu *vcpu); 88 + void __debug_restore_host_buffers_nvhe(struct kvm_vcpu *vcpu); 89 + #endif 90 + 86 91 void __fpsimd_save_state(struct user_fpsimd_state *fp_regs); 87 92 void __fpsimd_restore_state(struct user_fpsimd_state *fp_regs); 88 93 ··· 102 97 103 98 void __noreturn hyp_panic(void); 104 99 #ifdef __KVM_NVHE_HYPERVISOR__ 105 - void __noreturn __hyp_do_panic(bool restore_host, u64 spsr, u64 elr, u64 par); 100 + void __noreturn __hyp_do_panic(struct kvm_cpu_context *host_ctxt, u64 spsr, 101 + u64 elr, u64 par); 106 102 #endif 107 103 108 104 #endif /* __ARM64_KVM_HYP_H__ */
+3
arch/arm64/kernel/image-vars.h
··· 101 101 /* Array containing bases of nVHE per-CPU memory regions. */ 102 102 KVM_NVHE_ALIAS(kvm_arm_hyp_percpu_base); 103 103 104 + /* PMU available static key */ 105 + KVM_NVHE_ALIAS(kvm_arm_pmu_available); 106 + 104 107 #endif /* CONFIG_KVM */ 105 108 106 109 #endif /* __ARM64_KERNEL_IMAGE_VARS_H */
+6 -1
arch/arm64/kvm/arm.c
··· 385 385 last_ran = this_cpu_ptr(mmu->last_vcpu_ran); 386 386 387 387 /* 388 + * We guarantee that both TLBs and I-cache are private to each 389 + * vcpu. If detecting that a vcpu from the same VM has 390 + * previously run on the same physical CPU, call into the 391 + * hypervisor code to nuke the relevant contexts. 392 + * 388 393 * We might get preempted before the vCPU actually runs, but 389 394 * over-invalidation doesn't affect correctness. 390 395 */ 391 396 if (*last_ran != vcpu->vcpu_id) { 392 - kvm_call_hyp(__kvm_tlb_flush_local_vmid, mmu); 397 + kvm_call_hyp(__kvm_flush_cpu_context, mmu); 393 398 *last_ran = vcpu->vcpu_id; 394 399 } 395 400
+5 -3
arch/arm64/kvm/hyp/entry.S
··· 85 85 86 86 // If the hyp context is loaded, go straight to hyp_panic 87 87 get_loaded_vcpu x0, x1 88 - cbz x0, hyp_panic 88 + cbnz x0, 1f 89 + b hyp_panic 89 90 91 + 1: 90 92 // The hyp context is saved so make sure it is restored to allow 91 93 // hyp_panic to run at hyp and, subsequently, panic to run in the host. 92 94 // This makes use of __guest_exit to avoid duplication but sets the ··· 96 94 // current state is saved to the guest context but it will only be 97 95 // accurate if the guest had been completely restored. 98 96 adr_this_cpu x0, kvm_hyp_ctxt, x1 99 - adr x1, hyp_panic 97 + adr_l x1, hyp_panic 100 98 str x1, [x0, #CPU_XREG_OFFSET(30)] 101 99 102 100 get_vcpu_ptr x1, x0 ··· 148 146 // Now restore the hyp regs 149 147 restore_callee_saved_regs x2 150 148 151 - set_loaded_vcpu xzr, x1, x2 149 + set_loaded_vcpu xzr, x2, x3 152 150 153 151 alternative_if ARM64_HAS_RAS_EXTN 154 152 // If we have the RAS extensions we can consume a pending error
+6 -3
arch/arm64/kvm/hyp/include/hyp/switch.h
··· 90 90 * counter, which could make a PMXEVCNTR_EL0 access UNDEF at 91 91 * EL1 instead of being trapped to EL2. 92 92 */ 93 - write_sysreg(0, pmselr_el0); 94 - write_sysreg(ARMV8_PMU_USERENR_MASK, pmuserenr_el0); 93 + if (kvm_arm_support_pmu_v3()) { 94 + write_sysreg(0, pmselr_el0); 95 + write_sysreg(ARMV8_PMU_USERENR_MASK, pmuserenr_el0); 96 + } 95 97 write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2); 96 98 } 97 99 98 100 static inline void __deactivate_traps_common(void) 99 101 { 100 102 write_sysreg(0, hstr_el2); 101 - write_sysreg(0, pmuserenr_el0); 103 + if (kvm_arm_support_pmu_v3()) 104 + write_sysreg(0, pmuserenr_el0); 102 105 } 103 106 104 107 static inline void ___activate_traps(struct kvm_vcpu *vcpu)
+10 -2
arch/arm64/kvm/hyp/nvhe/debug-sr.c
··· 58 58 write_sysreg_s(pmscr_el1, SYS_PMSCR_EL1); 59 59 } 60 60 61 - void __debug_switch_to_guest(struct kvm_vcpu *vcpu) 61 + void __debug_save_host_buffers_nvhe(struct kvm_vcpu *vcpu) 62 62 { 63 63 /* Disable and flush SPE data generation */ 64 64 __debug_save_spe(&vcpu->arch.host_debug_state.pmscr_el1); 65 + } 66 + 67 + void __debug_switch_to_guest(struct kvm_vcpu *vcpu) 68 + { 65 69 __debug_switch_to_guest_common(vcpu); 70 + } 71 + 72 + void __debug_restore_host_buffers_nvhe(struct kvm_vcpu *vcpu) 73 + { 74 + __debug_restore_spe(vcpu->arch.host_debug_state.pmscr_el1); 66 75 } 67 76 68 77 void __debug_switch_to_host(struct kvm_vcpu *vcpu) 69 78 { 70 - __debug_restore_spe(vcpu->arch.host_debug_state.pmscr_el1); 71 79 __debug_switch_to_host_common(vcpu); 72 80 } 73 81
+8 -7
arch/arm64/kvm/hyp/nvhe/host.S
··· 71 71 SYM_FUNC_END(__host_enter) 72 72 73 73 /* 74 - * void __noreturn __hyp_do_panic(bool restore_host, u64 spsr, u64 elr, u64 par); 74 + * void __noreturn __hyp_do_panic(struct kvm_cpu_context *host_ctxt, u64 spsr, 75 + * u64 elr, u64 par); 75 76 */ 76 77 SYM_FUNC_START(__hyp_do_panic) 77 78 /* Prepare and exit to the host's panic function. */ ··· 83 82 hyp_kimg_va lr, x6 84 83 msr elr_el2, lr 85 84 86 - /* Set the panic format string. Use the, now free, LR as scratch. */ 87 - ldr lr, =__hyp_panic_string 88 - hyp_kimg_va lr, x6 85 + mov x29, x0 86 + 87 + /* Load the format string into x0 and arguments into x1-7 */ 88 + ldr x0, =__hyp_panic_string 89 + hyp_kimg_va x0, x6 89 90 90 91 /* Load the format arguments into x1-7. */ 91 92 mov x6, x3 ··· 97 94 mrs x5, hpfar_el2 98 95 99 96 /* Enter the host, conditionally restoring the host context. */ 100 - cmp x0, xzr 101 - mov x0, lr 102 - b.eq __host_enter_without_restoring 97 + cbz x29, __host_enter_without_restoring 103 98 b __host_enter_for_panic 104 99 SYM_FUNC_END(__hyp_do_panic) 105 100
+6 -6
arch/arm64/kvm/hyp/nvhe/hyp-main.c
··· 46 46 __kvm_tlb_flush_vmid(kern_hyp_va(mmu)); 47 47 } 48 48 49 - static void handle___kvm_tlb_flush_local_vmid(struct kvm_cpu_context *host_ctxt) 49 + static void handle___kvm_flush_cpu_context(struct kvm_cpu_context *host_ctxt) 50 50 { 51 51 DECLARE_REG(struct kvm_s2_mmu *, mmu, host_ctxt, 1); 52 52 53 - __kvm_tlb_flush_local_vmid(kern_hyp_va(mmu)); 53 + __kvm_flush_cpu_context(kern_hyp_va(mmu)); 54 54 } 55 55 56 56 static void handle___kvm_timer_set_cntvoff(struct kvm_cpu_context *host_ctxt) ··· 67 67 write_sysreg_el2(tmp, SYS_SCTLR); 68 68 } 69 69 70 - static void handle___vgic_v3_get_ich_vtr_el2(struct kvm_cpu_context *host_ctxt) 70 + static void handle___vgic_v3_get_gic_config(struct kvm_cpu_context *host_ctxt) 71 71 { 72 - cpu_reg(host_ctxt, 1) = __vgic_v3_get_ich_vtr_el2(); 72 + cpu_reg(host_ctxt, 1) = __vgic_v3_get_gic_config(); 73 73 } 74 74 75 75 static void handle___vgic_v3_read_vmcr(struct kvm_cpu_context *host_ctxt) ··· 115 115 HANDLE_FUNC(__kvm_flush_vm_context), 116 116 HANDLE_FUNC(__kvm_tlb_flush_vmid_ipa), 117 117 HANDLE_FUNC(__kvm_tlb_flush_vmid), 118 - HANDLE_FUNC(__kvm_tlb_flush_local_vmid), 118 + HANDLE_FUNC(__kvm_flush_cpu_context), 119 119 HANDLE_FUNC(__kvm_timer_set_cntvoff), 120 120 HANDLE_FUNC(__kvm_enable_ssbs), 121 - HANDLE_FUNC(__vgic_v3_get_ich_vtr_el2), 121 + HANDLE_FUNC(__vgic_v3_get_gic_config), 122 122 HANDLE_FUNC(__vgic_v3_read_vmcr), 123 123 HANDLE_FUNC(__vgic_v3_write_vmcr), 124 124 HANDLE_FUNC(__vgic_v3_init_lrs),
+11 -3
arch/arm64/kvm/hyp/nvhe/switch.c
··· 192 192 pmu_switch_needed = __pmu_switch_to_guest(host_ctxt); 193 193 194 194 __sysreg_save_state_nvhe(host_ctxt); 195 + /* 196 + * We must flush and disable the SPE buffer for nVHE, as 197 + * the translation regime(EL1&0) is going to be loaded with 198 + * that of the guest. And we must do this before we change the 199 + * translation regime to EL2 (via MDCR_EL2_E2PB == 0) and 200 + * before we load guest Stage1. 201 + */ 202 + __debug_save_host_buffers_nvhe(vcpu); 195 203 196 204 __adjust_pc(vcpu); 197 205 ··· 242 234 if (vcpu->arch.flags & KVM_ARM64_FP_ENABLED) 243 235 __fpsimd_save_fpexc32(vcpu); 244 236 237 + __debug_switch_to_host(vcpu); 245 238 /* 246 239 * This must come after restoring the host sysregs, since a non-VHE 247 240 * system may enable SPE here and make use of the TTBRs. 248 241 */ 249 - __debug_switch_to_host(vcpu); 242 + __debug_restore_host_buffers_nvhe(vcpu); 250 243 251 244 if (pmu_switch_needed) 252 245 __pmu_switch_to_host(host_ctxt); ··· 266 257 u64 spsr = read_sysreg_el2(SYS_SPSR); 267 258 u64 elr = read_sysreg_el2(SYS_ELR); 268 259 u64 par = read_sysreg_par(); 269 - bool restore_host = true; 270 260 struct kvm_cpu_context *host_ctxt; 271 261 struct kvm_vcpu *vcpu; 272 262 ··· 279 271 __sysreg_restore_state_nvhe(host_ctxt); 280 272 } 281 273 282 - __hyp_do_panic(restore_host, spsr, elr, par); 274 + __hyp_do_panic(host_ctxt, spsr, elr, par); 283 275 unreachable(); 284 276 } 285 277
+2 -1
arch/arm64/kvm/hyp/nvhe/tlb.c
··· 123 123 __tlb_switch_to_host(&cxt); 124 124 } 125 125 126 - void __kvm_tlb_flush_local_vmid(struct kvm_s2_mmu *mmu) 126 + void __kvm_flush_cpu_context(struct kvm_s2_mmu *mmu) 127 127 { 128 128 struct tlb_inv_context cxt; 129 129 ··· 131 131 __tlb_switch_to_guest(mmu, &cxt); 132 132 133 133 __tlbi(vmalle1); 134 + asm volatile("ic iallu"); 134 135 dsb(nsh); 135 136 isb(); 136 137
+1
arch/arm64/kvm/hyp/pgtable.c
··· 223 223 goto out; 224 224 225 225 if (!table) { 226 + data->addr = ALIGN_DOWN(data->addr, kvm_granule_size(level)); 226 227 data->addr += kvm_granule_size(level); 227 228 goto out; 228 229 }
+38 -2
arch/arm64/kvm/hyp/vgic-v3-sr.c
··· 405 405 __gic_v3_set_lr(0, i); 406 406 } 407 407 408 - u64 __vgic_v3_get_ich_vtr_el2(void) 408 + /* 409 + * Return the GIC CPU configuration: 410 + * - [31:0] ICH_VTR_EL2 411 + * - [62:32] RES0 412 + * - [63] MMIO (GICv2) capable 413 + */ 414 + u64 __vgic_v3_get_gic_config(void) 409 415 { 410 - return read_gicreg(ICH_VTR_EL2); 416 + u64 val, sre = read_gicreg(ICC_SRE_EL1); 417 + unsigned long flags = 0; 418 + 419 + /* 420 + * To check whether we have a MMIO-based (GICv2 compatible) 421 + * CPU interface, we need to disable the system register 422 + * view. To do that safely, we have to prevent any interrupt 423 + * from firing (which would be deadly). 424 + * 425 + * Note that this only makes sense on VHE, as interrupts are 426 + * already masked for nVHE as part of the exception entry to 427 + * EL2. 428 + */ 429 + if (has_vhe()) 430 + flags = local_daif_save(); 431 + 432 + write_gicreg(0, ICC_SRE_EL1); 433 + isb(); 434 + 435 + val = read_gicreg(ICC_SRE_EL1); 436 + 437 + write_gicreg(sre, ICC_SRE_EL1); 438 + isb(); 439 + 440 + if (has_vhe()) 441 + local_daif_restore(flags); 442 + 443 + val = (val & ICC_SRE_EL1_SRE) ? 0 : (1ULL << 63); 444 + val |= read_gicreg(ICH_VTR_EL2); 445 + 446 + return val; 411 447 } 412 448 413 449 u64 __vgic_v3_read_vmcr(void)
+2 -1
arch/arm64/kvm/hyp/vhe/tlb.c
··· 127 127 __tlb_switch_to_host(&cxt); 128 128 } 129 129 130 - void __kvm_tlb_flush_local_vmid(struct kvm_s2_mmu *mmu) 130 + void __kvm_flush_cpu_context(struct kvm_s2_mmu *mmu) 131 131 { 132 132 struct tlb_inv_context cxt; 133 133 ··· 135 135 __tlb_switch_to_guest(mmu, &cxt); 136 136 137 137 __tlbi(vmalle1); 138 + asm volatile("ic iallu"); 138 139 dsb(nsh); 139 140 isb(); 140 141
+1 -2
arch/arm64/kvm/mmu.c
··· 1312 1312 * Prevent userspace from creating a memory region outside of the IPA 1313 1313 * space addressable by the KVM guest IPA space. 1314 1314 */ 1315 - if (memslot->base_gfn + memslot->npages >= 1316 - (kvm_phys_size(kvm) >> PAGE_SHIFT)) 1315 + if ((memslot->base_gfn + memslot->npages) > (kvm_phys_size(kvm) >> PAGE_SHIFT)) 1317 1316 return -EFAULT; 1318 1317 1319 1318 mmap_read_lock(current->mm);
+10
arch/arm64/kvm/perf.c
··· 11 11 12 12 #include <asm/kvm_emulate.h> 13 13 14 + DEFINE_STATIC_KEY_FALSE(kvm_arm_pmu_available); 15 + 14 16 static int kvm_is_in_guest(void) 15 17 { 16 18 return kvm_get_running_vcpu() != NULL; ··· 50 48 51 49 int kvm_perf_init(void) 52 50 { 51 + /* 52 + * Check if HW_PERF_EVENTS are supported by checking the number of 53 + * hardware performance counters. This could ensure the presence of 54 + * a physical PMU and CONFIG_PERF_EVENT is selected. 55 + */ 56 + if (IS_ENABLED(CONFIG_ARM_PMU) && perf_num_counters() > 0) 57 + static_branch_enable(&kvm_arm_pmu_available); 58 + 53 59 return perf_register_guest_info_callbacks(&kvm_guest_cbs); 54 60 } 55 61
-10
arch/arm64/kvm/pmu-emul.c
··· 823 823 return val & mask; 824 824 } 825 825 826 - bool kvm_arm_support_pmu_v3(void) 827 - { 828 - /* 829 - * Check if HW_PERF_EVENTS are supported by checking the number of 830 - * hardware performance counters. This could ensure the presence of 831 - * a physical PMU and CONFIG_PERF_EVENT is selected. 832 - */ 833 - return (perf_num_counters() > 0); 834 - } 835 - 836 826 int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu) 837 827 { 838 828 if (!kvm_vcpu_has_pmu(vcpu))
+8 -4
arch/arm64/kvm/reset.c
··· 326 326 } 327 327 328 328 kvm_ipa_limit = id_aa64mmfr0_parange_to_phys_shift(parange); 329 - WARN(kvm_ipa_limit < KVM_PHYS_SHIFT, 330 - "KVM IPA Size Limit (%d bits) is smaller than default size\n", 331 - kvm_ipa_limit); 332 - kvm_info("IPA Size Limit: %d bits\n", kvm_ipa_limit); 329 + kvm_info("IPA Size Limit: %d bits%s\n", kvm_ipa_limit, 330 + ((kvm_ipa_limit < KVM_PHYS_SHIFT) ? 331 + " (Reduced IPA size, limited VM/VMM compatibility)" : "")); 333 332 334 333 return 0; 335 334 } ··· 357 358 return -EINVAL; 358 359 } else { 359 360 phys_shift = KVM_PHYS_SHIFT; 361 + if (phys_shift > kvm_ipa_limit) { 362 + pr_warn_once("%s using unsupported default IPA limit, upgrade your VMM\n", 363 + current->comm); 364 + return -EINVAL; 365 + } 360 366 } 361 367 362 368 mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
+9 -3
arch/arm64/kvm/vgic/vgic-v3.c
··· 574 574 */ 575 575 int vgic_v3_probe(const struct gic_kvm_info *info) 576 576 { 577 - u32 ich_vtr_el2 = kvm_call_hyp_ret(__vgic_v3_get_ich_vtr_el2); 577 + u64 ich_vtr_el2 = kvm_call_hyp_ret(__vgic_v3_get_gic_config); 578 + bool has_v2; 578 579 int ret; 580 + 581 + has_v2 = ich_vtr_el2 >> 63; 582 + ich_vtr_el2 = (u32)ich_vtr_el2; 579 583 580 584 /* 581 585 * The ListRegs field is 5 bits, but there is an architectural ··· 598 594 gicv4_enable ? "en" : "dis"); 599 595 } 600 596 597 + kvm_vgic_global_state.vcpu_base = 0; 598 + 601 599 if (!info->vcpu.start) { 602 600 kvm_info("GICv3: no GICV resource entry\n"); 603 - kvm_vgic_global_state.vcpu_base = 0; 601 + } else if (!has_v2) { 602 + pr_warn(FW_BUG "CPU interface incapable of MMIO access\n"); 604 603 } else if (!PAGE_ALIGNED(info->vcpu.start)) { 605 604 pr_warn("GICV physical address 0x%llx not page aligned\n", 606 605 (unsigned long long)info->vcpu.start); 607 - kvm_vgic_global_state.vcpu_base = 0; 608 606 } else { 609 607 kvm_vgic_global_state.vcpu_base = info->vcpu.start; 610 608 kvm_vgic_global_state.can_emulate_gicv2 = true;
+2 -2
arch/x86/include/asm/kvm_host.h
··· 963 963 struct kvm_pit *vpit; 964 964 atomic_t vapics_in_nmi_mode; 965 965 struct mutex apic_map_lock; 966 - struct kvm_apic_map *apic_map; 966 + struct kvm_apic_map __rcu *apic_map; 967 967 atomic_t apic_map_dirty; 968 968 969 969 bool apic_access_page_done; ··· 1036 1036 1037 1037 bool bus_lock_detection_enabled; 1038 1038 1039 - struct kvm_pmu_event_filter *pmu_event_filter; 1039 + struct kvm_pmu_event_filter __rcu *pmu_event_filter; 1040 1040 struct task_struct *nx_lpage_recovery_thread; 1041 1041 1042 1042 #ifdef CONFIG_X86_64
+12 -13
arch/x86/kernel/kvmclock.c
··· 268 268 269 269 static int __init kvm_setup_vsyscall_timeinfo(void) 270 270 { 271 - #ifdef CONFIG_X86_64 272 - u8 flags; 273 - 274 - if (!per_cpu(hv_clock_per_cpu, 0) || !kvmclock_vsyscall) 275 - return 0; 276 - 277 - flags = pvclock_read_flags(&hv_clock_boot[0].pvti); 278 - if (!(flags & PVCLOCK_TSC_STABLE_BIT)) 279 - return 0; 280 - 281 - kvm_clock.vdso_clock_mode = VDSO_CLOCKMODE_PVCLOCK; 282 - #endif 283 - 284 271 kvmclock_init_mem(); 272 + 273 + #ifdef CONFIG_X86_64 274 + if (per_cpu(hv_clock_per_cpu, 0) && kvmclock_vsyscall) { 275 + u8 flags; 276 + 277 + flags = pvclock_read_flags(&hv_clock_boot[0].pvti); 278 + if (!(flags & PVCLOCK_TSC_STABLE_BIT)) 279 + return 0; 280 + 281 + kvm_clock.vdso_clock_mode = VDSO_CLOCKMODE_PVCLOCK; 282 + } 283 + #endif 285 284 286 285 return 0; 287 286 }
+11 -1
arch/x86/kvm/lapic.c
··· 1642 1642 } 1643 1643 1644 1644 if (kvm_use_posted_timer_interrupt(apic->vcpu)) { 1645 - kvm_wait_lapic_expire(vcpu); 1645 + /* 1646 + * Ensure the guest's timer has truly expired before posting an 1647 + * interrupt. Open code the relevant checks to avoid querying 1648 + * lapic_timer_int_injected(), which will be false since the 1649 + * interrupt isn't yet injected. Waiting until after injecting 1650 + * is not an option since that won't help a posted interrupt. 1651 + */ 1652 + if (vcpu->arch.apic->lapic_timer.expired_tscdeadline && 1653 + vcpu->arch.apic->lapic_timer.timer_advance_ns) 1654 + __kvm_wait_lapic_expire(vcpu); 1646 1655 kvm_apic_inject_pending_timer_irqs(apic); 1647 1656 return; 1648 1657 } ··· 2604 2595 2605 2596 apic_update_ppr(apic); 2606 2597 hrtimer_cancel(&apic->lapic_timer.timer); 2598 + apic->lapic_timer.expired_tscdeadline = 0; 2607 2599 apic_update_lvtt(apic); 2608 2600 apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0)); 2609 2601 update_divide_count(apic);
+11
arch/x86/kvm/mmu/tdp_mmu.c
··· 337 337 cpu_relax(); 338 338 } 339 339 } else { 340 + /* 341 + * If the SPTE is not MMU-present, there is no backing 342 + * page associated with the SPTE and so no side effects 343 + * that need to be recorded, and exclusive ownership of 344 + * mmu_lock ensures the SPTE can't be made present. 345 + * Note, zapping MMIO SPTEs is also unnecessary as they 346 + * are guarded by the memslots generation, not by being 347 + * unreachable. 348 + */ 340 349 old_child_spte = READ_ONCE(*sptep); 350 + if (!is_shadow_present_pte(old_child_spte)) 351 + continue; 341 352 342 353 /* 343 354 * Marking the SPTE as a removed SPTE is not
+13 -12
arch/x86/kvm/svm/svm.c
··· 115 115 { .index = MSR_INVALID, .always = false }, 116 116 }; 117 117 118 - /* enable NPT for AMD64 and X86 with PAE */ 119 - #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) 120 - bool npt_enabled = true; 121 - #else 122 - bool npt_enabled; 123 - #endif 124 - 125 118 /* 126 119 * These 2 parameters are used to config the controls for Pause-Loop Exiting: 127 120 * pause_filter_count: On processors that support Pause filtering(indicated ··· 163 170 static unsigned short pause_filter_count_max = KVM_SVM_DEFAULT_PLE_WINDOW_MAX; 164 171 module_param(pause_filter_count_max, ushort, 0444); 165 172 166 - /* allow nested paging (virtualized MMU) for all guests */ 167 - static int npt = true; 168 - module_param(npt, int, S_IRUGO); 173 + /* 174 + * Use nested page tables by default. Note, NPT may get forced off by 175 + * svm_hardware_setup() if it's unsupported by hardware or the host kernel. 176 + */ 177 + bool npt_enabled = true; 178 + module_param_named(npt, npt_enabled, bool, 0444); 169 179 170 180 /* allow nested virtualization in KVM/SVM */ 171 181 static int nested = true; ··· 984 988 goto err; 985 989 } 986 990 987 - if (!boot_cpu_has(X86_FEATURE_NPT)) 991 + /* 992 + * KVM's MMU doesn't support using 2-level paging for itself, and thus 993 + * NPT isn't supported if the host is using 2-level paging since host 994 + * CR4 is unchanged on VMRUN. 995 + */ 996 + if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE)) 988 997 npt_enabled = false; 989 998 990 - if (npt_enabled && !npt) 999 + if (!boot_cpu_has(X86_FEATURE_NPT)) 991 1000 npt_enabled = false; 992 1001 993 1002 kvm_configure_mmu(npt_enabled, get_max_npt_level(), PG_LEVEL_1G);
+1 -1
arch/x86/kvm/x86.c
··· 10601 10601 return (void __user *)hva; 10602 10602 } else { 10603 10603 if (!slot || !slot->npages) 10604 - return 0; 10604 + return NULL; 10605 10605 10606 10606 old_npages = slot->npages; 10607 10607 hva = slot->userspace_addr;
+7 -2
include/kvm/arm_pmu.h
··· 13 13 #define ARMV8_PMU_CYCLE_IDX (ARMV8_PMU_MAX_COUNTERS - 1) 14 14 #define ARMV8_PMU_MAX_COUNTER_PAIRS ((ARMV8_PMU_MAX_COUNTERS + 1) >> 1) 15 15 16 + DECLARE_STATIC_KEY_FALSE(kvm_arm_pmu_available); 17 + 18 + static __always_inline bool kvm_arm_support_pmu_v3(void) 19 + { 20 + return static_branch_likely(&kvm_arm_pmu_available); 21 + } 22 + 16 23 #ifdef CONFIG_HW_PERF_EVENTS 17 24 18 25 struct kvm_pmc { ··· 54 47 void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val); 55 48 void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data, 56 49 u64 select_idx); 57 - bool kvm_arm_support_pmu_v3(void); 58 50 int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, 59 51 struct kvm_device_attr *attr); 60 52 int kvm_arm_pmu_v3_get_attr(struct kvm_vcpu *vcpu, ··· 93 87 static inline void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val) {} 94 88 static inline void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, 95 89 u64 data, u64 select_idx) {} 96 - static inline bool kvm_arm_support_pmu_v3(void) { return false; } 97 90 static inline int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, 98 91 struct kvm_device_attr *attr) 99 92 {