Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull x86 kvm updates from Paolo Bonzini:
"x86:

- KVM currently invalidates the entirety of the page tables, not just
those for the memslot being touched, when a memslot is moved or
deleted.

This does not traditionally have particularly noticeable overhead,
but Intel's TDX will require the guest to re-accept private pages
if they are dropped from the secure EPT, which is a non-starter.

Actually, the only reason why this is not already being done is a
bug which was never fully investigated and caused VM instability
with assigned GeForce GPUs, so allow userspace to opt into the new
behavior.
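
Userspace opts in by disabling the new KVM_X86_QUIRK_SLOT_ZAP_ALL quirk
(see the uapi and api.rst hunks below); an illustrative snippet, assuming
the usual <linux/kvm.h> definitions and an already-open VM fd:

    struct kvm_enable_cap cap = {
        .cap = KVM_CAP_DISABLE_QUIRKS2,
        /* disable "zap all SPTEs on memslot deletion", i.e. opt in to
         * zapping only the SPTEs covering the deleted/moved memslot */
        .args[0] = KVM_X86_QUIRK_SLOT_ZAP_ALL,
    };

    ioctl(vm_fd, KVM_ENABLE_CAP, &cap);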

- Advertise AVX10.1 to userspace (effectively prep work for the
"real" AVX10 functionality that is on the horizon)

- Rework common MSR handling code to suppress errors on userspace
accesses to unsupported-but-advertised MSRs

This will allow removing (almost?) all of KVM's exemptions for
userspace access to MSRs that shouldn't exist based on the vCPU
model (the actual cleanup is non-trivial future work)

- Rework KVM's handling of x2APIC ICR, again, because AMD (x2AVIC)
splits the 64-bit value into the legacy ICR and ICR2 storage,
whereas Intel (APICv) stores the entire 64-bit value at the ICR
offset
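
The read side of the reworked scheme is shown in the arch/x86/kvm/lapic.c
hunk below; in short, a new x2apic_icr_is_split vendor flag selects how the
64-bit value is reassembled:

    static u64 kvm_x2apic_icr_read(struct kvm_lapic *apic)
    {
        if (kvm_x86_ops.x2apic_icr_is_split)    /* AMD x2AVIC: ICR + ICR2 halves */
            return (u64)kvm_lapic_get_reg(apic, APIC_ICR) |
                   (u64)kvm_lapic_get_reg(apic, APIC_ICR2) << 32;

        /* Intel APICv: the full 64-bit value lives at the ICR offset */
        return kvm_lapic_get_reg64(apic, APIC_ICR);
    }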

- Fix a bug where KVM would fail to exit to userspace when such an exit
was triggered by a fastpath exit handler

- Add fastpath handling of HLT VM-Exit to expedite re-entering the
guest when there's already a pending wake event at the time of the
exit

- Fix a WARN caused by RSM entering a nested guest from SMM with
invalid guest state, by forcing the vCPU out of guest mode prior to
signalling SHUTDOWN (the SHUTDOWN applies to the VM as a whole, not
just the nested guest)

- Overhaul the "unprotect and retry" logic to more precisely identify
cases where retrying is actually helpful, and to harden all retry
paths against putting the guest into an infinite retry loop

- Add support for yielding, e.g. to honor NEED_RESCHED, when zapping
rmaps in the shadow MMU

- Refactor pieces of the shadow MMU related to aging SPTEs in
preparation for adding multi-generational LRU support in KVM

- Don't stuff the RSB after VM-Exit when RETPOLINE=y and AutoIBRS is
enabled, i.e. when the CPU has already flushed the RSB

- Trace the per-CPU host save area as a VMCB pointer to improve
readability and cleanup the retrieval of the SEV-ES host save area

- Remove unnecessary accounting of temporary nested VMCB related
allocations

- Set FINAL/PAGE in the page fault error code for EPT violations if
and only if the GVA is valid. If the GVA is NOT valid, there is no
guest-side page table walk and so stuffing paging related metadata
is nonsensical
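
Concretely, a hedged sketch of the VMX EPT-violation handling (the exact
macro spellings for the exit-qualification bits are illustrative): the
guest page-walk metadata is ORed in only when the exit qualification
reports a valid GVA:

    u64 error_code = 0;    /* R/W/X access and permission bits elided */

    /* Bit 7 of the exit qualification: the guest linear address is valid.
     * Bit 8: the access was to the final translation, not to a guest
     * paging-structure entry. */
    if (exit_qualification & EPT_VIOLATION_GVA_IS_VALID)
        error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) ?
                      PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;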

- Fix a bug where KVM would incorrectly synthesize a nested VM-Exit
instead of emulating posted interrupt delivery to L2

- Add a lockdep assertion to detect unsafe accesses of vmcs12
structures

- Harden eVMCS loading against an impossible NULL pointer deref
(really truly should be impossible)

- Minor SGX fix and a cleanup

- Misc cleanups

Generic:

- Register KVM's cpuhp and syscore callbacks when enabling
virtualization in hardware, as the sole purpose of said callbacks
is to disable and re-enable virtualization as needed

- Enable virtualization when KVM is loaded, not right before the
first VM is created

Together with the previous change, this greatly simplifies the logic
of the callbacks, because their very existence implies that
virtualization is enabled
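
A hedged sketch of the on-demand model described above (not the exact
kvm_main.c code; enable_virtualization_on_all_cpus() is a placeholder for
the real cpuhp/syscore plumbing):

    static int kvm_enable_virtualization(void)
    {
        int r = 0;

        mutex_lock(&kvm_usage_lock);
        if (!kvm_usage_count++) {
            r = enable_virtualization_on_all_cpus();
            if (r)
                --kvm_usage_count;
        }
        mutex_unlock(&kvm_usage_lock);

        return r;
    }

With enable_virt_at_load enabled (the behavior described above), this path
is effectively taken once at module load rather than on every 0=>1 VM
transition.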

- Fix a bug that results in KVM prematurely exiting to userspace for
coalesced MMIO/PIO in many cases, clean up the related code, and
add a testcase

- Fix a bug in kvm_clear_guest() where it would trigger a buffer
overflow _if_ the gpa+len crosses a page boundary, which thankfully
is guaranteed to not happen in the current code base. Add WARNs in
more helpers that read/write guest memory to detect similar bugs
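
For context, the overflow class being guarded against is a per-page helper
being handed a range that spills past its page; a hypothetical wrapper
(kvm_clear_guest_page_checked is illustrative, not an actual helper) shows
the style of check being added:

    static int kvm_clear_guest_page_checked(struct kvm *kvm, gfn_t gfn,
                                            int offset, int len)
    {
        /* Per-page helpers operate on at most one page; anything else is
         * a caller bug, so WARN and bail rather than write past the page. */
        if (WARN_ON_ONCE(offset + len > PAGE_SIZE))
            return -EFAULT;

        return kvm_clear_guest_page(kvm, gfn, offset, len);
    }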

Selftests:

- Fix a goof that caused some Hyper-V tests to be skipped when run on
bare metal, i.e. NOT in a VM

- Add a regression test for KVM's handling of SHUTDOWN for an SEV-ES
guest

- Explicitly include one-off assets in .gitignore. Past Sean was
completely wrong about not being able to detect missing .gitignore
entries

- Verify userspace single-stepping works when KVM happens to handle a
VM-Exit in its fastpath

- Misc cleanups"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (127 commits)
Documentation: KVM: fix warning in "make htmldocs"
s390: Enable KVM_S390_UCONTROL config in debug_defconfig
selftests: kvm: s390: Add VM run test case
KVM: SVM: let alternatives handle the cases when RSB filling is required
KVM: VMX: Set PFERR_GUEST_{FINAL,PAGE}_MASK if and only if the GVA is valid
KVM: x86/mmu: Use KVM_PAGES_PER_HPAGE() instead of an open coded equivalent
KVM: x86/mmu: Add KVM_RMAP_MANY to replace open coded '1' and '1ul' literals
KVM: x86/mmu: Fold mmu_spte_age() into kvm_rmap_age_gfn_range()
KVM: x86/mmu: Morph kvm_handle_gfn_range() into an aging specific helper
KVM: x86/mmu: Honor NEED_RESCHED when zapping rmaps and blocking is allowed
KVM: x86/mmu: Add a helper to walk and zap rmaps for a memslot
KVM: x86/mmu: Plumb a @can_yield parameter into __walk_slot_rmaps()
KVM: x86/mmu: Move walk_slot_rmaps() up near for_each_slot_rmap_range()
KVM: x86/mmu: WARN on MMIO cache hit when emulating write-protected gfn
KVM: x86/mmu: Detect if unprotect will do anything based on invalid_list
KVM: x86/mmu: Subsume kvm_mmu_unprotect_page() into the and_retry() version
KVM: x86: Rename reexecute_instruction()=>kvm_unprotect_and_retry_on_failure()
KVM: x86: Update retry protection fields when forcing retry on emulation failure
KVM: x86: Apply retry protection to "unprotect on failure" path
KVM: x86: Check EMULTYPE_WRITE_PF_TO_SP before unprotecting gfn
...

+2802 -1451
+17
Documentation/admin-guide/kernel-parameters.txt
··· 2677 2678 Default is Y (on). 2679 2680 kvm.enable_vmware_backdoor=[KVM] Support VMware backdoor PV interface. 2681 Default is false (don't support). 2682
··· 2677 2678 Default is Y (on). 2679 2680 + kvm.enable_virt_at_load=[KVM,ARM64,LOONGARCH,MIPS,RISCV,X86] 2681 + If enabled, KVM will enable virtualization in hardware 2682 + when KVM is loaded, and disable virtualization when KVM 2683 + is unloaded (if KVM is built as a module). 2684 + 2685 + If disabled, KVM will dynamically enable and disable 2686 + virtualization on-demand when creating and destroying 2687 + VMs, i.e. on the 0=>1 and 1=>0 transitions of the 2688 + number of VMs. 2689 + 2690 + Enabling virtualization at module load avoids potential 2691 + latency for creation of the 0=>1 VM, as KVM serializes 2692 + virtualization enabling across all online CPUs. The 2693 + "cost" of enabling virtualization when KVM is loaded 2694 + is that doing so may interfere with using out-of-tree 2695 + hypervisors that want to "own" virtualization hardware. 2696 + 2697 kvm.enable_vmware_backdoor=[KVM] Support VMware backdoor PV interface. 2698 Default is false (don't support). 2699
+27 -4
Documentation/virt/kvm/api.rst
··· 4214 enabled. If KVM_MSR_EXIT_REASON_FILTER is enabled, KVM will exit to userspace 4215 on denied accesses, i.e. userspace effectively intercepts the MSR access. If 4216 KVM_MSR_EXIT_REASON_FILTER is not enabled, KVM will inject a #GP into the guest 4217 - on denied accesses. 4218 4219 If an MSR access is allowed by userspace, KVM will emulate and/or virtualize 4220 the access in accordance with the vCPU model. Note, KVM may still ultimately ··· 4231 an error. 4232 4233 .. warning:: 4234 - MSR accesses as part of nested VM-Enter/VM-Exit are not filtered. 4235 - This includes both writes to individual VMCS fields and reads/writes 4236 - through the MSR lists pointed to by the VMCS. 4237 4238 x2APIC MSR accesses cannot be filtered (KVM silently ignores filters that 4239 cover any x2APIC MSRs). ··· 8097 guest CPUID on writes to MISC_ENABLE if 8098 KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT is 8099 disabled. 8100 =================================== ============================================ 8101 8102 7.32 KVM_CAP_MAX_VCPU_ID
··· 4214 enabled. If KVM_MSR_EXIT_REASON_FILTER is enabled, KVM will exit to userspace 4215 on denied accesses, i.e. userspace effectively intercepts the MSR access. If 4216 KVM_MSR_EXIT_REASON_FILTER is not enabled, KVM will inject a #GP into the guest 4217 + on denied accesses. Note, if an MSR access is denied during emulation of MSR 4218 + load/stores during VMX transitions, KVM ignores KVM_MSR_EXIT_REASON_FILTER. 4219 + See the below warning for full details. 4220 4221 If an MSR access is allowed by userspace, KVM will emulate and/or virtualize 4222 the access in accordance with the vCPU model. Note, KVM may still ultimately ··· 4229 an error. 4230 4231 .. warning:: 4232 + MSR accesses that are side effects of instruction execution (emulated or 4233 + native) are not filtered as hardware does not honor MSR bitmaps outside of 4234 + RDMSR and WRMSR, and KVM mimics that behavior when emulating instructions 4235 + to avoid pointless divergence from hardware. E.g. RDPID reads MSR_TSC_AUX, 4236 + SYSENTER reads the SYSENTER MSRs, etc. 4237 + 4238 + MSRs that are loaded/stored via dedicated VMCS fields are not filtered as 4239 + part of VM-Enter/VM-Exit emulation. 4240 + 4241 + MSRs that are loaded/stored via VMX's load/store lists _are_ filtered as part 4242 + of VM-Enter/VM-Exit emulation. If an MSR access is denied on VM-Enter, KVM 4243 + synthesizes a consistency check VM-Exit (EXIT_REASON_MSR_LOAD_FAIL). If an 4244 + MSR access is denied on VM-Exit, KVM synthesizes a VM-Abort. In short, KVM 4245 + extends Intel's architectural list of MSRs that cannot be loaded/saved via 4246 + the VM-Enter/VM-Exit MSR list. It is the platform owner's responsibility 4247 + to communicate any such restrictions to their end users. 4248 4249 x2APIC MSR accesses cannot be filtered (KVM silently ignores filters that 4250 cover any x2APIC MSRs). ··· 8082 guest CPUID on writes to MISC_ENABLE if 8083 KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT is 8084 disabled. 8085 + 8086 + KVM_X86_QUIRK_SLOT_ZAP_ALL By default, KVM invalidates all SPTEs in 8087 + a fast way for memslot deletion when VM type 8088 + is KVM_X86_DEFAULT_VM. 8089 + When this quirk is disabled or when VM type 8090 + is other than KVM_X86_DEFAULT_VM, KVM zaps 8091 + only leaf SPTEs that are within the range of 8092 + the memslot being deleted. 8093 =================================== ============================================ 8094 8095 7.32 KVM_CAP_MAX_VCPU_ID
+24 -8
Documentation/virt/kvm/locking.rst
··· 11 12 - cpus_read_lock() is taken outside kvm_lock 13 14 - kvm->lock is taken outside vcpu->mutex 15 16 - kvm->lock is taken outside kvm->slots_lock and kvm->irq_lock ··· 25 use the same memslots array. kvm->slots_lock and kvm->slots_arch_lock 26 are taken on the waiting side when modifying memslots, so MMU notifiers 27 must not take either kvm->slots_lock or kvm->slots_arch_lock. 28 29 For SRCU: 30 ··· 236 :Type: mutex 237 :Arch: any 238 :Protects: - vm_list 239 - - kvm_usage_count 240 - hardware virtualization enable/disable 241 - :Comment: KVM also disables CPU hotplug via cpus_read_lock() during 242 - enable/disable. 243 244 ``kvm->mn_invalidate_lock`` 245 ^^^^^^^^^^^^^^^^^^^^^^^^^^^ ··· 305 wakeup. 306 307 ``vendor_module_lock`` 308 - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 309 :Type: mutex 310 :Arch: x86 311 :Protects: loading a vendor module (kvm_amd or kvm_intel) 312 - :Comment: Exists because using kvm_lock leads to deadlock. cpu_hotplug_lock is 313 - taken outside of kvm_lock, e.g. in KVM's CPU online/offline callbacks, and 314 - many operations need to take cpu_hotplug_lock when loading a vendor module, 315 - e.g. updating static calls.
··· 11 12 - cpus_read_lock() is taken outside kvm_lock 13 14 + - kvm_usage_lock is taken outside cpus_read_lock() 15 + 16 - kvm->lock is taken outside vcpu->mutex 17 18 - kvm->lock is taken outside kvm->slots_lock and kvm->irq_lock ··· 23 use the same memslots array. kvm->slots_lock and kvm->slots_arch_lock 24 are taken on the waiting side when modifying memslots, so MMU notifiers 25 must not take either kvm->slots_lock or kvm->slots_arch_lock. 26 + 27 + cpus_read_lock() vs kvm_lock: 28 + 29 + - Taking cpus_read_lock() outside of kvm_lock is problematic, despite that 30 + being the official ordering, as it is quite easy to unknowingly trigger 31 + cpus_read_lock() while holding kvm_lock. Use caution when walking vm_list, 32 + e.g. avoid complex operations when possible. 33 34 For SRCU: 35 ··· 227 :Type: mutex 228 :Arch: any 229 :Protects: - vm_list 230 + 231 + ``kvm_usage_lock`` 232 + ^^^^^^^^^^^^^^^^^^ 233 + 234 + :Type: mutex 235 + :Arch: any 236 + :Protects: - kvm_usage_count 237 - hardware virtualization enable/disable 238 + :Comment: Exists to allow taking cpus_read_lock() while kvm_usage_count is 239 + protected, which simplifies the virtualization enabling logic. 240 241 ``kvm->mn_invalidate_lock`` 242 ^^^^^^^^^^^^^^^^^^^^^^^^^^^ ··· 290 wakeup. 291 292 ``vendor_module_lock`` 293 + ^^^^^^^^^^^^^^^^^^^^^^ 294 :Type: mutex 295 :Arch: x86 296 :Protects: loading a vendor module (kvm_amd or kvm_intel) 297 + :Comment: Exists because using kvm_lock leads to deadlock. kvm_lock is taken 298 + in notifiers, e.g. __kvmclock_cpufreq_notifier(), that may be invoked while 299 + cpu_hotplug_lock is held, e.g. from cpufreq_boost_trigger_state(), and many 300 + operations need to take cpu_hotplug_lock when loading a vendor module, e.g. 301 + updating static calls.
+3 -3
arch/arm64/kvm/arm.c
··· 2164 } 2165 } 2166 2167 - int kvm_arch_hardware_enable(void) 2168 { 2169 /* 2170 * Most calls to this function are made with migration ··· 2184 return 0; 2185 } 2186 2187 - void kvm_arch_hardware_disable(void) 2188 { 2189 kvm_timer_cpu_down(); 2190 kvm_vgic_cpu_down(); ··· 2380 2381 /* 2382 * The stub hypercalls are now disabled, so set our local flag to 2383 - * prevent a later re-init attempt in kvm_arch_hardware_enable(). 2384 */ 2385 __this_cpu_write(kvm_hyp_initialized, 1); 2386 preempt_enable();
··· 2164 } 2165 } 2166 2167 + int kvm_arch_enable_virtualization_cpu(void) 2168 { 2169 /* 2170 * Most calls to this function are made with migration ··· 2184 return 0; 2185 } 2186 2187 + void kvm_arch_disable_virtualization_cpu(void) 2188 { 2189 kvm_timer_cpu_down(); 2190 kvm_vgic_cpu_down(); ··· 2380 2381 /* 2382 * The stub hypercalls are now disabled, so set our local flag to 2383 + * prevent a later re-init attempt in kvm_arch_enable_virtualization_cpu(). 2384 */ 2385 __this_cpu_write(kvm_hyp_initialized, 1); 2386 preempt_enable();
+2 -2
arch/loongarch/kvm/main.c
··· 261 return -ENOIOCTLCMD; 262 } 263 264 - int kvm_arch_hardware_enable(void) 265 { 266 unsigned long env, gcfg = 0; 267 ··· 300 return 0; 301 } 302 303 - void kvm_arch_hardware_disable(void) 304 { 305 write_csr_gcfg(0); 306 write_csr_gstat(0);
··· 261 return -ENOIOCTLCMD; 262 } 263 264 + int kvm_arch_enable_virtualization_cpu(void) 265 { 266 unsigned long env, gcfg = 0; 267 ··· 300 return 0; 301 } 302 303 + void kvm_arch_disable_virtualization_cpu(void) 304 { 305 write_csr_gcfg(0); 306 write_csr_gstat(0);
+2 -2
arch/mips/include/asm/kvm_host.h
··· 728 int (*handle_fpe)(struct kvm_vcpu *vcpu); 729 int (*handle_msa_disabled)(struct kvm_vcpu *vcpu); 730 int (*handle_guest_exit)(struct kvm_vcpu *vcpu); 731 - int (*hardware_enable)(void); 732 - void (*hardware_disable)(void); 733 int (*check_extension)(struct kvm *kvm, long ext); 734 int (*vcpu_init)(struct kvm_vcpu *vcpu); 735 void (*vcpu_uninit)(struct kvm_vcpu *vcpu);
··· 728 int (*handle_fpe)(struct kvm_vcpu *vcpu); 729 int (*handle_msa_disabled)(struct kvm_vcpu *vcpu); 730 int (*handle_guest_exit)(struct kvm_vcpu *vcpu); 731 + int (*enable_virtualization_cpu)(void); 732 + void (*disable_virtualization_cpu)(void); 733 int (*check_extension)(struct kvm *kvm, long ext); 734 int (*vcpu_init)(struct kvm_vcpu *vcpu); 735 void (*vcpu_uninit)(struct kvm_vcpu *vcpu);
+4 -4
arch/mips/kvm/mips.c
··· 125 return 1; 126 } 127 128 - int kvm_arch_hardware_enable(void) 129 { 130 - return kvm_mips_callbacks->hardware_enable(); 131 } 132 133 - void kvm_arch_hardware_disable(void) 134 { 135 - kvm_mips_callbacks->hardware_disable(); 136 } 137 138 int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
··· 125 return 1; 126 } 127 128 + int kvm_arch_enable_virtualization_cpu(void) 129 { 130 + return kvm_mips_callbacks->enable_virtualization_cpu(); 131 } 132 133 + void kvm_arch_disable_virtualization_cpu(void) 134 { 135 + kvm_mips_callbacks->disable_virtualization_cpu(); 136 } 137 138 int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
+4 -4
arch/mips/kvm/vz.c
··· 2869 return ret + 1; 2870 } 2871 2872 - static int kvm_vz_hardware_enable(void) 2873 { 2874 unsigned int mmu_size, guest_mmu_size, ftlb_size; 2875 u64 guest_cvmctl, cvmvmconfig; ··· 2983 return 0; 2984 } 2985 2986 - static void kvm_vz_hardware_disable(void) 2987 { 2988 u64 cvmvmconfig; 2989 unsigned int mmu_size; ··· 3280 .handle_msa_disabled = kvm_trap_vz_handle_msa_disabled, 3281 .handle_guest_exit = kvm_trap_vz_handle_guest_exit, 3282 3283 - .hardware_enable = kvm_vz_hardware_enable, 3284 - .hardware_disable = kvm_vz_hardware_disable, 3285 .check_extension = kvm_vz_check_extension, 3286 .vcpu_init = kvm_vz_vcpu_init, 3287 .vcpu_uninit = kvm_vz_vcpu_uninit,
··· 2869 return ret + 1; 2870 } 2871 2872 + static int kvm_vz_enable_virtualization_cpu(void) 2873 { 2874 unsigned int mmu_size, guest_mmu_size, ftlb_size; 2875 u64 guest_cvmctl, cvmvmconfig; ··· 2983 return 0; 2984 } 2985 2986 + static void kvm_vz_disable_virtualization_cpu(void) 2987 { 2988 u64 cvmvmconfig; 2989 unsigned int mmu_size; ··· 3280 .handle_msa_disabled = kvm_trap_vz_handle_msa_disabled, 3281 .handle_guest_exit = kvm_trap_vz_handle_guest_exit, 3282 3283 + .enable_virtualization_cpu = kvm_vz_enable_virtualization_cpu, 3284 + .disable_virtualization_cpu = kvm_vz_disable_virtualization_cpu, 3285 .check_extension = kvm_vz_check_extension, 3286 .vcpu_init = kvm_vz_vcpu_init, 3287 .vcpu_uninit = kvm_vz_vcpu_uninit,
+2 -2
arch/riscv/kvm/main.c
··· 20 return -EINVAL; 21 } 22 23 - int kvm_arch_hardware_enable(void) 24 { 25 csr_write(CSR_HEDELEG, KVM_HEDELEG_DEFAULT); 26 csr_write(CSR_HIDELEG, KVM_HIDELEG_DEFAULT); ··· 35 return 0; 36 } 37 38 - void kvm_arch_hardware_disable(void) 39 { 40 kvm_riscv_aia_disable(); 41
··· 20 return -EINVAL; 21 } 22 23 + int kvm_arch_enable_virtualization_cpu(void) 24 { 25 csr_write(CSR_HEDELEG, KVM_HEDELEG_DEFAULT); 26 csr_write(CSR_HIDELEG, KVM_HIDELEG_DEFAULT); ··· 35 return 0; 36 } 37 38 + void kvm_arch_disable_virtualization_cpu(void) 39 { 40 kvm_riscv_aia_disable(); 41
+1
arch/s390/configs/debug_defconfig
··· 59 CONFIG_APPLDATA_BASE=y 60 CONFIG_S390_HYPFS_FS=y 61 CONFIG_KVM=m 62 CONFIG_S390_UNWIND_SELFTEST=m 63 CONFIG_S390_KPROBES_SANITY_TEST=m 64 CONFIG_S390_MODULES_SANITY_TEST=m
··· 59 CONFIG_APPLDATA_BASE=y 60 CONFIG_S390_HYPFS_FS=y 61 CONFIG_KVM=m 62 + CONFIG_KVM_S390_UCONTROL=y 63 CONFIG_S390_UNWIND_SELFTEST=m 64 CONFIG_S390_KPROBES_SANITY_TEST=m 65 CONFIG_S390_MODULES_SANITY_TEST=m
+18 -9
arch/s390/kvm/kvm-s390.c
··· 348 return cc == 0; 349 } 350 351 - static __always_inline void __insn32_query(unsigned int opcode, u8 *query) 352 { 353 asm volatile( 354 " lghi 0,0\n" 355 - " lgr 1,%[query]\n" 356 /* Parameter registers are ignored */ 357 - " .insn rrf,%[opc] << 16,2,4,6,0\n" 358 : 359 - : [query] "d" ((unsigned long)query), [opc] "i" (opcode) 360 - : "cc", "memory", "0", "1"); 361 } 362 363 - #define INSN_SORTL 0xb938 364 - #define INSN_DFLTCC 0xb939 365 366 static void __init kvm_s390_cpu_feat_init(void) 367 { ··· 424 kvm_s390_available_subfunc.kdsa); 425 426 if (test_facility(150)) /* SORTL */ 427 - __insn32_query(INSN_SORTL, kvm_s390_available_subfunc.sortl); 428 429 if (test_facility(151)) /* DFLTCC */ 430 - __insn32_query(INSN_DFLTCC, kvm_s390_available_subfunc.dfltcc); 431 432 if (MACHINE_HAS_ESOP) 433 allow_cpu_feat(KVM_S390_VM_CPU_FEAT_ESOP);
··· 348 return cc == 0; 349 } 350 351 + static __always_inline void __sortl_query(u8 (*query)[32]) 352 { 353 asm volatile( 354 " lghi 0,0\n" 355 + " la 1,%[query]\n" 356 /* Parameter registers are ignored */ 357 + " .insn rre,0xb9380000,2,4\n" 358 + : [query] "=R" (*query) 359 : 360 + : "cc", "0", "1"); 361 } 362 363 + static __always_inline void __dfltcc_query(u8 (*query)[32]) 364 + { 365 + asm volatile( 366 + " lghi 0,0\n" 367 + " la 1,%[query]\n" 368 + /* Parameter registers are ignored */ 369 + " .insn rrf,0xb9390000,2,4,6,0\n" 370 + : [query] "=R" (*query) 371 + : 372 + : "cc", "0", "1"); 373 + } 374 375 static void __init kvm_s390_cpu_feat_init(void) 376 { ··· 415 kvm_s390_available_subfunc.kdsa); 416 417 if (test_facility(150)) /* SORTL */ 418 + __sortl_query(&kvm_s390_available_subfunc.sortl); 419 420 if (test_facility(151)) /* DFLTCC */ 421 + __dfltcc_query(&kvm_s390_available_subfunc.dfltcc); 422 423 if (MACHINE_HAS_ESOP) 424 allow_cpu_feat(KVM_S390_VM_CPU_FEAT_ESOP);
+1
arch/x86/include/asm/cpuid.h
··· 179 case 0x1d: 180 case 0x1e: 181 case 0x1f: 182 case 0x8000001d: 183 return true; 184 }
··· 179 case 0x1d: 180 case 0x1e: 181 case 0x1f: 182 + case 0x24: 183 case 0x8000001d: 184 return true; 185 }
+3 -3
arch/x86/include/asm/kvm-x86-ops.h
··· 14 * be __static_call_return0. 15 */ 16 KVM_X86_OP(check_processor_compatibility) 17 - KVM_X86_OP(hardware_enable) 18 - KVM_X86_OP(hardware_disable) 19 KVM_X86_OP(hardware_unsetup) 20 KVM_X86_OP(has_emulated_msr) 21 KVM_X86_OP(vcpu_after_set_cpuid) ··· 125 KVM_X86_OP_OPTIONAL(vm_copy_enc_context_from) 126 KVM_X86_OP_OPTIONAL(vm_move_enc_context_from) 127 KVM_X86_OP_OPTIONAL(guest_memory_reclaimed) 128 - KVM_X86_OP(get_msr_feature) 129 KVM_X86_OP(check_emulate_instruction) 130 KVM_X86_OP(apic_init_signal_blocked) 131 KVM_X86_OP_OPTIONAL(enable_l2_tlb_flush)
··· 14 * be __static_call_return0. 15 */ 16 KVM_X86_OP(check_processor_compatibility) 17 + KVM_X86_OP(enable_virtualization_cpu) 18 + KVM_X86_OP(disable_virtualization_cpu) 19 KVM_X86_OP(hardware_unsetup) 20 KVM_X86_OP(has_emulated_msr) 21 KVM_X86_OP(vcpu_after_set_cpuid) ··· 125 KVM_X86_OP_OPTIONAL(vm_copy_enc_context_from) 126 KVM_X86_OP_OPTIONAL(vm_move_enc_context_from) 127 KVM_X86_OP_OPTIONAL(guest_memory_reclaimed) 128 + KVM_X86_OP(get_feature_msr) 129 KVM_X86_OP(check_emulate_instruction) 130 KVM_X86_OP(apic_init_signal_blocked) 131 KVM_X86_OP_OPTIONAL(enable_l2_tlb_flush)
+23 -9
arch/x86/include/asm/kvm_host.h
··· 36 #include <asm/kvm_page_track.h> 37 #include <asm/kvm_vcpu_regs.h> 38 #include <asm/hyperv-tlfs.h> 39 40 #define __KVM_HAVE_ARCH_VCPU_DEBUGFS 41 ··· 212 EXIT_FASTPATH_NONE, 213 EXIT_FASTPATH_REENTER_GUEST, 214 EXIT_FASTPATH_EXIT_HANDLED, 215 }; 216 typedef enum exit_fastpath_completion fastpath_t; 217 ··· 281 */ 282 #define PFERR_PRIVATE_ACCESS BIT_ULL(49) 283 #define PFERR_SYNTHETIC_MASK (PFERR_IMPLICIT_ACCESS | PFERR_PRIVATE_ACCESS) 284 - 285 - #define PFERR_NESTED_GUEST_PAGE (PFERR_GUEST_PAGE_MASK | \ 286 - PFERR_WRITE_MASK | \ 287 - PFERR_PRESENT_MASK) 288 289 /* apic attention bits */ 290 #define KVM_APIC_CHECK_VAPIC 0 ··· 1627 1628 int (*check_processor_compatibility)(void); 1629 1630 - int (*hardware_enable)(void); 1631 - void (*hardware_disable)(void); 1632 void (*hardware_unsetup)(void); 1633 bool (*has_emulated_msr)(struct kvm *kvm, u32 index); 1634 void (*vcpu_after_set_cpuid)(struct kvm_vcpu *vcpu); ··· 1727 void (*enable_nmi_window)(struct kvm_vcpu *vcpu); 1728 void (*enable_irq_window)(struct kvm_vcpu *vcpu); 1729 void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); 1730 const unsigned long required_apicv_inhibits; 1731 bool allow_apicv_in_x2apic_without_x2apic_virtualization; 1732 void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu); ··· 1808 int (*vm_move_enc_context_from)(struct kvm *kvm, unsigned int source_fd); 1809 void (*guest_memory_reclaimed)(struct kvm *kvm); 1810 1811 - int (*get_msr_feature)(struct kvm_msr_entry *entry); 1812 1813 int (*check_emulate_instruction)(struct kvm_vcpu *vcpu, int emul_type, 1814 void *insn, int insn_len); ··· 2062 2063 void kvm_enable_efer_bits(u64); 2064 bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer); 2065 int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, bool host_initiated); 2066 int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data); 2067 int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data); ··· 2140 2141 void kvm_update_dr7(struct kvm_vcpu *vcpu); 2142 2143 - int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn); 2144 void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu, 2145 ulong roots_to_free); 2146 void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu); ··· 2266 int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); 2267 int kvm_cpu_has_extint(struct kvm_vcpu *v); 2268 int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); 2269 int kvm_cpu_get_interrupt(struct kvm_vcpu *v); 2270 void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event); 2271 ··· 2358 KVM_X86_QUIRK_OUT_7E_INC_RIP | \ 2359 KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT | \ 2360 KVM_X86_QUIRK_FIX_HYPERCALL_INSN | \ 2361 - KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS) 2362 2363 /* 2364 * KVM previously used a u32 field in kvm_run to indicate the hypercall was
··· 36 #include <asm/kvm_page_track.h> 37 #include <asm/kvm_vcpu_regs.h> 38 #include <asm/hyperv-tlfs.h> 39 + #include <asm/reboot.h> 40 41 #define __KVM_HAVE_ARCH_VCPU_DEBUGFS 42 ··· 211 EXIT_FASTPATH_NONE, 212 EXIT_FASTPATH_REENTER_GUEST, 213 EXIT_FASTPATH_EXIT_HANDLED, 214 + EXIT_FASTPATH_EXIT_USERSPACE, 215 }; 216 typedef enum exit_fastpath_completion fastpath_t; 217 ··· 279 */ 280 #define PFERR_PRIVATE_ACCESS BIT_ULL(49) 281 #define PFERR_SYNTHETIC_MASK (PFERR_IMPLICIT_ACCESS | PFERR_PRIVATE_ACCESS) 282 283 /* apic attention bits */ 284 #define KVM_APIC_CHECK_VAPIC 0 ··· 1629 1630 int (*check_processor_compatibility)(void); 1631 1632 + int (*enable_virtualization_cpu)(void); 1633 + void (*disable_virtualization_cpu)(void); 1634 + cpu_emergency_virt_cb *emergency_disable_virtualization_cpu; 1635 + 1636 void (*hardware_unsetup)(void); 1637 bool (*has_emulated_msr)(struct kvm *kvm, u32 index); 1638 void (*vcpu_after_set_cpuid)(struct kvm_vcpu *vcpu); ··· 1727 void (*enable_nmi_window)(struct kvm_vcpu *vcpu); 1728 void (*enable_irq_window)(struct kvm_vcpu *vcpu); 1729 void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); 1730 + 1731 + const bool x2apic_icr_is_split; 1732 const unsigned long required_apicv_inhibits; 1733 bool allow_apicv_in_x2apic_without_x2apic_virtualization; 1734 void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu); ··· 1806 int (*vm_move_enc_context_from)(struct kvm *kvm, unsigned int source_fd); 1807 void (*guest_memory_reclaimed)(struct kvm *kvm); 1808 1809 + int (*get_feature_msr)(u32 msr, u64 *data); 1810 1811 int (*check_emulate_instruction)(struct kvm_vcpu *vcpu, int emul_type, 1812 void *insn, int insn_len); ··· 2060 2061 void kvm_enable_efer_bits(u64); 2062 bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer); 2063 + int kvm_get_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 *data); 2064 + int kvm_set_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 data); 2065 int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, bool host_initiated); 2066 int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data); 2067 int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data); ··· 2136 2137 void kvm_update_dr7(struct kvm_vcpu *vcpu); 2138 2139 + bool __kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, 2140 + bool always_retry); 2141 + 2142 + static inline bool kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, 2143 + gpa_t cr2_or_gpa) 2144 + { 2145 + return __kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa, false); 2146 + } 2147 + 2148 void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu, 2149 ulong roots_to_free); 2150 void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu); ··· 2254 int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); 2255 int kvm_cpu_has_extint(struct kvm_vcpu *v); 2256 int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); 2257 + int kvm_cpu_get_extint(struct kvm_vcpu *v); 2258 int kvm_cpu_get_interrupt(struct kvm_vcpu *v); 2259 void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event); 2260 ··· 2345 KVM_X86_QUIRK_OUT_7E_INC_RIP | \ 2346 KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT | \ 2347 KVM_X86_QUIRK_FIX_HYPERCALL_INSN | \ 2348 + KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS | \ 2349 + KVM_X86_QUIRK_SLOT_ZAP_ALL) 2350 2351 /* 2352 * KVM previously used a u32 field in kvm_run to indicate the hypercall was
+20 -14
arch/x86/include/asm/msr-index.h
··· 36 #define EFER_FFXSR (1<<_EFER_FFXSR) 37 #define EFER_AUTOIBRS (1<<_EFER_AUTOIBRS) 38 39 /* FRED MSRs */ 40 #define MSR_IA32_FRED_RSP0 0x1cc /* Level 0 stack pointer */ 41 #define MSR_IA32_FRED_RSP1 0x1cd /* Level 1 stack pointer */ ··· 378 #define MSR_MTRRdefType 0x000002ff 379 380 #define MSR_IA32_CR_PAT 0x00000277 381 382 #define MSR_IA32_DEBUGCTLMSR 0x000001d9 383 #define MSR_IA32_LASTBRANCHFROMIP 0x000001db ··· 1179 #define MSR_IA32_VMX_VMFUNC 0x00000491 1180 #define MSR_IA32_VMX_PROCBASED_CTLS3 0x00000492 1181 1182 - /* VMX_BASIC bits and bitmasks */ 1183 - #define VMX_BASIC_VMCS_SIZE_SHIFT 32 1184 - #define VMX_BASIC_TRUE_CTLS (1ULL << 55) 1185 - #define VMX_BASIC_64 0x0001000000000000LLU 1186 - #define VMX_BASIC_MEM_TYPE_SHIFT 50 1187 - #define VMX_BASIC_MEM_TYPE_MASK 0x003c000000000000LLU 1188 - #define VMX_BASIC_MEM_TYPE_WB 6LLU 1189 - #define VMX_BASIC_INOUT 0x0040000000000000LLU 1190 - 1191 /* Resctrl MSRs: */ 1192 /* - Intel: */ 1193 #define MSR_IA32_L3_QOS_CFG 0xc81 ··· 1195 #define MSR_IA32_MBA_BW_BASE 0xc0000200 1196 #define MSR_IA32_SMBA_BW_BASE 0xc0000280 1197 #define MSR_IA32_EVT_CFG_BASE 0xc0000400 1198 - 1199 - /* MSR_IA32_VMX_MISC bits */ 1200 - #define MSR_IA32_VMX_MISC_INTEL_PT (1ULL << 14) 1201 - #define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL << 29) 1202 - #define MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE 0x1F 1203 1204 /* AMD-V MSRs */ 1205 #define MSR_VM_CR 0xc0010114
··· 36 #define EFER_FFXSR (1<<_EFER_FFXSR) 37 #define EFER_AUTOIBRS (1<<_EFER_AUTOIBRS) 38 39 + /* 40 + * Architectural memory types that are common to MTRRs, PAT, VMX MSRs, etc. 41 + * Most MSRs support/allow only a subset of memory types, but the values 42 + * themselves are common across all relevant MSRs. 43 + */ 44 + #define X86_MEMTYPE_UC 0ull /* Uncacheable, a.k.a. Strong Uncacheable */ 45 + #define X86_MEMTYPE_WC 1ull /* Write Combining */ 46 + /* RESERVED 2 */ 47 + /* RESERVED 3 */ 48 + #define X86_MEMTYPE_WT 4ull /* Write Through */ 49 + #define X86_MEMTYPE_WP 5ull /* Write Protected */ 50 + #define X86_MEMTYPE_WB 6ull /* Write Back */ 51 + #define X86_MEMTYPE_UC_MINUS 7ull /* Weak Uncacheabled (PAT only) */ 52 + 53 /* FRED MSRs */ 54 #define MSR_IA32_FRED_RSP0 0x1cc /* Level 0 stack pointer */ 55 #define MSR_IA32_FRED_RSP1 0x1cd /* Level 1 stack pointer */ ··· 364 #define MSR_MTRRdefType 0x000002ff 365 366 #define MSR_IA32_CR_PAT 0x00000277 367 + 368 + #define PAT_VALUE(p0, p1, p2, p3, p4, p5, p6, p7) \ 369 + ((X86_MEMTYPE_ ## p0) | (X86_MEMTYPE_ ## p1 << 8) | \ 370 + (X86_MEMTYPE_ ## p2 << 16) | (X86_MEMTYPE_ ## p3 << 24) | \ 371 + (X86_MEMTYPE_ ## p4 << 32) | (X86_MEMTYPE_ ## p5 << 40) | \ 372 + (X86_MEMTYPE_ ## p6 << 48) | (X86_MEMTYPE_ ## p7 << 56)) 373 374 #define MSR_IA32_DEBUGCTLMSR 0x000001d9 375 #define MSR_IA32_LASTBRANCHFROMIP 0x000001db ··· 1159 #define MSR_IA32_VMX_VMFUNC 0x00000491 1160 #define MSR_IA32_VMX_PROCBASED_CTLS3 0x00000492 1161 1162 /* Resctrl MSRs: */ 1163 /* - Intel: */ 1164 #define MSR_IA32_L3_QOS_CFG 0xc81 ··· 1184 #define MSR_IA32_MBA_BW_BASE 0xc0000200 1185 #define MSR_IA32_SMBA_BW_BASE 0xc0000280 1186 #define MSR_IA32_EVT_CFG_BASE 0xc0000400 1187 1188 /* AMD-V MSRs */ 1189 #define MSR_VM_CR 0xc0010114
+1 -1
arch/x86/include/asm/reboot.h
··· 25 #define MRR_BIOS 0 26 #define MRR_APM 1 27 28 - #if IS_ENABLED(CONFIG_KVM_INTEL) || IS_ENABLED(CONFIG_KVM_AMD) 29 typedef void (cpu_emergency_virt_cb)(void); 30 void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback); 31 void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback); 32 void cpu_emergency_disable_virtualization(void);
··· 25 #define MRR_BIOS 0 26 #define MRR_APM 1 27 28 typedef void (cpu_emergency_virt_cb)(void); 29 + #if IS_ENABLED(CONFIG_KVM_INTEL) || IS_ENABLED(CONFIG_KVM_AMD) 30 void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback); 31 void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback); 32 void cpu_emergency_disable_virtualization(void);
+15 -5
arch/x86/include/asm/svm.h
··· 516 u32 ghcb_usage; 517 } __packed; 518 519 520 #define EXPECTED_VMCB_SAVE_AREA_SIZE 744 521 #define EXPECTED_GHCB_SAVE_AREA_SIZE 1032 ··· 546 BUILD_BUG_ON(sizeof(struct ghcb_save_area) != EXPECTED_GHCB_SAVE_AREA_SIZE); 547 BUILD_BUG_ON(sizeof(struct sev_es_save_area) != EXPECTED_SEV_ES_SAVE_AREA_SIZE); 548 BUILD_BUG_ON(sizeof(struct vmcb_control_area) != EXPECTED_VMCB_CONTROL_AREA_SIZE); 549 BUILD_BUG_ON(sizeof(struct ghcb) != EXPECTED_GHCB_SIZE); 550 551 /* Check offsets of reserved fields */ ··· 582 583 BUILD_BUG_RESERVED_OFFSET(ghcb, 0xff0); 584 } 585 - 586 - struct vmcb { 587 - struct vmcb_control_area control; 588 - struct vmcb_save_area save; 589 - } __packed; 590 591 #define SVM_CPUID_FUNC 0x8000000a 592
··· 516 u32 ghcb_usage; 517 } __packed; 518 519 + struct vmcb { 520 + struct vmcb_control_area control; 521 + union { 522 + struct vmcb_save_area save; 523 + 524 + /* 525 + * For SEV-ES VMs, the save area in the VMCB is used only to 526 + * save/load host state. Guest state resides in a separate 527 + * page, the aptly named VM Save Area (VMSA), that is encrypted 528 + * with the guest's private key. 529 + */ 530 + struct sev_es_save_area host_sev_es_save; 531 + }; 532 + } __packed; 533 534 #define EXPECTED_VMCB_SAVE_AREA_SIZE 744 535 #define EXPECTED_GHCB_SAVE_AREA_SIZE 1032 ··· 532 BUILD_BUG_ON(sizeof(struct ghcb_save_area) != EXPECTED_GHCB_SAVE_AREA_SIZE); 533 BUILD_BUG_ON(sizeof(struct sev_es_save_area) != EXPECTED_SEV_ES_SAVE_AREA_SIZE); 534 BUILD_BUG_ON(sizeof(struct vmcb_control_area) != EXPECTED_VMCB_CONTROL_AREA_SIZE); 535 + BUILD_BUG_ON(offsetof(struct vmcb, save) != EXPECTED_VMCB_CONTROL_AREA_SIZE); 536 BUILD_BUG_ON(sizeof(struct ghcb) != EXPECTED_GHCB_SIZE); 537 538 /* Check offsets of reserved fields */ ··· 567 568 BUILD_BUG_RESERVED_OFFSET(ghcb, 0xff0); 569 } 570 571 #define SVM_CPUID_FUNC 0x8000000a 572
+30 -10
arch/x86/include/asm/vmx.h
··· 122 123 #define VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR 0x000011ff 124 125 - #define VMX_MISC_PREEMPTION_TIMER_RATE_MASK 0x0000001f 126 - #define VMX_MISC_SAVE_EFER_LMA 0x00000020 127 - #define VMX_MISC_ACTIVITY_HLT 0x00000040 128 - #define VMX_MISC_ACTIVITY_WAIT_SIPI 0x00000100 129 - #define VMX_MISC_ZERO_LEN_INS 0x40000000 130 - #define VMX_MISC_MSR_LIST_MULTIPLIER 512 131 - 132 /* VMFUNC functions */ 133 #define VMFUNC_CONTROL_BIT(x) BIT((VMX_FEATURE_##x & 0x1f) - 28) 134 135 #define VMX_VMFUNC_EPTP_SWITCHING VMFUNC_CONTROL_BIT(EPTP_SWITCHING) 136 #define VMFUNC_EPTP_ENTRIES 512 137 138 static inline u32 vmx_basic_vmcs_revision_id(u64 vmx_basic) 139 { ··· 143 return (vmx_basic & GENMASK_ULL(44, 32)) >> 32; 144 } 145 146 static inline int vmx_misc_preemption_timer_rate(u64 vmx_misc) 147 { 148 - return vmx_misc & VMX_MISC_PREEMPTION_TIMER_RATE_MASK; 149 } 150 151 static inline int vmx_misc_cr3_count(u64 vmx_misc) ··· 527 #define VMX_EPTP_PWL_4 0x18ull 528 #define VMX_EPTP_PWL_5 0x20ull 529 #define VMX_EPTP_AD_ENABLE_BIT (1ull << 6) 530 #define VMX_EPTP_MT_MASK 0x7ull 531 - #define VMX_EPTP_MT_WB 0x6ull 532 - #define VMX_EPTP_MT_UC 0x0ull 533 #define VMX_EPT_READABLE_MASK 0x1ull 534 #define VMX_EPT_WRITABLE_MASK 0x2ull 535 #define VMX_EPT_EXECUTABLE_MASK 0x4ull
··· 122 123 #define VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR 0x000011ff 124 125 /* VMFUNC functions */ 126 #define VMFUNC_CONTROL_BIT(x) BIT((VMX_FEATURE_##x & 0x1f) - 28) 127 128 #define VMX_VMFUNC_EPTP_SWITCHING VMFUNC_CONTROL_BIT(EPTP_SWITCHING) 129 #define VMFUNC_EPTP_ENTRIES 512 130 + 131 + #define VMX_BASIC_32BIT_PHYS_ADDR_ONLY BIT_ULL(48) 132 + #define VMX_BASIC_DUAL_MONITOR_TREATMENT BIT_ULL(49) 133 + #define VMX_BASIC_INOUT BIT_ULL(54) 134 + #define VMX_BASIC_TRUE_CTLS BIT_ULL(55) 135 136 static inline u32 vmx_basic_vmcs_revision_id(u64 vmx_basic) 137 { ··· 145 return (vmx_basic & GENMASK_ULL(44, 32)) >> 32; 146 } 147 148 + static inline u32 vmx_basic_vmcs_mem_type(u64 vmx_basic) 149 + { 150 + return (vmx_basic & GENMASK_ULL(53, 50)) >> 50; 151 + } 152 + 153 + static inline u64 vmx_basic_encode_vmcs_info(u32 revision, u16 size, u8 memtype) 154 + { 155 + return revision | ((u64)size << 32) | ((u64)memtype << 50); 156 + } 157 + 158 + #define VMX_MISC_SAVE_EFER_LMA BIT_ULL(5) 159 + #define VMX_MISC_ACTIVITY_HLT BIT_ULL(6) 160 + #define VMX_MISC_ACTIVITY_SHUTDOWN BIT_ULL(7) 161 + #define VMX_MISC_ACTIVITY_WAIT_SIPI BIT_ULL(8) 162 + #define VMX_MISC_INTEL_PT BIT_ULL(14) 163 + #define VMX_MISC_RDMSR_IN_SMM BIT_ULL(15) 164 + #define VMX_MISC_VMXOFF_BLOCK_SMI BIT_ULL(28) 165 + #define VMX_MISC_VMWRITE_SHADOW_RO_FIELDS BIT_ULL(29) 166 + #define VMX_MISC_ZERO_LEN_INS BIT_ULL(30) 167 + #define VMX_MISC_MSR_LIST_MULTIPLIER 512 168 + 169 static inline int vmx_misc_preemption_timer_rate(u64 vmx_misc) 170 { 171 + return vmx_misc & GENMASK_ULL(4, 0); 172 } 173 174 static inline int vmx_misc_cr3_count(u64 vmx_misc) ··· 508 #define VMX_EPTP_PWL_4 0x18ull 509 #define VMX_EPTP_PWL_5 0x20ull 510 #define VMX_EPTP_AD_ENABLE_BIT (1ull << 6) 511 + /* The EPTP memtype is encoded in bits 2:0, i.e. doesn't need to be shifted. */ 512 #define VMX_EPTP_MT_MASK 0x7ull 513 + #define VMX_EPTP_MT_WB X86_MEMTYPE_WB 514 + #define VMX_EPTP_MT_UC X86_MEMTYPE_UC 515 #define VMX_EPT_READABLE_MASK 0x1ull 516 #define VMX_EPT_WRITABLE_MASK 0x2ull 517 #define VMX_EPT_EXECUTABLE_MASK 0x4ull
+1
arch/x86/include/uapi/asm/kvm.h
··· 439 #define KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT (1 << 4) 440 #define KVM_X86_QUIRK_FIX_HYPERCALL_INSN (1 << 5) 441 #define KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS (1 << 6) 442 443 #define KVM_STATE_NESTED_FORMAT_VMX 0 444 #define KVM_STATE_NESTED_FORMAT_SVM 1
··· 439 #define KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT (1 << 4) 440 #define KVM_X86_QUIRK_FIX_HYPERCALL_INSN (1 << 5) 441 #define KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS (1 << 6) 442 + #define KVM_X86_QUIRK_SLOT_ZAP_ALL (1 << 7) 443 444 #define KVM_STATE_NESTED_FORMAT_VMX 0 445 #define KVM_STATE_NESTED_FORMAT_SVM 1
+6
arch/x86/kernel/cpu/mtrr/mtrr.c
··· 55 56 #include "mtrr.h" 57 58 /* arch_phys_wc_add returns an MTRR register index plus this offset. */ 59 #define MTRR_TO_PHYS_WC_OFFSET 1000 60
··· 55 56 #include "mtrr.h" 57 58 + static_assert(X86_MEMTYPE_UC == MTRR_TYPE_UNCACHABLE); 59 + static_assert(X86_MEMTYPE_WC == MTRR_TYPE_WRCOMB); 60 + static_assert(X86_MEMTYPE_WT == MTRR_TYPE_WRTHROUGH); 61 + static_assert(X86_MEMTYPE_WP == MTRR_TYPE_WRPROT); 62 + static_assert(X86_MEMTYPE_WB == MTRR_TYPE_WRBACK); 63 + 64 /* arch_phys_wc_add returns an MTRR register index plus this offset. */ 65 #define MTRR_TO_PHYS_WC_OFFSET 1000 66
+28 -2
arch/x86/kvm/cpuid.c
··· 705 706 kvm_cpu_cap_init_kvm_defined(CPUID_7_1_EDX, 707 F(AVX_VNNI_INT8) | F(AVX_NE_CONVERT) | F(PREFETCHITI) | 708 - F(AMX_COMPLEX) 709 ); 710 711 kvm_cpu_cap_init_kvm_defined(CPUID_7_2_EDX, ··· 719 720 kvm_cpu_cap_init_kvm_defined(CPUID_12_EAX, 721 SF(SGX1) | SF(SGX2) | SF(SGX_EDECCSSA) 722 ); 723 724 kvm_cpu_cap_mask(CPUID_8000_0001_ECX, ··· 953 switch (function) { 954 case 0: 955 /* Limited to the highest leaf implemented in KVM. */ 956 - entry->eax = min(entry->eax, 0x1fU); 957 break; 958 case 1: 959 cpuid_entry_override(entry, CPUID_1_EDX); ··· 1178 break; 1179 } 1180 break; 1181 case KVM_CPUID_SIGNATURE: { 1182 const u32 *sigptr = (const u32 *)KVM_SIGNATURE; 1183 entry->eax = KVM_CPUID_FEATURES;
··· 705 706 kvm_cpu_cap_init_kvm_defined(CPUID_7_1_EDX, 707 F(AVX_VNNI_INT8) | F(AVX_NE_CONVERT) | F(PREFETCHITI) | 708 + F(AMX_COMPLEX) | F(AVX10) 709 ); 710 711 kvm_cpu_cap_init_kvm_defined(CPUID_7_2_EDX, ··· 719 720 kvm_cpu_cap_init_kvm_defined(CPUID_12_EAX, 721 SF(SGX1) | SF(SGX2) | SF(SGX_EDECCSSA) 722 + ); 723 + 724 + kvm_cpu_cap_init_kvm_defined(CPUID_24_0_EBX, 725 + F(AVX10_128) | F(AVX10_256) | F(AVX10_512) 726 ); 727 728 kvm_cpu_cap_mask(CPUID_8000_0001_ECX, ··· 949 switch (function) { 950 case 0: 951 /* Limited to the highest leaf implemented in KVM. */ 952 + entry->eax = min(entry->eax, 0x24U); 953 break; 954 case 1: 955 cpuid_entry_override(entry, CPUID_1_EDX); ··· 1174 break; 1175 } 1176 break; 1177 + case 0x24: { 1178 + u8 avx10_version; 1179 + 1180 + if (!kvm_cpu_cap_has(X86_FEATURE_AVX10)) { 1181 + entry->eax = entry->ebx = entry->ecx = entry->edx = 0; 1182 + break; 1183 + } 1184 + 1185 + /* 1186 + * The AVX10 version is encoded in EBX[7:0]. Note, the version 1187 + * is guaranteed to be >=1 if AVX10 is supported. Note #2, the 1188 + * version needs to be captured before overriding EBX features! 1189 + */ 1190 + avx10_version = min_t(u8, entry->ebx & 0xff, 1); 1191 + cpuid_entry_override(entry, CPUID_24_0_EBX); 1192 + entry->ebx |= avx10_version; 1193 + 1194 + entry->eax = 0; 1195 + entry->ecx = 0; 1196 + entry->edx = 0; 1197 + break; 1198 + } 1199 case KVM_CPUID_SIGNATURE: { 1200 const u32 *sigptr = (const u32 *)KVM_SIGNATURE; 1201 entry->eax = KVM_CPUID_FEATURES;
+7 -3
arch/x86/kvm/irq.c
··· 108 * Read pending interrupt(from non-APIC source) 109 * vector and intack. 110 */ 111 - static int kvm_cpu_get_extint(struct kvm_vcpu *v) 112 { 113 if (!kvm_cpu_has_extint(v)) { 114 WARN_ON(!lapic_in_kernel(v)); ··· 131 } else 132 return kvm_pic_read_irq(v->kvm); /* PIC */ 133 } 134 135 /* 136 * Read pending interrupt vector and intack. ··· 142 if (vector != -1) 143 return vector; /* PIC */ 144 145 - return kvm_get_apic_interrupt(v); /* APIC */ 146 } 147 - EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt); 148 149 void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu) 150 {
··· 108 * Read pending interrupt(from non-APIC source) 109 * vector and intack. 110 */ 111 + int kvm_cpu_get_extint(struct kvm_vcpu *v) 112 { 113 if (!kvm_cpu_has_extint(v)) { 114 WARN_ON(!lapic_in_kernel(v)); ··· 131 } else 132 return kvm_pic_read_irq(v->kvm); /* PIC */ 133 } 134 + EXPORT_SYMBOL_GPL(kvm_cpu_get_extint); 135 136 /* 137 * Read pending interrupt vector and intack. ··· 141 if (vector != -1) 142 return vector; /* PIC */ 143 144 + vector = kvm_apic_has_interrupt(v); /* APIC */ 145 + if (vector != -1) 146 + kvm_apic_ack_interrupt(v, vector); 147 + 148 + return vector; 149 } 150 151 void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu) 152 {
+57 -27
arch/x86/kvm/lapic.c
··· 1944 u64 ns = 0; 1945 ktime_t expire; 1946 struct kvm_vcpu *vcpu = apic->vcpu; 1947 - unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz; 1948 unsigned long flags; 1949 ktime_t now; 1950 ··· 2453 } 2454 EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi); 2455 2456 /* emulate APIC access in a trap manner */ 2457 void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset) 2458 { ··· 2507 * maybe-unecessary write, and both are in the noise anyways. 2508 */ 2509 if (apic_x2apic_mode(apic) && offset == APIC_ICR) 2510 - kvm_x2apic_icr_write(apic, kvm_lapic_get_reg64(apic, APIC_ICR)); 2511 else 2512 kvm_lapic_reg_write(apic, offset, kvm_lapic_get_reg(apic, offset)); 2513 } ··· 2959 } 2960 } 2961 2962 - int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) 2963 { 2964 - int vector = kvm_apic_has_interrupt(vcpu); 2965 struct kvm_lapic *apic = vcpu->arch.apic; 2966 u32 ppr; 2967 2968 - if (vector == -1) 2969 - return -1; 2970 2971 /* 2972 * We get here even with APIC virtualization enabled, if doing ··· 2993 __apic_update_ppr(apic, &ppr); 2994 } 2995 2996 - return vector; 2997 } 2998 2999 static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu, 3000 struct kvm_lapic_state *s, bool set) ··· 3026 3027 /* 3028 * In x2APIC mode, the LDR is fixed and based on the id. And 3029 - * ICR is internally a single 64-bit register, but needs to be 3030 - * split to ICR+ICR2 in userspace for backwards compatibility. 3031 */ 3032 - if (set) { 3033 *ldr = kvm_apic_calc_x2apic_ldr(x2apic_id); 3034 3035 - icr = __kvm_lapic_get_reg(s->regs, APIC_ICR) | 3036 - (u64)__kvm_lapic_get_reg(s->regs, APIC_ICR2) << 32; 3037 - __kvm_lapic_set_reg64(s->regs, APIC_ICR, icr); 3038 - } else { 3039 - icr = __kvm_lapic_get_reg64(s->regs, APIC_ICR); 3040 - __kvm_lapic_set_reg(s->regs, APIC_ICR2, icr >> 32); 3041 } 3042 } 3043 ··· 3234 return 0; 3235 } 3236 3237 - int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data) 3238 - { 3239 - data &= ~APIC_ICR_BUSY; 3240 - 3241 - kvm_apic_send_ipi(apic, (u32)data, (u32)(data >> 32)); 3242 - kvm_lapic_set_reg64(apic, APIC_ICR, data); 3243 - trace_kvm_apic_write(APIC_ICR, data); 3244 - return 0; 3245 - } 3246 - 3247 static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data) 3248 { 3249 u32 low; 3250 3251 if (reg == APIC_ICR) { 3252 - *data = kvm_lapic_get_reg64(apic, APIC_ICR); 3253 return 0; 3254 } 3255
··· 1944 u64 ns = 0; 1945 ktime_t expire; 1946 struct kvm_vcpu *vcpu = apic->vcpu; 1947 + u32 this_tsc_khz = vcpu->arch.virtual_tsc_khz; 1948 unsigned long flags; 1949 ktime_t now; 1950 ··· 2453 } 2454 EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi); 2455 2456 + #define X2APIC_ICR_RESERVED_BITS (GENMASK_ULL(31, 20) | GENMASK_ULL(17, 16) | BIT(13)) 2457 + 2458 + int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data) 2459 + { 2460 + if (data & X2APIC_ICR_RESERVED_BITS) 2461 + return 1; 2462 + 2463 + /* 2464 + * The BUSY bit is reserved on both Intel and AMD in x2APIC mode, but 2465 + * only AMD requires it to be zero, Intel essentially just ignores the 2466 + * bit. And if IPI virtualization (Intel) or x2AVIC (AMD) is enabled, 2467 + * the CPU performs the reserved bits checks, i.e. the underlying CPU 2468 + * behavior will "win". Arbitrarily clear the BUSY bit, as there is no 2469 + * sane way to provide consistent behavior with respect to hardware. 2470 + */ 2471 + data &= ~APIC_ICR_BUSY; 2472 + 2473 + kvm_apic_send_ipi(apic, (u32)data, (u32)(data >> 32)); 2474 + if (kvm_x86_ops.x2apic_icr_is_split) { 2475 + kvm_lapic_set_reg(apic, APIC_ICR, data); 2476 + kvm_lapic_set_reg(apic, APIC_ICR2, data >> 32); 2477 + } else { 2478 + kvm_lapic_set_reg64(apic, APIC_ICR, data); 2479 + } 2480 + trace_kvm_apic_write(APIC_ICR, data); 2481 + return 0; 2482 + } 2483 + 2484 + static u64 kvm_x2apic_icr_read(struct kvm_lapic *apic) 2485 + { 2486 + if (kvm_x86_ops.x2apic_icr_is_split) 2487 + return (u64)kvm_lapic_get_reg(apic, APIC_ICR) | 2488 + (u64)kvm_lapic_get_reg(apic, APIC_ICR2) << 32; 2489 + 2490 + return kvm_lapic_get_reg64(apic, APIC_ICR); 2491 + } 2492 + 2493 /* emulate APIC access in a trap manner */ 2494 void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset) 2495 { ··· 2470 * maybe-unecessary write, and both are in the noise anyways. 2471 */ 2472 if (apic_x2apic_mode(apic) && offset == APIC_ICR) 2473 + WARN_ON_ONCE(kvm_x2apic_icr_write(apic, kvm_x2apic_icr_read(apic))); 2474 else 2475 kvm_lapic_reg_write(apic, offset, kvm_lapic_get_reg(apic, offset)); 2476 } ··· 2922 } 2923 } 2924 2925 + void kvm_apic_ack_interrupt(struct kvm_vcpu *vcpu, int vector) 2926 { 2927 struct kvm_lapic *apic = vcpu->arch.apic; 2928 u32 ppr; 2929 2930 + if (WARN_ON_ONCE(vector < 0 || !apic)) 2931 + return; 2932 2933 /* 2934 * We get here even with APIC virtualization enabled, if doing ··· 2957 __apic_update_ppr(apic, &ppr); 2958 } 2959 2960 } 2961 + EXPORT_SYMBOL_GPL(kvm_apic_ack_interrupt); 2962 2963 static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu, 2964 struct kvm_lapic_state *s, bool set) ··· 2990 2991 /* 2992 * In x2APIC mode, the LDR is fixed and based on the id. And 2993 + * if the ICR is _not_ split, ICR is internally a single 64-bit 2994 + * register, but needs to be split to ICR+ICR2 in userspace for 2995 + * backwards compatibility. 
2996 */ 2997 + if (set) 2998 *ldr = kvm_apic_calc_x2apic_ldr(x2apic_id); 2999 3000 + if (!kvm_x86_ops.x2apic_icr_is_split) { 3001 + if (set) { 3002 + icr = __kvm_lapic_get_reg(s->regs, APIC_ICR) | 3003 + (u64)__kvm_lapic_get_reg(s->regs, APIC_ICR2) << 32; 3004 + __kvm_lapic_set_reg64(s->regs, APIC_ICR, icr); 3005 + } else { 3006 + icr = __kvm_lapic_get_reg64(s->regs, APIC_ICR); 3007 + __kvm_lapic_set_reg(s->regs, APIC_ICR2, icr >> 32); 3008 + } 3009 } 3010 } 3011 ··· 3194 return 0; 3195 } 3196 3197 static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data) 3198 { 3199 u32 low; 3200 3201 if (reg == APIC_ICR) { 3202 + *data = kvm_x2apic_icr_read(apic); 3203 return 0; 3204 } 3205
+1 -2
arch/x86/kvm/lapic.h
··· 88 void kvm_free_lapic(struct kvm_vcpu *vcpu); 89 90 int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu); 91 int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu); 92 - int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu); 93 int kvm_apic_accept_events(struct kvm_vcpu *vcpu); 94 void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event); 95 u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu); 96 void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); 97 void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu); 98 void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); 99 - u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu); 100 void kvm_recalculate_apic_map(struct kvm *kvm); 101 void kvm_apic_set_version(struct kvm_vcpu *vcpu); 102 void kvm_apic_after_set_mcg_cap(struct kvm_vcpu *vcpu);
··· 88 void kvm_free_lapic(struct kvm_vcpu *vcpu); 89 90 int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu); 91 + void kvm_apic_ack_interrupt(struct kvm_vcpu *vcpu, int vector); 92 int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu); 93 int kvm_apic_accept_events(struct kvm_vcpu *vcpu); 94 void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event); 95 u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu); 96 void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); 97 void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu); 98 void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); 99 void kvm_recalculate_apic_map(struct kvm *kvm); 100 void kvm_apic_set_version(struct kvm_vcpu *vcpu); 101 void kvm_apic_after_set_mcg_cap(struct kvm_vcpu *vcpu);
-2
arch/x86/kvm/mmu.h
··· 223 224 bool kvm_mmu_may_ignore_guest_pat(void); 225 226 - int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu); 227 - 228 int kvm_mmu_post_init_vm(struct kvm *kvm); 229 void kvm_mmu_pre_destroy_vm(struct kvm *kvm); 230
··· 223 224 bool kvm_mmu_may_ignore_guest_pat(void); 225 226 int kvm_mmu_post_init_vm(struct kvm *kvm); 227 void kvm_mmu_pre_destroy_vm(struct kvm *kvm); 228
+320 -238
arch/x86/kvm/mmu/mmu.c
··· 614 return __get_spte_lockless(sptep); 615 } 616 617 - /* Returns the Accessed status of the PTE and resets it at the same time. */ 618 - static bool mmu_spte_age(u64 *sptep) 619 - { 620 - u64 spte = mmu_spte_get_lockless(sptep); 621 - 622 - if (!is_accessed_spte(spte)) 623 - return false; 624 - 625 - if (spte_ad_enabled(spte)) { 626 - clear_bit((ffs(shadow_accessed_mask) - 1), 627 - (unsigned long *)sptep); 628 - } else { 629 - /* 630 - * Capture the dirty status of the page, so that it doesn't get 631 - * lost when the SPTE is marked for access tracking. 632 - */ 633 - if (is_writable_pte(spte)) 634 - kvm_set_pfn_dirty(spte_to_pfn(spte)); 635 - 636 - spte = mark_spte_for_access_track(spte); 637 - mmu_spte_update_no_track(sptep, spte); 638 - } 639 - 640 - return true; 641 - } 642 - 643 static inline bool is_tdp_mmu_active(struct kvm_vcpu *vcpu) 644 { 645 return tdp_mmu_enabled && vcpu->arch.mmu->root_role.direct; ··· 912 * in this rmap chain. Otherwise, (rmap_head->val & ~1) points to a struct 913 * pte_list_desc containing more mappings. 914 */ 915 916 /* 917 * Returns the number of pointers in the rmap chain, not counting the new one. ··· 925 926 if (!rmap_head->val) { 927 rmap_head->val = (unsigned long)spte; 928 - } else if (!(rmap_head->val & 1)) { 929 desc = kvm_mmu_memory_cache_alloc(cache); 930 desc->sptes[0] = (u64 *)rmap_head->val; 931 desc->sptes[1] = spte; 932 desc->spte_count = 2; 933 desc->tail_count = 0; 934 - rmap_head->val = (unsigned long)desc | 1; 935 ++count; 936 } else { 937 - desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); 938 count = desc->tail_count + desc->spte_count; 939 940 /* ··· 943 */ 944 if (desc->spte_count == PTE_LIST_EXT) { 945 desc = kvm_mmu_memory_cache_alloc(cache); 946 - desc->more = (struct pte_list_desc *)(rmap_head->val & ~1ul); 947 desc->spte_count = 0; 948 desc->tail_count = count; 949 - rmap_head->val = (unsigned long)desc | 1; 950 } 951 desc->sptes[desc->spte_count++] = spte; 952 } ··· 957 struct kvm_rmap_head *rmap_head, 958 struct pte_list_desc *desc, int i) 959 { 960 - struct pte_list_desc *head_desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); 961 int j = head_desc->spte_count - 1; 962 963 /* ··· 986 if (!head_desc->more) 987 rmap_head->val = 0; 988 else 989 - rmap_head->val = (unsigned long)head_desc->more | 1; 990 mmu_free_pte_list_desc(head_desc); 991 } 992 ··· 999 if (KVM_BUG_ON_DATA_CORRUPTION(!rmap_head->val, kvm)) 1000 return; 1001 1002 - if (!(rmap_head->val & 1)) { 1003 if (KVM_BUG_ON_DATA_CORRUPTION((u64 *)rmap_head->val != spte, kvm)) 1004 return; 1005 1006 rmap_head->val = 0; 1007 } else { 1008 - desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); 1009 while (desc) { 1010 for (i = 0; i < desc->spte_count; ++i) { 1011 if (desc->sptes[i] == spte) { ··· 1038 if (!rmap_head->val) 1039 return false; 1040 1041 - if (!(rmap_head->val & 1)) { 1042 mmu_spte_clear_track_bits(kvm, (u64 *)rmap_head->val); 1043 goto out; 1044 } 1045 1046 - desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); 1047 1048 for (; desc; desc = next) { 1049 for (i = 0; i < desc->spte_count; i++) ··· 1063 1064 if (!rmap_head->val) 1065 return 0; 1066 - else if (!(rmap_head->val & 1)) 1067 return 1; 1068 1069 - desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); 1070 return desc->tail_count + desc->spte_count; 1071 } 1072 ··· 1128 if (!rmap_head->val) 1129 return NULL; 1130 1131 - if (!(rmap_head->val & 1)) { 1132 iter->desc = NULL; 1133 sptep = (u64 *)rmap_head->val; 1134 goto out; 1135 } 1136 1137 - iter->desc = (struct pte_list_desc 
*)(rmap_head->val & ~1ul); 1138 iter->pos = 0; 1139 sptep = iter->desc->sptes[iter->pos]; 1140 out: ··· 1282 return flush; 1283 } 1284 1285 - /** 1286 - * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages 1287 - * @kvm: kvm instance 1288 - * @slot: slot to protect 1289 - * @gfn_offset: start of the BITS_PER_LONG pages we care about 1290 - * @mask: indicates which pages we should protect 1291 - * 1292 - * Used when we do not need to care about huge page mappings. 1293 - */ 1294 static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, 1295 struct kvm_memory_slot *slot, 1296 gfn_t gfn_offset, unsigned long mask) ··· 1305 } 1306 } 1307 1308 - /** 1309 - * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages, or write 1310 - * protect the page if the D-bit isn't supported. 1311 - * @kvm: kvm instance 1312 - * @slot: slot to clear D-bit 1313 - * @gfn_offset: start of the BITS_PER_LONG pages we care about 1314 - * @mask: indicates which pages we should clear D-bit 1315 - * 1316 - * Used for PML to re-log the dirty GPAs after userspace querying dirty_bitmap. 1317 - */ 1318 static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, 1319 struct kvm_memory_slot *slot, 1320 gfn_t gfn_offset, unsigned long mask) ··· 1328 } 1329 } 1330 1331 - /** 1332 - * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected 1333 - * PT level pages. 1334 - * 1335 - * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to 1336 - * enable dirty logging for them. 1337 - * 1338 - * We need to care about huge page mappings: e.g. during dirty logging we may 1339 - * have such mappings. 1340 - */ 1341 void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, 1342 struct kvm_memory_slot *slot, 1343 gfn_t gfn_offset, unsigned long mask) 1344 { 1345 /* 1346 - * Huge pages are NOT write protected when we start dirty logging in 1347 - * initially-all-set mode; must write protect them here so that they 1348 - * are split to 4K on the first write. 1349 * 1350 * The gfn_offset is guaranteed to be aligned to 64, but the base_gfn 1351 * of memslot has no such restriction, so the range can cross two large ··· 1359 PG_LEVEL_2M); 1360 } 1361 1362 - /* Now handle 4K PTEs. 
*/ 1363 if (kvm_x86_ops.cpu_dirty_log_size) 1364 kvm_mmu_clear_dirty_pt_masked(kvm, slot, gfn_offset, mask); 1365 else ··· 1410 return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn, PG_LEVEL_4K); 1411 } 1412 1413 - static bool __kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, 1414 - const struct kvm_memory_slot *slot) 1415 { 1416 return kvm_zap_all_rmap_sptes(kvm, rmap_head); 1417 - } 1418 - 1419 - static bool kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, 1420 - struct kvm_memory_slot *slot, gfn_t gfn, int level) 1421 - { 1422 - return __kvm_zap_rmap(kvm, rmap_head, slot); 1423 } 1424 1425 struct slot_rmap_walk_iterator { ··· 1464 static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator) 1465 { 1466 while (++iterator->rmap <= iterator->end_rmap) { 1467 - iterator->gfn += (1UL << KVM_HPAGE_GFN_SHIFT(iterator->level)); 1468 1469 if (iterator->rmap->val) 1470 return; ··· 1485 slot_rmap_walk_okay(_iter_); \ 1486 slot_rmap_walk_next(_iter_)) 1487 1488 - typedef bool (*rmap_handler_t)(struct kvm *kvm, struct kvm_rmap_head *rmap_head, 1489 - struct kvm_memory_slot *slot, gfn_t gfn, 1490 - int level); 1491 1492 - static __always_inline bool kvm_handle_gfn_range(struct kvm *kvm, 1493 - struct kvm_gfn_range *range, 1494 - rmap_handler_t handler) 1495 { 1496 struct slot_rmap_walk_iterator iterator; 1497 - bool ret = false; 1498 1499 - for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, 1500 - range->start, range->end - 1, &iterator) 1501 - ret |= handler(kvm, iterator.rmap, range->slot, iterator.gfn, 1502 - iterator.level); 1503 1504 - return ret; 1505 } 1506 1507 bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) ··· 1557 bool flush = false; 1558 1559 if (kvm_memslots_have_rmaps(kvm)) 1560 - flush = kvm_handle_gfn_range(kvm, range, kvm_zap_rmap); 1561 1562 if (tdp_mmu_enabled) 1563 flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush); ··· 1569 kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD); 1570 1571 return flush; 1572 - } 1573 - 1574 - static bool kvm_age_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, 1575 - struct kvm_memory_slot *slot, gfn_t gfn, int level) 1576 - { 1577 - u64 *sptep; 1578 - struct rmap_iterator iter; 1579 - int young = 0; 1580 - 1581 - for_each_rmap_spte(rmap_head, &iter, sptep) 1582 - young |= mmu_spte_age(sptep); 1583 - 1584 - return young; 1585 - } 1586 - 1587 - static bool kvm_test_age_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, 1588 - struct kvm_memory_slot *slot, gfn_t gfn, int level) 1589 - { 1590 - u64 *sptep; 1591 - struct rmap_iterator iter; 1592 - 1593 - for_each_rmap_spte(rmap_head, &iter, sptep) 1594 - if (is_accessed_spte(*sptep)) 1595 - return true; 1596 - return false; 1597 } 1598 1599 #define RMAP_RECYCLE_THRESHOLD 1000 ··· 1605 __rmap_add(vcpu->kvm, cache, slot, spte, gfn, access); 1606 } 1607 1608 bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) 1609 { 1610 bool young = false; 1611 1612 if (kvm_memslots_have_rmaps(kvm)) 1613 - young = kvm_handle_gfn_range(kvm, range, kvm_age_rmap); 1614 1615 if (tdp_mmu_enabled) 1616 young |= kvm_tdp_mmu_age_gfn_range(kvm, range); ··· 1663 bool young = false; 1664 1665 if (kvm_memslots_have_rmaps(kvm)) 1666 - young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmap); 1667 1668 if (tdp_mmu_enabled) 1669 young |= kvm_tdp_mmu_test_age_gfn(kvm, range); ··· 2729 write_unlock(&kvm->mmu_lock); 2730 } 2731 2732 - int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) 2733 { 2734 - struct 
kvm_mmu_page *sp; 2735 LIST_HEAD(invalid_list); 2736 - int r; 2737 2738 - r = 0; 2739 - write_lock(&kvm->mmu_lock); 2740 - for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn) { 2741 - r = 1; 2742 - kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); 2743 } 2744 kvm_mmu_commit_zap_page(kvm, &invalid_list); 2745 write_unlock(&kvm->mmu_lock); 2746 2747 - return r; 2748 - } 2749 - 2750 - static int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) 2751 - { 2752 - gpa_t gpa; 2753 - int r; 2754 - 2755 - if (vcpu->arch.mmu->root_role.direct) 2756 - return 0; 2757 - 2758 - gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); 2759 - 2760 - r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); 2761 - 2762 return r; 2763 } 2764 ··· 2943 trace_kvm_mmu_set_spte(level, gfn, sptep); 2944 } 2945 2946 - if (wrprot) { 2947 - if (write_fault) 2948 - ret = RET_PF_EMULATE; 2949 - } 2950 2951 if (flush) 2952 kvm_flush_remote_tlbs_gfn(vcpu->kvm, gfn, level); ··· 4576 return RET_PF_RETRY; 4577 4578 if (page_fault_handle_page_track(vcpu, fault)) 4579 - return RET_PF_EMULATE; 4580 4581 r = fast_page_fault(vcpu, fault); 4582 if (r != RET_PF_INVALID) ··· 4645 if (!flags) { 4646 trace_kvm_page_fault(vcpu, fault_address, error_code); 4647 4648 - if (kvm_event_needs_reinjection(vcpu)) 4649 - kvm_mmu_unprotect_page_virt(vcpu, fault_address); 4650 r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn, 4651 insn_len); 4652 } else if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) { ··· 4667 int r; 4668 4669 if (page_fault_handle_page_track(vcpu, fault)) 4670 - return RET_PF_EMULATE; 4671 4672 r = fast_page_fault(vcpu, fault); 4673 if (r != RET_PF_INVALID) ··· 4744 switch (r) { 4745 case RET_PF_FIXED: 4746 case RET_PF_SPURIOUS: 4747 return 0; 4748 4749 case RET_PF_EMULATE: ··· 5989 write_unlock(&vcpu->kvm->mmu_lock); 5990 } 5991 5992 int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, 5993 void *insn, int insn_len) 5994 { ··· 6134 if (r < 0) 6135 return r; 6136 6137 if (r == RET_PF_FIXED) 6138 vcpu->stat.pf_fixed++; 6139 else if (r == RET_PF_EMULATE) ··· 6148 if (r != RET_PF_EMULATE) 6149 return 1; 6150 6151 - /* 6152 - * Before emulating the instruction, check if the error code 6153 - * was due to a RO violation while translating the guest page. 6154 - * This can occur when using nested virtualization with nested 6155 - * paging in both guests. If true, we simply unprotect the page 6156 - * and resume the guest. 6157 - */ 6158 - if (vcpu->arch.mmu->root_role.direct && 6159 - (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) { 6160 - kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa)); 6161 - return 1; 6162 - } 6163 - 6164 - /* 6165 - * vcpu->arch.mmu.page_fault returned RET_PF_EMULATE, but we can still 6166 - * optimistically try to just unprotect the page and let the processor 6167 - * re-execute the instruction that caused the page fault. Do not allow 6168 - * retrying MMIO emulation, as it's not only pointless but could also 6169 - * cause us to enter an infinite loop because the processor will keep 6170 - * faulting on the non-existent MMIO address. Retrying an instruction 6171 - * from a nested guest is also pointless and dangerous as we are only 6172 - * explicitly shadowing L1's page tables, i.e. unprotecting something 6173 - * for L1 isn't going to magically fix whatever issue cause L2 to fail. 
6174 - */ 6175 - if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu)) 6176 - emulation_type |= EMULTYPE_ALLOW_RETRY_PF; 6177 emulate: 6178 return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn, 6179 insn_len); ··· 6305 max_huge_page_level = PG_LEVEL_2M; 6306 } 6307 EXPORT_SYMBOL_GPL(kvm_configure_mmu); 6308 - 6309 - /* The return value indicates if tlb flush on all vcpus is needed. */ 6310 - typedef bool (*slot_rmaps_handler) (struct kvm *kvm, 6311 - struct kvm_rmap_head *rmap_head, 6312 - const struct kvm_memory_slot *slot); 6313 - 6314 - static __always_inline bool __walk_slot_rmaps(struct kvm *kvm, 6315 - const struct kvm_memory_slot *slot, 6316 - slot_rmaps_handler fn, 6317 - int start_level, int end_level, 6318 - gfn_t start_gfn, gfn_t end_gfn, 6319 - bool flush_on_yield, bool flush) 6320 - { 6321 - struct slot_rmap_walk_iterator iterator; 6322 - 6323 - lockdep_assert_held_write(&kvm->mmu_lock); 6324 - 6325 - for_each_slot_rmap_range(slot, start_level, end_level, start_gfn, 6326 - end_gfn, &iterator) { 6327 - if (iterator.rmap) 6328 - flush |= fn(kvm, iterator.rmap, slot); 6329 - 6330 - if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { 6331 - if (flush && flush_on_yield) { 6332 - kvm_flush_remote_tlbs_range(kvm, start_gfn, 6333 - iterator.gfn - start_gfn + 1); 6334 - flush = false; 6335 - } 6336 - cond_resched_rwlock_write(&kvm->mmu_lock); 6337 - } 6338 - } 6339 - 6340 - return flush; 6341 - } 6342 - 6343 - static __always_inline bool walk_slot_rmaps(struct kvm *kvm, 6344 - const struct kvm_memory_slot *slot, 6345 - slot_rmaps_handler fn, 6346 - int start_level, int end_level, 6347 - bool flush_on_yield) 6348 - { 6349 - return __walk_slot_rmaps(kvm, slot, fn, start_level, end_level, 6350 - slot->base_gfn, slot->base_gfn + slot->npages - 1, 6351 - flush_on_yield, false); 6352 - } 6353 - 6354 - static __always_inline bool walk_slot_rmaps_4k(struct kvm *kvm, 6355 - const struct kvm_memory_slot *slot, 6356 - slot_rmaps_handler fn, 6357 - bool flush_on_yield) 6358 - { 6359 - return walk_slot_rmaps(kvm, slot, fn, PG_LEVEL_4K, PG_LEVEL_4K, flush_on_yield); 6360 - } 6361 6362 static void free_mmu_pages(struct kvm_mmu *mmu) 6363 { ··· 6579 if (WARN_ON_ONCE(start >= end)) 6580 continue; 6581 6582 - flush = __walk_slot_rmaps(kvm, memslot, __kvm_zap_rmap, 6583 - PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, 6584 - start, end - 1, true, flush); 6585 } 6586 } 6587 ··· 6868 */ 6869 for (level = KVM_MAX_HUGEPAGE_LEVEL; level > target_level; level--) 6870 __walk_slot_rmaps(kvm, slot, shadow_mmu_try_split_huge_pages, 6871 - level, level, start, end - 1, true, false); 6872 } 6873 6874 /* Must be called with the mmu_lock held in write-mode. */ ··· 7047 kvm_mmu_zap_all(kvm); 7048 } 7049 7050 void kvm_arch_flush_shadow_memslot(struct kvm *kvm, 7051 struct kvm_memory_slot *slot) 7052 { 7053 - kvm_mmu_zap_all_fast(kvm); 7054 } 7055 7056 void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
··· 614 return __get_spte_lockless(sptep); 615 } 616 617 static inline bool is_tdp_mmu_active(struct kvm_vcpu *vcpu) 618 { 619 return tdp_mmu_enabled && vcpu->arch.mmu->root_role.direct; ··· 938 * in this rmap chain. Otherwise, (rmap_head->val & ~1) points to a struct 939 * pte_list_desc containing more mappings. 940 */ 941 + #define KVM_RMAP_MANY BIT(0) 942 943 /* 944 * Returns the number of pointers in the rmap chain, not counting the new one. ··· 950 951 if (!rmap_head->val) { 952 rmap_head->val = (unsigned long)spte; 953 + } else if (!(rmap_head->val & KVM_RMAP_MANY)) { 954 desc = kvm_mmu_memory_cache_alloc(cache); 955 desc->sptes[0] = (u64 *)rmap_head->val; 956 desc->sptes[1] = spte; 957 desc->spte_count = 2; 958 desc->tail_count = 0; 959 + rmap_head->val = (unsigned long)desc | KVM_RMAP_MANY; 960 ++count; 961 } else { 962 + desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY); 963 count = desc->tail_count + desc->spte_count; 964 965 /* ··· 968 */ 969 if (desc->spte_count == PTE_LIST_EXT) { 970 desc = kvm_mmu_memory_cache_alloc(cache); 971 + desc->more = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY); 972 desc->spte_count = 0; 973 desc->tail_count = count; 974 + rmap_head->val = (unsigned long)desc | KVM_RMAP_MANY; 975 } 976 desc->sptes[desc->spte_count++] = spte; 977 } ··· 982 struct kvm_rmap_head *rmap_head, 983 struct pte_list_desc *desc, int i) 984 { 985 + struct pte_list_desc *head_desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY); 986 int j = head_desc->spte_count - 1; 987 988 /* ··· 1011 if (!head_desc->more) 1012 rmap_head->val = 0; 1013 else 1014 + rmap_head->val = (unsigned long)head_desc->more | KVM_RMAP_MANY; 1015 mmu_free_pte_list_desc(head_desc); 1016 } 1017 ··· 1024 if (KVM_BUG_ON_DATA_CORRUPTION(!rmap_head->val, kvm)) 1025 return; 1026 1027 + if (!(rmap_head->val & KVM_RMAP_MANY)) { 1028 if (KVM_BUG_ON_DATA_CORRUPTION((u64 *)rmap_head->val != spte, kvm)) 1029 return; 1030 1031 rmap_head->val = 0; 1032 } else { 1033 + desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY); 1034 while (desc) { 1035 for (i = 0; i < desc->spte_count; ++i) { 1036 if (desc->sptes[i] == spte) { ··· 1063 if (!rmap_head->val) 1064 return false; 1065 1066 + if (!(rmap_head->val & KVM_RMAP_MANY)) { 1067 mmu_spte_clear_track_bits(kvm, (u64 *)rmap_head->val); 1068 goto out; 1069 } 1070 1071 + desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY); 1072 1073 for (; desc; desc = next) { 1074 for (i = 0; i < desc->spte_count; i++) ··· 1088 1089 if (!rmap_head->val) 1090 return 0; 1091 + else if (!(rmap_head->val & KVM_RMAP_MANY)) 1092 return 1; 1093 1094 + desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY); 1095 return desc->tail_count + desc->spte_count; 1096 } 1097 ··· 1153 if (!rmap_head->val) 1154 return NULL; 1155 1156 + if (!(rmap_head->val & KVM_RMAP_MANY)) { 1157 iter->desc = NULL; 1158 sptep = (u64 *)rmap_head->val; 1159 goto out; 1160 } 1161 1162 + iter->desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY); 1163 iter->pos = 0; 1164 sptep = iter->desc->sptes[iter->pos]; 1165 out: ··· 1307 return flush; 1308 } 1309 1310 static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, 1311 struct kvm_memory_slot *slot, 1312 gfn_t gfn_offset, unsigned long mask) ··· 1339 } 1340 } 1341 1342 static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, 1343 struct kvm_memory_slot *slot, 1344 gfn_t gfn_offset, unsigned long mask) ··· 1372 } 1373 } 1374 1375 void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, 1376 
struct kvm_memory_slot *slot, 1377 gfn_t gfn_offset, unsigned long mask) 1378 { 1379 /* 1380 + * If the slot was assumed to be "initially all dirty", write-protect 1381 + * huge pages to ensure they are split to 4KiB on the first write (KVM 1382 + * dirty logs at 4KiB granularity). If eager page splitting is enabled, 1383 + * immediately try to split huge pages, e.g. so that vCPUs don't get 1384 + * saddled with the cost of splitting. 1385 * 1386 * The gfn_offset is guaranteed to be aligned to 64, but the base_gfn 1387 * of memslot has no such restriction, so the range can cross two large ··· 1411 PG_LEVEL_2M); 1412 } 1413 1414 + /* 1415 + * (Re)Enable dirty logging for all 4KiB SPTEs that map the GFNs in 1416 + * mask. If PML is enabled and the GFN doesn't need to be write- 1417 + * protected for other reasons, e.g. shadow paging, clear the Dirty bit. 1418 + * Otherwise clear the Writable bit. 1419 + * 1420 + * Note that kvm_mmu_clear_dirty_pt_masked() is called whenever PML is 1421 + * enabled but it chooses between clearing the Dirty bit and Writeable 1422 + * bit based on the context. 1423 + */ 1424 if (kvm_x86_ops.cpu_dirty_log_size) 1425 kvm_mmu_clear_dirty_pt_masked(kvm, slot, gfn_offset, mask); 1426 else ··· 1453 return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn, PG_LEVEL_4K); 1454 } 1455 1456 + static bool kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, 1457 + const struct kvm_memory_slot *slot) 1458 { 1459 return kvm_zap_all_rmap_sptes(kvm, rmap_head); 1460 } 1461 1462 struct slot_rmap_walk_iterator { ··· 1513 static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator) 1514 { 1515 while (++iterator->rmap <= iterator->end_rmap) { 1516 + iterator->gfn += KVM_PAGES_PER_HPAGE(iterator->level); 1517 1518 if (iterator->rmap->val) 1519 return; ··· 1534 slot_rmap_walk_okay(_iter_); \ 1535 slot_rmap_walk_next(_iter_)) 1536 1537 + /* The return value indicates if tlb flush on all vcpus is needed. 
*/ 1538 + typedef bool (*slot_rmaps_handler) (struct kvm *kvm, 1539 + struct kvm_rmap_head *rmap_head, 1540 + const struct kvm_memory_slot *slot); 1541 1542 + static __always_inline bool __walk_slot_rmaps(struct kvm *kvm, 1543 + const struct kvm_memory_slot *slot, 1544 + slot_rmaps_handler fn, 1545 + int start_level, int end_level, 1546 + gfn_t start_gfn, gfn_t end_gfn, 1547 + bool can_yield, bool flush_on_yield, 1548 + bool flush) 1549 { 1550 struct slot_rmap_walk_iterator iterator; 1551 1552 + lockdep_assert_held_write(&kvm->mmu_lock); 1553 1554 + for_each_slot_rmap_range(slot, start_level, end_level, start_gfn, 1555 + end_gfn, &iterator) { 1556 + if (iterator.rmap) 1557 + flush |= fn(kvm, iterator.rmap, slot); 1558 + 1559 + if (!can_yield) 1560 + continue; 1561 + 1562 + if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { 1563 + if (flush && flush_on_yield) { 1564 + kvm_flush_remote_tlbs_range(kvm, start_gfn, 1565 + iterator.gfn - start_gfn + 1); 1566 + flush = false; 1567 + } 1568 + cond_resched_rwlock_write(&kvm->mmu_lock); 1569 + } 1570 + } 1571 + 1572 + return flush; 1573 + } 1574 + 1575 + static __always_inline bool walk_slot_rmaps(struct kvm *kvm, 1576 + const struct kvm_memory_slot *slot, 1577 + slot_rmaps_handler fn, 1578 + int start_level, int end_level, 1579 + bool flush_on_yield) 1580 + { 1581 + return __walk_slot_rmaps(kvm, slot, fn, start_level, end_level, 1582 + slot->base_gfn, slot->base_gfn + slot->npages - 1, 1583 + true, flush_on_yield, false); 1584 + } 1585 + 1586 + static __always_inline bool walk_slot_rmaps_4k(struct kvm *kvm, 1587 + const struct kvm_memory_slot *slot, 1588 + slot_rmaps_handler fn, 1589 + bool flush_on_yield) 1590 + { 1591 + return walk_slot_rmaps(kvm, slot, fn, PG_LEVEL_4K, PG_LEVEL_4K, flush_on_yield); 1592 + } 1593 + 1594 + static bool __kvm_rmap_zap_gfn_range(struct kvm *kvm, 1595 + const struct kvm_memory_slot *slot, 1596 + gfn_t start, gfn_t end, bool can_yield, 1597 + bool flush) 1598 + { 1599 + return __walk_slot_rmaps(kvm, slot, kvm_zap_rmap, 1600 + PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, 1601 + start, end - 1, can_yield, true, flush); 1602 } 1603 1604 bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) ··· 1558 bool flush = false; 1559 1560 if (kvm_memslots_have_rmaps(kvm)) 1561 + flush = __kvm_rmap_zap_gfn_range(kvm, range->slot, 1562 + range->start, range->end, 1563 + range->may_block, flush); 1564 1565 if (tdp_mmu_enabled) 1566 flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush); ··· 1568 kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD); 1569 1570 return flush; 1571 } 1572 1573 #define RMAP_RECYCLE_THRESHOLD 1000 ··· 1629 __rmap_add(vcpu->kvm, cache, slot, spte, gfn, access); 1630 } 1631 1632 + static bool kvm_rmap_age_gfn_range(struct kvm *kvm, 1633 + struct kvm_gfn_range *range, bool test_only) 1634 + { 1635 + struct slot_rmap_walk_iterator iterator; 1636 + struct rmap_iterator iter; 1637 + bool young = false; 1638 + u64 *sptep; 1639 + 1640 + for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, 1641 + range->start, range->end - 1, &iterator) { 1642 + for_each_rmap_spte(iterator.rmap, &iter, sptep) { 1643 + u64 spte = *sptep; 1644 + 1645 + if (!is_accessed_spte(spte)) 1646 + continue; 1647 + 1648 + if (test_only) 1649 + return true; 1650 + 1651 + if (spte_ad_enabled(spte)) { 1652 + clear_bit((ffs(shadow_accessed_mask) - 1), 1653 + (unsigned long *)sptep); 1654 + } else { 1655 + /* 1656 + * Capture the dirty status of the page, so that 1657 + * it doesn't get lost when the SPTE is 
marked 1658 + * for access tracking. 1659 + */ 1660 + if (is_writable_pte(spte)) 1661 + kvm_set_pfn_dirty(spte_to_pfn(spte)); 1662 + 1663 + spte = mark_spte_for_access_track(spte); 1664 + mmu_spte_update_no_track(sptep, spte); 1665 + } 1666 + young = true; 1667 + } 1668 + } 1669 + return young; 1670 + } 1671 + 1672 bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) 1673 { 1674 bool young = false; 1675 1676 if (kvm_memslots_have_rmaps(kvm)) 1677 + young = kvm_rmap_age_gfn_range(kvm, range, false); 1678 1679 if (tdp_mmu_enabled) 1680 young |= kvm_tdp_mmu_age_gfn_range(kvm, range); ··· 1647 bool young = false; 1648 1649 if (kvm_memslots_have_rmaps(kvm)) 1650 + young = kvm_rmap_age_gfn_range(kvm, range, true); 1651 1652 if (tdp_mmu_enabled) 1653 young |= kvm_tdp_mmu_test_age_gfn(kvm, range); ··· 2713 write_unlock(&kvm->mmu_lock); 2714 } 2715 2716 + bool __kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, 2717 + bool always_retry) 2718 { 2719 + struct kvm *kvm = vcpu->kvm; 2720 LIST_HEAD(invalid_list); 2721 + struct kvm_mmu_page *sp; 2722 + gpa_t gpa = cr2_or_gpa; 2723 + bool r = false; 2724 2725 + /* 2726 + * Bail early if there aren't any write-protected shadow pages to avoid 2727 + * unnecessarily taking mmu_lock lock, e.g. if the gfn is write-tracked 2728 + * by a third party. Reading indirect_shadow_pages without holding 2729 + * mmu_lock is safe, as this is purely an optimization, i.e. a false 2730 + * positive is benign, and a false negative will simply result in KVM 2731 + * skipping the unprotect+retry path, which is also an optimization. 2732 + */ 2733 + if (!READ_ONCE(kvm->arch.indirect_shadow_pages)) 2734 + goto out; 2735 + 2736 + if (!vcpu->arch.mmu->root_role.direct) { 2737 + gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL); 2738 + if (gpa == INVALID_GPA) 2739 + goto out; 2740 } 2741 + 2742 + write_lock(&kvm->mmu_lock); 2743 + for_each_gfn_valid_sp_with_gptes(kvm, sp, gpa_to_gfn(gpa)) 2744 + kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); 2745 + 2746 + /* 2747 + * Snapshot the result before zapping, as zapping will remove all list 2748 + * entries, i.e. checking the list later would yield a false negative. 
2749 + */ 2750 + r = !list_empty(&invalid_list); 2751 kvm_mmu_commit_zap_page(kvm, &invalid_list); 2752 write_unlock(&kvm->mmu_lock); 2753 2754 + out: 2755 + if (r || always_retry) { 2756 + vcpu->arch.last_retry_eip = kvm_rip_read(vcpu); 2757 + vcpu->arch.last_retry_addr = cr2_or_gpa; 2758 + } 2759 return r; 2760 } 2761 ··· 2914 trace_kvm_mmu_set_spte(level, gfn, sptep); 2915 } 2916 2917 + if (wrprot && write_fault) 2918 + ret = RET_PF_WRITE_PROTECTED; 2919 2920 if (flush) 2921 kvm_flush_remote_tlbs_gfn(vcpu->kvm, gfn, level); ··· 4549 return RET_PF_RETRY; 4550 4551 if (page_fault_handle_page_track(vcpu, fault)) 4552 + return RET_PF_WRITE_PROTECTED; 4553 4554 r = fast_page_fault(vcpu, fault); 4555 if (r != RET_PF_INVALID) ··· 4618 if (!flags) { 4619 trace_kvm_page_fault(vcpu, fault_address, error_code); 4620 4621 r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn, 4622 insn_len); 4623 } else if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) { ··· 4642 int r; 4643 4644 if (page_fault_handle_page_track(vcpu, fault)) 4645 + return RET_PF_WRITE_PROTECTED; 4646 4647 r = fast_page_fault(vcpu, fault); 4648 if (r != RET_PF_INVALID) ··· 4719 switch (r) { 4720 case RET_PF_FIXED: 4721 case RET_PF_SPURIOUS: 4722 + case RET_PF_WRITE_PROTECTED: 4723 return 0; 4724 4725 case RET_PF_EMULATE: ··· 5963 write_unlock(&vcpu->kvm->mmu_lock); 5964 } 5965 5966 + static bool is_write_to_guest_page_table(u64 error_code) 5967 + { 5968 + const u64 mask = PFERR_GUEST_PAGE_MASK | PFERR_WRITE_MASK | PFERR_PRESENT_MASK; 5969 + 5970 + return (error_code & mask) == mask; 5971 + } 5972 + 5973 + static int kvm_mmu_write_protect_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, 5974 + u64 error_code, int *emulation_type) 5975 + { 5976 + bool direct = vcpu->arch.mmu->root_role.direct; 5977 + 5978 + /* 5979 + * Do not try to unprotect and retry if the vCPU re-faulted on the same 5980 + * RIP with the same address that was previously unprotected, as doing 5981 + * so will likely put the vCPU into an infinite. E.g. if the vCPU uses 5982 + * a non-page-table modifying instruction on the PDE that points to the 5983 + * instruction, then unprotecting the gfn will unmap the instruction's 5984 + * code, i.e. make it impossible for the instruction to ever complete. 5985 + */ 5986 + if (vcpu->arch.last_retry_eip == kvm_rip_read(vcpu) && 5987 + vcpu->arch.last_retry_addr == cr2_or_gpa) 5988 + return RET_PF_EMULATE; 5989 + 5990 + /* 5991 + * Reset the unprotect+retry values that guard against infinite loops. 5992 + * The values will be refreshed if KVM explicitly unprotects a gfn and 5993 + * retries, in all other cases it's safe to retry in the future even if 5994 + * the next page fault happens on the same RIP+address. 5995 + */ 5996 + vcpu->arch.last_retry_eip = 0; 5997 + vcpu->arch.last_retry_addr = 0; 5998 + 5999 + /* 6000 + * It should be impossible to reach this point with an MMIO cache hit, 6001 + * as RET_PF_WRITE_PROTECTED is returned if and only if there's a valid, 6002 + * writable memslot, and creating a memslot should invalidate the MMIO 6003 + * cache by way of changing the memslot generation. WARN and disallow 6004 + * retry if MMIO is detected, as retrying MMIO emulation is pointless 6005 + * and could put the vCPU into an infinite loop because the processor 6006 + * will keep faulting on the non-existent MMIO address. 
6007 + */ 6008 + if (WARN_ON_ONCE(mmio_info_in_cache(vcpu, cr2_or_gpa, direct))) 6009 + return RET_PF_EMULATE; 6010 + 6011 + /* 6012 + * Before emulating the instruction, check to see if the access was due 6013 + * to a read-only violation while the CPU was walking non-nested NPT 6014 + * page tables, i.e. for a direct MMU, for _guest_ page tables in L1. 6015 + * If L1 is sharing (a subset of) its page tables with L2, e.g. by 6016 + * having nCR3 share lower level page tables with hCR3, then when KVM 6017 + * (L0) write-protects the nested NPTs, i.e. npt12 entries, KVM is also 6018 + * unknowingly write-protecting L1's guest page tables, which KVM isn't 6019 + * shadowing. 6020 + * 6021 + * Because the CPU (by default) walks NPT page tables using a write 6022 + * access (to ensure the CPU can do A/D updates), page walks in L1 can 6023 + * trigger write faults for the above case even when L1 isn't modifying 6024 + * PTEs. As a result, KVM will unnecessarily emulate (or at least, try 6025 + * to emulate) an excessive number of L1 instructions; because L1's MMU 6026 + * isn't shadowed by KVM, there is no need to write-protect L1's gPTEs 6027 + * and thus no need to emulate in order to guarantee forward progress. 6028 + * 6029 + * Try to unprotect the gfn, i.e. zap any shadow pages, so that L1 can 6030 + * proceed without triggering emulation. If one or more shadow pages 6031 + * was zapped, skip emulation and resume L1 to let it natively execute 6032 + * the instruction. If no shadow pages were zapped, then the write- 6033 + * fault is due to something else entirely, i.e. KVM needs to emulate, 6034 + * as resuming the guest will put it into an infinite loop. 6035 + * 6036 + * Note, this code also applies to Intel CPUs, even though it is *very* 6037 + * unlikely that an L1 will share its page tables (IA32/PAE/paging64 6038 + * format) with L2's page tables (EPT format). 6039 + * 6040 + * For indirect MMUs, i.e. if KVM is shadowing the current MMU, try to 6041 + * unprotect the gfn and retry if an event is awaiting reinjection. If 6042 + * KVM emulates multiple instructions before completing event injection, 6043 + * the event could be delayed beyond what is architecturally allowed, 6044 + * e.g. KVM could inject an IRQ after the TPR has been raised. 6045 + */ 6046 + if (((direct && is_write_to_guest_page_table(error_code)) || 6047 + (!direct && kvm_event_needs_reinjection(vcpu))) && 6048 + kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa)) 6049 + return RET_PF_RETRY; 6050 + 6051 + /* 6052 + * The gfn is write-protected, but if KVM detects its emulating an 6053 + * instruction that is unlikely to be used to modify page tables, or if 6054 + * emulation fails, KVM can try to unprotect the gfn and let the CPU 6055 + * re-execute the instruction that caused the page fault. Do not allow 6056 + * retrying an instruction from a nested guest as KVM is only explicitly 6057 + * shadowing L1's page tables, i.e. unprotecting something for L1 isn't 6058 + * going to magically fix whatever issue caused L2 to fail. 
6059 + */ 6060 + if (!is_guest_mode(vcpu)) 6061 + *emulation_type |= EMULTYPE_ALLOW_RETRY_PF; 6062 + 6063 + return RET_PF_EMULATE; 6064 + } 6065 + 6066 int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, 6067 void *insn, int insn_len) 6068 { ··· 6008 if (r < 0) 6009 return r; 6010 6011 + if (r == RET_PF_WRITE_PROTECTED) 6012 + r = kvm_mmu_write_protect_fault(vcpu, cr2_or_gpa, error_code, 6013 + &emulation_type); 6014 + 6015 if (r == RET_PF_FIXED) 6016 vcpu->stat.pf_fixed++; 6017 else if (r == RET_PF_EMULATE) ··· 6018 if (r != RET_PF_EMULATE) 6019 return 1; 6020 6021 emulate: 6022 return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn, 6023 insn_len); ··· 6201 max_huge_page_level = PG_LEVEL_2M; 6202 } 6203 EXPORT_SYMBOL_GPL(kvm_configure_mmu); 6204 6205 static void free_mmu_pages(struct kvm_mmu *mmu) 6206 { ··· 6528 if (WARN_ON_ONCE(start >= end)) 6529 continue; 6530 6531 + flush = __kvm_rmap_zap_gfn_range(kvm, memslot, start, 6532 + end, true, flush); 6533 } 6534 } 6535 ··· 6818 */ 6819 for (level = KVM_MAX_HUGEPAGE_LEVEL; level > target_level; level--) 6820 __walk_slot_rmaps(kvm, slot, shadow_mmu_try_split_huge_pages, 6821 + level, level, start, end - 1, true, true, false); 6822 } 6823 6824 /* Must be called with the mmu_lock held in write-mode. */ ··· 6997 kvm_mmu_zap_all(kvm); 6998 } 6999 7000 + /* 7001 + * Zapping leaf SPTEs with memslot range when a memslot is moved/deleted. 7002 + * 7003 + * Zapping non-leaf SPTEs, a.k.a. not-last SPTEs, isn't required, worst 7004 + * case scenario we'll have unused shadow pages lying around until they 7005 + * are recycled due to age or when the VM is destroyed. 7006 + */ 7007 + static void kvm_mmu_zap_memslot_leafs(struct kvm *kvm, struct kvm_memory_slot *slot) 7008 + { 7009 + struct kvm_gfn_range range = { 7010 + .slot = slot, 7011 + .start = slot->base_gfn, 7012 + .end = slot->base_gfn + slot->npages, 7013 + .may_block = true, 7014 + }; 7015 + 7016 + write_lock(&kvm->mmu_lock); 7017 + if (kvm_unmap_gfn_range(kvm, &range)) 7018 + kvm_flush_remote_tlbs_memslot(kvm, slot); 7019 + 7020 + write_unlock(&kvm->mmu_lock); 7021 + } 7022 + 7023 + static inline bool kvm_memslot_flush_zap_all(struct kvm *kvm) 7024 + { 7025 + return kvm->arch.vm_type == KVM_X86_DEFAULT_VM && 7026 + kvm_check_has_quirk(kvm, KVM_X86_QUIRK_SLOT_ZAP_ALL); 7027 + } 7028 + 7029 void kvm_arch_flush_shadow_memslot(struct kvm *kvm, 7030 struct kvm_memory_slot *slot) 7031 { 7032 + if (kvm_memslot_flush_zap_all(kvm)) 7033 + kvm_mmu_zap_all_fast(kvm); 7034 + else 7035 + kvm_mmu_zap_memslot_leafs(kvm, slot); 7036 } 7037 7038 void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
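The rmap changes above all build on one tagged-pointer encoding: rmap_head->val holds either a single SPTE pointer directly, or, once KVM_RMAP_MANY (bit 0) is set, a pointer to a pte_list_desc holding several. Below is a stand-alone user-space sketch of that encoding and of the one-to-many promotion; it is illustrative only, the toy_* names are invented, and it leans on the usual assumption that pointers are at least 2-byte aligned so bit 0 is free (the real code chains full descriptors from a per-vCPU cache instead of asserting).

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define TOY_RMAP_MANY 1UL          /* bit 0 tags "value is a descriptor pointer" */
#define TOY_DESC_CAP  4            /* stand-in for PTE_LIST_EXT */

struct toy_desc {
        uint64_t *sptes[TOY_DESC_CAP];
        int count;
};

struct toy_rmap_head {
        unsigned long val;         /* 0, an spte pointer, or desc pointer | TOY_RMAP_MANY */
};

static void toy_rmap_add(struct toy_rmap_head *head, uint64_t *spte)
{
        struct toy_desc *desc;

        if (!head->val) {
                /* First mapping: store the SPTE pointer directly, no allocation. */
                head->val = (unsigned long)spte;
        } else if (!(head->val & TOY_RMAP_MANY)) {
                /* Second mapping: promote the single pointer to a descriptor. */
                desc = calloc(1, sizeof(*desc));
                desc->sptes[desc->count++] = (uint64_t *)head->val;
                desc->sptes[desc->count++] = spte;
                head->val = (unsigned long)desc | TOY_RMAP_MANY;
        } else {
                desc = (struct toy_desc *)(head->val & ~TOY_RMAP_MANY);
                assert(desc->count < TOY_DESC_CAP);   /* the kernel chains descriptors here */
                desc->sptes[desc->count++] = spte;
        }
}

int main(void)
{
        struct toy_rmap_head head = { 0 };
        uint64_t a = 1, b = 2, c = 3;

        toy_rmap_add(&head, &a);
        printf("one mapping, tagged? %d\n", !!(head.val & TOY_RMAP_MANY));   /* 0 */
        toy_rmap_add(&head, &b);
        toy_rmap_add(&head, &c);
        printf("three mappings, tagged? %d\n", !!(head.val & TOY_RMAP_MANY)); /* 1 */
        return 0;
}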
+3 -2
arch/x86/kvm/mmu/mmu_internal.h
··· 258 * RET_PF_CONTINUE: So far, so good, keep handling the page fault. 259 * RET_PF_RETRY: let CPU fault again on the address. 260 * RET_PF_EMULATE: mmio page fault, emulate the instruction directly. 261 * RET_PF_INVALID: the spte is invalid, let the real page fault path update it. 262 * RET_PF_FIXED: The faulting entry has been fixed. 263 * RET_PF_SPURIOUS: The faulting entry was already fixed, e.g. by another vCPU. ··· 276 RET_PF_CONTINUE = 0, 277 RET_PF_RETRY, 278 RET_PF_EMULATE, 279 RET_PF_INVALID, 280 RET_PF_FIXED, 281 RET_PF_SPURIOUS, ··· 351 int max_level); 352 void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); 353 void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level); 354 - 355 - void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc); 356 357 void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp); 358 void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp);
··· 258 * RET_PF_CONTINUE: So far, so good, keep handling the page fault. 259 * RET_PF_RETRY: let CPU fault again on the address. 260 * RET_PF_EMULATE: mmio page fault, emulate the instruction directly. 261 + * RET_PF_WRITE_PROTECTED: the gfn is write-protected, either unprotect the 262 + * gfn and retry, or emulate the instruction directly. 263 * RET_PF_INVALID: the spte is invalid, let the real page fault path update it. 264 * RET_PF_FIXED: The faulting entry has been fixed. 265 * RET_PF_SPURIOUS: The faulting entry was already fixed, e.g. by another vCPU. ··· 274 RET_PF_CONTINUE = 0, 275 RET_PF_RETRY, 276 RET_PF_EMULATE, 277 + RET_PF_WRITE_PROTECTED, 278 RET_PF_INVALID, 279 RET_PF_FIXED, 280 RET_PF_SPURIOUS, ··· 348 int max_level); 349 void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); 350 void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level); 351 352 void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp); 353 void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp);
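RET_PF_WRITE_PROTECTED feeds kvm_mmu_write_protect_fault() in the mmu.c hunk above, which refuses to unprotect-and-retry when the vCPU re-faults at the same RIP on the same address (tracked via last_retry_eip/last_retry_addr). The following is a minimal sketch of that guard only, with the kernel's split across two functions collapsed into one invented toy_should_retry() helper:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Toy model of the "don't unprotect+retry the same fault twice" guard. */
struct toy_vcpu {
        uint64_t last_retry_rip;
        uint64_t last_retry_addr;
};

/*
 * Returns true if the fault should be retried after unprotecting the gfn,
 * false if the caller should emulate instead (same RIP+address as last time,
 * i.e. retrying again would likely loop forever).
 */
static bool toy_should_retry(struct toy_vcpu *v, uint64_t rip, uint64_t addr)
{
        if (v->last_retry_rip == rip && v->last_retry_addr == addr)
                return false;               /* looped back: emulate */
        v->last_retry_rip = rip;            /* remember what was retried */
        v->last_retry_addr = addr;
        return true;
}

int main(void)
{
        struct toy_vcpu v = { 0 };

        printf("%d\n", toy_should_retry(&v, 0x1000, 0xfee0));  /* 1: first fault, retry */
        printf("%d\n", toy_should_retry(&v, 0x1000, 0xfee0));  /* 0: same fault, emulate */
        printf("%d\n", toy_should_retry(&v, 0x1004, 0xfee0));  /* 1: different RIP, retry */
        return 0;
}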
+1
arch/x86/kvm/mmu/mmutrace.h
··· 57 TRACE_DEFINE_ENUM(RET_PF_CONTINUE); 58 TRACE_DEFINE_ENUM(RET_PF_RETRY); 59 TRACE_DEFINE_ENUM(RET_PF_EMULATE); 60 TRACE_DEFINE_ENUM(RET_PF_INVALID); 61 TRACE_DEFINE_ENUM(RET_PF_FIXED); 62 TRACE_DEFINE_ENUM(RET_PF_SPURIOUS);
··· 57 TRACE_DEFINE_ENUM(RET_PF_CONTINUE); 58 TRACE_DEFINE_ENUM(RET_PF_RETRY); 59 TRACE_DEFINE_ENUM(RET_PF_EMULATE); 60 + TRACE_DEFINE_ENUM(RET_PF_WRITE_PROTECTED); 61 TRACE_DEFINE_ENUM(RET_PF_INVALID); 62 TRACE_DEFINE_ENUM(RET_PF_FIXED); 63 TRACE_DEFINE_ENUM(RET_PF_SPURIOUS);
+32 -31
arch/x86/kvm/mmu/paging_tmpl.h
··· 646 * really care if it changes underneath us after this point). 647 */ 648 if (FNAME(gpte_changed)(vcpu, gw, top_level)) 649 - goto out_gpte_changed; 650 651 if (WARN_ON_ONCE(!VALID_PAGE(vcpu->arch.mmu->root.hpa))) 652 - goto out_gpte_changed; 653 654 /* 655 * Load a new root and retry the faulting instruction in the extremely ··· 659 */ 660 if (unlikely(kvm_mmu_is_dummy_root(vcpu->arch.mmu->root.hpa))) { 661 kvm_make_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu); 662 - goto out_gpte_changed; 663 } 664 665 for_each_shadow_entry(vcpu, fault->addr, it) { ··· 674 sp = kvm_mmu_get_child_sp(vcpu, it.sptep, table_gfn, 675 false, access); 676 677 - if (sp != ERR_PTR(-EEXIST)) { 678 - /* 679 - * We must synchronize the pagetable before linking it 680 - * because the guest doesn't need to flush tlb when 681 - * the gpte is changed from non-present to present. 682 - * Otherwise, the guest may use the wrong mapping. 683 - * 684 - * For PG_LEVEL_4K, kvm_mmu_get_page() has already 685 - * synchronized it transiently via kvm_sync_page(). 686 - * 687 - * For higher level pagetable, we synchronize it via 688 - * the slower mmu_sync_children(). If it needs to 689 - * break, some progress has been made; return 690 - * RET_PF_RETRY and retry on the next #PF. 691 - * KVM_REQ_MMU_SYNC is not necessary but it 692 - * expedites the process. 693 - */ 694 - if (sp->unsync_children && 695 - mmu_sync_children(vcpu, sp, false)) 696 - return RET_PF_RETRY; 697 - } 698 699 /* 700 - * Verify that the gpte in the page we've just write 701 - * protected is still there. 702 */ 703 if (FNAME(gpte_changed)(vcpu, gw, it.level - 1)) 704 - goto out_gpte_changed; 705 706 if (sp != ERR_PTR(-EEXIST)) 707 link_shadow_page(vcpu, it.sptep, sp); ··· 759 760 FNAME(pte_prefetch)(vcpu, gw, it.sptep); 761 return ret; 762 - 763 - out_gpte_changed: 764 - return RET_PF_RETRY; 765 } 766 767 /* ··· 806 807 if (page_fault_handle_page_track(vcpu, fault)) { 808 shadow_page_table_clear_flood(vcpu, fault->addr); 809 - return RET_PF_EMULATE; 810 } 811 812 r = mmu_topup_memory_caches(vcpu, true);
··· 646 * really care if it changes underneath us after this point). 647 */ 648 if (FNAME(gpte_changed)(vcpu, gw, top_level)) 649 + return RET_PF_RETRY; 650 651 if (WARN_ON_ONCE(!VALID_PAGE(vcpu->arch.mmu->root.hpa))) 652 + return RET_PF_RETRY; 653 654 /* 655 * Load a new root and retry the faulting instruction in the extremely ··· 659 */ 660 if (unlikely(kvm_mmu_is_dummy_root(vcpu->arch.mmu->root.hpa))) { 661 kvm_make_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu); 662 + return RET_PF_RETRY; 663 } 664 665 for_each_shadow_entry(vcpu, fault->addr, it) { ··· 674 sp = kvm_mmu_get_child_sp(vcpu, it.sptep, table_gfn, 675 false, access); 676 677 + /* 678 + * Synchronize the new page before linking it, as the CPU (KVM) 679 + * is architecturally disallowed from inserting non-present 680 + * entries into the TLB, i.e. the guest isn't required to flush 681 + * the TLB when changing the gPTE from non-present to present. 682 + * 683 + * For PG_LEVEL_4K, kvm_mmu_find_shadow_page() has already 684 + * synchronized the page via kvm_sync_page(). 685 + * 686 + * For higher level pages, which cannot be unsync themselves 687 + * but can have unsync children, synchronize via the slower 688 + * mmu_sync_children(). If KVM needs to drop mmu_lock due to 689 + * contention or to reschedule, instruct the caller to retry 690 + * the #PF (mmu_sync_children() ensures forward progress will 691 + * be made). 692 + */ 693 + if (sp != ERR_PTR(-EEXIST) && sp->unsync_children && 694 + mmu_sync_children(vcpu, sp, false)) 695 + return RET_PF_RETRY; 696 697 /* 698 + * Verify that the gpte in the page, which is now either 699 + * write-protected or unsync, wasn't modified between the fault 700 + * and acquiring mmu_lock. This needs to be done even when 701 + * reusing an existing shadow page to ensure the information 702 + * gathered by the walker matches the information stored in the 703 + * shadow page (which could have been modified by a different 704 + * vCPU even if the page was already linked). Holding mmu_lock 705 + * prevents the shadow page from changing after this point. 706 */ 707 if (FNAME(gpte_changed)(vcpu, gw, it.level - 1)) 708 + return RET_PF_RETRY; 709 710 if (sp != ERR_PTR(-EEXIST)) 711 link_shadow_page(vcpu, it.sptep, sp); ··· 755 756 FNAME(pte_prefetch)(vcpu, gw, it.sptep); 757 return ret; 758 } 759 760 /* ··· 805 806 if (page_fault_handle_page_track(vcpu, fault)) { 807 shadow_page_table_clear_flood(vcpu, fault->addr); 808 + return RET_PF_WRITE_PROTECTED; 809 } 810 811 r = mmu_topup_memory_caches(vcpu, true);
+2 -4
arch/x86/kvm/mmu/tdp_mmu.c
··· 1046 * protected, emulation is needed. If the emulation was skipped, 1047 * the vCPU would have the same fault again. 1048 */ 1049 - if (wrprot) { 1050 - if (fault->write) 1051 - ret = RET_PF_EMULATE; 1052 - } 1053 1054 /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */ 1055 if (unlikely(is_mmio_spte(vcpu->kvm, new_spte))) {
··· 1046 * protected, emulation is needed. If the emulation was skipped, 1047 * the vCPU would have the same fault again. 1048 */ 1049 + if (wrprot && fault->write) 1050 + ret = RET_PF_WRITE_PROTECTED; 1051 1052 /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */ 1053 if (unlikely(is_mmio_spte(vcpu->kvm, new_spte))) {
+8
arch/x86/kvm/reverse_cpuid.h
··· 17 CPUID_8000_0007_EDX, 18 CPUID_8000_0022_EAX, 19 CPUID_7_2_EDX, 20 NR_KVM_CPU_CAPS, 21 22 NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS, ··· 47 #define X86_FEATURE_AVX_NE_CONVERT KVM_X86_FEATURE(CPUID_7_1_EDX, 5) 48 #define X86_FEATURE_AMX_COMPLEX KVM_X86_FEATURE(CPUID_7_1_EDX, 8) 49 #define X86_FEATURE_PREFETCHITI KVM_X86_FEATURE(CPUID_7_1_EDX, 14) 50 51 /* Intel-defined sub-features, CPUID level 0x00000007:2 (EDX) */ 52 #define X86_FEATURE_INTEL_PSFD KVM_X86_FEATURE(CPUID_7_2_EDX, 0) ··· 56 #define X86_FEATURE_DDPD_U KVM_X86_FEATURE(CPUID_7_2_EDX, 3) 57 #define KVM_X86_FEATURE_BHI_CTRL KVM_X86_FEATURE(CPUID_7_2_EDX, 4) 58 #define X86_FEATURE_MCDT_NO KVM_X86_FEATURE(CPUID_7_2_EDX, 5) 59 60 /* CPUID level 0x80000007 (EDX). */ 61 #define KVM_X86_FEATURE_CONSTANT_TSC KVM_X86_FEATURE(CPUID_8000_0007_EDX, 8) ··· 97 [CPUID_8000_0021_EAX] = {0x80000021, 0, CPUID_EAX}, 98 [CPUID_8000_0022_EAX] = {0x80000022, 0, CPUID_EAX}, 99 [CPUID_7_2_EDX] = { 7, 2, CPUID_EDX}, 100 }; 101 102 /*
··· 17 CPUID_8000_0007_EDX, 18 CPUID_8000_0022_EAX, 19 CPUID_7_2_EDX, 20 + CPUID_24_0_EBX, 21 NR_KVM_CPU_CAPS, 22 23 NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS, ··· 46 #define X86_FEATURE_AVX_NE_CONVERT KVM_X86_FEATURE(CPUID_7_1_EDX, 5) 47 #define X86_FEATURE_AMX_COMPLEX KVM_X86_FEATURE(CPUID_7_1_EDX, 8) 48 #define X86_FEATURE_PREFETCHITI KVM_X86_FEATURE(CPUID_7_1_EDX, 14) 49 + #define X86_FEATURE_AVX10 KVM_X86_FEATURE(CPUID_7_1_EDX, 19) 50 51 /* Intel-defined sub-features, CPUID level 0x00000007:2 (EDX) */ 52 #define X86_FEATURE_INTEL_PSFD KVM_X86_FEATURE(CPUID_7_2_EDX, 0) ··· 54 #define X86_FEATURE_DDPD_U KVM_X86_FEATURE(CPUID_7_2_EDX, 3) 55 #define KVM_X86_FEATURE_BHI_CTRL KVM_X86_FEATURE(CPUID_7_2_EDX, 4) 56 #define X86_FEATURE_MCDT_NO KVM_X86_FEATURE(CPUID_7_2_EDX, 5) 57 + 58 + /* Intel-defined sub-features, CPUID level 0x00000024:0 (EBX) */ 59 + #define X86_FEATURE_AVX10_128 KVM_X86_FEATURE(CPUID_24_0_EBX, 16) 60 + #define X86_FEATURE_AVX10_256 KVM_X86_FEATURE(CPUID_24_0_EBX, 17) 61 + #define X86_FEATURE_AVX10_512 KVM_X86_FEATURE(CPUID_24_0_EBX, 18) 62 63 /* CPUID level 0x80000007 (EDX). */ 64 #define KVM_X86_FEATURE_CONSTANT_TSC KVM_X86_FEATURE(CPUID_8000_0007_EDX, 8) ··· 90 [CPUID_8000_0021_EAX] = {0x80000021, 0, CPUID_EAX}, 91 [CPUID_8000_0022_EAX] = {0x80000022, 0, CPUID_EAX}, 92 [CPUID_7_2_EDX] = { 7, 2, CPUID_EDX}, 93 + [CPUID_24_0_EBX] = { 0x24, 0, CPUID_EBX}, 94 }; 95 96 /*
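The new CPUID_24_0_EBX entry routes a KVM feature word to CPUID leaf 0x24, subleaf 0, register EBX, with the AVX10 vector-width bits at EBX[16..18]. The sketch below shows how such a reverse-CPUID entry is used to test one raw register bit; the table contents mirror the diff, while the toy_* helper and the sample register values are invented:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

enum toy_reg { TOY_EAX, TOY_EBX, TOY_ECX, TOY_EDX };

/*
 * One entry of a reverse-CPUID table: which leaf/subleaf/register a feature
 * word corresponds to (mirrors the CPUID_24_0_EBX entry in the diff).
 */
struct toy_reverse_cpuid {
        uint32_t function;
        uint32_t index;
        enum toy_reg reg;
};

static const struct toy_reverse_cpuid toy_cpuid_24_0_ebx = {
        .function = 0x24, .index = 0, .reg = TOY_EBX,
};

/* Bit positions from the diff: AVX10_128/256/512 are EBX bits 16..18. */
#define TOY_AVX10_256_BIT 17

static bool toy_has_feature(const uint32_t regs[4],
                            const struct toy_reverse_cpuid *entry, int bit)
{
        /*
         * A real implementation would first execute CPUID with
         * entry->function/entry->index; here the register values are passed in.
         */
        return (regs[entry->reg] >> bit) & 1;
}

int main(void)
{
        /* Pretend CPUID(0x24, 0) returned EBX with bits 16 and 17 set. */
        uint32_t regs[4] = { 0, (1u << 16) | (1u << 17), 0, 0 };

        printf("AVX10/256 supported: %d\n",
               toy_has_feature(regs, &toy_cpuid_24_0_ebx, TOY_AVX10_256_BIT));
        return 0;
}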
+19 -5
arch/x86/kvm/smm.c
··· 624 #endif 625 626 /* 627 - * Give leave_smm() a chance to make ISA-specific changes to the vCPU 628 - * state (e.g. enter guest mode) before loading state from the SMM 629 - * state-save area. 630 */ 631 if (kvm_x86_call(leave_smm)(vcpu, &smram)) 632 return X86EMUL_UNHANDLEABLE; 633 634 #ifdef CONFIG_X86_64 635 if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) 636 - return rsm_load_state_64(ctxt, &smram.smram64); 637 else 638 #endif 639 - return rsm_load_state_32(ctxt, &smram.smram32); 640 }
··· 624 #endif 625 626 /* 627 + * FIXME: When resuming L2 (a.k.a. guest mode), the transition to guest 628 + * mode should happen _after_ loading state from SMRAM. However, KVM 629 + * piggybacks the nested VM-Enter flows (which is wrong for many other 630 + * reasons), and so nSVM/nVMX would clobber state that is loaded from 631 + * SMRAM and from the VMCS/VMCB. 632 */ 633 if (kvm_x86_call(leave_smm)(vcpu, &smram)) 634 return X86EMUL_UNHANDLEABLE; 635 636 #ifdef CONFIG_X86_64 637 if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) 638 + ret = rsm_load_state_64(ctxt, &smram.smram64); 639 else 640 #endif 641 + ret = rsm_load_state_32(ctxt, &smram.smram32); 642 + 643 + /* 644 + * If RSM fails and triggers shutdown, architecturally the shutdown 645 + * occurs *before* the transition to guest mode. But due to KVM's 646 + * flawed handling of RSM to L2 (see above), the vCPU may already be 647 + * in_guest_mode(). Force the vCPU out of guest mode before delivering 648 + * the shutdown, so that L1 enters shutdown instead of seeing a VM-Exit 649 + * that architecturally shouldn't be possible. 650 + */ 651 + if (ret != X86EMUL_CONTINUE && is_guest_mode(vcpu)) 652 + kvm_leave_nested(vcpu); 653 + return ret; 654 }
+2 -2
arch/x86/kvm/svm/nested.c
··· 1693 return -EINVAL; 1694 1695 ret = -ENOMEM; 1696 - ctl = kzalloc(sizeof(*ctl), GFP_KERNEL_ACCOUNT); 1697 - save = kzalloc(sizeof(*save), GFP_KERNEL_ACCOUNT); 1698 if (!ctl || !save) 1699 goto out_free; 1700
··· 1693 return -EINVAL; 1694 1695 ret = -ENOMEM; 1696 + ctl = kzalloc(sizeof(*ctl), GFP_KERNEL); 1697 + save = kzalloc(sizeof(*save), GFP_KERNEL); 1698 if (!ctl || !save) 1699 goto out_free; 1700
+49 -38
arch/x86/kvm/svm/svm.c
··· 573 574 static __always_inline struct sev_es_save_area *sev_es_host_save_area(struct svm_cpu_data *sd) 575 { 576 - return page_address(sd->save_area) + 0x400; 577 } 578 579 static inline void kvm_cpu_svm_disable(void) ··· 592 } 593 } 594 595 - static void svm_emergency_disable(void) 596 { 597 kvm_rebooting = true; 598 599 kvm_cpu_svm_disable(); 600 } 601 602 - static void svm_hardware_disable(void) 603 { 604 /* Make sure we clean up behind us */ 605 if (tsc_scaling) ··· 610 amd_pmu_disable_virt(); 611 } 612 613 - static int svm_hardware_enable(void) 614 { 615 616 struct svm_cpu_data *sd; ··· 696 return; 697 698 kfree(sd->sev_vmcbs); 699 - __free_page(sd->save_area); 700 sd->save_area_pa = 0; 701 sd->save_area = NULL; 702 } ··· 704 static int svm_cpu_init(int cpu) 705 { 706 struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu); 707 int ret = -ENOMEM; 708 709 memset(sd, 0, sizeof(struct svm_cpu_data)); 710 - sd->save_area = snp_safe_alloc_page_node(cpu_to_node(cpu), GFP_KERNEL); 711 - if (!sd->save_area) 712 return ret; 713 714 ret = sev_cpu_init(sd); 715 if (ret) 716 goto free_save_area; 717 718 - sd->save_area_pa = __sme_page_pa(sd->save_area); 719 return 0; 720 721 free_save_area: 722 - __free_page(sd->save_area); 723 - sd->save_area = NULL; 724 return ret; 725 726 } ··· 1125 for_each_possible_cpu(cpu) 1126 svm_cpu_uninit(cpu); 1127 1128 - __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), 1129 - get_order(IOPM_SIZE)); 1130 iopm_base = 0; 1131 } 1132 ··· 1301 if (!kvm_hlt_in_guest(vcpu->kvm)) 1302 svm_set_intercept(svm, INTERCEPT_HLT); 1303 1304 - control->iopm_base_pa = __sme_set(iopm_base); 1305 control->msrpm_base_pa = __sme_set(__pa(svm->msrpm)); 1306 control->int_ctl = V_INTR_MASKING_MASK; 1307 ··· 1503 1504 sev_free_vcpu(vcpu); 1505 1506 - __free_page(pfn_to_page(__sme_clr(svm->vmcb01.pa) >> PAGE_SHIFT)); 1507 __free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE)); 1508 } 1509 ··· 1533 * TSC_AUX is always virtualized for SEV-ES guests when the feature is 1534 * available. The user return MSR support is not required in this case 1535 * because TSC_AUX is restored on #VMEXIT from the host save area 1536 - * (which has been initialized in svm_hardware_enable()). 1537 */ 1538 if (likely(tsc_aux_uret_slot >= 0) && 1539 (!boot_cpu_has(X86_FEATURE_V_TSC_AUX) || !sev_es_guest(vcpu->kvm))) ··· 2825 return kvm_complete_insn_gp(vcpu, ret); 2826 } 2827 2828 - static int svm_get_msr_feature(struct kvm_msr_entry *msr) 2829 { 2830 - msr->data = 0; 2831 2832 - switch (msr->index) { 2833 case MSR_AMD64_DE_CFG: 2834 if (cpu_feature_enabled(X86_FEATURE_LFENCE_RDTSC)) 2835 - msr->data |= MSR_AMD64_DE_CFG_LFENCE_SERIALIZE; 2836 break; 2837 default: 2838 - return KVM_MSR_RET_INVALID; 2839 } 2840 2841 return 0; ··· 3144 * feature is available. The user return MSR support is not 3145 * required in this case because TSC_AUX is restored on #VMEXIT 3146 * from the host save area (which has been initialized in 3147 - * svm_hardware_enable()). 
3148 */ 3149 if (boot_cpu_has(X86_FEATURE_V_TSC_AUX) && sev_es_guest(vcpu->kvm)) 3150 break; ··· 3191 kvm_pr_unimpl_wrmsr(vcpu, ecx, data); 3192 break; 3193 case MSR_AMD64_DE_CFG: { 3194 - struct kvm_msr_entry msr_entry; 3195 3196 - msr_entry.index = msr->index; 3197 - if (svm_get_msr_feature(&msr_entry)) 3198 return 1; 3199 3200 - /* Check the supported bits */ 3201 - if (data & ~msr_entry.data) 3202 return 1; 3203 3204 - /* Don't allow the guest to change a bit, #GP */ 3205 - if (!msr->host_initiated && (data ^ msr_entry.data)) 3206 return 1; 3207 3208 svm->msr_decfg = data; ··· 4159 4160 static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) 4161 { 4162 if (is_guest_mode(vcpu)) 4163 return EXIT_FASTPATH_NONE; 4164 4165 - if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR && 4166 - to_svm(vcpu)->vmcb->control.exit_info_1) 4167 return handle_fastpath_set_msr_irqoff(vcpu); 4168 4169 return EXIT_FASTPATH_NONE; 4170 } ··· 5004 .check_processor_compatibility = svm_check_processor_compat, 5005 5006 .hardware_unsetup = svm_hardware_unsetup, 5007 - .hardware_enable = svm_hardware_enable, 5008 - .hardware_disable = svm_hardware_disable, 5009 .has_emulated_msr = svm_has_emulated_msr, 5010 5011 .vcpu_create = svm_vcpu_create, ··· 5024 .vcpu_unblocking = avic_vcpu_unblocking, 5025 5026 .update_exception_bitmap = svm_update_exception_bitmap, 5027 - .get_msr_feature = svm_get_msr_feature, 5028 .get_msr = svm_get_msr, 5029 .set_msr = svm_set_msr, 5030 .get_segment_base = svm_get_segment_base, ··· 5075 .enable_nmi_window = svm_enable_nmi_window, 5076 .enable_irq_window = svm_enable_irq_window, 5077 .update_cr8_intercept = svm_update_cr8_intercept, 5078 .set_virtual_apic_mode = avic_refresh_virtual_apic_mode, 5079 .refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl, 5080 .apicv_post_state_restore = avic_apicv_post_state_restore, ··· 5281 5282 iopm_va = page_address(iopm_pages); 5283 memset(iopm_va, 0xff, PAGE_SIZE * (1 << order)); 5284 - iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT; 5285 5286 init_msrpm_offsets(); 5287 ··· 5440 static void __svm_exit(void) 5441 { 5442 kvm_x86_vendor_exit(); 5443 - 5444 - cpu_emergency_unregister_virt_callback(svm_emergency_disable); 5445 } 5446 5447 static int __init svm_init(void) ··· 5454 r = kvm_x86_vendor_init(&svm_init_ops); 5455 if (r) 5456 return r; 5457 - 5458 - cpu_emergency_register_virt_callback(svm_emergency_disable); 5459 5460 /* 5461 * Common KVM initialization _must_ come last, after this, /dev/kvm is
··· 573 574 static __always_inline struct sev_es_save_area *sev_es_host_save_area(struct svm_cpu_data *sd) 575 { 576 + return &sd->save_area->host_sev_es_save; 577 } 578 579 static inline void kvm_cpu_svm_disable(void) ··· 592 } 593 } 594 595 + static void svm_emergency_disable_virtualization_cpu(void) 596 { 597 kvm_rebooting = true; 598 599 kvm_cpu_svm_disable(); 600 } 601 602 + static void svm_disable_virtualization_cpu(void) 603 { 604 /* Make sure we clean up behind us */ 605 if (tsc_scaling) ··· 610 amd_pmu_disable_virt(); 611 } 612 613 + static int svm_enable_virtualization_cpu(void) 614 { 615 616 struct svm_cpu_data *sd; ··· 696 return; 697 698 kfree(sd->sev_vmcbs); 699 + __free_page(__sme_pa_to_page(sd->save_area_pa)); 700 sd->save_area_pa = 0; 701 sd->save_area = NULL; 702 } ··· 704 static int svm_cpu_init(int cpu) 705 { 706 struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu); 707 + struct page *save_area_page; 708 int ret = -ENOMEM; 709 710 memset(sd, 0, sizeof(struct svm_cpu_data)); 711 + save_area_page = snp_safe_alloc_page_node(cpu_to_node(cpu), GFP_KERNEL); 712 + if (!save_area_page) 713 return ret; 714 715 ret = sev_cpu_init(sd); 716 if (ret) 717 goto free_save_area; 718 719 + sd->save_area = page_address(save_area_page); 720 + sd->save_area_pa = __sme_page_pa(save_area_page); 721 return 0; 722 723 free_save_area: 724 + __free_page(save_area_page); 725 return ret; 726 727 } ··· 1124 for_each_possible_cpu(cpu) 1125 svm_cpu_uninit(cpu); 1126 1127 + __free_pages(__sme_pa_to_page(iopm_base), get_order(IOPM_SIZE)); 1128 iopm_base = 0; 1129 } 1130 ··· 1301 if (!kvm_hlt_in_guest(vcpu->kvm)) 1302 svm_set_intercept(svm, INTERCEPT_HLT); 1303 1304 + control->iopm_base_pa = iopm_base; 1305 control->msrpm_base_pa = __sme_set(__pa(svm->msrpm)); 1306 control->int_ctl = V_INTR_MASKING_MASK; 1307 ··· 1503 1504 sev_free_vcpu(vcpu); 1505 1506 + __free_page(__sme_pa_to_page(svm->vmcb01.pa)); 1507 __free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE)); 1508 } 1509 ··· 1533 * TSC_AUX is always virtualized for SEV-ES guests when the feature is 1534 * available. The user return MSR support is not required in this case 1535 * because TSC_AUX is restored on #VMEXIT from the host save area 1536 + * (which has been initialized in svm_enable_virtualization_cpu()). 1537 */ 1538 if (likely(tsc_aux_uret_slot >= 0) && 1539 (!boot_cpu_has(X86_FEATURE_V_TSC_AUX) || !sev_es_guest(vcpu->kvm))) ··· 2825 return kvm_complete_insn_gp(vcpu, ret); 2826 } 2827 2828 + static int svm_get_feature_msr(u32 msr, u64 *data) 2829 { 2830 + *data = 0; 2831 2832 + switch (msr) { 2833 case MSR_AMD64_DE_CFG: 2834 if (cpu_feature_enabled(X86_FEATURE_LFENCE_RDTSC)) 2835 + *data |= MSR_AMD64_DE_CFG_LFENCE_SERIALIZE; 2836 break; 2837 default: 2838 + return KVM_MSR_RET_UNSUPPORTED; 2839 } 2840 2841 return 0; ··· 3144 * feature is available. The user return MSR support is not 3145 * required in this case because TSC_AUX is restored on #VMEXIT 3146 * from the host save area (which has been initialized in 3147 + * svm_enable_virtualization_cpu()). 3148 */ 3149 if (boot_cpu_has(X86_FEATURE_V_TSC_AUX) && sev_es_guest(vcpu->kvm)) 3150 break; ··· 3191 kvm_pr_unimpl_wrmsr(vcpu, ecx, data); 3192 break; 3193 case MSR_AMD64_DE_CFG: { 3194 + u64 supported_de_cfg; 3195 3196 + if (svm_get_feature_msr(ecx, &supported_de_cfg)) 3197 return 1; 3198 3199 + if (data & ~supported_de_cfg) 3200 return 1; 3201 3202 + /* 3203 + * Don't let the guest change the host-programmed value. The 3204 + * MSR is very model specific, i.e. 
contains multiple bits that 3205 + * are completely unknown to KVM, and the one bit known to KVM 3206 + * is simply a reflection of hardware capabilities. 3207 + */ 3208 + if (!msr->host_initiated && data != svm->msr_decfg) 3209 return 1; 3210 3211 svm->msr_decfg = data; ··· 4156 4157 static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) 4158 { 4159 + struct vcpu_svm *svm = to_svm(vcpu); 4160 + 4161 if (is_guest_mode(vcpu)) 4162 return EXIT_FASTPATH_NONE; 4163 4164 + switch (svm->vmcb->control.exit_code) { 4165 + case SVM_EXIT_MSR: 4166 + if (!svm->vmcb->control.exit_info_1) 4167 + break; 4168 return handle_fastpath_set_msr_irqoff(vcpu); 4169 + case SVM_EXIT_HLT: 4170 + return handle_fastpath_hlt(vcpu); 4171 + default: 4172 + break; 4173 + } 4174 4175 return EXIT_FASTPATH_NONE; 4176 } ··· 4992 .check_processor_compatibility = svm_check_processor_compat, 4993 4994 .hardware_unsetup = svm_hardware_unsetup, 4995 + .enable_virtualization_cpu = svm_enable_virtualization_cpu, 4996 + .disable_virtualization_cpu = svm_disable_virtualization_cpu, 4997 + .emergency_disable_virtualization_cpu = svm_emergency_disable_virtualization_cpu, 4998 .has_emulated_msr = svm_has_emulated_msr, 4999 5000 .vcpu_create = svm_vcpu_create, ··· 5011 .vcpu_unblocking = avic_vcpu_unblocking, 5012 5013 .update_exception_bitmap = svm_update_exception_bitmap, 5014 + .get_feature_msr = svm_get_feature_msr, 5015 .get_msr = svm_get_msr, 5016 .set_msr = svm_set_msr, 5017 .get_segment_base = svm_get_segment_base, ··· 5062 .enable_nmi_window = svm_enable_nmi_window, 5063 .enable_irq_window = svm_enable_irq_window, 5064 .update_cr8_intercept = svm_update_cr8_intercept, 5065 + 5066 + .x2apic_icr_is_split = true, 5067 .set_virtual_apic_mode = avic_refresh_virtual_apic_mode, 5068 .refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl, 5069 .apicv_post_state_restore = avic_apicv_post_state_restore, ··· 5266 5267 iopm_va = page_address(iopm_pages); 5268 memset(iopm_va, 0xff, PAGE_SIZE * (1 << order)); 5269 + iopm_base = __sme_page_pa(iopm_pages); 5270 5271 init_msrpm_offsets(); 5272 ··· 5425 static void __svm_exit(void) 5426 { 5427 kvm_x86_vendor_exit(); 5428 } 5429 5430 static int __init svm_init(void) ··· 5441 r = kvm_x86_vendor_init(&svm_init_ops); 5442 if (r) 5443 return r; 5444 5445 /* 5446 * Common KVM initialization _must_ come last, after this, /dev/kvm is
+16 -2
arch/x86/kvm/svm/svm.h
··· 25 #include "cpuid.h" 26 #include "kvm_cache_regs.h" 27 28 - #define __sme_page_pa(x) __sme_set(page_to_pfn(x) << PAGE_SHIFT) 29 30 #define IOPM_SIZE PAGE_SIZE * 3 31 #define MSRPM_SIZE PAGE_SIZE * 2 ··· 335 u32 next_asid; 336 u32 min_asid; 337 338 - struct page *save_area; 339 unsigned long save_area_pa; 340 341 struct vmcb *current_vmcb;
··· 25 #include "cpuid.h" 26 #include "kvm_cache_regs.h" 27 28 + /* 29 + * Helpers to convert to/from physical addresses for pages whose address is 30 + * consumed directly by hardware. Even though it's a physical address, SVM 31 + * often restricts the address to the natural width, hence 'unsigned long' 32 + * instead of 'hpa_t'. 33 + */ 34 + static inline unsigned long __sme_page_pa(struct page *page) 35 + { 36 + return __sme_set(page_to_pfn(page) << PAGE_SHIFT); 37 + } 38 + 39 + static inline struct page *__sme_pa_to_page(unsigned long pa) 40 + { 41 + return pfn_to_page(__sme_clr(pa) >> PAGE_SHIFT); 42 + } 43 44 #define IOPM_SIZE PAGE_SIZE * 3 45 #define MSRPM_SIZE PAGE_SIZE * 2 ··· 321 u32 next_asid; 322 u32 min_asid; 323 324 + struct vmcb *save_area; 325 unsigned long save_area_pa; 326 327 struct vmcb *current_vmcb;
+2 -6
arch/x86/kvm/svm/vmenter.S
··· 209 7: vmload %_ASM_AX 210 8: 211 212 - #ifdef CONFIG_MITIGATION_RETPOLINE 213 /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */ 214 - FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE 215 - #endif 216 217 /* Clobbers RAX, RCX, RDX. */ 218 RESTORE_HOST_SPEC_CTRL ··· 346 347 2: cli 348 349 - #ifdef CONFIG_MITIGATION_RETPOLINE 350 /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */ 351 - FILL_RETURN_BUFFER %rax, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE 352 - #endif 353 354 /* Clobbers RAX, RCX, RDX, consumes RDI (@svm) and RSI (@spec_ctrl_intercepted). */ 355 RESTORE_HOST_SPEC_CTRL
··· 209 7: vmload %_ASM_AX 210 8: 211 212 /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */ 213 + FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_VMEXIT 214 215 /* Clobbers RAX, RCX, RDX. */ 216 RESTORE_HOST_SPEC_CTRL ··· 348 349 2: cli 350 351 /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */ 352 + FILL_RETURN_BUFFER %rax, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_VMEXIT 353 354 /* Clobbers RAX, RCX, RDX, consumes RDI (@svm) and RSI (@spec_ctrl_intercepted). */ 355 RESTORE_HOST_SPEC_CTRL
+4 -6
arch/x86/kvm/vmx/capabilities.h
··· 54 }; 55 56 struct vmcs_config { 57 - int size; 58 - u32 basic_cap; 59 - u32 revision_id; 60 u32 pin_based_exec_ctrl; 61 u32 cpu_based_exec_ctrl; 62 u32 cpu_based_2nd_exec_ctrl; ··· 74 75 static inline bool cpu_has_vmx_basic_inout(void) 76 { 77 - return (((u64)vmcs_config.basic_cap << 32) & VMX_BASIC_INOUT); 78 } 79 80 static inline bool cpu_has_virtual_nmis(void) ··· 223 static inline bool cpu_has_vmx_shadow_vmcs(void) 224 { 225 /* check if the cpu supports writing r/o exit information fields */ 226 - if (!(vmcs_config.misc & MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS)) 227 return false; 228 229 return vmcs_config.cpu_based_2nd_exec_ctrl & ··· 365 366 static inline bool cpu_has_vmx_intel_pt(void) 367 { 368 - return (vmcs_config.misc & MSR_IA32_VMX_MISC_INTEL_PT) && 369 (vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_PT_USE_GPA) && 370 (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_RTIT_CTL); 371 }
··· 54 }; 55 56 struct vmcs_config { 57 + u64 basic; 58 u32 pin_based_exec_ctrl; 59 u32 cpu_based_exec_ctrl; 60 u32 cpu_based_2nd_exec_ctrl; ··· 76 77 static inline bool cpu_has_vmx_basic_inout(void) 78 { 79 + return vmcs_config.basic & VMX_BASIC_INOUT; 80 } 81 82 static inline bool cpu_has_virtual_nmis(void) ··· 225 static inline bool cpu_has_vmx_shadow_vmcs(void) 226 { 227 /* check if the cpu supports writing r/o exit information fields */ 228 + if (!(vmcs_config.misc & VMX_MISC_VMWRITE_SHADOW_RO_FIELDS)) 229 return false; 230 231 return vmcs_config.cpu_based_2nd_exec_ctrl & ··· 367 368 static inline bool cpu_has_vmx_intel_pt(void) 369 { 370 + return (vmcs_config.misc & VMX_MISC_INTEL_PT) && 371 (vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_PT_USE_GPA) && 372 (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_RTIT_CTL); 373 }
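Collapsing size/basic_cap/revision_id into a single u64 'basic' works because every consumer can mask or shift the raw MSR value directly, as cpu_has_vmx_basic_inout() now does. The sketch below decodes a fabricated 64-bit basic value; the field positions follow the architectural IA32_VMX_BASIC layout as assumed here and are spelled out only for illustration:

#include <stdint.h>
#include <stdio.h>

/*
 * Assumed field layout of the 64-bit "basic" VMX capability value:
 * revision ID in bits 30:0, VMCS region size in bits 44:32, and the
 * INS/OUTS-info and "true controls" feature bits at 54 and 55.
 */
#define TOY_BASIC_REVISION(b)   ((uint32_t)((b) & 0x7fffffff))     /* bits 30:0  */
#define TOY_BASIC_VMCS_SIZE(b)  ((uint32_t)(((b) >> 32) & 0x1fff)) /* bits 44:32 */
#define TOY_BASIC_INOUT         (1ULL << 54)
#define TOY_BASIC_TRUE_CTLS     (1ULL << 55)

int main(void)
{
        /* Build a fake "basic" value: revision 1, 4 KiB VMCS, INS/OUTS info. */
        uint64_t basic = 1ULL | (0x1000ULL << 32) | TOY_BASIC_INOUT;

        printf("revision id : %u\n", TOY_BASIC_REVISION(basic));
        printf("vmcs size   : %u bytes\n", TOY_BASIC_VMCS_SIZE(basic));
        printf("has inout   : %d\n", !!(basic & TOY_BASIC_INOUT));
        printf("true ctls   : %d\n", !!(basic & TOY_BASIC_TRUE_CTLS));
        return 0;
}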
+7 -3
arch/x86/kvm/vmx/main.c
··· 23 24 .hardware_unsetup = vmx_hardware_unsetup, 25 26 - .hardware_enable = vmx_hardware_enable, 27 - .hardware_disable = vmx_hardware_disable, 28 .has_emulated_msr = vmx_has_emulated_msr, 29 30 .vm_size = sizeof(struct kvm_vmx), ··· 43 .vcpu_put = vmx_vcpu_put, 44 45 .update_exception_bitmap = vmx_update_exception_bitmap, 46 - .get_msr_feature = vmx_get_msr_feature, 47 .get_msr = vmx_get_msr, 48 .set_msr = vmx_set_msr, 49 .get_segment_base = vmx_get_segment_base, ··· 91 .enable_nmi_window = vmx_enable_nmi_window, 92 .enable_irq_window = vmx_enable_irq_window, 93 .update_cr8_intercept = vmx_update_cr8_intercept, 94 .set_virtual_apic_mode = vmx_set_virtual_apic_mode, 95 .set_apic_access_page_addr = vmx_set_apic_access_page_addr, 96 .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
··· 23 24 .hardware_unsetup = vmx_hardware_unsetup, 25 26 + .enable_virtualization_cpu = vmx_enable_virtualization_cpu, 27 + .disable_virtualization_cpu = vmx_disable_virtualization_cpu, 28 + .emergency_disable_virtualization_cpu = vmx_emergency_disable_virtualization_cpu, 29 + 30 .has_emulated_msr = vmx_has_emulated_msr, 31 32 .vm_size = sizeof(struct kvm_vmx), ··· 41 .vcpu_put = vmx_vcpu_put, 42 43 .update_exception_bitmap = vmx_update_exception_bitmap, 44 + .get_feature_msr = vmx_get_feature_msr, 45 .get_msr = vmx_get_msr, 46 .set_msr = vmx_set_msr, 47 .get_segment_base = vmx_get_segment_base, ··· 89 .enable_nmi_window = vmx_enable_nmi_window, 90 .enable_irq_window = vmx_enable_irq_window, 91 .update_cr8_intercept = vmx_update_cr8_intercept, 92 + 93 + .x2apic_icr_is_split = false, 94 .set_virtual_apic_mode = vmx_set_virtual_apic_mode, 95 .set_apic_access_page_addr = vmx_set_apic_access_page_addr, 96 .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
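The new x2apic_icr_is_split hook records whether the 64-bit ICR is stored as one value at the ICR offset (Intel, .x2apic_icr_is_split = false here) or split across the legacy ICR and ICR2 slots (AMD, = true in the svm.c hunk). The toy sketch below round-trips a 64-bit ICR under both layouts; the 0x300/0x310 offsets follow the xAPIC register map and, like the toy_* names, are only illustrative:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define TOY_APIC_ICR  0x300   /* xAPIC ICR (low) offset, for illustration */
#define TOY_APIC_ICR2 0x310   /* xAPIC ICR2 (high) offset */

struct toy_apic_page { uint8_t regs[4096]; };

static void toy_write_icr(struct toy_apic_page *p, uint64_t icr, int split)
{
        if (split) {
                /* AMD-style: low half at ICR, high half at ICR2. */
                uint32_t lo = (uint32_t)icr, hi = (uint32_t)(icr >> 32);
                memcpy(&p->regs[TOY_APIC_ICR], &lo, sizeof(lo));
                memcpy(&p->regs[TOY_APIC_ICR2], &hi, sizeof(hi));
        } else {
                /* Intel-style: full 64-bit value at the ICR offset. */
                memcpy(&p->regs[TOY_APIC_ICR], &icr, sizeof(icr));
        }
}

static uint64_t toy_read_icr(const struct toy_apic_page *p, int split)
{
        uint64_t icr = 0;

        if (split) {
                uint32_t lo, hi;
                memcpy(&lo, &p->regs[TOY_APIC_ICR], sizeof(lo));
                memcpy(&hi, &p->regs[TOY_APIC_ICR2], sizeof(hi));
                icr = ((uint64_t)hi << 32) | lo;
        } else {
                memcpy(&icr, &p->regs[TOY_APIC_ICR], sizeof(icr));
        }
        return icr;
}

int main(void)
{
        struct toy_apic_page apic = {{0}};
        uint64_t icr = 0x12345678000000fdULL;
        int split;

        /* Either layout must round-trip the full 64-bit value. */
        for (split = 0; split <= 1; split++) {
                toy_write_icr(&apic, icr, split);
                assert(toy_read_icr(&apic, split) == icr);
        }
        printf("both layouts round-trip the 64-bit ICR\n");
        return 0;
}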
+96 -38
arch/x86/kvm/vmx/nested.c
··· 981 __func__, i, e.index, e.reserved); 982 goto fail; 983 } 984 - if (kvm_set_msr(vcpu, e.index, e.value)) { 985 pr_debug_ratelimited( 986 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", 987 __func__, i, e.index, e.value); ··· 1017 } 1018 } 1019 1020 - if (kvm_get_msr(vcpu, msr_index, data)) { 1021 pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__, 1022 msr_index); 1023 return false; ··· 1112 /* 1113 * Emulated VMEntry does not fail here. Instead a less 1114 * accurate value will be returned by 1115 - * nested_vmx_get_vmexit_msr_value() using kvm_get_msr() 1116 - * instead of reading the value from the vmcs02 VMExit 1117 - * MSR-store area. 1118 */ 1119 pr_warn_ratelimited( 1120 "Not enough msr entries in msr_autostore. Can't add msr %x\n", ··· 1251 1252 static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data) 1253 { 1254 - const u64 feature_and_reserved = 1255 - /* feature (except bit 48; see below) */ 1256 - BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) | 1257 - /* reserved */ 1258 - BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56); 1259 u64 vmx_basic = vmcs_config.nested.basic; 1260 1261 - if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved)) 1262 return -EINVAL; 1263 1264 /* 1265 * KVM does not emulate a version of VMX that constrains physical 1266 * addresses of VMX structures (e.g. VMCS) to 32-bits. 1267 */ 1268 - if (data & BIT_ULL(48)) 1269 return -EINVAL; 1270 1271 if (vmx_basic_vmcs_revision_id(vmx_basic) != ··· 1345 1346 static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data) 1347 { 1348 - const u64 feature_and_reserved_bits = 1349 - /* feature */ 1350 - BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) | 1351 - BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) | 1352 - /* reserved */ 1353 - GENMASK_ULL(13, 9) | BIT_ULL(31); 1354 u64 vmx_misc = vmx_control_msr(vmcs_config.nested.misc_low, 1355 vmcs_config.nested.misc_high); 1356 1357 - if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits)) 1358 return -EINVAL; 1359 1360 if ((vmx->nested.msrs.pinbased_ctls_high & ··· 2341 2342 /* Posted interrupts setting is only taken from vmcs12. 
*/ 2343 vmx->nested.pi_pending = false; 2344 - if (nested_cpu_has_posted_intr(vmcs12)) 2345 vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; 2346 - else 2347 exec_control &= ~PIN_BASED_POSTED_INTR; 2348 pin_controls_set(vmx, exec_control); 2349 2350 /* ··· 2496 2497 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & 2498 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) { 2499 vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); 2500 vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); 2501 vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector); ··· 2534 vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); 2535 vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); 2536 2537 - vmx->segment_cache.bitmask = 0; 2538 } 2539 2540 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & ··· 4311 } 4312 4313 if (kvm_cpu_has_interrupt(vcpu) && !vmx_interrupt_blocked(vcpu)) { 4314 if (block_nested_events) 4315 return -EBUSY; 4316 if (!nested_exit_on_intr(vcpu)) 4317 goto no_vmexit; 4318 - nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); 4319 return 0; 4320 } 4321 ··· 4874 goto vmabort; 4875 } 4876 4877 - if (kvm_set_msr(vcpu, h.index, h.value)) { 4878 pr_debug_ratelimited( 4879 "%s WRMSR failed (%u, 0x%x, 0x%llx)\n", 4880 __func__, j, h.index, h.value); ··· 5037 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 5038 5039 if (likely(!vmx->fail)) { 5040 - if ((u16)vm_exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT && 5041 - nested_exit_intr_ack_set(vcpu)) { 5042 - int irq = kvm_cpu_get_interrupt(vcpu); 5043 - WARN_ON(irq < 0); 5044 - vmcs12->vm_exit_intr_info = irq | 5045 - INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR; 5046 - } 5047 - 5048 if (vm_exit_reason != -1) 5049 trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, 5050 vmcs12->exit_qualification, ··· 7111 { 7112 msrs->misc_low = (u32)vmcs_conf->misc & VMX_MISC_SAVE_EFER_LMA; 7113 msrs->misc_low |= 7114 - MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS | 7115 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | 7116 VMX_MISC_ACTIVITY_HLT | 7117 VMX_MISC_ACTIVITY_WAIT_SIPI; ··· 7126 * guest, and the VMCS structure we give it - not about the 7127 * VMX support of the underlying hardware. 7128 */ 7129 - msrs->basic = 7130 - VMCS12_REVISION | 7131 - VMX_BASIC_TRUE_CTLS | 7132 - ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) | 7133 - (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT); 7134 7135 if (cpu_has_vmx_basic_inout()) 7136 msrs->basic |= VMX_BASIC_INOUT; 7137 }
··· 981 __func__, i, e.index, e.reserved); 982 goto fail; 983 } 984 + if (kvm_set_msr_with_filter(vcpu, e.index, e.value)) { 985 pr_debug_ratelimited( 986 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", 987 __func__, i, e.index, e.value); ··· 1017 } 1018 } 1019 1020 + if (kvm_get_msr_with_filter(vcpu, msr_index, data)) { 1021 pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__, 1022 msr_index); 1023 return false; ··· 1112 /* 1113 * Emulated VMEntry does not fail here. Instead a less 1114 * accurate value will be returned by 1115 + * nested_vmx_get_vmexit_msr_value() by reading KVM's 1116 + * internal MSR state instead of reading the value from 1117 + * the vmcs02 VMExit MSR-store area. 1118 */ 1119 pr_warn_ratelimited( 1120 "Not enough msr entries in msr_autostore. Can't add msr %x\n", ··· 1251 1252 static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data) 1253 { 1254 + const u64 feature_bits = VMX_BASIC_DUAL_MONITOR_TREATMENT | 1255 + VMX_BASIC_INOUT | 1256 + VMX_BASIC_TRUE_CTLS; 1257 + 1258 + const u64 reserved_bits = GENMASK_ULL(63, 56) | 1259 + GENMASK_ULL(47, 45) | 1260 + BIT_ULL(31); 1261 + 1262 u64 vmx_basic = vmcs_config.nested.basic; 1263 1264 + BUILD_BUG_ON(feature_bits & reserved_bits); 1265 + 1266 + /* 1267 + * Except for 32BIT_PHYS_ADDR_ONLY, which is an anti-feature bit (has 1268 + * inverted polarity), the incoming value must not set feature bits or 1269 + * reserved bits that aren't allowed/supported by KVM. Fields, i.e. 1270 + * multi-bit values, are explicitly checked below. 1271 + */ 1272 + if (!is_bitwise_subset(vmx_basic, data, feature_bits | reserved_bits)) 1273 return -EINVAL; 1274 1275 /* 1276 * KVM does not emulate a version of VMX that constrains physical 1277 * addresses of VMX structures (e.g. VMCS) to 32-bits. 1278 */ 1279 + if (data & VMX_BASIC_32BIT_PHYS_ADDR_ONLY) 1280 return -EINVAL; 1281 1282 if (vmx_basic_vmcs_revision_id(vmx_basic) != ··· 1334 1335 static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data) 1336 { 1337 + const u64 feature_bits = VMX_MISC_SAVE_EFER_LMA | 1338 + VMX_MISC_ACTIVITY_HLT | 1339 + VMX_MISC_ACTIVITY_SHUTDOWN | 1340 + VMX_MISC_ACTIVITY_WAIT_SIPI | 1341 + VMX_MISC_INTEL_PT | 1342 + VMX_MISC_RDMSR_IN_SMM | 1343 + VMX_MISC_VMWRITE_SHADOW_RO_FIELDS | 1344 + VMX_MISC_VMXOFF_BLOCK_SMI | 1345 + VMX_MISC_ZERO_LEN_INS; 1346 + 1347 + const u64 reserved_bits = BIT_ULL(31) | GENMASK_ULL(13, 9); 1348 + 1349 u64 vmx_misc = vmx_control_msr(vmcs_config.nested.misc_low, 1350 vmcs_config.nested.misc_high); 1351 1352 + BUILD_BUG_ON(feature_bits & reserved_bits); 1353 + 1354 + /* 1355 + * The incoming value must not set feature bits or reserved bits that 1356 + * aren't allowed/supported by KVM. Fields, i.e. multi-bit values, are 1357 + * explicitly checked below. 1358 + */ 1359 + if (!is_bitwise_subset(vmx_misc, data, feature_bits | reserved_bits)) 1360 return -EINVAL; 1361 1362 if ((vmx->nested.msrs.pinbased_ctls_high & ··· 2317 2318 /* Posted interrupts setting is only taken from vmcs12. 
*/ 2319 vmx->nested.pi_pending = false; 2320 + if (nested_cpu_has_posted_intr(vmcs12)) { 2321 vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; 2322 + } else { 2323 + vmx->nested.posted_intr_nv = -1; 2324 exec_control &= ~PIN_BASED_POSTED_INTR; 2325 + } 2326 pin_controls_set(vmx, exec_control); 2327 2328 /* ··· 2470 2471 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & 2472 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) { 2473 + 2474 vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); 2475 vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); 2476 vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector); ··· 2507 vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); 2508 vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); 2509 2510 + vmx_segment_cache_clear(vmx); 2511 } 2512 2513 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & ··· 4284 } 4285 4286 if (kvm_cpu_has_interrupt(vcpu) && !vmx_interrupt_blocked(vcpu)) { 4287 + int irq; 4288 + 4289 if (block_nested_events) 4290 return -EBUSY; 4291 if (!nested_exit_on_intr(vcpu)) 4292 goto no_vmexit; 4293 + 4294 + if (!nested_exit_intr_ack_set(vcpu)) { 4295 + nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); 4296 + return 0; 4297 + } 4298 + 4299 + irq = kvm_cpu_get_extint(vcpu); 4300 + if (irq != -1) { 4301 + nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 4302 + INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | irq, 0); 4303 + return 0; 4304 + } 4305 + 4306 + irq = kvm_apic_has_interrupt(vcpu); 4307 + if (WARN_ON_ONCE(irq < 0)) 4308 + goto no_vmexit; 4309 + 4310 + /* 4311 + * If the IRQ is L2's PI notification vector, process posted 4312 + * interrupts for L2 instead of injecting VM-Exit, as the 4313 + * detection/morphing architecturally occurs when the IRQ is 4314 + * delivered to the CPU. Note, only interrupts that are routed 4315 + * through the local APIC trigger posted interrupt processing, 4316 + * and enabling posted interrupts requires ACK-on-exit. 4317 + */ 4318 + if (irq == vmx->nested.posted_intr_nv) { 4319 + vmx->nested.pi_pending = true; 4320 + kvm_apic_clear_irr(vcpu, irq); 4321 + goto no_vmexit; 4322 + } 4323 + 4324 + nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 4325 + INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | irq, 0); 4326 + 4327 + /* 4328 + * ACK the interrupt _after_ emulating VM-Exit, as the IRQ must 4329 + * be marked as in-service in vmcs01.GUEST_INTERRUPT_STATUS.SVI 4330 + * if APICv is active. 4331 + */ 4332 + kvm_apic_ack_interrupt(vcpu, irq); 4333 return 0; 4334 } 4335 ··· 4806 goto vmabort; 4807 } 4808 4809 + if (kvm_set_msr_with_filter(vcpu, h.index, h.value)) { 4810 pr_debug_ratelimited( 4811 "%s WRMSR failed (%u, 0x%x, 0x%llx)\n", 4812 __func__, j, h.index, h.value); ··· 4969 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 4970 4971 if (likely(!vmx->fail)) { 4972 if (vm_exit_reason != -1) 4973 trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, 4974 vmcs12->exit_qualification, ··· 7051 { 7052 msrs->misc_low = (u32)vmcs_conf->misc & VMX_MISC_SAVE_EFER_LMA; 7053 msrs->misc_low |= 7054 + VMX_MISC_VMWRITE_SHADOW_RO_FIELDS | 7055 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | 7056 VMX_MISC_ACTIVITY_HLT | 7057 VMX_MISC_ACTIVITY_WAIT_SIPI; ··· 7066 * guest, and the VMCS structure we give it - not about the 7067 * VMX support of the underlying hardware. 7068 */ 7069 + msrs->basic = vmx_basic_encode_vmcs_info(VMCS12_REVISION, VMCS12_SIZE, 7070 + X86_MEMTYPE_WB); 7071 7072 + msrs->basic |= VMX_BASIC_TRUE_CTLS; 7073 if (cpu_has_vmx_basic_inout()) 7074 msrs->basic |= VMX_BASIC_INOUT; 7075 }
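The rewritten vmx_restore_vmx_basic() and vmx_restore_vmx_misc() above apply the same rule: within the checked feature and reserved bits, a value restored by userspace may not claim anything that KVM's own capability value does not already claim. A tiny standalone sketch of that rule; the helper name here is illustrative (KVM's version is is_bitwise_subset() in nested.c):

#include <stdbool.h>
#include <stdint.h>

/*
 * Within @mask, @data may only set bits that @supported also sets.
 * Multi-bit fields are excluded from @mask and validated separately.
 */
static bool restore_value_is_allowed(uint64_t supported, uint64_t data,
                                     uint64_t mask)
{
    return ((supported | data) & mask) == (supported & mask);
}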
+7 -1
arch/x86/kvm/vmx/nested.h
···
 
 static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
 {
+    lockdep_assert_once(lockdep_is_held(&vcpu->mutex) ||
+                        !refcount_read(&vcpu->kvm->users_count));
+
     return to_vmx(vcpu)->nested.cached_vmcs12;
 }
 
 static inline struct vmcs12 *get_shadow_vmcs12(struct kvm_vcpu *vcpu)
 {
+    lockdep_assert_once(lockdep_is_held(&vcpu->mutex) ||
+                        !refcount_read(&vcpu->kvm->users_count));
+
     return to_vmx(vcpu)->nested.cached_shadow_vmcs12;
 }
 
···
 static inline bool nested_cpu_has_vmwrite_any_field(struct kvm_vcpu *vcpu)
 {
     return to_vmx(vcpu)->nested.msrs.misc_low &
-        MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS;
+        VMX_MISC_VMWRITE_SHADOW_RO_FIELDS;
 }
 
 static inline bool nested_cpu_has_zero_length_injection(struct kvm_vcpu *vcpu)
+1 -1
arch/x86/kvm/vmx/sgx.c
···
      * simultaneously set SGX_ATTR_PROVISIONKEY to bypass the check to
      * enforce restriction of access to the PROVISIONKEY.
      */
-    contents = (struct sgx_secs *)__get_free_page(GFP_KERNEL_ACCOUNT);
+    contents = (struct sgx_secs *)__get_free_page(GFP_KERNEL);
     if (!contents)
         return -ENOMEM;
 
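The one-line sgx.c change drops __GFP_ACCOUNT because the page holds a short-lived scratch copy of the guest's SECS during ECREATE emulation and is freed before the handler returns, so charging it to the task's memory cgroup buys nothing. For reference, a sketch of the distinction; this is not code from the series:

#include <linux/gfp.h>

/*
 * GFP_KERNEL_ACCOUNT is GFP_KERNEL | __GFP_ACCOUNT: the allocation is charged
 * to the caller's memory cgroup, which is only worthwhile for allocations
 * that outlive the current operation.
 */
static void *alloc_scratch_page(void)
{
    return (void *)__get_free_page(GFP_KERNEL);          /* transient, not charged */
}

static void *alloc_long_lived_page(void)
{
    return (void *)__get_free_page(GFP_KERNEL_ACCOUNT);  /* charged to the memcg */
}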
+35 -32
arch/x86/kvm/vmx/vmx.c
··· 525 VMX_SEGMENT_FIELD(LDTR), 526 }; 527 528 - static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx) 529 - { 530 - vmx->segment_cache.bitmask = 0; 531 - } 532 533 static unsigned long host_idt_base; 534 ··· 751 return -EIO; 752 } 753 754 - static void vmx_emergency_disable(void) 755 { 756 int cpu = raw_smp_processor_id(); 757 struct loaded_vmcs *v; ··· 1994 return !(msr->data & ~valid_bits); 1995 } 1996 1997 - int vmx_get_msr_feature(struct kvm_msr_entry *msr) 1998 { 1999 - switch (msr->index) { 2000 case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: 2001 if (!nested) 2002 return 1; 2003 - return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data); 2004 default: 2005 - return KVM_MSR_RET_INVALID; 2006 } 2007 } 2008 ··· 2601 static int setup_vmcs_config(struct vmcs_config *vmcs_conf, 2602 struct vmx_capability *vmx_cap) 2603 { 2604 - u32 vmx_msr_low, vmx_msr_high; 2605 u32 _pin_based_exec_control = 0; 2606 u32 _cpu_based_exec_control = 0; 2607 u32 _cpu_based_2nd_exec_control = 0; 2608 u64 _cpu_based_3rd_exec_control = 0; 2609 u32 _vmexit_control = 0; 2610 u32 _vmentry_control = 0; 2611 u64 misc_msr; 2612 int i; 2613 ··· 2730 _vmexit_control &= ~x_ctrl; 2731 } 2732 2733 - rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high); 2734 2735 /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */ 2736 - if ((vmx_msr_high & 0x1fff) > PAGE_SIZE) 2737 return -EIO; 2738 2739 #ifdef CONFIG_X86_64 2740 - /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */ 2741 - if (vmx_msr_high & (1u<<16)) 2742 return -EIO; 2743 #endif 2744 2745 /* Require Write-Back (WB) memory type for VMCS accesses. */ 2746 - if (((vmx_msr_high >> 18) & 15) != 6) 2747 return -EIO; 2748 2749 rdmsrl(MSR_IA32_VMX_MISC, misc_msr); 2750 2751 - vmcs_conf->size = vmx_msr_high & 0x1fff; 2752 - vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff; 2753 - 2754 - vmcs_conf->revision_id = vmx_msr_low; 2755 - 2756 vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; 2757 vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; 2758 vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control; ··· 2840 return -EFAULT; 2841 } 2842 2843 - int vmx_hardware_enable(void) 2844 { 2845 int cpu = raw_smp_processor_id(); 2846 u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); ··· 2877 __loaded_vmcs_clear(v); 2878 } 2879 2880 - void vmx_hardware_disable(void) 2881 { 2882 vmclear_local_loaded_vmcss(); 2883 ··· 2899 if (!pages) 2900 return NULL; 2901 vmcs = page_address(pages); 2902 - memset(vmcs, 0, vmcs_config.size); 2903 2904 /* KVM supports Enlightened VMCS v1 only */ 2905 if (kvm_is_using_evmcs()) 2906 vmcs->hdr.revision_id = KVM_EVMCS_VERSION; 2907 else 2908 - vmcs->hdr.revision_id = vmcs_config.revision_id; 2909 2910 if (shadow) 2911 vmcs->hdr.shadow_vmcs = 1; ··· 2998 * physical CPU. 2999 */ 3000 if (kvm_is_using_evmcs()) 3001 - vmcs->hdr.revision_id = vmcs_config.revision_id; 3002 3003 per_cpu(vmxarea, cpu) = vmcs; 3004 } ··· 4215 { 4216 struct vcpu_vmx *vmx = to_vmx(vcpu); 4217 4218 if (is_guest_mode(vcpu) && 4219 vector == vmx->nested.posted_intr_nv) { 4220 /* ··· 5807 error_code |= (exit_qualification & EPT_VIOLATION_RWX_MASK) 5808 ? PFERR_PRESENT_MASK : 0; 5809 5810 - error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) != 0 ? 
5811 - PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK; 5812 5813 /* 5814 * Check that the GPA doesn't exceed physical memory limits, as that is ··· 7269 return handle_fastpath_set_msr_irqoff(vcpu); 7270 case EXIT_REASON_PREEMPTION_TIMER: 7271 return handle_fastpath_preemption_timer(vcpu, force_immediate_exit); 7272 default: 7273 return EXIT_FASTPATH_NONE; 7274 } ··· 7971 kvm_cpu_cap_clear(X86_FEATURE_SGX_LC); 7972 kvm_cpu_cap_clear(X86_FEATURE_SGX1); 7973 kvm_cpu_cap_clear(X86_FEATURE_SGX2); 7974 } 7975 7976 if (vmx_umip_emulated()) ··· 8522 u64 use_timer_freq = 5000ULL * 1000 * 1000; 8523 8524 cpu_preemption_timer_multi = 8525 - vmcs_config.misc & VMX_MISC_PREEMPTION_TIMER_RATE_MASK; 8526 8527 if (tsc_khz) 8528 use_timer_freq = (u64)tsc_khz * 1000; ··· 8589 { 8590 allow_smaller_maxphyaddr = false; 8591 8592 - cpu_emergency_unregister_virt_callback(vmx_emergency_disable); 8593 - 8594 vmx_cleanup_l1d_flush(); 8595 } 8596 ··· 8634 8635 pi_init_cpu(cpu); 8636 } 8637 - 8638 - cpu_emergency_register_virt_callback(vmx_emergency_disable); 8639 8640 vmx_check_vmcs12_offsets(); 8641
··· 525 VMX_SEGMENT_FIELD(LDTR), 526 }; 527 528 529 static unsigned long host_idt_base; 530 ··· 755 return -EIO; 756 } 757 758 + void vmx_emergency_disable_virtualization_cpu(void) 759 { 760 int cpu = raw_smp_processor_id(); 761 struct loaded_vmcs *v; ··· 1998 return !(msr->data & ~valid_bits); 1999 } 2000 2001 + int vmx_get_feature_msr(u32 msr, u64 *data) 2002 { 2003 + switch (msr) { 2004 case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: 2005 if (!nested) 2006 return 1; 2007 + return vmx_get_vmx_msr(&vmcs_config.nested, msr, data); 2008 default: 2009 + return KVM_MSR_RET_UNSUPPORTED; 2010 } 2011 } 2012 ··· 2605 static int setup_vmcs_config(struct vmcs_config *vmcs_conf, 2606 struct vmx_capability *vmx_cap) 2607 { 2608 u32 _pin_based_exec_control = 0; 2609 u32 _cpu_based_exec_control = 0; 2610 u32 _cpu_based_2nd_exec_control = 0; 2611 u64 _cpu_based_3rd_exec_control = 0; 2612 u32 _vmexit_control = 0; 2613 u32 _vmentry_control = 0; 2614 + u64 basic_msr; 2615 u64 misc_msr; 2616 int i; 2617 ··· 2734 _vmexit_control &= ~x_ctrl; 2735 } 2736 2737 + rdmsrl(MSR_IA32_VMX_BASIC, basic_msr); 2738 2739 /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */ 2740 + if (vmx_basic_vmcs_size(basic_msr) > PAGE_SIZE) 2741 return -EIO; 2742 2743 #ifdef CONFIG_X86_64 2744 + /* 2745 + * KVM expects to be able to shove all legal physical addresses into 2746 + * VMCS fields for 64-bit kernels, and per the SDM, "This bit is always 2747 + * 0 for processors that support Intel 64 architecture". 2748 + */ 2749 + if (basic_msr & VMX_BASIC_32BIT_PHYS_ADDR_ONLY) 2750 return -EIO; 2751 #endif 2752 2753 /* Require Write-Back (WB) memory type for VMCS accesses. */ 2754 + if (vmx_basic_vmcs_mem_type(basic_msr) != X86_MEMTYPE_WB) 2755 return -EIO; 2756 2757 rdmsrl(MSR_IA32_VMX_MISC, misc_msr); 2758 2759 + vmcs_conf->basic = basic_msr; 2760 vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; 2761 vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; 2762 vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control; ··· 2844 return -EFAULT; 2845 } 2846 2847 + int vmx_enable_virtualization_cpu(void) 2848 { 2849 int cpu = raw_smp_processor_id(); 2850 u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); ··· 2881 __loaded_vmcs_clear(v); 2882 } 2883 2884 + void vmx_disable_virtualization_cpu(void) 2885 { 2886 vmclear_local_loaded_vmcss(); 2887 ··· 2903 if (!pages) 2904 return NULL; 2905 vmcs = page_address(pages); 2906 + memset(vmcs, 0, vmx_basic_vmcs_size(vmcs_config.basic)); 2907 2908 /* KVM supports Enlightened VMCS v1 only */ 2909 if (kvm_is_using_evmcs()) 2910 vmcs->hdr.revision_id = KVM_EVMCS_VERSION; 2911 else 2912 + vmcs->hdr.revision_id = vmx_basic_vmcs_revision_id(vmcs_config.basic); 2913 2914 if (shadow) 2915 vmcs->hdr.shadow_vmcs = 1; ··· 3002 * physical CPU. 3003 */ 3004 if (kvm_is_using_evmcs()) 3005 + vmcs->hdr.revision_id = vmx_basic_vmcs_revision_id(vmcs_config.basic); 3006 3007 per_cpu(vmxarea, cpu) = vmcs; 3008 } ··· 4219 { 4220 struct vcpu_vmx *vmx = to_vmx(vcpu); 4221 4222 + /* 4223 + * DO NOT query the vCPU's vmcs12, as vmcs12 is dynamically allocated 4224 + * and freed, and must not be accessed outside of vcpu->mutex. The 4225 + * vCPU's cached PI NV is valid if and only if posted interrupts 4226 + * enabled in its vmcs12, i.e. checking the vector also checks that 4227 + * L1 has enabled posted interrupts for L2. 4228 + */ 4229 if (is_guest_mode(vcpu) && 4230 vector == vmx->nested.posted_intr_nv) { 4231 /* ··· 5804 error_code |= (exit_qualification & EPT_VIOLATION_RWX_MASK) 5805 ? 
PFERR_PRESENT_MASK : 0; 5806 5807 + if (error_code & EPT_VIOLATION_GVA_IS_VALID) 5808 + error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) ? 5809 + PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK; 5810 5811 /* 5812 * Check that the GPA doesn't exceed physical memory limits, as that is ··· 7265 return handle_fastpath_set_msr_irqoff(vcpu); 7266 case EXIT_REASON_PREEMPTION_TIMER: 7267 return handle_fastpath_preemption_timer(vcpu, force_immediate_exit); 7268 + case EXIT_REASON_HLT: 7269 + return handle_fastpath_hlt(vcpu); 7270 default: 7271 return EXIT_FASTPATH_NONE; 7272 } ··· 7965 kvm_cpu_cap_clear(X86_FEATURE_SGX_LC); 7966 kvm_cpu_cap_clear(X86_FEATURE_SGX1); 7967 kvm_cpu_cap_clear(X86_FEATURE_SGX2); 7968 + kvm_cpu_cap_clear(X86_FEATURE_SGX_EDECCSSA); 7969 } 7970 7971 if (vmx_umip_emulated()) ··· 8515 u64 use_timer_freq = 5000ULL * 1000 * 1000; 8516 8517 cpu_preemption_timer_multi = 8518 + vmx_misc_preemption_timer_rate(vmcs_config.misc); 8519 8520 if (tsc_khz) 8521 use_timer_freq = (u64)tsc_khz * 1000; ··· 8582 { 8583 allow_smaller_maxphyaddr = false; 8584 8585 vmx_cleanup_l1d_flush(); 8586 } 8587 ··· 8629 8630 pi_init_cpu(cpu); 8631 } 8632 8633 vmx_check_vmcs12_offsets(); 8634
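setup_vmcs_config() now stores the raw MSR_IA32_VMX_BASIC value in vmcs_conf->basic and extracts fields through helpers, instead of caching the size, revision and capability bits separately. Per the SDM, the layout of that MSR is: bits 30:0 VMCS revision ID, bits 44:32 VMCS region size, bit 48 "32-bit physical addresses only", bits 53:50 memory type (6 = write-back), bit 54 INS/OUTS reporting, bit 55 "true" controls. A standalone sketch of such accessors with illustrative names (the helpers used in the diff are defined elsewhere in the tree):

#include <stdint.h>

/* Illustrative accessors for the MSR_IA32_VMX_BASIC layout. */
static inline uint32_t basic_vmcs_revision_id(uint64_t basic)
{
    return basic & 0x7fffffff;        /* bits 30:0 */
}

static inline uint32_t basic_vmcs_size(uint64_t basic)
{
    return (basic >> 32) & 0x1fff;    /* bits 44:32, at most 4096 */
}

static inline uint32_t basic_vmcs_mem_type(uint64_t basic)
{
    return (basic >> 50) & 0xf;       /* bits 53:50, 6 == write-back */
}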
+5 -4
arch/x86/kvm/vmx/vmx.h
···
 #include "run_flags.h"
 #include "../mmu.h"
 
-#define MSR_TYPE_R    1
-#define MSR_TYPE_W    2
-#define MSR_TYPE_RW   3
-
 #define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4))
 
 #ifdef CONFIG_X86_64
···
 static inline bool vmx_can_use_ipiv(struct kvm_vcpu *vcpu)
 {
     return lapic_in_kernel(vcpu) && enable_ipiv;
+}
+
+static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
+{
+    vmx->segment_cache.bitmask = 0;
 }
 
 #endif /* __KVM_X86_VMX_H */
+8
arch/x86/kvm/vmx/vmx_onhyperv.h
···
     struct hv_vp_assist_page *vp_ap =
         hv_get_vp_assist_page(smp_processor_id());
 
+    /*
+     * When enabling eVMCS, KVM verifies that every CPU has a valid hv_vp_assist_page()
+     * and aborts enabling the feature otherwise. CPU onlining path is also checked in
+     * vmx_hardware_enable().
+     */
+    if (KVM_BUG_ON(!vp_ap, kvm_get_running_vcpu()->kvm))
+        return;
+
     if (current_evmcs->hv_enlightenments_control.nested_flush_hypercall)
         vp_ap->nested_control.features.directhypercall = 1;
     vp_ap->current_nested_vmcs = phys_addr;
+1 -1
arch/x86/kvm/vmx/vmx_ops.h
···
     BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
                      "16-bit accessor invalid for 64-bit high field");
     BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
-                     "16-bit accessor invalid for 32-bit high field");
+                     "16-bit accessor invalid for 32-bit field");
     BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
                      "16-bit accessor invalid for natural width field");
 }
+4 -3
arch/x86/kvm/vmx/x86_ops.h
···
 
 void vmx_hardware_unsetup(void);
 int vmx_check_processor_compat(void);
-int vmx_hardware_enable(void);
-void vmx_hardware_disable(void);
+int vmx_enable_virtualization_cpu(void);
+void vmx_disable_virtualization_cpu(void);
+void vmx_emergency_disable_virtualization_cpu(void);
 int vmx_vm_init(struct kvm *kvm);
 void vmx_vm_destroy(struct kvm *kvm);
 int vmx_vcpu_precreate(struct kvm *kvm);
···
 void vmx_msr_filter_changed(struct kvm_vcpu *vcpu);
 void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu);
 void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu);
-int vmx_get_msr_feature(struct kvm_msr_entry *msr);
+int vmx_get_feature_msr(u32 msr, u64 *data);
 int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
 u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg);
 void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
+484 -522
arch/x86/kvm/x86.c
··· 305 static struct kmem_cache *x86_emulator_cache; 306 307 /* 308 - * When called, it means the previous get/set msr reached an invalid msr. 309 - * Return true if we want to ignore/silent this failed msr access. 310 */ 311 - static bool kvm_msr_ignored_check(u32 msr, u64 data, bool write) 312 - { 313 - const char *op = write ? "wrmsr" : "rdmsr"; 314 315 - if (ignore_msrs) { 316 - if (report_ignored_msrs) 317 - kvm_pr_unimpl("ignored %s: 0x%x data 0x%llx\n", 318 - op, msr, data); 319 - /* Mask the error */ 320 return true; 321 - } else { 322 - kvm_debug_ratelimited("unhandled %s: 0x%x data 0x%llx\n", 323 - op, msr, data); 324 - return false; 325 } 326 } 327 328 static struct kmem_cache *kvm_alloc_emulator_cache(void) ··· 568 569 /* 570 * Disabling irqs at this point since the following code could be 571 - * interrupted and executed through kvm_arch_hardware_disable() 572 */ 573 local_irq_save(flags); 574 if (msrs->registered) { ··· 626 627 static void kvm_user_return_msr_cpu_online(void) 628 { 629 - unsigned int cpu = smp_processor_id(); 630 - struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu); 631 u64 value; 632 int i; 633 ··· 831 ex->error_code = error_code; 832 ex->has_payload = has_payload; 833 ex->payload = payload; 834 - } 835 - 836 - /* Forcibly leave the nested mode in cases like a vCPU reset */ 837 - static void kvm_leave_nested(struct kvm_vcpu *vcpu) 838 - { 839 - kvm_x86_ops.nested_ops->leave_nested(vcpu); 840 } 841 842 static void kvm_multiple_exception(struct kvm_vcpu *vcpu, ··· 1618 EXPORT_SYMBOL_GPL(kvm_emulate_rdpmc); 1619 1620 /* 1621 - * The three MSR lists(msrs_to_save, emulated_msrs, msr_based_features) track 1622 - * the set of MSRs that KVM exposes to userspace through KVM_GET_MSRS, 1623 - * KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. msrs_to_save holds MSRs that 1624 - * require host support, i.e. should be probed via RDMSR. emulated_msrs holds 1625 - * MSRs that KVM emulates without strictly requiring host support. 1626 - * msr_based_features holds MSRs that enumerate features, i.e. are effectively 1627 - * CPUID leafs. Note, msr_based_features isn't mutually exclusive with 1628 - * msrs_to_save and emulated_msrs. 1629 - */ 1630 - 1631 - static const u32 msrs_to_save_base[] = { 1632 - MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, 1633 - MSR_STAR, 1634 - #ifdef CONFIG_X86_64 1635 - MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, 1636 - #endif 1637 - MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, 1638 - MSR_IA32_FEAT_CTL, MSR_IA32_BNDCFGS, MSR_TSC_AUX, 1639 - MSR_IA32_SPEC_CTRL, MSR_IA32_TSX_CTRL, 1640 - MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH, 1641 - MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK, 1642 - MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B, 1643 - MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B, 1644 - MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B, 1645 - MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B, 1646 - MSR_IA32_UMWAIT_CONTROL, 1647 - 1648 - MSR_IA32_XFD, MSR_IA32_XFD_ERR, 1649 - }; 1650 - 1651 - static const u32 msrs_to_save_pmu[] = { 1652 - MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1, 1653 - MSR_ARCH_PERFMON_FIXED_CTR0 + 2, 1654 - MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS, 1655 - MSR_CORE_PERF_GLOBAL_CTRL, 1656 - MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, MSR_PEBS_DATA_CFG, 1657 - 1658 - /* This part of MSRs should match KVM_MAX_NR_INTEL_GP_COUNTERS. 
*/ 1659 - MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1, 1660 - MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3, 1661 - MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5, 1662 - MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7, 1663 - MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1, 1664 - MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3, 1665 - MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5, 1666 - MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7, 1667 - 1668 - MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3, 1669 - MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3, 1670 - 1671 - /* This part of MSRs should match KVM_MAX_NR_AMD_GP_COUNTERS. */ 1672 - MSR_F15H_PERF_CTL0, MSR_F15H_PERF_CTL1, MSR_F15H_PERF_CTL2, 1673 - MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5, 1674 - MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2, 1675 - MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5, 1676 - 1677 - MSR_AMD64_PERF_CNTR_GLOBAL_CTL, 1678 - MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, 1679 - MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, 1680 - }; 1681 - 1682 - static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_base) + 1683 - ARRAY_SIZE(msrs_to_save_pmu)]; 1684 - static unsigned num_msrs_to_save; 1685 - 1686 - static const u32 emulated_msrs_all[] = { 1687 - MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, 1688 - MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, 1689 - 1690 - #ifdef CONFIG_KVM_HYPERV 1691 - HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, 1692 - HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC, 1693 - HV_X64_MSR_TSC_FREQUENCY, HV_X64_MSR_APIC_FREQUENCY, 1694 - HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2, 1695 - HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL, 1696 - HV_X64_MSR_RESET, 1697 - HV_X64_MSR_VP_INDEX, 1698 - HV_X64_MSR_VP_RUNTIME, 1699 - HV_X64_MSR_SCONTROL, 1700 - HV_X64_MSR_STIMER0_CONFIG, 1701 - HV_X64_MSR_VP_ASSIST_PAGE, 1702 - HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL, 1703 - HV_X64_MSR_TSC_EMULATION_STATUS, HV_X64_MSR_TSC_INVARIANT_CONTROL, 1704 - HV_X64_MSR_SYNDBG_OPTIONS, 1705 - HV_X64_MSR_SYNDBG_CONTROL, HV_X64_MSR_SYNDBG_STATUS, 1706 - HV_X64_MSR_SYNDBG_SEND_BUFFER, HV_X64_MSR_SYNDBG_RECV_BUFFER, 1707 - HV_X64_MSR_SYNDBG_PENDING_BUFFER, 1708 - #endif 1709 - 1710 - MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, 1711 - MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK, 1712 - 1713 - MSR_IA32_TSC_ADJUST, 1714 - MSR_IA32_TSC_DEADLINE, 1715 - MSR_IA32_ARCH_CAPABILITIES, 1716 - MSR_IA32_PERF_CAPABILITIES, 1717 - MSR_IA32_MISC_ENABLE, 1718 - MSR_IA32_MCG_STATUS, 1719 - MSR_IA32_MCG_CTL, 1720 - MSR_IA32_MCG_EXT_CTL, 1721 - MSR_IA32_SMBASE, 1722 - MSR_SMI_COUNT, 1723 - MSR_PLATFORM_INFO, 1724 - MSR_MISC_FEATURES_ENABLES, 1725 - MSR_AMD64_VIRT_SPEC_CTRL, 1726 - MSR_AMD64_TSC_RATIO, 1727 - MSR_IA32_POWER_CTL, 1728 - MSR_IA32_UCODE_REV, 1729 - 1730 - /* 1731 - * KVM always supports the "true" VMX control MSRs, even if the host 1732 - * does not. The VMX MSRs as a whole are considered "emulated" as KVM 1733 - * doesn't strictly require them to exist in the host (ignoring that 1734 - * KVM would refuse to load in the first place if the core set of MSRs 1735 - * aren't supported). 
1736 - */ 1737 - MSR_IA32_VMX_BASIC, 1738 - MSR_IA32_VMX_TRUE_PINBASED_CTLS, 1739 - MSR_IA32_VMX_TRUE_PROCBASED_CTLS, 1740 - MSR_IA32_VMX_TRUE_EXIT_CTLS, 1741 - MSR_IA32_VMX_TRUE_ENTRY_CTLS, 1742 - MSR_IA32_VMX_MISC, 1743 - MSR_IA32_VMX_CR0_FIXED0, 1744 - MSR_IA32_VMX_CR4_FIXED0, 1745 - MSR_IA32_VMX_VMCS_ENUM, 1746 - MSR_IA32_VMX_PROCBASED_CTLS2, 1747 - MSR_IA32_VMX_EPT_VPID_CAP, 1748 - MSR_IA32_VMX_VMFUNC, 1749 - 1750 - MSR_K7_HWCR, 1751 - MSR_KVM_POLL_CONTROL, 1752 - }; 1753 - 1754 - static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)]; 1755 - static unsigned num_emulated_msrs; 1756 - 1757 - /* 1758 - * List of MSRs that control the existence of MSR-based features, i.e. MSRs 1759 - * that are effectively CPUID leafs. VMX MSRs are also included in the set of 1760 - * feature MSRs, but are handled separately to allow expedited lookups. 1761 - */ 1762 - static const u32 msr_based_features_all_except_vmx[] = { 1763 - MSR_AMD64_DE_CFG, 1764 - MSR_IA32_UCODE_REV, 1765 - MSR_IA32_ARCH_CAPABILITIES, 1766 - MSR_IA32_PERF_CAPABILITIES, 1767 - }; 1768 - 1769 - static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all_except_vmx) + 1770 - (KVM_LAST_EMULATED_VMX_MSR - KVM_FIRST_EMULATED_VMX_MSR + 1)]; 1771 - static unsigned int num_msr_based_features; 1772 - 1773 - /* 1774 - * All feature MSRs except uCode revID, which tracks the currently loaded uCode 1775 - * patch, are immutable once the vCPU model is defined. 1776 - */ 1777 - static bool kvm_is_immutable_feature_msr(u32 msr) 1778 - { 1779 - int i; 1780 - 1781 - if (msr >= KVM_FIRST_EMULATED_VMX_MSR && msr <= KVM_LAST_EMULATED_VMX_MSR) 1782 - return true; 1783 - 1784 - for (i = 0; i < ARRAY_SIZE(msr_based_features_all_except_vmx); i++) { 1785 - if (msr == msr_based_features_all_except_vmx[i]) 1786 - return msr != MSR_IA32_UCODE_REV; 1787 - } 1788 - 1789 - return false; 1790 - } 1791 - 1792 - /* 1793 * Some IA32_ARCH_CAPABILITIES bits have dependencies on MSRs that KVM 1794 * does not yet virtualize. 
These include: 1795 * 10 - MISC_PACKAGE_CTRLS ··· 1694 return data; 1695 } 1696 1697 - static int kvm_get_msr_feature(struct kvm_msr_entry *msr) 1698 { 1699 - switch (msr->index) { 1700 case MSR_IA32_ARCH_CAPABILITIES: 1701 - msr->data = kvm_get_arch_capabilities(); 1702 break; 1703 case MSR_IA32_PERF_CAPABILITIES: 1704 - msr->data = kvm_caps.supported_perf_cap; 1705 break; 1706 case MSR_IA32_UCODE_REV: 1707 - rdmsrl_safe(msr->index, &msr->data); 1708 break; 1709 default: 1710 - return kvm_x86_call(get_msr_feature)(msr); 1711 } 1712 return 0; 1713 } 1714 1715 - static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data) 1716 { 1717 - struct kvm_msr_entry msr; 1718 - int r; 1719 - 1720 - /* Unconditionally clear the output for simplicity */ 1721 - msr.data = 0; 1722 - msr.index = index; 1723 - r = kvm_get_msr_feature(&msr); 1724 - 1725 - if (r == KVM_MSR_RET_INVALID && kvm_msr_ignored_check(index, 0, false)) 1726 - r = 0; 1727 - 1728 - *data = msr.data; 1729 - 1730 - return r; 1731 } 1732 1733 static bool __kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer) ··· 1905 return kvm_x86_call(set_msr)(vcpu, &msr); 1906 } 1907 1908 static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu, 1909 u32 index, u64 data, bool host_initiated) 1910 { 1911 - int ret = __kvm_set_msr(vcpu, index, data, host_initiated); 1912 - 1913 - if (ret == KVM_MSR_RET_INVALID) 1914 - if (kvm_msr_ignored_check(index, data, true)) 1915 - ret = 0; 1916 - 1917 - return ret; 1918 } 1919 1920 /* ··· 1954 static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu, 1955 u32 index, u64 *data, bool host_initiated) 1956 { 1957 - int ret = __kvm_get_msr(vcpu, index, data, host_initiated); 1958 - 1959 - if (ret == KVM_MSR_RET_INVALID) { 1960 - /* Unconditionally clear *data for simplicity */ 1961 - *data = 0; 1962 - if (kvm_msr_ignored_check(index, 0, false)) 1963 - ret = 0; 1964 - } 1965 - 1966 - return ret; 1967 } 1968 1969 - static int kvm_get_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 *data) 1970 { 1971 if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ)) 1972 return KVM_MSR_RET_FILTERED; 1973 return kvm_get_msr_ignored_check(vcpu, index, data, false); 1974 } 1975 1976 - static int kvm_set_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 data) 1977 { 1978 if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE)) 1979 return KVM_MSR_RET_FILTERED; 1980 return kvm_set_msr_ignored_check(vcpu, index, data, false); 1981 } 1982 1983 int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data) 1984 { ··· 2019 static u64 kvm_msr_reason(int r) 2020 { 2021 switch (r) { 2022 - case KVM_MSR_RET_INVALID: 2023 return KVM_MSR_EXIT_REASON_UNKNOWN; 2024 case KVM_MSR_RET_FILTERED: 2025 return KVM_MSR_EXIT_REASON_FILTER; ··· 2182 { 2183 u32 msr = kvm_rcx_read(vcpu); 2184 u64 data; 2185 - fastpath_t ret = EXIT_FASTPATH_NONE; 2186 2187 kvm_vcpu_srcu_read_lock(vcpu); 2188 2189 switch (msr) { 2190 case APIC_BASE_MSR + (APIC_ICR >> 4): 2191 data = kvm_read_edx_eax(vcpu); 2192 - if (!handle_fastpath_set_x2apic_icr_irqoff(vcpu, data)) { 2193 - kvm_skip_emulated_instruction(vcpu); 2194 - ret = EXIT_FASTPATH_EXIT_HANDLED; 2195 - } 2196 break; 2197 case MSR_IA32_TSC_DEADLINE: 2198 data = kvm_read_edx_eax(vcpu); 2199 - if (!handle_fastpath_set_tscdeadline(vcpu, data)) { 2200 - kvm_skip_emulated_instruction(vcpu); 2201 - ret = EXIT_FASTPATH_REENTER_GUEST; 2202 - } 2203 break; 2204 default: 2205 break; 2206 } 2207 2208 - if (ret != EXIT_FASTPATH_NONE) 2209 trace_kvm_msr_write(msr, data); 2210 2211 
kvm_vcpu_srcu_read_unlock(vcpu); 2212 ··· 3769 mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa)); 3770 } 3771 3772 - static bool kvm_is_msr_to_save(u32 msr_index) 3773 - { 3774 - unsigned int i; 3775 - 3776 - for (i = 0; i < num_msrs_to_save; i++) { 3777 - if (msrs_to_save[i] == msr_index) 3778 - return true; 3779 - } 3780 - 3781 - return false; 3782 - } 3783 - 3784 int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) 3785 { 3786 u32 msr = msr_info->index; ··· 4150 if (kvm_pmu_is_valid_msr(vcpu, msr)) 4151 return kvm_pmu_set_msr(vcpu, msr_info); 4152 4153 - /* 4154 - * Userspace is allowed to write '0' to MSRs that KVM reports 4155 - * as to-be-saved, even if an MSRs isn't fully supported. 4156 - */ 4157 - if (msr_info->host_initiated && !data && 4158 - kvm_is_msr_to_save(msr)) 4159 - break; 4160 - 4161 - return KVM_MSR_RET_INVALID; 4162 } 4163 return 0; 4164 } ··· 4501 if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) 4502 return kvm_pmu_get_msr(vcpu, msr_info); 4503 4504 - /* 4505 - * Userspace is allowed to read MSRs that KVM reports as 4506 - * to-be-saved, even if an MSR isn't fully supported. 4507 - */ 4508 - if (msr_info->host_initiated && 4509 - kvm_is_msr_to_save(msr_info->index)) { 4510 - msr_info->data = 0; 4511 - break; 4512 - } 4513 - 4514 - return KVM_MSR_RET_INVALID; 4515 } 4516 return 0; 4517 } ··· 4939 break; 4940 } 4941 case KVM_GET_MSRS: 4942 - r = msr_io(NULL, argp, do_get_msr_feature, 1); 4943 break; 4944 #ifdef CONFIG_KVM_HYPERV 4945 case KVM_GET_SUPPORTED_HV_CPUID: ··· 7376 7377 static void kvm_probe_feature_msr(u32 msr_index) 7378 { 7379 - struct kvm_msr_entry msr = { 7380 - .index = msr_index, 7381 - }; 7382 7383 - if (kvm_get_msr_feature(&msr)) 7384 return; 7385 7386 msr_based_features[num_msr_based_features++] = msr_index; ··· 8856 return 1; 8857 } 8858 8859 - static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, 8860 - int emulation_type) 8861 { 8862 - gpa_t gpa = cr2_or_gpa; 8863 - kvm_pfn_t pfn; 8864 - 8865 if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF)) 8866 return false; 8867 - 8868 - if (WARN_ON_ONCE(is_guest_mode(vcpu)) || 8869 - WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF))) 8870 - return false; 8871 - 8872 - if (!vcpu->arch.mmu->root_role.direct) { 8873 - /* 8874 - * Write permission should be allowed since only 8875 - * write access need to be emulated. 8876 - */ 8877 - gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL); 8878 - 8879 - /* 8880 - * If the mapping is invalid in guest, let cpu retry 8881 - * it to generate fault. 8882 - */ 8883 - if (gpa == INVALID_GPA) 8884 - return true; 8885 - } 8886 - 8887 - /* 8888 - * Do not retry the unhandleable instruction if it faults on the 8889 - * readonly host memory, otherwise it will goto a infinite loop: 8890 - * retry instruction -> write #PF -> emulation fail -> retry 8891 - * instruction -> ... 8892 - */ 8893 - pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa)); 8894 - 8895 - /* 8896 - * If the instruction failed on the error pfn, it can not be fixed, 8897 - * report the error to userspace. 8898 - */ 8899 - if (is_error_noslot_pfn(pfn)) 8900 - return false; 8901 - 8902 - kvm_release_pfn_clean(pfn); 8903 - 8904 - /* 8905 - * If emulation may have been triggered by a write to a shadowed page 8906 - * table, unprotect the gfn (zap any relevant SPTEs) and re-enter the 8907 - * guest to let the CPU re-execute the instruction in the hope that the 8908 - * CPU can cleanly execute the instruction that KVM failed to emulate. 
8909 - */ 8910 - if (vcpu->kvm->arch.indirect_shadow_pages) 8911 - kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)); 8912 8913 /* 8914 * If the failed instruction faulted on an access to page tables that ··· 8873 * then zap the SPTE to unprotect the gfn, and then do it all over 8874 * again. Report the error to userspace. 8875 */ 8876 - return !(emulation_type & EMULTYPE_WRITE_PF_TO_SP); 8877 - } 8878 - 8879 - static bool retry_instruction(struct x86_emulate_ctxt *ctxt, 8880 - gpa_t cr2_or_gpa, int emulation_type) 8881 - { 8882 - struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); 8883 - unsigned long last_retry_eip, last_retry_addr, gpa = cr2_or_gpa; 8884 - 8885 - last_retry_eip = vcpu->arch.last_retry_eip; 8886 - last_retry_addr = vcpu->arch.last_retry_addr; 8887 8888 /* 8889 - * If the emulation is caused by #PF and it is non-page_table 8890 - * writing instruction, it means the VM-EXIT is caused by shadow 8891 - * page protected, we can zap the shadow page and retry this 8892 - * instruction directly. 8893 - * 8894 - * Note: if the guest uses a non-page-table modifying instruction 8895 - * on the PDE that points to the instruction, then we will unmap 8896 - * the instruction and go to an infinite loop. So, we cache the 8897 - * last retried eip and the last fault address, if we meet the eip 8898 - * and the address again, we can break out of the potential infinite 8899 - * loop. 8900 */ 8901 - vcpu->arch.last_retry_eip = vcpu->arch.last_retry_addr = 0; 8902 8903 - if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF)) 8904 - return false; 8905 - 8906 - if (WARN_ON_ONCE(is_guest_mode(vcpu)) || 8907 - WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF))) 8908 - return false; 8909 - 8910 - if (x86_page_table_writing_insn(ctxt)) 8911 - return false; 8912 - 8913 - if (ctxt->eip == last_retry_eip && last_retry_addr == cr2_or_gpa) 8914 - return false; 8915 - 8916 - vcpu->arch.last_retry_eip = ctxt->eip; 8917 - vcpu->arch.last_retry_addr = cr2_or_gpa; 8918 - 8919 - if (!vcpu->arch.mmu->root_role.direct) 8920 - gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL); 8921 - 8922 - kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)); 8923 - 8924 return true; 8925 } 8926 ··· 9090 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; 9091 bool writeback = true; 9092 9093 r = kvm_check_emulate_insn(vcpu, emulation_type, insn, insn_len); 9094 if (r != X86EMUL_CONTINUE) { 9095 if (r == X86EMUL_RETRY_INSTR || r == X86EMUL_PROPAGATE_FAULT) ··· 9125 kvm_queue_exception(vcpu, UD_VECTOR); 9126 return 1; 9127 } 9128 - if (reexecute_instruction(vcpu, cr2_or_gpa, 9129 - emulation_type)) 9130 return 1; 9131 9132 if (ctxt->have_exception && ··· 9173 return 1; 9174 } 9175 9176 - if (retry_instruction(ctxt, cr2_or_gpa, emulation_type)) 9177 return 1; 9178 9179 /* this is needed for vmware backdoor interface to work since it ··· 9212 return 1; 9213 9214 if (r == EMULATION_FAILED) { 9215 - if (reexecute_instruction(vcpu, cr2_or_gpa, emulation_type)) 9216 return 1; 9217 9218 return handle_emulation_failure(vcpu, emulation_type); ··· 9681 9682 guard(mutex)(&vendor_module_lock); 9683 9684 - if (kvm_x86_ops.hardware_enable) { 9685 pr_err("already loaded vendor module '%s'\n", kvm_x86_ops.name); 9686 return -EEXIST; 9687 } ··· 9808 return 0; 9809 9810 out_unwind_ops: 9811 - kvm_x86_ops.hardware_enable = NULL; 9812 kvm_x86_call(hardware_unsetup)(); 9813 out_mmu_exit: 9814 kvm_mmu_vendor_module_exit(); ··· 9849 WARN_ON(static_branch_unlikely(&kvm_xen_enabled.key)); 9850 #endif 9851 mutex_lock(&vendor_module_lock); 9852 - 
kvm_x86_ops.hardware_enable = NULL; 9853 mutex_unlock(&vendor_module_lock); 9854 } 9855 EXPORT_SYMBOL_GPL(kvm_x86_vendor_exit); 9856 - 9857 - static int __kvm_emulate_halt(struct kvm_vcpu *vcpu, int state, int reason) 9858 - { 9859 - /* 9860 - * The vCPU has halted, e.g. executed HLT. Update the run state if the 9861 - * local APIC is in-kernel, the run loop will detect the non-runnable 9862 - * state and halt the vCPU. Exit to userspace if the local APIC is 9863 - * managed by userspace, in which case userspace is responsible for 9864 - * handling wake events. 9865 - */ 9866 - ++vcpu->stat.halt_exits; 9867 - if (lapic_in_kernel(vcpu)) { 9868 - vcpu->arch.mp_state = state; 9869 - return 1; 9870 - } else { 9871 - vcpu->run->exit_reason = reason; 9872 - return 0; 9873 - } 9874 - } 9875 - 9876 - int kvm_emulate_halt_noskip(struct kvm_vcpu *vcpu) 9877 - { 9878 - return __kvm_emulate_halt(vcpu, KVM_MP_STATE_HALTED, KVM_EXIT_HLT); 9879 - } 9880 - EXPORT_SYMBOL_GPL(kvm_emulate_halt_noskip); 9881 - 9882 - int kvm_emulate_halt(struct kvm_vcpu *vcpu) 9883 - { 9884 - int ret = kvm_skip_emulated_instruction(vcpu); 9885 - /* 9886 - * TODO: we might be squashing a GUESTDBG_SINGLESTEP-triggered 9887 - * KVM_EXIT_DEBUG here. 9888 - */ 9889 - return kvm_emulate_halt_noskip(vcpu) && ret; 9890 - } 9891 - EXPORT_SYMBOL_GPL(kvm_emulate_halt); 9892 - 9893 - int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu) 9894 - { 9895 - int ret = kvm_skip_emulated_instruction(vcpu); 9896 - 9897 - return __kvm_emulate_halt(vcpu, KVM_MP_STATE_AP_RESET_HOLD, 9898 - KVM_EXIT_AP_RESET_HOLD) && ret; 9899 - } 9900 - EXPORT_SYMBOL_GPL(kvm_emulate_ap_reset_hold); 9901 9902 #ifdef CONFIG_X86_64 9903 static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr, ··· 11090 if (vcpu->arch.apic_attention) 11091 kvm_lapic_sync_from_vapic(vcpu); 11092 11093 r = kvm_x86_call(handle_exit)(vcpu, exit_fastpath); 11094 return r; 11095 ··· 11104 kvm_lapic_sync_from_vapic(vcpu); 11105 out: 11106 return r; 11107 } 11108 11109 /* Called within kvm->srcu read side. */ ··· 11238 return 1; 11239 } 11240 11241 - static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu) 11242 - { 11243 - return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && 11244 - !vcpu->arch.apf.halted); 11245 - } 11246 - 11247 /* Called within kvm->srcu read side. 
*/ 11248 static int vcpu_run(struct kvm_vcpu *vcpu) 11249 { ··· 11287 } 11288 11289 return r; 11290 } 11291 11292 static inline int complete_emulated_io(struct kvm_vcpu *vcpu) ··· 12297 vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); 12298 vcpu->arch.reserved_gpa_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu); 12299 12300 - vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT; 12301 - 12302 kvm_async_pf_hash_reset(vcpu); 12303 12304 vcpu->arch.perf_capabilities = kvm_caps.supported_perf_cap; ··· 12462 if (!init_event) { 12463 vcpu->arch.smbase = 0x30000; 12464 12465 vcpu->arch.msr_misc_features_enables = 0; 12466 vcpu->arch.ia32_misc_enable_msr = MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL | 12467 MSR_IA32_MISC_ENABLE_BTS_UNAVAIL; ··· 12549 } 12550 EXPORT_SYMBOL_GPL(kvm_vcpu_deliver_sipi_vector); 12551 12552 - int kvm_arch_hardware_enable(void) 12553 { 12554 struct kvm *kvm; 12555 struct kvm_vcpu *vcpu; ··· 12575 if (ret) 12576 return ret; 12577 12578 - ret = kvm_x86_call(hardware_enable)(); 12579 if (ret != 0) 12580 return ret; 12581 ··· 12655 return 0; 12656 } 12657 12658 - void kvm_arch_hardware_disable(void) 12659 { 12660 - kvm_x86_call(hardware_disable)(); 12661 drop_user_return_notifiers(); 12662 } 12663 ··· 13203 /* Free the arrays associated with the old memslot. */ 13204 if (change == KVM_MR_MOVE) 13205 kvm_arch_free_memslot(kvm, old); 13206 - } 13207 - 13208 - static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) 13209 - { 13210 - if (!list_empty_careful(&vcpu->async_pf.done)) 13211 - return true; 13212 - 13213 - if (kvm_apic_has_pending_init_or_sipi(vcpu) && 13214 - kvm_apic_init_sipi_allowed(vcpu)) 13215 - return true; 13216 - 13217 - if (vcpu->arch.pv.pv_unhalted) 13218 - return true; 13219 - 13220 - if (kvm_is_exception_pending(vcpu)) 13221 - return true; 13222 - 13223 - if (kvm_test_request(KVM_REQ_NMI, vcpu) || 13224 - (vcpu->arch.nmi_pending && 13225 - kvm_x86_call(nmi_allowed)(vcpu, false))) 13226 - return true; 13227 - 13228 - #ifdef CONFIG_KVM_SMM 13229 - if (kvm_test_request(KVM_REQ_SMI, vcpu) || 13230 - (vcpu->arch.smi_pending && 13231 - kvm_x86_call(smi_allowed)(vcpu, false))) 13232 - return true; 13233 - #endif 13234 - 13235 - if (kvm_test_request(KVM_REQ_PMI, vcpu)) 13236 - return true; 13237 - 13238 - if (kvm_test_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, vcpu)) 13239 - return true; 13240 - 13241 - if (kvm_arch_interrupt_allowed(vcpu) && kvm_cpu_has_interrupt(vcpu)) 13242 - return true; 13243 - 13244 - if (kvm_hv_has_stimer_pending(vcpu)) 13245 - return true; 13246 - 13247 - if (is_guest_mode(vcpu) && 13248 - kvm_x86_ops.nested_ops->has_events && 13249 - kvm_x86_ops.nested_ops->has_events(vcpu, false)) 13250 - return true; 13251 - 13252 - if (kvm_xen_has_pending_events(vcpu)) 13253 - return true; 13254 - 13255 - return false; 13256 - } 13257 - 13258 - int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) 13259 - { 13260 - return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu); 13261 - } 13262 - 13263 - bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu) 13264 - { 13265 - return kvm_vcpu_apicv_active(vcpu) && 13266 - kvm_x86_call(dy_apicv_has_pending_interrupt)(vcpu); 13267 - } 13268 - 13269 - bool kvm_arch_vcpu_preempted_in_kernel(struct kvm_vcpu *vcpu) 13270 - { 13271 - return vcpu->arch.preempted_in_kernel; 13272 - } 13273 - 13274 - bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu) 13275 - { 13276 - if (READ_ONCE(vcpu->arch.pv.pv_unhalted)) 13277 - return true; 13278 - 13279 - if (kvm_test_request(KVM_REQ_NMI, vcpu) || 13280 - #ifdef CONFIG_KVM_SMM 13281 - 
kvm_test_request(KVM_REQ_SMI, vcpu) || 13282 - #endif 13283 - kvm_test_request(KVM_REQ_EVENT, vcpu)) 13284 - return true; 13285 - 13286 - return kvm_arch_dy_has_pending_interrupt(vcpu); 13287 } 13288 13289 bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
··· 305 static struct kmem_cache *x86_emulator_cache; 306 307 /* 308 + * The three MSR lists(msrs_to_save, emulated_msrs, msr_based_features) track 309 + * the set of MSRs that KVM exposes to userspace through KVM_GET_MSRS, 310 + * KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. msrs_to_save holds MSRs that 311 + * require host support, i.e. should be probed via RDMSR. emulated_msrs holds 312 + * MSRs that KVM emulates without strictly requiring host support. 313 + * msr_based_features holds MSRs that enumerate features, i.e. are effectively 314 + * CPUID leafs. Note, msr_based_features isn't mutually exclusive with 315 + * msrs_to_save and emulated_msrs. 316 */ 317 318 + static const u32 msrs_to_save_base[] = { 319 + MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, 320 + MSR_STAR, 321 + #ifdef CONFIG_X86_64 322 + MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, 323 + #endif 324 + MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, 325 + MSR_IA32_FEAT_CTL, MSR_IA32_BNDCFGS, MSR_TSC_AUX, 326 + MSR_IA32_SPEC_CTRL, MSR_IA32_TSX_CTRL, 327 + MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH, 328 + MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK, 329 + MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B, 330 + MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B, 331 + MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B, 332 + MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B, 333 + MSR_IA32_UMWAIT_CONTROL, 334 + 335 + MSR_IA32_XFD, MSR_IA32_XFD_ERR, 336 + }; 337 + 338 + static const u32 msrs_to_save_pmu[] = { 339 + MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1, 340 + MSR_ARCH_PERFMON_FIXED_CTR0 + 2, 341 + MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS, 342 + MSR_CORE_PERF_GLOBAL_CTRL, 343 + MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, MSR_PEBS_DATA_CFG, 344 + 345 + /* This part of MSRs should match KVM_MAX_NR_INTEL_GP_COUNTERS. */ 346 + MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1, 347 + MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3, 348 + MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5, 349 + MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7, 350 + MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1, 351 + MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3, 352 + MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5, 353 + MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7, 354 + 355 + MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3, 356 + MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3, 357 + 358 + /* This part of MSRs should match KVM_MAX_NR_AMD_GP_COUNTERS. 
*/ 359 + MSR_F15H_PERF_CTL0, MSR_F15H_PERF_CTL1, MSR_F15H_PERF_CTL2, 360 + MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5, 361 + MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2, 362 + MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5, 363 + 364 + MSR_AMD64_PERF_CNTR_GLOBAL_CTL, 365 + MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, 366 + MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, 367 + }; 368 + 369 + static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_base) + 370 + ARRAY_SIZE(msrs_to_save_pmu)]; 371 + static unsigned num_msrs_to_save; 372 + 373 + static const u32 emulated_msrs_all[] = { 374 + MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, 375 + MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, 376 + 377 + #ifdef CONFIG_KVM_HYPERV 378 + HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, 379 + HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC, 380 + HV_X64_MSR_TSC_FREQUENCY, HV_X64_MSR_APIC_FREQUENCY, 381 + HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2, 382 + HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL, 383 + HV_X64_MSR_RESET, 384 + HV_X64_MSR_VP_INDEX, 385 + HV_X64_MSR_VP_RUNTIME, 386 + HV_X64_MSR_SCONTROL, 387 + HV_X64_MSR_STIMER0_CONFIG, 388 + HV_X64_MSR_VP_ASSIST_PAGE, 389 + HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL, 390 + HV_X64_MSR_TSC_EMULATION_STATUS, HV_X64_MSR_TSC_INVARIANT_CONTROL, 391 + HV_X64_MSR_SYNDBG_OPTIONS, 392 + HV_X64_MSR_SYNDBG_CONTROL, HV_X64_MSR_SYNDBG_STATUS, 393 + HV_X64_MSR_SYNDBG_SEND_BUFFER, HV_X64_MSR_SYNDBG_RECV_BUFFER, 394 + HV_X64_MSR_SYNDBG_PENDING_BUFFER, 395 + #endif 396 + 397 + MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, 398 + MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK, 399 + 400 + MSR_IA32_TSC_ADJUST, 401 + MSR_IA32_TSC_DEADLINE, 402 + MSR_IA32_ARCH_CAPABILITIES, 403 + MSR_IA32_PERF_CAPABILITIES, 404 + MSR_IA32_MISC_ENABLE, 405 + MSR_IA32_MCG_STATUS, 406 + MSR_IA32_MCG_CTL, 407 + MSR_IA32_MCG_EXT_CTL, 408 + MSR_IA32_SMBASE, 409 + MSR_SMI_COUNT, 410 + MSR_PLATFORM_INFO, 411 + MSR_MISC_FEATURES_ENABLES, 412 + MSR_AMD64_VIRT_SPEC_CTRL, 413 + MSR_AMD64_TSC_RATIO, 414 + MSR_IA32_POWER_CTL, 415 + MSR_IA32_UCODE_REV, 416 + 417 + /* 418 + * KVM always supports the "true" VMX control MSRs, even if the host 419 + * does not. The VMX MSRs as a whole are considered "emulated" as KVM 420 + * doesn't strictly require them to exist in the host (ignoring that 421 + * KVM would refuse to load in the first place if the core set of MSRs 422 + * aren't supported). 423 + */ 424 + MSR_IA32_VMX_BASIC, 425 + MSR_IA32_VMX_TRUE_PINBASED_CTLS, 426 + MSR_IA32_VMX_TRUE_PROCBASED_CTLS, 427 + MSR_IA32_VMX_TRUE_EXIT_CTLS, 428 + MSR_IA32_VMX_TRUE_ENTRY_CTLS, 429 + MSR_IA32_VMX_MISC, 430 + MSR_IA32_VMX_CR0_FIXED0, 431 + MSR_IA32_VMX_CR4_FIXED0, 432 + MSR_IA32_VMX_VMCS_ENUM, 433 + MSR_IA32_VMX_PROCBASED_CTLS2, 434 + MSR_IA32_VMX_EPT_VPID_CAP, 435 + MSR_IA32_VMX_VMFUNC, 436 + 437 + MSR_K7_HWCR, 438 + MSR_KVM_POLL_CONTROL, 439 + }; 440 + 441 + static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)]; 442 + static unsigned num_emulated_msrs; 443 + 444 + /* 445 + * List of MSRs that control the existence of MSR-based features, i.e. MSRs 446 + * that are effectively CPUID leafs. VMX MSRs are also included in the set of 447 + * feature MSRs, but are handled separately to allow expedited lookups. 
448 + */ 449 + static const u32 msr_based_features_all_except_vmx[] = { 450 + MSR_AMD64_DE_CFG, 451 + MSR_IA32_UCODE_REV, 452 + MSR_IA32_ARCH_CAPABILITIES, 453 + MSR_IA32_PERF_CAPABILITIES, 454 + }; 455 + 456 + static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all_except_vmx) + 457 + (KVM_LAST_EMULATED_VMX_MSR - KVM_FIRST_EMULATED_VMX_MSR + 1)]; 458 + static unsigned int num_msr_based_features; 459 + 460 + /* 461 + * All feature MSRs except uCode revID, which tracks the currently loaded uCode 462 + * patch, are immutable once the vCPU model is defined. 463 + */ 464 + static bool kvm_is_immutable_feature_msr(u32 msr) 465 + { 466 + int i; 467 + 468 + if (msr >= KVM_FIRST_EMULATED_VMX_MSR && msr <= KVM_LAST_EMULATED_VMX_MSR) 469 return true; 470 + 471 + for (i = 0; i < ARRAY_SIZE(msr_based_features_all_except_vmx); i++) { 472 + if (msr == msr_based_features_all_except_vmx[i]) 473 + return msr != MSR_IA32_UCODE_REV; 474 } 475 + 476 + return false; 477 + } 478 + 479 + static bool kvm_is_advertised_msr(u32 msr_index) 480 + { 481 + unsigned int i; 482 + 483 + for (i = 0; i < num_msrs_to_save; i++) { 484 + if (msrs_to_save[i] == msr_index) 485 + return true; 486 + } 487 + 488 + for (i = 0; i < num_emulated_msrs; i++) { 489 + if (emulated_msrs[i] == msr_index) 490 + return true; 491 + } 492 + 493 + return false; 494 + } 495 + 496 + typedef int (*msr_access_t)(struct kvm_vcpu *vcpu, u32 index, u64 *data, 497 + bool host_initiated); 498 + 499 + static __always_inline int kvm_do_msr_access(struct kvm_vcpu *vcpu, u32 msr, 500 + u64 *data, bool host_initiated, 501 + enum kvm_msr_access rw, 502 + msr_access_t msr_access_fn) 503 + { 504 + const char *op = rw == MSR_TYPE_W ? "wrmsr" : "rdmsr"; 505 + int ret; 506 + 507 + BUILD_BUG_ON(rw != MSR_TYPE_R && rw != MSR_TYPE_W); 508 + 509 + /* 510 + * Zero the data on read failures to avoid leaking stack data to the 511 + * guest and/or userspace, e.g. if the failure is ignored below. 512 + */ 513 + ret = msr_access_fn(vcpu, msr, data, host_initiated); 514 + if (ret && rw == MSR_TYPE_R) 515 + *data = 0; 516 + 517 + if (ret != KVM_MSR_RET_UNSUPPORTED) 518 + return ret; 519 + 520 + /* 521 + * Userspace is allowed to read MSRs, and write '0' to MSRs, that KVM 522 + * advertises to userspace, even if an MSR isn't fully supported. 523 + * Simply check that @data is '0', which covers both the write '0' case 524 + * and all reads (in which case @data is zeroed on failure; see above). 
525 + */ 526 + if (host_initiated && !*data && kvm_is_advertised_msr(msr)) 527 + return 0; 528 + 529 + if (!ignore_msrs) { 530 + kvm_debug_ratelimited("unhandled %s: 0x%x data 0x%llx\n", 531 + op, msr, *data); 532 + return ret; 533 + } 534 + 535 + if (report_ignored_msrs) 536 + kvm_pr_unimpl("ignored %s: 0x%x data 0x%llx\n", op, msr, *data); 537 + 538 + return 0; 539 } 540 541 static struct kmem_cache *kvm_alloc_emulator_cache(void) ··· 355 356 /* 357 * Disabling irqs at this point since the following code could be 358 + * interrupted and executed through kvm_arch_disable_virtualization_cpu() 359 */ 360 local_irq_save(flags); 361 if (msrs->registered) { ··· 413 414 static void kvm_user_return_msr_cpu_online(void) 415 { 416 + struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs); 417 u64 value; 418 int i; 419 ··· 619 ex->error_code = error_code; 620 ex->has_payload = has_payload; 621 ex->payload = payload; 622 } 623 624 static void kvm_multiple_exception(struct kvm_vcpu *vcpu, ··· 1412 EXPORT_SYMBOL_GPL(kvm_emulate_rdpmc); 1413 1414 /* 1415 * Some IA32_ARCH_CAPABILITIES bits have dependencies on MSRs that KVM 1416 * does not yet virtualize. These include: 1417 * 10 - MISC_PACKAGE_CTRLS ··· 1660 return data; 1661 } 1662 1663 + static int kvm_get_feature_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, 1664 + bool host_initiated) 1665 { 1666 + WARN_ON_ONCE(!host_initiated); 1667 + 1668 + switch (index) { 1669 case MSR_IA32_ARCH_CAPABILITIES: 1670 + *data = kvm_get_arch_capabilities(); 1671 break; 1672 case MSR_IA32_PERF_CAPABILITIES: 1673 + *data = kvm_caps.supported_perf_cap; 1674 break; 1675 case MSR_IA32_UCODE_REV: 1676 + rdmsrl_safe(index, data); 1677 break; 1678 default: 1679 + return kvm_x86_call(get_feature_msr)(index, data); 1680 } 1681 return 0; 1682 } 1683 1684 + static int do_get_feature_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) 1685 { 1686 + return kvm_do_msr_access(vcpu, index, data, true, MSR_TYPE_R, 1687 + kvm_get_feature_msr); 1688 } 1689 1690 static bool __kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer) ··· 1880 return kvm_x86_call(set_msr)(vcpu, &msr); 1881 } 1882 1883 + static int _kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, 1884 + bool host_initiated) 1885 + { 1886 + return __kvm_set_msr(vcpu, index, *data, host_initiated); 1887 + } 1888 + 1889 static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu, 1890 u32 index, u64 data, bool host_initiated) 1891 { 1892 + return kvm_do_msr_access(vcpu, index, &data, host_initiated, MSR_TYPE_W, 1893 + _kvm_set_msr); 1894 } 1895 1896 /* ··· 1928 static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu, 1929 u32 index, u64 *data, bool host_initiated) 1930 { 1931 + return kvm_do_msr_access(vcpu, index, data, host_initiated, MSR_TYPE_R, 1932 + __kvm_get_msr); 1933 } 1934 1935 + int kvm_get_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 *data) 1936 { 1937 if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ)) 1938 return KVM_MSR_RET_FILTERED; 1939 return kvm_get_msr_ignored_check(vcpu, index, data, false); 1940 } 1941 + EXPORT_SYMBOL_GPL(kvm_get_msr_with_filter); 1942 1943 + int kvm_set_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 data) 1944 { 1945 if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE)) 1946 return KVM_MSR_RET_FILTERED; 1947 return kvm_set_msr_ignored_check(vcpu, index, data, false); 1948 } 1949 + EXPORT_SYMBOL_GPL(kvm_set_msr_with_filter); 1950 1951 int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data) 1952 { ··· 1999 static u64 kvm_msr_reason(int r) 2000 
{ 2001 switch (r) { 2002 + case KVM_MSR_RET_UNSUPPORTED: 2003 return KVM_MSR_EXIT_REASON_UNKNOWN; 2004 case KVM_MSR_RET_FILTERED: 2005 return KVM_MSR_EXIT_REASON_FILTER; ··· 2162 { 2163 u32 msr = kvm_rcx_read(vcpu); 2164 u64 data; 2165 + fastpath_t ret; 2166 + bool handled; 2167 2168 kvm_vcpu_srcu_read_lock(vcpu); 2169 2170 switch (msr) { 2171 case APIC_BASE_MSR + (APIC_ICR >> 4): 2172 data = kvm_read_edx_eax(vcpu); 2173 + handled = !handle_fastpath_set_x2apic_icr_irqoff(vcpu, data); 2174 break; 2175 case MSR_IA32_TSC_DEADLINE: 2176 data = kvm_read_edx_eax(vcpu); 2177 + handled = !handle_fastpath_set_tscdeadline(vcpu, data); 2178 break; 2179 default: 2180 + handled = false; 2181 break; 2182 } 2183 2184 + if (handled) { 2185 + if (!kvm_skip_emulated_instruction(vcpu)) 2186 + ret = EXIT_FASTPATH_EXIT_USERSPACE; 2187 + else 2188 + ret = EXIT_FASTPATH_REENTER_GUEST; 2189 trace_kvm_msr_write(msr, data); 2190 + } else { 2191 + ret = EXIT_FASTPATH_NONE; 2192 + } 2193 2194 kvm_vcpu_srcu_read_unlock(vcpu); 2195 ··· 3746 mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa)); 3747 } 3748 3749 int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) 3750 { 3751 u32 msr = msr_info->index; ··· 4139 if (kvm_pmu_is_valid_msr(vcpu, msr)) 4140 return kvm_pmu_set_msr(vcpu, msr_info); 4141 4142 + return KVM_MSR_RET_UNSUPPORTED; 4143 } 4144 return 0; 4145 } ··· 4498 if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) 4499 return kvm_pmu_get_msr(vcpu, msr_info); 4500 4501 + return KVM_MSR_RET_UNSUPPORTED; 4502 } 4503 return 0; 4504 } ··· 4946 break; 4947 } 4948 case KVM_GET_MSRS: 4949 + r = msr_io(NULL, argp, do_get_feature_msr, 1); 4950 break; 4951 #ifdef CONFIG_KVM_HYPERV 4952 case KVM_GET_SUPPORTED_HV_CPUID: ··· 7383 7384 static void kvm_probe_feature_msr(u32 msr_index) 7385 { 7386 + u64 data; 7387 7388 + if (kvm_get_feature_msr(NULL, msr_index, &data, true)) 7389 return; 7390 7391 msr_based_features[num_msr_based_features++] = msr_index; ··· 8865 return 1; 8866 } 8867 8868 + static bool kvm_unprotect_and_retry_on_failure(struct kvm_vcpu *vcpu, 8869 + gpa_t cr2_or_gpa, 8870 + int emulation_type) 8871 { 8872 if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF)) 8873 return false; 8874 8875 /* 8876 * If the failed instruction faulted on an access to page tables that ··· 8929 * then zap the SPTE to unprotect the gfn, and then do it all over 8930 * again. Report the error to userspace. 8931 */ 8932 + if (emulation_type & EMULTYPE_WRITE_PF_TO_SP) 8933 + return false; 8934 8935 /* 8936 + * If emulation may have been triggered by a write to a shadowed page 8937 + * table, unprotect the gfn (zap any relevant SPTEs) and re-enter the 8938 + * guest to let the CPU re-execute the instruction in the hope that the 8939 + * CPU can cleanly execute the instruction that KVM failed to emulate. 8940 */ 8941 + __kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa, true); 8942 8943 + /* 8944 + * Retry even if _this_ vCPU didn't unprotect the gfn, as it's possible 8945 + * all SPTEs were already zapped by a different task. The alternative 8946 + * is to report the error to userspace and likely terminate the guest, 8947 + * and the last_retry_{eip,addr} checks will prevent retrying the page 8948 + * fault indefinitely, i.e. there's nothing to lose by retrying. 
8949 + */ 8950 return true; 8951 } 8952 ··· 9176 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; 9177 bool writeback = true; 9178 9179 + if ((emulation_type & EMULTYPE_ALLOW_RETRY_PF) && 9180 + (WARN_ON_ONCE(is_guest_mode(vcpu)) || 9181 + WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF)))) 9182 + emulation_type &= ~EMULTYPE_ALLOW_RETRY_PF; 9183 + 9184 r = kvm_check_emulate_insn(vcpu, emulation_type, insn, insn_len); 9185 if (r != X86EMUL_CONTINUE) { 9186 if (r == X86EMUL_RETRY_INSTR || r == X86EMUL_PROPAGATE_FAULT) ··· 9206 kvm_queue_exception(vcpu, UD_VECTOR); 9207 return 1; 9208 } 9209 + if (kvm_unprotect_and_retry_on_failure(vcpu, cr2_or_gpa, 9210 + emulation_type)) 9211 return 1; 9212 9213 if (ctxt->have_exception && ··· 9254 return 1; 9255 } 9256 9257 + /* 9258 + * If emulation was caused by a write-protection #PF on a non-page_table 9259 + * writing instruction, try to unprotect the gfn, i.e. zap shadow pages, 9260 + * and retry the instruction, as the vCPU is likely no longer using the 9261 + * gfn as a page table. 9262 + */ 9263 + if ((emulation_type & EMULTYPE_ALLOW_RETRY_PF) && 9264 + !x86_page_table_writing_insn(ctxt) && 9265 + kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa)) 9266 return 1; 9267 9268 /* this is needed for vmware backdoor interface to work since it ··· 9285 return 1; 9286 9287 if (r == EMULATION_FAILED) { 9288 + if (kvm_unprotect_and_retry_on_failure(vcpu, cr2_or_gpa, 9289 + emulation_type)) 9290 return 1; 9291 9292 return handle_emulation_failure(vcpu, emulation_type); ··· 9753 9754 guard(mutex)(&vendor_module_lock); 9755 9756 + if (kvm_x86_ops.enable_virtualization_cpu) { 9757 pr_err("already loaded vendor module '%s'\n", kvm_x86_ops.name); 9758 return -EEXIST; 9759 } ··· 9880 return 0; 9881 9882 out_unwind_ops: 9883 + kvm_x86_ops.enable_virtualization_cpu = NULL; 9884 kvm_x86_call(hardware_unsetup)(); 9885 out_mmu_exit: 9886 kvm_mmu_vendor_module_exit(); ··· 9921 WARN_ON(static_branch_unlikely(&kvm_xen_enabled.key)); 9922 #endif 9923 mutex_lock(&vendor_module_lock); 9924 + kvm_x86_ops.enable_virtualization_cpu = NULL; 9925 mutex_unlock(&vendor_module_lock); 9926 } 9927 EXPORT_SYMBOL_GPL(kvm_x86_vendor_exit); 9928 9929 #ifdef CONFIG_X86_64 9930 static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr, ··· 11207 if (vcpu->arch.apic_attention) 11208 kvm_lapic_sync_from_vapic(vcpu); 11209 11210 + if (unlikely(exit_fastpath == EXIT_FASTPATH_EXIT_USERSPACE)) 11211 + return 0; 11212 + 11213 r = kvm_x86_call(handle_exit)(vcpu, exit_fastpath); 11214 return r; 11215 ··· 11218 kvm_lapic_sync_from_vapic(vcpu); 11219 out: 11220 return r; 11221 + } 11222 + 11223 + static bool kvm_vcpu_running(struct kvm_vcpu *vcpu) 11224 + { 11225 + return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && 11226 + !vcpu->arch.apf.halted); 11227 + } 11228 + 11229 + static bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) 11230 + { 11231 + if (!list_empty_careful(&vcpu->async_pf.done)) 11232 + return true; 11233 + 11234 + if (kvm_apic_has_pending_init_or_sipi(vcpu) && 11235 + kvm_apic_init_sipi_allowed(vcpu)) 11236 + return true; 11237 + 11238 + if (vcpu->arch.pv.pv_unhalted) 11239 + return true; 11240 + 11241 + if (kvm_is_exception_pending(vcpu)) 11242 + return true; 11243 + 11244 + if (kvm_test_request(KVM_REQ_NMI, vcpu) || 11245 + (vcpu->arch.nmi_pending && 11246 + kvm_x86_call(nmi_allowed)(vcpu, false))) 11247 + return true; 11248 + 11249 + #ifdef CONFIG_KVM_SMM 11250 + if (kvm_test_request(KVM_REQ_SMI, vcpu) || 11251 + (vcpu->arch.smi_pending && 11252 + 
kvm_x86_call(smi_allowed)(vcpu, false))) 11253 + return true; 11254 + #endif 11255 + 11256 + if (kvm_test_request(KVM_REQ_PMI, vcpu)) 11257 + return true; 11258 + 11259 + if (kvm_test_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, vcpu)) 11260 + return true; 11261 + 11262 + if (kvm_arch_interrupt_allowed(vcpu) && kvm_cpu_has_interrupt(vcpu)) 11263 + return true; 11264 + 11265 + if (kvm_hv_has_stimer_pending(vcpu)) 11266 + return true; 11267 + 11268 + if (is_guest_mode(vcpu) && 11269 + kvm_x86_ops.nested_ops->has_events && 11270 + kvm_x86_ops.nested_ops->has_events(vcpu, false)) 11271 + return true; 11272 + 11273 + if (kvm_xen_has_pending_events(vcpu)) 11274 + return true; 11275 + 11276 + return false; 11277 + } 11278 + 11279 + int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) 11280 + { 11281 + return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu); 11282 } 11283 11284 /* Called within kvm->srcu read side. */ ··· 11291 return 1; 11292 } 11293 11294 /* Called within kvm->srcu read side. */ 11295 static int vcpu_run(struct kvm_vcpu *vcpu) 11296 { ··· 11346 } 11347 11348 return r; 11349 + } 11350 + 11351 + static int __kvm_emulate_halt(struct kvm_vcpu *vcpu, int state, int reason) 11352 + { 11353 + /* 11354 + * The vCPU has halted, e.g. executed HLT. Update the run state if the 11355 + * local APIC is in-kernel, the run loop will detect the non-runnable 11356 + * state and halt the vCPU. Exit to userspace if the local APIC is 11357 + * managed by userspace, in which case userspace is responsible for 11358 + * handling wake events. 11359 + */ 11360 + ++vcpu->stat.halt_exits; 11361 + if (lapic_in_kernel(vcpu)) { 11362 + if (kvm_vcpu_has_events(vcpu)) 11363 + vcpu->arch.pv.pv_unhalted = false; 11364 + else 11365 + vcpu->arch.mp_state = state; 11366 + return 1; 11367 + } else { 11368 + vcpu->run->exit_reason = reason; 11369 + return 0; 11370 + } 11371 + } 11372 + 11373 + int kvm_emulate_halt_noskip(struct kvm_vcpu *vcpu) 11374 + { 11375 + return __kvm_emulate_halt(vcpu, KVM_MP_STATE_HALTED, KVM_EXIT_HLT); 11376 + } 11377 + EXPORT_SYMBOL_GPL(kvm_emulate_halt_noskip); 11378 + 11379 + int kvm_emulate_halt(struct kvm_vcpu *vcpu) 11380 + { 11381 + int ret = kvm_skip_emulated_instruction(vcpu); 11382 + /* 11383 + * TODO: we might be squashing a GUESTDBG_SINGLESTEP-triggered 11384 + * KVM_EXIT_DEBUG here. 
11385 + */ 11386 + return kvm_emulate_halt_noskip(vcpu) && ret; 11387 + } 11388 + EXPORT_SYMBOL_GPL(kvm_emulate_halt); 11389 + 11390 + fastpath_t handle_fastpath_hlt(struct kvm_vcpu *vcpu) 11391 + { 11392 + int ret; 11393 + 11394 + kvm_vcpu_srcu_read_lock(vcpu); 11395 + ret = kvm_emulate_halt(vcpu); 11396 + kvm_vcpu_srcu_read_unlock(vcpu); 11397 + 11398 + if (!ret) 11399 + return EXIT_FASTPATH_EXIT_USERSPACE; 11400 + 11401 + if (kvm_vcpu_running(vcpu)) 11402 + return EXIT_FASTPATH_REENTER_GUEST; 11403 + 11404 + return EXIT_FASTPATH_EXIT_HANDLED; 11405 + } 11406 + EXPORT_SYMBOL_GPL(handle_fastpath_hlt); 11407 + 11408 + int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu) 11409 + { 11410 + int ret = kvm_skip_emulated_instruction(vcpu); 11411 + 11412 + return __kvm_emulate_halt(vcpu, KVM_MP_STATE_AP_RESET_HOLD, 11413 + KVM_EXIT_AP_RESET_HOLD) && ret; 11414 + } 11415 + EXPORT_SYMBOL_GPL(kvm_emulate_ap_reset_hold); 11416 + 11417 + bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu) 11418 + { 11419 + return kvm_vcpu_apicv_active(vcpu) && 11420 + kvm_x86_call(dy_apicv_has_pending_interrupt)(vcpu); 11421 + } 11422 + 11423 + bool kvm_arch_vcpu_preempted_in_kernel(struct kvm_vcpu *vcpu) 11424 + { 11425 + return vcpu->arch.preempted_in_kernel; 11426 + } 11427 + 11428 + bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu) 11429 + { 11430 + if (READ_ONCE(vcpu->arch.pv.pv_unhalted)) 11431 + return true; 11432 + 11433 + if (kvm_test_request(KVM_REQ_NMI, vcpu) || 11434 + #ifdef CONFIG_KVM_SMM 11435 + kvm_test_request(KVM_REQ_SMI, vcpu) || 11436 + #endif 11437 + kvm_test_request(KVM_REQ_EVENT, vcpu)) 11438 + return true; 11439 + 11440 + return kvm_arch_dy_has_pending_interrupt(vcpu); 11441 } 11442 11443 static inline int complete_emulated_io(struct kvm_vcpu *vcpu) ··· 12264 vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); 12265 vcpu->arch.reserved_gpa_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu); 12266 12267 kvm_async_pf_hash_reset(vcpu); 12268 12269 vcpu->arch.perf_capabilities = kvm_caps.supported_perf_cap; ··· 12431 if (!init_event) { 12432 vcpu->arch.smbase = 0x30000; 12433 12434 + vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT; 12435 + 12436 vcpu->arch.msr_misc_features_enables = 0; 12437 vcpu->arch.ia32_misc_enable_msr = MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL | 12438 MSR_IA32_MISC_ENABLE_BTS_UNAVAIL; ··· 12516 } 12517 EXPORT_SYMBOL_GPL(kvm_vcpu_deliver_sipi_vector); 12518 12519 + void kvm_arch_enable_virtualization(void) 12520 + { 12521 + cpu_emergency_register_virt_callback(kvm_x86_ops.emergency_disable_virtualization_cpu); 12522 + } 12523 + 12524 + void kvm_arch_disable_virtualization(void) 12525 + { 12526 + cpu_emergency_unregister_virt_callback(kvm_x86_ops.emergency_disable_virtualization_cpu); 12527 + } 12528 + 12529 + int kvm_arch_enable_virtualization_cpu(void) 12530 { 12531 struct kvm *kvm; 12532 struct kvm_vcpu *vcpu; ··· 12532 if (ret) 12533 return ret; 12534 12535 + ret = kvm_x86_call(enable_virtualization_cpu)(); 12536 if (ret != 0) 12537 return ret; 12538 ··· 12612 return 0; 12613 } 12614 12615 + void kvm_arch_disable_virtualization_cpu(void) 12616 { 12617 + kvm_x86_call(disable_virtualization_cpu)(); 12618 drop_user_return_notifiers(); 12619 } 12620 ··· 13160 /* Free the arrays associated with the old memslot. */ 13161 if (change == KVM_MR_MOVE) 13162 kvm_arch_free_memslot(kvm, old); 13163 } 13164 13165 bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
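For context on how the KVM_MSR_RET_UNSUPPORTED/KVM_MSR_RET_FILTERED plumbing above surfaces to userspace: when a VMM opts into MSR exits via KVM_CAP_X86_USER_SPACE_MSR, kvm_msr_reason() maps the internal codes to run->msr.reason. The handler below is a minimal, illustrative sketch of the VMM side, not code from this series; handle_rdmsr_exit() and the read-as-zero policy for unknown MSRs are hypothetical choices a VMM might make.

#include <stdbool.h>
#include <linux/kvm.h>

static bool handle_rdmsr_exit(struct kvm_run *run)
{
	if (run->exit_reason != KVM_EXIT_X86_RDMSR)
		return false;

	switch (run->msr.reason) {
	case KVM_MSR_EXIT_REASON_UNKNOWN:
		/* KVM_MSR_RET_UNSUPPORTED in the tables above ends up here. */
		run->msr.data = 0;
		run->msr.error = 0;	/* emulate the MSR as read-as-zero */
		break;
	case KVM_MSR_EXIT_REASON_FILTER:
		/* KVM_MSR_RET_FILTERED: denied by this VMM's own MSR filter. */
		run->msr.error = 1;	/* ask KVM to inject #GP */
		break;
	default:
		run->msr.error = 1;
		break;
	}
	return true;
}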
+26 -5
arch/x86/kvm/x86.h
··· 103 return max(val, min); 104 } 105 106 - #define MSR_IA32_CR_PAT_DEFAULT 0x0007040600070406ULL 107 108 void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu); 109 int kvm_check_nested_events(struct kvm_vcpu *vcpu); 110 111 static inline bool kvm_vcpu_has_run(struct kvm_vcpu *vcpu) 112 { ··· 341 int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, 342 int emulation_type, void *insn, int insn_len); 343 fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu); 344 345 extern struct kvm_caps kvm_caps; 346 extern struct kvm_host_values kvm_host; ··· 512 int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva); 513 bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type); 514 515 /* 516 * Internal error codes that are used to indicate that MSR emulation encountered 517 - * an error that should result in #GP in the guest, unless userspace 518 - * handles it. 519 */ 520 - #define KVM_MSR_RET_INVALID 2 /* in-kernel MSR emulation #GP condition */ 521 - #define KVM_MSR_RET_FILTERED 3 /* #GP due to userspace MSR filter */ 522 523 #define __cr4_reserved_bits(__cpu_has, __c) \ 524 ({ \
··· 103 return max(val, min); 104 } 105 106 + #define MSR_IA32_CR_PAT_DEFAULT \ 107 + PAT_VALUE(WB, WT, UC_MINUS, UC, WB, WT, UC_MINUS, UC) 108 109 void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu); 110 int kvm_check_nested_events(struct kvm_vcpu *vcpu); 111 + 112 + /* Forcibly leave the nested mode in cases like a vCPU reset */ 113 + static inline void kvm_leave_nested(struct kvm_vcpu *vcpu) 114 + { 115 + kvm_x86_ops.nested_ops->leave_nested(vcpu); 116 + } 117 118 static inline bool kvm_vcpu_has_run(struct kvm_vcpu *vcpu) 119 { ··· 334 int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, 335 int emulation_type, void *insn, int insn_len); 336 fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu); 337 + fastpath_t handle_fastpath_hlt(struct kvm_vcpu *vcpu); 338 339 extern struct kvm_caps kvm_caps; 340 extern struct kvm_host_values kvm_host; ··· 504 int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva); 505 bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type); 506 507 + enum kvm_msr_access { 508 + MSR_TYPE_R = BIT(0), 509 + MSR_TYPE_W = BIT(1), 510 + MSR_TYPE_RW = MSR_TYPE_R | MSR_TYPE_W, 511 + }; 512 + 513 /* 514 * Internal error codes that are used to indicate that MSR emulation encountered 515 + * an error that should result in #GP in the guest, unless userspace handles it. 516 + * Note, '1', '0', and negative numbers are off limits, as they are used by KVM 517 + * as part of KVM's lightly documented internal KVM_RUN return codes. 518 + * 519 + * UNSUPPORTED - The MSR isn't supported, either because it is completely 520 + * unknown to KVM, or because the MSR should not exist according 521 + * to the vCPU model. 522 + * 523 + * FILTERED - Access to the MSR is denied by a userspace MSR filter. 524 */ 525 + #define KVM_MSR_RET_UNSUPPORTED 2 526 + #define KVM_MSR_RET_FILTERED 3 527 528 #define __cr4_reserved_bits(__cpu_has, __c) \ 529 ({ \
+10 -26
arch/x86/mm/pat/memtype.c
··· 176 } 177 #endif 178 179 - enum { 180 - PAT_UC = 0, /* uncached */ 181 - PAT_WC = 1, /* Write combining */ 182 - PAT_WT = 4, /* Write Through */ 183 - PAT_WP = 5, /* Write Protected */ 184 - PAT_WB = 6, /* Write Back (default) */ 185 - PAT_UC_MINUS = 7, /* UC, but can be overridden by MTRR */ 186 - }; 187 - 188 #define CM(c) (_PAGE_CACHE_MODE_ ## c) 189 190 static enum page_cache_mode __init pat_get_cache_mode(unsigned int pat_val, ··· 185 char *cache_mode; 186 187 switch (pat_val) { 188 - case PAT_UC: cache = CM(UC); cache_mode = "UC "; break; 189 - case PAT_WC: cache = CM(WC); cache_mode = "WC "; break; 190 - case PAT_WT: cache = CM(WT); cache_mode = "WT "; break; 191 - case PAT_WP: cache = CM(WP); cache_mode = "WP "; break; 192 - case PAT_WB: cache = CM(WB); cache_mode = "WB "; break; 193 - case PAT_UC_MINUS: cache = CM(UC_MINUS); cache_mode = "UC- "; break; 194 - default: cache = CM(WB); cache_mode = "WB "; break; 195 } 196 197 memcpy(msg, cache_mode, 4); ··· 248 void __init pat_bp_init(void) 249 { 250 struct cpuinfo_x86 *c = &boot_cpu_data; 251 - #define PAT(p0, p1, p2, p3, p4, p5, p6, p7) \ 252 - (((u64)PAT_ ## p0) | ((u64)PAT_ ## p1 << 8) | \ 253 - ((u64)PAT_ ## p2 << 16) | ((u64)PAT_ ## p3 << 24) | \ 254 - ((u64)PAT_ ## p4 << 32) | ((u64)PAT_ ## p5 << 40) | \ 255 - ((u64)PAT_ ## p6 << 48) | ((u64)PAT_ ## p7 << 56)) 256 - 257 258 if (!IS_ENABLED(CONFIG_X86_PAT)) 259 pr_info_once("x86/PAT: PAT support disabled because CONFIG_X86_PAT is disabled in the kernel.\n"); ··· 278 * NOTE: When WC or WP is used, it is redirected to UC- per 279 * the default setup in __cachemode2pte_tbl[]. 280 */ 281 - pat_msr_val = PAT(WB, WT, UC_MINUS, UC, WB, WT, UC_MINUS, UC); 282 } 283 284 /* ··· 313 * NOTE: When WT or WP is used, it is redirected to UC- per 314 * the default setup in __cachemode2pte_tbl[]. 315 */ 316 - pat_msr_val = PAT(WB, WC, UC_MINUS, UC, WB, WC, UC_MINUS, UC); 317 } else { 318 /* 319 * Full PAT support. We put WT in slot 7 to improve ··· 341 * The reserved slots are unused, but mapped to their 342 * corresponding types in the presence of PAT errata. 343 */ 344 - pat_msr_val = PAT(WB, WC, UC_MINUS, UC, WB, WP, UC_MINUS, WT); 345 } 346 347 memory_caching_control |= CACHE_PAT; 348 349 init_cache_modes(pat_msr_val); 350 - #undef PAT 351 } 352 353 static DEFINE_SPINLOCK(memtype_lock); /* protects memtype accesses */
··· 176 } 177 #endif 178 179 #define CM(c) (_PAGE_CACHE_MODE_ ## c) 180 181 static enum page_cache_mode __init pat_get_cache_mode(unsigned int pat_val, ··· 194 char *cache_mode; 195 196 switch (pat_val) { 197 + case X86_MEMTYPE_UC: cache = CM(UC); cache_mode = "UC "; break; 198 + case X86_MEMTYPE_WC: cache = CM(WC); cache_mode = "WC "; break; 199 + case X86_MEMTYPE_WT: cache = CM(WT); cache_mode = "WT "; break; 200 + case X86_MEMTYPE_WP: cache = CM(WP); cache_mode = "WP "; break; 201 + case X86_MEMTYPE_WB: cache = CM(WB); cache_mode = "WB "; break; 202 + case X86_MEMTYPE_UC_MINUS: cache = CM(UC_MINUS); cache_mode = "UC- "; break; 203 + default: cache = CM(WB); cache_mode = "WB "; break; 204 } 205 206 memcpy(msg, cache_mode, 4); ··· 257 void __init pat_bp_init(void) 258 { 259 struct cpuinfo_x86 *c = &boot_cpu_data; 260 261 if (!IS_ENABLED(CONFIG_X86_PAT)) 262 pr_info_once("x86/PAT: PAT support disabled because CONFIG_X86_PAT is disabled in the kernel.\n"); ··· 293 * NOTE: When WC or WP is used, it is redirected to UC- per 294 * the default setup in __cachemode2pte_tbl[]. 295 */ 296 + pat_msr_val = PAT_VALUE(WB, WT, UC_MINUS, UC, WB, WT, UC_MINUS, UC); 297 } 298 299 /* ··· 328 * NOTE: When WT or WP is used, it is redirected to UC- per 329 * the default setup in __cachemode2pte_tbl[]. 330 */ 331 + pat_msr_val = PAT_VALUE(WB, WC, UC_MINUS, UC, WB, WC, UC_MINUS, UC); 332 } else { 333 /* 334 * Full PAT support. We put WT in slot 7 to improve ··· 356 * The reserved slots are unused, but mapped to their 357 * corresponding types in the presence of PAT errata. 358 */ 359 + pat_msr_val = PAT_VALUE(WB, WC, UC_MINUS, UC, WB, WP, UC_MINUS, WT); 360 } 361 362 memory_caching_control |= CACHE_PAT; 363 364 init_cache_modes(pat_msr_val); 365 } 366 367 static DEFINE_SPINLOCK(memtype_lock); /* protects memtype accesses */
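Because the open-coded PAT() macro and PAT_* enum are removed above, it is worth spelling out that the PAT_VALUE()-based default is bit-for-bit identical to the old 0x0007040600070406ULL literal that MSR_IA32_CR_PAT_DEFAULT used to be. The snippet below is a standalone sanity check; the X86_MEMTYPE_* values and the PAT_VALUE() definition are re-derived here from the removed enum/macro for illustration, not copied from the in-tree headers.

#include <stdint.h>

#define X86_MEMTYPE_UC		0ULL	/* uncached */
#define X86_MEMTYPE_WC		1ULL	/* Write combining */
#define X86_MEMTYPE_WT		4ULL	/* Write Through */
#define X86_MEMTYPE_WP		5ULL	/* Write Protected */
#define X86_MEMTYPE_WB		6ULL	/* Write Back (default) */
#define X86_MEMTYPE_UC_MINUS	7ULL	/* UC, but can be overridden by MTRR */

#define PAT_VALUE(p0, p1, p2, p3, p4, p5, p6, p7)			\
	((X86_MEMTYPE_ ## p0)       | (X86_MEMTYPE_ ## p1 << 8)  |	\
	 (X86_MEMTYPE_ ## p2 << 16) | (X86_MEMTYPE_ ## p3 << 24) |	\
	 (X86_MEMTYPE_ ## p4 << 32) | (X86_MEMTYPE_ ## p5 << 40) |	\
	 (X86_MEMTYPE_ ## p6 << 48) | (X86_MEMTYPE_ ## p7 << 56))

/* The new default must encode exactly what the old literal encoded. */
_Static_assert(PAT_VALUE(WB, WT, UC_MINUS, UC, WB, WT, UC_MINUS, UC) ==
	       0x0007040600070406ULL, "PAT default encoding changed");

int main(void)
{
	return 0;
}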
+16 -2
include/linux/kvm_host.h
··· 1529 #endif 1530 1531 #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING 1532 - int kvm_arch_hardware_enable(void); 1533 - void kvm_arch_hardware_disable(void); 1534 #endif 1535 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu); 1536 bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu);
··· 1529 #endif 1530 1531 #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING 1532 + /* 1533 + * kvm_arch_{enable,disable}_virtualization() are called on one CPU, under 1534 + * kvm_usage_lock, immediately after/before 0=>1 and 1=>0 transitions of 1535 + * kvm_usage_count, i.e. at the beginning of the generic hardware enabling 1536 + * sequence, and at the end of the generic hardware disabling sequence. 1537 + */ 1538 + void kvm_arch_enable_virtualization(void); 1539 + void kvm_arch_disable_virtualization(void); 1540 + /* 1541 + * kvm_arch_{enable,disable}_virtualization_cpu() are called on "every" CPU to 1542 + * do the actual twiddling of hardware bits. The hooks are called on all 1543 + * online CPUs when KVM enables/disables virtualization, and on a single CPU 1544 + * when that CPU is onlined/offlined (including for Resume/Suspend). 1545 + */ 1546 + int kvm_arch_enable_virtualization_cpu(void); 1547 + void kvm_arch_disable_virtualization_cpu(void); 1548 #endif 1549 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu); 1550 bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu);
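To make the split between the one-shot and per-CPU hooks concrete, here is a simplified illustration of the enabling sequence the comments describe. This is not the virt/kvm/kvm_main.c implementation: the lock, the counter and the wrapper below are stand-ins, and the real code drives the per-CPU hook through cpuhp/syscore callbacks rather than a bare on_each_cpu().

/* Illustrative sketch only -- not the actual generic KVM code. */
static DEFINE_MUTEX(kvm_usage_lock);
static int kvm_usage_count;

static void enable_virtualization_cpu_wrapper(void *unused)
{
	WARN_ON_ONCE(kvm_arch_enable_virtualization_cpu());
}

static void sketch_enable_virtualization(void)
{
	mutex_lock(&kvm_usage_lock);
	if (!kvm_usage_count++) {
		/* 0=>1 transition: one-time setup, on a single CPU... */
		kvm_arch_enable_virtualization();
		/* ...then twiddle the hardware bits on every online CPU. */
		on_each_cpu(enable_virtualization_cpu_wrapper, NULL, 1);
	}
	mutex_unlock(&kvm_usage_lock);
}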
+4
tools/testing/selftests/kvm/.gitignore
··· 5 !*.h 6 !*.S 7 !*.sh
··· 5 !*.h 6 !*.S 7 !*.sh 8 + !.gitignore 9 + !config 10 + !settings 11 + !Makefile
+4
tools/testing/selftests/kvm/Makefile
··· 130 TEST_GEN_PROGS_x86_64 += x86_64/triple_fault_event_test 131 TEST_GEN_PROGS_x86_64 += x86_64/recalc_apic_map_test 132 TEST_GEN_PROGS_x86_64 += access_tracking_perf_test 133 TEST_GEN_PROGS_x86_64 += demand_paging_test 134 TEST_GEN_PROGS_x86_64 += dirty_log_test 135 TEST_GEN_PROGS_x86_64 += dirty_log_perf_test ··· 168 TEST_GEN_PROGS_aarch64 += aarch64/no-vgic-v3 169 TEST_GEN_PROGS_aarch64 += access_tracking_perf_test 170 TEST_GEN_PROGS_aarch64 += arch_timer 171 TEST_GEN_PROGS_aarch64 += demand_paging_test 172 TEST_GEN_PROGS_aarch64 += dirty_log_test 173 TEST_GEN_PROGS_aarch64 += dirty_log_perf_test ··· 190 TEST_GEN_PROGS_s390x += s390x/cmma_test 191 TEST_GEN_PROGS_s390x += s390x/debug_test 192 TEST_GEN_PROGS_s390x += s390x/shared_zeropage_test 193 TEST_GEN_PROGS_s390x += demand_paging_test 194 TEST_GEN_PROGS_s390x += dirty_log_test 195 TEST_GEN_PROGS_s390x += guest_print_test ··· 203 TEST_GEN_PROGS_riscv += riscv/sbi_pmu_test 204 TEST_GEN_PROGS_riscv += riscv/ebreak_test 205 TEST_GEN_PROGS_riscv += arch_timer 206 TEST_GEN_PROGS_riscv += demand_paging_test 207 TEST_GEN_PROGS_riscv += dirty_log_test 208 TEST_GEN_PROGS_riscv += get-reg-list
··· 130 TEST_GEN_PROGS_x86_64 += x86_64/triple_fault_event_test 131 TEST_GEN_PROGS_x86_64 += x86_64/recalc_apic_map_test 132 TEST_GEN_PROGS_x86_64 += access_tracking_perf_test 133 + TEST_GEN_PROGS_x86_64 += coalesced_io_test 134 TEST_GEN_PROGS_x86_64 += demand_paging_test 135 TEST_GEN_PROGS_x86_64 += dirty_log_test 136 TEST_GEN_PROGS_x86_64 += dirty_log_perf_test ··· 167 TEST_GEN_PROGS_aarch64 += aarch64/no-vgic-v3 168 TEST_GEN_PROGS_aarch64 += access_tracking_perf_test 169 TEST_GEN_PROGS_aarch64 += arch_timer 170 + TEST_GEN_PROGS_aarch64 += coalesced_io_test 171 TEST_GEN_PROGS_aarch64 += demand_paging_test 172 TEST_GEN_PROGS_aarch64 += dirty_log_test 173 TEST_GEN_PROGS_aarch64 += dirty_log_perf_test ··· 188 TEST_GEN_PROGS_s390x += s390x/cmma_test 189 TEST_GEN_PROGS_s390x += s390x/debug_test 190 TEST_GEN_PROGS_s390x += s390x/shared_zeropage_test 191 + TEST_GEN_PROGS_s390x += s390x/ucontrol_test 192 TEST_GEN_PROGS_s390x += demand_paging_test 193 TEST_GEN_PROGS_s390x += dirty_log_test 194 TEST_GEN_PROGS_s390x += guest_print_test ··· 200 TEST_GEN_PROGS_riscv += riscv/sbi_pmu_test 201 TEST_GEN_PROGS_riscv += riscv/ebreak_test 202 TEST_GEN_PROGS_riscv += arch_timer 203 + TEST_GEN_PROGS_riscv += coalesced_io_test 204 TEST_GEN_PROGS_riscv += demand_paging_test 205 TEST_GEN_PROGS_riscv += dirty_log_test 206 TEST_GEN_PROGS_riscv += get-reg-list
+236
tools/testing/selftests/kvm/coalesced_io_test.c
···
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + #include <signal.h> 3 + #include <stdio.h> 4 + #include <stdlib.h> 5 + #include <string.h> 6 + #include <sys/ioctl.h> 7 + 8 + #include <linux/sizes.h> 9 + 10 + #include <kvm_util.h> 11 + #include <processor.h> 12 + 13 + #include "ucall_common.h" 14 + 15 + struct kvm_coalesced_io { 16 + struct kvm_coalesced_mmio_ring *ring; 17 + uint32_t ring_size; 18 + uint64_t mmio_gpa; 19 + uint64_t *mmio; 20 + 21 + /* 22 + * x86-only, but define pio_port for all architectures to minimize the 23 + * amount of #ifdeffery and complexity, without having to sacrifice 24 + * verbose error messages. 25 + */ 26 + uint8_t pio_port; 27 + }; 28 + 29 + static struct kvm_coalesced_io kvm_builtin_io_ring; 30 + 31 + #ifdef __x86_64__ 32 + static const int has_pio = 1; 33 + #else 34 + static const int has_pio = 0; 35 + #endif 36 + 37 + static void guest_code(struct kvm_coalesced_io *io) 38 + { 39 + int i, j; 40 + 41 + for (;;) { 42 + for (j = 0; j < 1 + has_pio; j++) { 43 + /* 44 + * KVM always leaves one free entry, i.e. exits to 45 + * userspace before the last entry is filled. 46 + */ 47 + for (i = 0; i < io->ring_size - 1; i++) { 48 + #ifdef __x86_64__ 49 + if (i & 1) 50 + outl(io->pio_port, io->pio_port + i); 51 + else 52 + #endif 53 + WRITE_ONCE(*io->mmio, io->mmio_gpa + i); 54 + } 55 + #ifdef __x86_64__ 56 + if (j & 1) 57 + outl(io->pio_port, io->pio_port + i); 58 + else 59 + #endif 60 + WRITE_ONCE(*io->mmio, io->mmio_gpa + i); 61 + } 62 + GUEST_SYNC(0); 63 + 64 + WRITE_ONCE(*io->mmio, io->mmio_gpa + i); 65 + #ifdef __x86_64__ 66 + outl(io->pio_port, io->pio_port + i); 67 + #endif 68 + } 69 + } 70 + 71 + static void vcpu_run_and_verify_io_exit(struct kvm_vcpu *vcpu, 72 + struct kvm_coalesced_io *io, 73 + uint32_t ring_start, 74 + uint32_t expected_exit) 75 + { 76 + const bool want_pio = expected_exit == KVM_EXIT_IO; 77 + struct kvm_coalesced_mmio_ring *ring = io->ring; 78 + struct kvm_run *run = vcpu->run; 79 + uint32_t pio_value; 80 + 81 + WRITE_ONCE(ring->first, ring_start); 82 + WRITE_ONCE(ring->last, ring_start); 83 + 84 + vcpu_run(vcpu); 85 + 86 + /* 87 + * Annoyingly, reading PIO data is safe only for PIO exits, otherwise 88 + * data_offset is garbage, e.g. an MMIO gpa. 89 + */ 90 + if (run->exit_reason == KVM_EXIT_IO) 91 + pio_value = *(uint32_t *)((void *)run + run->io.data_offset); 92 + else 93 + pio_value = 0; 94 + 95 + TEST_ASSERT((!want_pio && (run->exit_reason == KVM_EXIT_MMIO && run->mmio.is_write && 96 + run->mmio.phys_addr == io->mmio_gpa && run->mmio.len == 8 && 97 + *(uint64_t *)run->mmio.data == io->mmio_gpa + io->ring_size - 1)) || 98 + (want_pio && (run->exit_reason == KVM_EXIT_IO && run->io.port == io->pio_port && 99 + run->io.direction == KVM_EXIT_IO_OUT && run->io.count == 1 && 100 + pio_value == io->pio_port + io->ring_size - 1)), 101 + "For start = %u, expected exit on %u-byte %s write 0x%llx = %lx, got exit_reason = %u (%s)\n " 102 + "(MMIO addr = 0x%llx, write = %u, len = %u, data = %lx)\n " 103 + "(PIO port = 0x%x, write = %u, len = %u, count = %u, data = %x", 104 + ring_start, want_pio ? 4 : 8, want_pio ? "PIO" : "MMIO", 105 + want_pio ? (unsigned long long)io->pio_port : io->mmio_gpa, 106 + (want_pio ? io->pio_port : io->mmio_gpa) + io->ring_size - 1, run->exit_reason, 107 + run->exit_reason == KVM_EXIT_MMIO ? "MMIO" : run->exit_reason == KVM_EXIT_IO ? 
"PIO" : "other", 108 + run->mmio.phys_addr, run->mmio.is_write, run->mmio.len, *(uint64_t *)run->mmio.data, 109 + run->io.port, run->io.direction, run->io.size, run->io.count, pio_value); 110 + } 111 + 112 + static void vcpu_run_and_verify_coalesced_io(struct kvm_vcpu *vcpu, 113 + struct kvm_coalesced_io *io, 114 + uint32_t ring_start, 115 + uint32_t expected_exit) 116 + { 117 + struct kvm_coalesced_mmio_ring *ring = io->ring; 118 + int i; 119 + 120 + vcpu_run_and_verify_io_exit(vcpu, io, ring_start, expected_exit); 121 + 122 + TEST_ASSERT((ring->last + 1) % io->ring_size == ring->first, 123 + "Expected ring to be full (minus 1), first = %u, last = %u, max = %u, start = %u", 124 + ring->first, ring->last, io->ring_size, ring_start); 125 + 126 + for (i = 0; i < io->ring_size - 1; i++) { 127 + uint32_t idx = (ring->first + i) % io->ring_size; 128 + struct kvm_coalesced_mmio *entry = &ring->coalesced_mmio[idx]; 129 + 130 + #ifdef __x86_64__ 131 + if (i & 1) 132 + TEST_ASSERT(entry->phys_addr == io->pio_port && 133 + entry->len == 4 && entry->pio && 134 + *(uint32_t *)entry->data == io->pio_port + i, 135 + "Wanted 4-byte port I/O 0x%x = 0x%x in entry %u, got %u-byte %s 0x%llx = 0x%x", 136 + io->pio_port, io->pio_port + i, i, 137 + entry->len, entry->pio ? "PIO" : "MMIO", 138 + entry->phys_addr, *(uint32_t *)entry->data); 139 + else 140 + #endif 141 + TEST_ASSERT(entry->phys_addr == io->mmio_gpa && 142 + entry->len == 8 && !entry->pio, 143 + "Wanted 8-byte MMIO to 0x%lx = %lx in entry %u, got %u-byte %s 0x%llx = 0x%lx", 144 + io->mmio_gpa, io->mmio_gpa + i, i, 145 + entry->len, entry->pio ? "PIO" : "MMIO", 146 + entry->phys_addr, *(uint64_t *)entry->data); 147 + } 148 + } 149 + 150 + static void test_coalesced_io(struct kvm_vcpu *vcpu, 151 + struct kvm_coalesced_io *io, uint32_t ring_start) 152 + { 153 + struct kvm_coalesced_mmio_ring *ring = io->ring; 154 + 155 + kvm_vm_register_coalesced_io(vcpu->vm, io->mmio_gpa, 8, false /* pio */); 156 + #ifdef __x86_64__ 157 + kvm_vm_register_coalesced_io(vcpu->vm, io->pio_port, 8, true /* pio */); 158 + #endif 159 + 160 + vcpu_run_and_verify_coalesced_io(vcpu, io, ring_start, KVM_EXIT_MMIO); 161 + #ifdef __x86_64__ 162 + vcpu_run_and_verify_coalesced_io(vcpu, io, ring_start, KVM_EXIT_IO); 163 + #endif 164 + 165 + /* 166 + * Verify ucall, which may use non-coalesced MMIO or PIO, generates an 167 + * immediate exit. 168 + */ 169 + WRITE_ONCE(ring->first, ring_start); 170 + WRITE_ONCE(ring->last, ring_start); 171 + vcpu_run(vcpu); 172 + TEST_ASSERT_EQ(get_ucall(vcpu, NULL), UCALL_SYNC); 173 + TEST_ASSERT_EQ(ring->first, ring_start); 174 + TEST_ASSERT_EQ(ring->last, ring_start); 175 + 176 + /* Verify that non-coalesced MMIO/PIO generates an exit to userspace. 
*/ 177 + kvm_vm_unregister_coalesced_io(vcpu->vm, io->mmio_gpa, 8, false /* pio */); 178 + vcpu_run_and_verify_io_exit(vcpu, io, ring_start, KVM_EXIT_MMIO); 179 + 180 + #ifdef __x86_64__ 181 + kvm_vm_unregister_coalesced_io(vcpu->vm, io->pio_port, 8, true /* pio */); 182 + vcpu_run_and_verify_io_exit(vcpu, io, ring_start, KVM_EXIT_IO); 183 + #endif 184 + } 185 + 186 + int main(int argc, char *argv[]) 187 + { 188 + struct kvm_vcpu *vcpu; 189 + struct kvm_vm *vm; 190 + int i; 191 + 192 + TEST_REQUIRE(kvm_has_cap(KVM_CAP_COALESCED_MMIO)); 193 + 194 + #ifdef __x86_64__ 195 + TEST_REQUIRE(kvm_has_cap(KVM_CAP_COALESCED_PIO)); 196 + #endif 197 + 198 + vm = vm_create_with_one_vcpu(&vcpu, guest_code); 199 + 200 + kvm_builtin_io_ring = (struct kvm_coalesced_io) { 201 + /* 202 + * The I/O ring is a kernel-allocated page whose address is 203 + * relative to each vCPU's run page, with the page offset 204 + * provided by KVM in the return of KVM_CAP_COALESCED_MMIO. 205 + */ 206 + .ring = (void *)vcpu->run + 207 + (kvm_check_cap(KVM_CAP_COALESCED_MMIO) * getpagesize()), 208 + 209 + /* 210 + * The size of the I/O ring is fixed, but KVM defines the sized 211 + * based on the kernel's PAGE_SIZE. Thus, userspace must query 212 + * the host's page size at runtime to compute the ring size. 213 + */ 214 + .ring_size = (getpagesize() - sizeof(struct kvm_coalesced_mmio_ring)) / 215 + sizeof(struct kvm_coalesced_mmio), 216 + 217 + /* 218 + * Arbitrary address+port (MMIO mustn't overlap memslots), with 219 + * the MMIO GPA identity mapped in the guest. 220 + */ 221 + .mmio_gpa = 4ull * SZ_1G, 222 + .mmio = (uint64_t *)(4ull * SZ_1G), 223 + .pio_port = 0x80, 224 + }; 225 + 226 + virt_map(vm, (uint64_t)kvm_builtin_io_ring.mmio, kvm_builtin_io_ring.mmio_gpa, 1); 227 + 228 + sync_global_to_guest(vm, kvm_builtin_io_ring); 229 + vcpu_args_set(vcpu, 1, &kvm_builtin_io_ring); 230 + 231 + for (i = 0; i < kvm_builtin_io_ring.ring_size; i++) 232 + test_coalesced_io(vcpu, &kvm_builtin_io_ring, i); 233 + 234 + kvm_vm_free(vm); 235 + return 0; 236 + }
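The test above exercises the producer side, i.e. KVM filling the ring and exiting once it is full minus one entry. For completeness, a sketch of the consumer side a VMM pairs with it after KVM_RUN returns; process_one_io() is a hypothetical callback, and ring_size must be derived from the host page size exactly as in the test. There is no concurrent writer once KVM_RUN has returned for this vCPU, so plain accesses suffice here.

#include <stdint.h>
#include <linux/kvm.h>

static void process_one_io(struct kvm_coalesced_mmio *e);	/* hypothetical */

static void drain_coalesced_ring(struct kvm_coalesced_mmio_ring *ring,
				 uint32_t ring_size)
{
	while (ring->first != ring->last) {
		struct kvm_coalesced_mmio *e =
			&ring->coalesced_mmio[ring->first];

		/*
		 * Replay the deferred write: e->len bytes of e->data at
		 * e->phys_addr, with e->pio distinguishing PIO from MMIO
		 * on x86.
		 */
		process_one_io(e);

		ring->first = (ring->first + 1) % ring_size;
	}
}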
+17 -2
tools/testing/selftests/kvm/guest_print_test.c
··· 107 expected_assert_msg, &assert_msg[offset]); 108 } 109 110 static void run_test(struct kvm_vcpu *vcpu, const char *expected_printf, 111 const char *expected_assert) 112 { ··· 129 struct ucall uc; 130 131 while (1) { 132 - vcpu_run(vcpu); 133 134 TEST_ASSERT(run->exit_reason == UCALL_EXIT_REASON, 135 "Unexpected exit reason: %u (%s),", ··· 174 175 vm = vm_create_with_one_vcpu(&vcpu, guest_code_limits); 176 run = vcpu->run; 177 - vcpu_run(vcpu); 178 179 TEST_ASSERT(run->exit_reason == UCALL_EXIT_REASON, 180 "Unexpected exit reason: %u (%s),",
··· 107 expected_assert_msg, &assert_msg[offset]); 108 } 109 110 + /* 111 + * Open code vcpu_run(), sans the UCALL_ABORT handling, so that intentional 112 + * guest asserts can be verified instead of being reported as failures. 113 + */ 114 + static void do_vcpu_run(struct kvm_vcpu *vcpu) 115 + { 116 + int r; 117 + 118 + do { 119 + r = __vcpu_run(vcpu); 120 + } while (r == -1 && errno == EINTR); 121 + 122 + TEST_ASSERT(!r, KVM_IOCTL_ERROR(KVM_RUN, r)); 123 + } 124 + 125 static void run_test(struct kvm_vcpu *vcpu, const char *expected_printf, 126 const char *expected_assert) 127 { ··· 114 struct ucall uc; 115 116 while (1) { 117 + do_vcpu_run(vcpu); 118 119 TEST_ASSERT(run->exit_reason == UCALL_EXIT_REASON, 120 "Unexpected exit reason: %u (%s),", ··· 159 160 vm = vm_create_with_one_vcpu(&vcpu, guest_code_limits); 161 run = vcpu->run; 162 + do_vcpu_run(vcpu); 163 164 TEST_ASSERT(run->exit_reason == UCALL_EXIT_REASON, 165 "Unexpected exit reason: %u (%s),",
+26 -2
tools/testing/selftests/kvm/include/kvm_util.h
··· 428 void kvm_vm_free(struct kvm_vm *vmp); 429 void kvm_vm_restart(struct kvm_vm *vmp); 430 void kvm_vm_release(struct kvm_vm *vmp); 431 - int kvm_memcmp_hva_gva(void *hva, struct kvm_vm *vm, const vm_vaddr_t gva, 432 - size_t len); 433 void kvm_vm_elf_load(struct kvm_vm *vm, const char *filename); 434 int kvm_memfd_alloc(size_t size, bool hugepages); 435 ··· 456 static inline uint32_t kvm_vm_reset_dirty_ring(struct kvm_vm *vm) 457 { 458 return __vm_ioctl(vm, KVM_RESET_DIRTY_RINGS, NULL); 459 } 460 461 static inline int vm_get_stats_fd(struct kvm_vm *vm)
··· 428 void kvm_vm_free(struct kvm_vm *vmp); 429 void kvm_vm_restart(struct kvm_vm *vmp); 430 void kvm_vm_release(struct kvm_vm *vmp); 431 void kvm_vm_elf_load(struct kvm_vm *vm, const char *filename); 432 int kvm_memfd_alloc(size_t size, bool hugepages); 433 ··· 458 static inline uint32_t kvm_vm_reset_dirty_ring(struct kvm_vm *vm) 459 { 460 return __vm_ioctl(vm, KVM_RESET_DIRTY_RINGS, NULL); 461 + } 462 + 463 + static inline void kvm_vm_register_coalesced_io(struct kvm_vm *vm, 464 + uint64_t address, 465 + uint64_t size, bool pio) 466 + { 467 + struct kvm_coalesced_mmio_zone zone = { 468 + .addr = address, 469 + .size = size, 470 + .pio = pio, 471 + }; 472 + 473 + vm_ioctl(vm, KVM_REGISTER_COALESCED_MMIO, &zone); 474 + } 475 + 476 + static inline void kvm_vm_unregister_coalesced_io(struct kvm_vm *vm, 477 + uint64_t address, 478 + uint64_t size, bool pio) 479 + { 480 + struct kvm_coalesced_mmio_zone zone = { 481 + .addr = address, 482 + .size = size, 483 + .pio = pio, 484 + }; 485 + 486 + vm_ioctl(vm, KVM_UNREGISTER_COALESCED_MMIO, &zone); 487 } 488 489 static inline int vm_get_stats_fd(struct kvm_vm *vm)
+69
tools/testing/selftests/kvm/include/s390x/debug_print.h
···
··· 1 + /* SPDX-License-Identifier: GPL-2.0-only */ 2 + /* 3 + * Definition for kernel virtual machines on s390x 4 + * 5 + * Copyright IBM Corp. 2024 6 + * 7 + * Authors: 8 + * Christoph Schlameuss <schlameuss@linux.ibm.com> 9 + */ 10 + 11 + #ifndef SELFTEST_KVM_DEBUG_PRINT_H 12 + #define SELFTEST_KVM_DEBUG_PRINT_H 13 + 14 + #include "asm/ptrace.h" 15 + #include "kvm_util.h" 16 + #include "sie.h" 17 + 18 + static inline void print_hex_bytes(const char *name, u64 addr, size_t len) 19 + { 20 + u64 pos; 21 + 22 + pr_debug("%s (%p)\n", name, (void *)addr); 23 + pr_debug(" 0/0x00---------|"); 24 + if (len > 8) 25 + pr_debug(" 8/0x08---------|"); 26 + if (len > 16) 27 + pr_debug(" 16/0x10--------|"); 28 + if (len > 24) 29 + pr_debug(" 24/0x18--------|"); 30 + for (pos = 0; pos < len; pos += 8) { 31 + if ((pos % 32) == 0) 32 + pr_debug("\n %3lu 0x%.3lx ", pos, pos); 33 + pr_debug(" %16lx", *((u64 *)(addr + pos))); 34 + } 35 + pr_debug("\n"); 36 + } 37 + 38 + static inline void print_hex(const char *name, u64 addr) 39 + { 40 + print_hex_bytes(name, addr, 512); 41 + } 42 + 43 + static inline void print_psw(struct kvm_run *run, struct kvm_s390_sie_block *sie_block) 44 + { 45 + pr_debug("flags:0x%x psw:0x%.16llx:0x%.16llx exit:%u %s\n", 46 + run->flags, 47 + run->psw_mask, run->psw_addr, 48 + run->exit_reason, exit_reason_str(run->exit_reason)); 49 + pr_debug("sie_block psw:0x%.16llx:0x%.16llx\n", 50 + sie_block->psw_mask, sie_block->psw_addr); 51 + } 52 + 53 + static inline void print_run(struct kvm_run *run, struct kvm_s390_sie_block *sie_block) 54 + { 55 + print_hex_bytes("run", (u64)run, 0x150); 56 + print_hex("sie_block", (u64)sie_block); 57 + print_psw(run, sie_block); 58 + } 59 + 60 + static inline void print_regs(struct kvm_run *run) 61 + { 62 + struct kvm_sync_regs *sync_regs = &run->s.regs; 63 + 64 + print_hex_bytes("GPRS", (u64)sync_regs->gprs, 8 * NUM_GPRS); 65 + print_hex_bytes("ACRS", (u64)sync_regs->acrs, 4 * NUM_ACRS); 66 + print_hex_bytes("CRS", (u64)sync_regs->crs, 8 * NUM_CRS); 67 + } 68 + 69 + #endif /* SELFTEST_KVM_DEBUG_PRINT_H */
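A small usage sketch for the new debug helpers, assuming an s390x selftest that has already created a vCPU; print_regs() only needs the run struct, so no SIE block pointer is required for this particular helper.

#include "kvm_util.h"
#include "debug_print.h"

/* Run the vCPU once and dump its synced GPRS/ACRS/CRS via pr_debug(). */
static void run_and_dump(struct kvm_vcpu *vcpu)
{
	vcpu_run(vcpu);
	print_regs(vcpu->run);
}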
+5
tools/testing/selftests/kvm/include/s390x/processor.h
··· 21 #define PAGE_PROTECT 0x200 /* HW read-only bit */ 22 #define PAGE_NOEXEC 0x100 /* HW no-execute bit */ 23 24 /* Is there a portable way to do this? */ 25 static inline void cpu_relax(void) 26 {
··· 21 #define PAGE_PROTECT 0x200 /* HW read-only bit */ 22 #define PAGE_NOEXEC 0x100 /* HW no-execute bit */ 23 24 + /* Page size definitions */ 25 + #define PAGE_SHIFT 12 26 + #define PAGE_SIZE BIT_ULL(PAGE_SHIFT) 27 + #define PAGE_MASK (~(PAGE_SIZE - 1)) 28 + 29 /* Is there a portable way to do this? */ 30 static inline void cpu_relax(void) 31 {
+240
tools/testing/selftests/kvm/include/s390x/sie.h
···
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + /* 3 + * Definition for kernel virtual machines on s390. 4 + * 5 + * Adapted copy of struct definition kvm_s390_sie_block from 6 + * arch/s390/include/asm/kvm_host.h for use in userspace selftest programs. 7 + * 8 + * Copyright IBM Corp. 2008, 2024 9 + * 10 + * Authors: 11 + * Christoph Schlameuss <schlameuss@linux.ibm.com> 12 + * Carsten Otte <cotte@de.ibm.com> 13 + */ 14 + 15 + #ifndef SELFTEST_KVM_SIE_H 16 + #define SELFTEST_KVM_SIE_H 17 + 18 + #include <linux/types.h> 19 + 20 + struct kvm_s390_sie_block { 21 + #define CPUSTAT_STOPPED 0x80000000 22 + #define CPUSTAT_WAIT 0x10000000 23 + #define CPUSTAT_ECALL_PEND 0x08000000 24 + #define CPUSTAT_STOP_INT 0x04000000 25 + #define CPUSTAT_IO_INT 0x02000000 26 + #define CPUSTAT_EXT_INT 0x01000000 27 + #define CPUSTAT_RUNNING 0x00800000 28 + #define CPUSTAT_RETAINED 0x00400000 29 + #define CPUSTAT_TIMING_SUB 0x00020000 30 + #define CPUSTAT_SIE_SUB 0x00010000 31 + #define CPUSTAT_RRF 0x00008000 32 + #define CPUSTAT_SLSV 0x00004000 33 + #define CPUSTAT_SLSR 0x00002000 34 + #define CPUSTAT_ZARCH 0x00000800 35 + #define CPUSTAT_MCDS 0x00000100 36 + #define CPUSTAT_KSS 0x00000200 37 + #define CPUSTAT_SM 0x00000080 38 + #define CPUSTAT_IBS 0x00000040 39 + #define CPUSTAT_GED2 0x00000010 40 + #define CPUSTAT_G 0x00000008 41 + #define CPUSTAT_GED 0x00000004 42 + #define CPUSTAT_J 0x00000002 43 + #define CPUSTAT_P 0x00000001 44 + __u32 cpuflags; /* 0x0000 */ 45 + __u32: 1; /* 0x0004 */ 46 + __u32 prefix : 18; 47 + __u32: 1; 48 + __u32 ibc : 12; 49 + __u8 reserved08[4]; /* 0x0008 */ 50 + #define PROG_IN_SIE BIT(0) 51 + __u32 prog0c; /* 0x000c */ 52 + union { 53 + __u8 reserved10[16]; /* 0x0010 */ 54 + struct { 55 + __u64 pv_handle_cpu; 56 + __u64 pv_handle_config; 57 + }; 58 + }; 59 + #define PROG_BLOCK_SIE BIT(0) 60 + #define PROG_REQUEST BIT(1) 61 + __u32 prog20; /* 0x0020 */ 62 + __u8 reserved24[4]; /* 0x0024 */ 63 + __u64 cputm; /* 0x0028 */ 64 + __u64 ckc; /* 0x0030 */ 65 + __u64 epoch; /* 0x0038 */ 66 + __u32 svcc; /* 0x0040 */ 67 + #define LCTL_CR0 0x8000 68 + #define LCTL_CR6 0x0200 69 + #define LCTL_CR9 0x0040 70 + #define LCTL_CR10 0x0020 71 + #define LCTL_CR11 0x0010 72 + #define LCTL_CR14 0x0002 73 + __u16 lctl; /* 0x0044 */ 74 + __s16 icpua; /* 0x0046 */ 75 + #define ICTL_OPEREXC 0x80000000 76 + #define ICTL_PINT 0x20000000 77 + #define ICTL_LPSW 0x00400000 78 + #define ICTL_STCTL 0x00040000 79 + #define ICTL_ISKE 0x00004000 80 + #define ICTL_SSKE 0x00002000 81 + #define ICTL_RRBE 0x00001000 82 + #define ICTL_TPROT 0x00000200 83 + __u32 ictl; /* 0x0048 */ 84 + #define ECA_CEI 0x80000000 85 + #define ECA_IB 0x40000000 86 + #define ECA_SIGPI 0x10000000 87 + #define ECA_MVPGI 0x01000000 88 + #define ECA_AIV 0x00200000 89 + #define ECA_VX 0x00020000 90 + #define ECA_PROTEXCI 0x00002000 91 + #define ECA_APIE 0x00000008 92 + #define ECA_SII 0x00000001 93 + __u32 eca; /* 0x004c */ 94 + #define ICPT_INST 0x04 95 + #define ICPT_PROGI 0x08 96 + #define ICPT_INSTPROGI 0x0C 97 + #define ICPT_EXTREQ 0x10 98 + #define ICPT_EXTINT 0x14 99 + #define ICPT_IOREQ 0x18 100 + #define ICPT_WAIT 0x1c 101 + #define ICPT_VALIDITY 0x20 102 + #define ICPT_STOP 0x28 103 + #define ICPT_OPEREXC 0x2C 104 + #define ICPT_PARTEXEC 0x38 105 + #define ICPT_IOINST 0x40 106 + #define ICPT_KSS 0x5c 107 + #define ICPT_MCHKREQ 0x60 108 + #define ICPT_INT_ENABLE 0x64 109 + #define ICPT_PV_INSTR 0x68 110 + #define ICPT_PV_NOTIFY 0x6c 111 + #define ICPT_PV_PREF 0x70 112 + __u8 icptcode; /* 0x0050 */ 113 + __u8 icptstatus; /* 0x0051 
*/ 114 + __u16 ihcpu; /* 0x0052 */ 115 + __u8 reserved54; /* 0x0054 */ 116 + #define IICTL_CODE_NONE 0x00 117 + #define IICTL_CODE_MCHK 0x01 118 + #define IICTL_CODE_EXT 0x02 119 + #define IICTL_CODE_IO 0x03 120 + #define IICTL_CODE_RESTART 0x04 121 + #define IICTL_CODE_SPECIFICATION 0x10 122 + #define IICTL_CODE_OPERAND 0x11 123 + __u8 iictl; /* 0x0055 */ 124 + __u16 ipa; /* 0x0056 */ 125 + __u32 ipb; /* 0x0058 */ 126 + __u32 scaoh; /* 0x005c */ 127 + #define FPF_BPBC 0x20 128 + __u8 fpf; /* 0x0060 */ 129 + #define ECB_GS 0x40 130 + #define ECB_TE 0x10 131 + #define ECB_SPECI 0x08 132 + #define ECB_SRSI 0x04 133 + #define ECB_HOSTPROTINT 0x02 134 + #define ECB_PTF 0x01 135 + __u8 ecb; /* 0x0061 */ 136 + #define ECB2_CMMA 0x80 137 + #define ECB2_IEP 0x20 138 + #define ECB2_PFMFI 0x08 139 + #define ECB2_ESCA 0x04 140 + #define ECB2_ZPCI_LSI 0x02 141 + __u8 ecb2; /* 0x0062 */ 142 + #define ECB3_AISI 0x20 143 + #define ECB3_AISII 0x10 144 + #define ECB3_DEA 0x08 145 + #define ECB3_AES 0x04 146 + #define ECB3_RI 0x01 147 + __u8 ecb3; /* 0x0063 */ 148 + #define ESCA_SCAOL_MASK ~0x3fU 149 + __u32 scaol; /* 0x0064 */ 150 + __u8 sdf; /* 0x0068 */ 151 + __u8 epdx; /* 0x0069 */ 152 + __u8 cpnc; /* 0x006a */ 153 + __u8 reserved6b; /* 0x006b */ 154 + __u32 todpr; /* 0x006c */ 155 + #define GISA_FORMAT1 0x00000001 156 + __u32 gd; /* 0x0070 */ 157 + __u8 reserved74[12]; /* 0x0074 */ 158 + __u64 mso; /* 0x0080 */ 159 + __u64 msl; /* 0x0088 */ 160 + __u64 psw_mask; /* 0x0090 */ 161 + __u64 psw_addr; /* 0x0098 */ 162 + __u64 gg14; /* 0x00a0 */ 163 + __u64 gg15; /* 0x00a8 */ 164 + __u8 reservedb0[8]; /* 0x00b0 */ 165 + #define HPID_KVM 0x4 166 + #define HPID_VSIE 0x5 167 + __u8 hpid; /* 0x00b8 */ 168 + __u8 reservedb9[7]; /* 0x00b9 */ 169 + union { 170 + struct { 171 + __u32 eiparams; /* 0x00c0 */ 172 + __u16 extcpuaddr; /* 0x00c4 */ 173 + __u16 eic; /* 0x00c6 */ 174 + }; 175 + __u64 mcic; /* 0x00c0 */ 176 + } __packed; 177 + __u32 reservedc8; /* 0x00c8 */ 178 + union { 179 + struct { 180 + __u16 pgmilc; /* 0x00cc */ 181 + __u16 iprcc; /* 0x00ce */ 182 + }; 183 + __u32 edc; /* 0x00cc */ 184 + } __packed; 185 + union { 186 + struct { 187 + __u32 dxc; /* 0x00d0 */ 188 + __u16 mcn; /* 0x00d4 */ 189 + __u8 perc; /* 0x00d6 */ 190 + __u8 peratmid; /* 0x00d7 */ 191 + }; 192 + __u64 faddr; /* 0x00d0 */ 193 + } __packed; 194 + __u64 peraddr; /* 0x00d8 */ 195 + __u8 eai; /* 0x00e0 */ 196 + __u8 peraid; /* 0x00e1 */ 197 + __u8 oai; /* 0x00e2 */ 198 + __u8 armid; /* 0x00e3 */ 199 + __u8 reservede4[4]; /* 0x00e4 */ 200 + union { 201 + __u64 tecmc; /* 0x00e8 */ 202 + struct { 203 + __u16 subchannel_id; /* 0x00e8 */ 204 + __u16 subchannel_nr; /* 0x00ea */ 205 + __u32 io_int_parm; /* 0x00ec */ 206 + __u32 io_int_word; /* 0x00f0 */ 207 + }; 208 + } __packed; 209 + __u8 reservedf4[8]; /* 0x00f4 */ 210 + #define CRYCB_FORMAT_MASK 0x00000003 211 + #define CRYCB_FORMAT0 0x00000000 212 + #define CRYCB_FORMAT1 0x00000001 213 + #define CRYCB_FORMAT2 0x00000003 214 + __u32 crycbd; /* 0x00fc */ 215 + __u64 gcr[16]; /* 0x0100 */ 216 + union { 217 + __u64 gbea; /* 0x0180 */ 218 + __u64 sidad; 219 + }; 220 + __u8 reserved188[8]; /* 0x0188 */ 221 + __u64 sdnxo; /* 0x0190 */ 222 + __u8 reserved198[8]; /* 0x0198 */ 223 + __u32 fac; /* 0x01a0 */ 224 + __u8 reserved1a4[20]; /* 0x01a4 */ 225 + __u64 cbrlo; /* 0x01b8 */ 226 + __u8 reserved1c0[8]; /* 0x01c0 */ 227 + #define ECD_HOSTREGMGMT 0x20000000 228 + #define ECD_MEF 0x08000000 229 + #define ECD_ETOKENF 0x02000000 230 + #define ECD_ECC 0x00200000 231 + __u32 ecd; /* 0x01c8 */ 232 + __u8 
reserved1cc[18]; /* 0x01cc */ 233 + __u64 pp; /* 0x01de */ 234 + __u8 reserved1e6[2]; /* 0x01e6 */ 235 + __u64 itdba; /* 0x01e8 */ 236 + __u64 riccbd; /* 0x01f0 */ 237 + __u64 gvrd; /* 0x01f8 */ 238 + } __packed __aligned(512); 239 + 240 + #endif /* SELFTEST_KVM_SIE_H */
+20 -1
tools/testing/selftests/kvm/include/x86_64/apic.h
··· 11 #include <stdint.h> 12 13 #include "processor.h" 14 15 #define APIC_DEFAULT_GPA 0xfee00000ULL 16 ··· 94 return rdmsr(APIC_BASE_MSR + (reg >> 4)); 95 } 96 97 static inline void x2apic_write_reg(unsigned int reg, uint64_t value) 98 { 99 - wrmsr(APIC_BASE_MSR + (reg >> 4), value); 100 } 101 102 #endif /* SELFTEST_KVM_APIC_H */
··· 11 #include <stdint.h> 12 13 #include "processor.h" 14 + #include "ucall_common.h" 15 16 #define APIC_DEFAULT_GPA 0xfee00000ULL 17 ··· 93 return rdmsr(APIC_BASE_MSR + (reg >> 4)); 94 } 95 96 + static inline uint8_t x2apic_write_reg_safe(unsigned int reg, uint64_t value) 97 + { 98 + return wrmsr_safe(APIC_BASE_MSR + (reg >> 4), value); 99 + } 100 + 101 static inline void x2apic_write_reg(unsigned int reg, uint64_t value) 102 { 103 + uint8_t fault = x2apic_write_reg_safe(reg, value); 104 + 105 + __GUEST_ASSERT(!fault, "Unexpected fault 0x%x on WRMSR(%x) = %lx\n", 106 + fault, APIC_BASE_MSR + (reg >> 4), value); 107 } 108 + 109 + static inline void x2apic_write_reg_fault(unsigned int reg, uint64_t value) 110 + { 111 + uint8_t fault = x2apic_write_reg_safe(reg, value); 112 + 113 + __GUEST_ASSERT(fault == GP_VECTOR, 114 + "Wanted #GP on WRMSR(%x) = %lx, got 0x%x\n", 115 + APIC_BASE_MSR + (reg >> 4), value, fault); 116 + } 117 + 118 119 #endif /* SELFTEST_KVM_APIC_H */
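A guest-side sketch of how the new safe/fault variants are meant to be used. It assumes the selftest framework's IDT and the #GP fixup used by wrmsr_safe() are in place and that the vCPU is already in x2APIC mode; APIC_ISR is picked as the faulting example because a WRMSR to a read-only x2APIC register architecturally takes #GP.

#include "apic.h"
#include "processor.h"
#include "ucall_common.h"

static void guest_code(void)
{
	/* Asserts that the write does not fault. */
	x2apic_write_reg(APIC_SPIV, APIC_SPIV_APIC_ENABLED | 0xff);

	/* ISR is read-only in x2APIC mode, so this write must #GP. */
	x2apic_write_reg_fault(APIC_ISR, 0);

	GUEST_DONE();
}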
+18
tools/testing/selftests/kvm/include/x86_64/hyperv.h
··· 186 #define HV_X64_ENLIGHTENED_VMCS_RECOMMENDED \ 187 KVM_X86_CPU_FEATURE(HYPERV_CPUID_ENLIGHTMENT_INFO, 0, EAX, 14) 188 189 /* HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES.EAX */ 190 #define HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING \ 191 KVM_X86_CPU_FEATURE(HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES, 0, EAX, 1) ··· 354 355 /* HV_X64_MSR_TSC_INVARIANT_CONTROL bits */ 356 #define HV_INVARIANT_TSC_EXPOSED BIT_ULL(0) 357 358 #endif /* !SELFTEST_KVM_HYPERV_H */
··· 186 #define HV_X64_ENLIGHTENED_VMCS_RECOMMENDED \ 187 KVM_X86_CPU_FEATURE(HYPERV_CPUID_ENLIGHTMENT_INFO, 0, EAX, 14) 188 189 + /* HYPERV_CPUID_NESTED_FEATURES.EAX */ 190 + #define HV_X64_NESTED_DIRECT_FLUSH \ 191 + KVM_X86_CPU_FEATURE(HYPERV_CPUID_NESTED_FEATURES, 0, EAX, 17) 192 + #define HV_X64_NESTED_GUEST_MAPPING_FLUSH \ 193 + KVM_X86_CPU_FEATURE(HYPERV_CPUID_NESTED_FEATURES, 0, EAX, 18) 194 + #define HV_X64_NESTED_MSR_BITMAP \ 195 + KVM_X86_CPU_FEATURE(HYPERV_CPUID_NESTED_FEATURES, 0, EAX, 19) 196 + 197 + /* HYPERV_CPUID_NESTED_FEATURES.EBX */ 198 + #define HV_X64_NESTED_EVMCS1_PERF_GLOBAL_CTRL \ 199 + KVM_X86_CPU_FEATURE(HYPERV_CPUID_NESTED_FEATURES, 0, EBX, 0) 200 + 201 /* HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES.EAX */ 202 #define HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING \ 203 KVM_X86_CPU_FEATURE(HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES, 0, EAX, 1) ··· 342 343 /* HV_X64_MSR_TSC_INVARIANT_CONTROL bits */ 344 #define HV_INVARIANT_TSC_EXPOSED BIT_ULL(0) 345 + 346 + const struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(void); 347 + const struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vcpu *vcpu); 348 + void vcpu_set_hv_cpuid(struct kvm_vcpu *vcpu); 349 + 350 + bool kvm_hv_cpu_has(struct kvm_x86_cpu_feature feature); 351 352 #endif /* !SELFTEST_KVM_HYPERV_H */
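kvm_hv_cpu_has() enables system-scope feature checks before any VM exists, e.g. an eVMCS test can now gate itself on the nested-features leaf. A minimal sketch:

#include "kvm_util.h"
#include "hyperv.h"

int main(int argc, char *argv[])
{
	TEST_REQUIRE(kvm_has_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS));
	TEST_REQUIRE(kvm_hv_cpu_has(HV_X64_NESTED_EVMCS1_PERF_GLOBAL_CTRL));

	/* The actual test body would follow here. */
	return 0;
}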
+4 -3
tools/testing/selftests/kvm/include/x86_64/processor.h
··· 25 extern bool host_cpu_is_amd; 26 extern uint64_t guest_tsc_khz; 27 28 /* Forced emulation prefix, used to invoke the emulator unconditionally. */ 29 #define KVM_FEP "ud2; .byte 'k', 'v', 'm';" 30 ··· 912 const struct kvm_cpuid_entry2 *get_cpuid_entry(const struct kvm_cpuid2 *cpuid, 913 uint32_t function, uint32_t index); 914 const struct kvm_cpuid2 *kvm_get_supported_cpuid(void); 915 - const struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(void); 916 - const struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vcpu *vcpu); 917 918 static inline uint32_t kvm_cpu_fms(void) 919 { ··· 1011 } 1012 1013 void vcpu_init_cpuid(struct kvm_vcpu *vcpu, const struct kvm_cpuid2 *cpuid); 1014 - void vcpu_set_hv_cpuid(struct kvm_vcpu *vcpu); 1015 1016 static inline struct kvm_cpuid_entry2 *__vcpu_get_cpuid_entry(struct kvm_vcpu *vcpu, 1017 uint32_t function,
··· 25 extern bool host_cpu_is_amd; 26 extern uint64_t guest_tsc_khz; 27 28 + #ifndef MAX_NR_CPUID_ENTRIES 29 + #define MAX_NR_CPUID_ENTRIES 100 30 + #endif 31 + 32 /* Forced emulation prefix, used to invoke the emulator unconditionally. */ 33 #define KVM_FEP "ud2; .byte 'k', 'v', 'm';" 34 ··· 908 const struct kvm_cpuid_entry2 *get_cpuid_entry(const struct kvm_cpuid2 *cpuid, 909 uint32_t function, uint32_t index); 910 const struct kvm_cpuid2 *kvm_get_supported_cpuid(void); 911 912 static inline uint32_t kvm_cpu_fms(void) 913 { ··· 1009 } 1010 1011 void vcpu_init_cpuid(struct kvm_vcpu *vcpu, const struct kvm_cpuid2 *cpuid); 1012 1013 static inline struct kvm_cpuid_entry2 *__vcpu_get_cpuid_entry(struct kvm_vcpu *vcpu, 1014 uint32_t function,
+6 -79
tools/testing/selftests/kvm/lib/kvm_util.c
··· 712 } 713 714 static void __vm_mem_region_delete(struct kvm_vm *vm, 715 - struct userspace_mem_region *region, 716 - bool unlink) 717 { 718 int ret; 719 720 - if (unlink) { 721 - rb_erase(&region->gpa_node, &vm->regions.gpa_tree); 722 - rb_erase(&region->hva_node, &vm->regions.hva_tree); 723 - hash_del(&region->slot_node); 724 - } 725 726 region->region.memory_size = 0; 727 vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION2, &region->region); ··· 759 760 /* Free userspace_mem_regions. */ 761 hash_for_each_safe(vmp->regions.slot_hash, ctr, node, region, slot_node) 762 - __vm_mem_region_delete(vmp, region, false); 763 764 /* Free sparsebit arrays. */ 765 sparsebit_free(&vmp->vpages_valid); ··· 789 TEST_ASSERT(!r, __KVM_SYSCALL_ERROR("fallocate()", r)); 790 791 return fd; 792 - } 793 - 794 - /* 795 - * Memory Compare, host virtual to guest virtual 796 - * 797 - * Input Args: 798 - * hva - Starting host virtual address 799 - * vm - Virtual Machine 800 - * gva - Starting guest virtual address 801 - * len - number of bytes to compare 802 - * 803 - * Output Args: None 804 - * 805 - * Input/Output Args: None 806 - * 807 - * Return: 808 - * Returns 0 if the bytes starting at hva for a length of len 809 - * are equal the guest virtual bytes starting at gva. Returns 810 - * a value < 0, if bytes at hva are less than those at gva. 811 - * Otherwise a value > 0 is returned. 812 - * 813 - * Compares the bytes starting at the host virtual address hva, for 814 - * a length of len, to the guest bytes starting at the guest virtual 815 - * address given by gva. 816 - */ 817 - int kvm_memcmp_hva_gva(void *hva, struct kvm_vm *vm, vm_vaddr_t gva, size_t len) 818 - { 819 - size_t amt; 820 - 821 - /* 822 - * Compare a batch of bytes until either a match is found 823 - * or all the bytes have been compared. 824 - */ 825 - for (uintptr_t offset = 0; offset < len; offset += amt) { 826 - uintptr_t ptr1 = (uintptr_t)hva + offset; 827 - 828 - /* 829 - * Determine host address for guest virtual address 830 - * at offset. 831 - */ 832 - uintptr_t ptr2 = (uintptr_t)addr_gva2hva(vm, gva + offset); 833 - 834 - /* 835 - * Determine amount to compare on this pass. 836 - * Don't allow the comparsion to cross a page boundary. 837 - */ 838 - amt = len - offset; 839 - if ((ptr1 >> vm->page_shift) != ((ptr1 + amt) >> vm->page_shift)) 840 - amt = vm->page_size - (ptr1 % vm->page_size); 841 - if ((ptr2 >> vm->page_shift) != ((ptr2 + amt) >> vm->page_shift)) 842 - amt = vm->page_size - (ptr2 % vm->page_size); 843 - 844 - assert((ptr1 >> vm->page_shift) == ((ptr1 + amt - 1) >> vm->page_shift)); 845 - assert((ptr2 >> vm->page_shift) == ((ptr2 + amt - 1) >> vm->page_shift)); 846 - 847 - /* 848 - * Perform the comparison. If there is a difference 849 - * return that result to the caller, otherwise need 850 - * to continue on looking for a mismatch. 851 - */ 852 - int ret = memcmp((void *)ptr1, (void *)ptr2, amt); 853 - if (ret != 0) 854 - return ret; 855 - } 856 - 857 - /* 858 - * No mismatch found. Let the caller know the two memory 859 - * areas are equal. 860 - */ 861 - return 0; 862 } 863 864 static void vm_userspace_mem_region_gpa_insert(struct rb_root *gpa_tree, ··· 1197 */ 1198 void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot) 1199 { 1200 - __vm_mem_region_delete(vm, memslot2region(vm, slot), true); 1201 } 1202 1203 void vm_guest_mem_fallocate(struct kvm_vm *vm, uint64_t base, uint64_t size,
··· 712 } 713 714 static void __vm_mem_region_delete(struct kvm_vm *vm, 715 + struct userspace_mem_region *region) 716 { 717 int ret; 718 719 + rb_erase(&region->gpa_node, &vm->regions.gpa_tree); 720 + rb_erase(&region->hva_node, &vm->regions.hva_tree); 721 + hash_del(&region->slot_node); 722 723 region->region.memory_size = 0; 724 vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION2, &region->region); ··· 762 763 /* Free userspace_mem_regions. */ 764 hash_for_each_safe(vmp->regions.slot_hash, ctr, node, region, slot_node) 765 + __vm_mem_region_delete(vmp, region); 766 767 /* Free sparsebit arrays. */ 768 sparsebit_free(&vmp->vpages_valid); ··· 792 TEST_ASSERT(!r, __KVM_SYSCALL_ERROR("fallocate()", r)); 793 794 return fd; 795 } 796 797 static void vm_userspace_mem_region_gpa_insert(struct rb_root *gpa_tree, ··· 1270 */ 1271 void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot) 1272 { 1273 + __vm_mem_region_delete(vm, memslot2region(vm, slot)); 1274 } 1275 1276 void vm_guest_mem_fallocate(struct kvm_vm *vm, uint64_t base, uint64_t size,
+5 -5
tools/testing/selftests/kvm/lib/s390x/processor.c
··· 14 { 15 vm_paddr_t paddr; 16 17 - TEST_ASSERT(vm->page_size == 4096, "Unsupported page size: 0x%x", 18 vm->page_size); 19 20 if (vm->pgd_created) ··· 79 } 80 81 /* Fill in page table entry */ 82 - idx = (gva >> 12) & 0x0ffu; /* page index */ 83 if (!(entry[idx] & PAGE_INVALID)) 84 fprintf(stderr, 85 "WARNING: PTE for gpa=0x%"PRIx64" already set!\n", gpa); ··· 91 int ri, idx; 92 uint64_t *entry; 93 94 - TEST_ASSERT(vm->page_size == 4096, "Unsupported page size: 0x%x", 95 vm->page_size); 96 97 entry = addr_gpa2hva(vm, vm->pgd); ··· 103 entry = addr_gpa2hva(vm, entry[idx] & REGION_ENTRY_ORIGIN); 104 } 105 106 - idx = (gva >> 12) & 0x0ffu; /* page index */ 107 108 TEST_ASSERT(!(entry[idx] & PAGE_INVALID), 109 "No page mapping for vm virtual address 0x%lx", gva); ··· 168 struct kvm_sregs sregs; 169 struct kvm_vcpu *vcpu; 170 171 - TEST_ASSERT(vm->page_size == 4096, "Unsupported page size: 0x%x", 172 vm->page_size); 173 174 stack_vaddr = __vm_vaddr_alloc(vm, stack_size,
··· 14 { 15 vm_paddr_t paddr; 16 17 + TEST_ASSERT(vm->page_size == PAGE_SIZE, "Unsupported page size: 0x%x", 18 vm->page_size); 19 20 if (vm->pgd_created) ··· 79 } 80 81 /* Fill in page table entry */ 82 + idx = (gva >> PAGE_SHIFT) & 0x0ffu; /* page index */ 83 if (!(entry[idx] & PAGE_INVALID)) 84 fprintf(stderr, 85 "WARNING: PTE for gpa=0x%"PRIx64" already set!\n", gpa); ··· 91 int ri, idx; 92 uint64_t *entry; 93 94 + TEST_ASSERT(vm->page_size == PAGE_SIZE, "Unsupported page size: 0x%x", 95 vm->page_size); 96 97 entry = addr_gpa2hva(vm, vm->pgd); ··· 103 entry = addr_gpa2hva(vm, entry[idx] & REGION_ENTRY_ORIGIN); 104 } 105 106 + idx = (gva >> PAGE_SHIFT) & 0x0ffu; /* page index */ 107 108 TEST_ASSERT(!(entry[idx] & PAGE_INVALID), 109 "No page mapping for vm virtual address 0x%lx", gva); ··· 168 struct kvm_sregs sregs; 169 struct kvm_vcpu *vcpu; 170 171 + TEST_ASSERT(vm->page_size == PAGE_SIZE, "Unsupported page size: 0x%x", 172 vm->page_size); 173 174 stack_vaddr = __vm_vaddr_alloc(vm, stack_size,
+67
tools/testing/selftests/kvm/lib/x86_64/hyperv.c
··· 8 #include "processor.h" 9 #include "hyperv.h" 10 11 struct hyperv_test_pages *vcpu_alloc_hyperv_test_pages(struct kvm_vm *vm, 12 vm_vaddr_t *p_hv_pages_gva) 13 {
··· 8 #include "processor.h" 9 #include "hyperv.h" 10 11 + const struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(void) 12 + { 13 + static struct kvm_cpuid2 *cpuid; 14 + int kvm_fd; 15 + 16 + if (cpuid) 17 + return cpuid; 18 + 19 + cpuid = allocate_kvm_cpuid2(MAX_NR_CPUID_ENTRIES); 20 + kvm_fd = open_kvm_dev_path_or_exit(); 21 + 22 + kvm_ioctl(kvm_fd, KVM_GET_SUPPORTED_HV_CPUID, cpuid); 23 + 24 + close(kvm_fd); 25 + return cpuid; 26 + } 27 + 28 + void vcpu_set_hv_cpuid(struct kvm_vcpu *vcpu) 29 + { 30 + static struct kvm_cpuid2 *cpuid_full; 31 + const struct kvm_cpuid2 *cpuid_sys, *cpuid_hv; 32 + int i, nent = 0; 33 + 34 + if (!cpuid_full) { 35 + cpuid_sys = kvm_get_supported_cpuid(); 36 + cpuid_hv = kvm_get_supported_hv_cpuid(); 37 + 38 + cpuid_full = allocate_kvm_cpuid2(cpuid_sys->nent + cpuid_hv->nent); 39 + if (!cpuid_full) { 40 + perror("malloc"); 41 + abort(); 42 + } 43 + 44 + /* Need to skip KVM CPUID leaves 0x400000xx */ 45 + for (i = 0; i < cpuid_sys->nent; i++) { 46 + if (cpuid_sys->entries[i].function >= 0x40000000 && 47 + cpuid_sys->entries[i].function < 0x40000100) 48 + continue; 49 + cpuid_full->entries[nent] = cpuid_sys->entries[i]; 50 + nent++; 51 + } 52 + 53 + memcpy(&cpuid_full->entries[nent], cpuid_hv->entries, 54 + cpuid_hv->nent * sizeof(struct kvm_cpuid_entry2)); 55 + cpuid_full->nent = nent + cpuid_hv->nent; 56 + } 57 + 58 + vcpu_init_cpuid(vcpu, cpuid_full); 59 + } 60 + 61 + const struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vcpu *vcpu) 62 + { 63 + struct kvm_cpuid2 *cpuid = allocate_kvm_cpuid2(MAX_NR_CPUID_ENTRIES); 64 + 65 + vcpu_ioctl(vcpu, KVM_GET_SUPPORTED_HV_CPUID, cpuid); 66 + 67 + return cpuid; 68 + } 69 + 70 + bool kvm_hv_cpu_has(struct kvm_x86_cpu_feature feature) 71 + { 72 + if (!kvm_has_cap(KVM_CAP_SYS_HYPERV_CPUID)) 73 + return false; 74 + 75 + return kvm_cpuid_has(kvm_get_supported_hv_cpuid(), feature); 76 + } 77 + 78 struct hyperv_test_pages *vcpu_alloc_hyperv_test_pages(struct kvm_vm *vm, 79 vm_vaddr_t *p_hv_pages_gva) 80 {
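The hunk above moves the Hyper-V CPUID helpers into hyperv.c and adds kvm_hv_cpu_has() as a system-wide feature probe. A minimal, hedged sketch of how a test is expected to consume them, assuming the standard selftests harness; the guest body and the choice of HV_X64_NESTED_DIRECT_FLUSH are illustrative (it is the same gate the Hyper-V eVMCS/SVM tests switch to further down):

#include "kvm_util.h"
#include "processor.h"
#include "hyperv.h"
#include "ucall_common.h"

static void guest_code(void)
{
	GUEST_DONE();
}

int main(void)
{
	struct kvm_vcpu *vcpu;
	struct kvm_vm *vm;

	/* Skip unless KVM advertises the Hyper-V feature system-wide. */
	TEST_REQUIRE(kvm_hv_cpu_has(HV_X64_NESTED_DIRECT_FLUSH));

	vm = vm_create_with_one_vcpu(&vcpu, guest_code);

	/* Merge KVM's base CPUID with the Hyper-V leaves for this vCPU. */
	vcpu_set_hv_cpuid(vcpu);

	/* ... run the guest and handle ucalls as usual ... */

	kvm_vm_free(vm);
	return 0;
}
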
+3 -66
tools/testing/selftests/kvm/lib/x86_64/processor.c
··· 19 #define KERNEL_DS 0x10 20 #define KERNEL_TSS 0x18 21 22 - #define MAX_NR_CPUID_ENTRIES 100 23 - 24 vm_vaddr_t exception_handlers; 25 bool host_cpu_is_amd; 26 bool host_cpu_is_intel; ··· 564 if (kvm_fixup_exception(regs)) 565 return; 566 567 - ucall_assert(UCALL_UNHANDLED, 568 - "Unhandled exception in guest", __FILE__, __LINE__, 569 - "Unhandled exception '0x%lx' at guest RIP '0x%lx'", 570 - regs->vector, regs->rip); 571 } 572 573 static void vm_init_descriptor_tables(struct kvm_vm *vm) ··· 607 { 608 struct ucall uc; 609 610 - if (get_ucall(vcpu, &uc) == UCALL_UNHANDLED) 611 REPORT_GUEST_ASSERT(uc); 612 } 613 ··· 1189 void xen_hypercall(uint64_t nr, uint64_t a0, void *a1) 1190 { 1191 GUEST_ASSERT(!__xen_hypercall(nr, a0, a1)); 1192 - } 1193 - 1194 - const struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(void) 1195 - { 1196 - static struct kvm_cpuid2 *cpuid; 1197 - int kvm_fd; 1198 - 1199 - if (cpuid) 1200 - return cpuid; 1201 - 1202 - cpuid = allocate_kvm_cpuid2(MAX_NR_CPUID_ENTRIES); 1203 - kvm_fd = open_kvm_dev_path_or_exit(); 1204 - 1205 - kvm_ioctl(kvm_fd, KVM_GET_SUPPORTED_HV_CPUID, cpuid); 1206 - 1207 - close(kvm_fd); 1208 - return cpuid; 1209 - } 1210 - 1211 - void vcpu_set_hv_cpuid(struct kvm_vcpu *vcpu) 1212 - { 1213 - static struct kvm_cpuid2 *cpuid_full; 1214 - const struct kvm_cpuid2 *cpuid_sys, *cpuid_hv; 1215 - int i, nent = 0; 1216 - 1217 - if (!cpuid_full) { 1218 - cpuid_sys = kvm_get_supported_cpuid(); 1219 - cpuid_hv = kvm_get_supported_hv_cpuid(); 1220 - 1221 - cpuid_full = allocate_kvm_cpuid2(cpuid_sys->nent + cpuid_hv->nent); 1222 - if (!cpuid_full) { 1223 - perror("malloc"); 1224 - abort(); 1225 - } 1226 - 1227 - /* Need to skip KVM CPUID leaves 0x400000xx */ 1228 - for (i = 0; i < cpuid_sys->nent; i++) { 1229 - if (cpuid_sys->entries[i].function >= 0x40000000 && 1230 - cpuid_sys->entries[i].function < 0x40000100) 1231 - continue; 1232 - cpuid_full->entries[nent] = cpuid_sys->entries[i]; 1233 - nent++; 1234 - } 1235 - 1236 - memcpy(&cpuid_full->entries[nent], cpuid_hv->entries, 1237 - cpuid_hv->nent * sizeof(struct kvm_cpuid_entry2)); 1238 - cpuid_full->nent = nent + cpuid_hv->nent; 1239 - } 1240 - 1241 - vcpu_init_cpuid(vcpu, cpuid_full); 1242 - } 1243 - 1244 - const struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vcpu *vcpu) 1245 - { 1246 - struct kvm_cpuid2 *cpuid = allocate_kvm_cpuid2(MAX_NR_CPUID_ENTRIES); 1247 - 1248 - vcpu_ioctl(vcpu, KVM_GET_SUPPORTED_HV_CPUID, cpuid); 1249 - 1250 - return cpuid; 1251 } 1252 1253 unsigned long vm_compute_max_gfn(struct kvm_vm *vm)
··· 19 #define KERNEL_DS 0x10 20 #define KERNEL_TSS 0x18 21 22 vm_vaddr_t exception_handlers; 23 bool host_cpu_is_amd; 24 bool host_cpu_is_intel; ··· 566 if (kvm_fixup_exception(regs)) 567 return; 568 569 + GUEST_FAIL("Unhandled exception '0x%lx' at guest RIP '0x%lx'", 570 + regs->vector, regs->rip); 571 } 572 573 static void vm_init_descriptor_tables(struct kvm_vm *vm) ··· 611 { 612 struct ucall uc; 613 614 + if (get_ucall(vcpu, &uc) == UCALL_ABORT) 615 REPORT_GUEST_ASSERT(uc); 616 } 617 ··· 1193 void xen_hypercall(uint64_t nr, uint64_t a0, void *a1) 1194 { 1195 GUEST_ASSERT(!__xen_hypercall(nr, a0, a1)); 1196 } 1197 1198 unsigned long vm_compute_max_gfn(struct kvm_vm *vm)
+17 -2
tools/testing/selftests/kvm/memslot_modification_stress_test.c
··· 79 useconds_t delay; 80 uint64_t nr_iterations; 81 bool partition_vcpu_memory_access; 82 }; 83 84 static void run_test(enum vm_guest_mode mode, void *arg) ··· 90 vm = memstress_create_vm(mode, nr_vcpus, guest_percpu_mem_size, 1, 91 VM_MEM_SRC_ANONYMOUS, 92 p->partition_vcpu_memory_access); 93 94 pr_info("Finished creating vCPUs\n"); 95 ··· 115 static void help(char *name) 116 { 117 puts(""); 118 - printf("usage: %s [-h] [-m mode] [-d delay_usec]\n" 119 " [-b memory] [-v vcpus] [-o] [-i iterations]\n", name); 120 guest_modes_help(); 121 printf(" -d: add a delay between each iteration of adding and\n" 122 " deleting a memslot in usec.\n"); 123 printf(" -b: specify the size of the memory region which should be\n" 124 " accessed by each vCPU. e.g. 10M or 3G.\n" 125 " Default: 1G\n"); ··· 146 147 guest_modes_append_default(); 148 149 - while ((opt = getopt(argc, argv, "hm:d:b:v:oi:")) != -1) { 150 switch (opt) { 151 case 'm': 152 guest_modes_cmdline(optarg); ··· 168 break; 169 case 'i': 170 p.nr_iterations = atoi_positive("Number of iterations", optarg); 171 break; 172 case 'h': 173 default:
··· 79 useconds_t delay; 80 uint64_t nr_iterations; 81 bool partition_vcpu_memory_access; 82 + bool disable_slot_zap_quirk; 83 }; 84 85 static void run_test(enum vm_guest_mode mode, void *arg) ··· 89 vm = memstress_create_vm(mode, nr_vcpus, guest_percpu_mem_size, 1, 90 VM_MEM_SRC_ANONYMOUS, 91 p->partition_vcpu_memory_access); 92 + #ifdef __x86_64__ 93 + if (p->disable_slot_zap_quirk) 94 + vm_enable_cap(vm, KVM_CAP_DISABLE_QUIRKS2, KVM_X86_QUIRK_SLOT_ZAP_ALL); 95 + 96 + pr_info("Memslot zap quirk %s\n", p->disable_slot_zap_quirk ? 97 + "disabled" : "enabled"); 98 + #endif 99 100 pr_info("Finished creating vCPUs\n"); 101 ··· 107 static void help(char *name) 108 { 109 puts(""); 110 + printf("usage: %s [-h] [-m mode] [-d delay_usec] [-q]\n" 111 " [-b memory] [-v vcpus] [-o] [-i iterations]\n", name); 112 guest_modes_help(); 113 printf(" -d: add a delay between each iteration of adding and\n" 114 " deleting a memslot in usec.\n"); 115 + printf(" -q: Disable memslot zap quirk.\n"); 116 printf(" -b: specify the size of the memory region which should be\n" 117 " accessed by each vCPU. e.g. 10M or 3G.\n" 118 " Default: 1G\n"); ··· 137 138 guest_modes_append_default(); 139 140 + while ((opt = getopt(argc, argv, "hm:d:qb:v:oi:")) != -1) { 141 switch (opt) { 142 case 'm': 143 guest_modes_cmdline(optarg); ··· 159 break; 160 case 'i': 161 p.nr_iterations = atoi_positive("Number of iterations", optarg); 162 + break; 163 + case 'q': 164 + p.disable_slot_zap_quirk = true; 165 + 166 + TEST_REQUIRE(kvm_check_cap(KVM_CAP_DISABLE_QUIRKS2) & 167 + KVM_X86_QUIRK_SLOT_ZAP_ALL); 168 break; 169 case 'h': 170 default:
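Stripped of the stress-test plumbing, the new opt-in is just a capability check plus a per-VM enable. A minimal sketch assuming the standard selftests headers; vm_create_barebones() is used here purely for brevity:

#include "kvm_util.h"
#include "processor.h"

int main(void)
{
	struct kvm_vm *vm;

	/* Bail unless this KVM knows about the memslot zap quirk at all. */
	TEST_REQUIRE(kvm_check_cap(KVM_CAP_DISABLE_QUIRKS2) &
		     KVM_X86_QUIRK_SLOT_ZAP_ALL);

	vm = vm_create_barebones();

	/* Opt in: zap only the affected memslot instead of all page tables. */
	vm_enable_cap(vm, KVM_CAP_DISABLE_QUIRKS2, KVM_X86_QUIRK_SLOT_ZAP_ALL);

	kvm_vm_free(vm);
	return 0;
}
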
+11 -1
tools/testing/selftests/kvm/memslot_perf_test.c
··· 113 static sem_t vcpu_ready; 114 115 static bool map_unmap_verify; 116 117 static bool verbose; 118 #define pr_info_v(...) \ ··· 579 uint32_t guest_page_size = data->vm->page_size; 580 uint64_t movesrcgpa, movetestgpa; 581 582 movesrcgpa = vm_slot2gpa(data, data->nslots - 1); 583 584 if (isactive) { ··· 900 pr_info(" -h: print this help screen.\n"); 901 pr_info(" -v: enable verbose mode (not for benchmarking).\n"); 902 pr_info(" -d: enable extra debug checks.\n"); 903 pr_info(" -s: specify memslot count cap (-1 means no cap; currently: %i)\n", 904 targs->nslots); 905 pr_info(" -f: specify the first test to run (currently: %i; max %zu)\n", ··· 959 uint32_t max_mem_slots; 960 int opt; 961 962 - while ((opt = getopt(argc, argv, "hvds:f:e:l:r:")) != -1) { 963 switch (opt) { 964 case 'h': 965 default: ··· 970 break; 971 case 'd': 972 map_unmap_verify = true; 973 break; 974 case 's': 975 targs->nslots = atoi_paranoid(optarg);
··· 113 static sem_t vcpu_ready; 114 115 static bool map_unmap_verify; 116 + static bool disable_slot_zap_quirk; 117 118 static bool verbose; 119 #define pr_info_v(...) \ ··· 578 uint32_t guest_page_size = data->vm->page_size; 579 uint64_t movesrcgpa, movetestgpa; 580 581 + if (disable_slot_zap_quirk) 582 + vm_enable_cap(data->vm, KVM_CAP_DISABLE_QUIRKS2, KVM_X86_QUIRK_SLOT_ZAP_ALL); 583 + 584 movesrcgpa = vm_slot2gpa(data, data->nslots - 1); 585 586 if (isactive) { ··· 896 pr_info(" -h: print this help screen.\n"); 897 pr_info(" -v: enable verbose mode (not for benchmarking).\n"); 898 pr_info(" -d: enable extra debug checks.\n"); 899 + pr_info(" -q: Disable memslot zap quirk during memslot move.\n"); 900 pr_info(" -s: specify memslot count cap (-1 means no cap; currently: %i)\n", 901 targs->nslots); 902 pr_info(" -f: specify the first test to run (currently: %i; max %zu)\n", ··· 954 uint32_t max_mem_slots; 955 int opt; 956 957 + while ((opt = getopt(argc, argv, "hvdqs:f:e:l:r:")) != -1) { 958 switch (opt) { 959 case 'h': 960 default: ··· 965 break; 966 case 'd': 967 map_unmap_verify = true; 968 + break; 969 + case 'q': 970 + disable_slot_zap_quirk = true; 971 + TEST_REQUIRE(kvm_check_cap(KVM_CAP_DISABLE_QUIRKS2) & 972 + KVM_X86_QUIRK_SLOT_ZAP_ALL); 973 break; 974 case 's': 975 targs->nslots = atoi_paranoid(optarg);
+4 -3
tools/testing/selftests/kvm/s390x/cmma_test.c
··· 17 #include "kvm_util.h" 18 #include "kselftest.h" 19 #include "ucall_common.h" 20 21 #define MAIN_PAGE_COUNT 512 22 23 #define TEST_DATA_PAGE_COUNT 512 24 #define TEST_DATA_MEMSLOT 1 25 - #define TEST_DATA_START_GFN 4096 26 27 #define TEST_DATA_TWO_PAGE_COUNT 256 28 #define TEST_DATA_TWO_MEMSLOT 2 29 - #define TEST_DATA_TWO_START_GFN 8192 30 31 static char cmma_value_buf[MAIN_PAGE_COUNT + TEST_DATA_PAGE_COUNT]; 32 ··· 67 " lghi 5,%[page_count]\n" 68 /* r5 += r1 */ 69 "2: agfr 5,1\n" 70 - /* r2 = r1 << 12 */ 71 "1: sllg 2,1,12(0)\n" 72 /* essa(r4, r2, SET_STABLE) */ 73 " .insn rrf,0xb9ab0000,4,2,1,0\n"
··· 17 #include "kvm_util.h" 18 #include "kselftest.h" 19 #include "ucall_common.h" 20 + #include "processor.h" 21 22 #define MAIN_PAGE_COUNT 512 23 24 #define TEST_DATA_PAGE_COUNT 512 25 #define TEST_DATA_MEMSLOT 1 26 + #define TEST_DATA_START_GFN PAGE_SIZE 27 28 #define TEST_DATA_TWO_PAGE_COUNT 256 29 #define TEST_DATA_TWO_MEMSLOT 2 30 + #define TEST_DATA_TWO_START_GFN (2 * PAGE_SIZE) 31 32 static char cmma_value_buf[MAIN_PAGE_COUNT + TEST_DATA_PAGE_COUNT]; 33 ··· 66 " lghi 5,%[page_count]\n" 67 /* r5 += r1 */ 68 "2: agfr 5,1\n" 69 + /* r2 = r1 << PAGE_SHIFT */ 70 "1: sllg 2,1,12(0)\n" 71 /* essa(r4, r2, SET_STABLE) */ 72 " .insn rrf,0xb9ab0000,4,2,1,0\n"
+2
tools/testing/selftests/kvm/s390x/config
···
··· 1 + CONFIG_KVM=y 2 + CONFIG_KVM_S390_UCONTROL=y
+2 -2
tools/testing/selftests/kvm/s390x/debug_test.c
··· 2 /* Test KVM debugging features. */ 3 #include "kvm_util.h" 4 #include "test_util.h" 5 6 #include <linux/kvm.h> 7 8 #define __LC_SVC_NEW_PSW 0x1c0 9 #define __LC_PGM_NEW_PSW 0x1d0 10 - #define ICPT_INSTRUCTION 0x04 11 #define IPA0_DIAG 0x8300 12 #define PGM_SPECIFICATION 0x06 13 ··· 85 vm = test_step_int_1(&vcpu, test_step_pgm_diag_guest_code, 86 __LC_PGM_NEW_PSW, new_psw); 87 TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_S390_SIEIC); 88 - TEST_ASSERT_EQ(vcpu->run->s390_sieic.icptcode, ICPT_INSTRUCTION); 89 TEST_ASSERT_EQ(vcpu->run->s390_sieic.ipa & 0xff00, IPA0_DIAG); 90 vcpu_ioctl(vcpu, KVM_S390_IRQ, &irq); 91 vcpu_run(vcpu);
··· 2 /* Test KVM debugging features. */ 3 #include "kvm_util.h" 4 #include "test_util.h" 5 + #include "sie.h" 6 7 #include <linux/kvm.h> 8 9 #define __LC_SVC_NEW_PSW 0x1c0 10 #define __LC_PGM_NEW_PSW 0x1d0 11 #define IPA0_DIAG 0x8300 12 #define PGM_SPECIFICATION 0x06 13 ··· 85 vm = test_step_int_1(&vcpu, test_step_pgm_diag_guest_code, 86 __LC_PGM_NEW_PSW, new_psw); 87 TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_S390_SIEIC); 88 + TEST_ASSERT_EQ(vcpu->run->s390_sieic.icptcode, ICPT_INST); 89 TEST_ASSERT_EQ(vcpu->run->s390_sieic.ipa & 0xff00, IPA0_DIAG); 90 vcpu_ioctl(vcpu, KVM_S390_IRQ, &irq); 91 vcpu_run(vcpu);
+1 -3
tools/testing/selftests/kvm/s390x/memop.c
··· 16 #include "kvm_util.h" 17 #include "kselftest.h" 18 #include "ucall_common.h" 19 20 enum mop_target { 21 LOGICAL, ··· 227 228 #define CHECK_N_DO(f, ...) ({ f(__VA_ARGS__, CHECK_ONLY); f(__VA_ARGS__); }) 229 230 - #define PAGE_SHIFT 12 231 - #define PAGE_SIZE (1ULL << PAGE_SHIFT) 232 - #define PAGE_MASK (~(PAGE_SIZE - 1)) 233 #define CR0_FETCH_PROTECTION_OVERRIDE (1UL << (63 - 38)) 234 #define CR0_STORAGE_PROTECTION_OVERRIDE (1UL << (63 - 39)) 235
··· 16 #include "kvm_util.h" 17 #include "kselftest.h" 18 #include "ucall_common.h" 19 + #include "processor.h" 20 21 enum mop_target { 22 LOGICAL, ··· 226 227 #define CHECK_N_DO(f, ...) ({ f(__VA_ARGS__, CHECK_ONLY); f(__VA_ARGS__); }) 228 229 #define CR0_FETCH_PROTECTION_OVERRIDE (1UL << (63 - 38)) 230 #define CR0_STORAGE_PROTECTION_OVERRIDE (1UL << (63 - 39)) 231
+2 -3
tools/testing/selftests/kvm/s390x/tprot.c
··· 9 #include "kvm_util.h" 10 #include "kselftest.h" 11 #include "ucall_common.h" 12 13 - #define PAGE_SHIFT 12 14 - #define PAGE_SIZE (1 << PAGE_SHIFT) 15 #define CR0_FETCH_PROTECTION_OVERRIDE (1UL << (63 - 38)) 16 #define CR0_STORAGE_PROTECTION_OVERRIDE (1UL << (63 - 39)) 17 ··· 150 * instead. 151 * In order to skip these tests we detect this inside the guest 152 */ 153 - skip = tests[*i].addr < (void *)4096 && 154 tests[*i].expected != TRANSL_UNAVAIL && 155 !mapped_0; 156 if (!skip) {
··· 9 #include "kvm_util.h" 10 #include "kselftest.h" 11 #include "ucall_common.h" 12 + #include "processor.h" 13 14 #define CR0_FETCH_PROTECTION_OVERRIDE (1UL << (63 - 38)) 15 #define CR0_STORAGE_PROTECTION_OVERRIDE (1UL << (63 - 39)) 16 ··· 151 * instead. 152 * In order to skip these tests we detect this inside the guest 153 */ 154 + skip = tests[*i].addr < (void *)PAGE_SIZE && 155 tests[*i].expected != TRANSL_UNAVAIL && 156 !mapped_0; 157 if (!skip) {
+332
tools/testing/selftests/kvm/s390x/ucontrol_test.c
···
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + /* 3 + * Test code for the s390x kvm ucontrol interface 4 + * 5 + * Copyright IBM Corp. 2024 6 + * 7 + * Authors: 8 + * Christoph Schlameuss <schlameuss@linux.ibm.com> 9 + */ 10 + #include "debug_print.h" 11 + #include "kselftest_harness.h" 12 + #include "kvm_util.h" 13 + #include "processor.h" 14 + #include "sie.h" 15 + 16 + #include <linux/capability.h> 17 + #include <linux/sizes.h> 18 + 19 + #define VM_MEM_SIZE (4 * SZ_1M) 20 + 21 + /* so directly declare capget to check caps without libcap */ 22 + int capget(cap_user_header_t header, cap_user_data_t data); 23 + 24 + /** 25 + * In order to create user controlled virtual machines on S390, 26 + * check KVM_CAP_S390_UCONTROL and use the flag KVM_VM_S390_UCONTROL 27 + * as privileged user (SYS_ADMIN). 28 + */ 29 + void require_ucontrol_admin(void) 30 + { 31 + struct __user_cap_data_struct data[_LINUX_CAPABILITY_U32S_3]; 32 + struct __user_cap_header_struct hdr = { 33 + .version = _LINUX_CAPABILITY_VERSION_3, 34 + }; 35 + int rc; 36 + 37 + rc = capget(&hdr, data); 38 + TEST_ASSERT_EQ(0, rc); 39 + TEST_REQUIRE((data->effective & CAP_TO_MASK(CAP_SYS_ADMIN)) > 0); 40 + 41 + TEST_REQUIRE(kvm_has_cap(KVM_CAP_S390_UCONTROL)); 42 + } 43 + 44 + /* Test program setting some registers and looping */ 45 + extern char test_gprs_asm[]; 46 + asm("test_gprs_asm:\n" 47 + "xgr %r0, %r0\n" 48 + "lgfi %r1,1\n" 49 + "lgfi %r2,2\n" 50 + "lgfi %r3,3\n" 51 + "lgfi %r4,4\n" 52 + "lgfi %r5,5\n" 53 + "lgfi %r6,6\n" 54 + "lgfi %r7,7\n" 55 + "0:\n" 56 + " diag 0,0,0x44\n" 57 + " ahi %r0,1\n" 58 + " j 0b\n" 59 + ); 60 + 61 + FIXTURE(uc_kvm) 62 + { 63 + struct kvm_s390_sie_block *sie_block; 64 + struct kvm_run *run; 65 + uintptr_t base_gpa; 66 + uintptr_t code_gpa; 67 + uintptr_t base_hva; 68 + uintptr_t code_hva; 69 + int kvm_run_size; 70 + void *vm_mem; 71 + int vcpu_fd; 72 + int kvm_fd; 73 + int vm_fd; 74 + }; 75 + 76 + /** 77 + * create VM with single vcpu, map kvm_run and SIE control block for easy access 78 + */ 79 + FIXTURE_SETUP(uc_kvm) 80 + { 81 + struct kvm_s390_vm_cpu_processor info; 82 + int rc; 83 + 84 + require_ucontrol_admin(); 85 + 86 + self->kvm_fd = open_kvm_dev_path_or_exit(); 87 + self->vm_fd = ioctl(self->kvm_fd, KVM_CREATE_VM, KVM_VM_S390_UCONTROL); 88 + ASSERT_GE(self->vm_fd, 0); 89 + 90 + kvm_device_attr_get(self->vm_fd, KVM_S390_VM_CPU_MODEL, 91 + KVM_S390_VM_CPU_PROCESSOR, &info); 92 + TH_LOG("create VM 0x%llx", info.cpuid); 93 + 94 + self->vcpu_fd = ioctl(self->vm_fd, KVM_CREATE_VCPU, 0); 95 + ASSERT_GE(self->vcpu_fd, 0); 96 + 97 + self->kvm_run_size = ioctl(self->kvm_fd, KVM_GET_VCPU_MMAP_SIZE, NULL); 98 + ASSERT_GE(self->kvm_run_size, sizeof(struct kvm_run)) 99 + TH_LOG(KVM_IOCTL_ERROR(KVM_GET_VCPU_MMAP_SIZE, self->kvm_run_size)); 100 + self->run = (struct kvm_run *)mmap(NULL, self->kvm_run_size, 101 + PROT_READ | PROT_WRITE, MAP_SHARED, self->vcpu_fd, 0); 102 + ASSERT_NE(self->run, MAP_FAILED); 103 + /** 104 + * For virtual cpus that have been created with S390 user controlled 105 + * virtual machines, the resulting vcpu fd can be memory mapped at page 106 + * offset KVM_S390_SIE_PAGE_OFFSET in order to obtain a memory map of 107 + * the virtual cpu's hardware control block. 
108 + */ 109 + self->sie_block = (struct kvm_s390_sie_block *)mmap(NULL, PAGE_SIZE, 110 + PROT_READ | PROT_WRITE, MAP_SHARED, 111 + self->vcpu_fd, KVM_S390_SIE_PAGE_OFFSET << PAGE_SHIFT); 112 + ASSERT_NE(self->sie_block, MAP_FAILED); 113 + 114 + TH_LOG("VM created %p %p", self->run, self->sie_block); 115 + 116 + self->base_gpa = 0; 117 + self->code_gpa = self->base_gpa + (3 * SZ_1M); 118 + 119 + self->vm_mem = aligned_alloc(SZ_1M, VM_MEM_SIZE); 120 + ASSERT_NE(NULL, self->vm_mem) TH_LOG("malloc failed %u", errno); 121 + self->base_hva = (uintptr_t)self->vm_mem; 122 + self->code_hva = self->base_hva - self->base_gpa + self->code_gpa; 123 + struct kvm_s390_ucas_mapping map = { 124 + .user_addr = self->base_hva, 125 + .vcpu_addr = self->base_gpa, 126 + .length = VM_MEM_SIZE, 127 + }; 128 + TH_LOG("ucas map %p %p 0x%llx", 129 + (void *)map.user_addr, (void *)map.vcpu_addr, map.length); 130 + rc = ioctl(self->vcpu_fd, KVM_S390_UCAS_MAP, &map); 131 + ASSERT_EQ(0, rc) TH_LOG("ucas map result %d not expected, %s", 132 + rc, strerror(errno)); 133 + 134 + TH_LOG("page in %p", (void *)self->base_gpa); 135 + rc = ioctl(self->vcpu_fd, KVM_S390_VCPU_FAULT, self->base_gpa); 136 + ASSERT_EQ(0, rc) TH_LOG("vcpu fault (%p) result %d not expected, %s", 137 + (void *)self->base_hva, rc, strerror(errno)); 138 + 139 + self->sie_block->cpuflags &= ~CPUSTAT_STOPPED; 140 + } 141 + 142 + FIXTURE_TEARDOWN(uc_kvm) 143 + { 144 + munmap(self->sie_block, PAGE_SIZE); 145 + munmap(self->run, self->kvm_run_size); 146 + close(self->vcpu_fd); 147 + close(self->vm_fd); 148 + close(self->kvm_fd); 149 + free(self->vm_mem); 150 + } 151 + 152 + TEST_F(uc_kvm, uc_sie_assertions) 153 + { 154 + /* assert interception of Code 08 (Program Interruption) is set */ 155 + EXPECT_EQ(0, self->sie_block->ecb & ECB_SPECI); 156 + } 157 + 158 + TEST_F(uc_kvm, uc_attr_mem_limit) 159 + { 160 + u64 limit; 161 + struct kvm_device_attr attr = { 162 + .group = KVM_S390_VM_MEM_CTRL, 163 + .attr = KVM_S390_VM_MEM_LIMIT_SIZE, 164 + .addr = (unsigned long)&limit, 165 + }; 166 + int rc; 167 + 168 + rc = ioctl(self->vm_fd, KVM_GET_DEVICE_ATTR, &attr); 169 + EXPECT_EQ(0, rc); 170 + EXPECT_EQ(~0UL, limit); 171 + 172 + /* assert set not supported */ 173 + rc = ioctl(self->vm_fd, KVM_SET_DEVICE_ATTR, &attr); 174 + EXPECT_EQ(-1, rc); 175 + EXPECT_EQ(EINVAL, errno); 176 + } 177 + 178 + TEST_F(uc_kvm, uc_no_dirty_log) 179 + { 180 + struct kvm_dirty_log dlog; 181 + int rc; 182 + 183 + rc = ioctl(self->vm_fd, KVM_GET_DIRTY_LOG, &dlog); 184 + EXPECT_EQ(-1, rc); 185 + EXPECT_EQ(EINVAL, errno); 186 + } 187 + 188 + /** 189 + * Assert HPAGE CAP cannot be enabled on UCONTROL VM 190 + */ 191 + TEST(uc_cap_hpage) 192 + { 193 + int rc, kvm_fd, vm_fd, vcpu_fd; 194 + struct kvm_enable_cap cap = { 195 + .cap = KVM_CAP_S390_HPAGE_1M, 196 + }; 197 + 198 + require_ucontrol_admin(); 199 + 200 + kvm_fd = open_kvm_dev_path_or_exit(); 201 + vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, KVM_VM_S390_UCONTROL); 202 + ASSERT_GE(vm_fd, 0); 203 + 204 + /* assert hpages are not supported on ucontrol vm */ 205 + rc = ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_S390_HPAGE_1M); 206 + EXPECT_EQ(0, rc); 207 + 208 + /* Test that KVM_CAP_S390_HPAGE_1M can't be enabled for a ucontrol vm */ 209 + rc = ioctl(vm_fd, KVM_ENABLE_CAP, cap); 210 + EXPECT_EQ(-1, rc); 211 + EXPECT_EQ(EINVAL, errno); 212 + 213 + /* assert HPAGE CAP is rejected after vCPU creation */ 214 + vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, 0); 215 + ASSERT_GE(vcpu_fd, 0); 216 + rc = ioctl(vm_fd, KVM_ENABLE_CAP, cap); 217 + EXPECT_EQ(-1, rc); 218 + 
EXPECT_EQ(EBUSY, errno); 219 + 220 + close(vcpu_fd); 221 + close(vm_fd); 222 + close(kvm_fd); 223 + } 224 + 225 + /* verify SIEIC exit 226 + * * fail on codes not expected in the test cases 227 + */ 228 + static bool uc_handle_sieic(FIXTURE_DATA(uc_kvm) * self) 229 + { 230 + struct kvm_s390_sie_block *sie_block = self->sie_block; 231 + struct kvm_run *run = self->run; 232 + 233 + /* check SIE interception code */ 234 + pr_info("sieic: 0x%.2x 0x%.4x 0x%.4x\n", 235 + run->s390_sieic.icptcode, 236 + run->s390_sieic.ipa, 237 + run->s390_sieic.ipb); 238 + switch (run->s390_sieic.icptcode) { 239 + case ICPT_INST: 240 + /* end execution in caller on intercepted instruction */ 241 + pr_info("sie instruction interception\n"); 242 + return false; 243 + case ICPT_OPEREXC: 244 + /* operation exception */ 245 + TEST_FAIL("sie exception on %.4x%.8x", sie_block->ipa, sie_block->ipb); 246 + default: 247 + TEST_FAIL("UNEXPECTED SIEIC CODE %d", run->s390_sieic.icptcode); 248 + } 249 + return true; 250 + } 251 + 252 + /* verify VM state on exit */ 253 + static bool uc_handle_exit(FIXTURE_DATA(uc_kvm) * self) 254 + { 255 + struct kvm_run *run = self->run; 256 + 257 + switch (run->exit_reason) { 258 + case KVM_EXIT_S390_SIEIC: 259 + return uc_handle_sieic(self); 260 + default: 261 + pr_info("exit_reason %2d not handled\n", run->exit_reason); 262 + } 263 + return true; 264 + } 265 + 266 + /* run the VM until interrupted */ 267 + static int uc_run_once(FIXTURE_DATA(uc_kvm) * self) 268 + { 269 + int rc; 270 + 271 + rc = ioctl(self->vcpu_fd, KVM_RUN, NULL); 272 + print_run(self->run, self->sie_block); 273 + print_regs(self->run); 274 + pr_debug("run %d / %d %s\n", rc, errno, strerror(errno)); 275 + return rc; 276 + } 277 + 278 + static void uc_assert_diag44(FIXTURE_DATA(uc_kvm) * self) 279 + { 280 + struct kvm_s390_sie_block *sie_block = self->sie_block; 281 + 282 + /* assert vm was interrupted by diag 0x0044 */ 283 + TEST_ASSERT_EQ(KVM_EXIT_S390_SIEIC, self->run->exit_reason); 284 + TEST_ASSERT_EQ(ICPT_INST, sie_block->icptcode); 285 + TEST_ASSERT_EQ(0x8300, sie_block->ipa); 286 + TEST_ASSERT_EQ(0x440000, sie_block->ipb); 287 + } 288 + 289 + TEST_F(uc_kvm, uc_gprs) 290 + { 291 + struct kvm_sync_regs *sync_regs = &self->run->s.regs; 292 + struct kvm_run *run = self->run; 293 + struct kvm_regs regs = {}; 294 + 295 + /* Set registers to values that are different from the ones that we expect below */ 296 + for (int i = 0; i < 8; i++) 297 + sync_regs->gprs[i] = 8; 298 + run->kvm_dirty_regs |= KVM_SYNC_GPRS; 299 + 300 + /* copy test_gprs_asm to code_hva / code_gpa */ 301 + TH_LOG("copy code %p to vm mapped memory %p / %p", 302 + &test_gprs_asm, (void *)self->code_hva, (void *)self->code_gpa); 303 + memcpy((void *)self->code_hva, &test_gprs_asm, PAGE_SIZE); 304 + 305 + /* DAT disabled + 64 bit mode */ 306 + run->psw_mask = 0x0000000180000000ULL; 307 + run->psw_addr = self->code_gpa; 308 + 309 + /* run and expect interception of diag 44 */ 310 + ASSERT_EQ(0, uc_run_once(self)); 311 + ASSERT_EQ(false, uc_handle_exit(self)); 312 + uc_assert_diag44(self); 313 + 314 + /* Retrieve and check guest register values */ 315 + ASSERT_EQ(0, ioctl(self->vcpu_fd, KVM_GET_REGS, &regs)); 316 + for (int i = 0; i < 8; i++) { 317 + ASSERT_EQ(i, regs.gprs[i]); 318 + ASSERT_EQ(i, sync_regs->gprs[i]); 319 + } 320 + 321 + /* run and expect interception of diag 44 again */ 322 + ASSERT_EQ(0, uc_run_once(self)); 323 + ASSERT_EQ(false, uc_handle_exit(self)); 324 + uc_assert_diag44(self); 325 + 326 + /* check continued increment of register 0 
value */ 327 + ASSERT_EQ(0, ioctl(self->vcpu_fd, KVM_GET_REGS, &regs)); 328 + ASSERT_EQ(1, regs.gprs[0]); 329 + ASSERT_EQ(1, sync_regs->gprs[0]); 330 + } 331 + 332 + TEST_HARNESS_MAIN
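Condensed, the raw ioctl sequence the fixture above wraps looks roughly like the sketch below. This is a hedged illustration only: error handling, the SIE control block mmap and the PSW/register setup are omitted, and it needs CAP_SYS_ADMIN plus KVM_CAP_S390_UCONTROL exactly as the test checks:

#include <stdint.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>
#include <linux/sizes.h>
#include "kvm_util.h"

int main(void)
{
	struct kvm_s390_ucas_mapping map;
	int kvm_fd, vm_fd, vcpu_fd;
	void *mem;

	kvm_fd = open_kvm_dev_path_or_exit();
	/* ucontrol VMs are requested via the VM type argument */
	vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, KVM_VM_S390_UCONTROL);
	vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, 0);

	/* no memslots: guest absolute memory is mapped per vCPU instead */
	mem = aligned_alloc(SZ_1M, SZ_1M);
	map = (struct kvm_s390_ucas_mapping) {
		.user_addr = (uintptr_t)mem,
		.vcpu_addr = 0,
		.length = SZ_1M,
	};
	ioctl(vcpu_fd, KVM_S390_UCAS_MAP, &map);

	/* fault in the backing before the first KVM_RUN, as the test does */
	ioctl(vcpu_fd, KVM_S390_VCPU_FAULT, 0);
	return 0;
}
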
+21 -8
tools/testing/selftests/kvm/set_memory_region_test.c
··· 175 GUEST_DONE(); 176 } 177 178 - static void test_move_memory_region(void) 179 { 180 pthread_t vcpu_thread; 181 struct kvm_vcpu *vcpu; ··· 183 uint64_t *hva; 184 185 vm = spawn_vm(&vcpu, &vcpu_thread, guest_code_move_memory_region); 186 187 hva = addr_gpa2hva(vm, MEM_REGION_GPA); 188 ··· 269 GUEST_ASSERT(0); 270 } 271 272 - static void test_delete_memory_region(void) 273 { 274 pthread_t vcpu_thread; 275 struct kvm_vcpu *vcpu; ··· 278 struct kvm_vm *vm; 279 280 vm = spawn_vm(&vcpu, &vcpu_thread, guest_code_delete_memory_region); 281 282 /* Delete the memory region, the guest should not die. */ 283 vm_mem_region_delete(vm, MEM_REGION_SLOT); ··· 559 { 560 #ifdef __x86_64__ 561 int i, loops; 562 563 /* 564 * FIXME: the zero-memslot test fails on aarch64 and s390x because 565 * KVM_RUN fails with ENOEXEC or EFAULT. ··· 588 else 589 loops = 10; 590 591 - pr_info("Testing MOVE of in-use region, %d loops\n", loops); 592 - for (i = 0; i < loops; i++) 593 - test_move_memory_region(); 594 595 - pr_info("Testing DELETE of in-use region, %d loops\n", loops); 596 - for (i = 0; i < loops; i++) 597 - test_delete_memory_region(); 598 #endif 599 600 return 0;
··· 175 GUEST_DONE(); 176 } 177 178 + static void test_move_memory_region(bool disable_slot_zap_quirk) 179 { 180 pthread_t vcpu_thread; 181 struct kvm_vcpu *vcpu; ··· 183 uint64_t *hva; 184 185 vm = spawn_vm(&vcpu, &vcpu_thread, guest_code_move_memory_region); 186 + 187 + if (disable_slot_zap_quirk) 188 + vm_enable_cap(vm, KVM_CAP_DISABLE_QUIRKS2, KVM_X86_QUIRK_SLOT_ZAP_ALL); 189 190 hva = addr_gpa2hva(vm, MEM_REGION_GPA); 191 ··· 266 GUEST_ASSERT(0); 267 } 268 269 + static void test_delete_memory_region(bool disable_slot_zap_quirk) 270 { 271 pthread_t vcpu_thread; 272 struct kvm_vcpu *vcpu; ··· 275 struct kvm_vm *vm; 276 277 vm = spawn_vm(&vcpu, &vcpu_thread, guest_code_delete_memory_region); 278 + 279 + if (disable_slot_zap_quirk) 280 + vm_enable_cap(vm, KVM_CAP_DISABLE_QUIRKS2, KVM_X86_QUIRK_SLOT_ZAP_ALL); 281 282 /* Delete the memory region, the guest should not die. */ 283 vm_mem_region_delete(vm, MEM_REGION_SLOT); ··· 553 { 554 #ifdef __x86_64__ 555 int i, loops; 556 + int j, disable_slot_zap_quirk = 0; 557 558 + if (kvm_check_cap(KVM_CAP_DISABLE_QUIRKS2) & KVM_X86_QUIRK_SLOT_ZAP_ALL) 559 + disable_slot_zap_quirk = 1; 560 /* 561 * FIXME: the zero-memslot test fails on aarch64 and s390x because 562 * KVM_RUN fails with ENOEXEC or EFAULT. ··· 579 else 580 loops = 10; 581 582 + for (j = 0; j <= disable_slot_zap_quirk; j++) { 583 + pr_info("Testing MOVE of in-use region, %d loops, slot zap quirk %s\n", 584 + loops, j ? "disabled" : "enabled"); 585 + for (i = 0; i < loops; i++) 586 + test_move_memory_region(!!j); 587 588 + pr_info("Testing DELETE of in-use region, %d loops, slot zap quirk %s\n", 589 + loops, j ? "disabled" : "enabled"); 590 + for (i = 0; i < loops; i++) 591 + test_delete_memory_region(!!j); 592 + } 593 #endif 594 595 return 0;
+7 -4
tools/testing/selftests/kvm/x86_64/debug_regs.c
··· 47 /* 48 * Single step test, covers 2 basic instructions and 2 emulated 49 * 50 - * Enable interrupts during the single stepping to see that 51 - * pending interrupt we raised is not handled due to KVM_GUESTDBG_BLOCKIRQ 52 */ 53 asm volatile("ss_start: " 54 "sti\n\t" 55 "xor %%eax,%%eax\n\t" 56 "cpuid\n\t" 57 - "movl $0x1a0,%%ecx\n\t" 58 - "rdmsr\n\t" 59 "cli\n\t" 60 : : : "eax", "ebx", "ecx", "edx"); 61
··· 47 /* 48 * Single step test, covers 2 basic instructions and 2 emulated 49 * 50 + * Enable interrupts during the single stepping to see that pending 51 + * interrupt we raised is not handled due to KVM_GUESTDBG_BLOCKIRQ. 52 + * 53 + * Write MSR_IA32_TSC_DEADLINE to verify that KVM's fastpath handler 54 + * exits to userspace due to single-step being enabled. 55 */ 56 asm volatile("ss_start: " 57 "sti\n\t" 58 "xor %%eax,%%eax\n\t" 59 "cpuid\n\t" 60 + "movl $" __stringify(MSR_IA32_TSC_DEADLINE) ", %%ecx\n\t" 61 + "wrmsr\n\t" 62 "cli\n\t" 63 : : : "eax", "ebx", "ecx", "edx"); 64
+1 -1
tools/testing/selftests/kvm/x86_64/hyperv_evmcs.c
··· 242 TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); 243 TEST_REQUIRE(kvm_has_cap(KVM_CAP_NESTED_STATE)); 244 TEST_REQUIRE(kvm_has_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS)); 245 - TEST_REQUIRE(kvm_has_cap(KVM_CAP_HYPERV_DIRECT_TLBFLUSH)); 246 247 vm = vm_create_with_one_vcpu(&vcpu, guest_code); 248
··· 242 TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); 243 TEST_REQUIRE(kvm_has_cap(KVM_CAP_NESTED_STATE)); 244 TEST_REQUIRE(kvm_has_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS)); 245 + TEST_REQUIRE(kvm_hv_cpu_has(HV_X64_NESTED_DIRECT_FLUSH)); 246 247 vm = vm_create_with_one_vcpu(&vcpu, guest_code); 248
+1 -1
tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c
··· 157 int stage; 158 159 TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SVM)); 160 - TEST_REQUIRE(kvm_has_cap(KVM_CAP_HYPERV_DIRECT_TLBFLUSH)); 161 162 /* Create VM */ 163 vm = vm_create_with_one_vcpu(&vcpu, guest_code);
··· 157 int stage; 158 159 TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SVM)); 160 + TEST_REQUIRE(kvm_hv_cpu_has(HV_X64_NESTED_DIRECT_FLUSH)); 161 162 /* Create VM */ 163 vm = vm_create_with_one_vcpu(&vcpu, guest_code);
+32
tools/testing/selftests/kvm/x86_64/sev_smoke_test.c
··· 160 kvm_vm_free(vm); 161 } 162 163 int main(int argc, char *argv[]) 164 { 165 TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SEV)); ··· 200 if (kvm_cpu_has(X86_FEATURE_SEV_ES)) { 201 test_sev(guest_sev_es_code, SEV_POLICY_ES | SEV_POLICY_NO_DBG); 202 test_sev(guest_sev_es_code, SEV_POLICY_ES); 203 204 if (kvm_has_cap(KVM_CAP_XCRS) && 205 (xgetbv(0) & XFEATURE_MASK_X87_AVX) == XFEATURE_MASK_X87_AVX) {
··· 160 kvm_vm_free(vm); 161 } 162 163 + static void guest_shutdown_code(void) 164 + { 165 + struct desc_ptr idt; 166 + 167 + /* Clobber the IDT so that #UD is guaranteed to trigger SHUTDOWN. */ 168 + memset(&idt, 0, sizeof(idt)); 169 + __asm__ __volatile__("lidt %0" :: "m"(idt)); 170 + 171 + __asm__ __volatile__("ud2"); 172 + } 173 + 174 + static void test_sev_es_shutdown(void) 175 + { 176 + struct kvm_vcpu *vcpu; 177 + struct kvm_vm *vm; 178 + 179 + uint32_t type = KVM_X86_SEV_ES_VM; 180 + 181 + vm = vm_sev_create_with_one_vcpu(type, guest_shutdown_code, &vcpu); 182 + 183 + vm_sev_launch(vm, SEV_POLICY_ES, NULL); 184 + 185 + vcpu_run(vcpu); 186 + TEST_ASSERT(vcpu->run->exit_reason == KVM_EXIT_SHUTDOWN, 187 + "Wanted SHUTDOWN, got %s", 188 + exit_reason_str(vcpu->run->exit_reason)); 189 + 190 + kvm_vm_free(vm); 191 + } 192 + 193 int main(int argc, char *argv[]) 194 { 195 TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SEV)); ··· 170 if (kvm_cpu_has(X86_FEATURE_SEV_ES)) { 171 test_sev(guest_sev_es_code, SEV_POLICY_ES | SEV_POLICY_NO_DBG); 172 test_sev(guest_sev_es_code, SEV_POLICY_ES); 173 + 174 + test_sev_es_shutdown(); 175 176 if (kvm_has_cap(KVM_CAP_XCRS) && 177 (xgetbv(0) & XFEATURE_MASK_X87_AVX) == XFEATURE_MASK_X87_AVX) {
+37 -17
tools/testing/selftests/kvm/x86_64/xapic_state_test.c
··· 13 struct xapic_vcpu { 14 struct kvm_vcpu *vcpu; 15 bool is_x2apic; 16 }; 17 18 static void xapic_guest_code(void) ··· 32 } 33 } 34 35 static void x2apic_guest_code(void) 36 { 37 asm volatile("cli"); ··· 46 uint64_t val = x2apic_read_reg(APIC_IRR) | 47 x2apic_read_reg(APIC_IRR + 0x10) << 32; 48 49 - x2apic_write_reg(APIC_ICR, val); 50 GUEST_SYNC(val); 51 } while (1); 52 } ··· 81 icr = (u64)(*((u32 *)&xapic.regs[APIC_ICR])) | 82 (u64)(*((u32 *)&xapic.regs[APIC_ICR2])) << 32; 83 if (!x->is_x2apic) { 84 - val &= (-1u | (0xffull << (32 + 24))); 85 - TEST_ASSERT_EQ(icr, val & ~APIC_ICR_BUSY); 86 - } else { 87 - TEST_ASSERT_EQ(icr & ~APIC_ICR_BUSY, val & ~APIC_ICR_BUSY); 88 } 89 - } 90 91 - #define X2APIC_RSVED_BITS_MASK (GENMASK_ULL(31,20) | \ 92 - GENMASK_ULL(17,16) | \ 93 - GENMASK_ULL(13,13)) 94 95 static void __test_icr(struct xapic_vcpu *x, uint64_t val) 96 { 97 - if (x->is_x2apic) { 98 - /* Hardware writing vICR register requires reserved bits 31:20, 99 - * 17:16 and 13 kept as zero to avoid #GP exception. Data value 100 - * written to vICR should mask out those bits above. 101 - */ 102 - val &= ~X2APIC_RSVED_BITS_MASK; 103 - } 104 - ____test_icr(x, val | APIC_ICR_BUSY); 105 ____test_icr(x, val & ~(u64)APIC_ICR_BUSY); 106 } 107 ··· 241 */ 242 vm = vm_create_with_one_vcpu(&x.vcpu, xapic_guest_code); 243 x.is_x2apic = false; 244 245 vcpu_clear_cpuid_feature(x.vcpu, X86_FEATURE_X2APIC); 246
··· 13 struct xapic_vcpu { 14 struct kvm_vcpu *vcpu; 15 bool is_x2apic; 16 + bool has_xavic_errata; 17 }; 18 19 static void xapic_guest_code(void) ··· 31 } 32 } 33 34 + #define X2APIC_RSVD_BITS_MASK (GENMASK_ULL(31, 20) | \ 35 + GENMASK_ULL(17, 16) | \ 36 + GENMASK_ULL(13, 13)) 37 + 38 static void x2apic_guest_code(void) 39 { 40 asm volatile("cli"); ··· 41 uint64_t val = x2apic_read_reg(APIC_IRR) | 42 x2apic_read_reg(APIC_IRR + 0x10) << 32; 43 44 + if (val & X2APIC_RSVD_BITS_MASK) { 45 + x2apic_write_reg_fault(APIC_ICR, val); 46 + } else { 47 + x2apic_write_reg(APIC_ICR, val); 48 + GUEST_ASSERT_EQ(x2apic_read_reg(APIC_ICR), val); 49 + } 50 GUEST_SYNC(val); 51 } while (1); 52 } ··· 71 icr = (u64)(*((u32 *)&xapic.regs[APIC_ICR])) | 72 (u64)(*((u32 *)&xapic.regs[APIC_ICR2])) << 32; 73 if (!x->is_x2apic) { 74 + if (!x->has_xavic_errata) 75 + val &= (-1u | (0xffull << (32 + 24))); 76 + } else if (val & X2APIC_RSVD_BITS_MASK) { 77 + return; 78 } 79 80 + if (x->has_xavic_errata) 81 + TEST_ASSERT_EQ(icr & ~APIC_ICR_BUSY, val & ~APIC_ICR_BUSY); 82 + else 83 + TEST_ASSERT_EQ(icr, val & ~APIC_ICR_BUSY); 84 + } 85 86 static void __test_icr(struct xapic_vcpu *x, uint64_t val) 87 { 88 + /* 89 + * The BUSY bit is reserved on both AMD and Intel, but only AMD treats 90 + * it is as _must_ be zero. Intel simply ignores the bit. Don't test 91 + * the BUSY bit for x2APIC, as there is no single correct behavior. 92 + */ 93 + if (!x->is_x2apic) 94 + ____test_icr(x, val | APIC_ICR_BUSY); 95 + 96 ____test_icr(x, val & ~(u64)APIC_ICR_BUSY); 97 } 98 ··· 230 */ 231 vm = vm_create_with_one_vcpu(&x.vcpu, xapic_guest_code); 232 x.is_x2apic = false; 233 + 234 + /* 235 + * AMD's AVIC implementation is buggy (fails to clear the ICR BUSY bit), 236 + * and also diverges from KVM with respect to ICR2[23:0] (KVM and Intel 237 + * drops writes, AMD does not). Account for the errata when checking 238 + * that KVM reads back what was written. 239 + */ 240 + x.has_xavic_errata = host_cpu_is_amd && 241 + get_kvm_amd_param_bool("avic"); 242 243 vcpu_clear_cpuid_feature(x.vcpu, X86_FEATURE_X2APIC); 244
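For reference, the reserved-bit mask used above (bits 31:20, 17:16 and 13 of the x2APIC ICR) works out to 0xfff32000. A trivial, purely illustrative compile-time check against the tools-side linux/bits.h:

#include <linux/bits.h>

/* 31:20 -> 0xfff00000, 17:16 -> 0x00030000, 13 -> 0x00002000 */
_Static_assert((GENMASK_ULL(31, 20) | GENMASK_ULL(17, 16) |
		GENMASK_ULL(13, 13)) == 0xfff32000ULL,
	       "unexpected x2APIC ICR reserved-bit mask");
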
+1
tools/testing/selftests/kvm/x86_64/xen_vmcall_test.c
··· 10 #include "test_util.h" 11 #include "kvm_util.h" 12 #include "processor.h" 13 14 #define HCALL_REGION_GPA 0xc0000000ULL 15 #define HCALL_REGION_SLOT 10
··· 10 #include "test_util.h" 11 #include "kvm_util.h" 12 #include "processor.h" 13 + #include "hyperv.h" 14 15 #define HCALL_REGION_GPA 0xc0000000ULL 16 #define HCALL_REGION_SLOT 10
+8 -23
virt/kvm/coalesced_mmio.c
··· 40 return 1; 41 } 42 43 - static int coalesced_mmio_has_room(struct kvm_coalesced_mmio_dev *dev, u32 last) 44 - { 45 - struct kvm_coalesced_mmio_ring *ring; 46 - unsigned avail; 47 - 48 - /* Are we able to batch it ? */ 49 - 50 - /* last is the first free entry 51 - * check if we don't meet the first used entry 52 - * there is always one unused entry in the buffer 53 - */ 54 - ring = dev->kvm->coalesced_mmio_ring; 55 - avail = (ring->first - last - 1) % KVM_COALESCED_MMIO_MAX; 56 - if (avail == 0) { 57 - /* full */ 58 - return 0; 59 - } 60 - 61 - return 1; 62 - } 63 - 64 static int coalesced_mmio_write(struct kvm_vcpu *vcpu, 65 struct kvm_io_device *this, gpa_t addr, 66 int len, const void *val) ··· 53 54 spin_lock(&dev->kvm->ring_lock); 55 56 insert = READ_ONCE(ring->last); 57 - if (!coalesced_mmio_has_room(dev, insert) || 58 - insert >= KVM_COALESCED_MMIO_MAX) { 59 spin_unlock(&dev->kvm->ring_lock); 60 return -EOPNOTSUPP; 61 }
··· 40 return 1; 41 } 42 43 static int coalesced_mmio_write(struct kvm_vcpu *vcpu, 44 struct kvm_io_device *this, gpa_t addr, 45 int len, const void *val) ··· 74 75 spin_lock(&dev->kvm->ring_lock); 76 77 + /* 78 + * last is the index of the entry to fill. Verify userspace hasn't 79 + * set last to be out of range, and that there is room in the ring. 80 + * Leave one entry free in the ring so that userspace can differentiate 81 + * between an empty ring and a full ring. 82 + */ 83 insert = READ_ONCE(ring->last); 84 + if (insert >= KVM_COALESCED_MMIO_MAX || 85 + (insert + 1) % KVM_COALESCED_MMIO_MAX == READ_ONCE(ring->first)) { 86 spin_unlock(&dev->kvm->ring_lock); 87 return -EOPNOTSUPP; 88 }
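The rewritten check is the kernel half of a standard single-producer/single-consumer ring: KVM only advances 'last', userspace only advances 'first', the ring is empty when the two are equal, and one slot is always left unused so that state stays unambiguous. A hedged sketch of the matching userspace consumer (the ring typically comes from the page offset returned by KVM_CHECK_EXTENSION(KVM_CAP_COALESCED_MMIO) within the vCPU's kvm_run mapping; the replay step is a placeholder, and memory-ordering barriers are omitted):

#include <linux/kvm.h>

static void drain_coalesced_mmio(struct kvm_coalesced_mmio_ring *ring)
{
	__u32 first = ring->first;

	while (first != ring->last) {
		struct kvm_coalesced_mmio *e = &ring->coalesced_mmio[first];

		/*
		 * Replay e->phys_addr / e->len / e->data into the device
		 * model here (placeholder).
		 */
		(void)e;

		first = (first + 1) % KVM_COALESCED_MMIO_MAX;
	}
	/* publish consumption so the kernel sees the freed slots */
	ring->first = first;
}
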
+144 -137
virt/kvm/kvm_main.c
··· 136 #define KVM_COMPAT(c) .compat_ioctl = kvm_no_compat_ioctl, \ 137 .open = kvm_no_compat_open 138 #endif 139 - static int hardware_enable_all(void); 140 - static void hardware_disable_all(void); 141 142 static void kvm_io_bus_destroy(struct kvm_io_bus *bus); 143 ··· 1220 if (r) 1221 goto out_err_no_arch_destroy_vm; 1222 1223 - r = hardware_enable_all(); 1224 if (r) 1225 goto out_err_no_disable; 1226 ··· 1263 mmu_notifier_unregister(&kvm->mmu_notifier, current->mm); 1264 #endif 1265 out_err_no_mmu_notifier: 1266 - hardware_disable_all(); 1267 out_err_no_disable: 1268 kvm_arch_destroy_vm(kvm); 1269 out_err_no_arch_destroy_vm: ··· 1360 #endif 1361 kvm_arch_free_vm(kvm); 1362 preempt_notifier_dec(); 1363 - hardware_disable_all(); 1364 mmdrop(mm); 1365 } 1366 ··· 3270 int r; 3271 unsigned long addr; 3272 3273 addr = gfn_to_hva_memslot_prot(slot, gfn, NULL); 3274 if (kvm_is_error_hva(addr)) 3275 return -EFAULT; ··· 3346 int r; 3347 unsigned long addr; 3348 3349 addr = gfn_to_hva_memslot_prot(slot, gfn, NULL); 3350 if (kvm_is_error_hva(addr)) 3351 return -EFAULT; ··· 3378 { 3379 int r; 3380 unsigned long addr; 3381 3382 addr = gfn_to_hva_memslot(memslot, gfn); 3383 if (kvm_is_error_hva(addr)) ··· 3585 int ret; 3586 3587 while ((seg = next_segment(len, offset)) != 0) { 3588 - ret = kvm_write_guest_page(kvm, gfn, zero_page, offset, len); 3589 if (ret < 0) 3590 return ret; 3591 offset = 0; ··· 5575 }; 5576 5577 #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING 5578 __visible bool kvm_rebooting; 5579 EXPORT_SYMBOL_GPL(kvm_rebooting); 5580 5581 - static DEFINE_PER_CPU(bool, hardware_enabled); 5582 static int kvm_usage_count; 5583 5584 - static int __hardware_enable_nolock(void) 5585 { 5586 - if (__this_cpu_read(hardware_enabled)) 5587 return 0; 5588 5589 - if (kvm_arch_hardware_enable()) { 5590 pr_info("kvm: enabling virtualization on CPU%d failed\n", 5591 raw_smp_processor_id()); 5592 return -EIO; 5593 } 5594 5595 - __this_cpu_write(hardware_enabled, true); 5596 return 0; 5597 - } 5598 - 5599 - static void hardware_enable_nolock(void *failed) 5600 - { 5601 - if (__hardware_enable_nolock()) 5602 - atomic_inc(failed); 5603 } 5604 5605 static int kvm_online_cpu(unsigned int cpu) 5606 { 5607 - int ret = 0; 5608 - 5609 /* 5610 * Abort the CPU online process if hardware virtualization cannot 5611 * be enabled. Otherwise running VMs would encounter unrecoverable 5612 * errors when scheduled to this CPU. 5613 */ 5614 - mutex_lock(&kvm_lock); 5615 - if (kvm_usage_count) 5616 - ret = __hardware_enable_nolock(); 5617 - mutex_unlock(&kvm_lock); 5618 - return ret; 5619 } 5620 5621 - static void hardware_disable_nolock(void *junk) 5622 { 5623 - /* 5624 - * Note, hardware_disable_all_nolock() tells all online CPUs to disable 5625 - * hardware, not just CPUs that successfully enabled hardware! 
5626 - */ 5627 - if (!__this_cpu_read(hardware_enabled)) 5628 return; 5629 5630 - kvm_arch_hardware_disable(); 5631 5632 - __this_cpu_write(hardware_enabled, false); 5633 } 5634 5635 static int kvm_offline_cpu(unsigned int cpu) 5636 { 5637 - mutex_lock(&kvm_lock); 5638 - if (kvm_usage_count) 5639 - hardware_disable_nolock(NULL); 5640 - mutex_unlock(&kvm_lock); 5641 return 0; 5642 - } 5643 - 5644 - static void hardware_disable_all_nolock(void) 5645 - { 5646 - BUG_ON(!kvm_usage_count); 5647 - 5648 - kvm_usage_count--; 5649 - if (!kvm_usage_count) 5650 - on_each_cpu(hardware_disable_nolock, NULL, 1); 5651 - } 5652 - 5653 - static void hardware_disable_all(void) 5654 - { 5655 - cpus_read_lock(); 5656 - mutex_lock(&kvm_lock); 5657 - hardware_disable_all_nolock(); 5658 - mutex_unlock(&kvm_lock); 5659 - cpus_read_unlock(); 5660 - } 5661 - 5662 - static int hardware_enable_all(void) 5663 - { 5664 - atomic_t failed = ATOMIC_INIT(0); 5665 - int r; 5666 - 5667 - /* 5668 - * Do not enable hardware virtualization if the system is going down. 5669 - * If userspace initiated a forced reboot, e.g. reboot -f, then it's 5670 - * possible for an in-flight KVM_CREATE_VM to trigger hardware enabling 5671 - * after kvm_reboot() is called. Note, this relies on system_state 5672 - * being set _before_ kvm_reboot(), which is why KVM uses a syscore ops 5673 - * hook instead of registering a dedicated reboot notifier (the latter 5674 - * runs before system_state is updated). 5675 - */ 5676 - if (system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF || 5677 - system_state == SYSTEM_RESTART) 5678 - return -EBUSY; 5679 - 5680 - /* 5681 - * When onlining a CPU, cpu_online_mask is set before kvm_online_cpu() 5682 - * is called, and so on_each_cpu() between them includes the CPU that 5683 - * is being onlined. As a result, hardware_enable_nolock() may get 5684 - * invoked before kvm_online_cpu(), which also enables hardware if the 5685 - * usage count is non-zero. Disable CPU hotplug to avoid attempting to 5686 - * enable hardware multiple times. 5687 - */ 5688 - cpus_read_lock(); 5689 - mutex_lock(&kvm_lock); 5690 - 5691 - r = 0; 5692 - 5693 - kvm_usage_count++; 5694 - if (kvm_usage_count == 1) { 5695 - on_each_cpu(hardware_enable_nolock, &failed, 1); 5696 - 5697 - if (atomic_read(&failed)) { 5698 - hardware_disable_all_nolock(); 5699 - r = -EBUSY; 5700 - } 5701 - } 5702 - 5703 - mutex_unlock(&kvm_lock); 5704 - cpus_read_unlock(); 5705 - 5706 - return r; 5707 } 5708 5709 static void kvm_shutdown(void) ··· 5651 */ 5652 pr_info("kvm: exiting hardware virtualization\n"); 5653 kvm_rebooting = true; 5654 - on_each_cpu(hardware_disable_nolock, NULL, 1); 5655 } 5656 5657 static int kvm_suspend(void) 5658 { 5659 /* 5660 * Secondary CPUs and CPU hotplug are disabled across the suspend/resume 5661 - * callbacks, i.e. no need to acquire kvm_lock to ensure the usage count 5662 - * is stable. Assert that kvm_lock is not held to ensure the system 5663 - * isn't suspended while KVM is enabling hardware. Hardware enabling 5664 - * can be preempted, but the task cannot be frozen until it has dropped 5665 - * all locks (userspace tasks are frozen via a fake signal). 
5666 */ 5667 - lockdep_assert_not_held(&kvm_lock); 5668 lockdep_assert_irqs_disabled(); 5669 5670 - if (kvm_usage_count) 5671 - hardware_disable_nolock(NULL); 5672 return 0; 5673 } 5674 5675 static void kvm_resume(void) 5676 { 5677 - lockdep_assert_not_held(&kvm_lock); 5678 lockdep_assert_irqs_disabled(); 5679 5680 - if (kvm_usage_count) 5681 - WARN_ON_ONCE(__hardware_enable_nolock()); 5682 } 5683 5684 static struct syscore_ops kvm_syscore_ops = { ··· 5684 .resume = kvm_resume, 5685 .shutdown = kvm_shutdown, 5686 }; 5687 #else /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */ 5688 - static int hardware_enable_all(void) 5689 { 5690 return 0; 5691 } 5692 5693 - static void hardware_disable_all(void) 5694 { 5695 5696 } ··· 6473 int r; 6474 int cpu; 6475 6476 - #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING 6477 - r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online", 6478 - kvm_online_cpu, kvm_offline_cpu); 6479 - if (r) 6480 - return r; 6481 - 6482 - register_syscore_ops(&kvm_syscore_ops); 6483 - #endif 6484 - 6485 /* A kmem cache lets us meet the alignment requirements of fx_save. */ 6486 if (!vcpu_align) 6487 vcpu_align = __alignof__(struct kvm_vcpu); ··· 6483 offsetofend(struct kvm_vcpu, stats_id) 6484 - offsetof(struct kvm_vcpu, arch), 6485 NULL); 6486 - if (!kvm_vcpu_cache) { 6487 - r = -ENOMEM; 6488 - goto err_vcpu_cache; 6489 - } 6490 6491 for_each_possible_cpu(cpu) { 6492 if (!alloc_cpumask_var_node(&per_cpu(cpu_kick_mask, cpu), ··· 6518 6519 kvm_gmem_init(module); 6520 6521 /* 6522 * Registration _must_ be the very last thing done, as this exposes 6523 * /dev/kvm to userspace, i.e. all infrastructure must be setup! ··· 6535 return 0; 6536 6537 err_register: 6538 kvm_vfio_ops_exit(); 6539 err_vfio: 6540 kvm_async_pf_deinit(); ··· 6547 for_each_possible_cpu(cpu) 6548 free_cpumask_var(per_cpu(cpu_kick_mask, cpu)); 6549 kmem_cache_destroy(kvm_vcpu_cache); 6550 - err_vcpu_cache: 6551 - #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING 6552 - unregister_syscore_ops(&kvm_syscore_ops); 6553 - cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE); 6554 - #endif 6555 return r; 6556 } 6557 EXPORT_SYMBOL_GPL(kvm_init); ··· 6562 */ 6563 misc_deregister(&kvm_dev); 6564 6565 debugfs_remove_recursive(kvm_debugfs_dir); 6566 for_each_possible_cpu(cpu) 6567 free_cpumask_var(per_cpu(cpu_kick_mask, cpu)); 6568 kmem_cache_destroy(kvm_vcpu_cache); 6569 kvm_vfio_ops_exit(); 6570 kvm_async_pf_deinit(); 6571 - #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING 6572 - unregister_syscore_ops(&kvm_syscore_ops); 6573 - cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE); 6574 - #endif 6575 kvm_irqfd_exit(); 6576 } 6577 EXPORT_SYMBOL_GPL(kvm_exit);
··· 136 #define KVM_COMPAT(c) .compat_ioctl = kvm_no_compat_ioctl, \ 137 .open = kvm_no_compat_open 138 #endif 139 + static int kvm_enable_virtualization(void); 140 + static void kvm_disable_virtualization(void); 141 142 static void kvm_io_bus_destroy(struct kvm_io_bus *bus); 143 ··· 1220 if (r) 1221 goto out_err_no_arch_destroy_vm; 1222 1223 + r = kvm_enable_virtualization(); 1224 if (r) 1225 goto out_err_no_disable; 1226 ··· 1263 mmu_notifier_unregister(&kvm->mmu_notifier, current->mm); 1264 #endif 1265 out_err_no_mmu_notifier: 1266 + kvm_disable_virtualization(); 1267 out_err_no_disable: 1268 kvm_arch_destroy_vm(kvm); 1269 out_err_no_arch_destroy_vm: ··· 1360 #endif 1361 kvm_arch_free_vm(kvm); 1362 preempt_notifier_dec(); 1363 + kvm_disable_virtualization(); 1364 mmdrop(mm); 1365 } 1366 ··· 3270 int r; 3271 unsigned long addr; 3272 3273 + if (WARN_ON_ONCE(offset + len > PAGE_SIZE)) 3274 + return -EFAULT; 3275 + 3276 addr = gfn_to_hva_memslot_prot(slot, gfn, NULL); 3277 if (kvm_is_error_hva(addr)) 3278 return -EFAULT; ··· 3343 int r; 3344 unsigned long addr; 3345 3346 + if (WARN_ON_ONCE(offset + len > PAGE_SIZE)) 3347 + return -EFAULT; 3348 + 3349 addr = gfn_to_hva_memslot_prot(slot, gfn, NULL); 3350 if (kvm_is_error_hva(addr)) 3351 return -EFAULT; ··· 3372 { 3373 int r; 3374 unsigned long addr; 3375 + 3376 + if (WARN_ON_ONCE(offset + len > PAGE_SIZE)) 3377 + return -EFAULT; 3378 3379 addr = gfn_to_hva_memslot(memslot, gfn); 3380 if (kvm_is_error_hva(addr)) ··· 3576 int ret; 3577 3578 while ((seg = next_segment(len, offset)) != 0) { 3579 + ret = kvm_write_guest_page(kvm, gfn, zero_page, offset, seg); 3580 if (ret < 0) 3581 return ret; 3582 offset = 0; ··· 5566 }; 5567 5568 #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING 5569 + static bool enable_virt_at_load = true; 5570 + module_param(enable_virt_at_load, bool, 0444); 5571 + 5572 __visible bool kvm_rebooting; 5573 EXPORT_SYMBOL_GPL(kvm_rebooting); 5574 5575 + static DEFINE_PER_CPU(bool, virtualization_enabled); 5576 + static DEFINE_MUTEX(kvm_usage_lock); 5577 static int kvm_usage_count; 5578 5579 + __weak void kvm_arch_enable_virtualization(void) 5580 { 5581 + 5582 + } 5583 + 5584 + __weak void kvm_arch_disable_virtualization(void) 5585 + { 5586 + 5587 + } 5588 + 5589 + static int kvm_enable_virtualization_cpu(void) 5590 + { 5591 + if (__this_cpu_read(virtualization_enabled)) 5592 return 0; 5593 5594 + if (kvm_arch_enable_virtualization_cpu()) { 5595 pr_info("kvm: enabling virtualization on CPU%d failed\n", 5596 raw_smp_processor_id()); 5597 return -EIO; 5598 } 5599 5600 + __this_cpu_write(virtualization_enabled, true); 5601 return 0; 5602 } 5603 5604 static int kvm_online_cpu(unsigned int cpu) 5605 { 5606 /* 5607 * Abort the CPU online process if hardware virtualization cannot 5608 * be enabled. Otherwise running VMs would encounter unrecoverable 5609 * errors when scheduled to this CPU. 
5610 */ 5611 + return kvm_enable_virtualization_cpu(); 5612 } 5613 5614 + static void kvm_disable_virtualization_cpu(void *ign) 5615 { 5616 + if (!__this_cpu_read(virtualization_enabled)) 5617 return; 5618 5619 + kvm_arch_disable_virtualization_cpu(); 5620 5621 + __this_cpu_write(virtualization_enabled, false); 5622 } 5623 5624 static int kvm_offline_cpu(unsigned int cpu) 5625 { 5626 + kvm_disable_virtualization_cpu(NULL); 5627 return 0; 5628 } 5629 5630 static void kvm_shutdown(void) ··· 5712 */ 5713 pr_info("kvm: exiting hardware virtualization\n"); 5714 kvm_rebooting = true; 5715 + on_each_cpu(kvm_disable_virtualization_cpu, NULL, 1); 5716 } 5717 5718 static int kvm_suspend(void) 5719 { 5720 /* 5721 * Secondary CPUs and CPU hotplug are disabled across the suspend/resume 5722 + * callbacks, i.e. no need to acquire kvm_usage_lock to ensure the usage 5723 + * count is stable. Assert that kvm_usage_lock is not held to ensure 5724 + * the system isn't suspended while KVM is enabling hardware. Hardware 5725 + * enabling can be preempted, but the task cannot be frozen until it has 5726 + * dropped all locks (userspace tasks are frozen via a fake signal). 5727 */ 5728 + lockdep_assert_not_held(&kvm_usage_lock); 5729 lockdep_assert_irqs_disabled(); 5730 5731 + kvm_disable_virtualization_cpu(NULL); 5732 return 0; 5733 } 5734 5735 static void kvm_resume(void) 5736 { 5737 + lockdep_assert_not_held(&kvm_usage_lock); 5738 lockdep_assert_irqs_disabled(); 5739 5740 + WARN_ON_ONCE(kvm_enable_virtualization_cpu()); 5741 } 5742 5743 static struct syscore_ops kvm_syscore_ops = { ··· 5747 .resume = kvm_resume, 5748 .shutdown = kvm_shutdown, 5749 }; 5750 + 5751 + static int kvm_enable_virtualization(void) 5752 + { 5753 + int r; 5754 + 5755 + guard(mutex)(&kvm_usage_lock); 5756 + 5757 + if (kvm_usage_count++) 5758 + return 0; 5759 + 5760 + kvm_arch_enable_virtualization(); 5761 + 5762 + r = cpuhp_setup_state(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online", 5763 + kvm_online_cpu, kvm_offline_cpu); 5764 + if (r) 5765 + goto err_cpuhp; 5766 + 5767 + register_syscore_ops(&kvm_syscore_ops); 5768 + 5769 + /* 5770 + * Undo virtualization enabling and bail if the system is going down. 5771 + * If userspace initiated a forced reboot, e.g. reboot -f, then it's 5772 + * possible for an in-flight operation to enable virtualization after 5773 + * syscore_shutdown() is called, i.e. without kvm_shutdown() being 5774 + * invoked. Note, this relies on system_state being set _before_ 5775 + * kvm_shutdown(), e.g. to ensure either kvm_shutdown() is invoked 5776 + * or this CPU observes the impending shutdown. Which is why KVM uses 5777 + * a syscore ops hook instead of registering a dedicated reboot 5778 + * notifier (the latter runs before system_state is updated). 
5779 + */ 5780 + if (system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF || 5781 + system_state == SYSTEM_RESTART) { 5782 + r = -EBUSY; 5783 + goto err_rebooting; 5784 + } 5785 + 5786 + return 0; 5787 + 5788 + err_rebooting: 5789 + unregister_syscore_ops(&kvm_syscore_ops); 5790 + cpuhp_remove_state(CPUHP_AP_KVM_ONLINE); 5791 + err_cpuhp: 5792 + kvm_arch_disable_virtualization(); 5793 + --kvm_usage_count; 5794 + return r; 5795 + } 5796 + 5797 + static void kvm_disable_virtualization(void) 5798 + { 5799 + guard(mutex)(&kvm_usage_lock); 5800 + 5801 + if (--kvm_usage_count) 5802 + return; 5803 + 5804 + unregister_syscore_ops(&kvm_syscore_ops); 5805 + cpuhp_remove_state(CPUHP_AP_KVM_ONLINE); 5806 + kvm_arch_disable_virtualization(); 5807 + } 5808 + 5809 + static int kvm_init_virtualization(void) 5810 + { 5811 + if (enable_virt_at_load) 5812 + return kvm_enable_virtualization(); 5813 + 5814 + return 0; 5815 + } 5816 + 5817 + static void kvm_uninit_virtualization(void) 5818 + { 5819 + if (enable_virt_at_load) 5820 + kvm_disable_virtualization(); 5821 + } 5822 #else /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */ 5823 + static int kvm_enable_virtualization(void) 5824 { 5825 return 0; 5826 } 5827 5828 + static int kvm_init_virtualization(void) 5829 + { 5830 + return 0; 5831 + } 5832 + 5833 + static void kvm_disable_virtualization(void) 5834 + { 5835 + 5836 + } 5837 + 5838 + static void kvm_uninit_virtualization(void) 5839 { 5840 5841 } ··· 6454 int r; 6455 int cpu; 6456 6457 /* A kmem cache lets us meet the alignment requirements of fx_save. */ 6458 if (!vcpu_align) 6459 vcpu_align = __alignof__(struct kvm_vcpu); ··· 6473 offsetofend(struct kvm_vcpu, stats_id) 6474 - offsetof(struct kvm_vcpu, arch), 6475 NULL); 6476 + if (!kvm_vcpu_cache) 6477 + return -ENOMEM; 6478 6479 for_each_possible_cpu(cpu) { 6480 if (!alloc_cpumask_var_node(&per_cpu(cpu_kick_mask, cpu), ··· 6510 6511 kvm_gmem_init(module); 6512 6513 + r = kvm_init_virtualization(); 6514 + if (r) 6515 + goto err_virt; 6516 + 6517 /* 6518 * Registration _must_ be the very last thing done, as this exposes 6519 * /dev/kvm to userspace, i.e. all infrastructure must be setup! ··· 6523 return 0; 6524 6525 err_register: 6526 + kvm_uninit_virtualization(); 6527 + err_virt: 6528 kvm_vfio_ops_exit(); 6529 err_vfio: 6530 kvm_async_pf_deinit(); ··· 6533 for_each_possible_cpu(cpu) 6534 free_cpumask_var(per_cpu(cpu_kick_mask, cpu)); 6535 kmem_cache_destroy(kvm_vcpu_cache); 6536 return r; 6537 } 6538 EXPORT_SYMBOL_GPL(kvm_init); ··· 6553 */ 6554 misc_deregister(&kvm_dev); 6555 6556 + kvm_uninit_virtualization(); 6557 + 6558 debugfs_remove_recursive(kvm_debugfs_dir); 6559 for_each_possible_cpu(cpu) 6560 free_cpumask_var(per_cpu(cpu_kick_mask, cpu)); 6561 kmem_cache_destroy(kvm_vcpu_cache); 6562 kvm_vfio_ops_exit(); 6563 kvm_async_pf_deinit(); 6564 kvm_irqfd_exit(); 6565 } 6566 EXPORT_SYMBOL_GPL(kvm_exit);