Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull x86 kvm updates from Paolo Bonzini:
"x86:

- KVM currently invalidates the entirety of the page tables, not just
those for the memslot being touched, when a memslot is moved or
deleted.

This has not traditionally had particularly noticeable overhead, but
Intel's TDX will require the guest to re-accept private pages if they
are dropped from the secure EPT, which is a non-starter.

Actually, the only reason why this is not already being done is a
bug which was never fully investigated and caused VM instability
with assigned GeForce GPUs, so allow userspace to opt into the new
behavior.
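
As a rough userspace sketch (assuming the existing
KVM_CAP_DISABLE_QUIRKS2 mechanism and omitting error handling), the
opt-in amounts to disabling the new KVM_X86_QUIRK_SLOT_ZAP_ALL quirk
on the VM fd:

      #include <linux/kvm.h>
      #include <sys/ioctl.h>

      static int opt_in_to_precise_slot_zap(int vm_fd)
      {
              struct kvm_enable_cap cap = {
                      .cap = KVM_CAP_DISABLE_QUIRKS2,
                      .args[0] = KVM_X86_QUIRK_SLOT_ZAP_ALL,
              };

              /*
               * With the quirk disabled, deleting or moving a memslot
               * only zaps the leaf SPTEs that cover that memslot.
               */
              return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
      }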

- Advertise AVX10.1 to userspace (effectively prep work for the
"real" AVX10 functionality that is on the horizon)

- Rework common MSR handling code to suppress errors on userspace
accesses to unsupported-but-advertised MSRs

This will allow removing (almost?) all of KVM's exemptions for
userspace access to MSRs that shouldn't exist based on the vCPU
model (the actual cleanup is non-trivial future work)

- Rework KVM's handling of x2APIC ICR, again, because AMD (x2AVIC)
splits the 64-bit value into the legacy ICR and ICR2 storage,
whereas Intel (APICv) stores the entire 64-bit value at the ICR
offset
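
Condensed from the lapic.c change in this pull, the read side now keys
off a new x2apic_icr_is_split vendor flag:

      static u64 kvm_x2apic_icr_read(struct kvm_lapic *apic)
      {
              if (kvm_x86_ops.x2apic_icr_is_split)
                      return (u64)kvm_lapic_get_reg(apic, APIC_ICR) |
                             (u64)kvm_lapic_get_reg(apic, APIC_ICR2) << 32;

              return kvm_lapic_get_reg64(apic, APIC_ICR);
      }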

- Fix a bug where KVM would fail to exit to userspace when the exit
was requested by a fastpath exit handler
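
For reference, the fastpath completion enum gains a dedicated value
for this case (taken from the kvm_host.h change in this pull):

      enum exit_fastpath_completion {
              EXIT_FASTPATH_NONE,
              EXIT_FASTPATH_REENTER_GUEST,
              EXIT_FASTPATH_EXIT_HANDLED,
              EXIT_FASTPATH_EXIT_USERSPACE,   /* new */
      };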

- Add fastpath handling of HLT VM-Exit to expedite re-entering the
guest when there's already a pending wake event at the time of the
exit

- Fix a WARN caused by RSM entering a nested guest from SMM with
invalid guest state, by forcing the vCPU out of guest mode prior to
signalling SHUTDOWN (the SHUTDOWN hits the VM altogether, not the
nested guest)

- Overhaul the "unprotect and retry" logic to more precisely identify
cases where retrying is actually helpful, and to harden all retry
paths against putting the guest into an infinite retry loop
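
One of the new guards, condensed from kvm_mmu_write_protect_fault() in
this pull: a re-fault at the same RIP on the same address that was
just unprotected falls back to emulation instead of retrying forever:

      if (vcpu->arch.last_retry_eip == kvm_rip_read(vcpu) &&
          vcpu->arch.last_retry_addr == cr2_or_gpa)
              return RET_PF_EMULATE;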

- Add support for yielding, e.g. to honor NEED_RESCHED, when zapping
rmaps in the shadow MMU
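
Condensed from the new __walk_slot_rmaps() in this pull, the walker
now optionally yields between rmap entries, flushing any pending TLB
invalidations before dropping mmu_lock:

      if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
              if (flush && flush_on_yield) {
                      kvm_flush_remote_tlbs_range(kvm, start_gfn,
                                                  iterator.gfn - start_gfn + 1);
                      flush = false;
              }
              cond_resched_rwlock_write(&kvm->mmu_lock);
      }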

- Refactor pieces of the shadow MMU related to aging SPTEs in
preparation for adding multi-generational LRU support in KVM

- Don't stuff the RSB after VM-Exit when RETPOLINE=y and AutoIBRS is
enabled, i.e. when the CPU has already flushed the RSB

- Trace the per-CPU host save area as a VMCB pointer to improve
readability and clean up the retrieval of the SEV-ES host save area

- Remove unnecessary accounting of temporary nested VMCB related
allocations

- Set FINAL/PAGE in the page fault error code for EPT violations if
and only if the GVA is valid. If the GVA is NOT valid, there is no
guest-side page table walk and so stuffing paging-related metadata
is nonsensical
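
Roughly, the intended check looks like the below; the
exit-qualification macro names are illustrative placeholders (bits 7
and 8 of the EPT-violation exit qualification report whether a guest
linear address exists and whether the access was to its final
translation):

      #define EPT_EXITQ_GLA_VALID       (1ULL << 7)  /* illustrative name */
      #define EPT_EXITQ_GLA_TRANSLATED  (1ULL << 8)  /* illustrative name */

      if (exit_qualification & EPT_EXITQ_GLA_VALID)
              error_code |= (exit_qualification & EPT_EXITQ_GLA_TRANSLATED) ?
                            PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;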

- Fix a bug where KVM would incorrectly synthesize a nested VM-Exit
instead of emulating posted interrupt delivery to L2

- Add a lockdep assertion to detect unsafe accesses of vmcs12
structures

- Harden eVMCS loading against an impossible NULL pointer deref
(really truly should be impossible)

- Minor SGX fix and a cleanup

- Misc cleanups

Generic:

- Register KVM's cpuhp and syscore callbacks when enabling
virtualization in hardware, as the sole purpose of said callbacks
is to disable and re-enable virtualization as needed

- Enable virtualization when KVM is loaded, not right before the
first VM is created

Together with the previous change, this greatly simplifies the logic
of the callbacks, because their very existence implies that
virtualization is enabled
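
The behavior is controlled by the new kvm.enable_virt_at_load module
parameter (see the kernel-parameters.txt change in this pull); turning
it off retains the on-demand behavior, e.g. for setups where an
out-of-tree hypervisor wants to own VMX/SVM:

      kvm.enable_virt_at_load=0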

- Fix a bug that results in KVM prematurely exiting to userspace for
coalesced MMIO/PIO in many cases, clean up the related code, and
add a testcase

- Fix a bug in kvm_clear_guest() where it would trigger a buffer
overflow _if_ the gpa+len crosses a page boundary, which thankfully
is guaranteed to not happen in the current code base. Add WARNs in
more helpers that read/write guest memory to detect similar bugs
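
These helpers process guest memory one page at a time; a small,
self-contained sketch of that pattern and of the kind of page-boundary
guard being added (illustrative names, not KVM's actual code):

      #include <assert.h>
      #include <stddef.h>

      #define PAGE_SIZE 4096UL

      /* A per-page helper must never be asked to cross a page. */
      static void clear_guest_page(unsigned long gpa, size_t len)
      {
              assert((gpa & (PAGE_SIZE - 1)) + len <= PAGE_SIZE);
              /* ... clear 'len' bytes of guest memory at 'gpa' ... */
      }

      static void clear_guest(unsigned long gpa, size_t len)
      {
              while (len) {
                      /* Per-page segment, capped at the next page boundary. */
                      size_t seg = PAGE_SIZE - (gpa & (PAGE_SIZE - 1));

                      if (seg > len)
                              seg = len;
                      /* Passing the full 'len' here is the kind of bug described above. */
                      clear_guest_page(gpa, seg);
                      gpa += seg;
                      len -= seg;
              }
      }

      int main(void)
      {
              clear_guest(0x1ff8, 16); /* crosses a page boundary: two chunks */
              return 0;
      }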

Selftests:

- Fix a goof that caused some Hyper-V tests to be skipped when run on
bare metal, i.e. NOT in a VM

- Add a regression test for KVM's handling of SHUTDOWN for an SEV-ES
guest

- Explicitly include one-off assets in .gitignore. Past Sean was
completely wrong about not being able to detect missing .gitignore
entries

- Verify userspace single-stepping works when KVM happens to handle a
VM-Exit in its fastpath

- Misc cleanups"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (127 commits)
Documentation: KVM: fix warning in "make htmldocs"
s390: Enable KVM_S390_UCONTROL config in debug_defconfig
selftests: kvm: s390: Add VM run test case
KVM: SVM: let alternatives handle the cases when RSB filling is required
KVM: VMX: Set PFERR_GUEST_{FINAL,PAGE}_MASK if and only if the GVA is valid
KVM: x86/mmu: Use KVM_PAGES_PER_HPAGE() instead of an open coded equivalent
KVM: x86/mmu: Add KVM_RMAP_MANY to replace open coded '1' and '1ul' literals
KVM: x86/mmu: Fold mmu_spte_age() into kvm_rmap_age_gfn_range()
KVM: x86/mmu: Morph kvm_handle_gfn_range() into an aging specific helper
KVM: x86/mmu: Honor NEED_RESCHED when zapping rmaps and blocking is allowed
KVM: x86/mmu: Add a helper to walk and zap rmaps for a memslot
KVM: x86/mmu: Plumb a @can_yield parameter into __walk_slot_rmaps()
KVM: x86/mmu: Move walk_slot_rmaps() up near for_each_slot_rmap_range()
KVM: x86/mmu: WARN on MMIO cache hit when emulating write-protected gfn
KVM: x86/mmu: Detect if unprotect will do anything based on invalid_list
KVM: x86/mmu: Subsume kvm_mmu_unprotect_page() into the and_retry() version
KVM: x86: Rename reexecute_instruction()=>kvm_unprotect_and_retry_on_failure()
KVM: x86: Update retry protection fields when forcing retry on emulation failure
KVM: x86: Apply retry protection to "unprotect on failure" path
KVM: x86: Check EMULTYPE_WRITE_PF_TO_SP before unprotecting gfn
...

+2802 -1451

 +17        Documentation/admin-guide/kernel-parameters.txt
 +27 -4     Documentation/virt/kvm/api.rst
 +24 -8     Documentation/virt/kvm/locking.rst
 +3 -3      arch/arm64/kvm/arm.c
 +2 -2      arch/loongarch/kvm/main.c
 +2 -2      arch/mips/include/asm/kvm_host.h
 +4 -4      arch/mips/kvm/mips.c
 +4 -4      arch/mips/kvm/vz.c
 +2 -2      arch/riscv/kvm/main.c
 +1         arch/s390/configs/debug_defconfig
 +18 -9     arch/s390/kvm/kvm-s390.c
 +1         arch/x86/include/asm/cpuid.h
 +3 -3      arch/x86/include/asm/kvm-x86-ops.h
 +23 -9     arch/x86/include/asm/kvm_host.h
 +20 -14    arch/x86/include/asm/msr-index.h
 +1 -1      arch/x86/include/asm/reboot.h
 +15 -5     arch/x86/include/asm/svm.h
 +30 -10    arch/x86/include/asm/vmx.h
 +1         arch/x86/include/uapi/asm/kvm.h
 +6         arch/x86/kernel/cpu/mtrr/mtrr.c
 +28 -2     arch/x86/kvm/cpuid.c
 +7 -3      arch/x86/kvm/irq.c
 +57 -27    arch/x86/kvm/lapic.c
 +1 -2      arch/x86/kvm/lapic.h
 -2         arch/x86/kvm/mmu.h
 +320 -238  arch/x86/kvm/mmu/mmu.c
 ...
*/ 6310 - typedef bool (*slot_rmaps_handler) (struct kvm *kvm, 6311 - struct kvm_rmap_head *rmap_head, 6312 - const struct kvm_memory_slot *slot); 6313 - 6314 - static __always_inline bool __walk_slot_rmaps(struct kvm *kvm, 6315 - const struct kvm_memory_slot *slot, 6316 - slot_rmaps_handler fn, 6317 - int start_level, int end_level, 6318 - gfn_t start_gfn, gfn_t end_gfn, 6319 - bool flush_on_yield, bool flush) 6320 - { 6321 - struct slot_rmap_walk_iterator iterator; 6322 - 6323 - lockdep_assert_held_write(&kvm->mmu_lock); 6324 - 6325 - for_each_slot_rmap_range(slot, start_level, end_level, start_gfn, 6326 - end_gfn, &iterator) { 6327 - if (iterator.rmap) 6328 - flush |= fn(kvm, iterator.rmap, slot); 6329 - 6330 - if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { 6331 - if (flush && flush_on_yield) { 6332 - kvm_flush_remote_tlbs_range(kvm, start_gfn, 6333 - iterator.gfn - start_gfn + 1); 6334 - flush = false; 6335 - } 6336 - cond_resched_rwlock_write(&kvm->mmu_lock); 6337 - } 6338 - } 6339 - 6340 - return flush; 6341 - } 6342 - 6343 - static __always_inline bool walk_slot_rmaps(struct kvm *kvm, 6344 - const struct kvm_memory_slot *slot, 6345 - slot_rmaps_handler fn, 6346 - int start_level, int end_level, 6347 - bool flush_on_yield) 6348 - { 6349 - return __walk_slot_rmaps(kvm, slot, fn, start_level, end_level, 6350 - slot->base_gfn, slot->base_gfn + slot->npages - 1, 6351 - flush_on_yield, false); 6352 - } 6353 - 6354 - static __always_inline bool walk_slot_rmaps_4k(struct kvm *kvm, 6355 - const struct kvm_memory_slot *slot, 6356 - slot_rmaps_handler fn, 6357 - bool flush_on_yield) 6358 - { 6359 - return walk_slot_rmaps(kvm, slot, fn, PG_LEVEL_4K, PG_LEVEL_4K, flush_on_yield); 6360 - } 6361 6204 6362 6205 static void free_mmu_pages(struct kvm_mmu *mmu) 6363 6206 { ··· 6579 6528 if (WARN_ON_ONCE(start >= end)) 6580 6529 continue; 6581 6530 6582 - flush = __walk_slot_rmaps(kvm, memslot, __kvm_zap_rmap, 6583 - PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, 6584 - start, end - 1, true, flush); 6531 + flush = __kvm_rmap_zap_gfn_range(kvm, memslot, start, 6532 + end, true, flush); 6585 6533 } 6586 6534 } 6587 6535 ··· 6868 6818 */ 6869 6819 for (level = KVM_MAX_HUGEPAGE_LEVEL; level > target_level; level--) 6870 6820 __walk_slot_rmaps(kvm, slot, shadow_mmu_try_split_huge_pages, 6871 - level, level, start, end - 1, true, false); 6821 + level, level, start, end - 1, true, true, false); 6872 6822 } 6873 6823 6874 6824 /* Must be called with the mmu_lock held in write-mode. */ ··· 7047 6997 kvm_mmu_zap_all(kvm); 7048 6998 } 7049 6999 7000 + /* 7001 + * Zapping leaf SPTEs with memslot range when a memslot is moved/deleted. 7002 + * 7003 + * Zapping non-leaf SPTEs, a.k.a. not-last SPTEs, isn't required, worst 7004 + * case scenario we'll have unused shadow pages lying around until they 7005 + * are recycled due to age or when the VM is destroyed. 
7006 + */ 7007 + static void kvm_mmu_zap_memslot_leafs(struct kvm *kvm, struct kvm_memory_slot *slot) 7008 + { 7009 + struct kvm_gfn_range range = { 7010 + .slot = slot, 7011 + .start = slot->base_gfn, 7012 + .end = slot->base_gfn + slot->npages, 7013 + .may_block = true, 7014 + }; 7015 + 7016 + write_lock(&kvm->mmu_lock); 7017 + if (kvm_unmap_gfn_range(kvm, &range)) 7018 + kvm_flush_remote_tlbs_memslot(kvm, slot); 7019 + 7020 + write_unlock(&kvm->mmu_lock); 7021 + } 7022 + 7023 + static inline bool kvm_memslot_flush_zap_all(struct kvm *kvm) 7024 + { 7025 + return kvm->arch.vm_type == KVM_X86_DEFAULT_VM && 7026 + kvm_check_has_quirk(kvm, KVM_X86_QUIRK_SLOT_ZAP_ALL); 7027 + } 7028 + 7050 7029 void kvm_arch_flush_shadow_memslot(struct kvm *kvm, 7051 7030 struct kvm_memory_slot *slot) 7052 7031 { 7053 - kvm_mmu_zap_all_fast(kvm); 7032 + if (kvm_memslot_flush_zap_all(kvm)) 7033 + kvm_mmu_zap_all_fast(kvm); 7034 + else 7035 + kvm_mmu_zap_memslot_leafs(kvm, slot); 7054 7036 } 7055 7037 7056 7038 void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
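The unprotect-and-retry hardening above comes down to remembering the (RIP, fault address) pair of the last forced retry and refusing to retry when the exact same pair faults again, since a repeat almost certainly means the retry cannot make forward progress. A minimal stand-alone sketch of that guard (plain C with made-up state, not KVM's vcpu structures):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    struct vcpu_retry_state {
        uint64_t last_retry_rip;   /* RIP of the last fault that was retried */
        uint64_t last_retry_addr;  /* faulting address of that retry */
    };

    /*
     * Return true if the fault should be retried after unprotecting the page,
     * false if this exact (RIP, address) pair was already retried, in which
     * case retrying again would likely loop forever and emulation is needed.
     */
    static bool should_retry_fault(struct vcpu_retry_state *s,
                                   uint64_t rip, uint64_t addr)
    {
        if (s->last_retry_rip == rip && s->last_retry_addr == addr)
            return false;

        /* Record the pair so a re-fault on the same instruction bails out. */
        s->last_retry_rip = rip;
        s->last_retry_addr = addr;
        return true;
    }

    int main(void)
    {
        struct vcpu_retry_state s = { 0 };

        printf("%d\n", should_retry_fault(&s, 0x1000, 0xdead0000)); /* 1: retry */
        printf("%d\n", should_retry_fault(&s, 0x1000, 0xdead0000)); /* 0: emulate */
        return 0;
    }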
+3 -2
arch/x86/kvm/mmu/mmu_internal.h
··· 258 258 * RET_PF_CONTINUE: So far, so good, keep handling the page fault. 259 259 * RET_PF_RETRY: let CPU fault again on the address. 260 260 * RET_PF_EMULATE: mmio page fault, emulate the instruction directly. 261 + * RET_PF_WRITE_PROTECTED: the gfn is write-protected, either unprotected the 262 + * gfn and retry, or emulate the instruction directly. 261 263 * RET_PF_INVALID: the spte is invalid, let the real page fault path update it. 262 264 * RET_PF_FIXED: The faulting entry has been fixed. 263 265 * RET_PF_SPURIOUS: The faulting entry was already fixed, e.g. by another vCPU. ··· 276 274 RET_PF_CONTINUE = 0, 277 275 RET_PF_RETRY, 278 276 RET_PF_EMULATE, 277 + RET_PF_WRITE_PROTECTED, 279 278 RET_PF_INVALID, 280 279 RET_PF_FIXED, 281 280 RET_PF_SPURIOUS, ··· 351 348 int max_level); 352 349 void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); 353 350 void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level); 354 - 355 - void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc); 356 351 357 352 void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp); 358 353 void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp);
+1
arch/x86/kvm/mmu/mmutrace.h
··· 57 57 TRACE_DEFINE_ENUM(RET_PF_CONTINUE); 58 58 TRACE_DEFINE_ENUM(RET_PF_RETRY); 59 59 TRACE_DEFINE_ENUM(RET_PF_EMULATE); 60 + TRACE_DEFINE_ENUM(RET_PF_WRITE_PROTECTED); 60 61 TRACE_DEFINE_ENUM(RET_PF_INVALID); 61 62 TRACE_DEFINE_ENUM(RET_PF_FIXED); 62 63 TRACE_DEFINE_ENUM(RET_PF_SPURIOUS);
+32 -31
arch/x86/kvm/mmu/paging_tmpl.h
··· 646 646 * really care if it changes underneath us after this point). 647 647 */ 648 648 if (FNAME(gpte_changed)(vcpu, gw, top_level)) 649 - goto out_gpte_changed; 649 + return RET_PF_RETRY; 650 650 651 651 if (WARN_ON_ONCE(!VALID_PAGE(vcpu->arch.mmu->root.hpa))) 652 - goto out_gpte_changed; 652 + return RET_PF_RETRY; 653 653 654 654 /* 655 655 * Load a new root and retry the faulting instruction in the extremely ··· 659 659 */ 660 660 if (unlikely(kvm_mmu_is_dummy_root(vcpu->arch.mmu->root.hpa))) { 661 661 kvm_make_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu); 662 - goto out_gpte_changed; 662 + return RET_PF_RETRY; 663 663 } 664 664 665 665 for_each_shadow_entry(vcpu, fault->addr, it) { ··· 674 674 sp = kvm_mmu_get_child_sp(vcpu, it.sptep, table_gfn, 675 675 false, access); 676 676 677 - if (sp != ERR_PTR(-EEXIST)) { 678 - /* 679 - * We must synchronize the pagetable before linking it 680 - * because the guest doesn't need to flush tlb when 681 - * the gpte is changed from non-present to present. 682 - * Otherwise, the guest may use the wrong mapping. 683 - * 684 - * For PG_LEVEL_4K, kvm_mmu_get_page() has already 685 - * synchronized it transiently via kvm_sync_page(). 686 - * 687 - * For higher level pagetable, we synchronize it via 688 - * the slower mmu_sync_children(). If it needs to 689 - * break, some progress has been made; return 690 - * RET_PF_RETRY and retry on the next #PF. 691 - * KVM_REQ_MMU_SYNC is not necessary but it 692 - * expedites the process. 693 - */ 694 - if (sp->unsync_children && 695 - mmu_sync_children(vcpu, sp, false)) 696 - return RET_PF_RETRY; 697 - } 677 + /* 678 + * Synchronize the new page before linking it, as the CPU (KVM) 679 + * is architecturally disallowed from inserting non-present 680 + * entries into the TLB, i.e. the guest isn't required to flush 681 + * the TLB when changing the gPTE from non-present to present. 682 + * 683 + * For PG_LEVEL_4K, kvm_mmu_find_shadow_page() has already 684 + * synchronized the page via kvm_sync_page(). 685 + * 686 + * For higher level pages, which cannot be unsync themselves 687 + * but can have unsync children, synchronize via the slower 688 + * mmu_sync_children(). If KVM needs to drop mmu_lock due to 689 + * contention or to reschedule, instruct the caller to retry 690 + * the #PF (mmu_sync_children() ensures forward progress will 691 + * be made). 692 + */ 693 + if (sp != ERR_PTR(-EEXIST) && sp->unsync_children && 694 + mmu_sync_children(vcpu, sp, false)) 695 + return RET_PF_RETRY; 698 696 699 697 /* 700 - * Verify that the gpte in the page we've just write 701 - * protected is still there. 698 + * Verify that the gpte in the page, which is now either 699 + * write-protected or unsync, wasn't modified between the fault 700 + * and acquiring mmu_lock. This needs to be done even when 701 + * reusing an existing shadow page to ensure the information 702 + * gathered by the walker matches the information stored in the 703 + * shadow page (which could have been modified by a different 704 + * vCPU even if the page was already linked). Holding mmu_lock 705 + * prevents the shadow page from changing after this point. 
702 706 */ 703 707 if (FNAME(gpte_changed)(vcpu, gw, it.level - 1)) 704 - goto out_gpte_changed; 708 + return RET_PF_RETRY; 705 709 706 710 if (sp != ERR_PTR(-EEXIST)) 707 711 link_shadow_page(vcpu, it.sptep, sp); ··· 759 755 760 756 FNAME(pte_prefetch)(vcpu, gw, it.sptep); 761 757 return ret; 762 - 763 - out_gpte_changed: 764 - return RET_PF_RETRY; 765 758 } 766 759 767 760 /* ··· 806 805 807 806 if (page_fault_handle_page_track(vcpu, fault)) { 808 807 shadow_page_table_clear_flood(vcpu, fault->addr); 809 - return RET_PF_EMULATE; 808 + return RET_PF_WRITE_PROTECTED; 810 809 } 811 810 812 811 r = mmu_topup_memory_caches(vcpu, true);
+2 -4
arch/x86/kvm/mmu/tdp_mmu.c
··· 1046 1046 * protected, emulation is needed. If the emulation was skipped, 1047 1047 * the vCPU would have the same fault again. 1048 1048 */ 1049 - if (wrprot) { 1050 - if (fault->write) 1051 - ret = RET_PF_EMULATE; 1052 - } 1049 + if (wrprot && fault->write) 1050 + ret = RET_PF_WRITE_PROTECTED; 1053 1051 1054 1052 /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */ 1055 1053 if (unlikely(is_mmio_spte(vcpu->kvm, new_spte))) {
+8
arch/x86/kvm/reverse_cpuid.h
··· 17 17 CPUID_8000_0007_EDX, 18 18 CPUID_8000_0022_EAX, 19 19 CPUID_7_2_EDX, 20 + CPUID_24_0_EBX, 20 21 NR_KVM_CPU_CAPS, 21 22 22 23 NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS, ··· 47 46 #define X86_FEATURE_AVX_NE_CONVERT KVM_X86_FEATURE(CPUID_7_1_EDX, 5) 48 47 #define X86_FEATURE_AMX_COMPLEX KVM_X86_FEATURE(CPUID_7_1_EDX, 8) 49 48 #define X86_FEATURE_PREFETCHITI KVM_X86_FEATURE(CPUID_7_1_EDX, 14) 49 + #define X86_FEATURE_AVX10 KVM_X86_FEATURE(CPUID_7_1_EDX, 19) 50 50 51 51 /* Intel-defined sub-features, CPUID level 0x00000007:2 (EDX) */ 52 52 #define X86_FEATURE_INTEL_PSFD KVM_X86_FEATURE(CPUID_7_2_EDX, 0) ··· 56 54 #define X86_FEATURE_DDPD_U KVM_X86_FEATURE(CPUID_7_2_EDX, 3) 57 55 #define KVM_X86_FEATURE_BHI_CTRL KVM_X86_FEATURE(CPUID_7_2_EDX, 4) 58 56 #define X86_FEATURE_MCDT_NO KVM_X86_FEATURE(CPUID_7_2_EDX, 5) 57 + 58 + /* Intel-defined sub-features, CPUID level 0x00000024:0 (EBX) */ 59 + #define X86_FEATURE_AVX10_128 KVM_X86_FEATURE(CPUID_24_0_EBX, 16) 60 + #define X86_FEATURE_AVX10_256 KVM_X86_FEATURE(CPUID_24_0_EBX, 17) 61 + #define X86_FEATURE_AVX10_512 KVM_X86_FEATURE(CPUID_24_0_EBX, 18) 59 62 60 63 /* CPUID level 0x80000007 (EDX). */ 61 64 #define KVM_X86_FEATURE_CONSTANT_TSC KVM_X86_FEATURE(CPUID_8000_0007_EDX, 8) ··· 97 90 [CPUID_8000_0021_EAX] = {0x80000021, 0, CPUID_EAX}, 98 91 [CPUID_8000_0022_EAX] = {0x80000022, 0, CPUID_EAX}, 99 92 [CPUID_7_2_EDX] = { 7, 2, CPUID_EDX}, 93 + [CPUID_24_0_EBX] = { 0x24, 0, CPUID_EBX}, 100 94 }; 101 95 102 96 /*
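Wiring up CPUID.0x24 follows the usual reverse-CPUID pattern: a feature is a (word, bit) pair, and the word indexes a table of (leaf, sub-leaf, output register) triples. A self-contained sketch of that lookup, with simplified types and a hypothetical AVX10_256 define standing in for the real X86_FEATURE_AVX10_256:

    #include <stdint.h>
    #include <stdio.h>

    enum cpuid_reg { REG_EAX, REG_EBX, REG_ECX, REG_EDX };

    struct cpuid_word {
        uint32_t leaf;      /* CPUID function (input EAX) */
        uint32_t subleaf;   /* CPUID index (input ECX) */
        enum cpuid_reg reg; /* output register that holds the feature bits */
    };

    /* Word 0 plays the role of CPUID_24_0_EBX in this sketch. */
    static const struct cpuid_word words[] = {
        { 0x24, 0, REG_EBX },
    };

    /* A feature is (word << 5) | bit, mirroring 32-bit feature words. */
    #define FEATURE(word, bit)  (((word) << 5) | (bit))
    #define AVX10_256           FEATURE(0, 17)   /* hypothetical stand-in name */

    int main(void)
    {
        static const char * const regs[] = { "EAX", "EBX", "ECX", "EDX" };
        unsigned int f = AVX10_256;
        const struct cpuid_word *w = &words[f >> 5];

        printf("CPUID.(EAX=%#x,ECX=%u):%s bit %u\n",
               (unsigned)w->leaf, (unsigned)w->subleaf, regs[w->reg], f & 31);
        return 0;
    }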
+19 -5
arch/x86/kvm/smm.c
··· 624 624 #endif 625 625 626 626 /* 627 - * Give leave_smm() a chance to make ISA-specific changes to the vCPU 628 - * state (e.g. enter guest mode) before loading state from the SMM 629 - * state-save area. 627 + * FIXME: When resuming L2 (a.k.a. guest mode), the transition to guest 628 + * mode should happen _after_ loading state from SMRAM. However, KVM 629 + * piggybacks the nested VM-Enter flows (which is wrong for many other 630 + * reasons), and so nSVM/nVMX would clobber state that is loaded from 631 + * SMRAM and from the VMCS/VMCB. 630 632 */ 631 633 if (kvm_x86_call(leave_smm)(vcpu, &smram)) 632 634 return X86EMUL_UNHANDLEABLE; 633 635 634 636 #ifdef CONFIG_X86_64 635 637 if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) 636 - return rsm_load_state_64(ctxt, &smram.smram64); 638 + ret = rsm_load_state_64(ctxt, &smram.smram64); 637 639 else 638 640 #endif 639 - return rsm_load_state_32(ctxt, &smram.smram32); 641 + ret = rsm_load_state_32(ctxt, &smram.smram32); 642 + 643 + /* 644 + * If RSM fails and triggers shutdown, architecturally the shutdown 645 + * occurs *before* the transition to guest mode. But due to KVM's 646 + * flawed handling of RSM to L2 (see above), the vCPU may already be 647 + * in_guest_mode(). Force the vCPU out of guest mode before delivering 648 + * the shutdown, so that L1 enters shutdown instead of seeing a VM-Exit 649 + * that architecturally shouldn't be possible. 650 + */ 651 + if (ret != X86EMUL_CONTINUE && is_guest_mode(vcpu)) 652 + kvm_leave_nested(vcpu); 653 + return ret; 640 654 }
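The RSM change is about ordering on the failure path: architecturally the shutdown happens before any transition back into guest mode, so if KVM's flow has already entered guest mode, the vCPU must be forced back out before SHUTDOWN is delivered, leaving L1 (not L2) in the shutdown state. A toy model of that error path, with made-up flags rather than KVM's types:

    #include <stdbool.h>
    #include <stdio.h>

    struct vcpu {
        bool guest_mode;  /* running L2 (nested guest) */
        bool shutdown;    /* vCPU has entered the shutdown state */
    };

    static void leave_nested(struct vcpu *v)
    {
        v->guest_mode = false;
    }

    /* 'ok' stands in for whether loading state from SMRAM succeeded. */
    static void emulate_rsm(struct vcpu *v, bool ok)
    {
        if (!ok) {
            /*
             * Shutdown belongs to L1: drop out of guest mode first so L1
             * sees SHUTDOWN rather than an impossible nested VM-Exit.
             */
            if (v->guest_mode)
                leave_nested(v);
            v->shutdown = true;
        }
    }

    int main(void)
    {
        struct vcpu v = { .guest_mode = true };

        emulate_rsm(&v, false);
        printf("guest_mode=%d shutdown=%d\n", v.guest_mode, v.shutdown);
        return 0;
    }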
+2 -2
arch/x86/kvm/svm/nested.c
··· 1693 1693 return -EINVAL; 1694 1694 1695 1695 ret = -ENOMEM; 1696 - ctl = kzalloc(sizeof(*ctl), GFP_KERNEL_ACCOUNT); 1697 - save = kzalloc(sizeof(*save), GFP_KERNEL_ACCOUNT); 1696 + ctl = kzalloc(sizeof(*ctl), GFP_KERNEL); 1697 + save = kzalloc(sizeof(*save), GFP_KERNEL); 1698 1698 if (!ctl || !save) 1699 1699 goto out_free; 1700 1700
+49 -38
arch/x86/kvm/svm/svm.c
··· 573 573 574 574 static __always_inline struct sev_es_save_area *sev_es_host_save_area(struct svm_cpu_data *sd) 575 575 { 576 - return page_address(sd->save_area) + 0x400; 576 + return &sd->save_area->host_sev_es_save; 577 577 } 578 578 579 579 static inline void kvm_cpu_svm_disable(void) ··· 592 592 } 593 593 } 594 594 595 - static void svm_emergency_disable(void) 595 + static void svm_emergency_disable_virtualization_cpu(void) 596 596 { 597 597 kvm_rebooting = true; 598 598 599 599 kvm_cpu_svm_disable(); 600 600 } 601 601 602 - static void svm_hardware_disable(void) 602 + static void svm_disable_virtualization_cpu(void) 603 603 { 604 604 /* Make sure we clean up behind us */ 605 605 if (tsc_scaling) ··· 610 610 amd_pmu_disable_virt(); 611 611 } 612 612 613 - static int svm_hardware_enable(void) 613 + static int svm_enable_virtualization_cpu(void) 614 614 { 615 615 616 616 struct svm_cpu_data *sd; ··· 696 696 return; 697 697 698 698 kfree(sd->sev_vmcbs); 699 - __free_page(sd->save_area); 699 + __free_page(__sme_pa_to_page(sd->save_area_pa)); 700 700 sd->save_area_pa = 0; 701 701 sd->save_area = NULL; 702 702 } ··· 704 704 static int svm_cpu_init(int cpu) 705 705 { 706 706 struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu); 707 + struct page *save_area_page; 707 708 int ret = -ENOMEM; 708 709 709 710 memset(sd, 0, sizeof(struct svm_cpu_data)); 710 - sd->save_area = snp_safe_alloc_page_node(cpu_to_node(cpu), GFP_KERNEL); 711 - if (!sd->save_area) 711 + save_area_page = snp_safe_alloc_page_node(cpu_to_node(cpu), GFP_KERNEL); 712 + if (!save_area_page) 712 713 return ret; 713 714 714 715 ret = sev_cpu_init(sd); 715 716 if (ret) 716 717 goto free_save_area; 717 718 718 - sd->save_area_pa = __sme_page_pa(sd->save_area); 719 + sd->save_area = page_address(save_area_page); 720 + sd->save_area_pa = __sme_page_pa(save_area_page); 719 721 return 0; 720 722 721 723 free_save_area: 722 - __free_page(sd->save_area); 723 - sd->save_area = NULL; 724 + __free_page(save_area_page); 724 725 return ret; 725 726 726 727 } ··· 1125 1124 for_each_possible_cpu(cpu) 1126 1125 svm_cpu_uninit(cpu); 1127 1126 1128 - __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), 1129 - get_order(IOPM_SIZE)); 1127 + __free_pages(__sme_pa_to_page(iopm_base), get_order(IOPM_SIZE)); 1130 1128 iopm_base = 0; 1131 1129 } 1132 1130 ··· 1301 1301 if (!kvm_hlt_in_guest(vcpu->kvm)) 1302 1302 svm_set_intercept(svm, INTERCEPT_HLT); 1303 1303 1304 - control->iopm_base_pa = __sme_set(iopm_base); 1304 + control->iopm_base_pa = iopm_base; 1305 1305 control->msrpm_base_pa = __sme_set(__pa(svm->msrpm)); 1306 1306 control->int_ctl = V_INTR_MASKING_MASK; 1307 1307 ··· 1503 1503 1504 1504 sev_free_vcpu(vcpu); 1505 1505 1506 - __free_page(pfn_to_page(__sme_clr(svm->vmcb01.pa) >> PAGE_SHIFT)); 1506 + __free_page(__sme_pa_to_page(svm->vmcb01.pa)); 1507 1507 __free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE)); 1508 1508 } 1509 1509 ··· 1533 1533 * TSC_AUX is always virtualized for SEV-ES guests when the feature is 1534 1534 * available. The user return MSR support is not required in this case 1535 1535 * because TSC_AUX is restored on #VMEXIT from the host save area 1536 - * (which has been initialized in svm_hardware_enable()). 1536 + * (which has been initialized in svm_enable_virtualization_cpu()). 
1537 1537 */ 1538 1538 if (likely(tsc_aux_uret_slot >= 0) && 1539 1539 (!boot_cpu_has(X86_FEATURE_V_TSC_AUX) || !sev_es_guest(vcpu->kvm))) ··· 2825 2825 return kvm_complete_insn_gp(vcpu, ret); 2826 2826 } 2827 2827 2828 - static int svm_get_msr_feature(struct kvm_msr_entry *msr) 2828 + static int svm_get_feature_msr(u32 msr, u64 *data) 2829 2829 { 2830 - msr->data = 0; 2830 + *data = 0; 2831 2831 2832 - switch (msr->index) { 2832 + switch (msr) { 2833 2833 case MSR_AMD64_DE_CFG: 2834 2834 if (cpu_feature_enabled(X86_FEATURE_LFENCE_RDTSC)) 2835 - msr->data |= MSR_AMD64_DE_CFG_LFENCE_SERIALIZE; 2835 + *data |= MSR_AMD64_DE_CFG_LFENCE_SERIALIZE; 2836 2836 break; 2837 2837 default: 2838 - return KVM_MSR_RET_INVALID; 2838 + return KVM_MSR_RET_UNSUPPORTED; 2839 2839 } 2840 2840 2841 2841 return 0; ··· 3144 3144 * feature is available. The user return MSR support is not 3145 3145 * required in this case because TSC_AUX is restored on #VMEXIT 3146 3146 * from the host save area (which has been initialized in 3147 - * svm_hardware_enable()). 3147 + * svm_enable_virtualization_cpu()). 3148 3148 */ 3149 3149 if (boot_cpu_has(X86_FEATURE_V_TSC_AUX) && sev_es_guest(vcpu->kvm)) 3150 3150 break; ··· 3191 3191 kvm_pr_unimpl_wrmsr(vcpu, ecx, data); 3192 3192 break; 3193 3193 case MSR_AMD64_DE_CFG: { 3194 - struct kvm_msr_entry msr_entry; 3194 + u64 supported_de_cfg; 3195 3195 3196 - msr_entry.index = msr->index; 3197 - if (svm_get_msr_feature(&msr_entry)) 3196 + if (svm_get_feature_msr(ecx, &supported_de_cfg)) 3198 3197 return 1; 3199 3198 3200 - /* Check the supported bits */ 3201 - if (data & ~msr_entry.data) 3199 + if (data & ~supported_de_cfg) 3202 3200 return 1; 3203 3201 3204 - /* Don't allow the guest to change a bit, #GP */ 3205 - if (!msr->host_initiated && (data ^ msr_entry.data)) 3202 + /* 3203 + * Don't let the guest change the host-programmed value. The 3204 + * MSR is very model specific, i.e. contains multiple bits that 3205 + * are completely unknown to KVM, and the one bit known to KVM 3206 + * is simply a reflection of hardware capabilities. 
3207 + */ 3208 + if (!msr->host_initiated && data != svm->msr_decfg) 3206 3209 return 1; 3207 3210 3208 3211 svm->msr_decfg = data; ··· 4159 4156 4160 4157 static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) 4161 4158 { 4159 + struct vcpu_svm *svm = to_svm(vcpu); 4160 + 4162 4161 if (is_guest_mode(vcpu)) 4163 4162 return EXIT_FASTPATH_NONE; 4164 4163 4165 - if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR && 4166 - to_svm(vcpu)->vmcb->control.exit_info_1) 4164 + switch (svm->vmcb->control.exit_code) { 4165 + case SVM_EXIT_MSR: 4166 + if (!svm->vmcb->control.exit_info_1) 4167 + break; 4167 4168 return handle_fastpath_set_msr_irqoff(vcpu); 4169 + case SVM_EXIT_HLT: 4170 + return handle_fastpath_hlt(vcpu); 4171 + default: 4172 + break; 4173 + } 4168 4174 4169 4175 return EXIT_FASTPATH_NONE; 4170 4176 } ··· 5004 4992 .check_processor_compatibility = svm_check_processor_compat, 5005 4993 5006 4994 .hardware_unsetup = svm_hardware_unsetup, 5007 - .hardware_enable = svm_hardware_enable, 5008 - .hardware_disable = svm_hardware_disable, 4995 + .enable_virtualization_cpu = svm_enable_virtualization_cpu, 4996 + .disable_virtualization_cpu = svm_disable_virtualization_cpu, 4997 + .emergency_disable_virtualization_cpu = svm_emergency_disable_virtualization_cpu, 5009 4998 .has_emulated_msr = svm_has_emulated_msr, 5010 4999 5011 5000 .vcpu_create = svm_vcpu_create, ··· 5024 5011 .vcpu_unblocking = avic_vcpu_unblocking, 5025 5012 5026 5013 .update_exception_bitmap = svm_update_exception_bitmap, 5027 - .get_msr_feature = svm_get_msr_feature, 5014 + .get_feature_msr = svm_get_feature_msr, 5028 5015 .get_msr = svm_get_msr, 5029 5016 .set_msr = svm_set_msr, 5030 5017 .get_segment_base = svm_get_segment_base, ··· 5075 5062 .enable_nmi_window = svm_enable_nmi_window, 5076 5063 .enable_irq_window = svm_enable_irq_window, 5077 5064 .update_cr8_intercept = svm_update_cr8_intercept, 5065 + 5066 + .x2apic_icr_is_split = true, 5078 5067 .set_virtual_apic_mode = avic_refresh_virtual_apic_mode, 5079 5068 .refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl, 5080 5069 .apicv_post_state_restore = avic_apicv_post_state_restore, ··· 5281 5266 5282 5267 iopm_va = page_address(iopm_pages); 5283 5268 memset(iopm_va, 0xff, PAGE_SIZE * (1 << order)); 5284 - iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT; 5269 + iopm_base = __sme_page_pa(iopm_pages); 5285 5270 5286 5271 init_msrpm_offsets(); 5287 5272 ··· 5440 5425 static void __svm_exit(void) 5441 5426 { 5442 5427 kvm_x86_vendor_exit(); 5443 - 5444 - cpu_emergency_unregister_virt_callback(svm_emergency_disable); 5445 5428 } 5446 5429 5447 5430 static int __init svm_init(void) ··· 5454 5441 r = kvm_x86_vendor_init(&svm_init_ops); 5455 5442 if (r) 5456 5443 return r; 5457 - 5458 - cpu_emergency_register_virt_callback(svm_emergency_disable); 5459 5444 5460 5445 /* 5461 5446 * Common KVM initialization _must_ come last, after this, /dev/kvm is
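The MSR_AMD64_DE_CFG handling above boils down to two checks: reject any bits the host does not advertise as supported, and reject guest (non-host-initiated) writes that change the host-programmed value. A compact stand-alone sketch of that policy, using hypothetical names:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define DE_CFG_LFENCE_SERIALIZE  (1ULL << 1)  /* the one bit KVM knows about */

    struct vcpu_state {
        uint64_t msr_decfg;  /* value the host/userspace programmed */
    };

    static int write_de_cfg(struct vcpu_state *v, uint64_t data, bool host_initiated)
    {
        const uint64_t supported = DE_CFG_LFENCE_SERIALIZE;

        /* Never accept bits that aren't advertised as supported. */
        if (data & ~supported)
            return 1;

        /* The guest may read the MSR, but must not change the host's value. */
        if (!host_initiated && data != v->msr_decfg)
            return 1;

        v->msr_decfg = data;
        return 0;
    }

    int main(void)
    {
        struct vcpu_state v = { .msr_decfg = DE_CFG_LFENCE_SERIALIZE };

        printf("%d\n", write_de_cfg(&v, DE_CFG_LFENCE_SERIALIZE, false)); /* 0: same value */
        printf("%d\n", write_de_cfg(&v, 0, false));                       /* 1: guest changing it */
        printf("%d\n", write_de_cfg(&v, 0, true));                        /* 0: host re-programming */
        return 0;
    }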
+16 -2
arch/x86/kvm/svm/svm.h
··· 25 25 #include "cpuid.h" 26 26 #include "kvm_cache_regs.h" 27 27 28 - #define __sme_page_pa(x) __sme_set(page_to_pfn(x) << PAGE_SHIFT) 28 + /* 29 + * Helpers to convert to/from physical addresses for pages whose address is 30 + * consumed directly by hardware. Even though it's a physical address, SVM 31 + * often restricts the address to the natural width, hence 'unsigned long' 32 + * instead of 'hpa_t'. 33 + */ 34 + static inline unsigned long __sme_page_pa(struct page *page) 35 + { 36 + return __sme_set(page_to_pfn(page) << PAGE_SHIFT); 37 + } 38 + 39 + static inline struct page *__sme_pa_to_page(unsigned long pa) 40 + { 41 + return pfn_to_page(__sme_clr(pa) >> PAGE_SHIFT); 42 + } 29 43 30 44 #define IOPM_SIZE PAGE_SIZE * 3 31 45 #define MSRPM_SIZE PAGE_SIZE * 2 ··· 335 321 u32 next_asid; 336 322 u32 min_asid; 337 323 338 - struct page *save_area; 324 + struct vmcb *save_area; 339 325 unsigned long save_area_pa; 340 326 341 327 struct vmcb *current_vmcb;
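Turning the __sme_page_pa() macro into typed inline helpers makes the struct page <-> SME-tagged physical address round trip type-checked by the compiler. A user-space model of the same encode/decode, with a toy struct page and an assumed C-bit position purely for illustration:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_SHIFT  12
    #define SME_C_BIT   (1ULL << 47)  /* assumed C-bit position, illustration only */

    struct page { uint64_t pfn; };    /* toy struct page: just a page frame number */

    /* page -> physical address as consumed by hardware, with the C-bit set */
    static inline uint64_t sme_page_pa(const struct page *page)
    {
        return (page->pfn << PAGE_SHIFT) | SME_C_BIT;
    }

    /* hardware physical address -> page, stripping the C-bit first */
    static inline struct page sme_pa_to_page(uint64_t pa)
    {
        struct page p = { .pfn = (pa & ~SME_C_BIT) >> PAGE_SHIFT };
        return p;
    }

    int main(void)
    {
        struct page p = { .pfn = 0x1234 };
        uint64_t pa = sme_page_pa(&p);

        assert(sme_pa_to_page(pa).pfn == p.pfn);
        printf("pa=%#llx pfn=%#llx\n", (unsigned long long)pa,
               (unsigned long long)sme_pa_to_page(pa).pfn);
        return 0;
    }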
+2 -6
arch/x86/kvm/svm/vmenter.S
··· 209 209 7: vmload %_ASM_AX 210 210 8: 211 211 212 - #ifdef CONFIG_MITIGATION_RETPOLINE 213 212 /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */ 214 - FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE 215 - #endif 213 + FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_VMEXIT 216 214 217 215 /* Clobbers RAX, RCX, RDX. */ 218 216 RESTORE_HOST_SPEC_CTRL ··· 346 348 347 349 2: cli 348 350 349 - #ifdef CONFIG_MITIGATION_RETPOLINE 350 351 /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */ 351 - FILL_RETURN_BUFFER %rax, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE 352 - #endif 352 + FILL_RETURN_BUFFER %rax, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_VMEXIT 353 353 354 354 /* Clobbers RAX, RCX, RDX, consumes RDI (@svm) and RSI (@spec_ctrl_intercepted). */ 355 355 RESTORE_HOST_SPEC_CTRL
+4 -6
arch/x86/kvm/vmx/capabilities.h
··· 54 54 }; 55 55 56 56 struct vmcs_config { 57 - int size; 58 - u32 basic_cap; 59 - u32 revision_id; 57 + u64 basic; 60 58 u32 pin_based_exec_ctrl; 61 59 u32 cpu_based_exec_ctrl; 62 60 u32 cpu_based_2nd_exec_ctrl; ··· 74 76 75 77 static inline bool cpu_has_vmx_basic_inout(void) 76 78 { 77 - return (((u64)vmcs_config.basic_cap << 32) & VMX_BASIC_INOUT); 79 + return vmcs_config.basic & VMX_BASIC_INOUT; 78 80 } 79 81 80 82 static inline bool cpu_has_virtual_nmis(void) ··· 223 225 static inline bool cpu_has_vmx_shadow_vmcs(void) 224 226 { 225 227 /* check if the cpu supports writing r/o exit information fields */ 226 - if (!(vmcs_config.misc & MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS)) 228 + if (!(vmcs_config.misc & VMX_MISC_VMWRITE_SHADOW_RO_FIELDS)) 227 229 return false; 228 230 229 231 return vmcs_config.cpu_based_2nd_exec_ctrl & ··· 365 367 366 368 static inline bool cpu_has_vmx_intel_pt(void) 367 369 { 368 - return (vmcs_config.misc & MSR_IA32_VMX_MISC_INTEL_PT) && 370 + return (vmcs_config.misc & VMX_MISC_INTEL_PT) && 369 371 (vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_PT_USE_GPA) && 370 372 (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_RTIT_CTL); 371 373 }
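Keeping the raw MSR_IA32_VMX_BASIC value in vmcs_config.basic and extracting fields on demand replaces the old size/basic_cap/revision_id trio. Per the SDM, the revision ID lives in bits 30:0, the VMCS region size in bits 44:32, the memory type in bits 53:50, and INS/OUTS exit-information reporting in bit 54. A small decoder with illustrative (non-kernel) helper names:

    #include <stdint.h>
    #include <stdio.h>

    static inline uint32_t basic_vmcs_revision_id(uint64_t basic)
    {
        return basic & 0x7fffffff;       /* bits 30:0 */
    }

    static inline uint32_t basic_vmcs_size(uint64_t basic)
    {
        return (basic >> 32) & 0x1fff;   /* bits 44:32 */
    }

    static inline uint32_t basic_vmcs_mem_type(uint64_t basic)
    {
        return (basic >> 50) & 0xf;      /* bits 53:50, 6 == write-back */
    }

    static inline int basic_has_inout(uint64_t basic)
    {
        return !!(basic & (1ULL << 54)); /* INS/OUTS exit info reporting */
    }

    int main(void)
    {
        /* Synthetic value: revision 0x12, 4KiB VMCS, WB memory type, INOUT set. */
        uint64_t basic = 0x12ULL | (0x1000ULL << 32) | (6ULL << 50) | (1ULL << 54);

        printf("rev=%#x size=%u memtype=%u inout=%d\n",
               (unsigned)basic_vmcs_revision_id(basic),
               (unsigned)basic_vmcs_size(basic),
               (unsigned)basic_vmcs_mem_type(basic), basic_has_inout(basic));
        return 0;
    }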
+7 -3
arch/x86/kvm/vmx/main.c
··· 23 23 24 24 .hardware_unsetup = vmx_hardware_unsetup, 25 25 26 - .hardware_enable = vmx_hardware_enable, 27 - .hardware_disable = vmx_hardware_disable, 26 + .enable_virtualization_cpu = vmx_enable_virtualization_cpu, 27 + .disable_virtualization_cpu = vmx_disable_virtualization_cpu, 28 + .emergency_disable_virtualization_cpu = vmx_emergency_disable_virtualization_cpu, 29 + 28 30 .has_emulated_msr = vmx_has_emulated_msr, 29 31 30 32 .vm_size = sizeof(struct kvm_vmx), ··· 43 41 .vcpu_put = vmx_vcpu_put, 44 42 45 43 .update_exception_bitmap = vmx_update_exception_bitmap, 46 - .get_msr_feature = vmx_get_msr_feature, 44 + .get_feature_msr = vmx_get_feature_msr, 47 45 .get_msr = vmx_get_msr, 48 46 .set_msr = vmx_set_msr, 49 47 .get_segment_base = vmx_get_segment_base, ··· 91 89 .enable_nmi_window = vmx_enable_nmi_window, 92 90 .enable_irq_window = vmx_enable_irq_window, 93 91 .update_cr8_intercept = vmx_update_cr8_intercept, 92 + 93 + .x2apic_icr_is_split = false, 94 94 .set_virtual_apic_mode = vmx_set_virtual_apic_mode, 95 95 .set_apic_access_page_addr = vmx_set_apic_access_page_addr, 96 96 .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
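The new x2apic_icr_is_split hook records the hardware difference described in the summary: AMD's x2AVIC keeps the legacy ICR/ICR2 split (low and high 32 bits in separate registers), whereas Intel's APICv stores the full 64-bit value at the ICR offset. The conversion between the two layouts is simple bit splicing, as in this stand-alone sketch:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Split-format storage: legacy ICR (low 32 bits) and ICR2 (destination). */
    struct split_icr {
        uint32_t icr;
        uint32_t icr2;
    };

    static uint64_t icr_from_split(struct split_icr s)
    {
        return ((uint64_t)s.icr2 << 32) | s.icr;
    }

    static struct split_icr icr_to_split(uint64_t val)
    {
        struct split_icr s = { .icr = (uint32_t)val, .icr2 = (uint32_t)(val >> 32) };
        return s;
    }

    int main(void)
    {
        /* Destination 0xab in the high half, vector 0xfd in the low half. */
        uint64_t icr = 0x000000ab000040fdULL;
        struct split_icr s = icr_to_split(icr);

        assert(icr_from_split(s) == icr);
        printf("icr=%#llx low=%#x high=%#x\n",
               (unsigned long long)icr, (unsigned)s.icr, (unsigned)s.icr2);
        return 0;
    }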
+96 -38
arch/x86/kvm/vmx/nested.c
··· 981 981 __func__, i, e.index, e.reserved); 982 982 goto fail; 983 983 } 984 - if (kvm_set_msr(vcpu, e.index, e.value)) { 984 + if (kvm_set_msr_with_filter(vcpu, e.index, e.value)) { 985 985 pr_debug_ratelimited( 986 986 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", 987 987 __func__, i, e.index, e.value); ··· 1017 1017 } 1018 1018 } 1019 1019 1020 - if (kvm_get_msr(vcpu, msr_index, data)) { 1020 + if (kvm_get_msr_with_filter(vcpu, msr_index, data)) { 1021 1021 pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__, 1022 1022 msr_index); 1023 1023 return false; ··· 1112 1112 /* 1113 1113 * Emulated VMEntry does not fail here. Instead a less 1114 1114 * accurate value will be returned by 1115 - * nested_vmx_get_vmexit_msr_value() using kvm_get_msr() 1116 - * instead of reading the value from the vmcs02 VMExit 1117 - * MSR-store area. 1115 + * nested_vmx_get_vmexit_msr_value() by reading KVM's 1116 + * internal MSR state instead of reading the value from 1117 + * the vmcs02 VMExit MSR-store area. 1118 1118 */ 1119 1119 pr_warn_ratelimited( 1120 1120 "Not enough msr entries in msr_autostore. Can't add msr %x\n", ··· 1251 1251 1252 1252 static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data) 1253 1253 { 1254 - const u64 feature_and_reserved = 1255 - /* feature (except bit 48; see below) */ 1256 - BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) | 1257 - /* reserved */ 1258 - BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56); 1254 + const u64 feature_bits = VMX_BASIC_DUAL_MONITOR_TREATMENT | 1255 + VMX_BASIC_INOUT | 1256 + VMX_BASIC_TRUE_CTLS; 1257 + 1258 + const u64 reserved_bits = GENMASK_ULL(63, 56) | 1259 + GENMASK_ULL(47, 45) | 1260 + BIT_ULL(31); 1261 + 1259 1262 u64 vmx_basic = vmcs_config.nested.basic; 1260 1263 1261 - if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved)) 1264 + BUILD_BUG_ON(feature_bits & reserved_bits); 1265 + 1266 + /* 1267 + * Except for 32BIT_PHYS_ADDR_ONLY, which is an anti-feature bit (has 1268 + * inverted polarity), the incoming value must not set feature bits or 1269 + * reserved bits that aren't allowed/supported by KVM. Fields, i.e. 1270 + * multi-bit values, are explicitly checked below. 1271 + */ 1272 + if (!is_bitwise_subset(vmx_basic, data, feature_bits | reserved_bits)) 1262 1273 return -EINVAL; 1263 1274 1264 1275 /* 1265 1276 * KVM does not emulate a version of VMX that constrains physical 1266 1277 * addresses of VMX structures (e.g. VMCS) to 32-bits. 
1267 1278 */ 1268 - if (data & BIT_ULL(48)) 1279 + if (data & VMX_BASIC_32BIT_PHYS_ADDR_ONLY) 1269 1280 return -EINVAL; 1270 1281 1271 1282 if (vmx_basic_vmcs_revision_id(vmx_basic) != ··· 1345 1334 1346 1335 static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data) 1347 1336 { 1348 - const u64 feature_and_reserved_bits = 1349 - /* feature */ 1350 - BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) | 1351 - BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) | 1352 - /* reserved */ 1353 - GENMASK_ULL(13, 9) | BIT_ULL(31); 1337 + const u64 feature_bits = VMX_MISC_SAVE_EFER_LMA | 1338 + VMX_MISC_ACTIVITY_HLT | 1339 + VMX_MISC_ACTIVITY_SHUTDOWN | 1340 + VMX_MISC_ACTIVITY_WAIT_SIPI | 1341 + VMX_MISC_INTEL_PT | 1342 + VMX_MISC_RDMSR_IN_SMM | 1343 + VMX_MISC_VMWRITE_SHADOW_RO_FIELDS | 1344 + VMX_MISC_VMXOFF_BLOCK_SMI | 1345 + VMX_MISC_ZERO_LEN_INS; 1346 + 1347 + const u64 reserved_bits = BIT_ULL(31) | GENMASK_ULL(13, 9); 1348 + 1354 1349 u64 vmx_misc = vmx_control_msr(vmcs_config.nested.misc_low, 1355 1350 vmcs_config.nested.misc_high); 1356 1351 1357 - if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits)) 1352 + BUILD_BUG_ON(feature_bits & reserved_bits); 1353 + 1354 + /* 1355 + * The incoming value must not set feature bits or reserved bits that 1356 + * aren't allowed/supported by KVM. Fields, i.e. multi-bit values, are 1357 + * explicitly checked below. 1358 + */ 1359 + if (!is_bitwise_subset(vmx_misc, data, feature_bits | reserved_bits)) 1358 1360 return -EINVAL; 1359 1361 1360 1362 if ((vmx->nested.msrs.pinbased_ctls_high & ··· 2341 2317 2342 2318 /* Posted interrupts setting is only taken from vmcs12. */ 2343 2319 vmx->nested.pi_pending = false; 2344 - if (nested_cpu_has_posted_intr(vmcs12)) 2320 + if (nested_cpu_has_posted_intr(vmcs12)) { 2345 2321 vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; 2346 - else 2322 + } else { 2323 + vmx->nested.posted_intr_nv = -1; 2347 2324 exec_control &= ~PIN_BASED_POSTED_INTR; 2325 + } 2348 2326 pin_controls_set(vmx, exec_control); 2349 2327 2350 2328 /* ··· 2496 2470 2497 2471 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & 2498 2472 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) { 2473 + 2499 2474 vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); 2500 2475 vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); 2501 2476 vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector); ··· 2534 2507 vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); 2535 2508 vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); 2536 2509 2537 - vmx->segment_cache.bitmask = 0; 2510 + vmx_segment_cache_clear(vmx); 2538 2511 } 2539 2512 2540 2513 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & ··· 4311 4284 } 4312 4285 4313 4286 if (kvm_cpu_has_interrupt(vcpu) && !vmx_interrupt_blocked(vcpu)) { 4287 + int irq; 4288 + 4314 4289 if (block_nested_events) 4315 4290 return -EBUSY; 4316 4291 if (!nested_exit_on_intr(vcpu)) 4317 4292 goto no_vmexit; 4318 - nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); 4293 + 4294 + if (!nested_exit_intr_ack_set(vcpu)) { 4295 + nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); 4296 + return 0; 4297 + } 4298 + 4299 + irq = kvm_cpu_get_extint(vcpu); 4300 + if (irq != -1) { 4301 + nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 4302 + INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | irq, 0); 4303 + return 0; 4304 + } 4305 + 4306 + irq = kvm_apic_has_interrupt(vcpu); 4307 + if (WARN_ON_ONCE(irq < 0)) 4308 + goto no_vmexit; 4309 + 4310 + /* 4311 + * If the IRQ is L2's PI notification 
vector, process posted 4312 + * interrupts for L2 instead of injecting VM-Exit, as the 4313 + * detection/morphing architecturally occurs when the IRQ is 4314 + * delivered to the CPU. Note, only interrupts that are routed 4315 + * through the local APIC trigger posted interrupt processing, 4316 + * and enabling posted interrupts requires ACK-on-exit. 4317 + */ 4318 + if (irq == vmx->nested.posted_intr_nv) { 4319 + vmx->nested.pi_pending = true; 4320 + kvm_apic_clear_irr(vcpu, irq); 4321 + goto no_vmexit; 4322 + } 4323 + 4324 + nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 4325 + INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | irq, 0); 4326 + 4327 + /* 4328 + * ACK the interrupt _after_ emulating VM-Exit, as the IRQ must 4329 + * be marked as in-service in vmcs01.GUEST_INTERRUPT_STATUS.SVI 4330 + * if APICv is active. 4331 + */ 4332 + kvm_apic_ack_interrupt(vcpu, irq); 4319 4333 return 0; 4320 4334 } 4321 4335 ··· 4874 4806 goto vmabort; 4875 4807 } 4876 4808 4877 - if (kvm_set_msr(vcpu, h.index, h.value)) { 4809 + if (kvm_set_msr_with_filter(vcpu, h.index, h.value)) { 4878 4810 pr_debug_ratelimited( 4879 4811 "%s WRMSR failed (%u, 0x%x, 0x%llx)\n", 4880 4812 __func__, j, h.index, h.value); ··· 5037 4969 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 5038 4970 5039 4971 if (likely(!vmx->fail)) { 5040 - if ((u16)vm_exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT && 5041 - nested_exit_intr_ack_set(vcpu)) { 5042 - int irq = kvm_cpu_get_interrupt(vcpu); 5043 - WARN_ON(irq < 0); 5044 - vmcs12->vm_exit_intr_info = irq | 5045 - INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR; 5046 - } 5047 - 5048 4972 if (vm_exit_reason != -1) 5049 4973 trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, 5050 4974 vmcs12->exit_qualification, ··· 7111 7051 { 7112 7052 msrs->misc_low = (u32)vmcs_conf->misc & VMX_MISC_SAVE_EFER_LMA; 7113 7053 msrs->misc_low |= 7114 - MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS | 7054 + VMX_MISC_VMWRITE_SHADOW_RO_FIELDS | 7115 7055 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | 7116 7056 VMX_MISC_ACTIVITY_HLT | 7117 7057 VMX_MISC_ACTIVITY_WAIT_SIPI; ··· 7126 7066 * guest, and the VMCS structure we give it - not about the 7127 7067 * VMX support of the underlying hardware. 7128 7068 */ 7129 - msrs->basic = 7130 - VMCS12_REVISION | 7131 - VMX_BASIC_TRUE_CTLS | 7132 - ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) | 7133 - (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT); 7069 + msrs->basic = vmx_basic_encode_vmcs_info(VMCS12_REVISION, VMCS12_SIZE, 7070 + X86_MEMTYPE_WB); 7134 7071 7072 + msrs->basic |= VMX_BASIC_TRUE_CTLS; 7135 7073 if (cpu_has_vmx_basic_inout()) 7136 7074 msrs->basic |= VMX_BASIC_INOUT; 7137 7075 }
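The vmx_restore_vmx_basic()/vmx_restore_vmx_misc() rework spells out the same rule in both helpers: userspace may clear feature bits relative to what KVM reports, but must never set feature or reserved bits that KVM does not. A self-contained sketch of that subset check, with hypothetical masks:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /*
     * Return true if, within 'mask', every bit set in 'subset' is also set in
     * 'superset', i.e. userspace only ever clears capabilities.
     */
    static bool is_bitwise_subset(uint64_t superset, uint64_t subset, uint64_t mask)
    {
        return !(subset & ~superset & mask);
    }

    #define FEATURE_BITS   0x00f0ULL  /* bits userspace may clear */
    #define RESERVED_BITS  0xff00ULL  /* bits that must never be set */

    static int restore_msr(uint64_t kvm_value, uint64_t user_value)
    {
        /* Feature and reserved bits must not overlap, by construction. */
        _Static_assert((FEATURE_BITS & RESERVED_BITS) == 0, "masks overlap");

        if (!is_bitwise_subset(kvm_value, user_value, FEATURE_BITS | RESERVED_BITS))
            return -1;
        return 0;
    }

    int main(void)
    {
        uint64_t kvm_value = 0x0070;  /* KVM advertises three feature bits */

        printf("%d\n", restore_msr(kvm_value, 0x0010)); /*  0: clearing bits is fine */
        printf("%d\n", restore_msr(kvm_value, 0x0080)); /* -1: unsupported feature bit */
        printf("%d\n", restore_msr(kvm_value, 0x0100)); /* -1: reserved bit set */
        return 0;
    }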
+7 -1
arch/x86/kvm/vmx/nested.h
··· 39 39 40 40 static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu) 41 41 { 42 + lockdep_assert_once(lockdep_is_held(&vcpu->mutex) || 43 + !refcount_read(&vcpu->kvm->users_count)); 44 + 42 45 return to_vmx(vcpu)->nested.cached_vmcs12; 43 46 } 44 47 45 48 static inline struct vmcs12 *get_shadow_vmcs12(struct kvm_vcpu *vcpu) 46 49 { 50 + lockdep_assert_once(lockdep_is_held(&vcpu->mutex) || 51 + !refcount_read(&vcpu->kvm->users_count)); 52 + 47 53 return to_vmx(vcpu)->nested.cached_shadow_vmcs12; 48 54 } 49 55 ··· 115 109 static inline bool nested_cpu_has_vmwrite_any_field(struct kvm_vcpu *vcpu) 116 110 { 117 111 return to_vmx(vcpu)->nested.msrs.misc_low & 118 - MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS; 112 + VMX_MISC_VMWRITE_SHADOW_RO_FIELDS; 119 113 } 120 114 121 115 static inline bool nested_cpu_has_zero_length_injection(struct kvm_vcpu *vcpu)
+1 -1
arch/x86/kvm/vmx/sgx.c
··· 274 274 * simultaneously set SGX_ATTR_PROVISIONKEY to bypass the check to 275 275 * enforce restriction of access to the PROVISIONKEY. 276 276 */ 277 - contents = (struct sgx_secs *)__get_free_page(GFP_KERNEL_ACCOUNT); 277 + contents = (struct sgx_secs *)__get_free_page(GFP_KERNEL); 278 278 if (!contents) 279 279 return -ENOMEM; 280 280
+35 -32
arch/x86/kvm/vmx/vmx.c
··· 525 525 VMX_SEGMENT_FIELD(LDTR), 526 526 }; 527 527 528 - static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx) 529 - { 530 - vmx->segment_cache.bitmask = 0; 531 - } 532 528 533 529 static unsigned long host_idt_base; 534 530 ··· 751 755 return -EIO; 752 756 } 753 757 754 - static void vmx_emergency_disable(void) 758 + void vmx_emergency_disable_virtualization_cpu(void) 755 759 { 756 760 int cpu = raw_smp_processor_id(); 757 761 struct loaded_vmcs *v; ··· 1994 1998 return !(msr->data & ~valid_bits); 1995 1999 } 1996 2000 1997 - int vmx_get_msr_feature(struct kvm_msr_entry *msr) 2001 + int vmx_get_feature_msr(u32 msr, u64 *data) 1998 2002 { 1999 - switch (msr->index) { 2003 + switch (msr) { 2000 2004 case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: 2001 2005 if (!nested) 2002 2006 return 1; 2003 - return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data); 2007 + return vmx_get_vmx_msr(&vmcs_config.nested, msr, data); 2004 2008 default: 2005 - return KVM_MSR_RET_INVALID; 2009 + return KVM_MSR_RET_UNSUPPORTED; 2006 2010 } 2007 2011 } 2008 2012 ··· 2601 2605 static int setup_vmcs_config(struct vmcs_config *vmcs_conf, 2602 2606 struct vmx_capability *vmx_cap) 2603 2607 { 2604 - u32 vmx_msr_low, vmx_msr_high; 2605 2608 u32 _pin_based_exec_control = 0; 2606 2609 u32 _cpu_based_exec_control = 0; 2607 2610 u32 _cpu_based_2nd_exec_control = 0; 2608 2611 u64 _cpu_based_3rd_exec_control = 0; 2609 2612 u32 _vmexit_control = 0; 2610 2613 u32 _vmentry_control = 0; 2614 + u64 basic_msr; 2611 2615 u64 misc_msr; 2612 2616 int i; 2613 2617 ··· 2730 2734 _vmexit_control &= ~x_ctrl; 2731 2735 } 2732 2736 2733 - rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high); 2737 + rdmsrl(MSR_IA32_VMX_BASIC, basic_msr); 2734 2738 2735 2739 /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */ 2736 - if ((vmx_msr_high & 0x1fff) > PAGE_SIZE) 2740 + if (vmx_basic_vmcs_size(basic_msr) > PAGE_SIZE) 2737 2741 return -EIO; 2738 2742 2739 2743 #ifdef CONFIG_X86_64 2740 - /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */ 2741 - if (vmx_msr_high & (1u<<16)) 2744 + /* 2745 + * KVM expects to be able to shove all legal physical addresses into 2746 + * VMCS fields for 64-bit kernels, and per the SDM, "This bit is always 2747 + * 0 for processors that support Intel 64 architecture". 2748 + */ 2749 + if (basic_msr & VMX_BASIC_32BIT_PHYS_ADDR_ONLY) 2742 2750 return -EIO; 2743 2751 #endif 2744 2752 2745 2753 /* Require Write-Back (WB) memory type for VMCS accesses. 
*/ 2746 - if (((vmx_msr_high >> 18) & 15) != 6) 2754 + if (vmx_basic_vmcs_mem_type(basic_msr) != X86_MEMTYPE_WB) 2747 2755 return -EIO; 2748 2756 2749 2757 rdmsrl(MSR_IA32_VMX_MISC, misc_msr); 2750 2758 2751 - vmcs_conf->size = vmx_msr_high & 0x1fff; 2752 - vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff; 2753 - 2754 - vmcs_conf->revision_id = vmx_msr_low; 2755 - 2759 + vmcs_conf->basic = basic_msr; 2756 2760 vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; 2757 2761 vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; 2758 2762 vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control; ··· 2840 2844 return -EFAULT; 2841 2845 } 2842 2846 2843 - int vmx_hardware_enable(void) 2847 + int vmx_enable_virtualization_cpu(void) 2844 2848 { 2845 2849 int cpu = raw_smp_processor_id(); 2846 2850 u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); ··· 2877 2881 __loaded_vmcs_clear(v); 2878 2882 } 2879 2883 2880 - void vmx_hardware_disable(void) 2884 + void vmx_disable_virtualization_cpu(void) 2881 2885 { 2882 2886 vmclear_local_loaded_vmcss(); 2883 2887 ··· 2899 2903 if (!pages) 2900 2904 return NULL; 2901 2905 vmcs = page_address(pages); 2902 - memset(vmcs, 0, vmcs_config.size); 2906 + memset(vmcs, 0, vmx_basic_vmcs_size(vmcs_config.basic)); 2903 2907 2904 2908 /* KVM supports Enlightened VMCS v1 only */ 2905 2909 if (kvm_is_using_evmcs()) 2906 2910 vmcs->hdr.revision_id = KVM_EVMCS_VERSION; 2907 2911 else 2908 - vmcs->hdr.revision_id = vmcs_config.revision_id; 2912 + vmcs->hdr.revision_id = vmx_basic_vmcs_revision_id(vmcs_config.basic); 2909 2913 2910 2914 if (shadow) 2911 2915 vmcs->hdr.shadow_vmcs = 1; ··· 2998 3002 * physical CPU. 2999 3003 */ 3000 3004 if (kvm_is_using_evmcs()) 3001 - vmcs->hdr.revision_id = vmcs_config.revision_id; 3005 + vmcs->hdr.revision_id = vmx_basic_vmcs_revision_id(vmcs_config.basic); 3002 3006 3003 3007 per_cpu(vmxarea, cpu) = vmcs; 3004 3008 } ··· 4215 4219 { 4216 4220 struct vcpu_vmx *vmx = to_vmx(vcpu); 4217 4221 4222 + /* 4223 + * DO NOT query the vCPU's vmcs12, as vmcs12 is dynamically allocated 4224 + * and freed, and must not be accessed outside of vcpu->mutex. The 4225 + * vCPU's cached PI NV is valid if and only if posted interrupts 4226 + * enabled in its vmcs12, i.e. checking the vector also checks that 4227 + * L1 has enabled posted interrupts for L2. 4228 + */ 4218 4229 if (is_guest_mode(vcpu) && 4219 4230 vector == vmx->nested.posted_intr_nv) { 4220 4231 /* ··· 5807 5804 error_code |= (exit_qualification & EPT_VIOLATION_RWX_MASK) 5808 5805 ? PFERR_PRESENT_MASK : 0; 5809 5806 5810 - error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) != 0 ? 5811 - PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK; 5807 + if (error_code & EPT_VIOLATION_GVA_IS_VALID) 5808 + error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) ? 
5809 + PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK; 5812 5810 5813 5811 /* 5814 5812 * Check that the GPA doesn't exceed physical memory limits, as that is ··· 7269 7265 return handle_fastpath_set_msr_irqoff(vcpu); 7270 7266 case EXIT_REASON_PREEMPTION_TIMER: 7271 7267 return handle_fastpath_preemption_timer(vcpu, force_immediate_exit); 7268 + case EXIT_REASON_HLT: 7269 + return handle_fastpath_hlt(vcpu); 7272 7270 default: 7273 7271 return EXIT_FASTPATH_NONE; 7274 7272 } ··· 7971 7965 kvm_cpu_cap_clear(X86_FEATURE_SGX_LC); 7972 7966 kvm_cpu_cap_clear(X86_FEATURE_SGX1); 7973 7967 kvm_cpu_cap_clear(X86_FEATURE_SGX2); 7968 + kvm_cpu_cap_clear(X86_FEATURE_SGX_EDECCSSA); 7974 7969 } 7975 7970 7976 7971 if (vmx_umip_emulated()) ··· 8522 8515 u64 use_timer_freq = 5000ULL * 1000 * 1000; 8523 8516 8524 8517 cpu_preemption_timer_multi = 8525 - vmcs_config.misc & VMX_MISC_PREEMPTION_TIMER_RATE_MASK; 8518 + vmx_misc_preemption_timer_rate(vmcs_config.misc); 8526 8519 8527 8520 if (tsc_khz) 8528 8521 use_timer_freq = (u64)tsc_khz * 1000; ··· 8589 8582 { 8590 8583 allow_smaller_maxphyaddr = false; 8591 8584 8592 - cpu_emergency_unregister_virt_callback(vmx_emergency_disable); 8593 - 8594 8585 vmx_cleanup_l1d_flush(); 8595 8586 } 8596 8587 ··· 8634 8629 8635 8630 pi_init_cpu(cpu); 8636 8631 } 8637 - 8638 - cpu_emergency_register_virt_callback(vmx_emergency_disable); 8639 8632 8640 8633 vmx_check_vmcs12_offsets(); 8641 8634
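The handle_ept_violation() hunk implements the bullet from the summary: the FINAL/PAGE error-code bits describe where a guest page-table walk faulted, so they are only meaningful when the exit qualification reports a valid GVA. A reduced model of the mapping, with illustrative bit positions for the guest-walk metadata bits:

    #include <stdint.h>
    #include <stdio.h>

    /* EPT violation exit-qualification bits (subset). */
    #define EPTV_READ            (1u << 0)
    #define EPTV_WRITE           (1u << 1)
    #define EPTV_FETCH           (1u << 2)
    #define EPTV_GVA_IS_VALID    (1u << 7)
    #define EPTV_GVA_TRANSLATED  (1u << 8)

    /* x86 page-fault error-code bits (subset). */
    #define PFERR_WRITE          (1u << 1)
    #define PFERR_FETCH          (1u << 4)
    #define PFERR_GUEST_FINAL    (1u << 9)   /* illustrative positions for the */
    #define PFERR_GUEST_PAGE     (1u << 10)  /* guest-page-walk metadata bits  */

    static uint32_t ept_violation_to_error_code(uint32_t qual)
    {
        uint32_t error_code = 0;

        if (qual & EPTV_WRITE)
            error_code |= PFERR_WRITE;
        if (qual & EPTV_FETCH)
            error_code |= PFERR_FETCH;

        /*
         * FINAL vs PAGE describes where in the guest's page-table walk the
         * fault hit; without a valid GVA there was no walk, so leave both clear.
         */
        if (qual & EPTV_GVA_IS_VALID)
            error_code |= (qual & EPTV_GVA_TRANSLATED) ?
                          PFERR_GUEST_FINAL : PFERR_GUEST_PAGE;

        return error_code;
    }

    int main(void)
    {
        printf("%#x\n", ept_violation_to_error_code(EPTV_WRITE)); /* no GVA: just write */
        printf("%#x\n", ept_violation_to_error_code(EPTV_WRITE | EPTV_GVA_IS_VALID |
                                                    EPTV_GVA_TRANSLATED));
        return 0;
    }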
+5 -4
arch/x86/kvm/vmx/vmx.h
··· 17 17 #include "run_flags.h" 18 18 #include "../mmu.h" 19 19 20 - #define MSR_TYPE_R 1 21 - #define MSR_TYPE_W 2 22 - #define MSR_TYPE_RW 3 23 - 24 20 #define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4)) 25 21 26 22 #ifdef CONFIG_X86_64 ··· 750 754 static inline bool vmx_can_use_ipiv(struct kvm_vcpu *vcpu) 751 755 { 752 756 return lapic_in_kernel(vcpu) && enable_ipiv; 757 + } 758 + 759 + static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx) 760 + { 761 + vmx->segment_cache.bitmask = 0; 753 762 } 754 763 755 764 #endif /* __KVM_X86_VMX_H */
+8
arch/x86/kvm/vmx/vmx_onhyperv.h
··· 104 104 struct hv_vp_assist_page *vp_ap = 105 105 hv_get_vp_assist_page(smp_processor_id()); 106 106 107 + /* 108 + * When enabling eVMCS, KVM verifies that every CPU has a valid hv_vp_assist_page() 109 + * and aborts enabling the feature otherwise. CPU onlining path is also checked in 110 + * vmx_hardware_enable(). 111 + */ 112 + if (KVM_BUG_ON(!vp_ap, kvm_get_running_vcpu()->kvm)) 113 + return; 114 + 107 115 if (current_evmcs->hv_enlightenments_control.nested_flush_hypercall) 108 116 vp_ap->nested_control.features.directhypercall = 1; 109 117 vp_ap->current_nested_vmcs = phys_addr;
+1 -1
arch/x86/kvm/vmx/vmx_ops.h
··· 47 47 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001, 48 48 "16-bit accessor invalid for 64-bit high field"); 49 49 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000, 50 - "16-bit accessor invalid for 32-bit high field"); 50 + "16-bit accessor invalid for 32-bit field"); 51 51 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000, 52 52 "16-bit accessor invalid for natural width field"); 53 53 }
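The corrected assertion message matches how VMCS field encodings work: bits 14:13 of the encoding give the width (0 = 16-bit, 1 = 64-bit, 2 = 32-bit, 3 = natural width) and bit 0 selects the high half of a 64-bit field. A tiny decoder makes the masks in those BUILD_BUG_ON_MSG() checks easier to read; the example encodings below are for illustration:

    #include <stdio.h>

    static const char *vmcs_field_width(unsigned long field)
    {
        switch ((field >> 13) & 3) {  /* encoding bits 14:13 */
        case 0: return "16-bit";
        case 1: return (field & 1) ? "64-bit high" : "64-bit";
        case 2: return "32-bit";
        default: return "natural width";
        }
    }

    int main(void)
    {
        printf("0x0800 -> %s\n", vmcs_field_width(0x0800)); /* GUEST_ES_SELECTOR */
        printf("0x2800 -> %s\n", vmcs_field_width(0x2800)); /* VMCS_LINK_POINTER */
        printf("0x2801 -> %s\n", vmcs_field_width(0x2801)); /* VMCS_LINK_POINTER high */
        printf("0x4000 -> %s\n", vmcs_field_width(0x4000)); /* PIN_BASED_VM_EXEC_CONTROL */
        printf("0x6800 -> %s\n", vmcs_field_width(0x6800)); /* GUEST_CR0 */
        return 0;
    }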
+4 -3
arch/x86/kvm/vmx/x86_ops.h
··· 13 13 14 14 void vmx_hardware_unsetup(void); 15 15 int vmx_check_processor_compat(void); 16 - int vmx_hardware_enable(void); 17 - void vmx_hardware_disable(void); 16 + int vmx_enable_virtualization_cpu(void); 17 + void vmx_disable_virtualization_cpu(void); 18 + void vmx_emergency_disable_virtualization_cpu(void); 18 19 int vmx_vm_init(struct kvm *kvm); 19 20 void vmx_vm_destroy(struct kvm *kvm); 20 21 int vmx_vcpu_precreate(struct kvm *kvm); ··· 57 56 void vmx_msr_filter_changed(struct kvm_vcpu *vcpu); 58 57 void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu); 59 58 void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu); 60 - int vmx_get_msr_feature(struct kvm_msr_entry *msr); 59 + int vmx_get_feature_msr(u32 msr, u64 *data); 61 60 int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info); 62 61 u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg); 63 62 void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
+484 -522
arch/x86/kvm/x86.c
··· 305 305 static struct kmem_cache *x86_emulator_cache; 306 306 307 307 /* 308 - * When called, it means the previous get/set msr reached an invalid msr. 309 - * Return true if we want to ignore/silent this failed msr access. 308 + * The three MSR lists(msrs_to_save, emulated_msrs, msr_based_features) track 309 + * the set of MSRs that KVM exposes to userspace through KVM_GET_MSRS, 310 + * KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. msrs_to_save holds MSRs that 311 + * require host support, i.e. should be probed via RDMSR. emulated_msrs holds 312 + * MSRs that KVM emulates without strictly requiring host support. 313 + * msr_based_features holds MSRs that enumerate features, i.e. are effectively 314 + * CPUID leafs. Note, msr_based_features isn't mutually exclusive with 315 + * msrs_to_save and emulated_msrs. 310 316 */ 311 - static bool kvm_msr_ignored_check(u32 msr, u64 data, bool write) 312 - { 313 - const char *op = write ? "wrmsr" : "rdmsr"; 314 317 315 - if (ignore_msrs) { 316 - if (report_ignored_msrs) 317 - kvm_pr_unimpl("ignored %s: 0x%x data 0x%llx\n", 318 - op, msr, data); 319 - /* Mask the error */ 318 + static const u32 msrs_to_save_base[] = { 319 + MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, 320 + MSR_STAR, 321 + #ifdef CONFIG_X86_64 322 + MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, 323 + #endif 324 + MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, 325 + MSR_IA32_FEAT_CTL, MSR_IA32_BNDCFGS, MSR_TSC_AUX, 326 + MSR_IA32_SPEC_CTRL, MSR_IA32_TSX_CTRL, 327 + MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH, 328 + MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK, 329 + MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B, 330 + MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B, 331 + MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B, 332 + MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B, 333 + MSR_IA32_UMWAIT_CONTROL, 334 + 335 + MSR_IA32_XFD, MSR_IA32_XFD_ERR, 336 + }; 337 + 338 + static const u32 msrs_to_save_pmu[] = { 339 + MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1, 340 + MSR_ARCH_PERFMON_FIXED_CTR0 + 2, 341 + MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS, 342 + MSR_CORE_PERF_GLOBAL_CTRL, 343 + MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, MSR_PEBS_DATA_CFG, 344 + 345 + /* This part of MSRs should match KVM_MAX_NR_INTEL_GP_COUNTERS. */ 346 + MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1, 347 + MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3, 348 + MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5, 349 + MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7, 350 + MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1, 351 + MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3, 352 + MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5, 353 + MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7, 354 + 355 + MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3, 356 + MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3, 357 + 358 + /* This part of MSRs should match KVM_MAX_NR_AMD_GP_COUNTERS. 
*/ 359 + MSR_F15H_PERF_CTL0, MSR_F15H_PERF_CTL1, MSR_F15H_PERF_CTL2, 360 + MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5, 361 + MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2, 362 + MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5, 363 + 364 + MSR_AMD64_PERF_CNTR_GLOBAL_CTL, 365 + MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, 366 + MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, 367 + }; 368 + 369 + static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_base) + 370 + ARRAY_SIZE(msrs_to_save_pmu)]; 371 + static unsigned num_msrs_to_save; 372 + 373 + static const u32 emulated_msrs_all[] = { 374 + MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, 375 + MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, 376 + 377 + #ifdef CONFIG_KVM_HYPERV 378 + HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, 379 + HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC, 380 + HV_X64_MSR_TSC_FREQUENCY, HV_X64_MSR_APIC_FREQUENCY, 381 + HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2, 382 + HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL, 383 + HV_X64_MSR_RESET, 384 + HV_X64_MSR_VP_INDEX, 385 + HV_X64_MSR_VP_RUNTIME, 386 + HV_X64_MSR_SCONTROL, 387 + HV_X64_MSR_STIMER0_CONFIG, 388 + HV_X64_MSR_VP_ASSIST_PAGE, 389 + HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL, 390 + HV_X64_MSR_TSC_EMULATION_STATUS, HV_X64_MSR_TSC_INVARIANT_CONTROL, 391 + HV_X64_MSR_SYNDBG_OPTIONS, 392 + HV_X64_MSR_SYNDBG_CONTROL, HV_X64_MSR_SYNDBG_STATUS, 393 + HV_X64_MSR_SYNDBG_SEND_BUFFER, HV_X64_MSR_SYNDBG_RECV_BUFFER, 394 + HV_X64_MSR_SYNDBG_PENDING_BUFFER, 395 + #endif 396 + 397 + MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, 398 + MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK, 399 + 400 + MSR_IA32_TSC_ADJUST, 401 + MSR_IA32_TSC_DEADLINE, 402 + MSR_IA32_ARCH_CAPABILITIES, 403 + MSR_IA32_PERF_CAPABILITIES, 404 + MSR_IA32_MISC_ENABLE, 405 + MSR_IA32_MCG_STATUS, 406 + MSR_IA32_MCG_CTL, 407 + MSR_IA32_MCG_EXT_CTL, 408 + MSR_IA32_SMBASE, 409 + MSR_SMI_COUNT, 410 + MSR_PLATFORM_INFO, 411 + MSR_MISC_FEATURES_ENABLES, 412 + MSR_AMD64_VIRT_SPEC_CTRL, 413 + MSR_AMD64_TSC_RATIO, 414 + MSR_IA32_POWER_CTL, 415 + MSR_IA32_UCODE_REV, 416 + 417 + /* 418 + * KVM always supports the "true" VMX control MSRs, even if the host 419 + * does not. The VMX MSRs as a whole are considered "emulated" as KVM 420 + * doesn't strictly require them to exist in the host (ignoring that 421 + * KVM would refuse to load in the first place if the core set of MSRs 422 + * aren't supported). 423 + */ 424 + MSR_IA32_VMX_BASIC, 425 + MSR_IA32_VMX_TRUE_PINBASED_CTLS, 426 + MSR_IA32_VMX_TRUE_PROCBASED_CTLS, 427 + MSR_IA32_VMX_TRUE_EXIT_CTLS, 428 + MSR_IA32_VMX_TRUE_ENTRY_CTLS, 429 + MSR_IA32_VMX_MISC, 430 + MSR_IA32_VMX_CR0_FIXED0, 431 + MSR_IA32_VMX_CR4_FIXED0, 432 + MSR_IA32_VMX_VMCS_ENUM, 433 + MSR_IA32_VMX_PROCBASED_CTLS2, 434 + MSR_IA32_VMX_EPT_VPID_CAP, 435 + MSR_IA32_VMX_VMFUNC, 436 + 437 + MSR_K7_HWCR, 438 + MSR_KVM_POLL_CONTROL, 439 + }; 440 + 441 + static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)]; 442 + static unsigned num_emulated_msrs; 443 + 444 + /* 445 + * List of MSRs that control the existence of MSR-based features, i.e. MSRs 446 + * that are effectively CPUID leafs. VMX MSRs are also included in the set of 447 + * feature MSRs, but are handled separately to allow expedited lookups. 
448 + */ 449 + static const u32 msr_based_features_all_except_vmx[] = { 450 + MSR_AMD64_DE_CFG, 451 + MSR_IA32_UCODE_REV, 452 + MSR_IA32_ARCH_CAPABILITIES, 453 + MSR_IA32_PERF_CAPABILITIES, 454 + }; 455 + 456 + static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all_except_vmx) + 457 + (KVM_LAST_EMULATED_VMX_MSR - KVM_FIRST_EMULATED_VMX_MSR + 1)]; 458 + static unsigned int num_msr_based_features; 459 + 460 + /* 461 + * All feature MSRs except uCode revID, which tracks the currently loaded uCode 462 + * patch, are immutable once the vCPU model is defined. 463 + */ 464 + static bool kvm_is_immutable_feature_msr(u32 msr) 465 + { 466 + int i; 467 + 468 + if (msr >= KVM_FIRST_EMULATED_VMX_MSR && msr <= KVM_LAST_EMULATED_VMX_MSR) 320 469 return true; 321 - } else { 322 - kvm_debug_ratelimited("unhandled %s: 0x%x data 0x%llx\n", 323 - op, msr, data); 324 - return false; 470 + 471 + for (i = 0; i < ARRAY_SIZE(msr_based_features_all_except_vmx); i++) { 472 + if (msr == msr_based_features_all_except_vmx[i]) 473 + return msr != MSR_IA32_UCODE_REV; 325 474 } 475 + 476 + return false; 477 + } 478 + 479 + static bool kvm_is_advertised_msr(u32 msr_index) 480 + { 481 + unsigned int i; 482 + 483 + for (i = 0; i < num_msrs_to_save; i++) { 484 + if (msrs_to_save[i] == msr_index) 485 + return true; 486 + } 487 + 488 + for (i = 0; i < num_emulated_msrs; i++) { 489 + if (emulated_msrs[i] == msr_index) 490 + return true; 491 + } 492 + 493 + return false; 494 + } 495 + 496 + typedef int (*msr_access_t)(struct kvm_vcpu *vcpu, u32 index, u64 *data, 497 + bool host_initiated); 498 + 499 + static __always_inline int kvm_do_msr_access(struct kvm_vcpu *vcpu, u32 msr, 500 + u64 *data, bool host_initiated, 501 + enum kvm_msr_access rw, 502 + msr_access_t msr_access_fn) 503 + { 504 + const char *op = rw == MSR_TYPE_W ? "wrmsr" : "rdmsr"; 505 + int ret; 506 + 507 + BUILD_BUG_ON(rw != MSR_TYPE_R && rw != MSR_TYPE_W); 508 + 509 + /* 510 + * Zero the data on read failures to avoid leaking stack data to the 511 + * guest and/or userspace, e.g. if the failure is ignored below. 512 + */ 513 + ret = msr_access_fn(vcpu, msr, data, host_initiated); 514 + if (ret && rw == MSR_TYPE_R) 515 + *data = 0; 516 + 517 + if (ret != KVM_MSR_RET_UNSUPPORTED) 518 + return ret; 519 + 520 + /* 521 + * Userspace is allowed to read MSRs, and write '0' to MSRs, that KVM 522 + * advertises to userspace, even if an MSR isn't fully supported. 523 + * Simply check that @data is '0', which covers both the write '0' case 524 + * and all reads (in which case @data is zeroed on failure; see above). 
525 + */ 526 + if (host_initiated && !*data && kvm_is_advertised_msr(msr)) 527 + return 0; 528 + 529 + if (!ignore_msrs) { 530 + kvm_debug_ratelimited("unhandled %s: 0x%x data 0x%llx\n", 531 + op, msr, *data); 532 + return ret; 533 + } 534 + 535 + if (report_ignored_msrs) 536 + kvm_pr_unimpl("ignored %s: 0x%x data 0x%llx\n", op, msr, *data); 537 + 538 + return 0; 326 539 } 327 540 328 541 static struct kmem_cache *kvm_alloc_emulator_cache(void) ··· 568 355 569 356 /* 570 357 * Disabling irqs at this point since the following code could be 571 - * interrupted and executed through kvm_arch_hardware_disable() 358 + * interrupted and executed through kvm_arch_disable_virtualization_cpu() 572 359 */ 573 360 local_irq_save(flags); 574 361 if (msrs->registered) { ··· 626 413 627 414 static void kvm_user_return_msr_cpu_online(void) 628 415 { 629 - unsigned int cpu = smp_processor_id(); 630 - struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu); 416 + struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs); 631 417 u64 value; 632 418 int i; 633 419 ··· 831 619 ex->error_code = error_code; 832 620 ex->has_payload = has_payload; 833 621 ex->payload = payload; 834 - } 835 - 836 - /* Forcibly leave the nested mode in cases like a vCPU reset */ 837 - static void kvm_leave_nested(struct kvm_vcpu *vcpu) 838 - { 839 - kvm_x86_ops.nested_ops->leave_nested(vcpu); 840 622 } 841 623 842 624 static void kvm_multiple_exception(struct kvm_vcpu *vcpu, ··· 1618 1412 EXPORT_SYMBOL_GPL(kvm_emulate_rdpmc); 1619 1413 1620 1414 /* 1621 - * The three MSR lists(msrs_to_save, emulated_msrs, msr_based_features) track 1622 - * the set of MSRs that KVM exposes to userspace through KVM_GET_MSRS, 1623 - * KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. msrs_to_save holds MSRs that 1624 - * require host support, i.e. should be probed via RDMSR. emulated_msrs holds 1625 - * MSRs that KVM emulates without strictly requiring host support. 1626 - * msr_based_features holds MSRs that enumerate features, i.e. are effectively 1627 - * CPUID leafs. Note, msr_based_features isn't mutually exclusive with 1628 - * msrs_to_save and emulated_msrs. 1629 - */ 1630 - 1631 - static const u32 msrs_to_save_base[] = { 1632 - MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, 1633 - MSR_STAR, 1634 - #ifdef CONFIG_X86_64 1635 - MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, 1636 - #endif 1637 - MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, 1638 - MSR_IA32_FEAT_CTL, MSR_IA32_BNDCFGS, MSR_TSC_AUX, 1639 - MSR_IA32_SPEC_CTRL, MSR_IA32_TSX_CTRL, 1640 - MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH, 1641 - MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK, 1642 - MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B, 1643 - MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B, 1644 - MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B, 1645 - MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B, 1646 - MSR_IA32_UMWAIT_CONTROL, 1647 - 1648 - MSR_IA32_XFD, MSR_IA32_XFD_ERR, 1649 - }; 1650 - 1651 - static const u32 msrs_to_save_pmu[] = { 1652 - MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1, 1653 - MSR_ARCH_PERFMON_FIXED_CTR0 + 2, 1654 - MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS, 1655 - MSR_CORE_PERF_GLOBAL_CTRL, 1656 - MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, MSR_PEBS_DATA_CFG, 1657 - 1658 - /* This part of MSRs should match KVM_MAX_NR_INTEL_GP_COUNTERS. 
*/ 1659 - MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1, 1660 - MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3, 1661 - MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5, 1662 - MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7, 1663 - MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1, 1664 - MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3, 1665 - MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5, 1666 - MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7, 1667 - 1668 - MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3, 1669 - MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3, 1670 - 1671 - /* This part of MSRs should match KVM_MAX_NR_AMD_GP_COUNTERS. */ 1672 - MSR_F15H_PERF_CTL0, MSR_F15H_PERF_CTL1, MSR_F15H_PERF_CTL2, 1673 - MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5, 1674 - MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2, 1675 - MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5, 1676 - 1677 - MSR_AMD64_PERF_CNTR_GLOBAL_CTL, 1678 - MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, 1679 - MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, 1680 - }; 1681 - 1682 - static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_base) + 1683 - ARRAY_SIZE(msrs_to_save_pmu)]; 1684 - static unsigned num_msrs_to_save; 1685 - 1686 - static const u32 emulated_msrs_all[] = { 1687 - MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, 1688 - MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, 1689 - 1690 - #ifdef CONFIG_KVM_HYPERV 1691 - HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, 1692 - HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC, 1693 - HV_X64_MSR_TSC_FREQUENCY, HV_X64_MSR_APIC_FREQUENCY, 1694 - HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2, 1695 - HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL, 1696 - HV_X64_MSR_RESET, 1697 - HV_X64_MSR_VP_INDEX, 1698 - HV_X64_MSR_VP_RUNTIME, 1699 - HV_X64_MSR_SCONTROL, 1700 - HV_X64_MSR_STIMER0_CONFIG, 1701 - HV_X64_MSR_VP_ASSIST_PAGE, 1702 - HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL, 1703 - HV_X64_MSR_TSC_EMULATION_STATUS, HV_X64_MSR_TSC_INVARIANT_CONTROL, 1704 - HV_X64_MSR_SYNDBG_OPTIONS, 1705 - HV_X64_MSR_SYNDBG_CONTROL, HV_X64_MSR_SYNDBG_STATUS, 1706 - HV_X64_MSR_SYNDBG_SEND_BUFFER, HV_X64_MSR_SYNDBG_RECV_BUFFER, 1707 - HV_X64_MSR_SYNDBG_PENDING_BUFFER, 1708 - #endif 1709 - 1710 - MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, 1711 - MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK, 1712 - 1713 - MSR_IA32_TSC_ADJUST, 1714 - MSR_IA32_TSC_DEADLINE, 1715 - MSR_IA32_ARCH_CAPABILITIES, 1716 - MSR_IA32_PERF_CAPABILITIES, 1717 - MSR_IA32_MISC_ENABLE, 1718 - MSR_IA32_MCG_STATUS, 1719 - MSR_IA32_MCG_CTL, 1720 - MSR_IA32_MCG_EXT_CTL, 1721 - MSR_IA32_SMBASE, 1722 - MSR_SMI_COUNT, 1723 - MSR_PLATFORM_INFO, 1724 - MSR_MISC_FEATURES_ENABLES, 1725 - MSR_AMD64_VIRT_SPEC_CTRL, 1726 - MSR_AMD64_TSC_RATIO, 1727 - MSR_IA32_POWER_CTL, 1728 - MSR_IA32_UCODE_REV, 1729 - 1730 - /* 1731 - * KVM always supports the "true" VMX control MSRs, even if the host 1732 - * does not. The VMX MSRs as a whole are considered "emulated" as KVM 1733 - * doesn't strictly require them to exist in the host (ignoring that 1734 - * KVM would refuse to load in the first place if the core set of MSRs 1735 - * aren't supported). 
1736 - */ 1737 - MSR_IA32_VMX_BASIC, 1738 - MSR_IA32_VMX_TRUE_PINBASED_CTLS, 1739 - MSR_IA32_VMX_TRUE_PROCBASED_CTLS, 1740 - MSR_IA32_VMX_TRUE_EXIT_CTLS, 1741 - MSR_IA32_VMX_TRUE_ENTRY_CTLS, 1742 - MSR_IA32_VMX_MISC, 1743 - MSR_IA32_VMX_CR0_FIXED0, 1744 - MSR_IA32_VMX_CR4_FIXED0, 1745 - MSR_IA32_VMX_VMCS_ENUM, 1746 - MSR_IA32_VMX_PROCBASED_CTLS2, 1747 - MSR_IA32_VMX_EPT_VPID_CAP, 1748 - MSR_IA32_VMX_VMFUNC, 1749 - 1750 - MSR_K7_HWCR, 1751 - MSR_KVM_POLL_CONTROL, 1752 - }; 1753 - 1754 - static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)]; 1755 - static unsigned num_emulated_msrs; 1756 - 1757 - /* 1758 - * List of MSRs that control the existence of MSR-based features, i.e. MSRs 1759 - * that are effectively CPUID leafs. VMX MSRs are also included in the set of 1760 - * feature MSRs, but are handled separately to allow expedited lookups. 1761 - */ 1762 - static const u32 msr_based_features_all_except_vmx[] = { 1763 - MSR_AMD64_DE_CFG, 1764 - MSR_IA32_UCODE_REV, 1765 - MSR_IA32_ARCH_CAPABILITIES, 1766 - MSR_IA32_PERF_CAPABILITIES, 1767 - }; 1768 - 1769 - static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all_except_vmx) + 1770 - (KVM_LAST_EMULATED_VMX_MSR - KVM_FIRST_EMULATED_VMX_MSR + 1)]; 1771 - static unsigned int num_msr_based_features; 1772 - 1773 - /* 1774 - * All feature MSRs except uCode revID, which tracks the currently loaded uCode 1775 - * patch, are immutable once the vCPU model is defined. 1776 - */ 1777 - static bool kvm_is_immutable_feature_msr(u32 msr) 1778 - { 1779 - int i; 1780 - 1781 - if (msr >= KVM_FIRST_EMULATED_VMX_MSR && msr <= KVM_LAST_EMULATED_VMX_MSR) 1782 - return true; 1783 - 1784 - for (i = 0; i < ARRAY_SIZE(msr_based_features_all_except_vmx); i++) { 1785 - if (msr == msr_based_features_all_except_vmx[i]) 1786 - return msr != MSR_IA32_UCODE_REV; 1787 - } 1788 - 1789 - return false; 1790 - } 1791 - 1792 - /* 1793 1415 * Some IA32_ARCH_CAPABILITIES bits have dependencies on MSRs that KVM 1794 1416 * does not yet virtualize. 
These include: 1795 1417 * 10 - MISC_PACKAGE_CTRLS ··· 1694 1660 return data; 1695 1661 } 1696 1662 1697 - static int kvm_get_msr_feature(struct kvm_msr_entry *msr) 1663 + static int kvm_get_feature_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, 1664 + bool host_initiated) 1698 1665 { 1699 - switch (msr->index) { 1666 + WARN_ON_ONCE(!host_initiated); 1667 + 1668 + switch (index) { 1700 1669 case MSR_IA32_ARCH_CAPABILITIES: 1701 - msr->data = kvm_get_arch_capabilities(); 1670 + *data = kvm_get_arch_capabilities(); 1702 1671 break; 1703 1672 case MSR_IA32_PERF_CAPABILITIES: 1704 - msr->data = kvm_caps.supported_perf_cap; 1673 + *data = kvm_caps.supported_perf_cap; 1705 1674 break; 1706 1675 case MSR_IA32_UCODE_REV: 1707 - rdmsrl_safe(msr->index, &msr->data); 1676 + rdmsrl_safe(index, data); 1708 1677 break; 1709 1678 default: 1710 - return kvm_x86_call(get_msr_feature)(msr); 1679 + return kvm_x86_call(get_feature_msr)(index, data); 1711 1680 } 1712 1681 return 0; 1713 1682 } 1714 1683 1715 - static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data) 1684 + static int do_get_feature_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) 1716 1685 { 1717 - struct kvm_msr_entry msr; 1718 - int r; 1719 - 1720 - /* Unconditionally clear the output for simplicity */ 1721 - msr.data = 0; 1722 - msr.index = index; 1723 - r = kvm_get_msr_feature(&msr); 1724 - 1725 - if (r == KVM_MSR_RET_INVALID && kvm_msr_ignored_check(index, 0, false)) 1726 - r = 0; 1727 - 1728 - *data = msr.data; 1729 - 1730 - return r; 1686 + return kvm_do_msr_access(vcpu, index, data, true, MSR_TYPE_R, 1687 + kvm_get_feature_msr); 1731 1688 } 1732 1689 1733 1690 static bool __kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer) ··· 1905 1880 return kvm_x86_call(set_msr)(vcpu, &msr); 1906 1881 } 1907 1882 1883 + static int _kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, 1884 + bool host_initiated) 1885 + { 1886 + return __kvm_set_msr(vcpu, index, *data, host_initiated); 1887 + } 1888 + 1908 1889 static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu, 1909 1890 u32 index, u64 data, bool host_initiated) 1910 1891 { 1911 - int ret = __kvm_set_msr(vcpu, index, data, host_initiated); 1912 - 1913 - if (ret == KVM_MSR_RET_INVALID) 1914 - if (kvm_msr_ignored_check(index, data, true)) 1915 - ret = 0; 1916 - 1917 - return ret; 1892 + return kvm_do_msr_access(vcpu, index, &data, host_initiated, MSR_TYPE_W, 1893 + _kvm_set_msr); 1918 1894 } 1919 1895 1920 1896 /* ··· 1954 1928 static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu, 1955 1929 u32 index, u64 *data, bool host_initiated) 1956 1930 { 1957 - int ret = __kvm_get_msr(vcpu, index, data, host_initiated); 1958 - 1959 - if (ret == KVM_MSR_RET_INVALID) { 1960 - /* Unconditionally clear *data for simplicity */ 1961 - *data = 0; 1962 - if (kvm_msr_ignored_check(index, 0, false)) 1963 - ret = 0; 1964 - } 1965 - 1966 - return ret; 1931 + return kvm_do_msr_access(vcpu, index, data, host_initiated, MSR_TYPE_R, 1932 + __kvm_get_msr); 1967 1933 } 1968 1934 1969 - static int kvm_get_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 *data) 1935 + int kvm_get_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 *data) 1970 1936 { 1971 1937 if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ)) 1972 1938 return KVM_MSR_RET_FILTERED; 1973 1939 return kvm_get_msr_ignored_check(vcpu, index, data, false); 1974 1940 } 1941 + EXPORT_SYMBOL_GPL(kvm_get_msr_with_filter); 1975 1942 1976 - static int kvm_set_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 data) 
1943 + int kvm_set_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 data) 1977 1944 { 1978 1945 if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE)) 1979 1946 return KVM_MSR_RET_FILTERED; 1980 1947 return kvm_set_msr_ignored_check(vcpu, index, data, false); 1981 1948 } 1949 + EXPORT_SYMBOL_GPL(kvm_set_msr_with_filter); 1982 1950 1983 1951 int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data) 1984 1952 { ··· 2019 1999 static u64 kvm_msr_reason(int r) 2020 2000 { 2021 2001 switch (r) { 2022 - case KVM_MSR_RET_INVALID: 2002 + case KVM_MSR_RET_UNSUPPORTED: 2023 2003 return KVM_MSR_EXIT_REASON_UNKNOWN; 2024 2004 case KVM_MSR_RET_FILTERED: 2025 2005 return KVM_MSR_EXIT_REASON_FILTER; ··· 2182 2162 { 2183 2163 u32 msr = kvm_rcx_read(vcpu); 2184 2164 u64 data; 2185 - fastpath_t ret = EXIT_FASTPATH_NONE; 2165 + fastpath_t ret; 2166 + bool handled; 2186 2167 2187 2168 kvm_vcpu_srcu_read_lock(vcpu); 2188 2169 2189 2170 switch (msr) { 2190 2171 case APIC_BASE_MSR + (APIC_ICR >> 4): 2191 2172 data = kvm_read_edx_eax(vcpu); 2192 - if (!handle_fastpath_set_x2apic_icr_irqoff(vcpu, data)) { 2193 - kvm_skip_emulated_instruction(vcpu); 2194 - ret = EXIT_FASTPATH_EXIT_HANDLED; 2195 - } 2173 + handled = !handle_fastpath_set_x2apic_icr_irqoff(vcpu, data); 2196 2174 break; 2197 2175 case MSR_IA32_TSC_DEADLINE: 2198 2176 data = kvm_read_edx_eax(vcpu); 2199 - if (!handle_fastpath_set_tscdeadline(vcpu, data)) { 2200 - kvm_skip_emulated_instruction(vcpu); 2201 - ret = EXIT_FASTPATH_REENTER_GUEST; 2202 - } 2177 + handled = !handle_fastpath_set_tscdeadline(vcpu, data); 2203 2178 break; 2204 2179 default: 2180 + handled = false; 2205 2181 break; 2206 2182 } 2207 2183 2208 - if (ret != EXIT_FASTPATH_NONE) 2184 + if (handled) { 2185 + if (!kvm_skip_emulated_instruction(vcpu)) 2186 + ret = EXIT_FASTPATH_EXIT_USERSPACE; 2187 + else 2188 + ret = EXIT_FASTPATH_REENTER_GUEST; 2209 2189 trace_kvm_msr_write(msr, data); 2190 + } else { 2191 + ret = EXIT_FASTPATH_NONE; 2192 + } 2210 2193 2211 2194 kvm_vcpu_srcu_read_unlock(vcpu); 2212 2195 ··· 3769 3746 mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa)); 3770 3747 } 3771 3748 3772 - static bool kvm_is_msr_to_save(u32 msr_index) 3773 - { 3774 - unsigned int i; 3775 - 3776 - for (i = 0; i < num_msrs_to_save; i++) { 3777 - if (msrs_to_save[i] == msr_index) 3778 - return true; 3779 - } 3780 - 3781 - return false; 3782 - } 3783 - 3784 3749 int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) 3785 3750 { 3786 3751 u32 msr = msr_info->index; ··· 4150 4139 if (kvm_pmu_is_valid_msr(vcpu, msr)) 4151 4140 return kvm_pmu_set_msr(vcpu, msr_info); 4152 4141 4153 - /* 4154 - * Userspace is allowed to write '0' to MSRs that KVM reports 4155 - * as to-be-saved, even if an MSRs isn't fully supported. 4156 - */ 4157 - if (msr_info->host_initiated && !data && 4158 - kvm_is_msr_to_save(msr)) 4159 - break; 4160 - 4161 - return KVM_MSR_RET_INVALID; 4142 + return KVM_MSR_RET_UNSUPPORTED; 4162 4143 } 4163 4144 return 0; 4164 4145 } ··· 4501 4498 if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) 4502 4499 return kvm_pmu_get_msr(vcpu, msr_info); 4503 4500 4504 - /* 4505 - * Userspace is allowed to read MSRs that KVM reports as 4506 - * to-be-saved, even if an MSR isn't fully supported. 
4507 - */ 4508 - if (msr_info->host_initiated && 4509 - kvm_is_msr_to_save(msr_info->index)) { 4510 - msr_info->data = 0; 4511 - break; 4512 - } 4513 - 4514 - return KVM_MSR_RET_INVALID; 4501 + return KVM_MSR_RET_UNSUPPORTED; 4515 4502 } 4516 4503 return 0; 4517 4504 } ··· 4939 4946 break; 4940 4947 } 4941 4948 case KVM_GET_MSRS: 4942 - r = msr_io(NULL, argp, do_get_msr_feature, 1); 4949 + r = msr_io(NULL, argp, do_get_feature_msr, 1); 4943 4950 break; 4944 4951 #ifdef CONFIG_KVM_HYPERV 4945 4952 case KVM_GET_SUPPORTED_HV_CPUID: ··· 7376 7383 7377 7384 static void kvm_probe_feature_msr(u32 msr_index) 7378 7385 { 7379 - struct kvm_msr_entry msr = { 7380 - .index = msr_index, 7381 - }; 7386 + u64 data; 7382 7387 7383 - if (kvm_get_msr_feature(&msr)) 7388 + if (kvm_get_feature_msr(NULL, msr_index, &data, true)) 7384 7389 return; 7385 7390 7386 7391 msr_based_features[num_msr_based_features++] = msr_index; ··· 8856 8865 return 1; 8857 8866 } 8858 8867 8859 - static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, 8860 - int emulation_type) 8868 + static bool kvm_unprotect_and_retry_on_failure(struct kvm_vcpu *vcpu, 8869 + gpa_t cr2_or_gpa, 8870 + int emulation_type) 8861 8871 { 8862 - gpa_t gpa = cr2_or_gpa; 8863 - kvm_pfn_t pfn; 8864 - 8865 8872 if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF)) 8866 8873 return false; 8867 - 8868 - if (WARN_ON_ONCE(is_guest_mode(vcpu)) || 8869 - WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF))) 8870 - return false; 8871 - 8872 - if (!vcpu->arch.mmu->root_role.direct) { 8873 - /* 8874 - * Write permission should be allowed since only 8875 - * write access need to be emulated. 8876 - */ 8877 - gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL); 8878 - 8879 - /* 8880 - * If the mapping is invalid in guest, let cpu retry 8881 - * it to generate fault. 8882 - */ 8883 - if (gpa == INVALID_GPA) 8884 - return true; 8885 - } 8886 - 8887 - /* 8888 - * Do not retry the unhandleable instruction if it faults on the 8889 - * readonly host memory, otherwise it will goto a infinite loop: 8890 - * retry instruction -> write #PF -> emulation fail -> retry 8891 - * instruction -> ... 8892 - */ 8893 - pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa)); 8894 - 8895 - /* 8896 - * If the instruction failed on the error pfn, it can not be fixed, 8897 - * report the error to userspace. 8898 - */ 8899 - if (is_error_noslot_pfn(pfn)) 8900 - return false; 8901 - 8902 - kvm_release_pfn_clean(pfn); 8903 - 8904 - /* 8905 - * If emulation may have been triggered by a write to a shadowed page 8906 - * table, unprotect the gfn (zap any relevant SPTEs) and re-enter the 8907 - * guest to let the CPU re-execute the instruction in the hope that the 8908 - * CPU can cleanly execute the instruction that KVM failed to emulate. 8909 - */ 8910 - if (vcpu->kvm->arch.indirect_shadow_pages) 8911 - kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)); 8912 8874 8913 8875 /* 8914 8876 * If the failed instruction faulted on an access to page tables that ··· 8873 8929 * then zap the SPTE to unprotect the gfn, and then do it all over 8874 8930 * again. Report the error to userspace. 
8875 8931 */ 8876 - return !(emulation_type & EMULTYPE_WRITE_PF_TO_SP); 8877 - } 8878 - 8879 - static bool retry_instruction(struct x86_emulate_ctxt *ctxt, 8880 - gpa_t cr2_or_gpa, int emulation_type) 8881 - { 8882 - struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); 8883 - unsigned long last_retry_eip, last_retry_addr, gpa = cr2_or_gpa; 8884 - 8885 - last_retry_eip = vcpu->arch.last_retry_eip; 8886 - last_retry_addr = vcpu->arch.last_retry_addr; 8932 + if (emulation_type & EMULTYPE_WRITE_PF_TO_SP) 8933 + return false; 8887 8934 8888 8935 /* 8889 - * If the emulation is caused by #PF and it is non-page_table 8890 - * writing instruction, it means the VM-EXIT is caused by shadow 8891 - * page protected, we can zap the shadow page and retry this 8892 - * instruction directly. 8893 - * 8894 - * Note: if the guest uses a non-page-table modifying instruction 8895 - * on the PDE that points to the instruction, then we will unmap 8896 - * the instruction and go to an infinite loop. So, we cache the 8897 - * last retried eip and the last fault address, if we meet the eip 8898 - * and the address again, we can break out of the potential infinite 8899 - * loop. 8936 + * If emulation may have been triggered by a write to a shadowed page 8937 + * table, unprotect the gfn (zap any relevant SPTEs) and re-enter the 8938 + * guest to let the CPU re-execute the instruction in the hope that the 8939 + * CPU can cleanly execute the instruction that KVM failed to emulate. 8900 8940 */ 8901 - vcpu->arch.last_retry_eip = vcpu->arch.last_retry_addr = 0; 8941 + __kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa, true); 8902 8942 8903 - if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF)) 8904 - return false; 8905 - 8906 - if (WARN_ON_ONCE(is_guest_mode(vcpu)) || 8907 - WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF))) 8908 - return false; 8909 - 8910 - if (x86_page_table_writing_insn(ctxt)) 8911 - return false; 8912 - 8913 - if (ctxt->eip == last_retry_eip && last_retry_addr == cr2_or_gpa) 8914 - return false; 8915 - 8916 - vcpu->arch.last_retry_eip = ctxt->eip; 8917 - vcpu->arch.last_retry_addr = cr2_or_gpa; 8918 - 8919 - if (!vcpu->arch.mmu->root_role.direct) 8920 - gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL); 8921 - 8922 - kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)); 8923 - 8943 + /* 8944 + * Retry even if _this_ vCPU didn't unprotect the gfn, as it's possible 8945 + * all SPTEs were already zapped by a different task. The alternative 8946 + * is to report the error to userspace and likely terminate the guest, 8947 + * and the last_retry_{eip,addr} checks will prevent retrying the page 8948 + * fault indefinitely, i.e. there's nothing to lose by retrying. 
8949 + */ 8924 8950 return true; 8925 8951 } 8926 8952 ··· 9090 9176 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; 9091 9177 bool writeback = true; 9092 9178 9179 + if ((emulation_type & EMULTYPE_ALLOW_RETRY_PF) && 9180 + (WARN_ON_ONCE(is_guest_mode(vcpu)) || 9181 + WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF)))) 9182 + emulation_type &= ~EMULTYPE_ALLOW_RETRY_PF; 9183 + 9093 9184 r = kvm_check_emulate_insn(vcpu, emulation_type, insn, insn_len); 9094 9185 if (r != X86EMUL_CONTINUE) { 9095 9186 if (r == X86EMUL_RETRY_INSTR || r == X86EMUL_PROPAGATE_FAULT) ··· 9125 9206 kvm_queue_exception(vcpu, UD_VECTOR); 9126 9207 return 1; 9127 9208 } 9128 - if (reexecute_instruction(vcpu, cr2_or_gpa, 9129 - emulation_type)) 9209 + if (kvm_unprotect_and_retry_on_failure(vcpu, cr2_or_gpa, 9210 + emulation_type)) 9130 9211 return 1; 9131 9212 9132 9213 if (ctxt->have_exception && ··· 9173 9254 return 1; 9174 9255 } 9175 9256 9176 - if (retry_instruction(ctxt, cr2_or_gpa, emulation_type)) 9257 + /* 9258 + * If emulation was caused by a write-protection #PF on a non-page_table 9259 + * writing instruction, try to unprotect the gfn, i.e. zap shadow pages, 9260 + * and retry the instruction, as the vCPU is likely no longer using the 9261 + * gfn as a page table. 9262 + */ 9263 + if ((emulation_type & EMULTYPE_ALLOW_RETRY_PF) && 9264 + !x86_page_table_writing_insn(ctxt) && 9265 + kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa)) 9177 9266 return 1; 9178 9267 9179 9268 /* this is needed for vmware backdoor interface to work since it ··· 9212 9285 return 1; 9213 9286 9214 9287 if (r == EMULATION_FAILED) { 9215 - if (reexecute_instruction(vcpu, cr2_or_gpa, emulation_type)) 9288 + if (kvm_unprotect_and_retry_on_failure(vcpu, cr2_or_gpa, 9289 + emulation_type)) 9216 9290 return 1; 9217 9291 9218 9292 return handle_emulation_failure(vcpu, emulation_type); ··· 9681 9753 9682 9754 guard(mutex)(&vendor_module_lock); 9683 9755 9684 - if (kvm_x86_ops.hardware_enable) { 9756 + if (kvm_x86_ops.enable_virtualization_cpu) { 9685 9757 pr_err("already loaded vendor module '%s'\n", kvm_x86_ops.name); 9686 9758 return -EEXIST; 9687 9759 } ··· 9808 9880 return 0; 9809 9881 9810 9882 out_unwind_ops: 9811 - kvm_x86_ops.hardware_enable = NULL; 9883 + kvm_x86_ops.enable_virtualization_cpu = NULL; 9812 9884 kvm_x86_call(hardware_unsetup)(); 9813 9885 out_mmu_exit: 9814 9886 kvm_mmu_vendor_module_exit(); ··· 9849 9921 WARN_ON(static_branch_unlikely(&kvm_xen_enabled.key)); 9850 9922 #endif 9851 9923 mutex_lock(&vendor_module_lock); 9852 - kvm_x86_ops.hardware_enable = NULL; 9924 + kvm_x86_ops.enable_virtualization_cpu = NULL; 9853 9925 mutex_unlock(&vendor_module_lock); 9854 9926 } 9855 9927 EXPORT_SYMBOL_GPL(kvm_x86_vendor_exit); 9856 - 9857 - static int __kvm_emulate_halt(struct kvm_vcpu *vcpu, int state, int reason) 9858 - { 9859 - /* 9860 - * The vCPU has halted, e.g. executed HLT. Update the run state if the 9861 - * local APIC is in-kernel, the run loop will detect the non-runnable 9862 - * state and halt the vCPU. Exit to userspace if the local APIC is 9863 - * managed by userspace, in which case userspace is responsible for 9864 - * handling wake events. 
9865 - */ 9866 - ++vcpu->stat.halt_exits; 9867 - if (lapic_in_kernel(vcpu)) { 9868 - vcpu->arch.mp_state = state; 9869 - return 1; 9870 - } else { 9871 - vcpu->run->exit_reason = reason; 9872 - return 0; 9873 - } 9874 - } 9875 - 9876 - int kvm_emulate_halt_noskip(struct kvm_vcpu *vcpu) 9877 - { 9878 - return __kvm_emulate_halt(vcpu, KVM_MP_STATE_HALTED, KVM_EXIT_HLT); 9879 - } 9880 - EXPORT_SYMBOL_GPL(kvm_emulate_halt_noskip); 9881 - 9882 - int kvm_emulate_halt(struct kvm_vcpu *vcpu) 9883 - { 9884 - int ret = kvm_skip_emulated_instruction(vcpu); 9885 - /* 9886 - * TODO: we might be squashing a GUESTDBG_SINGLESTEP-triggered 9887 - * KVM_EXIT_DEBUG here. 9888 - */ 9889 - return kvm_emulate_halt_noskip(vcpu) && ret; 9890 - } 9891 - EXPORT_SYMBOL_GPL(kvm_emulate_halt); 9892 - 9893 - int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu) 9894 - { 9895 - int ret = kvm_skip_emulated_instruction(vcpu); 9896 - 9897 - return __kvm_emulate_halt(vcpu, KVM_MP_STATE_AP_RESET_HOLD, 9898 - KVM_EXIT_AP_RESET_HOLD) && ret; 9899 - } 9900 - EXPORT_SYMBOL_GPL(kvm_emulate_ap_reset_hold); 9901 9928 9902 9929 #ifdef CONFIG_X86_64 9903 9930 static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr, ··· 11090 11207 if (vcpu->arch.apic_attention) 11091 11208 kvm_lapic_sync_from_vapic(vcpu); 11092 11209 11210 + if (unlikely(exit_fastpath == EXIT_FASTPATH_EXIT_USERSPACE)) 11211 + return 0; 11212 + 11093 11213 r = kvm_x86_call(handle_exit)(vcpu, exit_fastpath); 11094 11214 return r; 11095 11215 ··· 11104 11218 kvm_lapic_sync_from_vapic(vcpu); 11105 11219 out: 11106 11220 return r; 11221 + } 11222 + 11223 + static bool kvm_vcpu_running(struct kvm_vcpu *vcpu) 11224 + { 11225 + return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && 11226 + !vcpu->arch.apf.halted); 11227 + } 11228 + 11229 + static bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) 11230 + { 11231 + if (!list_empty_careful(&vcpu->async_pf.done)) 11232 + return true; 11233 + 11234 + if (kvm_apic_has_pending_init_or_sipi(vcpu) && 11235 + kvm_apic_init_sipi_allowed(vcpu)) 11236 + return true; 11237 + 11238 + if (vcpu->arch.pv.pv_unhalted) 11239 + return true; 11240 + 11241 + if (kvm_is_exception_pending(vcpu)) 11242 + return true; 11243 + 11244 + if (kvm_test_request(KVM_REQ_NMI, vcpu) || 11245 + (vcpu->arch.nmi_pending && 11246 + kvm_x86_call(nmi_allowed)(vcpu, false))) 11247 + return true; 11248 + 11249 + #ifdef CONFIG_KVM_SMM 11250 + if (kvm_test_request(KVM_REQ_SMI, vcpu) || 11251 + (vcpu->arch.smi_pending && 11252 + kvm_x86_call(smi_allowed)(vcpu, false))) 11253 + return true; 11254 + #endif 11255 + 11256 + if (kvm_test_request(KVM_REQ_PMI, vcpu)) 11257 + return true; 11258 + 11259 + if (kvm_test_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, vcpu)) 11260 + return true; 11261 + 11262 + if (kvm_arch_interrupt_allowed(vcpu) && kvm_cpu_has_interrupt(vcpu)) 11263 + return true; 11264 + 11265 + if (kvm_hv_has_stimer_pending(vcpu)) 11266 + return true; 11267 + 11268 + if (is_guest_mode(vcpu) && 11269 + kvm_x86_ops.nested_ops->has_events && 11270 + kvm_x86_ops.nested_ops->has_events(vcpu, false)) 11271 + return true; 11272 + 11273 + if (kvm_xen_has_pending_events(vcpu)) 11274 + return true; 11275 + 11276 + return false; 11277 + } 11278 + 11279 + int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) 11280 + { 11281 + return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu); 11107 11282 } 11108 11283 11109 11284 /* Called within kvm->srcu read side. 
*/ ··· 11238 11291 return 1; 11239 11292 } 11240 11293 11241 - static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu) 11242 - { 11243 - return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && 11244 - !vcpu->arch.apf.halted); 11245 - } 11246 - 11247 11294 /* Called within kvm->srcu read side. */ 11248 11295 static int vcpu_run(struct kvm_vcpu *vcpu) 11249 11296 { ··· 11287 11346 } 11288 11347 11289 11348 return r; 11349 + } 11350 + 11351 + static int __kvm_emulate_halt(struct kvm_vcpu *vcpu, int state, int reason) 11352 + { 11353 + /* 11354 + * The vCPU has halted, e.g. executed HLT. Update the run state if the 11355 + * local APIC is in-kernel, the run loop will detect the non-runnable 11356 + * state and halt the vCPU. Exit to userspace if the local APIC is 11357 + * managed by userspace, in which case userspace is responsible for 11358 + * handling wake events. 11359 + */ 11360 + ++vcpu->stat.halt_exits; 11361 + if (lapic_in_kernel(vcpu)) { 11362 + if (kvm_vcpu_has_events(vcpu)) 11363 + vcpu->arch.pv.pv_unhalted = false; 11364 + else 11365 + vcpu->arch.mp_state = state; 11366 + return 1; 11367 + } else { 11368 + vcpu->run->exit_reason = reason; 11369 + return 0; 11370 + } 11371 + } 11372 + 11373 + int kvm_emulate_halt_noskip(struct kvm_vcpu *vcpu) 11374 + { 11375 + return __kvm_emulate_halt(vcpu, KVM_MP_STATE_HALTED, KVM_EXIT_HLT); 11376 + } 11377 + EXPORT_SYMBOL_GPL(kvm_emulate_halt_noskip); 11378 + 11379 + int kvm_emulate_halt(struct kvm_vcpu *vcpu) 11380 + { 11381 + int ret = kvm_skip_emulated_instruction(vcpu); 11382 + /* 11383 + * TODO: we might be squashing a GUESTDBG_SINGLESTEP-triggered 11384 + * KVM_EXIT_DEBUG here. 11385 + */ 11386 + return kvm_emulate_halt_noskip(vcpu) && ret; 11387 + } 11388 + EXPORT_SYMBOL_GPL(kvm_emulate_halt); 11389 + 11390 + fastpath_t handle_fastpath_hlt(struct kvm_vcpu *vcpu) 11391 + { 11392 + int ret; 11393 + 11394 + kvm_vcpu_srcu_read_lock(vcpu); 11395 + ret = kvm_emulate_halt(vcpu); 11396 + kvm_vcpu_srcu_read_unlock(vcpu); 11397 + 11398 + if (!ret) 11399 + return EXIT_FASTPATH_EXIT_USERSPACE; 11400 + 11401 + if (kvm_vcpu_running(vcpu)) 11402 + return EXIT_FASTPATH_REENTER_GUEST; 11403 + 11404 + return EXIT_FASTPATH_EXIT_HANDLED; 11405 + } 11406 + EXPORT_SYMBOL_GPL(handle_fastpath_hlt); 11407 + 11408 + int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu) 11409 + { 11410 + int ret = kvm_skip_emulated_instruction(vcpu); 11411 + 11412 + return __kvm_emulate_halt(vcpu, KVM_MP_STATE_AP_RESET_HOLD, 11413 + KVM_EXIT_AP_RESET_HOLD) && ret; 11414 + } 11415 + EXPORT_SYMBOL_GPL(kvm_emulate_ap_reset_hold); 11416 + 11417 + bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu) 11418 + { 11419 + return kvm_vcpu_apicv_active(vcpu) && 11420 + kvm_x86_call(dy_apicv_has_pending_interrupt)(vcpu); 11421 + } 11422 + 11423 + bool kvm_arch_vcpu_preempted_in_kernel(struct kvm_vcpu *vcpu) 11424 + { 11425 + return vcpu->arch.preempted_in_kernel; 11426 + } 11427 + 11428 + bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu) 11429 + { 11430 + if (READ_ONCE(vcpu->arch.pv.pv_unhalted)) 11431 + return true; 11432 + 11433 + if (kvm_test_request(KVM_REQ_NMI, vcpu) || 11434 + #ifdef CONFIG_KVM_SMM 11435 + kvm_test_request(KVM_REQ_SMI, vcpu) || 11436 + #endif 11437 + kvm_test_request(KVM_REQ_EVENT, vcpu)) 11438 + return true; 11439 + 11440 + return kvm_arch_dy_has_pending_interrupt(vcpu); 11290 11441 } 11291 11442 11292 11443 static inline int complete_emulated_io(struct kvm_vcpu *vcpu) ··· 12297 12264 vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); 12298 12265 
vcpu->arch.reserved_gpa_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu); 12299 12266 12300 - vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT; 12301 - 12302 12267 kvm_async_pf_hash_reset(vcpu); 12303 12268 12304 12269 vcpu->arch.perf_capabilities = kvm_caps.supported_perf_cap; ··· 12462 12431 if (!init_event) { 12463 12432 vcpu->arch.smbase = 0x30000; 12464 12433 12434 + vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT; 12435 + 12465 12436 vcpu->arch.msr_misc_features_enables = 0; 12466 12437 vcpu->arch.ia32_misc_enable_msr = MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL | 12467 12438 MSR_IA32_MISC_ENABLE_BTS_UNAVAIL; ··· 12549 12516 } 12550 12517 EXPORT_SYMBOL_GPL(kvm_vcpu_deliver_sipi_vector); 12551 12518 12552 - int kvm_arch_hardware_enable(void) 12519 + void kvm_arch_enable_virtualization(void) 12520 + { 12521 + cpu_emergency_register_virt_callback(kvm_x86_ops.emergency_disable_virtualization_cpu); 12522 + } 12523 + 12524 + void kvm_arch_disable_virtualization(void) 12525 + { 12526 + cpu_emergency_unregister_virt_callback(kvm_x86_ops.emergency_disable_virtualization_cpu); 12527 + } 12528 + 12529 + int kvm_arch_enable_virtualization_cpu(void) 12553 12530 { 12554 12531 struct kvm *kvm; 12555 12532 struct kvm_vcpu *vcpu; ··· 12575 12532 if (ret) 12576 12533 return ret; 12577 12534 12578 - ret = kvm_x86_call(hardware_enable)(); 12535 + ret = kvm_x86_call(enable_virtualization_cpu)(); 12579 12536 if (ret != 0) 12580 12537 return ret; 12581 12538 ··· 12655 12612 return 0; 12656 12613 } 12657 12614 12658 - void kvm_arch_hardware_disable(void) 12615 + void kvm_arch_disable_virtualization_cpu(void) 12659 12616 { 12660 - kvm_x86_call(hardware_disable)(); 12617 + kvm_x86_call(disable_virtualization_cpu)(); 12661 12618 drop_user_return_notifiers(); 12662 12619 } 12663 12620 ··· 13203 13160 /* Free the arrays associated with the old memslot. 
*/ 13204 13161 if (change == KVM_MR_MOVE) 13205 13162 kvm_arch_free_memslot(kvm, old); 13206 - } 13207 - 13208 - static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) 13209 - { 13210 - if (!list_empty_careful(&vcpu->async_pf.done)) 13211 - return true; 13212 - 13213 - if (kvm_apic_has_pending_init_or_sipi(vcpu) && 13214 - kvm_apic_init_sipi_allowed(vcpu)) 13215 - return true; 13216 - 13217 - if (vcpu->arch.pv.pv_unhalted) 13218 - return true; 13219 - 13220 - if (kvm_is_exception_pending(vcpu)) 13221 - return true; 13222 - 13223 - if (kvm_test_request(KVM_REQ_NMI, vcpu) || 13224 - (vcpu->arch.nmi_pending && 13225 - kvm_x86_call(nmi_allowed)(vcpu, false))) 13226 - return true; 13227 - 13228 - #ifdef CONFIG_KVM_SMM 13229 - if (kvm_test_request(KVM_REQ_SMI, vcpu) || 13230 - (vcpu->arch.smi_pending && 13231 - kvm_x86_call(smi_allowed)(vcpu, false))) 13232 - return true; 13233 - #endif 13234 - 13235 - if (kvm_test_request(KVM_REQ_PMI, vcpu)) 13236 - return true; 13237 - 13238 - if (kvm_test_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, vcpu)) 13239 - return true; 13240 - 13241 - if (kvm_arch_interrupt_allowed(vcpu) && kvm_cpu_has_interrupt(vcpu)) 13242 - return true; 13243 - 13244 - if (kvm_hv_has_stimer_pending(vcpu)) 13245 - return true; 13246 - 13247 - if (is_guest_mode(vcpu) && 13248 - kvm_x86_ops.nested_ops->has_events && 13249 - kvm_x86_ops.nested_ops->has_events(vcpu, false)) 13250 - return true; 13251 - 13252 - if (kvm_xen_has_pending_events(vcpu)) 13253 - return true; 13254 - 13255 - return false; 13256 - } 13257 - 13258 - int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) 13259 - { 13260 - return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu); 13261 - } 13262 - 13263 - bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu) 13264 - { 13265 - return kvm_vcpu_apicv_active(vcpu) && 13266 - kvm_x86_call(dy_apicv_has_pending_interrupt)(vcpu); 13267 - } 13268 - 13269 - bool kvm_arch_vcpu_preempted_in_kernel(struct kvm_vcpu *vcpu) 13270 - { 13271 - return vcpu->arch.preempted_in_kernel; 13272 - } 13273 - 13274 - bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu) 13275 - { 13276 - if (READ_ONCE(vcpu->arch.pv.pv_unhalted)) 13277 - return true; 13278 - 13279 - if (kvm_test_request(KVM_REQ_NMI, vcpu) || 13280 - #ifdef CONFIG_KVM_SMM 13281 - kvm_test_request(KVM_REQ_SMI, vcpu) || 13282 - #endif 13283 - kvm_test_request(KVM_REQ_EVENT, vcpu)) 13284 - return true; 13285 - 13286 - return kvm_arch_dy_has_pending_interrupt(vcpu); 13287 13163 } 13288 13164 13289 13165 bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
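The msrs_to_save/emulated_msrs/msr_based_features arrays reworked in the x86.c hunks above are what ultimately back KVM_GET_MSR_INDEX_LIST and the system-scoped KVM_GET_MSRS. As a rough, hypothetical illustration of the userspace side of that contract (not part of this series; only documented ioctls and structures are used), a VMM typically sizes the index list with a probing call that is expected to fail with E2BIG:

	#include <fcntl.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	int main(void)
	{
		int kvm = open("/dev/kvm", O_RDWR | O_CLOEXEC);
		struct kvm_msr_list probe = { .nmsrs = 0 };
		struct kvm_msr_list *list;

		if (kvm < 0)
			return 1;

		/* First call fails with E2BIG and reports how many indices exist. */
		ioctl(kvm, KVM_GET_MSR_INDEX_LIST, &probe);

		list = calloc(1, sizeof(*list) + probe.nmsrs * sizeof(__u32));
		if (!list)
			return 1;
		list->nmsrs = probe.nmsrs;
		if (ioctl(kvm, KVM_GET_MSR_INDEX_LIST, list))
			return 1;

		/* Feature MSRs have a separate list, KVM_GET_MSR_FEATURE_INDEX_LIST. */
		printf("KVM advertises %u MSRs for save/restore\n", list->nmsrs);
		free(list);
		return 0;
	}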
+26 -5
arch/x86/kvm/x86.h
··· 103 103 return max(val, min); 104 104 } 105 105 106 - #define MSR_IA32_CR_PAT_DEFAULT 0x0007040600070406ULL 106 + #define MSR_IA32_CR_PAT_DEFAULT \ 107 + PAT_VALUE(WB, WT, UC_MINUS, UC, WB, WT, UC_MINUS, UC) 107 108 108 109 void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu); 109 110 int kvm_check_nested_events(struct kvm_vcpu *vcpu); 111 + 112 + /* Forcibly leave the nested mode in cases like a vCPU reset */ 113 + static inline void kvm_leave_nested(struct kvm_vcpu *vcpu) 114 + { 115 + kvm_x86_ops.nested_ops->leave_nested(vcpu); 116 + } 110 117 111 118 static inline bool kvm_vcpu_has_run(struct kvm_vcpu *vcpu) 112 119 { ··· 341 334 int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, 342 335 int emulation_type, void *insn, int insn_len); 343 336 fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu); 337 + fastpath_t handle_fastpath_hlt(struct kvm_vcpu *vcpu); 344 338 345 339 extern struct kvm_caps kvm_caps; 346 340 extern struct kvm_host_values kvm_host; ··· 512 504 int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva); 513 505 bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type); 514 506 507 + enum kvm_msr_access { 508 + MSR_TYPE_R = BIT(0), 509 + MSR_TYPE_W = BIT(1), 510 + MSR_TYPE_RW = MSR_TYPE_R | MSR_TYPE_W, 511 + }; 512 + 515 513 /* 516 514 * Internal error codes that are used to indicate that MSR emulation encountered 517 - * an error that should result in #GP in the guest, unless userspace 518 - * handles it. 515 + * an error that should result in #GP in the guest, unless userspace handles it. 516 + * Note, '1', '0', and negative numbers are off limits, as they are used by KVM 517 + * as part of KVM's lightly documented internal KVM_RUN return codes. 518 + * 519 + * UNSUPPORTED - The MSR isn't supported, either because it is completely 520 + * unknown to KVM, or because the MSR should not exist according 521 + * to the vCPU model. 522 + * 523 + * FILTERED - Access to the MSR is denied by a userspace MSR filter. 519 524 */ 520 - #define KVM_MSR_RET_INVALID 2 /* in-kernel MSR emulation #GP condition */ 521 - #define KVM_MSR_RET_FILTERED 3 /* #GP due to userspace MSR filter */ 525 + #define KVM_MSR_RET_UNSUPPORTED 2 526 + #define KVM_MSR_RET_FILTERED 3 522 527 523 528 #define __cr4_reserved_bits(__cpu_has, __c) \ 524 529 ({ \
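When these internal codes reach the user-exit path, kvm_msr_reason() in the x86.c hunk above translates KVM_MSR_RET_UNSUPPORTED and KVM_MSR_RET_FILTERED into KVM_MSR_EXIT_REASON_UNKNOWN and KVM_MSR_EXIT_REASON_FILTER respectively. A minimal, hypothetical sketch of the VMM side of that exit, assuming userspace opted in via KVM_CAP_X86_USER_SPACE_MSR with those reason bits enabled (the handler policy shown here is an arbitrary example, not anything prescribed by this series):

	#include <linux/kvm.h>

	static void handle_user_msr_exit(struct kvm_run *run)
	{
		switch (run->exit_reason) {
		case KVM_EXIT_X86_RDMSR:
			/* run->msr.reason says why KVM punted: UNKNOWN vs. FILTER. */
			run->msr.data  = 0;
			run->msr.error = 0;	/* non-zero injects #GP on re-entry */
			break;
		case KVM_EXIT_X86_WRMSR:
			run->msr.error = 0;	/* silently swallow the write */
			break;
		}
	}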
+10 -26
arch/x86/mm/pat/memtype.c
··· 176 176 } 177 177 #endif 178 178 179 - enum { 180 - PAT_UC = 0, /* uncached */ 181 - PAT_WC = 1, /* Write combining */ 182 - PAT_WT = 4, /* Write Through */ 183 - PAT_WP = 5, /* Write Protected */ 184 - PAT_WB = 6, /* Write Back (default) */ 185 - PAT_UC_MINUS = 7, /* UC, but can be overridden by MTRR */ 186 - }; 187 - 188 179 #define CM(c) (_PAGE_CACHE_MODE_ ## c) 189 180 190 181 static enum page_cache_mode __init pat_get_cache_mode(unsigned int pat_val, ··· 185 194 char *cache_mode; 186 195 187 196 switch (pat_val) { 188 - case PAT_UC: cache = CM(UC); cache_mode = "UC "; break; 189 - case PAT_WC: cache = CM(WC); cache_mode = "WC "; break; 190 - case PAT_WT: cache = CM(WT); cache_mode = "WT "; break; 191 - case PAT_WP: cache = CM(WP); cache_mode = "WP "; break; 192 - case PAT_WB: cache = CM(WB); cache_mode = "WB "; break; 193 - case PAT_UC_MINUS: cache = CM(UC_MINUS); cache_mode = "UC- "; break; 194 - default: cache = CM(WB); cache_mode = "WB "; break; 197 + case X86_MEMTYPE_UC: cache = CM(UC); cache_mode = "UC "; break; 198 + case X86_MEMTYPE_WC: cache = CM(WC); cache_mode = "WC "; break; 199 + case X86_MEMTYPE_WT: cache = CM(WT); cache_mode = "WT "; break; 200 + case X86_MEMTYPE_WP: cache = CM(WP); cache_mode = "WP "; break; 201 + case X86_MEMTYPE_WB: cache = CM(WB); cache_mode = "WB "; break; 202 + case X86_MEMTYPE_UC_MINUS: cache = CM(UC_MINUS); cache_mode = "UC- "; break; 203 + default: cache = CM(WB); cache_mode = "WB "; break; 195 204 } 196 205 197 206 memcpy(msg, cache_mode, 4); ··· 248 257 void __init pat_bp_init(void) 249 258 { 250 259 struct cpuinfo_x86 *c = &boot_cpu_data; 251 - #define PAT(p0, p1, p2, p3, p4, p5, p6, p7) \ 252 - (((u64)PAT_ ## p0) | ((u64)PAT_ ## p1 << 8) | \ 253 - ((u64)PAT_ ## p2 << 16) | ((u64)PAT_ ## p3 << 24) | \ 254 - ((u64)PAT_ ## p4 << 32) | ((u64)PAT_ ## p5 << 40) | \ 255 - ((u64)PAT_ ## p6 << 48) | ((u64)PAT_ ## p7 << 56)) 256 - 257 260 258 261 if (!IS_ENABLED(CONFIG_X86_PAT)) 259 262 pr_info_once("x86/PAT: PAT support disabled because CONFIG_X86_PAT is disabled in the kernel.\n"); ··· 278 293 * NOTE: When WC or WP is used, it is redirected to UC- per 279 294 * the default setup in __cachemode2pte_tbl[]. 280 295 */ 281 - pat_msr_val = PAT(WB, WT, UC_MINUS, UC, WB, WT, UC_MINUS, UC); 296 + pat_msr_val = PAT_VALUE(WB, WT, UC_MINUS, UC, WB, WT, UC_MINUS, UC); 282 297 } 283 298 284 299 /* ··· 313 328 * NOTE: When WT or WP is used, it is redirected to UC- per 314 329 * the default setup in __cachemode2pte_tbl[]. 315 330 */ 316 - pat_msr_val = PAT(WB, WC, UC_MINUS, UC, WB, WC, UC_MINUS, UC); 331 + pat_msr_val = PAT_VALUE(WB, WC, UC_MINUS, UC, WB, WC, UC_MINUS, UC); 317 332 } else { 318 333 /* 319 334 * Full PAT support. We put WT in slot 7 to improve ··· 341 356 * The reserved slots are unused, but mapped to their 342 357 * corresponding types in the presence of PAT errata. 343 358 */ 344 - pat_msr_val = PAT(WB, WC, UC_MINUS, UC, WB, WP, UC_MINUS, WT); 359 + pat_msr_val = PAT_VALUE(WB, WC, UC_MINUS, UC, WB, WP, UC_MINUS, WT); 345 360 } 346 361 347 362 memory_caching_control |= CACHE_PAT; 348 363 349 364 init_cache_modes(pat_msr_val); 350 - #undef PAT 351 365 } 352 366 353 367 static DEFINE_SPINLOCK(memtype_lock); /* protects memtype accesses */
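The PAT_VALUE() macro and X86_MEMTYPE_* encodings used above come from a shared header that is not part of this hunk; the open-coded PAT() macro and PAT_* enum they replace are deleted here. A standalone re-derivation (a sketch, not kernel code, with the encodings copied from the deleted enum) shows that the macro form reproduces the old MSR_IA32_CR_PAT_DEFAULT literal of 0x0007040600070406ULL removed from x86.h:

	#include <stdint.h>
	#include <stdio.h>

	/* Memory-type encodings, mirroring the PAT_* enum deleted above. */
	enum { UC = 0, WC = 1, WT = 4, WP = 5, WB = 6, UC_MINUS = 7 };

	/* One byte per PAT entry, entry 0 in the low byte. */
	#define PAT_VALUE(p0, p1, p2, p3, p4, p5, p6, p7)		\
		((uint64_t)(p0)       | (uint64_t)(p1) << 8  |		\
		 (uint64_t)(p2) << 16 | (uint64_t)(p3) << 24 |		\
		 (uint64_t)(p4) << 32 | (uint64_t)(p5) << 40 |		\
		 (uint64_t)(p6) << 48 | (uint64_t)(p7) << 56)

	int main(void)
	{
		/* Prints 0x0007040600070406, the power-on PAT layout. */
		printf("%#018llx\n",
		       (unsigned long long)PAT_VALUE(WB, WT, UC_MINUS, UC,
						     WB, WT, UC_MINUS, UC));
		return 0;
	}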
+16 -2
include/linux/kvm_host.h
··· 1529 1529 #endif 1530 1530 1531 1531 #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING 1532 - int kvm_arch_hardware_enable(void); 1533 - void kvm_arch_hardware_disable(void); 1532 + /* 1533 + * kvm_arch_{enable,disable}_virtualization() are called on one CPU, under 1534 + * kvm_usage_lock, immediately after/before 0=>1 and 1=>0 transitions of 1535 + * kvm_usage_count, i.e. at the beginning of the generic hardware enabling 1536 + * sequence, and at the end of the generic hardware disabling sequence. 1537 + */ 1538 + void kvm_arch_enable_virtualization(void); 1539 + void kvm_arch_disable_virtualization(void); 1540 + /* 1541 + * kvm_arch_{enable,disable}_virtualization_cpu() are called on "every" CPU to 1542 + * do the actual twiddling of hardware bits. The hooks are called on all 1543 + * online CPUs when KVM enables/disabled virtualization, and on a single CPU 1544 + * when that CPU is onlined/offlined (including for Resume/Suspend). 1545 + */ 1546 + int kvm_arch_enable_virtualization_cpu(void); 1547 + void kvm_arch_disable_virtualization_cpu(void); 1534 1548 #endif 1535 1549 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu); 1536 1550 bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu);
+4
tools/testing/selftests/kvm/.gitignore
··· 5 5 !*.h 6 6 !*.S 7 7 !*.sh 8 + !.gitignore 9 + !config 10 + !settings 11 + !Makefile
+4
tools/testing/selftests/kvm/Makefile
··· 130 130 TEST_GEN_PROGS_x86_64 += x86_64/triple_fault_event_test 131 131 TEST_GEN_PROGS_x86_64 += x86_64/recalc_apic_map_test 132 132 TEST_GEN_PROGS_x86_64 += access_tracking_perf_test 133 + TEST_GEN_PROGS_x86_64 += coalesced_io_test 133 134 TEST_GEN_PROGS_x86_64 += demand_paging_test 134 135 TEST_GEN_PROGS_x86_64 += dirty_log_test 135 136 TEST_GEN_PROGS_x86_64 += dirty_log_perf_test ··· 168 167 TEST_GEN_PROGS_aarch64 += aarch64/no-vgic-v3 169 168 TEST_GEN_PROGS_aarch64 += access_tracking_perf_test 170 169 TEST_GEN_PROGS_aarch64 += arch_timer 170 + TEST_GEN_PROGS_aarch64 += coalesced_io_test 171 171 TEST_GEN_PROGS_aarch64 += demand_paging_test 172 172 TEST_GEN_PROGS_aarch64 += dirty_log_test 173 173 TEST_GEN_PROGS_aarch64 += dirty_log_perf_test ··· 190 188 TEST_GEN_PROGS_s390x += s390x/cmma_test 191 189 TEST_GEN_PROGS_s390x += s390x/debug_test 192 190 TEST_GEN_PROGS_s390x += s390x/shared_zeropage_test 191 + TEST_GEN_PROGS_s390x += s390x/ucontrol_test 193 192 TEST_GEN_PROGS_s390x += demand_paging_test 194 193 TEST_GEN_PROGS_s390x += dirty_log_test 195 194 TEST_GEN_PROGS_s390x += guest_print_test ··· 203 200 TEST_GEN_PROGS_riscv += riscv/sbi_pmu_test 204 201 TEST_GEN_PROGS_riscv += riscv/ebreak_test 205 202 TEST_GEN_PROGS_riscv += arch_timer 203 + TEST_GEN_PROGS_riscv += coalesced_io_test 206 204 TEST_GEN_PROGS_riscv += demand_paging_test 207 205 TEST_GEN_PROGS_riscv += dirty_log_test 208 206 TEST_GEN_PROGS_riscv += get-reg-list
+236
tools/testing/selftests/kvm/coalesced_io_test.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + #include <signal.h> 3 + #include <stdio.h> 4 + #include <stdlib.h> 5 + #include <string.h> 6 + #include <sys/ioctl.h> 7 + 8 + #include <linux/sizes.h> 9 + 10 + #include <kvm_util.h> 11 + #include <processor.h> 12 + 13 + #include "ucall_common.h" 14 + 15 + struct kvm_coalesced_io { 16 + struct kvm_coalesced_mmio_ring *ring; 17 + uint32_t ring_size; 18 + uint64_t mmio_gpa; 19 + uint64_t *mmio; 20 + 21 + /* 22 + * x86-only, but define pio_port for all architectures to minimize the 23 + * amount of #ifdeffery and complexity, without having to sacrifice 24 + * verbose error messages. 25 + */ 26 + uint8_t pio_port; 27 + }; 28 + 29 + static struct kvm_coalesced_io kvm_builtin_io_ring; 30 + 31 + #ifdef __x86_64__ 32 + static const int has_pio = 1; 33 + #else 34 + static const int has_pio = 0; 35 + #endif 36 + 37 + static void guest_code(struct kvm_coalesced_io *io) 38 + { 39 + int i, j; 40 + 41 + for (;;) { 42 + for (j = 0; j < 1 + has_pio; j++) { 43 + /* 44 + * KVM always leaves one free entry, i.e. exits to 45 + * userspace before the last entry is filled. 46 + */ 47 + for (i = 0; i < io->ring_size - 1; i++) { 48 + #ifdef __x86_64__ 49 + if (i & 1) 50 + outl(io->pio_port, io->pio_port + i); 51 + else 52 + #endif 53 + WRITE_ONCE(*io->mmio, io->mmio_gpa + i); 54 + } 55 + #ifdef __x86_64__ 56 + if (j & 1) 57 + outl(io->pio_port, io->pio_port + i); 58 + else 59 + #endif 60 + WRITE_ONCE(*io->mmio, io->mmio_gpa + i); 61 + } 62 + GUEST_SYNC(0); 63 + 64 + WRITE_ONCE(*io->mmio, io->mmio_gpa + i); 65 + #ifdef __x86_64__ 66 + outl(io->pio_port, io->pio_port + i); 67 + #endif 68 + } 69 + } 70 + 71 + static void vcpu_run_and_verify_io_exit(struct kvm_vcpu *vcpu, 72 + struct kvm_coalesced_io *io, 73 + uint32_t ring_start, 74 + uint32_t expected_exit) 75 + { 76 + const bool want_pio = expected_exit == KVM_EXIT_IO; 77 + struct kvm_coalesced_mmio_ring *ring = io->ring; 78 + struct kvm_run *run = vcpu->run; 79 + uint32_t pio_value; 80 + 81 + WRITE_ONCE(ring->first, ring_start); 82 + WRITE_ONCE(ring->last, ring_start); 83 + 84 + vcpu_run(vcpu); 85 + 86 + /* 87 + * Annoyingly, reading PIO data is safe only for PIO exits, otherwise 88 + * data_offset is garbage, e.g. an MMIO gpa. 89 + */ 90 + if (run->exit_reason == KVM_EXIT_IO) 91 + pio_value = *(uint32_t *)((void *)run + run->io.data_offset); 92 + else 93 + pio_value = 0; 94 + 95 + TEST_ASSERT((!want_pio && (run->exit_reason == KVM_EXIT_MMIO && run->mmio.is_write && 96 + run->mmio.phys_addr == io->mmio_gpa && run->mmio.len == 8 && 97 + *(uint64_t *)run->mmio.data == io->mmio_gpa + io->ring_size - 1)) || 98 + (want_pio && (run->exit_reason == KVM_EXIT_IO && run->io.port == io->pio_port && 99 + run->io.direction == KVM_EXIT_IO_OUT && run->io.count == 1 && 100 + pio_value == io->pio_port + io->ring_size - 1)), 101 + "For start = %u, expected exit on %u-byte %s write 0x%llx = %lx, got exit_reason = %u (%s)\n " 102 + "(MMIO addr = 0x%llx, write = %u, len = %u, data = %lx)\n " 103 + "(PIO port = 0x%x, write = %u, len = %u, count = %u, data = %x", 104 + ring_start, want_pio ? 4 : 8, want_pio ? "PIO" : "MMIO", 105 + want_pio ? (unsigned long long)io->pio_port : io->mmio_gpa, 106 + (want_pio ? io->pio_port : io->mmio_gpa) + io->ring_size - 1, run->exit_reason, 107 + run->exit_reason == KVM_EXIT_MMIO ? "MMIO" : run->exit_reason == KVM_EXIT_IO ? 
"PIO" : "other", 108 + run->mmio.phys_addr, run->mmio.is_write, run->mmio.len, *(uint64_t *)run->mmio.data, 109 + run->io.port, run->io.direction, run->io.size, run->io.count, pio_value); 110 + } 111 + 112 + static void vcpu_run_and_verify_coalesced_io(struct kvm_vcpu *vcpu, 113 + struct kvm_coalesced_io *io, 114 + uint32_t ring_start, 115 + uint32_t expected_exit) 116 + { 117 + struct kvm_coalesced_mmio_ring *ring = io->ring; 118 + int i; 119 + 120 + vcpu_run_and_verify_io_exit(vcpu, io, ring_start, expected_exit); 121 + 122 + TEST_ASSERT((ring->last + 1) % io->ring_size == ring->first, 123 + "Expected ring to be full (minus 1), first = %u, last = %u, max = %u, start = %u", 124 + ring->first, ring->last, io->ring_size, ring_start); 125 + 126 + for (i = 0; i < io->ring_size - 1; i++) { 127 + uint32_t idx = (ring->first + i) % io->ring_size; 128 + struct kvm_coalesced_mmio *entry = &ring->coalesced_mmio[idx]; 129 + 130 + #ifdef __x86_64__ 131 + if (i & 1) 132 + TEST_ASSERT(entry->phys_addr == io->pio_port && 133 + entry->len == 4 && entry->pio && 134 + *(uint32_t *)entry->data == io->pio_port + i, 135 + "Wanted 4-byte port I/O 0x%x = 0x%x in entry %u, got %u-byte %s 0x%llx = 0x%x", 136 + io->pio_port, io->pio_port + i, i, 137 + entry->len, entry->pio ? "PIO" : "MMIO", 138 + entry->phys_addr, *(uint32_t *)entry->data); 139 + else 140 + #endif 141 + TEST_ASSERT(entry->phys_addr == io->mmio_gpa && 142 + entry->len == 8 && !entry->pio, 143 + "Wanted 8-byte MMIO to 0x%lx = %lx in entry %u, got %u-byte %s 0x%llx = 0x%lx", 144 + io->mmio_gpa, io->mmio_gpa + i, i, 145 + entry->len, entry->pio ? "PIO" : "MMIO", 146 + entry->phys_addr, *(uint64_t *)entry->data); 147 + } 148 + } 149 + 150 + static void test_coalesced_io(struct kvm_vcpu *vcpu, 151 + struct kvm_coalesced_io *io, uint32_t ring_start) 152 + { 153 + struct kvm_coalesced_mmio_ring *ring = io->ring; 154 + 155 + kvm_vm_register_coalesced_io(vcpu->vm, io->mmio_gpa, 8, false /* pio */); 156 + #ifdef __x86_64__ 157 + kvm_vm_register_coalesced_io(vcpu->vm, io->pio_port, 8, true /* pio */); 158 + #endif 159 + 160 + vcpu_run_and_verify_coalesced_io(vcpu, io, ring_start, KVM_EXIT_MMIO); 161 + #ifdef __x86_64__ 162 + vcpu_run_and_verify_coalesced_io(vcpu, io, ring_start, KVM_EXIT_IO); 163 + #endif 164 + 165 + /* 166 + * Verify ucall, which may use non-coalesced MMIO or PIO, generates an 167 + * immediate exit. 168 + */ 169 + WRITE_ONCE(ring->first, ring_start); 170 + WRITE_ONCE(ring->last, ring_start); 171 + vcpu_run(vcpu); 172 + TEST_ASSERT_EQ(get_ucall(vcpu, NULL), UCALL_SYNC); 173 + TEST_ASSERT_EQ(ring->first, ring_start); 174 + TEST_ASSERT_EQ(ring->last, ring_start); 175 + 176 + /* Verify that non-coalesced MMIO/PIO generates an exit to userspace. 
*/ 177 + kvm_vm_unregister_coalesced_io(vcpu->vm, io->mmio_gpa, 8, false /* pio */); 178 + vcpu_run_and_verify_io_exit(vcpu, io, ring_start, KVM_EXIT_MMIO); 179 + 180 + #ifdef __x86_64__ 181 + kvm_vm_unregister_coalesced_io(vcpu->vm, io->pio_port, 8, true /* pio */); 182 + vcpu_run_and_verify_io_exit(vcpu, io, ring_start, KVM_EXIT_IO); 183 + #endif 184 + } 185 + 186 + int main(int argc, char *argv[]) 187 + { 188 + struct kvm_vcpu *vcpu; 189 + struct kvm_vm *vm; 190 + int i; 191 + 192 + TEST_REQUIRE(kvm_has_cap(KVM_CAP_COALESCED_MMIO)); 193 + 194 + #ifdef __x86_64__ 195 + TEST_REQUIRE(kvm_has_cap(KVM_CAP_COALESCED_PIO)); 196 + #endif 197 + 198 + vm = vm_create_with_one_vcpu(&vcpu, guest_code); 199 + 200 + kvm_builtin_io_ring = (struct kvm_coalesced_io) { 201 + /* 202 + * The I/O ring is a kernel-allocated page whose address is 203 + * relative to each vCPU's run page, with the page offset 204 + * provided by KVM in the return of KVM_CAP_COALESCED_MMIO. 205 + */ 206 + .ring = (void *)vcpu->run + 207 + (kvm_check_cap(KVM_CAP_COALESCED_MMIO) * getpagesize()), 208 + 209 + /* 210 + * The size of the I/O ring is fixed, but KVM defines the sized 211 + * based on the kernel's PAGE_SIZE. Thus, userspace must query 212 + * the host's page size at runtime to compute the ring size. 213 + */ 214 + .ring_size = (getpagesize() - sizeof(struct kvm_coalesced_mmio_ring)) / 215 + sizeof(struct kvm_coalesced_mmio), 216 + 217 + /* 218 + * Arbitrary address+port (MMIO mustn't overlap memslots), with 219 + * the MMIO GPA identity mapped in the guest. 220 + */ 221 + .mmio_gpa = 4ull * SZ_1G, 222 + .mmio = (uint64_t *)(4ull * SZ_1G), 223 + .pio_port = 0x80, 224 + }; 225 + 226 + virt_map(vm, (uint64_t)kvm_builtin_io_ring.mmio, kvm_builtin_io_ring.mmio_gpa, 1); 227 + 228 + sync_global_to_guest(vm, kvm_builtin_io_ring); 229 + vcpu_args_set(vcpu, 1, &kvm_builtin_io_ring); 230 + 231 + for (i = 0; i < kvm_builtin_io_ring.ring_size; i++) 232 + test_coalesced_io(vcpu, &kvm_builtin_io_ring, i); 233 + 234 + kvm_vm_free(vm); 235 + return 0; 236 + }
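The selftest above resets ring->first and ring->last directly because it wants to provoke full-ring exits; a real consumer instead drains the ring that KVM fills. Below is a minimal sketch of that consumer loop, assuming only the documented layout (the ring lives ring_page_offset pages past the vCPU's kvm_run mapping, where ring_page_offset is the KVM_CHECK_EXTENSION(KVM_CAP_COALESCED_MMIO) return value) and with the replay step reduced to a printf; drain_coalesced_ring() is a hypothetical helper, not part of this series:

	#include <stdio.h>
	#include <unistd.h>
	#include <linux/kvm.h>

	static void drain_coalesced_ring(struct kvm_run *run, int ring_page_offset)
	{
		struct kvm_coalesced_mmio_ring *ring =
			(void *)run + ring_page_offset * getpagesize();
		__u32 max = (getpagesize() - sizeof(*ring)) /
			    sizeof(struct kvm_coalesced_mmio);

		/* The kernel produces at 'last', userspace consumes at 'first'. */
		while (ring->first != ring->last) {
			struct kvm_coalesced_mmio *e =
				&ring->coalesced_mmio[ring->first];

			/*
			 * Replay the deferred write: e->len bytes of e->data at
			 * e->phys_addr; on x86, e->pio flags port I/O vs. MMIO.
			 */
			printf("%s %#llx len=%u\n", e->pio ? "pio" : "mmio",
			       (unsigned long long)e->phys_addr, e->len);

			/* Publish consumption only after the entry has been used. */
			__atomic_store_n(&ring->first, (ring->first + 1) % max,
					 __ATOMIC_RELEASE);
		}
	}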
+17 -2
tools/testing/selftests/kvm/guest_print_test.c
··· 107 107 expected_assert_msg, &assert_msg[offset]); 108 108 } 109 109 110 + /* 111 + * Open code vcpu_run(), sans the UCALL_ABORT handling, so that intentional 112 + * guest asserts guest can be verified instead of being reported as failures. 113 + */ 114 + static void do_vcpu_run(struct kvm_vcpu *vcpu) 115 + { 116 + int r; 117 + 118 + do { 119 + r = __vcpu_run(vcpu); 120 + } while (r == -1 && errno == EINTR); 121 + 122 + TEST_ASSERT(!r, KVM_IOCTL_ERROR(KVM_RUN, r)); 123 + } 124 + 110 125 static void run_test(struct kvm_vcpu *vcpu, const char *expected_printf, 111 126 const char *expected_assert) 112 127 { ··· 129 114 struct ucall uc; 130 115 131 116 while (1) { 132 - vcpu_run(vcpu); 117 + do_vcpu_run(vcpu); 133 118 134 119 TEST_ASSERT(run->exit_reason == UCALL_EXIT_REASON, 135 120 "Unexpected exit reason: %u (%s),", ··· 174 159 175 160 vm = vm_create_with_one_vcpu(&vcpu, guest_code_limits); 176 161 run = vcpu->run; 177 - vcpu_run(vcpu); 162 + do_vcpu_run(vcpu); 178 163 179 164 TEST_ASSERT(run->exit_reason == UCALL_EXIT_REASON, 180 165 "Unexpected exit reason: %u (%s),",
+26 -2
tools/testing/selftests/kvm/include/kvm_util.h
··· 428 428 void kvm_vm_free(struct kvm_vm *vmp); 429 429 void kvm_vm_restart(struct kvm_vm *vmp); 430 430 void kvm_vm_release(struct kvm_vm *vmp); 431 - int kvm_memcmp_hva_gva(void *hva, struct kvm_vm *vm, const vm_vaddr_t gva, 432 - size_t len); 433 431 void kvm_vm_elf_load(struct kvm_vm *vm, const char *filename); 434 432 int kvm_memfd_alloc(size_t size, bool hugepages); 435 433 ··· 456 458 static inline uint32_t kvm_vm_reset_dirty_ring(struct kvm_vm *vm) 457 459 { 458 460 return __vm_ioctl(vm, KVM_RESET_DIRTY_RINGS, NULL); 461 + } 462 + 463 + static inline void kvm_vm_register_coalesced_io(struct kvm_vm *vm, 464 + uint64_t address, 465 + uint64_t size, bool pio) 466 + { 467 + struct kvm_coalesced_mmio_zone zone = { 468 + .addr = address, 469 + .size = size, 470 + .pio = pio, 471 + }; 472 + 473 + vm_ioctl(vm, KVM_REGISTER_COALESCED_MMIO, &zone); 474 + } 475 + 476 + static inline void kvm_vm_unregister_coalesced_io(struct kvm_vm *vm, 477 + uint64_t address, 478 + uint64_t size, bool pio) 479 + { 480 + struct kvm_coalesced_mmio_zone zone = { 481 + .addr = address, 482 + .size = size, 483 + .pio = pio, 484 + }; 485 + 486 + vm_ioctl(vm, KVM_UNREGISTER_COALESCED_MMIO, &zone); 459 487 } 460 488 461 489 static inline int vm_get_stats_fd(struct kvm_vm *vm)
+69
tools/testing/selftests/kvm/include/s390x/debug_print.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0-only */ 2 + /* 3 + * Definition for kernel virtual machines on s390x 4 + * 5 + * Copyright IBM Corp. 2024 6 + * 7 + * Authors: 8 + * Christoph Schlameuss <schlameuss@linux.ibm.com> 9 + */ 10 + 11 + #ifndef SELFTEST_KVM_DEBUG_PRINT_H 12 + #define SELFTEST_KVM_DEBUG_PRINT_H 13 + 14 + #include "asm/ptrace.h" 15 + #include "kvm_util.h" 16 + #include "sie.h" 17 + 18 + static inline void print_hex_bytes(const char *name, u64 addr, size_t len) 19 + { 20 + u64 pos; 21 + 22 + pr_debug("%s (%p)\n", name, (void *)addr); 23 + pr_debug(" 0/0x00---------|"); 24 + if (len > 8) 25 + pr_debug(" 8/0x08---------|"); 26 + if (len > 16) 27 + pr_debug(" 16/0x10--------|"); 28 + if (len > 24) 29 + pr_debug(" 24/0x18--------|"); 30 + for (pos = 0; pos < len; pos += 8) { 31 + if ((pos % 32) == 0) 32 + pr_debug("\n %3lu 0x%.3lx ", pos, pos); 33 + pr_debug(" %16lx", *((u64 *)(addr + pos))); 34 + } 35 + pr_debug("\n"); 36 + } 37 + 38 + static inline void print_hex(const char *name, u64 addr) 39 + { 40 + print_hex_bytes(name, addr, 512); 41 + } 42 + 43 + static inline void print_psw(struct kvm_run *run, struct kvm_s390_sie_block *sie_block) 44 + { 45 + pr_debug("flags:0x%x psw:0x%.16llx:0x%.16llx exit:%u %s\n", 46 + run->flags, 47 + run->psw_mask, run->psw_addr, 48 + run->exit_reason, exit_reason_str(run->exit_reason)); 49 + pr_debug("sie_block psw:0x%.16llx:0x%.16llx\n", 50 + sie_block->psw_mask, sie_block->psw_addr); 51 + } 52 + 53 + static inline void print_run(struct kvm_run *run, struct kvm_s390_sie_block *sie_block) 54 + { 55 + print_hex_bytes("run", (u64)run, 0x150); 56 + print_hex("sie_block", (u64)sie_block); 57 + print_psw(run, sie_block); 58 + } 59 + 60 + static inline void print_regs(struct kvm_run *run) 61 + { 62 + struct kvm_sync_regs *sync_regs = &run->s.regs; 63 + 64 + print_hex_bytes("GPRS", (u64)sync_regs->gprs, 8 * NUM_GPRS); 65 + print_hex_bytes("ACRS", (u64)sync_regs->acrs, 4 * NUM_ACRS); 66 + print_hex_bytes("CRS", (u64)sync_regs->crs, 8 * NUM_CRS); 67 + } 68 + 69 + #endif /* SELFTEST_KVM_DEBUG_PRINT_H */
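A typical call site for these helpers, as in the s390x ucontrol test added later in this series (output is only visible when pr_debug() is enabled in the selftests build):

	rc = ioctl(self->vcpu_fd, KVM_RUN, NULL);
	print_run(self->run, self->sie_block);	/* dump kvm_run + SIE block */
	print_regs(self->run);			/* dump GPRS/ACRS/CRS */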
+5
tools/testing/selftests/kvm/include/s390x/processor.h
··· 21 21 #define PAGE_PROTECT 0x200 /* HW read-only bit */ 22 22 #define PAGE_NOEXEC 0x100 /* HW no-execute bit */ 23 23 24 + /* Page size definitions */ 25 + #define PAGE_SHIFT 12 26 + #define PAGE_SIZE BIT_ULL(PAGE_SHIFT) 27 + #define PAGE_MASK (~(PAGE_SIZE - 1)) 28 + 24 29 /* Is there a portable way to do this? */ 25 30 static inline void cpu_relax(void) 26 31 {
+240
tools/testing/selftests/kvm/include/s390x/sie.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + /* 3 + * Definition for kernel virtual machines on s390. 4 + * 5 + * Adapted copy of struct definition kvm_s390_sie_block from 6 + * arch/s390/include/asm/kvm_host.h for use in userspace selftest programs. 7 + * 8 + * Copyright IBM Corp. 2008, 2024 9 + * 10 + * Authors: 11 + * Christoph Schlameuss <schlameuss@linux.ibm.com> 12 + * Carsten Otte <cotte@de.ibm.com> 13 + */ 14 + 15 + #ifndef SELFTEST_KVM_SIE_H 16 + #define SELFTEST_KVM_SIE_H 17 + 18 + #include <linux/types.h> 19 + 20 + struct kvm_s390_sie_block { 21 + #define CPUSTAT_STOPPED 0x80000000 22 + #define CPUSTAT_WAIT 0x10000000 23 + #define CPUSTAT_ECALL_PEND 0x08000000 24 + #define CPUSTAT_STOP_INT 0x04000000 25 + #define CPUSTAT_IO_INT 0x02000000 26 + #define CPUSTAT_EXT_INT 0x01000000 27 + #define CPUSTAT_RUNNING 0x00800000 28 + #define CPUSTAT_RETAINED 0x00400000 29 + #define CPUSTAT_TIMING_SUB 0x00020000 30 + #define CPUSTAT_SIE_SUB 0x00010000 31 + #define CPUSTAT_RRF 0x00008000 32 + #define CPUSTAT_SLSV 0x00004000 33 + #define CPUSTAT_SLSR 0x00002000 34 + #define CPUSTAT_ZARCH 0x00000800 35 + #define CPUSTAT_MCDS 0x00000100 36 + #define CPUSTAT_KSS 0x00000200 37 + #define CPUSTAT_SM 0x00000080 38 + #define CPUSTAT_IBS 0x00000040 39 + #define CPUSTAT_GED2 0x00000010 40 + #define CPUSTAT_G 0x00000008 41 + #define CPUSTAT_GED 0x00000004 42 + #define CPUSTAT_J 0x00000002 43 + #define CPUSTAT_P 0x00000001 44 + __u32 cpuflags; /* 0x0000 */ 45 + __u32: 1; /* 0x0004 */ 46 + __u32 prefix : 18; 47 + __u32: 1; 48 + __u32 ibc : 12; 49 + __u8 reserved08[4]; /* 0x0008 */ 50 + #define PROG_IN_SIE BIT(0) 51 + __u32 prog0c; /* 0x000c */ 52 + union { 53 + __u8 reserved10[16]; /* 0x0010 */ 54 + struct { 55 + __u64 pv_handle_cpu; 56 + __u64 pv_handle_config; 57 + }; 58 + }; 59 + #define PROG_BLOCK_SIE BIT(0) 60 + #define PROG_REQUEST BIT(1) 61 + __u32 prog20; /* 0x0020 */ 62 + __u8 reserved24[4]; /* 0x0024 */ 63 + __u64 cputm; /* 0x0028 */ 64 + __u64 ckc; /* 0x0030 */ 65 + __u64 epoch; /* 0x0038 */ 66 + __u32 svcc; /* 0x0040 */ 67 + #define LCTL_CR0 0x8000 68 + #define LCTL_CR6 0x0200 69 + #define LCTL_CR9 0x0040 70 + #define LCTL_CR10 0x0020 71 + #define LCTL_CR11 0x0010 72 + #define LCTL_CR14 0x0002 73 + __u16 lctl; /* 0x0044 */ 74 + __s16 icpua; /* 0x0046 */ 75 + #define ICTL_OPEREXC 0x80000000 76 + #define ICTL_PINT 0x20000000 77 + #define ICTL_LPSW 0x00400000 78 + #define ICTL_STCTL 0x00040000 79 + #define ICTL_ISKE 0x00004000 80 + #define ICTL_SSKE 0x00002000 81 + #define ICTL_RRBE 0x00001000 82 + #define ICTL_TPROT 0x00000200 83 + __u32 ictl; /* 0x0048 */ 84 + #define ECA_CEI 0x80000000 85 + #define ECA_IB 0x40000000 86 + #define ECA_SIGPI 0x10000000 87 + #define ECA_MVPGI 0x01000000 88 + #define ECA_AIV 0x00200000 89 + #define ECA_VX 0x00020000 90 + #define ECA_PROTEXCI 0x00002000 91 + #define ECA_APIE 0x00000008 92 + #define ECA_SII 0x00000001 93 + __u32 eca; /* 0x004c */ 94 + #define ICPT_INST 0x04 95 + #define ICPT_PROGI 0x08 96 + #define ICPT_INSTPROGI 0x0C 97 + #define ICPT_EXTREQ 0x10 98 + #define ICPT_EXTINT 0x14 99 + #define ICPT_IOREQ 0x18 100 + #define ICPT_WAIT 0x1c 101 + #define ICPT_VALIDITY 0x20 102 + #define ICPT_STOP 0x28 103 + #define ICPT_OPEREXC 0x2C 104 + #define ICPT_PARTEXEC 0x38 105 + #define ICPT_IOINST 0x40 106 + #define ICPT_KSS 0x5c 107 + #define ICPT_MCHKREQ 0x60 108 + #define ICPT_INT_ENABLE 0x64 109 + #define ICPT_PV_INSTR 0x68 110 + #define ICPT_PV_NOTIFY 0x6c 111 + #define ICPT_PV_PREF 0x70 112 + __u8 icptcode; /* 0x0050 */ 113 + __u8 icptstatus; /* 0x0051 
*/ 114 + __u16 ihcpu; /* 0x0052 */ 115 + __u8 reserved54; /* 0x0054 */ 116 + #define IICTL_CODE_NONE 0x00 117 + #define IICTL_CODE_MCHK 0x01 118 + #define IICTL_CODE_EXT 0x02 119 + #define IICTL_CODE_IO 0x03 120 + #define IICTL_CODE_RESTART 0x04 121 + #define IICTL_CODE_SPECIFICATION 0x10 122 + #define IICTL_CODE_OPERAND 0x11 123 + __u8 iictl; /* 0x0055 */ 124 + __u16 ipa; /* 0x0056 */ 125 + __u32 ipb; /* 0x0058 */ 126 + __u32 scaoh; /* 0x005c */ 127 + #define FPF_BPBC 0x20 128 + __u8 fpf; /* 0x0060 */ 129 + #define ECB_GS 0x40 130 + #define ECB_TE 0x10 131 + #define ECB_SPECI 0x08 132 + #define ECB_SRSI 0x04 133 + #define ECB_HOSTPROTINT 0x02 134 + #define ECB_PTF 0x01 135 + __u8 ecb; /* 0x0061 */ 136 + #define ECB2_CMMA 0x80 137 + #define ECB2_IEP 0x20 138 + #define ECB2_PFMFI 0x08 139 + #define ECB2_ESCA 0x04 140 + #define ECB2_ZPCI_LSI 0x02 141 + __u8 ecb2; /* 0x0062 */ 142 + #define ECB3_AISI 0x20 143 + #define ECB3_AISII 0x10 144 + #define ECB3_DEA 0x08 145 + #define ECB3_AES 0x04 146 + #define ECB3_RI 0x01 147 + __u8 ecb3; /* 0x0063 */ 148 + #define ESCA_SCAOL_MASK ~0x3fU 149 + __u32 scaol; /* 0x0064 */ 150 + __u8 sdf; /* 0x0068 */ 151 + __u8 epdx; /* 0x0069 */ 152 + __u8 cpnc; /* 0x006a */ 153 + __u8 reserved6b; /* 0x006b */ 154 + __u32 todpr; /* 0x006c */ 155 + #define GISA_FORMAT1 0x00000001 156 + __u32 gd; /* 0x0070 */ 157 + __u8 reserved74[12]; /* 0x0074 */ 158 + __u64 mso; /* 0x0080 */ 159 + __u64 msl; /* 0x0088 */ 160 + __u64 psw_mask; /* 0x0090 */ 161 + __u64 psw_addr; /* 0x0098 */ 162 + __u64 gg14; /* 0x00a0 */ 163 + __u64 gg15; /* 0x00a8 */ 164 + __u8 reservedb0[8]; /* 0x00b0 */ 165 + #define HPID_KVM 0x4 166 + #define HPID_VSIE 0x5 167 + __u8 hpid; /* 0x00b8 */ 168 + __u8 reservedb9[7]; /* 0x00b9 */ 169 + union { 170 + struct { 171 + __u32 eiparams; /* 0x00c0 */ 172 + __u16 extcpuaddr; /* 0x00c4 */ 173 + __u16 eic; /* 0x00c6 */ 174 + }; 175 + __u64 mcic; /* 0x00c0 */ 176 + } __packed; 177 + __u32 reservedc8; /* 0x00c8 */ 178 + union { 179 + struct { 180 + __u16 pgmilc; /* 0x00cc */ 181 + __u16 iprcc; /* 0x00ce */ 182 + }; 183 + __u32 edc; /* 0x00cc */ 184 + } __packed; 185 + union { 186 + struct { 187 + __u32 dxc; /* 0x00d0 */ 188 + __u16 mcn; /* 0x00d4 */ 189 + __u8 perc; /* 0x00d6 */ 190 + __u8 peratmid; /* 0x00d7 */ 191 + }; 192 + __u64 faddr; /* 0x00d0 */ 193 + } __packed; 194 + __u64 peraddr; /* 0x00d8 */ 195 + __u8 eai; /* 0x00e0 */ 196 + __u8 peraid; /* 0x00e1 */ 197 + __u8 oai; /* 0x00e2 */ 198 + __u8 armid; /* 0x00e3 */ 199 + __u8 reservede4[4]; /* 0x00e4 */ 200 + union { 201 + __u64 tecmc; /* 0x00e8 */ 202 + struct { 203 + __u16 subchannel_id; /* 0x00e8 */ 204 + __u16 subchannel_nr; /* 0x00ea */ 205 + __u32 io_int_parm; /* 0x00ec */ 206 + __u32 io_int_word; /* 0x00f0 */ 207 + }; 208 + } __packed; 209 + __u8 reservedf4[8]; /* 0x00f4 */ 210 + #define CRYCB_FORMAT_MASK 0x00000003 211 + #define CRYCB_FORMAT0 0x00000000 212 + #define CRYCB_FORMAT1 0x00000001 213 + #define CRYCB_FORMAT2 0x00000003 214 + __u32 crycbd; /* 0x00fc */ 215 + __u64 gcr[16]; /* 0x0100 */ 216 + union { 217 + __u64 gbea; /* 0x0180 */ 218 + __u64 sidad; 219 + }; 220 + __u8 reserved188[8]; /* 0x0188 */ 221 + __u64 sdnxo; /* 0x0190 */ 222 + __u8 reserved198[8]; /* 0x0198 */ 223 + __u32 fac; /* 0x01a0 */ 224 + __u8 reserved1a4[20]; /* 0x01a4 */ 225 + __u64 cbrlo; /* 0x01b8 */ 226 + __u8 reserved1c0[8]; /* 0x01c0 */ 227 + #define ECD_HOSTREGMGMT 0x20000000 228 + #define ECD_MEF 0x08000000 229 + #define ECD_ETOKENF 0x02000000 230 + #define ECD_ECC 0x00200000 231 + __u32 ecd; /* 0x01c8 */ 232 + __u8 
reserved1cc[18]; /* 0x01cc */ 233 + __u64 pp; /* 0x01de */ 234 + __u8 reserved1e6[2]; /* 0x01e6 */ 235 + __u64 itdba; /* 0x01e8 */ 236 + __u64 riccbd; /* 0x01f0 */ 237 + __u64 gvrd; /* 0x01f8 */ 238 + } __packed __aligned(512); 239 + 240 + #endif /* SELFTEST_KVM_SIE_H */
+20 -1
tools/testing/selftests/kvm/include/x86_64/apic.h
··· 11 11 #include <stdint.h> 12 12 13 13 #include "processor.h" 14 + #include "ucall_common.h" 14 15 15 16 #define APIC_DEFAULT_GPA 0xfee00000ULL 16 17 ··· 94 93 return rdmsr(APIC_BASE_MSR + (reg >> 4)); 95 94 } 96 95 96 + static inline uint8_t x2apic_write_reg_safe(unsigned int reg, uint64_t value) 97 + { 98 + return wrmsr_safe(APIC_BASE_MSR + (reg >> 4), value); 99 + } 100 + 97 101 static inline void x2apic_write_reg(unsigned int reg, uint64_t value) 98 102 { 99 - wrmsr(APIC_BASE_MSR + (reg >> 4), value); 103 + uint8_t fault = x2apic_write_reg_safe(reg, value); 104 + 105 + __GUEST_ASSERT(!fault, "Unexpected fault 0x%x on WRMSR(%x) = %lx\n", 106 + fault, APIC_BASE_MSR + (reg >> 4), value); 100 107 } 108 + 109 + static inline void x2apic_write_reg_fault(unsigned int reg, uint64_t value) 110 + { 111 + uint8_t fault = x2apic_write_reg_safe(reg, value); 112 + 113 + __GUEST_ASSERT(fault == GP_VECTOR, 114 + "Wanted #GP on WRMSR(%x) = %lx, got 0x%x\n", 115 + APIC_BASE_MSR + (reg >> 4), value, fault); 116 + } 117 + 101 118 102 119 #endif /* SELFTEST_KVM_APIC_H */
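The _safe/_fault split lets guest code assert either outcome. A sketch of the pattern the reworked xapic_state_test guest uses (X2APIC_RSVD_BITS_MASK is defined by that test, not by apic.h, and val is whatever ICR payload the test is driving):

	if (val & X2APIC_RSVD_BITS_MASK)
		/* Reserved bits set: the WRMSR must #GP. */
		x2apic_write_reg_fault(APIC_ICR, val);
	else
		/* Legal value: must not fault (asserted inside the helper). */
		x2apic_write_reg(APIC_ICR, val);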
+18
tools/testing/selftests/kvm/include/x86_64/hyperv.h
··· 186 186 #define HV_X64_ENLIGHTENED_VMCS_RECOMMENDED \ 187 187 KVM_X86_CPU_FEATURE(HYPERV_CPUID_ENLIGHTMENT_INFO, 0, EAX, 14) 188 188 189 + /* HYPERV_CPUID_NESTED_FEATURES.EAX */ 190 + #define HV_X64_NESTED_DIRECT_FLUSH \ 191 + KVM_X86_CPU_FEATURE(HYPERV_CPUID_NESTED_FEATURES, 0, EAX, 17) 192 + #define HV_X64_NESTED_GUEST_MAPPING_FLUSH \ 193 + KVM_X86_CPU_FEATURE(HYPERV_CPUID_NESTED_FEATURES, 0, EAX, 18) 194 + #define HV_X64_NESTED_MSR_BITMAP \ 195 + KVM_X86_CPU_FEATURE(HYPERV_CPUID_NESTED_FEATURES, 0, EAX, 19) 196 + 197 + /* HYPERV_CPUID_NESTED_FEATURES.EBX */ 198 + #define HV_X64_NESTED_EVMCS1_PERF_GLOBAL_CTRL \ 199 + KVM_X86_CPU_FEATURE(HYPERV_CPUID_NESTED_FEATURES, 0, EBX, 0) 200 + 189 201 /* HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES.EAX */ 190 202 #define HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING \ 191 203 KVM_X86_CPU_FEATURE(HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES, 0, EAX, 1) ··· 354 342 355 343 /* HV_X64_MSR_TSC_INVARIANT_CONTROL bits */ 356 344 #define HV_INVARIANT_TSC_EXPOSED BIT_ULL(0) 345 + 346 + const struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(void); 347 + const struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vcpu *vcpu); 348 + void vcpu_set_hv_cpuid(struct kvm_vcpu *vcpu); 349 + 350 + bool kvm_hv_cpu_has(struct kvm_x86_cpu_feature feature); 357 351 358 352 #endif /* !SELFTEST_KVM_HYPERV_H */
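Typical usage is to gate a test on the precise Hyper-V CPUID bit rather than a coarser KVM capability, as the eVMCS and Hyper-V SVM tests below now do:

	TEST_REQUIRE(kvm_hv_cpu_has(HV_X64_NESTED_DIRECT_FLUSH));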
+4 -3
tools/testing/selftests/kvm/include/x86_64/processor.h
··· 25 25 extern bool host_cpu_is_amd; 26 26 extern uint64_t guest_tsc_khz; 27 27 28 + #ifndef MAX_NR_CPUID_ENTRIES 29 + #define MAX_NR_CPUID_ENTRIES 100 30 + #endif 31 + 28 32 /* Forced emulation prefix, used to invoke the emulator unconditionally. */ 29 33 #define KVM_FEP "ud2; .byte 'k', 'v', 'm';" 30 34 ··· 912 908 const struct kvm_cpuid_entry2 *get_cpuid_entry(const struct kvm_cpuid2 *cpuid, 913 909 uint32_t function, uint32_t index); 914 910 const struct kvm_cpuid2 *kvm_get_supported_cpuid(void); 915 - const struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(void); 916 - const struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vcpu *vcpu); 917 911 918 912 static inline uint32_t kvm_cpu_fms(void) 919 913 { ··· 1011 1009 } 1012 1010 1013 1011 void vcpu_init_cpuid(struct kvm_vcpu *vcpu, const struct kvm_cpuid2 *cpuid); 1014 - void vcpu_set_hv_cpuid(struct kvm_vcpu *vcpu); 1015 1012 1016 1013 static inline struct kvm_cpuid_entry2 *__vcpu_get_cpuid_entry(struct kvm_vcpu *vcpu, 1017 1014 uint32_t function,
+6 -79
tools/testing/selftests/kvm/lib/kvm_util.c
··· 712 712 } 713 713 714 714 static void __vm_mem_region_delete(struct kvm_vm *vm, 715 - struct userspace_mem_region *region, 716 - bool unlink) 715 + struct userspace_mem_region *region) 717 716 { 718 717 int ret; 719 718 720 - if (unlink) { 721 - rb_erase(&region->gpa_node, &vm->regions.gpa_tree); 722 - rb_erase(&region->hva_node, &vm->regions.hva_tree); 723 - hash_del(&region->slot_node); 724 - } 719 + rb_erase(&region->gpa_node, &vm->regions.gpa_tree); 720 + rb_erase(&region->hva_node, &vm->regions.hva_tree); 721 + hash_del(&region->slot_node); 725 722 726 723 region->region.memory_size = 0; 727 724 vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION2, &region->region); ··· 759 762 760 763 /* Free userspace_mem_regions. */ 761 764 hash_for_each_safe(vmp->regions.slot_hash, ctr, node, region, slot_node) 762 - __vm_mem_region_delete(vmp, region, false); 765 + __vm_mem_region_delete(vmp, region); 763 766 764 767 /* Free sparsebit arrays. */ 765 768 sparsebit_free(&vmp->vpages_valid); ··· 789 792 TEST_ASSERT(!r, __KVM_SYSCALL_ERROR("fallocate()", r)); 790 793 791 794 return fd; 792 - } 793 - 794 - /* 795 - * Memory Compare, host virtual to guest virtual 796 - * 797 - * Input Args: 798 - * hva - Starting host virtual address 799 - * vm - Virtual Machine 800 - * gva - Starting guest virtual address 801 - * len - number of bytes to compare 802 - * 803 - * Output Args: None 804 - * 805 - * Input/Output Args: None 806 - * 807 - * Return: 808 - * Returns 0 if the bytes starting at hva for a length of len 809 - * are equal the guest virtual bytes starting at gva. Returns 810 - * a value < 0, if bytes at hva are less than those at gva. 811 - * Otherwise a value > 0 is returned. 812 - * 813 - * Compares the bytes starting at the host virtual address hva, for 814 - * a length of len, to the guest bytes starting at the guest virtual 815 - * address given by gva. 816 - */ 817 - int kvm_memcmp_hva_gva(void *hva, struct kvm_vm *vm, vm_vaddr_t gva, size_t len) 818 - { 819 - size_t amt; 820 - 821 - /* 822 - * Compare a batch of bytes until either a match is found 823 - * or all the bytes have been compared. 824 - */ 825 - for (uintptr_t offset = 0; offset < len; offset += amt) { 826 - uintptr_t ptr1 = (uintptr_t)hva + offset; 827 - 828 - /* 829 - * Determine host address for guest virtual address 830 - * at offset. 831 - */ 832 - uintptr_t ptr2 = (uintptr_t)addr_gva2hva(vm, gva + offset); 833 - 834 - /* 835 - * Determine amount to compare on this pass. 836 - * Don't allow the comparsion to cross a page boundary. 837 - */ 838 - amt = len - offset; 839 - if ((ptr1 >> vm->page_shift) != ((ptr1 + amt) >> vm->page_shift)) 840 - amt = vm->page_size - (ptr1 % vm->page_size); 841 - if ((ptr2 >> vm->page_shift) != ((ptr2 + amt) >> vm->page_shift)) 842 - amt = vm->page_size - (ptr2 % vm->page_size); 843 - 844 - assert((ptr1 >> vm->page_shift) == ((ptr1 + amt - 1) >> vm->page_shift)); 845 - assert((ptr2 >> vm->page_shift) == ((ptr2 + amt - 1) >> vm->page_shift)); 846 - 847 - /* 848 - * Perform the comparison. If there is a difference 849 - * return that result to the caller, otherwise need 850 - * to continue on looking for a mismatch. 851 - */ 852 - int ret = memcmp((void *)ptr1, (void *)ptr2, amt); 853 - if (ret != 0) 854 - return ret; 855 - } 856 - 857 - /* 858 - * No mismatch found. Let the caller know the two memory 859 - * areas are equal. 
860 - */ 861 - return 0; 862 795 } 863 796 864 797 static void vm_userspace_mem_region_gpa_insert(struct rb_root *gpa_tree, ··· 1197 1270 */ 1198 1271 void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot) 1199 1272 { 1200 - __vm_mem_region_delete(vm, memslot2region(vm, slot), true); 1273 + __vm_mem_region_delete(vm, memslot2region(vm, slot)); 1201 1274 } 1202 1275 1203 1276 void vm_guest_mem_fallocate(struct kvm_vm *vm, uint64_t base, uint64_t size,
+5 -5
tools/testing/selftests/kvm/lib/s390x/processor.c
··· 14 14 { 15 15 vm_paddr_t paddr; 16 16 17 - TEST_ASSERT(vm->page_size == 4096, "Unsupported page size: 0x%x", 17 + TEST_ASSERT(vm->page_size == PAGE_SIZE, "Unsupported page size: 0x%x", 18 18 vm->page_size); 19 19 20 20 if (vm->pgd_created) ··· 79 79 } 80 80 81 81 /* Fill in page table entry */ 82 - idx = (gva >> 12) & 0x0ffu; /* page index */ 82 + idx = (gva >> PAGE_SHIFT) & 0x0ffu; /* page index */ 83 83 if (!(entry[idx] & PAGE_INVALID)) 84 84 fprintf(stderr, 85 85 "WARNING: PTE for gpa=0x%"PRIx64" already set!\n", gpa); ··· 91 91 int ri, idx; 92 92 uint64_t *entry; 93 93 94 - TEST_ASSERT(vm->page_size == 4096, "Unsupported page size: 0x%x", 94 + TEST_ASSERT(vm->page_size == PAGE_SIZE, "Unsupported page size: 0x%x", 95 95 vm->page_size); 96 96 97 97 entry = addr_gpa2hva(vm, vm->pgd); ··· 103 103 entry = addr_gpa2hva(vm, entry[idx] & REGION_ENTRY_ORIGIN); 104 104 } 105 105 106 - idx = (gva >> 12) & 0x0ffu; /* page index */ 106 + idx = (gva >> PAGE_SHIFT) & 0x0ffu; /* page index */ 107 107 108 108 TEST_ASSERT(!(entry[idx] & PAGE_INVALID), 109 109 "No page mapping for vm virtual address 0x%lx", gva); ··· 168 168 struct kvm_sregs sregs; 169 169 struct kvm_vcpu *vcpu; 170 170 171 - TEST_ASSERT(vm->page_size == 4096, "Unsupported page size: 0x%x", 171 + TEST_ASSERT(vm->page_size == PAGE_SIZE, "Unsupported page size: 0x%x", 172 172 vm->page_size); 173 173 174 174 stack_vaddr = __vm_vaddr_alloc(vm, stack_size,
+67
tools/testing/selftests/kvm/lib/x86_64/hyperv.c
··· 8 8 #include "processor.h" 9 9 #include "hyperv.h" 10 10 11 + const struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(void) 12 + { 13 + static struct kvm_cpuid2 *cpuid; 14 + int kvm_fd; 15 + 16 + if (cpuid) 17 + return cpuid; 18 + 19 + cpuid = allocate_kvm_cpuid2(MAX_NR_CPUID_ENTRIES); 20 + kvm_fd = open_kvm_dev_path_or_exit(); 21 + 22 + kvm_ioctl(kvm_fd, KVM_GET_SUPPORTED_HV_CPUID, cpuid); 23 + 24 + close(kvm_fd); 25 + return cpuid; 26 + } 27 + 28 + void vcpu_set_hv_cpuid(struct kvm_vcpu *vcpu) 29 + { 30 + static struct kvm_cpuid2 *cpuid_full; 31 + const struct kvm_cpuid2 *cpuid_sys, *cpuid_hv; 32 + int i, nent = 0; 33 + 34 + if (!cpuid_full) { 35 + cpuid_sys = kvm_get_supported_cpuid(); 36 + cpuid_hv = kvm_get_supported_hv_cpuid(); 37 + 38 + cpuid_full = allocate_kvm_cpuid2(cpuid_sys->nent + cpuid_hv->nent); 39 + if (!cpuid_full) { 40 + perror("malloc"); 41 + abort(); 42 + } 43 + 44 + /* Need to skip KVM CPUID leaves 0x400000xx */ 45 + for (i = 0; i < cpuid_sys->nent; i++) { 46 + if (cpuid_sys->entries[i].function >= 0x40000000 && 47 + cpuid_sys->entries[i].function < 0x40000100) 48 + continue; 49 + cpuid_full->entries[nent] = cpuid_sys->entries[i]; 50 + nent++; 51 + } 52 + 53 + memcpy(&cpuid_full->entries[nent], cpuid_hv->entries, 54 + cpuid_hv->nent * sizeof(struct kvm_cpuid_entry2)); 55 + cpuid_full->nent = nent + cpuid_hv->nent; 56 + } 57 + 58 + vcpu_init_cpuid(vcpu, cpuid_full); 59 + } 60 + 61 + const struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vcpu *vcpu) 62 + { 63 + struct kvm_cpuid2 *cpuid = allocate_kvm_cpuid2(MAX_NR_CPUID_ENTRIES); 64 + 65 + vcpu_ioctl(vcpu, KVM_GET_SUPPORTED_HV_CPUID, cpuid); 66 + 67 + return cpuid; 68 + } 69 + 70 + bool kvm_hv_cpu_has(struct kvm_x86_cpu_feature feature) 71 + { 72 + if (!kvm_has_cap(KVM_CAP_SYS_HYPERV_CPUID)) 73 + return false; 74 + 75 + return kvm_cpuid_has(kvm_get_supported_hv_cpuid(), feature); 76 + } 77 + 11 78 struct hyperv_test_pages *vcpu_alloc_hyperv_test_pages(struct kvm_vm *vm, 12 79 vm_vaddr_t *p_hv_pages_gva) 13 80 {
+3 -66
tools/testing/selftests/kvm/lib/x86_64/processor.c
··· 19 19 #define KERNEL_DS 0x10 20 20 #define KERNEL_TSS 0x18 21 21 22 - #define MAX_NR_CPUID_ENTRIES 100 23 - 24 22 vm_vaddr_t exception_handlers; 25 23 bool host_cpu_is_amd; 26 24 bool host_cpu_is_intel; ··· 564 566 if (kvm_fixup_exception(regs)) 565 567 return; 566 568 567 - ucall_assert(UCALL_UNHANDLED, 568 - "Unhandled exception in guest", __FILE__, __LINE__, 569 - "Unhandled exception '0x%lx' at guest RIP '0x%lx'", 570 - regs->vector, regs->rip); 569 + GUEST_FAIL("Unhandled exception '0x%lx' at guest RIP '0x%lx'", 570 + regs->vector, regs->rip); 571 571 } 572 572 573 573 static void vm_init_descriptor_tables(struct kvm_vm *vm) ··· 607 611 { 608 612 struct ucall uc; 609 613 610 - if (get_ucall(vcpu, &uc) == UCALL_UNHANDLED) 614 + if (get_ucall(vcpu, &uc) == UCALL_ABORT) 611 615 REPORT_GUEST_ASSERT(uc); 612 616 } 613 617 ··· 1189 1193 void xen_hypercall(uint64_t nr, uint64_t a0, void *a1) 1190 1194 { 1191 1195 GUEST_ASSERT(!__xen_hypercall(nr, a0, a1)); 1192 - } 1193 - 1194 - const struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(void) 1195 - { 1196 - static struct kvm_cpuid2 *cpuid; 1197 - int kvm_fd; 1198 - 1199 - if (cpuid) 1200 - return cpuid; 1201 - 1202 - cpuid = allocate_kvm_cpuid2(MAX_NR_CPUID_ENTRIES); 1203 - kvm_fd = open_kvm_dev_path_or_exit(); 1204 - 1205 - kvm_ioctl(kvm_fd, KVM_GET_SUPPORTED_HV_CPUID, cpuid); 1206 - 1207 - close(kvm_fd); 1208 - return cpuid; 1209 - } 1210 - 1211 - void vcpu_set_hv_cpuid(struct kvm_vcpu *vcpu) 1212 - { 1213 - static struct kvm_cpuid2 *cpuid_full; 1214 - const struct kvm_cpuid2 *cpuid_sys, *cpuid_hv; 1215 - int i, nent = 0; 1216 - 1217 - if (!cpuid_full) { 1218 - cpuid_sys = kvm_get_supported_cpuid(); 1219 - cpuid_hv = kvm_get_supported_hv_cpuid(); 1220 - 1221 - cpuid_full = allocate_kvm_cpuid2(cpuid_sys->nent + cpuid_hv->nent); 1222 - if (!cpuid_full) { 1223 - perror("malloc"); 1224 - abort(); 1225 - } 1226 - 1227 - /* Need to skip KVM CPUID leaves 0x400000xx */ 1228 - for (i = 0; i < cpuid_sys->nent; i++) { 1229 - if (cpuid_sys->entries[i].function >= 0x40000000 && 1230 - cpuid_sys->entries[i].function < 0x40000100) 1231 - continue; 1232 - cpuid_full->entries[nent] = cpuid_sys->entries[i]; 1233 - nent++; 1234 - } 1235 - 1236 - memcpy(&cpuid_full->entries[nent], cpuid_hv->entries, 1237 - cpuid_hv->nent * sizeof(struct kvm_cpuid_entry2)); 1238 - cpuid_full->nent = nent + cpuid_hv->nent; 1239 - } 1240 - 1241 - vcpu_init_cpuid(vcpu, cpuid_full); 1242 - } 1243 - 1244 - const struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vcpu *vcpu) 1245 - { 1246 - struct kvm_cpuid2 *cpuid = allocate_kvm_cpuid2(MAX_NR_CPUID_ENTRIES); 1247 - 1248 - vcpu_ioctl(vcpu, KVM_GET_SUPPORTED_HV_CPUID, cpuid); 1249 - 1250 - return cpuid; 1251 1196 } 1252 1197 1253 1198 unsigned long vm_compute_max_gfn(struct kvm_vm *vm)
+17 -2
tools/testing/selftests/kvm/memslot_modification_stress_test.c
··· 79 79 useconds_t delay; 80 80 uint64_t nr_iterations; 81 81 bool partition_vcpu_memory_access; 82 + bool disable_slot_zap_quirk; 82 83 }; 83 84 84 85 static void run_test(enum vm_guest_mode mode, void *arg) ··· 90 89 vm = memstress_create_vm(mode, nr_vcpus, guest_percpu_mem_size, 1, 91 90 VM_MEM_SRC_ANONYMOUS, 92 91 p->partition_vcpu_memory_access); 92 + #ifdef __x86_64__ 93 + if (p->disable_slot_zap_quirk) 94 + vm_enable_cap(vm, KVM_CAP_DISABLE_QUIRKS2, KVM_X86_QUIRK_SLOT_ZAP_ALL); 95 + 96 + pr_info("Memslot zap quirk %s\n", p->disable_slot_zap_quirk ? 97 + "disabled" : "enabled"); 98 + #endif 93 99 94 100 pr_info("Finished creating vCPUs\n"); 95 101 ··· 115 107 static void help(char *name) 116 108 { 117 109 puts(""); 118 - printf("usage: %s [-h] [-m mode] [-d delay_usec]\n" 110 + printf("usage: %s [-h] [-m mode] [-d delay_usec] [-q]\n" 119 111 " [-b memory] [-v vcpus] [-o] [-i iterations]\n", name); 120 112 guest_modes_help(); 121 113 printf(" -d: add a delay between each iteration of adding and\n" 122 114 " deleting a memslot in usec.\n"); 115 + printf(" -q: Disable memslot zap quirk.\n"); 123 116 printf(" -b: specify the size of the memory region which should be\n" 124 117 " accessed by each vCPU. e.g. 10M or 3G.\n" 125 118 " Default: 1G\n"); ··· 146 137 147 138 guest_modes_append_default(); 148 139 149 - while ((opt = getopt(argc, argv, "hm:d:b:v:oi:")) != -1) { 140 + while ((opt = getopt(argc, argv, "hm:d:qb:v:oi:")) != -1) { 150 141 switch (opt) { 151 142 case 'm': 152 143 guest_modes_cmdline(optarg); ··· 168 159 break; 169 160 case 'i': 170 161 p.nr_iterations = atoi_positive("Number of iterations", optarg); 162 + break; 163 + case 'q': 164 + p.disable_slot_zap_quirk = true; 165 + 166 + TEST_REQUIRE(kvm_check_cap(KVM_CAP_DISABLE_QUIRKS2) & 167 + KVM_X86_QUIRK_SLOT_ZAP_ALL); 171 168 break; 172 169 case 'h': 173 170 default:
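With the new flag, a run that exercises the no-zap-all path might look like this (illustrative invocation; the other flags are as documented in the help text above):

	./memslot_modification_stress_test -q -i 100 -d 10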
+11 -1
tools/testing/selftests/kvm/memslot_perf_test.c
··· 113 113 static sem_t vcpu_ready; 114 114 115 115 static bool map_unmap_verify; 116 + static bool disable_slot_zap_quirk; 116 117 117 118 static bool verbose; 118 119 #define pr_info_v(...) \ ··· 579 578 uint32_t guest_page_size = data->vm->page_size; 580 579 uint64_t movesrcgpa, movetestgpa; 581 580 581 + if (disable_slot_zap_quirk) 582 + vm_enable_cap(data->vm, KVM_CAP_DISABLE_QUIRKS2, KVM_X86_QUIRK_SLOT_ZAP_ALL); 583 + 582 584 movesrcgpa = vm_slot2gpa(data, data->nslots - 1); 583 585 584 586 if (isactive) { ··· 900 896 pr_info(" -h: print this help screen.\n"); 901 897 pr_info(" -v: enable verbose mode (not for benchmarking).\n"); 902 898 pr_info(" -d: enable extra debug checks.\n"); 899 + pr_info(" -q: Disable memslot zap quirk during memslot move.\n"); 903 900 pr_info(" -s: specify memslot count cap (-1 means no cap; currently: %i)\n", 904 901 targs->nslots); 905 902 pr_info(" -f: specify the first test to run (currently: %i; max %zu)\n", ··· 959 954 uint32_t max_mem_slots; 960 955 int opt; 961 956 962 - while ((opt = getopt(argc, argv, "hvds:f:e:l:r:")) != -1) { 957 + while ((opt = getopt(argc, argv, "hvdqs:f:e:l:r:")) != -1) { 963 958 switch (opt) { 964 959 case 'h': 965 960 default: ··· 970 965 break; 971 966 case 'd': 972 967 map_unmap_verify = true; 968 + break; 969 + case 'q': 970 + disable_slot_zap_quirk = true; 971 + TEST_REQUIRE(kvm_check_cap(KVM_CAP_DISABLE_QUIRKS2) & 972 + KVM_X86_QUIRK_SLOT_ZAP_ALL); 973 973 break; 974 974 case 's': 975 975 targs->nslots = atoi_paranoid(optarg);
+4 -3
tools/testing/selftests/kvm/s390x/cmma_test.c
··· 17 17 #include "kvm_util.h" 18 18 #include "kselftest.h" 19 19 #include "ucall_common.h" 20 + #include "processor.h" 20 21 21 22 #define MAIN_PAGE_COUNT 512 22 23 23 24 #define TEST_DATA_PAGE_COUNT 512 24 25 #define TEST_DATA_MEMSLOT 1 25 - #define TEST_DATA_START_GFN 4096 26 + #define TEST_DATA_START_GFN PAGE_SIZE 26 27 27 28 #define TEST_DATA_TWO_PAGE_COUNT 256 28 29 #define TEST_DATA_TWO_MEMSLOT 2 29 - #define TEST_DATA_TWO_START_GFN 8192 30 + #define TEST_DATA_TWO_START_GFN (2 * PAGE_SIZE) 30 31 31 32 static char cmma_value_buf[MAIN_PAGE_COUNT + TEST_DATA_PAGE_COUNT]; 32 33 ··· 67 66 " lghi 5,%[page_count]\n" 68 67 /* r5 += r1 */ 69 68 "2: agfr 5,1\n" 70 - /* r2 = r1 << 12 */ 69 + /* r2 = r1 << PAGE_SHIFT */ 71 70 "1: sllg 2,1,12(0)\n" 72 71 /* essa(r4, r2, SET_STABLE) */ 73 72 " .insn rrf,0xb9ab0000,4,2,1,0\n"
+2
tools/testing/selftests/kvm/s390x/config
··· 1 + CONFIG_KVM=y 2 + CONFIG_KVM_S390_UCONTROL=y
+2 -2
tools/testing/selftests/kvm/s390x/debug_test.c
··· 2 2 /* Test KVM debugging features. */ 3 3 #include "kvm_util.h" 4 4 #include "test_util.h" 5 + #include "sie.h" 5 6 6 7 #include <linux/kvm.h> 7 8 8 9 #define __LC_SVC_NEW_PSW 0x1c0 9 10 #define __LC_PGM_NEW_PSW 0x1d0 10 - #define ICPT_INSTRUCTION 0x04 11 11 #define IPA0_DIAG 0x8300 12 12 #define PGM_SPECIFICATION 0x06 13 13 ··· 85 85 vm = test_step_int_1(&vcpu, test_step_pgm_diag_guest_code, 86 86 __LC_PGM_NEW_PSW, new_psw); 87 87 TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_S390_SIEIC); 88 - TEST_ASSERT_EQ(vcpu->run->s390_sieic.icptcode, ICPT_INSTRUCTION); 88 + TEST_ASSERT_EQ(vcpu->run->s390_sieic.icptcode, ICPT_INST); 89 89 TEST_ASSERT_EQ(vcpu->run->s390_sieic.ipa & 0xff00, IPA0_DIAG); 90 90 vcpu_ioctl(vcpu, KVM_S390_IRQ, &irq); 91 91 vcpu_run(vcpu);
+1 -3
tools/testing/selftests/kvm/s390x/memop.c
··· 16 16 #include "kvm_util.h" 17 17 #include "kselftest.h" 18 18 #include "ucall_common.h" 19 + #include "processor.h" 19 20 20 21 enum mop_target { 21 22 LOGICAL, ··· 227 226 228 227 #define CHECK_N_DO(f, ...) ({ f(__VA_ARGS__, CHECK_ONLY); f(__VA_ARGS__); }) 229 228 230 - #define PAGE_SHIFT 12 231 - #define PAGE_SIZE (1ULL << PAGE_SHIFT) 232 - #define PAGE_MASK (~(PAGE_SIZE - 1)) 233 229 #define CR0_FETCH_PROTECTION_OVERRIDE (1UL << (63 - 38)) 234 230 #define CR0_STORAGE_PROTECTION_OVERRIDE (1UL << (63 - 39)) 235 231
+2 -3
tools/testing/selftests/kvm/s390x/tprot.c
··· 9 9 #include "kvm_util.h" 10 10 #include "kselftest.h" 11 11 #include "ucall_common.h" 12 + #include "processor.h" 12 13 13 - #define PAGE_SHIFT 12 14 - #define PAGE_SIZE (1 << PAGE_SHIFT) 15 14 #define CR0_FETCH_PROTECTION_OVERRIDE (1UL << (63 - 38)) 16 15 #define CR0_STORAGE_PROTECTION_OVERRIDE (1UL << (63 - 39)) 17 16 ··· 150 151 * instead. 151 152 * In order to skip these tests we detect this inside the guest 152 153 */ 153 - skip = tests[*i].addr < (void *)4096 && 154 + skip = tests[*i].addr < (void *)PAGE_SIZE && 154 155 tests[*i].expected != TRANSL_UNAVAIL && 155 156 !mapped_0; 156 157 if (!skip) {
+332
tools/testing/selftests/kvm/s390x/ucontrol_test.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + /* 3 + * Test code for the s390x kvm ucontrol interface 4 + * 5 + * Copyright IBM Corp. 2024 6 + * 7 + * Authors: 8 + * Christoph Schlameuss <schlameuss@linux.ibm.com> 9 + */ 10 + #include "debug_print.h" 11 + #include "kselftest_harness.h" 12 + #include "kvm_util.h" 13 + #include "processor.h" 14 + #include "sie.h" 15 + 16 + #include <linux/capability.h> 17 + #include <linux/sizes.h> 18 + 19 + #define VM_MEM_SIZE (4 * SZ_1M) 20 + 21 + /* so directly declare capget to check caps without libcap */ 22 + int capget(cap_user_header_t header, cap_user_data_t data); 23 + 24 + /** 25 + * In order to create user controlled virtual machines on S390, 26 + * check KVM_CAP_S390_UCONTROL and use the flag KVM_VM_S390_UCONTROL 27 + * as privileged user (SYS_ADMIN). 28 + */ 29 + void require_ucontrol_admin(void) 30 + { 31 + struct __user_cap_data_struct data[_LINUX_CAPABILITY_U32S_3]; 32 + struct __user_cap_header_struct hdr = { 33 + .version = _LINUX_CAPABILITY_VERSION_3, 34 + }; 35 + int rc; 36 + 37 + rc = capget(&hdr, data); 38 + TEST_ASSERT_EQ(0, rc); 39 + TEST_REQUIRE((data->effective & CAP_TO_MASK(CAP_SYS_ADMIN)) > 0); 40 + 41 + TEST_REQUIRE(kvm_has_cap(KVM_CAP_S390_UCONTROL)); 42 + } 43 + 44 + /* Test program setting some registers and looping */ 45 + extern char test_gprs_asm[]; 46 + asm("test_gprs_asm:\n" 47 + "xgr %r0, %r0\n" 48 + "lgfi %r1,1\n" 49 + "lgfi %r2,2\n" 50 + "lgfi %r3,3\n" 51 + "lgfi %r4,4\n" 52 + "lgfi %r5,5\n" 53 + "lgfi %r6,6\n" 54 + "lgfi %r7,7\n" 55 + "0:\n" 56 + " diag 0,0,0x44\n" 57 + " ahi %r0,1\n" 58 + " j 0b\n" 59 + ); 60 + 61 + FIXTURE(uc_kvm) 62 + { 63 + struct kvm_s390_sie_block *sie_block; 64 + struct kvm_run *run; 65 + uintptr_t base_gpa; 66 + uintptr_t code_gpa; 67 + uintptr_t base_hva; 68 + uintptr_t code_hva; 69 + int kvm_run_size; 70 + void *vm_mem; 71 + int vcpu_fd; 72 + int kvm_fd; 73 + int vm_fd; 74 + }; 75 + 76 + /** 77 + * create VM with single vcpu, map kvm_run and SIE control block for easy access 78 + */ 79 + FIXTURE_SETUP(uc_kvm) 80 + { 81 + struct kvm_s390_vm_cpu_processor info; 82 + int rc; 83 + 84 + require_ucontrol_admin(); 85 + 86 + self->kvm_fd = open_kvm_dev_path_or_exit(); 87 + self->vm_fd = ioctl(self->kvm_fd, KVM_CREATE_VM, KVM_VM_S390_UCONTROL); 88 + ASSERT_GE(self->vm_fd, 0); 89 + 90 + kvm_device_attr_get(self->vm_fd, KVM_S390_VM_CPU_MODEL, 91 + KVM_S390_VM_CPU_PROCESSOR, &info); 92 + TH_LOG("create VM 0x%llx", info.cpuid); 93 + 94 + self->vcpu_fd = ioctl(self->vm_fd, KVM_CREATE_VCPU, 0); 95 + ASSERT_GE(self->vcpu_fd, 0); 96 + 97 + self->kvm_run_size = ioctl(self->kvm_fd, KVM_GET_VCPU_MMAP_SIZE, NULL); 98 + ASSERT_GE(self->kvm_run_size, sizeof(struct kvm_run)) 99 + TH_LOG(KVM_IOCTL_ERROR(KVM_GET_VCPU_MMAP_SIZE, self->kvm_run_size)); 100 + self->run = (struct kvm_run *)mmap(NULL, self->kvm_run_size, 101 + PROT_READ | PROT_WRITE, MAP_SHARED, self->vcpu_fd, 0); 102 + ASSERT_NE(self->run, MAP_FAILED); 103 + /** 104 + * For virtual cpus that have been created with S390 user controlled 105 + * virtual machines, the resulting vcpu fd can be memory mapped at page 106 + * offset KVM_S390_SIE_PAGE_OFFSET in order to obtain a memory map of 107 + * the virtual cpu's hardware control block. 
108 + */ 109 + self->sie_block = (struct kvm_s390_sie_block *)mmap(NULL, PAGE_SIZE, 110 + PROT_READ | PROT_WRITE, MAP_SHARED, 111 + self->vcpu_fd, KVM_S390_SIE_PAGE_OFFSET << PAGE_SHIFT); 112 + ASSERT_NE(self->sie_block, MAP_FAILED); 113 + 114 + TH_LOG("VM created %p %p", self->run, self->sie_block); 115 + 116 + self->base_gpa = 0; 117 + self->code_gpa = self->base_gpa + (3 * SZ_1M); 118 + 119 + self->vm_mem = aligned_alloc(SZ_1M, VM_MEM_SIZE); 120 + ASSERT_NE(NULL, self->vm_mem) TH_LOG("malloc failed %u", errno); 121 + self->base_hva = (uintptr_t)self->vm_mem; 122 + self->code_hva = self->base_hva - self->base_gpa + self->code_gpa; 123 + struct kvm_s390_ucas_mapping map = { 124 + .user_addr = self->base_hva, 125 + .vcpu_addr = self->base_gpa, 126 + .length = VM_MEM_SIZE, 127 + }; 128 + TH_LOG("ucas map %p %p 0x%llx", 129 + (void *)map.user_addr, (void *)map.vcpu_addr, map.length); 130 + rc = ioctl(self->vcpu_fd, KVM_S390_UCAS_MAP, &map); 131 + ASSERT_EQ(0, rc) TH_LOG("ucas map result %d not expected, %s", 132 + rc, strerror(errno)); 133 + 134 + TH_LOG("page in %p", (void *)self->base_gpa); 135 + rc = ioctl(self->vcpu_fd, KVM_S390_VCPU_FAULT, self->base_gpa); 136 + ASSERT_EQ(0, rc) TH_LOG("vcpu fault (%p) result %d not expected, %s", 137 + (void *)self->base_hva, rc, strerror(errno)); 138 + 139 + self->sie_block->cpuflags &= ~CPUSTAT_STOPPED; 140 + } 141 + 142 + FIXTURE_TEARDOWN(uc_kvm) 143 + { 144 + munmap(self->sie_block, PAGE_SIZE); 145 + munmap(self->run, self->kvm_run_size); 146 + close(self->vcpu_fd); 147 + close(self->vm_fd); 148 + close(self->kvm_fd); 149 + free(self->vm_mem); 150 + } 151 + 152 + TEST_F(uc_kvm, uc_sie_assertions) 153 + { 154 + /* assert interception of Code 08 (Program Interruption) is set */ 155 + EXPECT_EQ(0, self->sie_block->ecb & ECB_SPECI); 156 + } 157 + 158 + TEST_F(uc_kvm, uc_attr_mem_limit) 159 + { 160 + u64 limit; 161 + struct kvm_device_attr attr = { 162 + .group = KVM_S390_VM_MEM_CTRL, 163 + .attr = KVM_S390_VM_MEM_LIMIT_SIZE, 164 + .addr = (unsigned long)&limit, 165 + }; 166 + int rc; 167 + 168 + rc = ioctl(self->vm_fd, KVM_GET_DEVICE_ATTR, &attr); 169 + EXPECT_EQ(0, rc); 170 + EXPECT_EQ(~0UL, limit); 171 + 172 + /* assert set not supported */ 173 + rc = ioctl(self->vm_fd, KVM_SET_DEVICE_ATTR, &attr); 174 + EXPECT_EQ(-1, rc); 175 + EXPECT_EQ(EINVAL, errno); 176 + } 177 + 178 + TEST_F(uc_kvm, uc_no_dirty_log) 179 + { 180 + struct kvm_dirty_log dlog; 181 + int rc; 182 + 183 + rc = ioctl(self->vm_fd, KVM_GET_DIRTY_LOG, &dlog); 184 + EXPECT_EQ(-1, rc); 185 + EXPECT_EQ(EINVAL, errno); 186 + } 187 + 188 + /** 189 + * Assert HPAGE CAP cannot be enabled on UCONTROL VM 190 + */ 191 + TEST(uc_cap_hpage) 192 + { 193 + int rc, kvm_fd, vm_fd, vcpu_fd; 194 + struct kvm_enable_cap cap = { 195 + .cap = KVM_CAP_S390_HPAGE_1M, 196 + }; 197 + 198 + require_ucontrol_admin(); 199 + 200 + kvm_fd = open_kvm_dev_path_or_exit(); 201 + vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, KVM_VM_S390_UCONTROL); 202 + ASSERT_GE(vm_fd, 0); 203 + 204 + /* assert hpages are not supported on ucontrol vm */ 205 + rc = ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_S390_HPAGE_1M); 206 + EXPECT_EQ(0, rc); 207 + 208 + /* Test that KVM_CAP_S390_HPAGE_1M can't be enabled for a ucontrol vm */ 209 + rc = ioctl(vm_fd, KVM_ENABLE_CAP, cap); 210 + EXPECT_EQ(-1, rc); 211 + EXPECT_EQ(EINVAL, errno); 212 + 213 + /* assert HPAGE CAP is rejected after vCPU creation */ 214 + vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, 0); 215 + ASSERT_GE(vcpu_fd, 0); 216 + rc = ioctl(vm_fd, KVM_ENABLE_CAP, cap); 217 + EXPECT_EQ(-1, rc); 218 + 
EXPECT_EQ(EBUSY, errno); 219 + 220 + close(vcpu_fd); 221 + close(vm_fd); 222 + close(kvm_fd); 223 + } 224 + 225 + /* verify SIEIC exit 226 + * * fail on codes not expected in the test cases 227 + */ 228 + static bool uc_handle_sieic(FIXTURE_DATA(uc_kvm) * self) 229 + { 230 + struct kvm_s390_sie_block *sie_block = self->sie_block; 231 + struct kvm_run *run = self->run; 232 + 233 + /* check SIE interception code */ 234 + pr_info("sieic: 0x%.2x 0x%.4x 0x%.4x\n", 235 + run->s390_sieic.icptcode, 236 + run->s390_sieic.ipa, 237 + run->s390_sieic.ipb); 238 + switch (run->s390_sieic.icptcode) { 239 + case ICPT_INST: 240 + /* end execution in caller on intercepted instruction */ 241 + pr_info("sie instruction interception\n"); 242 + return false; 243 + case ICPT_OPEREXC: 244 + /* operation exception */ 245 + TEST_FAIL("sie exception on %.4x%.8x", sie_block->ipa, sie_block->ipb); 246 + default: 247 + TEST_FAIL("UNEXPECTED SIEIC CODE %d", run->s390_sieic.icptcode); 248 + } 249 + return true; 250 + } 251 + 252 + /* verify VM state on exit */ 253 + static bool uc_handle_exit(FIXTURE_DATA(uc_kvm) * self) 254 + { 255 + struct kvm_run *run = self->run; 256 + 257 + switch (run->exit_reason) { 258 + case KVM_EXIT_S390_SIEIC: 259 + return uc_handle_sieic(self); 260 + default: 261 + pr_info("exit_reason %2d not handled\n", run->exit_reason); 262 + } 263 + return true; 264 + } 265 + 266 + /* run the VM until interrupted */ 267 + static int uc_run_once(FIXTURE_DATA(uc_kvm) * self) 268 + { 269 + int rc; 270 + 271 + rc = ioctl(self->vcpu_fd, KVM_RUN, NULL); 272 + print_run(self->run, self->sie_block); 273 + print_regs(self->run); 274 + pr_debug("run %d / %d %s\n", rc, errno, strerror(errno)); 275 + return rc; 276 + } 277 + 278 + static void uc_assert_diag44(FIXTURE_DATA(uc_kvm) * self) 279 + { 280 + struct kvm_s390_sie_block *sie_block = self->sie_block; 281 + 282 + /* assert vm was interrupted by diag 0x0044 */ 283 + TEST_ASSERT_EQ(KVM_EXIT_S390_SIEIC, self->run->exit_reason); 284 + TEST_ASSERT_EQ(ICPT_INST, sie_block->icptcode); 285 + TEST_ASSERT_EQ(0x8300, sie_block->ipa); 286 + TEST_ASSERT_EQ(0x440000, sie_block->ipb); 287 + } 288 + 289 + TEST_F(uc_kvm, uc_gprs) 290 + { 291 + struct kvm_sync_regs *sync_regs = &self->run->s.regs; 292 + struct kvm_run *run = self->run; 293 + struct kvm_regs regs = {}; 294 + 295 + /* Set registers to values that are different from the ones that we expect below */ 296 + for (int i = 0; i < 8; i++) 297 + sync_regs->gprs[i] = 8; 298 + run->kvm_dirty_regs |= KVM_SYNC_GPRS; 299 + 300 + /* copy test_gprs_asm to code_hva / code_gpa */ 301 + TH_LOG("copy code %p to vm mapped memory %p / %p", 302 + &test_gprs_asm, (void *)self->code_hva, (void *)self->code_gpa); 303 + memcpy((void *)self->code_hva, &test_gprs_asm, PAGE_SIZE); 304 + 305 + /* DAT disabled + 64 bit mode */ 306 + run->psw_mask = 0x0000000180000000ULL; 307 + run->psw_addr = self->code_gpa; 308 + 309 + /* run and expect interception of diag 44 */ 310 + ASSERT_EQ(0, uc_run_once(self)); 311 + ASSERT_EQ(false, uc_handle_exit(self)); 312 + uc_assert_diag44(self); 313 + 314 + /* Retrieve and check guest register values */ 315 + ASSERT_EQ(0, ioctl(self->vcpu_fd, KVM_GET_REGS, &regs)); 316 + for (int i = 0; i < 8; i++) { 317 + ASSERT_EQ(i, regs.gprs[i]); 318 + ASSERT_EQ(i, sync_regs->gprs[i]); 319 + } 320 + 321 + /* run and expect interception of diag 44 again */ 322 + ASSERT_EQ(0, uc_run_once(self)); 323 + ASSERT_EQ(false, uc_handle_exit(self)); 324 + uc_assert_diag44(self); 325 + 326 + /* check continued increment of register 0 
value */ 327 + ASSERT_EQ(0, ioctl(self->vcpu_fd, KVM_GET_REGS, &regs)); 328 + ASSERT_EQ(1, regs.gprs[0]); 329 + ASSERT_EQ(1, sync_regs->gprs[0]); 330 + } 331 + 332 + TEST_HARNESS_MAIN
+21 -8
tools/testing/selftests/kvm/set_memory_region_test.c
··· 175 175 GUEST_DONE(); 176 176 } 177 177 178 - static void test_move_memory_region(void) 178 + static void test_move_memory_region(bool disable_slot_zap_quirk) 179 179 { 180 180 pthread_t vcpu_thread; 181 181 struct kvm_vcpu *vcpu; ··· 183 183 uint64_t *hva; 184 184 185 185 vm = spawn_vm(&vcpu, &vcpu_thread, guest_code_move_memory_region); 186 + 187 + if (disable_slot_zap_quirk) 188 + vm_enable_cap(vm, KVM_CAP_DISABLE_QUIRKS2, KVM_X86_QUIRK_SLOT_ZAP_ALL); 186 189 187 190 hva = addr_gpa2hva(vm, MEM_REGION_GPA); 188 191 ··· 269 266 GUEST_ASSERT(0); 270 267 } 271 268 272 - static void test_delete_memory_region(void) 269 + static void test_delete_memory_region(bool disable_slot_zap_quirk) 273 270 { 274 271 pthread_t vcpu_thread; 275 272 struct kvm_vcpu *vcpu; ··· 278 275 struct kvm_vm *vm; 279 276 280 277 vm = spawn_vm(&vcpu, &vcpu_thread, guest_code_delete_memory_region); 278 + 279 + if (disable_slot_zap_quirk) 280 + vm_enable_cap(vm, KVM_CAP_DISABLE_QUIRKS2, KVM_X86_QUIRK_SLOT_ZAP_ALL); 281 281 282 282 /* Delete the memory region, the guest should not die. */ 283 283 vm_mem_region_delete(vm, MEM_REGION_SLOT); ··· 559 553 { 560 554 #ifdef __x86_64__ 561 555 int i, loops; 556 + int j, disable_slot_zap_quirk = 0; 562 557 558 + if (kvm_check_cap(KVM_CAP_DISABLE_QUIRKS2) & KVM_X86_QUIRK_SLOT_ZAP_ALL) 559 + disable_slot_zap_quirk = 1; 563 560 /* 564 561 * FIXME: the zero-memslot test fails on aarch64 and s390x because 565 562 * KVM_RUN fails with ENOEXEC or EFAULT. ··· 588 579 else 589 580 loops = 10; 590 581 591 - pr_info("Testing MOVE of in-use region, %d loops\n", loops); 592 - for (i = 0; i < loops; i++) 593 - test_move_memory_region(); 582 + for (j = 0; j <= disable_slot_zap_quirk; j++) { 583 + pr_info("Testing MOVE of in-use region, %d loops, slot zap quirk %s\n", 584 + loops, j ? "disabled" : "enabled"); 585 + for (i = 0; i < loops; i++) 586 + test_move_memory_region(!!j); 594 587 595 - pr_info("Testing DELETE of in-use region, %d loops\n", loops); 596 - for (i = 0; i < loops; i++) 597 - test_delete_memory_region(); 588 + pr_info("Testing DELETE of in-use region, %d loops, slot zap quirk %s\n", 589 + loops, j ? "disabled" : "enabled"); 590 + for (i = 0; i < loops; i++) 591 + test_delete_memory_region(!!j); 592 + } 598 593 #endif 599 594 600 595 return 0;
+7 -4
tools/testing/selftests/kvm/x86_64/debug_regs.c
··· 47 47 /* 48 48 * Single step test, covers 2 basic instructions and 2 emulated 49 49 * 50 - * Enable interrupts during the single stepping to see that 51 - * pending interrupt we raised is not handled due to KVM_GUESTDBG_BLOCKIRQ 50 + * Enable interrupts during the single stepping to see that the pending 51 + * interrupt we raised is not handled due to KVM_GUESTDBG_BLOCKIRQ. 52 + * 53 + * Write MSR_IA32_TSC_DEADLINE to verify that KVM's fastpath handler 54 + * exits to userspace due to single-step being enabled. 55 + */ 53 56 asm volatile("ss_start: " 54 57 "sti\n\t" 55 58 "xor %%eax,%%eax\n\t" 56 59 "cpuid\n\t" 57 - "movl $0x1a0,%%ecx\n\t" 58 - "rdmsr\n\t" 60 + "movl $" __stringify(MSR_IA32_TSC_DEADLINE) ", %%ecx\n\t" 61 + "wrmsr\n\t" 59 62 "cli\n\t" 60 63 : : : "eax", "ebx", "ecx", "edx");
+1 -1
tools/testing/selftests/kvm/x86_64/hyperv_evmcs.c
··· 242 242 TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); 243 243 TEST_REQUIRE(kvm_has_cap(KVM_CAP_NESTED_STATE)); 244 244 TEST_REQUIRE(kvm_has_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS)); 245 - TEST_REQUIRE(kvm_has_cap(KVM_CAP_HYPERV_DIRECT_TLBFLUSH)); 245 + TEST_REQUIRE(kvm_hv_cpu_has(HV_X64_NESTED_DIRECT_FLUSH)); 246 246 247 247 vm = vm_create_with_one_vcpu(&vcpu, guest_code); 248 248
+1 -1
tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c
··· 157 157 int stage; 158 158 159 159 TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SVM)); 160 - TEST_REQUIRE(kvm_has_cap(KVM_CAP_HYPERV_DIRECT_TLBFLUSH)); 160 + TEST_REQUIRE(kvm_hv_cpu_has(HV_X64_NESTED_DIRECT_FLUSH)); 161 161 162 162 /* Create VM */ 163 163 vm = vm_create_with_one_vcpu(&vcpu, guest_code);
+32
tools/testing/selftests/kvm/x86_64/sev_smoke_test.c
··· 160 160 kvm_vm_free(vm); 161 161 } 162 162 163 + static void guest_shutdown_code(void) 164 + { 165 + struct desc_ptr idt; 166 + 167 + /* Clobber the IDT so that #UD is guaranteed to trigger SHUTDOWN. */ 168 + memset(&idt, 0, sizeof(idt)); 169 + __asm__ __volatile__("lidt %0" :: "m"(idt)); 170 + 171 + __asm__ __volatile__("ud2"); 172 + } 173 + 174 + static void test_sev_es_shutdown(void) 175 + { 176 + struct kvm_vcpu *vcpu; 177 + struct kvm_vm *vm; 178 + 179 + uint32_t type = KVM_X86_SEV_ES_VM; 180 + 181 + vm = vm_sev_create_with_one_vcpu(type, guest_shutdown_code, &vcpu); 182 + 183 + vm_sev_launch(vm, SEV_POLICY_ES, NULL); 184 + 185 + vcpu_run(vcpu); 186 + TEST_ASSERT(vcpu->run->exit_reason == KVM_EXIT_SHUTDOWN, 187 + "Wanted SHUTDOWN, got %s", 188 + exit_reason_str(vcpu->run->exit_reason)); 189 + 190 + kvm_vm_free(vm); 191 + } 192 + 163 193 int main(int argc, char *argv[]) 164 194 { 165 195 TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SEV)); ··· 200 170 if (kvm_cpu_has(X86_FEATURE_SEV_ES)) { 201 171 test_sev(guest_sev_es_code, SEV_POLICY_ES | SEV_POLICY_NO_DBG); 202 172 test_sev(guest_sev_es_code, SEV_POLICY_ES); 173 + 174 + test_sev_es_shutdown(); 203 175 204 176 if (kvm_has_cap(KVM_CAP_XCRS) && 205 177 (xgetbv(0) & XFEATURE_MASK_X87_AVX) == XFEATURE_MASK_X87_AVX) {
+37 -17
tools/testing/selftests/kvm/x86_64/xapic_state_test.c
··· 13 13 struct xapic_vcpu { 14 14 struct kvm_vcpu *vcpu; 15 15 bool is_x2apic; 16 + bool has_xavic_errata; 16 17 }; 17 18 18 19 static void xapic_guest_code(void) ··· 32 31 } 33 32 34 + #define X2APIC_RSVD_BITS_MASK (GENMASK_ULL(31, 20) | \ 35 + GENMASK_ULL(17, 16) | \ 36 + GENMASK_ULL(13, 13)) 37 + 35 38 static void x2apic_guest_code(void) 36 39 { 37 40 asm volatile("cli"); ··· 46 41 uint64_t val = x2apic_read_reg(APIC_IRR) | 47 42 x2apic_read_reg(APIC_IRR + 0x10) << 32; 48 43 49 - x2apic_write_reg(APIC_ICR, val); 44 + if (val & X2APIC_RSVD_BITS_MASK) { 45 + x2apic_write_reg_fault(APIC_ICR, val); 46 + } else { 47 + x2apic_write_reg(APIC_ICR, val); 48 + GUEST_ASSERT_EQ(x2apic_read_reg(APIC_ICR), val); 49 + } 50 50 GUEST_SYNC(val); 51 51 } while (1); 52 52 } ··· 81 71 icr = (u64)(*((u32 *)&xapic.regs[APIC_ICR])) | 82 72 (u64)(*((u32 *)&xapic.regs[APIC_ICR2])) << 32; 83 73 if (!x->is_x2apic) { 84 - val &= (-1u | (0xffull << (32 + 24))); 85 - TEST_ASSERT_EQ(icr, val & ~APIC_ICR_BUSY); 86 - } else { 87 - TEST_ASSERT_EQ(icr & ~APIC_ICR_BUSY, val & ~APIC_ICR_BUSY); 74 + if (!x->has_xavic_errata) 75 + val &= (-1u | (0xffull << (32 + 24))); 76 + } else if (val & X2APIC_RSVD_BITS_MASK) { 77 + return; 88 78 } 89 - } 90 - 91 - #define X2APIC_RSVED_BITS_MASK (GENMASK_ULL(31,20) | \ 92 - GENMASK_ULL(17,16) | \ 93 - GENMASK_ULL(13,13)) 80 + if (x->has_xavic_errata) 81 + TEST_ASSERT_EQ(icr & ~APIC_ICR_BUSY, val & ~APIC_ICR_BUSY); 82 + else 83 + TEST_ASSERT_EQ(icr, val & ~APIC_ICR_BUSY); 84 + } 94 85 95 86 static void __test_icr(struct xapic_vcpu *x, uint64_t val) 96 87 { 97 - if (x->is_x2apic) { 98 - /* Hardware writing vICR register requires reserved bits 31:20, 99 - * 17:16 and 13 kept as zero to avoid #GP exception. Data value 100 - * written to vICR should mask out those bits above. 101 - */ 102 - val &= ~X2APIC_RSVED_BITS_MASK; 103 - } 104 - ____test_icr(x, val | APIC_ICR_BUSY); 88 + /* 89 + * The BUSY bit is reserved on both AMD and Intel, but only AMD treats 90 + * it as _must_ be zero. Intel simply ignores the bit. Don't test 91 + * the BUSY bit for x2APIC, as there is no single correct behavior. 92 + */ 93 + if (!x->is_x2apic) 94 + ____test_icr(x, val | APIC_ICR_BUSY); 95 + 105 96 ____test_icr(x, val & ~(u64)APIC_ICR_BUSY); 106 97 } 107 98 ··· 241 230 */ 242 231 vm = vm_create_with_one_vcpu(&x.vcpu, xapic_guest_code); 243 232 x.is_x2apic = false; 233 + 234 + /* 235 + * AMD's AVIC implementation is buggy (fails to clear the ICR BUSY bit), 236 + * and also diverges from KVM with respect to ICR2[23:0] (KVM and Intel 237 + * drop writes, AMD does not). Account for the errata when checking 238 + * that KVM reads back what was written. 239 + */ 240 + x.has_xavic_errata = host_cpu_is_amd && 241 + get_kvm_amd_param_bool("avic"); 244 242 245 243 vcpu_clear_cpuid_feature(x.vcpu, X86_FEATURE_X2APIC); 246 244
+1
tools/testing/selftests/kvm/x86_64/xen_vmcall_test.c
··· 10 10 #include "test_util.h" 11 11 #include "kvm_util.h" 12 12 #include "processor.h" 13 + #include "hyperv.h" 13 14 14 15 #define HCALL_REGION_GPA 0xc0000000ULL 15 16 #define HCALL_REGION_SLOT 10
+8 -23
virt/kvm/coalesced_mmio.c
··· 40 40 return 1; 41 41 } 42 42 43 - static int coalesced_mmio_has_room(struct kvm_coalesced_mmio_dev *dev, u32 last) 44 - { 45 - struct kvm_coalesced_mmio_ring *ring; 46 - unsigned avail; 47 - 48 - /* Are we able to batch it ? */ 49 - 50 - /* last is the first free entry 51 - * check if we don't meet the first used entry 52 - * there is always one unused entry in the buffer 53 - */ 54 - ring = dev->kvm->coalesced_mmio_ring; 55 - avail = (ring->first - last - 1) % KVM_COALESCED_MMIO_MAX; 56 - if (avail == 0) { 57 - /* full */ 58 - return 0; 59 - } 60 - 61 - return 1; 62 - } 63 - 64 43 static int coalesced_mmio_write(struct kvm_vcpu *vcpu, 65 44 struct kvm_io_device *this, gpa_t addr, 66 45 int len, const void *val) ··· 53 74 54 75 spin_lock(&dev->kvm->ring_lock); 55 76 77 + /* 78 + * last is the index of the entry to fill. Verify userspace hasn't 79 + * set last to be out of range, and that there is room in the ring. 80 + * Leave one entry free in the ring so that userspace can differentiate 81 + * between an empty ring and a full ring. 82 + */ 56 83 insert = READ_ONCE(ring->last); 57 - if (!coalesced_mmio_has_room(dev, insert) || 58 - insert >= KVM_COALESCED_MMIO_MAX) { 84 + if (insert >= KVM_COALESCED_MMIO_MAX || 85 + (insert + 1) % KVM_COALESCED_MMIO_MAX == READ_ONCE(ring->first)) { 59 86 spin_unlock(&dev->kvm->ring_lock); 60 87 return -EOPNOTSUPP; 61 88 }
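The invariant the new check encodes, spelled out as a standalone sketch (illustrative only; KVM_COALESCED_MMIO_MAX is the number of slots in the ring page):

	/*
	 * One slot is always left unused so the two states stay distinguishable:
	 *   empty: first == last
	 *   full:  (last + 1) % KVM_COALESCED_MMIO_MAX == first
	 * i.e. at most KVM_COALESCED_MMIO_MAX - 1 entries are ever in flight.
	 */
	static bool coalesced_ring_full(u32 first, u32 last)
	{
		return (last + 1) % KVM_COALESCED_MMIO_MAX == first;
	}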
+144 -137
virt/kvm/kvm_main.c
··· 136 136 #define KVM_COMPAT(c) .compat_ioctl = kvm_no_compat_ioctl, \ 137 137 .open = kvm_no_compat_open 138 138 #endif 139 - static int hardware_enable_all(void); 140 - static void hardware_disable_all(void); 139 + static int kvm_enable_virtualization(void); 140 + static void kvm_disable_virtualization(void); 141 141 142 142 static void kvm_io_bus_destroy(struct kvm_io_bus *bus); 143 143 ··· 1220 1220 if (r) 1221 1221 goto out_err_no_arch_destroy_vm; 1222 1222 1223 - r = hardware_enable_all(); 1223 + r = kvm_enable_virtualization(); 1224 1224 if (r) 1225 1225 goto out_err_no_disable; 1226 1226 ··· 1263 1263 mmu_notifier_unregister(&kvm->mmu_notifier, current->mm); 1264 1264 #endif 1265 1265 out_err_no_mmu_notifier: 1266 - hardware_disable_all(); 1266 + kvm_disable_virtualization(); 1267 1267 out_err_no_disable: 1268 1268 kvm_arch_destroy_vm(kvm); 1269 1269 out_err_no_arch_destroy_vm: ··· 1360 1360 #endif 1361 1361 kvm_arch_free_vm(kvm); 1362 1362 preempt_notifier_dec(); 1363 - hardware_disable_all(); 1363 + kvm_disable_virtualization(); 1364 1364 mmdrop(mm); 1365 1365 } 1366 1366 ··· 3270 3270 int r; 3271 3271 unsigned long addr; 3272 3272 3273 + if (WARN_ON_ONCE(offset + len > PAGE_SIZE)) 3274 + return -EFAULT; 3275 + 3273 3276 addr = gfn_to_hva_memslot_prot(slot, gfn, NULL); 3274 3277 if (kvm_is_error_hva(addr)) 3275 3278 return -EFAULT; ··· 3346 3343 int r; 3347 3344 unsigned long addr; 3348 3345 3346 + if (WARN_ON_ONCE(offset + len > PAGE_SIZE)) 3347 + return -EFAULT; 3348 + 3349 3349 addr = gfn_to_hva_memslot_prot(slot, gfn, NULL); 3350 3350 if (kvm_is_error_hva(addr)) 3351 3351 return -EFAULT; ··· 3378 3372 { 3379 3373 int r; 3380 3374 unsigned long addr; 3375 + 3376 + if (WARN_ON_ONCE(offset + len > PAGE_SIZE)) 3377 + return -EFAULT; 3381 3378 3382 3379 addr = gfn_to_hva_memslot(memslot, gfn); 3383 3380 if (kvm_is_error_hva(addr)) ··· 3585 3576 int ret; 3586 3577 3587 3578 while ((seg = next_segment(len, offset)) != 0) { 3588 - ret = kvm_write_guest_page(kvm, gfn, zero_page, offset, len); 3579 + ret = kvm_write_guest_page(kvm, gfn, zero_page, offset, seg); 3589 3580 if (ret < 0) 3590 3581 return ret; 3591 3582 offset = 0; ··· 5575 5566 }; 5576 5567 5577 5568 #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING 5569 + static bool enable_virt_at_load = true; 5570 + module_param(enable_virt_at_load, bool, 0444); 5571 + 5578 5572 __visible bool kvm_rebooting; 5579 5573 EXPORT_SYMBOL_GPL(kvm_rebooting); 5580 5574 5581 - static DEFINE_PER_CPU(bool, hardware_enabled); 5575 + static DEFINE_PER_CPU(bool, virtualization_enabled); 5576 + static DEFINE_MUTEX(kvm_usage_lock); 5582 5577 static int kvm_usage_count; 5583 5578 5584 - static int __hardware_enable_nolock(void) 5579 + __weak void kvm_arch_enable_virtualization(void) 5585 5580 { 5586 - if (__this_cpu_read(hardware_enabled)) 5581 + 5582 + } 5583 + 5584 + __weak void kvm_arch_disable_virtualization(void) 5585 + { 5586 + 5587 + } 5588 + 5589 + static int kvm_enable_virtualization_cpu(void) 5590 + { 5591 + if (__this_cpu_read(virtualization_enabled)) 5587 5592 return 0; 5588 5593 5589 - if (kvm_arch_hardware_enable()) { 5594 + if (kvm_arch_enable_virtualization_cpu()) { 5590 5595 pr_info("kvm: enabling virtualization on CPU%d failed\n", 5591 5596 raw_smp_processor_id()); 5592 5597 return -EIO; 5593 5598 } 5594 5599 5595 - __this_cpu_write(hardware_enabled, true); 5600 + __this_cpu_write(virtualization_enabled, true); 5596 5601 return 0; 5597 - } 5598 - 5599 - static void hardware_enable_nolock(void *failed) 5600 - { 5601 - if 
(__hardware_enable_nolock()) 5602 - atomic_inc(failed); 5603 5602 } 5604 5603 5605 5604 static int kvm_online_cpu(unsigned int cpu) 5606 5605 { 5607 - int ret = 0; 5608 - 5609 5606 /* 5610 5607 * Abort the CPU online process if hardware virtualization cannot 5611 5608 * be enabled. Otherwise running VMs would encounter unrecoverable 5612 5609 * errors when scheduled to this CPU. 5613 5610 */ 5614 - mutex_lock(&kvm_lock); 5615 - if (kvm_usage_count) 5616 - ret = __hardware_enable_nolock(); 5617 - mutex_unlock(&kvm_lock); 5618 - return ret; 5611 + return kvm_enable_virtualization_cpu(); 5619 5612 } 5620 5613 5621 - static void hardware_disable_nolock(void *junk) 5614 + static void kvm_disable_virtualization_cpu(void *ign) 5622 5615 { 5623 - /* 5624 - * Note, hardware_disable_all_nolock() tells all online CPUs to disable 5625 - * hardware, not just CPUs that successfully enabled hardware! 5626 - */ 5627 - if (!__this_cpu_read(hardware_enabled)) 5616 + if (!__this_cpu_read(virtualization_enabled)) 5628 5617 return; 5629 5618 5630 - kvm_arch_hardware_disable(); 5619 + kvm_arch_disable_virtualization_cpu(); 5631 5620 5632 - __this_cpu_write(hardware_enabled, false); 5621 + __this_cpu_write(virtualization_enabled, false); 5633 5622 } 5634 5623 5635 5624 static int kvm_offline_cpu(unsigned int cpu) 5636 5625 { 5637 - mutex_lock(&kvm_lock); 5638 - if (kvm_usage_count) 5639 - hardware_disable_nolock(NULL); 5640 - mutex_unlock(&kvm_lock); 5626 + kvm_disable_virtualization_cpu(NULL); 5641 5627 return 0; 5642 - } 5643 - 5644 - static void hardware_disable_all_nolock(void) 5645 - { 5646 - BUG_ON(!kvm_usage_count); 5647 - 5648 - kvm_usage_count--; 5649 - if (!kvm_usage_count) 5650 - on_each_cpu(hardware_disable_nolock, NULL, 1); 5651 - } 5652 - 5653 - static void hardware_disable_all(void) 5654 - { 5655 - cpus_read_lock(); 5656 - mutex_lock(&kvm_lock); 5657 - hardware_disable_all_nolock(); 5658 - mutex_unlock(&kvm_lock); 5659 - cpus_read_unlock(); 5660 - } 5661 - 5662 - static int hardware_enable_all(void) 5663 - { 5664 - atomic_t failed = ATOMIC_INIT(0); 5665 - int r; 5666 - 5667 - /* 5668 - * Do not enable hardware virtualization if the system is going down. 5669 - * If userspace initiated a forced reboot, e.g. reboot -f, then it's 5670 - * possible for an in-flight KVM_CREATE_VM to trigger hardware enabling 5671 - * after kvm_reboot() is called. Note, this relies on system_state 5672 - * being set _before_ kvm_reboot(), which is why KVM uses a syscore ops 5673 - * hook instead of registering a dedicated reboot notifier (the latter 5674 - * runs before system_state is updated). 5675 - */ 5676 - if (system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF || 5677 - system_state == SYSTEM_RESTART) 5678 - return -EBUSY; 5679 - 5680 - /* 5681 - * When onlining a CPU, cpu_online_mask is set before kvm_online_cpu() 5682 - * is called, and so on_each_cpu() between them includes the CPU that 5683 - * is being onlined. As a result, hardware_enable_nolock() may get 5684 - * invoked before kvm_online_cpu(), which also enables hardware if the 5685 - * usage count is non-zero. Disable CPU hotplug to avoid attempting to 5686 - * enable hardware multiple times. 
-     */
-    cpus_read_lock();
-    mutex_lock(&kvm_lock);
-
-    r = 0;
-
-    kvm_usage_count++;
-    if (kvm_usage_count == 1) {
-        on_each_cpu(hardware_enable_nolock, &failed, 1);
-
-        if (atomic_read(&failed)) {
-            hardware_disable_all_nolock();
-            r = -EBUSY;
-        }
-    }
-
-    mutex_unlock(&kvm_lock);
-    cpus_read_unlock();
-
-    return r;
 }

 static void kvm_shutdown(void)
···
     */
     pr_info("kvm: exiting hardware virtualization\n");
     kvm_rebooting = true;
-    on_each_cpu(hardware_disable_nolock, NULL, 1);
+    on_each_cpu(kvm_disable_virtualization_cpu, NULL, 1);
 }

 static int kvm_suspend(void)
 {
     /*
      * Secondary CPUs and CPU hotplug are disabled across the suspend/resume
-     * callbacks, i.e. no need to acquire kvm_lock to ensure the usage count
-     * is stable. Assert that kvm_lock is not held to ensure the system
-     * isn't suspended while KVM is enabling hardware. Hardware enabling
-     * can be preempted, but the task cannot be frozen until it has dropped
-     * all locks (userspace tasks are frozen via a fake signal).
+     * callbacks, i.e. no need to acquire kvm_usage_lock to ensure the usage
+     * count is stable. Assert that kvm_usage_lock is not held to ensure
+     * the system isn't suspended while KVM is enabling hardware. Hardware
+     * enabling can be preempted, but the task cannot be frozen until it has
+     * dropped all locks (userspace tasks are frozen via a fake signal).
      */
-    lockdep_assert_not_held(&kvm_lock);
+    lockdep_assert_not_held(&kvm_usage_lock);
     lockdep_assert_irqs_disabled();

-    if (kvm_usage_count)
-        hardware_disable_nolock(NULL);
+    kvm_disable_virtualization_cpu(NULL);
     return 0;
 }

 static void kvm_resume(void)
 {
-    lockdep_assert_not_held(&kvm_lock);
+    lockdep_assert_not_held(&kvm_usage_lock);
     lockdep_assert_irqs_disabled();

-    if (kvm_usage_count)
-        WARN_ON_ONCE(__hardware_enable_nolock());
+    WARN_ON_ONCE(kvm_enable_virtualization_cpu());
 }

 static struct syscore_ops kvm_syscore_ops = {
···
     .resume = kvm_resume,
     .shutdown = kvm_shutdown,
 };
+
+static int kvm_enable_virtualization(void)
+{
+    int r;
+
+    guard(mutex)(&kvm_usage_lock);
+
+    if (kvm_usage_count++)
+        return 0;
+
+    kvm_arch_enable_virtualization();
+
+    r = cpuhp_setup_state(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online",
+                          kvm_online_cpu, kvm_offline_cpu);
+    if (r)
+        goto err_cpuhp;
+
+    register_syscore_ops(&kvm_syscore_ops);
+
+    /*
+     * Undo virtualization enabling and bail if the system is going down.
+     * If userspace initiated a forced reboot, e.g. reboot -f, then it's
+     * possible for an in-flight operation to enable virtualization after
+     * syscore_shutdown() is called, i.e. without kvm_shutdown() being
+     * invoked. Note, this relies on system_state being set _before_
+     * kvm_shutdown(), e.g. to ensure either kvm_shutdown() is invoked
+     * or this CPU observes the impending shutdown. Which is why KVM uses
+     * a syscore ops hook instead of registering a dedicated reboot
+     * notifier (the latter runs before system_state is updated).
+     */
+    if (system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF ||
+        system_state == SYSTEM_RESTART) {
+        r = -EBUSY;
+        goto err_rebooting;
+    }
+
+    return 0;
+
+err_rebooting:
+    unregister_syscore_ops(&kvm_syscore_ops);
+    cpuhp_remove_state(CPUHP_AP_KVM_ONLINE);
+err_cpuhp:
+    kvm_arch_disable_virtualization();
+    --kvm_usage_count;
+    return r;
+}
+
+static void kvm_disable_virtualization(void)
+{
+    guard(mutex)(&kvm_usage_lock);
+
+    if (--kvm_usage_count)
+        return;
+
+    unregister_syscore_ops(&kvm_syscore_ops);
+    cpuhp_remove_state(CPUHP_AP_KVM_ONLINE);
+    kvm_arch_disable_virtualization();
+}
+
+static int kvm_init_virtualization(void)
+{
+    if (enable_virt_at_load)
+        return kvm_enable_virtualization();
+
+    return 0;
+}
+
+static void kvm_uninit_virtualization(void)
+{
+    if (enable_virt_at_load)
+        kvm_disable_virtualization();
+}
 #else /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */
-static int hardware_enable_all(void)
+static int kvm_enable_virtualization(void)
 {
     return 0;
 }

-static void hardware_disable_all(void)
+static int kvm_init_virtualization(void)
+{
+    return 0;
+}
+
+static void kvm_disable_virtualization(void)
+{
+
+}
+
+static void kvm_uninit_virtualization(void)
 {

 }
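Taken together, kvm_enable_virtualization() and kvm_disable_virtualization() above implement a refcounted bring-up: the 0 -> 1 transition invokes the new kvm_arch_enable_virtualization() hook, installs the cpuhp and syscore callbacks, and rolls both back if registration fails or a shutdown is already in flight; the 1 -> 0 transition unregisters them; kvm_usage_lock serializes the transitions, with guard(mutex) dropping the lock automatically on return. A minimal userspace stand-in for the pattern, using hypothetical names (get_ref, put_ref, global_setup, global_teardown) and an explicit pthread mutex in place of the scoped guard:

    /* Illustrative sketch only, not kernel code. */
    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t usage_lock = PTHREAD_MUTEX_INITIALIZER;
    static int usage_count;

    /* Stand-ins for the work done on the 0 -> 1 and 1 -> 0 transitions. */
    static int global_setup(void)     { puts("setup");    return 0; }
    static void global_teardown(void) { puts("teardown"); }

    static int get_ref(void)
    {
        int r = 0;

        pthread_mutex_lock(&usage_lock);
        if (usage_count++ == 0) {
            r = global_setup();
            if (r)
                usage_count--;    /* undo the count on failure */
        }
        pthread_mutex_unlock(&usage_lock);
        return r;
    }

    static void put_ref(void)
    {
        pthread_mutex_lock(&usage_lock);
        if (--usage_count == 0)
            global_teardown();
        pthread_mutex_unlock(&usage_lock);
    }

    int main(void)
    {
        if (get_ref())    /* first user: does the global setup */
            return 1;
        if (get_ref())    /* second user: only bumps the count */
            return 1;
        put_ref();        /* 2 -> 1: nothing global happens */
        put_ref();        /* 1 -> 0: teardown runs */
        return 0;
    }

The kernel version has the extra wrinkle of a forced reboot racing with enabling, which is why the system_state check sits after the syscore ops are registered and why failure unwinds through err_rebooting/err_cpuhp.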
···
     int r;
     int cpu;

-#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
-    r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online",
-                                  kvm_online_cpu, kvm_offline_cpu);
-    if (r)
-        return r;
-
-    register_syscore_ops(&kvm_syscore_ops);
-#endif
-
     /* A kmem cache lets us meet the alignment requirements of fx_save. */
     if (!vcpu_align)
         vcpu_align = __alignof__(struct kvm_vcpu);
···
                                offsetofend(struct kvm_vcpu, stats_id)
                                - offsetof(struct kvm_vcpu, arch),
                                NULL);
-    if (!kvm_vcpu_cache) {
-        r = -ENOMEM;
-        goto err_vcpu_cache;
-    }
+    if (!kvm_vcpu_cache)
+        return -ENOMEM;

     for_each_possible_cpu(cpu) {
         if (!alloc_cpumask_var_node(&per_cpu(cpu_kick_mask, cpu),
···

     kvm_gmem_init(module);

+    r = kvm_init_virtualization();
+    if (r)
+        goto err_virt;
+
     /*
      * Registration _must_ be the very last thing done, as this exposes
      * /dev/kvm to userspace, i.e. all infrastructure must be setup!
···
     return 0;

 err_register:
+    kvm_uninit_virtualization();
+err_virt:
     kvm_vfio_ops_exit();
 err_vfio:
     kvm_async_pf_deinit();
···
     for_each_possible_cpu(cpu)
         free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
     kmem_cache_destroy(kvm_vcpu_cache);
-err_vcpu_cache:
-#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
-    unregister_syscore_ops(&kvm_syscore_ops);
-    cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE);
-#endif
     return r;
 }
 EXPORT_SYMBOL_GPL(kvm_init);
···
     */
     misc_deregister(&kvm_dev);

+    kvm_uninit_virtualization();
+
     debugfs_remove_recursive(kvm_debugfs_dir);
     for_each_possible_cpu(cpu)
         free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
     kmem_cache_destroy(kvm_vcpu_cache);
     kvm_vfio_ops_exit();
     kvm_async_pf_deinit();
-#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
-    unregister_syscore_ops(&kvm_syscore_ops);
-    cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE);
-#endif
     kvm_irqfd_exit();
 }
 EXPORT_SYMBOL_GPL(kvm_exit);
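One detail of the series above worth spelling out: kvm_arch_enable_virtualization() and kvm_arch_disable_virtualization() are defined __weak, so the common code always has something to call on the 0 -> 1 and 1 -> 0 usage transitions, and an architecture overrides the empty default only if it has system-wide (rather than per-CPU) work to do. A standalone sketch of that linkage pattern, using hypothetical names (arch_hook, common_path) rather than the KVM symbols:

    /* Illustrative sketch of the __weak hook pattern; builds with GCC or Clang. */
    #include <stdio.h>

    /* Empty default provided by "common" code. */
    __attribute__((weak)) void arch_hook(void)
    {
    }

    static void common_path(void)
    {
        arch_hook();    /* no-op unless a strong definition is linked in */
        puts("common work");
    }

    int main(void)
    {
        common_path();
        return 0;
    }

Overriding the hook is purely a link-time affair: an object file that defines a non-weak arch_hook() replaces the empty default, with no registration call needed.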