Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge tag 'kvm-5.6-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull KVM updates from Paolo Bonzini:
"This is the first batch of KVM changes.

ARM:
- cleanups and corner case fixes.

PPC:
- Bugfixes

x86:
- Support for mapping DAX areas with large nested page table entries.

- Cleanups and bugfixes here too. A particularly important one is a
fix for FPU load when the thread has TIF_NEED_FPU_LOAD. There is
also a race condition which could be used in guest userspace to
exploit the guest kernel, for which the embargo expired today.

- Fast path for IPI delivery vmexits, shaving about 200 clock cycles
from IPI latency.

- Protect against "Spectre-v1/L1TF" (bring data in the cache via
speculative out of bound accesses, use L1TF on the sibling
hyperthread to read it), which unfortunately is an even bigger
whack-a-mole game than SpectreV1.

Sean continues his mission to rewrite KVM. In addition to a sizable
number of x86 patches, this time he contributed a pretty large
refactoring of vCPU creation that affects all architectures but should
not have any visible effect.

s390 will come next week together with some more x86 patches"

* tag 'kvm-5.6-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (204 commits)
x86/KVM: Clean up host's steal time structure
x86/KVM: Make sure KVM_VCPU_FLUSH_TLB flag is not missed
x86/kvm: Cache gfn to pfn translation
x86/kvm: Introduce kvm_(un)map_gfn()
x86/kvm: Be careful not to clear KVM_VCPU_FLUSH_TLB bit
KVM: PPC: Book3S PR: Fix -Werror=return-type build failure
KVM: PPC: Book3S HV: Release lock on page-out failure path
KVM: arm64: Treat emulated TVAL TimerValue as a signed 32-bit integer
KVM: arm64: pmu: Only handle supported event counters
KVM: arm64: pmu: Fix chained SW_INCR counters
KVM: arm64: pmu: Don't mark a counter as chained if the odd one is disabled
KVM: arm64: pmu: Don't increment SW_INCR if PMCR.E is unset
KVM: x86: Use a typedef for fastop functions
KVM: X86: Add 'else' to unify fastop and execute call path
KVM: x86: inline memslot_valid_for_gpte
KVM: x86/mmu: Use huge pages for DAX-backed files
KVM: x86/mmu: Remove lpage_is_disallowed() check from set_spte()
KVM: x86/mmu: Fold max_mapping_level() into kvm_mmu_hugepage_adjust()
KVM: x86/mmu: Zap any compound page when collapsing sptes
KVM: x86/mmu: Remove obsolete gfn restoration in FNAME(fetch)
...

+2400 -2058
+60
Documentation/powerpc/ultravisor.rst
···
 up its internal state for this virtual machine.


+H_SVM_INIT_ABORT
+----------------
+
+Abort the process of securing an SVM.
+
+Syntax
+~~~~~~
+
+.. code-block:: c
+
+	uint64_t hypercall(const uint64_t H_SVM_INIT_ABORT)
+
+Return values
+~~~~~~~~~~~~~
+
+One of the following values:
+
+	* H_PARAMETER	on successfully cleaning up the state,
+			Hypervisor will return this value to the
+			**guest**, to indicate that the underlying
+			UV_ESM ultracall failed.
+
+	* H_STATE	if called after a VM has gone secure (i.e
+			H_SVM_INIT_DONE hypercall was successful).
+
+	* H_UNSUPPORTED	if called from a wrong context (e.g. from a
+			normal VM).
+
+Description
+~~~~~~~~~~~
+
+Abort the process of securing a virtual machine. This call must
+be made after a prior call to ``H_SVM_INIT_START`` hypercall and
+before a call to ``H_SVM_INIT_DONE``.
+
+On entry into this hypercall the non-volatile GPRs and FPRs are
+expected to contain the values they had at the time the VM issued
+the UV_ESM ultracall. Further ``SRR0`` is expected to contain the
+address of the instruction after the ``UV_ESM`` ultracall and ``SRR1``
+the MSR value with which to return to the VM.
+
+This hypercall will cleanup any partial state that was established for
+the VM since the prior ``H_SVM_INIT_START`` hypercall, including paging
+out pages that were paged-into secure memory, and issue the
+``UV_SVM_TERMINATE`` ultracall to terminate the VM.
+
+After the partial state is cleaned up, control returns to the VM
+(**not Ultravisor**), at the address specified in ``SRR0`` with the
+MSR values set to the value in ``SRR1``.
+
+Use cases
+~~~~~~~~~
+
+If after a successful call to ``H_SVM_INIT_START``, the Ultravisor
+encounters an error while securing a virtual machine, either due
+to lack of resources or because the VM's security information could
+not be validated, Ultravisor informs the Hypervisor about it.
+Hypervisor should use this call to clean up any internal state for
+this virtual machine and return to the VM.
+
 H_SVM_PAGE_IN
 -------------
 
+9
Documentation/virt/kvm/api.txt
···
 arm64 system registers have the following id bit patterns:
   0x6030 0000 0013 <op0:2> <op1:3> <crn:4> <crm:4> <op2:3>
 
+WARNING:
+     Two system register IDs do not follow the specified pattern. These
+     are KVM_REG_ARM_TIMER_CVAL and KVM_REG_ARM_TIMER_CNT, which map to
+     system registers CNTV_CVAL_EL0 and CNTVCT_EL0 respectively. These
+     two had their values accidentally swapped, which means TIMER_CVAL is
+     derived from the register encoding for CNTVCT_EL0 and TIMER_CNT is
+     derived from the register encoding for CNTV_CVAL_EL0. As this is
+     API, it must remain this way.
+
 arm64 firmware pseudo-registers have the following bit pattern:
   0x6030 0000 0014 <regno:16>
 
+24 -3
arch/arm/include/asm/kvm_emulate.h
···
 
 #include <linux/kvm_host.h>
 #include <asm/kvm_asm.h>
-#include <asm/kvm_mmio.h>
 #include <asm/kvm_arm.h>
 #include <asm/cputype.h>
 
 /* arm64 compatibility macros */
+#define PSR_AA32_MODE_FIQ	FIQ_MODE
+#define PSR_AA32_MODE_SVC	SVC_MODE
 #define PSR_AA32_MODE_ABT	ABT_MODE
 #define PSR_AA32_MODE_UND	UND_MODE
 #define PSR_AA32_T_BIT		PSR_T_BIT
+#define PSR_AA32_F_BIT		PSR_F_BIT
 #define PSR_AA32_I_BIT		PSR_I_BIT
 #define PSR_AA32_A_BIT		PSR_A_BIT
 #define PSR_AA32_E_BIT		PSR_E_BIT
 #define PSR_AA32_IT_MASK	PSR_IT_MASK
+#define PSR_AA32_GE_MASK	0x000f0000
+#define PSR_AA32_DIT_BIT	0x00200000
+#define PSR_AA32_PAN_BIT	0x00400000
+#define PSR_AA32_SSBS_BIT	0x00800000
+#define PSR_AA32_Q_BIT		PSR_Q_BIT
+#define PSR_AA32_V_BIT		PSR_V_BIT
+#define PSR_AA32_C_BIT		PSR_C_BIT
+#define PSR_AA32_Z_BIT		PSR_Z_BIT
+#define PSR_AA32_N_BIT		PSR_N_BIT
 
 unsigned long *vcpu_reg(struct kvm_vcpu *vcpu, u8 reg_num);
···
 static inline void vcpu_write_spsr(struct kvm_vcpu *vcpu, unsigned long v)
 {
 	*__vcpu_spsr(vcpu) = v;
+}
+
+static inline unsigned long host_spsr_to_spsr32(unsigned long spsr)
+{
+	return spsr;
 }
 
 static inline unsigned long vcpu_get_reg(struct kvm_vcpu *vcpu,
···
 	return kvm_vcpu_get_hsr(vcpu) & HSR_SSE;
 }
 
+static inline bool kvm_vcpu_dabt_issf(const struct kvm_vcpu *vcpu)
+{
+	return false;
+}
+
 static inline int kvm_vcpu_dabt_get_rd(struct kvm_vcpu *vcpu)
 {
 	return (kvm_vcpu_get_hsr(vcpu) & HSR_SRT_MASK) >> HSR_SRT_SHIFT;
···
 }
 
 /* Get Access Size from a data abort */
-static inline int kvm_vcpu_dabt_get_as(struct kvm_vcpu *vcpu)
+static inline unsigned int kvm_vcpu_dabt_get_as(struct kvm_vcpu *vcpu)
 {
 	switch ((kvm_vcpu_get_hsr(vcpu) >> 22) & 0x3) {
 	case 0:
···
 		return 4;
 	default:
 		kvm_err("Hardware is weird: SAS 0b11 is reserved\n");
-		return -EFAULT;
+		return 4;
 	}
 }
 
+9 -7
arch/arm/include/asm/kvm_host.h
···
 #include <asm/cputype.h>
 #include <asm/kvm.h>
 #include <asm/kvm_asm.h>
-#include <asm/kvm_mmio.h>
 #include <asm/fpstate.h>
 #include <kvm/arm_arch_timer.h>
···
 	/* Don't run the guest (internal implementation need) */
 	bool pause;
 
-	/* IO related fields */
-	struct kvm_decode mmio_decode;
-
 	/* Cache some mmu pages needed inside spinlock regions */
 	struct kvm_mmu_memory_cache mmu_page_cache;
···
 int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
 int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
 
-struct kvm_vcpu *kvm_arm_get_running_vcpu(void);
-struct kvm_vcpu __percpu **kvm_get_running_vcpus(void);
 void kvm_arm_halt_guest(struct kvm *kvm);
 void kvm_arm_resume_guest(struct kvm *kvm);
···
 
 static inline void handle_exit_early(struct kvm_vcpu *vcpu, struct kvm_run *run,
 				     int exception_index) {}
+
+/* MMIO helpers */
+void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data);
+unsigned long kvm_mmio_read_buf(const void *buf, unsigned int len);
+
+int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run);
+int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run,
+		 phys_addr_t fault_ipa);
 
 static inline void __cpu_init_hyp_mode(phys_addr_t pgd_ptr,
 				       unsigned long hyp_stack_ptr,
···
 static inline bool kvm_arch_requires_vhe(void) { return false; }
 static inline void kvm_arch_hardware_unsetup(void) {}
 static inline void kvm_arch_sync_events(struct kvm *kvm) {}
-static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {}
 static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
 static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {}
+static inline void kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu) {}
 
 static inline void kvm_arm_init_debug(void) {}
 static inline void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) {}
+1
arch/arm/include/asm/kvm_hyp.h
···
 #include <linux/compiler.h>
 #include <linux/kvm_host.h>
 #include <asm/cp15.h>
+#include <asm/kvm_arm.h>
 #include <asm/vfp.h>
 
 #define __hyp_text __section(.hyp.text) notrace
-26
arch/arm/include/asm/kvm_mmio.h
···
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) 2012 - Virtual Open Systems and Columbia University
- * Author: Christoffer Dall <c.dall@virtualopensystems.com>
- */
-
-#ifndef __ARM_KVM_MMIO_H__
-#define __ARM_KVM_MMIO_H__
-
-#include <linux/kvm_host.h>
-#include <asm/kvm_asm.h>
-#include <asm/kvm_arm.h>
-
-struct kvm_decode {
-	unsigned long rt;
-	bool sign_extend;
-};
-
-void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data);
-unsigned long kvm_mmio_read_buf(const void *buf, unsigned int len);
-
-int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run);
-int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run,
-		 phys_addr_t fault_ipa);
-
-#endif /* __ARM_KVM_MMIO_H__ */
-5
arch/arm/kvm/guest.c
···
 	{ NULL }
 };
 
-int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
-{
-	return 0;
-}
-
 static u64 core_reg_offset_from_id(u64 id)
 {
 	return id & ~(KVM_REG_ARCH_MASK | KVM_REG_SIZE_MASK | KVM_REG_ARM_CORE);
+38 -2
arch/arm64/include/asm/kvm_emulate.h
···
 #include <asm/esr.h>
 #include <asm/kvm_arm.h>
 #include <asm/kvm_hyp.h>
-#include <asm/kvm_mmio.h>
 #include <asm/ptrace.h>
 #include <asm/cputype.h>
 #include <asm/virt.h>
···
 	vcpu_gp_regs(vcpu)->spsr[KVM_SPSR_EL1] = v;
 }
 
+/*
+ * The layout of SPSR for an AArch32 state is different when observed from an
+ * AArch64 SPSR_ELx or an AArch32 SPSR_*. This function generates the AArch32
+ * view given an AArch64 view.
+ *
+ * In ARM DDI 0487E.a see:
+ *
+ * - The AArch64 view (SPSR_EL2) in section C5.2.18, page C5-426
+ * - The AArch32 view (SPSR_abt) in section G8.2.126, page G8-6256
+ * - The AArch32 view (SPSR_und) in section G8.2.132, page G8-6280
+ *
+ * Which show the following differences:
+ *
+ * | Bit | AA64 | AA32 | Notes                       |
+ * +-----+------+------+-----------------------------+
+ * | 24  | DIT  | J    | J is RES0 in ARMv8          |
+ * | 21  | SS   | DIT  | SS doesn't exist in AArch32 |
+ *
+ * ... and all other bits are (currently) common.
+ */
+static inline unsigned long host_spsr_to_spsr32(unsigned long spsr)
+{
+	const unsigned long overlap = BIT(24) | BIT(21);
+	unsigned long dit = !!(spsr & PSR_AA32_DIT_BIT);
+
+	spsr &= ~overlap;
+
+	spsr |= dit << 21;
+
+	return spsr;
+}
+
 static inline bool vcpu_mode_priv(const struct kvm_vcpu *vcpu)
 {
 	u32 mode;
···
 	return !!(kvm_vcpu_get_hsr(vcpu) & ESR_ELx_SSE);
 }
 
+static inline bool kvm_vcpu_dabt_issf(const struct kvm_vcpu *vcpu)
+{
+	return !!(kvm_vcpu_get_hsr(vcpu) & ESR_ELx_SF);
+}
+
 static inline int kvm_vcpu_dabt_get_rd(const struct kvm_vcpu *vcpu)
 {
 	return (kvm_vcpu_get_hsr(vcpu) & ESR_ELx_SRT_MASK) >> ESR_ELx_SRT_SHIFT;
···
 	return !!(kvm_vcpu_get_hsr(vcpu) & ESR_ELx_CM);
 }
 
-static inline int kvm_vcpu_dabt_get_as(const struct kvm_vcpu *vcpu)
+static inline unsigned int kvm_vcpu_dabt_get_as(const struct kvm_vcpu *vcpu)
 {
 	return 1 << ((kvm_vcpu_get_hsr(vcpu) & ESR_ELx_SAS) >> ESR_ELx_SAS_SHIFT);
 }
+9 -7
arch/arm64/include/asm/kvm_host.h
···
 #include <asm/fpsimd.h>
 #include <asm/kvm.h>
 #include <asm/kvm_asm.h>
-#include <asm/kvm_mmio.h>
 #include <asm/thread_info.h>
 
 #define __KVM_HAVE_ARCH_INTC_INITIALIZED
···
 
 int __attribute_const__ kvm_target_cpu(void);
 int kvm_reset_vcpu(struct kvm_vcpu *vcpu);
-void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu);
+void kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu);
 int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext);
 void __extended_idmap_trampoline(phys_addr_t boot_pgd, phys_addr_t idmap_start);
···
 	/* Don't run the guest (internal implementation need) */
 	bool pause;
 
-	/* IO related fields */
-	struct kvm_decode mmio_decode;
-
 	/* Cache some mmu pages needed inside spinlock regions */
 	struct kvm_mmu_memory_cache mmu_page_cache;
···
 int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
 int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
 
-struct kvm_vcpu *kvm_arm_get_running_vcpu(void);
-struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void);
 void kvm_arm_halt_guest(struct kvm *kvm);
 void kvm_arm_resume_guest(struct kvm *kvm);
···
 		       int exception_index);
 void handle_exit_early(struct kvm_vcpu *vcpu, struct kvm_run *run,
 		       int exception_index);
+
+/* MMIO helpers */
+void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data);
+unsigned long kvm_mmio_read_buf(const void *buf, unsigned int len);
+
+int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run);
+int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run,
+		 phys_addr_t fault_ipa);
 
 int kvm_perf_init(void);
 int kvm_perf_teardown(void);
-29
arch/arm64/include/asm/kvm_mmio.h
···
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) 2012 - Virtual Open Systems and Columbia University
- * Author: Christoffer Dall <c.dall@virtualopensystems.com>
- */
-
-#ifndef __ARM64_KVM_MMIO_H__
-#define __ARM64_KVM_MMIO_H__
-
-#include <linux/kvm_host.h>
-#include <asm/kvm_arm.h>
-
-/*
- * This is annoying. The mmio code requires this, even if we don't
- * need any decoding. To be fixed.
- */
-struct kvm_decode {
-	unsigned long rt;
-	bool sign_extend;
-};
-
-void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data);
-unsigned long kvm_mmio_read_buf(const void *buf, unsigned int len);
-
-int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run);
-int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run,
-		 phys_addr_t fault_ipa);
-
-#endif /* __ARM64_KVM_MMIO_H__ */
+1
arch/arm64/include/asm/ptrace.h
···
 #define PSR_AA32_I_BIT		0x00000080
 #define PSR_AA32_A_BIT		0x00000100
 #define PSR_AA32_E_BIT		0x00000200
+#define PSR_AA32_PAN_BIT	0x00400000
 #define PSR_AA32_SSBS_BIT	0x00800000
 #define PSR_AA32_DIT_BIT	0x01000000
 #define PSR_AA32_Q_BIT		0x08000000
+10 -2
arch/arm64/include/uapi/asm/kvm.h
···
 #define KVM_REG_ARM_PTIMER_CVAL	ARM64_SYS_REG(3, 3, 14, 2, 2)
 #define KVM_REG_ARM_PTIMER_CNT	ARM64_SYS_REG(3, 3, 14, 0, 1)
 
-/* EL0 Virtual Timer Registers */
+/*
+ * EL0 Virtual Timer Registers
+ *
+ * WARNING:
+ *      KVM_REG_ARM_TIMER_CVAL and KVM_REG_ARM_TIMER_CNT are not defined
+ *      with the appropriate register encodings. Their values have been
+ *      accidentally swapped. As this is set API, the definitions here
+ *      must be used, rather than ones derived from the encodings.
+ */
 #define KVM_REG_ARM_TIMER_CTL	ARM64_SYS_REG(3, 3, 14, 3, 1)
-#define KVM_REG_ARM_TIMER_CNT	ARM64_SYS_REG(3, 3, 14, 3, 2)
 #define KVM_REG_ARM_TIMER_CVAL	ARM64_SYS_REG(3, 3, 14, 0, 2)
+#define KVM_REG_ARM_TIMER_CNT	ARM64_SYS_REG(3, 3, 14, 3, 2)
 
 /* KVM-as-firmware specific pseudo-registers */
 #define KVM_REG_ARM_FW		(0x0014 << KVM_REG_ARM_COPROC_SHIFT)
+1
arch/arm64/include/uapi/asm/ptrace.h
···
 #define PSR_SSBS_BIT	0x00001000
 #define PSR_PAN_BIT	0x00400000
 #define PSR_UAO_BIT	0x00800000
+#define PSR_DIT_BIT	0x01000000
 #define PSR_V_BIT	0x10000000
 #define PSR_C_BIT	0x20000000
 #define PSR_Z_BIT	0x40000000
-5
arch/arm64/kvm/guest.c
···
 	{ NULL }
 };
 
-int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
-{
-	return 0;
-}
-
 static bool core_reg_offset_is_vreg(u64 off)
 {
 	return off >= KVM_REG_ARM_CORE_REG(fp_regs.vregs) &&
+3 -4
arch/arm64/kvm/hyp/entry.S
···
  * u64 __guest_enter(struct kvm_vcpu *vcpu,
  *		     struct kvm_cpu_context *host_ctxt);
  */
-ENTRY(__guest_enter)
+SYM_FUNC_START(__guest_enter)
 	// x0: vcpu
 	// x1: host context
 	// x2-x17: clobbered by macros
···
 	// Do not touch any register after this!
 	eret
 	sb
-ENDPROC(__guest_enter)
 
-ENTRY(__guest_exit)
+SYM_INNER_LABEL(__guest_exit, SYM_L_GLOBAL)
 	// x0: return code
 	// x1: vcpu
 	// x2-x29,lr: vcpu regs
···
 	msr	spsr_el2, x4
 	orr	x0, x0, x5
 1:	ret
-ENDPROC(__guest_exit)
+SYM_FUNC_END(__guest_enter)
+65 -5
arch/arm64/kvm/inject_fault.c
···
 #include <asm/kvm_emulate.h>
 #include <asm/esr.h>
 
-#define PSTATE_FAULT_BITS_64	(PSR_MODE_EL1h | PSR_A_BIT | PSR_F_BIT | \
-				 PSR_I_BIT | PSR_D_BIT)
-
 #define CURRENT_EL_SP_EL0_VECTOR	0x0
 #define CURRENT_EL_SP_ELx_VECTOR	0x200
 #define LOWER_EL_AArch64_VECTOR		0x400
···
 	return vcpu_read_sys_reg(vcpu, VBAR_EL1) + exc_offset + type;
 }
 
+/*
+ * When an exception is taken, most PSTATE fields are left unchanged in the
+ * handler. However, some are explicitly overridden (e.g. M[4:0]). Luckily all
+ * of the inherited bits have the same position in the AArch64/AArch32 SPSR_ELx
+ * layouts, so we don't need to shuffle these for exceptions from AArch32 EL0.
+ *
+ * For the SPSR_ELx layout for AArch64, see ARM DDI 0487E.a page C5-429.
+ * For the SPSR_ELx layout for AArch32, see ARM DDI 0487E.a page C5-426.
+ *
+ * Here we manipulate the fields in order of the AArch64 SPSR_ELx layout, from
+ * MSB to LSB.
+ */
+static unsigned long get_except64_pstate(struct kvm_vcpu *vcpu)
+{
+	unsigned long sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1);
+	unsigned long old, new;
+
+	old = *vcpu_cpsr(vcpu);
+	new = 0;
+
+	new |= (old & PSR_N_BIT);
+	new |= (old & PSR_Z_BIT);
+	new |= (old & PSR_C_BIT);
+	new |= (old & PSR_V_BIT);
+
+	// TODO: TCO (if/when ARMv8.5-MemTag is exposed to guests)
+
+	new |= (old & PSR_DIT_BIT);
+
+	// PSTATE.UAO is set to zero upon any exception to AArch64
+	// See ARM DDI 0487E.a, page D5-2579.
+
+	// PSTATE.PAN is unchanged unless SCTLR_ELx.SPAN == 0b0
+	// SCTLR_ELx.SPAN is RES1 when ARMv8.1-PAN is not implemented
+	// See ARM DDI 0487E.a, page D5-2578.
+	new |= (old & PSR_PAN_BIT);
+	if (!(sctlr & SCTLR_EL1_SPAN))
+		new |= PSR_PAN_BIT;
+
+	// PSTATE.SS is set to zero upon any exception to AArch64
+	// See ARM DDI 0487E.a, page D2-2452.
+
+	// PSTATE.IL is set to zero upon any exception to AArch64
+	// See ARM DDI 0487E.a, page D1-2306.
+
+	// PSTATE.SSBS is set to SCTLR_ELx.DSSBS upon any exception to AArch64
+	// See ARM DDI 0487E.a, page D13-3258
+	if (sctlr & SCTLR_ELx_DSSBS)
+		new |= PSR_SSBS_BIT;
+
+	// PSTATE.BTYPE is set to zero upon any exception to AArch64
+	// See ARM DDI 0487E.a, pages D1-2293 to D1-2294.
+
+	new |= PSR_D_BIT;
+	new |= PSR_A_BIT;
+	new |= PSR_I_BIT;
+	new |= PSR_F_BIT;
+
+	new |= PSR_MODE_EL1h;
+
+	return new;
+}
+
 static void inject_abt64(struct kvm_vcpu *vcpu, bool is_iabt, unsigned long addr)
 {
 	unsigned long cpsr = *vcpu_cpsr(vcpu);
···
 	vcpu_write_elr_el1(vcpu, *vcpu_pc(vcpu));
 	*vcpu_pc(vcpu) = get_except_vector(vcpu, except_type_sync);
 
-	*vcpu_cpsr(vcpu) = PSTATE_FAULT_BITS_64;
+	*vcpu_cpsr(vcpu) = get_except64_pstate(vcpu);
 	vcpu_write_spsr(vcpu, cpsr);
 
 	vcpu_write_sys_reg(vcpu, addr, FAR_EL1);
···
 	vcpu_write_elr_el1(vcpu, *vcpu_pc(vcpu));
 	*vcpu_pc(vcpu) = get_except_vector(vcpu, except_type_sync);
 
-	*vcpu_cpsr(vcpu) = PSTATE_FAULT_BITS_64;
+	*vcpu_cpsr(vcpu) = get_except64_pstate(vcpu);
 	vcpu_write_spsr(vcpu, cpsr);
 
 	/*
+1 -1
arch/arm64/kvm/reset.c
···
 	return true;
 }
 
-void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
+void kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu)
 {
 	kfree(vcpu->arch.sve_state);
 }
+25 -31
arch/arm64/kvm/va_layout.c
···
 #include <asm/kvm_mmu.h>
 
 /*
- * The LSB of the random hyp VA tag or 0 if no randomization is used.
+ * The LSB of the HYP VA tag
  */
 static u8 tag_lsb;
 /*
- * The random hyp VA tag value with the region bit if hyp randomization is used
+ * The HYP VA tag value with the region bit
  */
 static u64 tag_val;
 static u64 va_mask;
 
+/*
+ * We want to generate a hyp VA with the following format (with V ==
+ * vabits_actual):
+ *
+ *  63 ... V |     V-1    | V-2 .. tag_lsb | tag_lsb - 1 .. 0
+ *  ---------------------------------------------------------
+ * | 0000000 | hyp_va_msb |   random tag   | kern linear VA |
+ * |--------- tag_val -----------|----- va_mask ---|
+ *
+ * which does not conflict with the idmap regions.
+ */
 __init void kvm_compute_layout(void)
 {
 	phys_addr_t idmap_addr = __pa_symbol(__hyp_idmap_text_start);
 	u64 hyp_va_msb;
-	int kva_msb;
 
 	/* Where is my RAM region? */
 	hyp_va_msb  = idmap_addr & BIT(vabits_actual - 1);
 	hyp_va_msb ^= BIT(vabits_actual - 1);
 
-	kva_msb = fls64((u64)phys_to_virt(memblock_start_of_DRAM()) ^
+	tag_lsb = fls64((u64)phys_to_virt(memblock_start_of_DRAM()) ^
 			(u64)(high_memory - 1));
 
-	if (kva_msb == (vabits_actual - 1)) {
-		/*
-		 * No space in the address, let's compute the mask so
-		 * that it covers (vabits_actual - 1) bits, and the region
-		 * bit. The tag stays set to zero.
-		 */
-		va_mask  = BIT(vabits_actual - 1) - 1;
-		va_mask |= hyp_va_msb;
-	} else {
-		/*
-		 * We do have some free bits to insert a random tag.
-		 * Hyp VAs are now created from kernel linear map VAs
-		 * using the following formula (with V == vabits_actual):
-		 *
-		 *  63 ... V |     V-1    | V-2 .. tag_lsb | tag_lsb - 1 .. 0
-		 *  ---------------------------------------------------------
-		 * | 0000000 | hyp_va_msb |   random tag   | kern linear VA |
-		 */
-		tag_lsb = kva_msb;
-		va_mask = GENMASK_ULL(tag_lsb - 1, 0);
-		tag_val = get_random_long() & GENMASK_ULL(vabits_actual - 2, tag_lsb);
-		tag_val |= hyp_va_msb;
-		tag_val >>= tag_lsb;
+	va_mask = GENMASK_ULL(tag_lsb - 1, 0);
+	tag_val = hyp_va_msb;
+
+	if (tag_lsb != (vabits_actual - 1)) {
+		/* We have some free bits to insert a random tag. */
+		tag_val |= get_random_long() & GENMASK_ULL(vabits_actual - 2, tag_lsb);
 	}
+	tag_val >>= tag_lsb;
 }
 
 static u32 compute_instruction(int n, u32 rd, u32 rn)
···
 		 * VHE doesn't need any address translation, let's NOP
 		 * everything.
 		 *
-		 * Alternatively, if we don't have any spare bits in
-		 * the address, NOP everything after masking that
-		 * kernel VA.
+		 * Alternatively, if the tag is zero (because the layout
+		 * dictates it and we don't have any spare bits in the
+		 * address), NOP everything after masking the kernel VA.
 		 */
-		if (has_vhe() || (!tag_lsb && i > 0)) {
+		if (has_vhe() || (!tag_val && i > 0)) {
 			updptr[i] = cpu_to_le32(aarch64_insn_gen_nop());
 			continue;
 		}
+28 -56
arch/mips/kvm/mips.c
···
 	struct kvm_vcpu *vcpu;
 
 	kvm_for_each_vcpu(i, vcpu, kvm) {
-		kvm_arch_vcpu_free(vcpu);
+		kvm_vcpu_destroy(vcpu);
 	}
 
 	mutex_lock(&kvm->lock);
···
 	pr_debug("\tEND(%s)\n", symbol);
 }
 
-struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
+int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
+{
+	return 0;
+}
+
+int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
 {
 	int err, size;
 	void *gebase, *p, *handler, *refill_start, *refill_end;
 	int i;
 
-	struct kvm_vcpu *vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL);
+	kvm_debug("kvm @ %p: create cpu %d at %p\n",
+		  vcpu->kvm, vcpu->vcpu_id, vcpu);
 
-	if (!vcpu) {
-		err = -ENOMEM;
-		goto out;
-	}
-
-	err = kvm_vcpu_init(vcpu, kvm, id);
-
+	err = kvm_mips_callbacks->vcpu_init(vcpu);
 	if (err)
-		goto out_free_cpu;
+		return err;
 
-	kvm_debug("kvm @ %p: create cpu %d at %p\n", kvm, id, vcpu);
+	hrtimer_init(&vcpu->arch.comparecount_timer, CLOCK_MONOTONIC,
+		     HRTIMER_MODE_REL);
+	vcpu->arch.comparecount_timer.function = kvm_mips_comparecount_wakeup;
 
 	/*
 	 * Allocate space for host mode exception handlers that handle
···
 
 	if (!gebase) {
 		err = -ENOMEM;
-		goto out_uninit_cpu;
+		goto out_uninit_vcpu;
 	}
 	kvm_debug("Allocated %d bytes for KVM Exception Handlers @ %p\n",
 		  ALIGN(size, PAGE_SIZE), gebase);
···
 	vcpu->arch.last_sched_cpu = -1;
 	vcpu->arch.last_exec_cpu = -1;
 
-	return vcpu;
+	/* Initial guest state */
+	err = kvm_mips_callbacks->vcpu_setup(vcpu);
+	if (err)
+		goto out_free_commpage;
 
+	return 0;
+
+out_free_commpage:
+	kfree(vcpu->arch.kseg0_commpage);
 out_free_gebase:
 	kfree(gebase);
-
-out_uninit_cpu:
-	kvm_vcpu_uninit(vcpu);
-
-out_free_cpu:
-	kfree(vcpu);
-
-out:
-	return ERR_PTR(err);
+out_uninit_vcpu:
+	kvm_mips_callbacks->vcpu_uninit(vcpu);
+	return err;
 }
 
-void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
+void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 {
 	hrtimer_cancel(&vcpu->arch.comparecount_timer);
-
-	kvm_vcpu_uninit(vcpu);
 
 	kvm_mips_dump_stats(vcpu);
 
 	kvm_mmu_free_memory_caches(vcpu);
 	kfree(vcpu->arch.guest_ebase);
 	kfree(vcpu->arch.kseg0_commpage);
 
-	kfree(vcpu);
-}
-
-void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
-{
-	kvm_arch_vcpu_free(vcpu);
+	kvm_mips_callbacks->vcpu_uninit(vcpu);
 }
 
 int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
···
 	return kvm_mips_count_timeout(vcpu);
 }
 
-int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
-{
-	int err;
-
-	err = kvm_mips_callbacks->vcpu_init(vcpu);
-	if (err)
-		return err;
-
-	hrtimer_init(&vcpu->arch.comparecount_timer, CLOCK_MONOTONIC,
-		     HRTIMER_MODE_REL);
-	vcpu->arch.comparecount_timer.function = kvm_mips_comparecount_wakeup;
-	return 0;
-}
-
-void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
-{
-	kvm_mips_callbacks->vcpu_uninit(vcpu);
-}
-
 int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
 				  struct kvm_translation *tr)
 {
 	return 0;
-}
-
-/* Initial guest state */
-int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
-{
-	return kvm_mips_callbacks->vcpu_setup(vcpu);
 }
 
 static void kvm_mips_set_c0_status(void)
+1
arch/powerpc/include/asm/hvcall.h
···
 #define H_SVM_PAGE_OUT		0xEF04
 #define H_SVM_INIT_START	0xEF08
 #define H_SVM_INIT_DONE		0xEF0C
+#define H_SVM_INIT_ABORT	0xEF14
 
 /* Values for 2nd argument to H_SET_MODE */
 #define H_SET_MODE_RESOURCE_SET_CIABR	1
+8 -2
arch/powerpc/include/asm/kvm_book3s_uvmem.h
···
 unsigned long kvmppc_h_svm_init_start(struct kvm *kvm);
 unsigned long kvmppc_h_svm_init_done(struct kvm *kvm);
 int kvmppc_send_page_to_uv(struct kvm *kvm, unsigned long gfn);
+unsigned long kvmppc_h_svm_init_abort(struct kvm *kvm);
 void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *free,
-			     struct kvm *kvm);
+			     struct kvm *kvm, bool skip_page_out);
 #else
 static inline int kvmppc_uvmem_init(void)
 {
···
 	return H_UNSUPPORTED;
 }
 
+static inline unsigned long kvmppc_h_svm_init_abort(struct kvm *kvm)
+{
+	return H_UNSUPPORTED;
+}
+
 static inline int kvmppc_send_page_to_uv(struct kvm *kvm, unsigned long gfn)
 {
 	return -EFAULT;
···
 
 static inline void
 kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *free,
-			struct kvm *kvm) { }
+			struct kvm *kvm, bool skip_page_out) { }
 #endif /* CONFIG_PPC_UV */
 #endif /* __ASM_KVM_BOOK3S_UVMEM_H__ */
+1
arch/powerpc/include/asm/kvm_host.h
···
 /* Flag values for kvm_arch.secure_guest */
 #define KVMPPC_SECURE_INIT_START 0x1 /* H_SVM_INIT_START has been called */
 #define KVMPPC_SECURE_INIT_DONE  0x2 /* H_SVM_INIT_DONE completed */
+#define KVMPPC_SECURE_INIT_ABORT 0x4 /* H_SVM_INIT_ABORT issued */
 
 struct kvm_arch {
 	unsigned int lpid;
+2 -3
arch/powerpc/include/asm/kvm_ppc.h
···
 			enum xlate_instdata xlid, enum xlate_readwrite xlrw,
 			struct kvmppc_pte *pte);
 
-extern struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm,
-						unsigned int id);
+extern int kvmppc_core_vcpu_create(struct kvm_vcpu *vcpu);
 extern void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu);
 extern int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu);
 extern int kvmppc_core_check_processor_compat(void);
···
 	void (*inject_interrupt)(struct kvm_vcpu *vcpu, int vec, u64 srr1_flags);
 	void (*set_msr)(struct kvm_vcpu *vcpu, u64 msr);
 	int (*vcpu_run)(struct kvm_run *run, struct kvm_vcpu *vcpu);
-	struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned int id);
+	int (*vcpu_create)(struct kvm_vcpu *vcpu);
 	void (*vcpu_free)(struct kvm_vcpu *vcpu);
 	int (*check_requests)(struct kvm_vcpu *vcpu);
 	int (*get_dirty_log)(struct kvm *kvm, struct kvm_dirty_log *log);
+2 -7
arch/powerpc/kvm/book3s.c
··· 471 471 } 472 472 EXPORT_SYMBOL_GPL(kvmppc_load_last_inst); 473 473 474 - int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) 475 - { 476 - return 0; 477 - } 478 - 479 474 int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu) 480 475 { 481 476 return 0; ··· 784 789 kvm_vcpu_kick(vcpu); 785 790 } 786 791 787 - struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) 792 + int kvmppc_core_vcpu_create(struct kvm_vcpu *vcpu) 788 793 { 789 - return kvm->arch.kvm_ops->vcpu_create(kvm, id); 794 + return vcpu->kvm->arch.kvm_ops->vcpu_create(vcpu); 790 795 } 791 796 792 797 void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
+2 -2
arch/powerpc/kvm/book3s_64_mmu_hv.c
··· 284 284 /* Protect linux PTE lookup from page table destruction */ 285 285 rcu_read_lock_sched(); /* this disables preemption too */ 286 286 ret = kvmppc_do_h_enter(kvm, flags, pte_index, pteh, ptel, 287 - current->mm->pgd, false, pte_idx_ret); 287 + kvm->mm->pgd, false, pte_idx_ret); 288 288 rcu_read_unlock_sched(); 289 289 if (ret == H_TOO_HARD) { 290 290 /* this can't happen */ ··· 573 573 is_ci = false; 574 574 pfn = 0; 575 575 page = NULL; 576 - mm = current->mm; 576 + mm = kvm->mm; 577 577 pte_size = PAGE_SIZE; 578 578 writing = (dsisr & DSISR_ISSTORE) != 0; 579 579 /* If writing != 0, then the HPTE must allow writing, if we get here */
+1 -1
arch/powerpc/kvm/book3s_64_mmu_radix.c
··· 1102 1102 unsigned int shift; 1103 1103 1104 1104 if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_START) 1105 - kvmppc_uvmem_drop_pages(memslot, kvm); 1105 + kvmppc_uvmem_drop_pages(memslot, kvm, true); 1106 1106 1107 1107 if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE) 1108 1108 return;
+6 -4
arch/powerpc/kvm/book3s_64_vio.c
··· 253 253 } 254 254 } 255 255 256 + account_locked_vm(kvm->mm, 257 + kvmppc_stt_pages(kvmppc_tce_pages(stt->size)), false); 258 + 256 259 kvm_put_kvm(stt->kvm); 257 260 258 - account_locked_vm(current->mm, 259 - kvmppc_stt_pages(kvmppc_tce_pages(stt->size)), false); 260 261 call_rcu(&stt->rcu, release_spapr_tce_table); 261 262 262 263 return 0; ··· 273 272 { 274 273 struct kvmppc_spapr_tce_table *stt = NULL; 275 274 struct kvmppc_spapr_tce_table *siter; 275 + struct mm_struct *mm = kvm->mm; 276 276 unsigned long npages, size = args->size; 277 277 int ret = -ENOMEM; 278 278 ··· 282 280 return -EINVAL; 283 281 284 282 npages = kvmppc_tce_pages(size); 285 - ret = account_locked_vm(current->mm, kvmppc_stt_pages(npages), true); 283 + ret = account_locked_vm(mm, kvmppc_stt_pages(npages), true); 286 284 if (ret) 287 285 return ret; 288 286 ··· 328 326 329 327 kfree(stt); 330 328 fail_acct: 331 - account_locked_vm(current->mm, kvmppc_stt_pages(npages), false); 329 + account_locked_vm(mm, kvmppc_stt_pages(npages), false); 332 330 return ret; 333 331 } 334 332
+16 -26
arch/powerpc/kvm/book3s_hv.c
··· 1091 1091 case H_SVM_INIT_DONE: 1092 1092 ret = kvmppc_h_svm_init_done(vcpu->kvm); 1093 1093 break; 1094 + case H_SVM_INIT_ABORT: 1095 + ret = kvmppc_h_svm_init_abort(vcpu->kvm); 1096 + break; 1094 1097 1095 1098 default: 1096 1099 return RESUME_HOST; ··· 2274 2271 } 2275 2272 #endif /* CONFIG_KVM_BOOK3S_HV_EXIT_TIMING */ 2276 2273 2277 - static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm, 2278 - unsigned int id) 2274 + static int kvmppc_core_vcpu_create_hv(struct kvm_vcpu *vcpu) 2279 2275 { 2280 - struct kvm_vcpu *vcpu; 2281 2276 int err; 2282 2277 int core; 2283 2278 struct kvmppc_vcore *vcore; 2279 + struct kvm *kvm; 2280 + unsigned int id; 2284 2281 2285 - err = -ENOMEM; 2286 - vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); 2287 - if (!vcpu) 2288 - goto out; 2289 - 2290 - err = kvm_vcpu_init(vcpu, kvm, id); 2291 - if (err) 2292 - goto free_vcpu; 2282 + kvm = vcpu->kvm; 2283 + id = vcpu->vcpu_id; 2293 2284 2294 2285 vcpu->arch.shared = &vcpu->arch.shregs; 2295 2286 #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE ··· 2365 2368 mutex_unlock(&kvm->lock); 2366 2369 2367 2370 if (!vcore) 2368 - goto free_vcpu; 2371 + return err; 2369 2372 2370 2373 spin_lock(&vcore->lock); 2371 2374 ++vcore->num_threads; ··· 2380 2383 2381 2384 debugfs_vcpu_init(vcpu, id); 2382 2385 2383 - return vcpu; 2384 - 2385 - free_vcpu: 2386 - kmem_cache_free(kvm_vcpu_cache, vcpu); 2387 - out: 2388 - return ERR_PTR(err); 2386 + return 0; 2389 2387 } 2390 2388 2391 2389 static int kvmhv_set_smt_mode(struct kvm *kvm, unsigned long smt_mode, ··· 2434 2442 unpin_vpa(vcpu->kvm, &vcpu->arch.slb_shadow); 2435 2443 unpin_vpa(vcpu->kvm, &vcpu->arch.vpa); 2436 2444 spin_unlock(&vcpu->arch.vpa_update_lock); 2437 - kvm_vcpu_uninit(vcpu); 2438 - kmem_cache_free(kvm_vcpu_cache, vcpu); 2439 2445 } 2440 2446 2441 2447 static int kvmppc_core_check_requests_hv(struct kvm_vcpu *vcpu) ··· 4275 4285 user_vrsave = mfspr(SPRN_VRSAVE); 4276 4286 4277 4287 vcpu->arch.wqp = &vcpu->arch.vcore->wq; 4278 - vcpu->arch.pgdir = current->mm->pgd; 4288 + vcpu->arch.pgdir = kvm->mm->pgd; 4279 4289 vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST; 4280 4290 4281 4291 do { ··· 4630 4640 4631 4641 /* Look up the VMA for the start of this memory slot */ 4632 4642 hva = memslot->userspace_addr; 4633 - down_read(&current->mm->mmap_sem); 4634 - vma = find_vma(current->mm, hva); 4643 + down_read(&kvm->mm->mmap_sem); 4644 + vma = find_vma(kvm->mm, hva); 4635 4645 if (!vma || vma->vm_start > hva || (vma->vm_flags & VM_IO)) 4636 4646 goto up_out; 4637 4647 4638 4648 psize = vma_kernel_pagesize(vma); 4639 4649 4640 4650 up_read(&kvm->mm->mmap_sem); 4641 4651 4642 4652 /* We can handle 4k, 64k or 16M pages in the VRMA */ 4643 4653 if (psize >= 0x1000000) ··· 4670 4680 return err; 4671 4681 4672 4682 up_out: 4673 - up_read(&current->mm->mmap_sem); 4683 + up_read(&kvm->mm->mmap_sem); 4674 4684 goto out_srcu; 4675 4685 } 4676 4686 ··· 5467 5477 continue; 5468 5478 5469 5479 kvm_for_each_memslot(memslot, slots) { 5470 - kvmppc_uvmem_drop_pages(memslot, kvm); 5480 + kvmppc_uvmem_drop_pages(memslot, kvm, true); 5471 5481 uv_unregister_mem_slot(kvm->arch.lpid, memslot->id); 5472 5482 } 5473 5483 }
+31 -3
arch/powerpc/kvm/book3s_hv_uvmem.c
··· 258 258 * QEMU page table with normal PTEs from newly allocated pages. 259 259 */ 260 260 void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *free, 261 - struct kvm *kvm) 261 + struct kvm *kvm, bool skip_page_out) 262 262 { 263 263 int i; 264 264 struct kvmppc_uvmem_page_pvt *pvt; ··· 276 276 277 277 uvmem_page = pfn_to_page(uvmem_pfn); 278 278 pvt = uvmem_page->zone_device_data; 279 - pvt->skip_page_out = true; 279 + pvt->skip_page_out = skip_page_out; 280 280 mutex_unlock(&kvm->arch.uvmem_lock); 281 281 282 282 pfn = gfn_to_pfn(kvm, gfn); ··· 284 284 continue; 285 285 kvm_release_pfn_clean(pfn); 286 286 } 287 + } 288 + 289 + unsigned long kvmppc_h_svm_init_abort(struct kvm *kvm) 290 + { 291 + int srcu_idx; 292 + struct kvm_memory_slot *memslot; 293 + 294 + /* 295 + * Expect to be called only after INIT_START and before INIT_DONE. 296 + * If INIT_DONE was completed, use normal VM termination sequence. 297 + */ 298 + if (!(kvm->arch.secure_guest & KVMPPC_SECURE_INIT_START)) 299 + return H_UNSUPPORTED; 300 + 301 + if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE) 302 + return H_STATE; 303 + 304 + srcu_idx = srcu_read_lock(&kvm->srcu); 305 + 306 + kvm_for_each_memslot(memslot, kvm_memslots(kvm)) 307 + kvmppc_uvmem_drop_pages(memslot, kvm, false); 308 + 309 + srcu_read_unlock(&kvm->srcu, srcu_idx); 310 + 311 + kvm->arch.secure_guest = 0; 312 + uv_svm_terminate(kvm->arch.lpid); 313 + 314 + return H_PARAMETER; 287 315 } 288 316 289 317 /* ··· 571 543 572 544 ret = migrate_vma_setup(&mig); 573 545 if (ret) 574 - return ret; 546 + goto out; 575 547 576 548 spage = migrate_pfn_to_page(*mig.src); 577 549 if (!spage || !(*mig.src & MIGRATE_PFN_MIGRATE))
+11 -23
arch/powerpc/kvm/book3s_pr.c
··· 1744 1744 return r; 1745 1745 } 1746 1746 1747 - static struct kvm_vcpu *kvmppc_core_vcpu_create_pr(struct kvm *kvm, 1748 - unsigned int id) 1747 + static int kvmppc_core_vcpu_create_pr(struct kvm_vcpu *vcpu) 1749 1748 { 1750 1749 struct kvmppc_vcpu_book3s *vcpu_book3s; 1751 - struct kvm_vcpu *vcpu; 1752 - int err = -ENOMEM; 1753 1750 unsigned long p; 1751 + int err; 1754 1752 1755 - vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); 1756 - if (!vcpu) 1757 - goto out; 1753 + err = -ENOMEM; 1758 1754 1759 1755 vcpu_book3s = vzalloc(sizeof(struct kvmppc_vcpu_book3s)); 1760 1756 if (!vcpu_book3s) 1761 - goto free_vcpu; 1757 + goto out; 1762 1758 vcpu->arch.book3s = vcpu_book3s; 1763 1759 1764 1760 #ifdef CONFIG_KVM_BOOK3S_32_HANDLER ··· 1764 1768 goto free_vcpu3s; 1765 1769 #endif 1766 1770 1767 - err = kvm_vcpu_init(vcpu, kvm, id); 1768 - if (err) 1769 - goto free_shadow_vcpu; 1770 - 1771 - err = -ENOMEM; 1772 1771 p = __get_free_page(GFP_KERNEL|__GFP_ZERO); 1773 1772 if (!p) 1774 - goto uninit_vcpu; 1773 + goto free_shadow_vcpu; 1775 1774 vcpu->arch.shared = (void *)p; 1776 1775 #ifdef CONFIG_PPC_BOOK3S_64 1777 1776 /* Always start the shared struct in native endian mode */ ··· 1797 1806 1798 1807 err = kvmppc_mmu_init(vcpu); 1799 1808 if (err < 0) 1800 - goto uninit_vcpu; 1809 + goto free_shared_page; 1801 1810 1802 - return vcpu; 1811 + return 0; 1803 1812 1804 - uninit_vcpu: 1805 - kvm_vcpu_uninit(vcpu); 1813 + free_shared_page: 1814 + free_page((unsigned long)vcpu->arch.shared); 1806 1815 free_shadow_vcpu: 1807 1816 #ifdef CONFIG_KVM_BOOK3S_32_HANDLER 1808 1817 kfree(vcpu->arch.shadow_vcpu); 1809 1818 free_vcpu3s: 1810 1819 #endif 1811 1820 vfree(vcpu_book3s); 1812 - free_vcpu: 1813 - kmem_cache_free(kvm_vcpu_cache, vcpu); 1814 1821 out: 1815 - return ERR_PTR(err); 1822 + return err; 1816 1823 } 1817 1824 1818 1825 static void kvmppc_core_vcpu_free_pr(struct kvm_vcpu *vcpu) ··· 1818 1829 struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); 1819 1830 
1820 1831 free_page((unsigned long)vcpu->arch.shared & PAGE_MASK); 1821 - kvm_vcpu_uninit(vcpu); 1822 1832 #ifdef CONFIG_KVM_BOOK3S_32_HANDLER 1823 1833 kfree(vcpu->arch.shadow_vcpu); 1824 1834 #endif 1825 1835 vfree(vcpu_book3s); 1826 - kmem_cache_free(kvm_vcpu_cache, vcpu); 1827 1836 } 1828 1837 1829 1838 static int kvmppc_vcpu_run_pr(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) ··· 2017 2030 { 2018 2031 /* We should not get called */ 2019 2032 BUG(); 2033 + return 0; 2020 2034 } 2021 2035 #endif /* CONFIG_PPC64 */ 2022 2036
+1 -1
arch/powerpc/kvm/book3s_xive_native.c
··· 631 631 srcu_idx = srcu_read_lock(&kvm->srcu); 632 632 gfn = gpa_to_gfn(kvm_eq.qaddr); 633 633 634 - page_size = kvm_host_page_size(kvm, gfn); 634 + page_size = kvm_host_page_size(vcpu, gfn); 635 635 if (1ull << kvm_eq.qshift > page_size) { 636 636 srcu_read_unlock(&kvm->srcu, srcu_idx); 637 637 pr_warn("Incompatible host page size %lx!\n", page_size);
+34 -33
arch/powerpc/kvm/booke.c
··· 775 775 debug = current->thread.debug; 776 776 current->thread.debug = vcpu->arch.dbg_reg; 777 777 778 - vcpu->arch.pgdir = current->mm->pgd; 778 + vcpu->arch.pgdir = vcpu->kvm->mm->pgd; 779 779 kvmppc_fix_ee_before_entry(); 780 780 781 781 ret = __kvmppc_vcpu_run(kvm_run, vcpu); ··· 1375 1375 arm_next_watchdog(vcpu); 1376 1376 1377 1377 update_timer_ints(vcpu); 1378 - } 1379 - 1380 - /* Initial guest state: 16MB mapping 0 -> 0, PC = 0, MSR = 0, R1 = 16MB */ 1381 - int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) 1382 - { 1383 - int i; 1384 - int r; 1385 - 1386 - vcpu->arch.regs.nip = 0; 1387 - vcpu->arch.shared->pir = vcpu->vcpu_id; 1388 - kvmppc_set_gpr(vcpu, 1, (16<<20) - 8); /* -8 for the callee-save LR slot */ 1389 - kvmppc_set_msr(vcpu, 0); 1390 - 1391 - #ifndef CONFIG_KVM_BOOKE_HV 1392 - vcpu->arch.shadow_msr = MSR_USER | MSR_IS | MSR_DS; 1393 - vcpu->arch.shadow_pid = 1; 1394 - vcpu->arch.shared->msr = 0; 1395 - #endif 1396 - 1397 - /* Eye-catching numbers so we know if the guest takes an interrupt 1398 - * before it's programmed its own IVPR/IVORs. */ 1399 - vcpu->arch.ivpr = 0x55550000; 1400 - for (i = 0; i < BOOKE_IRQPRIO_MAX; i++) 1401 - vcpu->arch.ivor[i] = 0x7700 | i * 4; 1402 - 1403 - kvmppc_init_timing_stats(vcpu); 1404 - 1405 - r = kvmppc_core_vcpu_setup(vcpu); 1406 - kvmppc_sanity_check(vcpu); 1407 - return r; 1408 1378 } 1409 1379 1410 1380 int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu) ··· 2084 2114 return kvm->arch.kvm_ops->init_vm(kvm); 2085 2115 } 2086 2116 2087 - struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) 2117 + int kvmppc_core_vcpu_create(struct kvm_vcpu *vcpu) 2088 2118 { 2089 - return kvm->arch.kvm_ops->vcpu_create(kvm, id); 2119 + int i; 2120 + int r; 2121 + 2122 + r = vcpu->kvm->arch.kvm_ops->vcpu_create(vcpu); 2123 + if (r) 2124 + return r; 2125 + 2126 + /* Initial guest state: 16MB mapping 0 -> 0, PC = 0, MSR = 0, R1 = 16MB */ 2127 + vcpu->arch.regs.nip = 0; 2128 + vcpu->arch.shared->pir = vcpu->vcpu_id; 2129 + kvmppc_set_gpr(vcpu, 1, (16<<20) - 8); /* -8 for the callee-save LR slot */ 2130 + kvmppc_set_msr(vcpu, 0); 2131 + 2132 + #ifndef CONFIG_KVM_BOOKE_HV 2133 + vcpu->arch.shadow_msr = MSR_USER | MSR_IS | MSR_DS; 2134 + vcpu->arch.shadow_pid = 1; 2135 + vcpu->arch.shared->msr = 0; 2136 + #endif 2137 + 2138 + /* Eye-catching numbers so we know if the guest takes an interrupt 2139 + * before it's programmed its own IVPR/IVORs. */ 2140 + vcpu->arch.ivpr = 0x55550000; 2141 + for (i = 0; i < BOOKE_IRQPRIO_MAX; i++) 2142 + vcpu->arch.ivor[i] = 0x7700 | i * 4; 2143 + 2144 + kvmppc_init_timing_stats(vcpu); 2145 + 2146 + r = kvmppc_core_vcpu_setup(vcpu); 2147 + if (r) 2148 + vcpu->kvm->arch.kvm_ops->vcpu_free(vcpu); 2149 + kvmppc_sanity_check(vcpu); 2150 + return r; 2090 2151 } 2091 2152 2092 2153 void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
+7 -29
arch/powerpc/kvm/e500.c
··· 433 433 return r; 434 434 } 435 435 436 - static struct kvm_vcpu *kvmppc_core_vcpu_create_e500(struct kvm *kvm, 437 - unsigned int id) 436 + static int kvmppc_core_vcpu_create_e500(struct kvm_vcpu *vcpu) 438 437 { 439 438 struct kvmppc_vcpu_e500 *vcpu_e500; 440 - struct kvm_vcpu *vcpu; 441 439 int err; 442 440 443 - BUILD_BUG_ON_MSG(offsetof(struct kvmppc_vcpu_e500, vcpu) != 0, 444 - "struct kvm_vcpu must be at offset 0 for arch usercopy region"); 441 + BUILD_BUG_ON(offsetof(struct kvmppc_vcpu_e500, vcpu) != 0); 442 + vcpu_e500 = to_e500(vcpu); 445 443 446 - vcpu_e500 = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); 447 - if (!vcpu_e500) { 448 - err = -ENOMEM; 449 - goto out; 450 - } 451 - 452 - vcpu = &vcpu_e500->vcpu; 453 - err = kvm_vcpu_init(vcpu, kvm, id); 454 - if (err) 455 - goto free_vcpu; 456 - 457 - if (kvmppc_e500_id_table_alloc(vcpu_e500) == NULL) { 458 - err = -ENOMEM; 459 - goto uninit_vcpu; 460 - } 444 + if (kvmppc_e500_id_table_alloc(vcpu_e500) == NULL) 445 + return -ENOMEM; 461 446 462 447 err = kvmppc_e500_tlb_init(vcpu_e500); 463 448 if (err) ··· 454 469 goto uninit_tlb; 455 470 } 456 471 457 - return vcpu; 472 + return 0; 458 473 459 474 uninit_tlb: 460 475 kvmppc_e500_tlb_uninit(vcpu_e500); 461 476 uninit_id: 462 477 kvmppc_e500_id_table_free(vcpu_e500); 463 - uninit_vcpu: 464 - kvm_vcpu_uninit(vcpu); 465 - free_vcpu: 466 - kmem_cache_free(kvm_vcpu_cache, vcpu_e500); 467 - out: 468 - return ERR_PTR(err); 478 + return err; 469 479 } 470 480 471 481 static void kvmppc_core_vcpu_free_e500(struct kvm_vcpu *vcpu) ··· 470 490 free_page((unsigned long)vcpu->arch.shared); 471 491 kvmppc_e500_tlb_uninit(vcpu_e500); 472 492 kvmppc_e500_id_table_free(vcpu_e500); 473 - kvm_vcpu_uninit(vcpu); 474 - kmem_cache_free(kvm_vcpu_cache, vcpu_e500); 475 493 } 476 494 477 495 static int kvmppc_core_init_vm_e500(struct kvm *kvm)
+6 -24
arch/powerpc/kvm/e500mc.c
··· 301 301 return r; 302 302 } 303 303 304 - static struct kvm_vcpu *kvmppc_core_vcpu_create_e500mc(struct kvm *kvm, 305 - unsigned int id) 304 + static int kvmppc_core_vcpu_create_e500mc(struct kvm_vcpu *vcpu) 306 305 { 307 306 struct kvmppc_vcpu_e500 *vcpu_e500; 308 - struct kvm_vcpu *vcpu; 309 307 int err; 310 308 311 - vcpu_e500 = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); 312 - if (!vcpu_e500) { 313 - err = -ENOMEM; 314 - goto out; 315 - } 316 - vcpu = &vcpu_e500->vcpu; 309 + BUILD_BUG_ON(offsetof(struct kvmppc_vcpu_e500, vcpu) != 0); 310 + vcpu_e500 = to_e500(vcpu); 317 311 318 312 /* Invalid PIR value -- this LPID dosn't have valid state on any cpu */ 319 313 vcpu->arch.oldpir = 0xffffffff; 320 314 321 - err = kvm_vcpu_init(vcpu, kvm, id); 322 - if (err) 323 - goto free_vcpu; 324 - 325 315 err = kvmppc_e500_tlb_init(vcpu_e500); 326 316 if (err) 327 - goto uninit_vcpu; 317 + return err; 328 318 329 319 vcpu->arch.shared = (void *)__get_free_page(GFP_KERNEL | __GFP_ZERO); 330 320 if (!vcpu->arch.shared) { ··· 322 332 goto uninit_tlb; 323 333 } 324 334 325 - return vcpu; 335 + return 0; 326 336 327 337 uninit_tlb: 328 338 kvmppc_e500_tlb_uninit(vcpu_e500); 329 - uninit_vcpu: 330 - kvm_vcpu_uninit(vcpu); 331 - 332 - free_vcpu: 333 - kmem_cache_free(kvm_vcpu_cache, vcpu_e500); 334 - out: 335 - return ERR_PTR(err); 339 + return err; 336 340 } 337 341 338 342 static void kvmppc_core_vcpu_free_e500mc(struct kvm_vcpu *vcpu) ··· 335 351 336 352 free_page((unsigned long)vcpu->arch.shared); 337 353 kvmppc_e500_tlb_uninit(vcpu_e500); 338 - kvm_vcpu_uninit(vcpu); 339 - kmem_cache_free(kvm_vcpu_cache, vcpu_e500); 340 354 } 341 355 342 356 static int kvmppc_core_init_vm_e500mc(struct kvm *kvm)
-5
arch/powerpc/kvm/emulate_loadstore.c
··· 73 73 { 74 74 struct kvm_run *run = vcpu->run; 75 75 u32 inst; 76 - int ra, rs, rt; 77 76 enum emulation_result emulated = EMULATE_FAIL; 78 77 int advance = 1; 79 78 struct instruction_op op; ··· 83 84 emulated = kvmppc_get_last_inst(vcpu, INST_GENERIC, &inst); 84 85 if (emulated != EMULATE_DONE) 85 86 return emulated; 86 - 87 - ra = get_ra(inst); 88 - rs = get_rs(inst); 89 - rt = get_rt(inst); 90 87 91 88 vcpu->arch.mmio_vsx_copy_nums = 0; 92 89 vcpu->arch.mmio_vsx_offset = 0;
+44 -44
arch/powerpc/kvm/powerpc.c
··· 475 475 #endif 476 476 477 477 kvm_for_each_vcpu(i, vcpu, kvm) 478 - kvm_arch_vcpu_free(vcpu); 478 + kvm_vcpu_destroy(vcpu); 479 479 480 480 mutex_lock(&kvm->lock); 481 481 for (i = 0; i < atomic_read(&kvm->online_vcpus); i++) ··· 720 720 kvmppc_core_flush_memslot(kvm, slot); 721 721 } 722 722 723 - struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) 723 + int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) 724 + { 725 + return 0; 726 + } 727 + 728 + static enum hrtimer_restart kvmppc_decrementer_wakeup(struct hrtimer *timer) 724 729 { 725 730 struct kvm_vcpu *vcpu; 726 - vcpu = kvmppc_core_vcpu_create(kvm, id); 727 - if (!IS_ERR(vcpu)) { 728 - vcpu->arch.wqp = &vcpu->wq; 729 - kvmppc_create_vcpu_debugfs(vcpu, id); 730 - } 731 - return vcpu; 731 + 732 + vcpu = container_of(timer, struct kvm_vcpu, arch.dec_timer); 733 + kvmppc_decrementer_func(vcpu); 734 + 735 + return HRTIMER_NORESTART; 736 + } 737 + 738 + int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) 739 + { 740 + int err; 741 + 742 + hrtimer_init(&vcpu->arch.dec_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); 743 + vcpu->arch.dec_timer.function = kvmppc_decrementer_wakeup; 744 + vcpu->arch.dec_expires = get_tb(); 745 + 746 + #ifdef CONFIG_KVM_EXIT_TIMING 747 + mutex_init(&vcpu->arch.exit_timing_lock); 748 + #endif 749 + err = kvmppc_subarch_vcpu_init(vcpu); 750 + if (err) 751 + return err; 752 + 753 + err = kvmppc_core_vcpu_create(vcpu); 754 + if (err) 755 + goto out_vcpu_uninit; 756 + 757 + vcpu->arch.wqp = &vcpu->wq; 758 + kvmppc_create_vcpu_debugfs(vcpu, vcpu->vcpu_id); 759 + return 0; 760 + 761 + out_vcpu_uninit: 762 + kvmppc_mmu_destroy(vcpu); 763 + kvmppc_subarch_vcpu_uninit(vcpu); 764 + return err; 732 765 } 733 766 734 767 void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) 735 768 { 736 769 } 737 770 738 - void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) 771 + void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) 739 772 { 740 773 /* Make sure we're not using the vcpu anymore */ 
741 774 hrtimer_cancel(&vcpu->arch.dec_timer); ··· 791 758 } 792 759 793 760 kvmppc_core_vcpu_free(vcpu); 794 - } 795 761 796 - void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) 797 - { 798 - kvm_arch_vcpu_free(vcpu); 762 + kvmppc_mmu_destroy(vcpu); 763 + kvmppc_subarch_vcpu_uninit(vcpu); 799 764 } 800 765 801 766 int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) 802 767 { 803 768 return kvmppc_core_pending_dec(vcpu); 804 - } 805 - 806 - static enum hrtimer_restart kvmppc_decrementer_wakeup(struct hrtimer *timer) 807 - { 808 - struct kvm_vcpu *vcpu; 809 - 810 - vcpu = container_of(timer, struct kvm_vcpu, arch.dec_timer); 811 - kvmppc_decrementer_func(vcpu); 812 - 813 - return HRTIMER_NORESTART; 814 - } 815 - 816 - int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) 817 - { 818 - int ret; 819 - 820 - hrtimer_init(&vcpu->arch.dec_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); 821 - vcpu->arch.dec_timer.function = kvmppc_decrementer_wakeup; 822 - vcpu->arch.dec_expires = get_tb(); 823 - 824 - #ifdef CONFIG_KVM_EXIT_TIMING 825 - mutex_init(&vcpu->arch.exit_timing_lock); 826 - #endif 827 - ret = kvmppc_subarch_vcpu_init(vcpu); 828 - return ret; 829 - } 830 - 831 - void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) 832 - { 833 - kvmppc_mmu_destroy(vcpu); 834 - kvmppc_subarch_vcpu_uninit(vcpu); 835 769 } 836 770 837 771 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
-1
arch/s390/include/asm/kvm_host.h
··· 914 914 915 915 static inline void kvm_arch_hardware_disable(void) {} 916 916 static inline void kvm_arch_sync_events(struct kvm *kvm) {} 917 - static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {} 918 917 static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} 919 918 static inline void kvm_arch_free_memslot(struct kvm *kvm, 920 919 struct kvm_memory_slot *free, struct kvm_memory_slot *dont) {}
+56 -64
arch/s390/kvm/kvm-s390.c
··· 2530 2530 if (vcpu->kvm->arch.use_cmma) 2531 2531 kvm_s390_vcpu_unsetup_cmma(vcpu); 2532 2532 free_page((unsigned long)(vcpu->arch.sie_block)); 2533 - 2534 - kvm_vcpu_uninit(vcpu); 2535 - kmem_cache_free(kvm_vcpu_cache, vcpu); 2536 2533 } 2537 2534 2538 2535 static void kvm_free_vcpus(struct kvm *kvm) ··· 2538 2541 struct kvm_vcpu *vcpu; 2539 2542 2540 2543 kvm_for_each_vcpu(i, vcpu, kvm) 2541 - kvm_arch_vcpu_destroy(vcpu); 2544 + kvm_vcpu_destroy(vcpu); 2542 2545 2543 2546 mutex_lock(&kvm->lock); 2544 2547 for (i = 0; i < atomic_read(&kvm->online_vcpus); i++) ··· 2698 2701 mutex_unlock(&kvm->lock); 2699 2702 2700 2703 return rc == 0 && id < KVM_S390_ESCA_CPU_SLOTS; 2701 - } 2702 - 2703 - int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) 2704 - { 2705 - vcpu->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID; 2706 - kvm_clear_async_pf_completion_queue(vcpu); 2707 - vcpu->run->kvm_valid_regs = KVM_SYNC_PREFIX | 2708 - KVM_SYNC_GPRS | 2709 - KVM_SYNC_ACRS | 2710 - KVM_SYNC_CRS | 2711 - KVM_SYNC_ARCH0 | 2712 - KVM_SYNC_PFAULT; 2713 - kvm_s390_set_prefix(vcpu, 0); 2714 - if (test_kvm_facility(vcpu->kvm, 64)) 2715 - vcpu->run->kvm_valid_regs |= KVM_SYNC_RICCB; 2716 - if (test_kvm_facility(vcpu->kvm, 82)) 2717 - vcpu->run->kvm_valid_regs |= KVM_SYNC_BPBC; 2718 - if (test_kvm_facility(vcpu->kvm, 133)) 2719 - vcpu->run->kvm_valid_regs |= KVM_SYNC_GSCB; 2720 - if (test_kvm_facility(vcpu->kvm, 156)) 2721 - vcpu->run->kvm_valid_regs |= KVM_SYNC_ETOKEN; 2722 - /* fprs can be synchronized via vrs, even if the guest has no vx. With 2723 - * MACHINE_HAS_VX, (load|store)_fpu_regs() will work with vrs format. 
2724 - */ 2725 - if (MACHINE_HAS_VX) 2726 - vcpu->run->kvm_valid_regs |= KVM_SYNC_VRS; 2727 - else 2728 - vcpu->run->kvm_valid_regs |= KVM_SYNC_FPRS; 2729 - 2730 - if (kvm_is_ucontrol(vcpu->kvm)) 2731 - return __kvm_ucontrol_vcpu_init(vcpu); 2732 - 2733 - return 0; 2734 2704 } 2735 2705 2736 2706 /* needs disabled preemption to protect from TOD sync and vcpu_load/put */ ··· 2926 2962 vcpu->arch.sie_block->fac = (u32)(u64) model->fac_list; 2927 2963 } 2928 2964 2929 - int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) 2965 + static int kvm_s390_vcpu_setup(struct kvm_vcpu *vcpu) 2930 2966 { 2931 2967 int rc = 0; 2932 2968 ··· 2999 3035 return rc; 3000 3036 } 3001 3037 3002 - struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, 3003 - unsigned int id) 3038 + int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) 3004 3039 { 3005 - struct kvm_vcpu *vcpu; 3006 - struct sie_page *sie_page; 3007 - int rc = -EINVAL; 3008 - 3009 3040 if (!kvm_is_ucontrol(kvm) && !sca_can_add_vcpu(kvm, id)) 3010 - goto out; 3041 + return -EINVAL; 3042 + return 0; 3043 + } 3011 3044 3012 - rc = -ENOMEM; 3013 - 3014 - vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); 3015 - if (!vcpu) 3016 - goto out; 3045 + int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) 3046 + { 3047 + struct sie_page *sie_page; 3048 + int rc; 3017 3049 3018 3050 BUILD_BUG_ON(sizeof(struct sie_page) != 4096); 3019 3051 sie_page = (struct sie_page *) get_zeroed_page(GFP_KERNEL); 3020 3052 if (!sie_page) 3021 - goto out_free_cpu; 3053 + return -ENOMEM; 3022 3054 3023 3055 vcpu->arch.sie_block = &sie_page->sie_block; 3024 3056 vcpu->arch.sie_block->itdba = (unsigned long) &sie_page->itdb; ··· 3023 3063 vcpu->arch.sie_block->mso = 0; 3024 3064 vcpu->arch.sie_block->msl = sclp.hamax; 3025 3065 3026 - vcpu->arch.sie_block->icpua = id; 3066 + vcpu->arch.sie_block->icpua = vcpu->vcpu_id; 3027 3067 spin_lock_init(&vcpu->arch.local_int.lock); 3028 - vcpu->arch.sie_block->gd = (u32)(u64)kvm->arch.gisa_int.origin; 3068 + vcpu->arch.sie_block->gd = (u32)(u64)vcpu->kvm->arch.gisa_int.origin; 3029 3069 if (vcpu->arch.sie_block->gd && sclp.has_gisaf) 3030 3070 vcpu->arch.sie_block->gd |= GISA_FORMAT1; 3031 3071 seqcount_init(&vcpu->arch.cputm_seqcount); 3032 3072 3033 - rc = kvm_vcpu_init(vcpu, kvm, id); 3034 - if (rc) 3035 - goto out_free_sie_block; 3036 - VM_EVENT(kvm, 3, "create cpu %d at 0x%pK, sie block at 0x%pK", id, vcpu, 3037 - vcpu->arch.sie_block); 3038 - trace_kvm_s390_create_vcpu(id, vcpu, vcpu->arch.sie_block); 3073 + vcpu->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID; 3074 + kvm_clear_async_pf_completion_queue(vcpu); 3075 + vcpu->run->kvm_valid_regs = KVM_SYNC_PREFIX | 3076 + KVM_SYNC_GPRS | 3077 + KVM_SYNC_ACRS | 3078 + KVM_SYNC_CRS | 3079 + KVM_SYNC_ARCH0 | 3080 + KVM_SYNC_PFAULT; 3081 + kvm_s390_set_prefix(vcpu, 0); 3082 + if (test_kvm_facility(vcpu->kvm, 64)) 3083 + vcpu->run->kvm_valid_regs |= KVM_SYNC_RICCB; 3084 + if (test_kvm_facility(vcpu->kvm, 82)) 3085 + vcpu->run->kvm_valid_regs |= KVM_SYNC_BPBC; 3086 + if (test_kvm_facility(vcpu->kvm, 133)) 3087 + vcpu->run->kvm_valid_regs |= KVM_SYNC_GSCB; 3088 + if (test_kvm_facility(vcpu->kvm, 156)) 3089 + vcpu->run->kvm_valid_regs |= KVM_SYNC_ETOKEN; 3090 + /* fprs can be synchronized via vrs, even if the guest has no vx. With 3091 + * MACHINE_HAS_VX, (load|store)_fpu_regs() will work with vrs format.
3092 + */ 3093 + if (MACHINE_HAS_VX) 3094 + vcpu->run->kvm_valid_regs |= KVM_SYNC_VRS; 3095 + else 3096 + vcpu->run->kvm_valid_regs |= KVM_SYNC_FPRS; 3039 3097 3040 - return vcpu; 3098 + if (kvm_is_ucontrol(vcpu->kvm)) { 3099 + rc = __kvm_ucontrol_vcpu_init(vcpu); 3100 + if (rc) 3101 + goto out_free_sie_block; 3102 + } 3103 + 3104 + VM_EVENT(vcpu->kvm, 3, "create cpu %d at 0x%pK, sie block at 0x%pK", 3105 + vcpu->vcpu_id, vcpu, vcpu->arch.sie_block); 3106 + trace_kvm_s390_create_vcpu(vcpu->vcpu_id, vcpu, vcpu->arch.sie_block); 3107 + 3108 + rc = kvm_s390_vcpu_setup(vcpu); 3109 + if (rc) 3110 + goto out_ucontrol_uninit; 3111 + return 0; 3112 + 3113 + out_ucontrol_uninit: 3114 + if (kvm_is_ucontrol(vcpu->kvm)) 3115 + gmap_remove(vcpu->arch.gmap); 3041 3116 out_free_sie_block: 3042 3117 free_page((unsigned long)(vcpu->arch.sie_block)); 3043 - out_free_cpu: 3044 - kmem_cache_free(kvm_vcpu_cache, vcpu); 3045 - out: 3046 - return ERR_PTR(rc); 3118 + return rc; 3047 3119 } 3048 3120 3049 3121 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
+4
arch/x86/include/asm/kvm_emulate.h
··· 222 222 223 223 bool (*get_cpuid)(struct x86_emulate_ctxt *ctxt, u32 *eax, u32 *ebx, 224 224 u32 *ecx, u32 *edx, bool check_limit); 225 + bool (*guest_has_long_mode)(struct x86_emulate_ctxt *ctxt); 226 + bool (*guest_has_movbe)(struct x86_emulate_ctxt *ctxt); 227 + bool (*guest_has_fxsr)(struct x86_emulate_ctxt *ctxt); 228 + 225 229 void (*set_nmi_mask)(struct x86_emulate_ctxt *ctxt, bool masked); 226 230 227 231 unsigned (*get_hflags)(struct x86_emulate_ctxt *ctxt);
+23 -11
arch/x86/include/asm/kvm_host.h
··· 175 175 VCPU_SREG_LDTR, 176 176 }; 177 177 178 + enum exit_fastpath_completion { 179 + EXIT_FASTPATH_NONE, 180 + EXIT_FASTPATH_SKIP_EMUL_INS, 181 + }; 182 + 178 183 #include <asm/kvm_emulate.h> 179 184 180 185 #define KVM_NR_MEM_OBJS 40 ··· 383 378 void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root); 384 379 unsigned long (*get_cr3)(struct kvm_vcpu *vcpu); 385 380 u64 (*get_pdptr)(struct kvm_vcpu *vcpu, int index); 386 - int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err, 381 + int (*page_fault)(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u32 err, 387 382 bool prefault); 388 383 void (*inject_page_fault)(struct kvm_vcpu *vcpu, 389 384 struct x86_exception *fault); 390 - gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access, 391 - struct x86_exception *exception); 385 + gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gpa_t gva_or_gpa, 386 + u32 access, struct x86_exception *exception); 392 387 gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, 393 388 struct x86_exception *exception); 394 389 int (*sync_page)(struct kvm_vcpu *vcpu, ··· 611 606 * Paging state of an L2 guest (used for nested npt) 612 607 * 613 608 * This context will save all necessary information to walk page tables 614 - * of the an L2 guest. This context is only initialized for page table 609 + * of an L2 guest. This context is only initialized for page table 615 610 * walking and not for faulting since we never handle l2 page faults on 616 611 * the host. 617 612 */ ··· 690 685 bool pvclock_set_guest_stopped_request; 691 686 692 687 struct { 688 + u8 preempted; 693 689 u64 msr_val; 694 690 u64 last_steal; 695 - struct gfn_to_hva_cache stime; 696 - struct kvm_steal_time steal; 691 + struct gfn_to_pfn_cache cache; 697 692 } st; 698 693 699 694 u64 tsc_offset; ··· 1027 1022 bool msi_redir_hint; 1028 1023 }; 1029 1024 1025 + static inline u16 kvm_lapic_irq_dest_mode(bool dest_mode_logical) 1026 + { 1027 + return dest_mode_logical ? APIC_DEST_LOGICAL : APIC_DEST_PHYSICAL; 1028 + } 1029 + 1030 1030 struct kvm_x86_ops { 1031 1031 int (*cpu_has_kvm_support)(void); /* __init */ 1032 1032 int (*disabled_by_bios)(void); /* __init */ ··· 1050 1040 void (*vm_destroy)(struct kvm *kvm); 1051 1041 1052 1042 /* Create, but do not attach this VCPU */ 1053 - struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id); 1043 + int (*vcpu_create)(struct kvm_vcpu *vcpu); 1054 1044 void (*vcpu_free)(struct kvm_vcpu *vcpu); 1055 1045 void (*vcpu_reset)(struct kvm_vcpu *vcpu, bool init_event); ··· 1100 1090 void (*tlb_flush_gva)(struct kvm_vcpu *vcpu, gva_t addr); 1101 1091 1102 1092 void (*run)(struct kvm_vcpu *vcpu); 1103 - int (*handle_exit)(struct kvm_vcpu *vcpu); 1093 + int (*handle_exit)(struct kvm_vcpu *vcpu, 1094 + enum exit_fastpath_completion exit_fastpath); 1104 1095 int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); 1105 1096 void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask); 1106 1097 u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu); ··· 1151 1140 int (*check_intercept)(struct kvm_vcpu *vcpu, 1152 1141 struct x86_instruction_info *info, 1153 1142 enum x86_intercept_stage stage); 1154 - void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu); 1143 + void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu, 1144 + enum exit_fastpath_completion *exit_fastpath); 1155 1145 bool (*mpx_supported)(void); 1156 1146 bool (*xsaves_supported)(void); 1157 1147 bool (*umip_emulated)(void); 1158 1148 bool (*pt_supported)(void); 1149 + bool (*pku_supported)(void); 1159 1150 1160 1151 int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr); 1161 1152 void (*request_immediate_exit)(struct kvm_vcpu *vcpu); ··· 1481 1468 1482 1469 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); 1483 1470 1484 - int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u64 error_code, 1471 + int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, 1485 1472 void *insn, int insn_len);
1486 1473 void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); 1487 1474 void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid); ··· 1627 1614 int kvm_is_in_guest(void); 1628 1615 1629 1616 int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size); 1630 - int x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size); 1631 1617 bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu); 1632 1618 bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu); 1633 1619
+4
arch/x86/include/asm/pgtable_types.h
··· 566 566 extern pte_t *lookup_address(unsigned long address, unsigned int *level); 567 567 extern pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address, 568 568 unsigned int *level); 569 + 570 + struct mm_struct; 571 + extern pte_t *lookup_address_in_mm(struct mm_struct *mm, unsigned long address, 572 + unsigned int *level); 569 573 extern pmd_t *lookup_pmd_address(unsigned long address); 570 574 extern phys_addr_t slow_virt_to_phys(void *__address); 571 575 extern int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn,
+3 -3
arch/x86/include/asm/vmx.h
··· 22 22 /* 23 23 * Definitions of Primary Processor-Based VM-Execution Controls. 24 24 */ 25 - #define CPU_BASED_VIRTUAL_INTR_PENDING VMCS_CONTROL_BIT(VIRTUAL_INTR_PENDING) 26 - #define CPU_BASED_USE_TSC_OFFSETING VMCS_CONTROL_BIT(TSC_OFFSETTING) 25 + #define CPU_BASED_INTR_WINDOW_EXITING VMCS_CONTROL_BIT(VIRTUAL_INTR_PENDING) 26 + #define CPU_BASED_USE_TSC_OFFSETTING VMCS_CONTROL_BIT(TSC_OFFSETTING) 27 27 #define CPU_BASED_HLT_EXITING VMCS_CONTROL_BIT(HLT_EXITING) 28 28 #define CPU_BASED_INVLPG_EXITING VMCS_CONTROL_BIT(INVLPG_EXITING) 29 29 #define CPU_BASED_MWAIT_EXITING VMCS_CONTROL_BIT(MWAIT_EXITING) ··· 34 34 #define CPU_BASED_CR8_LOAD_EXITING VMCS_CONTROL_BIT(CR8_LOAD_EXITING) 35 35 #define CPU_BASED_CR8_STORE_EXITING VMCS_CONTROL_BIT(CR8_STORE_EXITING) 36 36 #define CPU_BASED_TPR_SHADOW VMCS_CONTROL_BIT(VIRTUAL_TPR) 37 - #define CPU_BASED_VIRTUAL_NMI_PENDING VMCS_CONTROL_BIT(VIRTUAL_NMI_PENDING) 37 + #define CPU_BASED_NMI_WINDOW_EXITING VMCS_CONTROL_BIT(VIRTUAL_NMI_PENDING) 38 38 #define CPU_BASED_MOV_DR_EXITING VMCS_CONTROL_BIT(MOV_DR_EXITING) 39 39 #define CPU_BASED_UNCOND_IO_EXITING VMCS_CONTROL_BIT(UNCOND_IO_EXITING) 40 40 #define CPU_BASED_USE_IO_BITMAPS VMCS_CONTROL_BIT(USE_IO_BITMAPS)
+2 -2
arch/x86/include/uapi/asm/vmx.h
··· 33 33 #define EXIT_REASON_TRIPLE_FAULT 2 34 34 #define EXIT_REASON_INIT_SIGNAL 3 35 35 36 - #define EXIT_REASON_PENDING_INTERRUPT 7 36 + #define EXIT_REASON_INTERRUPT_WINDOW 7 37 37 #define EXIT_REASON_NMI_WINDOW 8 38 38 #define EXIT_REASON_TASK_SWITCH 9 39 39 #define EXIT_REASON_CPUID 10 ··· 94 94 { EXIT_REASON_EXTERNAL_INTERRUPT, "EXTERNAL_INTERRUPT" }, \ 95 95 { EXIT_REASON_TRIPLE_FAULT, "TRIPLE_FAULT" }, \ 96 96 { EXIT_REASON_INIT_SIGNAL, "INIT_SIGNAL" }, \ 97 - { EXIT_REASON_PENDING_INTERRUPT, "PENDING_INTERRUPT" }, \ 97 + { EXIT_REASON_INTERRUPT_WINDOW, "INTERRUPT_WINDOW" }, \ 98 98 { EXIT_REASON_NMI_WINDOW, "NMI_WINDOW" }, \ 99 99 { EXIT_REASON_TASK_SWITCH, "TASK_SWITCH" }, \ 100 100 { EXIT_REASON_CPUID, "CPUID" }, \
+6 -3
arch/x86/kvm/cpuid.c
··· 62 62 return xcr0; 63 63 } 64 64 65 - #define F(x) bit(X86_FEATURE_##x) 65 + #define F feature_bit 66 66 67 67 int kvm_update_cpuid(struct kvm_vcpu *vcpu) 68 68 { ··· 281 281 return r; 282 282 } 283 283 284 - static void cpuid_mask(u32 *word, int wordnum) 284 + static __always_inline void cpuid_mask(u32 *word, int wordnum) 285 285 { 286 + reverse_cpuid_check(wordnum); 286 287 *word &= boot_cpu_data.x86_capability[wordnum]; 287 288 } 288 289 ··· 353 352 unsigned f_umip = kvm_x86_ops->umip_emulated() ? F(UMIP) : 0; 354 353 unsigned f_intel_pt = kvm_x86_ops->pt_supported() ? F(INTEL_PT) : 0; 355 354 unsigned f_la57; 355 + unsigned f_pku = kvm_x86_ops->pku_supported() ? F(PKU) : 0; 356 356 357 357 /* cpuid 7.0.ebx */ 358 358 const u32 kvm_cpuid_7_0_ebx_x86_features = ··· 365 363 366 364 /* cpuid 7.0.ecx*/ 367 365 const u32 kvm_cpuid_7_0_ecx_x86_features = 368 - F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | F(RDPID) | 366 + F(AVX512VBMI) | F(LA57) | 0 /*PKU*/ | 0 /*OSPKE*/ | F(RDPID) | 369 367 F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) | 370 368 F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) | 371 369 F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/; ··· 394 392 /* Set LA57 based on hardware capability. */ 395 393 entry->ecx |= f_la57; 396 394 entry->ecx |= f_umip; 395 + entry->ecx |= f_pku; 397 396 /* PKU is not yet implemented for shadow paging. */ 398 397 if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE)) 399 398 entry->ecx &= ~F(PKU);
+36 -9
arch/x86/kvm/cpuid.h
··· 53 53 [CPUID_7_ECX] = { 7, 0, CPUID_ECX}, 54 54 [CPUID_8000_0007_EBX] = {0x80000007, 0, CPUID_EBX}, 55 55 [CPUID_7_EDX] = { 7, 0, CPUID_EDX}, 56 + [CPUID_7_1_EAX] = { 7, 1, CPUID_EAX}, 56 57 }; 58 + 59 + /* 60 + * Reverse CPUID and its derivatives can only be used for hardware-defined 61 + * feature words, i.e. words whose bits directly correspond to a CPUID leaf. 62 + * Retrieving a feature bit or masking guest CPUID from a Linux-defined word 63 + * is nonsensical as the bit number/mask is an arbitrary software-defined value 64 + * and can't be used by KVM to query/control guest capabilities. And obviously 65 + * the leaf being queried must have an entry in the lookup table. 66 + */ 67 + static __always_inline void reverse_cpuid_check(unsigned x86_leaf) 68 + { 69 + BUILD_BUG_ON(x86_leaf == CPUID_LNX_1); 70 + BUILD_BUG_ON(x86_leaf == CPUID_LNX_2); 71 + BUILD_BUG_ON(x86_leaf == CPUID_LNX_3); 72 + BUILD_BUG_ON(x86_leaf == CPUID_LNX_4); 73 + BUILD_BUG_ON(x86_leaf >= ARRAY_SIZE(reverse_cpuid)); 74 + BUILD_BUG_ON(reverse_cpuid[x86_leaf].function == 0); 75 + } 76 + 77 + /* 78 + * Retrieve the bit mask from an X86_FEATURE_* definition. Features contain 79 + * the hardware defined bit number (stored in bits 4:0) and a software defined 80 + * "word" (stored in bits 31:5). The word is used to index into arrays of 81 + * bit masks that hold the per-cpu feature capabilities, e.g. this_cpu_has(). 
82 + */ 83 + static __always_inline u32 __feature_bit(int x86_feature) 84 + { 85 + reverse_cpuid_check(x86_feature / 32); 86 + return 1 << (x86_feature & 31); 87 + } 88 + 89 + #define feature_bit(name) __feature_bit(X86_FEATURE_##name) 57 90 58 91 static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned x86_feature) 59 92 { 60 93 unsigned x86_leaf = x86_feature / 32; 61 94 62 - BUILD_BUG_ON(x86_leaf >= ARRAY_SIZE(reverse_cpuid)); 63 - BUILD_BUG_ON(reverse_cpuid[x86_leaf].function == 0); 64 - 95 + reverse_cpuid_check(x86_leaf); 65 96 return reverse_cpuid[x86_leaf]; 66 97 } 67 98 ··· 124 93 { 125 94 int *reg; 126 95 127 - if (x86_feature == X86_FEATURE_XSAVE && 128 - !static_cpu_has(X86_FEATURE_XSAVE)) 129 - return false; 130 - 131 96 reg = guest_cpuid_get_register(vcpu, x86_feature); 132 97 if (!reg) 133 98 return false; 134 99 135 - return *reg & bit(x86_feature); 100 + return *reg & __feature_bit(x86_feature); 136 101 } 137 102 138 103 static __always_inline void guest_cpuid_clear(struct kvm_vcpu *vcpu, unsigned x86_feature) ··· 137 110 138 111 reg = guest_cpuid_get_register(vcpu, x86_feature); 139 112 if (reg) 140 - *reg &= ~bit(x86_feature); 113 + *reg &= ~__feature_bit(x86_feature); 141 114 } 142 115 143 116 static inline bool guest_cpuid_is_amd(struct kvm_vcpu *vcpu)
+85 -48
arch/x86/kvm/emulate.c
··· 22 22 #include "kvm_cache_regs.h" 23 23 #include <asm/kvm_emulate.h> 24 24 #include <linux/stringify.h> 25 + #include <asm/fpu/api.h> 25 26 #include <asm/debugreg.h> 26 27 #include <asm/nospec-branch.h> 27 28 ··· 311 310 #define ON64(x) 312 311 #endif 313 312 314 - static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)); 313 + typedef void (*fastop_t)(struct fastop *); 314 + 315 + static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop); 315 316 316 317 #define __FOP_FUNC(name) \ 317 318 ".align " __stringify(FASTOP_SIZE) " \n\t" \ ··· 1078 1075 } 1079 1076 } 1080 1077 1081 - static void read_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, int reg) 1078 + static void emulator_get_fpu(void) 1082 1079 { 1080 + fpregs_lock(); 1081 + 1082 + fpregs_assert_state_consistent(); 1083 + if (test_thread_flag(TIF_NEED_FPU_LOAD)) 1084 + switch_fpu_return(); 1085 + } 1086 + 1087 + static void emulator_put_fpu(void) 1088 + { 1089 + fpregs_unlock(); 1090 + } 1091 + 1092 + static void read_sse_reg(sse128_t *data, int reg) 1093 + { 1094 + emulator_get_fpu(); 1083 1095 switch (reg) { 1084 1096 case 0: asm("movdqa %%xmm0, %0" : "=m"(*data)); break; 1085 1097 case 1: asm("movdqa %%xmm1, %0" : "=m"(*data)); break; ··· 1116 1098 #endif 1117 1099 default: BUG(); 1118 1100 } 1101 + emulator_put_fpu(); 1119 1102 } 1120 1103 1121 - static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, 1122 - int reg) 1104 + static void write_sse_reg(sse128_t *data, int reg) 1123 1105 { 1106 + emulator_get_fpu(); 1124 1107 switch (reg) { 1125 1108 case 0: asm("movdqa %0, %%xmm0" : : "m"(*data)); break; 1126 1109 case 1: asm("movdqa %0, %%xmm1" : : "m"(*data)); break; ··· 1143 1124 #endif 1144 1125 default: BUG(); 1145 1126 } 1127 + emulator_put_fpu(); 1146 1128 } 1147 1129 1148 - static void read_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg) 1130 + static void read_mmx_reg(u64 *data, int reg) 1149 1131 { 1132 + emulator_get_fpu(); 1150 
1133 switch (reg) { 1151 1134 case 0: asm("movq %%mm0, %0" : "=m"(*data)); break; 1152 1135 case 1: asm("movq %%mm1, %0" : "=m"(*data)); break; ··· 1160 1139 case 7: asm("movq %%mm7, %0" : "=m"(*data)); break; 1161 1140 default: BUG(); 1162 1141 } 1142 + emulator_put_fpu(); 1163 1143 } 1164 1144 1165 - static void write_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg) 1145 + static void write_mmx_reg(u64 *data, int reg) 1166 1146 { 1147 + emulator_get_fpu(); 1167 1148 switch (reg) { 1168 1149 case 0: asm("movq %0, %%mm0" : : "m"(*data)); break; 1169 1150 case 1: asm("movq %0, %%mm1" : : "m"(*data)); break; ··· 1177 1154 case 7: asm("movq %0, %%mm7" : : "m"(*data)); break; 1178 1155 default: BUG(); 1179 1156 } 1157 + emulator_put_fpu(); 1180 1158 } 1181 1159 1182 1160 static int em_fninit(struct x86_emulate_ctxt *ctxt) ··· 1185 1161 if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM)) 1186 1162 return emulate_nm(ctxt); 1187 1163 1164 + emulator_get_fpu(); 1188 1165 asm volatile("fninit"); 1166 + emulator_put_fpu(); 1189 1167 return X86EMUL_CONTINUE; 1190 1168 } 1191 1169 ··· 1198 1172 if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM)) 1199 1173 return emulate_nm(ctxt); 1200 1174 1175 + emulator_get_fpu(); 1201 1176 asm volatile("fnstcw %0": "+m"(fcw)); 1177 + emulator_put_fpu(); 1202 1178 1203 1179 ctxt->dst.val = fcw; 1204 1180 ··· 1214 1186 if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM)) 1215 1187 return emulate_nm(ctxt); 1216 1188 1189 + emulator_get_fpu(); 1217 1190 asm volatile("fnstsw %0": "+m"(fsw)); 1191 + emulator_put_fpu(); 1218 1192 1219 1193 ctxt->dst.val = fsw; 1220 1194 ··· 1235 1205 op->type = OP_XMM; 1236 1206 op->bytes = 16; 1237 1207 op->addr.xmm = reg; 1238 - read_sse_reg(ctxt, &op->vec_val, reg); 1208 + read_sse_reg(&op->vec_val, reg); 1239 1209 return; 1240 1210 } 1241 1211 if (ctxt->d & Mmx) { ··· 1286 1256 op->type = OP_XMM; 1287 1257 op->bytes = 16; 1288 1258 op->addr.xmm = ctxt->modrm_rm; 1289 - 
read_sse_reg(ctxt, &op->vec_val, ctxt->modrm_rm); 1259 + read_sse_reg(&op->vec_val, ctxt->modrm_rm); 1290 1260 return rc; 1291 1261 } 1292 1262 if (ctxt->d & Mmx) { ··· 1863 1833 op->bytes * op->count); 1864 1834 break; 1865 1835 case OP_XMM: 1866 - write_sse_reg(ctxt, &op->vec_val, op->addr.xmm); 1836 + write_sse_reg(&op->vec_val, op->addr.xmm); 1867 1837 break; 1868 1838 case OP_MM: 1869 - write_mmx_reg(ctxt, &op->mm_val, op->addr.mm); 1839 + write_mmx_reg(&op->mm_val, op->addr.mm); 1870 1840 break; 1871 1841 case OP_NONE: 1872 1842 /* no writeback */ ··· 2378 2348 static int emulator_has_longmode(struct x86_emulate_ctxt *ctxt) 2379 2349 { 2380 2350 #ifdef CONFIG_X86_64 2381 - u32 eax, ebx, ecx, edx; 2382 - 2383 - eax = 0x80000001; 2384 - ecx = 0; 2385 - ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false); 2386 - return edx & bit(X86_FEATURE_LM); 2351 + return ctxt->ops->guest_has_long_mode(ctxt); 2387 2352 #else 2388 2353 return false; 2389 2354 #endif ··· 3643 3618 return X86EMUL_CONTINUE; 3644 3619 } 3645 3620 3646 - #define FFL(x) bit(X86_FEATURE_##x) 3647 - 3648 3621 static int em_movbe(struct x86_emulate_ctxt *ctxt) 3649 3622 { 3650 - u32 ebx, ecx, edx, eax = 1; 3651 3623 u16 tmp; 3652 3624 3653 - /* 3654 - * Check MOVBE is set in the guest-visible CPUID leaf. 
3655 - */ 3656 - ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false); 3657 - if (!(ecx & FFL(MOVBE))) 3625 + if (!ctxt->ops->guest_has_movbe(ctxt)) 3658 3626 return emulate_ud(ctxt); 3659 3627 3660 3628 switch (ctxt->op_bytes) { ··· 4045 4027 4046 4028 static int check_fxsr(struct x86_emulate_ctxt *ctxt) 4047 4029 { 4048 - u32 eax = 1, ebx, ecx = 0, edx; 4049 - 4050 - ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false); 4051 - if (!(edx & FFL(FXSR))) 4030 + if (!ctxt->ops->guest_has_fxsr(ctxt)) 4052 4031 return emulate_ud(ctxt); 4053 4032 4054 4033 if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM)) ··· 4107 4092 if (rc != X86EMUL_CONTINUE) 4108 4093 return rc; 4109 4094 4095 + emulator_get_fpu(); 4096 + 4110 4097 rc = asm_safe("fxsave %[fx]", , [fx] "+m"(fx_state)); 4098 + 4099 + emulator_put_fpu(); 4111 4100 4112 4101 if (rc != X86EMUL_CONTINUE) 4113 4102 return rc; ··· 4155 4136 if (rc != X86EMUL_CONTINUE) 4156 4137 return rc; 4157 4138 4139 + emulator_get_fpu(); 4140 + 4158 4141 if (size < __fxstate_size(16)) { 4159 4142 rc = fxregs_fixup(&fx_state, size); 4160 4143 if (rc != X86EMUL_CONTINUE) ··· 4172 4151 rc = asm_safe("fxrstor %[fx]", : [fx] "m"(fx_state)); 4173 4152 4174 4153 out: 4154 + emulator_put_fpu(); 4155 + 4175 4156 return rc; 4176 4157 } 4177 4158 ··· 5233 5210 ctxt->ad_bytes = def_ad_bytes ^ 6; 5234 5211 break; 5235 5212 case 0x26: /* ES override */ 5213 + has_seg_override = true; 5214 + ctxt->seg_override = VCPU_SREG_ES; 5215 + break; 5236 5216 case 0x2e: /* CS override */ 5217 + has_seg_override = true; 5218 + ctxt->seg_override = VCPU_SREG_CS; 5219 + break; 5237 5220 case 0x36: /* SS override */ 5221 + has_seg_override = true; 5222 + ctxt->seg_override = VCPU_SREG_SS; 5223 + break; 5238 5224 case 0x3e: /* DS override */ 5239 5225 has_seg_override = true; 5240 - ctxt->seg_override = (ctxt->b >> 3) & 3; 5226 + ctxt->seg_override = VCPU_SREG_DS; 5241 5227 break; 5242 5228 case 0x64: /* FS override */ 5229 + has_seg_override = 
true; 5230 + ctxt->seg_override = VCPU_SREG_FS; 5231 + break; 5243 5232 case 0x65: /* GS override */ 5244 5233 has_seg_override = true; 5245 - ctxt->seg_override = ctxt->b & 7; 5234 + ctxt->seg_override = VCPU_SREG_GS; 5246 5235 break; 5247 5236 case 0x40 ... 0x4f: /* REX */ 5248 5237 if (mode != X86EMUL_MODE_PROT64) ··· 5338 5303 } 5339 5304 break; 5340 5305 case Escape: 5341 - if (ctxt->modrm > 0xbf) 5342 - opcode = opcode.u.esc->high[ctxt->modrm - 0xc0]; 5343 - else 5306 + if (ctxt->modrm > 0xbf) { 5307 + size_t size = ARRAY_SIZE(opcode.u.esc->high); 5308 + u32 index = array_index_nospec( 5309 + ctxt->modrm - 0xc0, size); 5310 + 5311 + opcode = opcode.u.esc->high[index]; 5312 + } else { 5344 5313 opcode = opcode.u.esc->op[(ctxt->modrm >> 3) & 7]; 5314 + } 5345 5315 break; 5346 5316 case InstrDual: 5347 5317 if ((ctxt->modrm >> 6) == 3) ··· 5488 5448 { 5489 5449 int rc; 5490 5450 5451 + emulator_get_fpu(); 5491 5452 rc = asm_safe("fwait"); 5453 + emulator_put_fpu(); 5492 5454 5493 5455 if (unlikely(rc != X86EMUL_CONTINUE)) 5494 5456 return emulate_exception(ctxt, MF_VECTOR, 0, false); ··· 5498 5456 return X86EMUL_CONTINUE; 5499 5457 } 5500 5458 5501 - static void fetch_possible_mmx_operand(struct x86_emulate_ctxt *ctxt, 5502 - struct operand *op) 5459 + static void fetch_possible_mmx_operand(struct operand *op) 5503 5460 { 5504 5461 if (op->type == OP_MM) 5505 - read_mmx_reg(ctxt, &op->mm_val, op->addr.mm); 5462 + read_mmx_reg(&op->mm_val, op->addr.mm); 5506 5463 } 5507 5464 5508 - static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)) 5465 + static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop) 5509 5466 { 5510 5467 ulong flags = (ctxt->eflags & EFLAGS_MASK) | X86_EFLAGS_IF; 5511 5468 ··· 5580 5539 * Now that we know the fpu is exception safe, we can fetch 5581 5540 * operands from it. 
5582 5541 */ 5583 - fetch_possible_mmx_operand(ctxt, &ctxt->src); 5584 - fetch_possible_mmx_operand(ctxt, &ctxt->src2); 5542 + fetch_possible_mmx_operand(&ctxt->src); 5543 + fetch_possible_mmx_operand(&ctxt->src2); 5585 5544 if (!(ctxt->d & Mov)) 5586 - fetch_possible_mmx_operand(ctxt, &ctxt->dst); 5545 + fetch_possible_mmx_operand(&ctxt->dst); 5587 5546 } 5588 5547 5589 5548 if (unlikely(emul_flags & X86EMUL_GUEST_MASK) && ctxt->intercept) { ··· 5682 5641 ctxt->eflags &= ~X86_EFLAGS_RF; 5683 5642 5684 5643 if (ctxt->execute) { 5685 - if (ctxt->d & Fastop) { 5686 - void (*fop)(struct fastop *) = (void *)ctxt->execute; 5687 - rc = fastop(ctxt, fop); 5688 - if (rc != X86EMUL_CONTINUE) 5689 - goto done; 5690 - goto writeback; 5691 - } 5692 - rc = ctxt->execute(ctxt); 5644 + if (ctxt->d & Fastop) 5645 + rc = fastop(ctxt, (fastop_t)ctxt->execute); 5646 + else 5647 + rc = ctxt->execute(ctxt); 5693 5648 if (rc != X86EMUL_CONTINUE) 5694 5649 goto done; 5695 5650 goto writeback;
+10 -7
arch/x86/kvm/hyperv.c
··· 33 33 #include <trace/events/kvm.h> 34 34 35 35 #include "trace.h" 36 + #include "irq.h" 36 37 37 38 #define KVM_HV_MAX_SPARSE_VCPU_SET_BITS DIV_ROUND_UP(KVM_MAX_VCPUS, 64) 38 39 ··· 810 809 u32 index, u64 *pdata) 811 810 { 812 811 struct kvm_hv *hv = &vcpu->kvm->arch.hyperv; 812 + size_t size = ARRAY_SIZE(hv->hv_crash_param); 813 813 814 - if (WARN_ON_ONCE(index >= ARRAY_SIZE(hv->hv_crash_param))) 814 + if (WARN_ON_ONCE(index >= size)) 815 815 return -EINVAL; 816 816 817 - *pdata = hv->hv_crash_param[index]; 817 + *pdata = hv->hv_crash_param[array_index_nospec(index, size)]; 818 818 return 0; 819 819 } 820 820 ··· 854 852 u32 index, u64 data) 855 853 { 856 854 struct kvm_hv *hv = &vcpu->kvm->arch.hyperv; 855 + size_t size = ARRAY_SIZE(hv->hv_crash_param); 857 856 858 - if (WARN_ON_ONCE(index >= ARRAY_SIZE(hv->hv_crash_param))) 857 + if (WARN_ON_ONCE(index >= size)) 859 858 return -EINVAL; 860 859 861 - hv->hv_crash_param[index] = data; 860 + hv->hv_crash_param[array_index_nospec(index, size)] = data; 862 861 return 0; 863 862 } 864 863 ··· 1061 1058 return 1; 1062 1059 break; 1063 1060 default: 1064 - vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n", 1061 + vcpu_unimpl(vcpu, "Hyper-V unhandled wrmsr: 0x%x data 0x%llx\n", 1065 1062 msr, data); 1066 1063 return 1; 1067 1064 } ··· 1124 1121 return 1; 1125 1122 1126 1123 /* 1127 - * Clear apic_assist portion of f(struct hv_vp_assist_page 1124 + * Clear apic_assist portion of struct hv_vp_assist_page 1128 1125 * only, there can be valuable data in the rest which needs 1129 1126 * to be preserved e.g. on migration. 1130 1127 */ ··· 1181 1178 return 1; 1182 1179 break; 1183 1180 default: 1184 - vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n", 1181 + vcpu_unimpl(vcpu, "Hyper-V unhandled wrmsr: 0x%x data 0x%llx\n", 1185 1182 msr, data); 1186 1183 return 1; 1187 1184 }
+5 -1
arch/x86/kvm/i8259.c
··· 460 460 switch (addr) { 461 461 case 0x20: 462 462 case 0x21: 463 + pic_lock(s); 464 + pic_ioport_write(&s->pics[0], addr, data); 465 + pic_unlock(s); 466 + break; 463 467 case 0xa0: 464 468 case 0xa1: 465 469 pic_lock(s); 466 - pic_ioport_write(&s->pics[addr >> 7], addr, data); 470 + pic_ioport_write(&s->pics[1], addr, data); 467 471 pic_unlock(s); 468 472 break; 469 473 case 0x4d0:
+25 -16
arch/x86/kvm/ioapic.c
··· 36 36 #include <linux/io.h> 37 37 #include <linux/slab.h> 38 38 #include <linux/export.h> 39 + #include <linux/nospec.h> 39 40 #include <asm/processor.h> 40 41 #include <asm/page.h> 41 42 #include <asm/current.h> ··· 69 68 default: 70 69 { 71 70 u32 redir_index = (ioapic->ioregsel - 0x10) >> 1; 72 - u64 redir_content; 71 + u64 redir_content = ~0ULL; 73 72 74 - if (redir_index < IOAPIC_NUM_PINS) 75 - redir_content = 76 - ioapic->redirtbl[redir_index].bits; 77 - else 78 - redir_content = ~0ULL; 73 + if (redir_index < IOAPIC_NUM_PINS) { 74 + u32 index = array_index_nospec( 75 + redir_index, IOAPIC_NUM_PINS); 76 + 77 + redir_content = ioapic->redirtbl[index].bits; 78 + } 79 79 80 80 result = (ioapic->ioregsel & 0x1) ? 81 81 (redir_content >> 32) & 0xffffffff : ··· 110 108 union kvm_ioapic_redirect_entry *e; 111 109 112 110 e = &ioapic->redirtbl[RTC_GSI]; 113 - if (!kvm_apic_match_dest(vcpu, NULL, 0, e->fields.dest_id, 114 - e->fields.dest_mode)) 111 + if (!kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT, 112 + e->fields.dest_id, 113 + kvm_lapic_irq_dest_mode(!!e->fields.dest_mode))) 115 114 return; 116 115 117 116 new_val = kvm_apic_pending_eoi(vcpu, e->fields.vector); ··· 191 188 /* 192 189 * Return 0 for coalesced interrupts; for edge-triggered interrupts, 193 190 * this only happens if a previous edge has not been delivered due 194 - * do masking. For level interrupts, the remote_irr field tells 191 + * to masking. For level interrupts, the remote_irr field tells 195 192 * us if the interrupt is waiting for an EOI. 
196 193 * 197 194 * RTC is special: it is edge-triggered, but userspace likes to know ··· 253 250 if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG || 254 251 kvm_irq_has_notifier(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index) || 255 252 index == RTC_GSI) { 256 - if (kvm_apic_match_dest(vcpu, NULL, 0, 257 - e->fields.dest_id, e->fields.dest_mode) || 253 + u16 dm = kvm_lapic_irq_dest_mode(!!e->fields.dest_mode); 254 + 255 + if (kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT, 256 + e->fields.dest_id, dm) || 258 257 kvm_apic_pending_eoi(vcpu, e->fields.vector)) 259 258 __set_bit(e->fields.vector, 260 259 ioapic_handled_vectors); ··· 297 292 298 293 if (index >= IOAPIC_NUM_PINS) 299 294 return; 295 + index = array_index_nospec(index, IOAPIC_NUM_PINS); 300 296 e = &ioapic->redirtbl[index]; 301 297 mask_before = e->fields.mask; 302 298 /* Preserve read-only fields */ ··· 333 327 if (e->fields.delivery_mode == APIC_DM_FIXED) { 334 328 struct kvm_lapic_irq irq; 335 329 336 - irq.shorthand = 0; 330 + irq.shorthand = APIC_DEST_NOSHORT; 337 331 irq.vector = e->fields.vector; 338 332 irq.delivery_mode = e->fields.delivery_mode << 8; 339 333 irq.dest_id = e->fields.dest_id; 340 - irq.dest_mode = e->fields.dest_mode; 334 + irq.dest_mode = 335 + kvm_lapic_irq_dest_mode(!!e->fields.dest_mode); 341 336 bitmap_zero(&vcpu_bitmap, 16); 342 337 kvm_bitmap_or_dest_vcpus(ioapic->kvm, &irq, 343 338 &vcpu_bitmap); ··· 350 343 * keep ioapic_handled_vectors synchronized. 
351 344 */ 352 345 irq.dest_id = old_dest_id; 353 - irq.dest_mode = old_dest_mode; 346 + irq.dest_mode = 347 + kvm_lapic_irq_dest_mode( 348 + !!e->fields.dest_mode); 354 349 kvm_bitmap_or_dest_vcpus(ioapic->kvm, &irq, 355 350 &vcpu_bitmap); 356 351 } ··· 378 369 379 370 irqe.dest_id = entry->fields.dest_id; 380 371 irqe.vector = entry->fields.vector; 381 - irqe.dest_mode = entry->fields.dest_mode; 372 + irqe.dest_mode = kvm_lapic_irq_dest_mode(!!entry->fields.dest_mode); 382 373 irqe.trig_mode = entry->fields.trig_mode; 383 374 irqe.delivery_mode = entry->fields.delivery_mode << 8; 384 375 irqe.level = 1; 385 - irqe.shorthand = 0; 376 + irqe.shorthand = APIC_DEST_NOSHORT; 386 377 irqe.msi_redir_hint = false; 387 378 388 379 if (irqe.trig_mode == IOAPIC_EDGE_TRIG)
-6
arch/x86/kvm/ioapic.h
··· 116 116 } 117 117 118 118 void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu); 119 - bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, 120 - int short_hand, unsigned int dest, int dest_mode); 121 - int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2); 122 119 void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector, 123 120 int trigger_mode); 124 121 int kvm_ioapic_init(struct kvm *kvm); ··· 123 126 int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id, 124 127 int level, bool line_status); 125 128 void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id); 126 - int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, 127 - struct kvm_lapic_irq *irq, 128 - struct dest_map *dest_map); 129 129 void kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); 130 130 void kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); 131 131 void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu,
+3
arch/x86/kvm/irq.h
··· 113 113 114 114 int kvm_setup_default_irq_routing(struct kvm *kvm); 115 115 int kvm_setup_empty_irq_routing(struct kvm *kvm); 116 + int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, 117 + struct kvm_lapic_irq *irq, 118 + struct dest_map *dest_map); 116 119 117 120 #endif
+10 -8
arch/x86/kvm/irq_comm.c
··· 52 52 unsigned long dest_vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)]; 53 53 unsigned int dest_vcpus = 0; 54 54 55 - if (irq->dest_mode == 0 && irq->dest_id == 0xff && 56 - kvm_lowest_prio_delivery(irq)) { 55 + if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map)) 56 + return r; 57 + 58 + if (irq->dest_mode == APIC_DEST_PHYSICAL && 59 + irq->dest_id == 0xff && kvm_lowest_prio_delivery(irq)) { 57 60 printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n"); 58 61 irq->delivery_mode = APIC_DM_FIXED; 59 62 } 60 - 61 - if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map)) 62 - return r; 63 63 64 64 memset(dest_vcpu_bitmap, 0, sizeof(dest_vcpu_bitmap)); 65 65 ··· 114 114 irq->dest_id |= MSI_ADDR_EXT_DEST_ID(e->msi.address_hi); 115 115 irq->vector = (e->msi.data & 116 116 MSI_DATA_VECTOR_MASK) >> MSI_DATA_VECTOR_SHIFT; 117 - irq->dest_mode = (1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo; 117 + irq->dest_mode = kvm_lapic_irq_dest_mode( 118 + !!((1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo)); 118 119 irq->trig_mode = (1 << MSI_DATA_TRIGGER_SHIFT) & e->msi.data; 119 120 irq->delivery_mode = e->msi.data & 0x700; 120 121 irq->msi_redir_hint = ((e->msi.address_lo 121 122 & MSI_ADDR_REDIRECTION_LOWPRI) > 0); 122 123 irq->level = 1; 123 - irq->shorthand = 0; 124 + irq->shorthand = APIC_DEST_NOSHORT; 124 125 } 125 126 EXPORT_SYMBOL_GPL(kvm_set_msi_irq); 126 127 ··· 417 416 418 417 kvm_set_msi_irq(vcpu->kvm, entry, &irq); 419 418 420 - if (irq.level && kvm_apic_match_dest(vcpu, NULL, 0, 419 + if (irq.level && 420 + kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT, 421 421 irq.dest_id, irq.dest_mode)) 422 422 __set_bit(irq.vector, ioapic_handled_vectors); 423 423 }
+19 -18
arch/x86/kvm/lapic.c
··· 56 56 #define APIC_VERSION (0x14UL | ((KVM_APIC_LVT_NUM - 1) << 16)) 57 57 #define LAPIC_MMIO_LENGTH (1 << 12) 58 58 /* followed define is not in apicdef.h */ 59 - #define APIC_SHORT_MASK 0xc0000 60 - #define APIC_DEST_NOSHORT 0x0 61 - #define APIC_DEST_MASK 0x800 62 59 #define MAX_APIC_VECTOR 256 63 60 #define APIC_VECTORS_PER_REG 32 64 61 ··· 789 792 } 790 793 791 794 bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, 792 - int short_hand, unsigned int dest, int dest_mode) 795 + int shorthand, unsigned int dest, int dest_mode) 793 796 { 794 797 struct kvm_lapic *target = vcpu->arch.apic; 795 798 u32 mda = kvm_apic_mda(vcpu, dest, source, target); 796 799 797 800 ASSERT(target); 798 - switch (short_hand) { 801 + switch (shorthand) { 799 802 case APIC_DEST_NOSHORT: 800 803 if (dest_mode == APIC_DEST_PHYSICAL) 801 804 return kvm_apic_match_physical_addr(target, mda); ··· 964 967 } 965 968 966 969 /* 967 - * This routine tries to handler interrupts in posted mode, here is how 970 + * This routine tries to handle interrupts in posted mode, here is how 968 971 * it deals with different cases: 969 972 * - For single-destination interrupts, handle it in posted mode 970 973 * - Else if vector hashing is enabled and it is a lowest-priority 971 974 * interrupt, handle it in posted mode and use the following mechanism 972 - * to find the destinaiton vCPU. 975 + * to find the destination vCPU. 973 976 * 1. For lowest-priority interrupts, store all the possible 974 977 * destination vCPUs in an array. 975 978 * 2. 
Use "guest vector % max number of destination vCPUs" to find ··· 1148 1151 if (!kvm_apic_present(vcpu)) 1149 1152 continue; 1150 1153 if (!kvm_apic_match_dest(vcpu, NULL, 1151 - irq->delivery_mode, 1154 + irq->shorthand, 1152 1155 irq->dest_id, 1153 1156 irq->dest_mode)) 1154 1157 continue; ··· 1571 1574 struct kvm_timer *ktimer = &apic->lapic_timer; 1572 1575 1573 1576 kvm_apic_local_deliver(apic, APIC_LVTT); 1574 - if (apic_lvtt_tscdeadline(apic)) 1577 + if (apic_lvtt_tscdeadline(apic)) { 1575 1578 ktimer->tscdeadline = 0; 1576 - if (apic_lvtt_oneshot(apic)) { 1579 + } else if (apic_lvtt_oneshot(apic)) { 1577 1580 ktimer->tscdeadline = 0; 1578 1581 ktimer->target_expiration = 0; 1579 1582 } ··· 1960 1963 case APIC_LVTTHMR: 1961 1964 case APIC_LVTPC: 1962 1965 case APIC_LVT1: 1963 - case APIC_LVTERR: 1966 + case APIC_LVTERR: { 1964 1967 /* TODO: Check vector */ 1968 + size_t size; 1969 + u32 index; 1970 + 1965 1971 if (!kvm_apic_sw_enabled(apic)) 1966 1972 val |= APIC_LVT_MASKED; 1967 - 1968 - val &= apic_lvt_mask[(reg - APIC_LVTT) >> 4]; 1973 + size = ARRAY_SIZE(apic_lvt_mask); 1974 + index = array_index_nospec( 1975 + (reg - APIC_LVTT) >> 4, size); 1976 + val &= apic_lvt_mask[index]; 1969 1977 kvm_lapic_set_reg(apic, reg, val); 1970 - 1971 1978 break; 1979 + } 1972 1980 1973 1981 case APIC_LVTT: 1974 1982 if (!kvm_apic_sw_enabled(apic)) ··· 2375 2373 int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu) 2376 2374 { 2377 2375 u32 lvt0 = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LVT0); 2378 - int r = 0; 2379 2376 2380 2377 if (!kvm_apic_hw_enabled(vcpu->arch.apic)) 2381 - r = 1; 2378 + return 1; 2382 2379 if ((lvt0 & APIC_LVT_MASKED) == 0 && 2383 2380 GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT) 2384 - r = 1; 2385 - return r; 2381 + return 1; 2382 + return 0; 2386 2383 } 2387 2384 2388 2385 void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
+5 -4
arch/x86/kvm/lapic.h
··· 10 10 #define KVM_APIC_SIPI 1 11 11 #define KVM_APIC_LVT_NUM 6 12 12 13 - #define KVM_APIC_SHORT_MASK 0xc0000 14 - #define KVM_APIC_DEST_MASK 0x800 13 + #define APIC_SHORT_MASK 0xc0000 14 + #define APIC_DEST_NOSHORT 0x0 15 + #define APIC_DEST_MASK 0x800 15 16 16 17 #define APIC_BUS_CYCLE_NS 1 17 18 #define APIC_BUS_FREQUENCY (1000000000ULL / APIC_BUS_CYCLE_NS) ··· 83 82 int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len, 84 83 void *data); 85 84 bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, 86 - int short_hand, unsigned int dest, int dest_mode); 87 - 85 + int shorthand, unsigned int dest, int dest_mode); 86 + int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2); 88 87 bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr); 89 88 bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir, int *max_irr); 90 89 void kvm_apic_update_ppr(struct kvm_vcpu *vcpu);
+246 -359
arch/x86/kvm/mmu/mmu.c
··· 418 418 * requires a full MMU zap). The flag is instead explicitly queried when 419 419 * checking for MMIO spte cache hits. 420 420 */ 421 - #define MMIO_SPTE_GEN_MASK GENMASK_ULL(18, 0) 421 + #define MMIO_SPTE_GEN_MASK GENMASK_ULL(17, 0) 422 422 423 423 #define MMIO_SPTE_GEN_LOW_START 3 424 424 #define MMIO_SPTE_GEN_LOW_END 11 425 425 #define MMIO_SPTE_GEN_LOW_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \ 426 426 MMIO_SPTE_GEN_LOW_START) 427 427 428 - #define MMIO_SPTE_GEN_HIGH_START 52 429 - #define MMIO_SPTE_GEN_HIGH_END 61 428 + #define MMIO_SPTE_GEN_HIGH_START PT64_SECOND_AVAIL_BITS_SHIFT 429 + #define MMIO_SPTE_GEN_HIGH_END 62 430 430 #define MMIO_SPTE_GEN_HIGH_MASK GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \ 431 431 MMIO_SPTE_GEN_HIGH_START) 432 + 432 433 static u64 generation_mmio_spte_mask(u64 gen) 433 434 { 434 435 u64 mask; 435 436 436 437 WARN_ON(gen & ~MMIO_SPTE_GEN_MASK); 438 + BUILD_BUG_ON((MMIO_SPTE_GEN_HIGH_MASK | MMIO_SPTE_GEN_LOW_MASK) & SPTE_SPECIAL_MASK); 437 439 438 440 mask = (gen << MMIO_SPTE_GEN_LOW_START) & MMIO_SPTE_GEN_LOW_MASK; 439 441 mask |= (gen << MMIO_SPTE_GEN_HIGH_START) & MMIO_SPTE_GEN_HIGH_MASK; ··· 445 443 static u64 get_mmio_spte_generation(u64 spte) 446 444 { 447 445 u64 gen; 448 - 449 - spte &= ~shadow_mmio_mask; 450 446 451 447 gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_START; 452 448 gen |= (spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_START; ··· 538 538 static u8 kvm_get_shadow_phys_bits(void) 539 539 { 540 540 /* 541 - * boot_cpu_data.x86_phys_bits is reduced when MKTME is detected 542 - * in CPU detection code, but MKTME treats those reduced bits as 543 - * 'keyID' thus they are not reserved bits. Therefore for MKTME 544 - * we should still return physical address bits reported by CPUID. 541 + * boot_cpu_data.x86_phys_bits is reduced when MKTME or SME are detected 542 + * in CPU detection code, but the processor treats those reduced bits as 543 + * 'keyID' thus they are not reserved bits. 
Therefore KVM needs to look at 544 + * the physical address bits reported by CPUID. 545 545 */ 546 - if (!boot_cpu_has(X86_FEATURE_TME) || 547 - WARN_ON_ONCE(boot_cpu_data.extended_cpuid_level < 0x80000008)) 548 - return boot_cpu_data.x86_phys_bits; 546 + if (likely(boot_cpu_data.extended_cpuid_level >= 0x80000008)) 547 + return cpuid_eax(0x80000008) & 0xff; 549 548 550 - return cpuid_eax(0x80000008) & 0xff; 549 + /* 550 + * Quite weird to have VMX or SVM but not MAXPHYADDR; probably a VM with 551 + * custom CPUID. Proceed with whatever the kernel found since these features 552 + * aren't virtualizable (SME/SEV also require CPUIDs higher than 0x80000008). 553 + */ 554 + return boot_cpu_data.x86_phys_bits; 551 555 } 552 556 553 557 static void kvm_mmu_reset_all_pte_masks(void) ··· 1264 1260 list_del(&sp->lpage_disallowed_link); 1265 1261 } 1266 1262 1267 - static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level, 1268 - struct kvm_memory_slot *slot) 1269 - { 1270 - struct kvm_lpage_info *linfo; 1271 - 1272 - if (slot) { 1273 - linfo = lpage_info_slot(gfn, slot, level); 1274 - return !!linfo->disallow_lpage; 1275 - } 1276 - 1277 - return true; 1278 - } 1279 - 1280 - static bool mmu_gfn_lpage_is_disallowed(struct kvm_vcpu *vcpu, gfn_t gfn, 1281 - int level) 1282 - { 1283 - struct kvm_memory_slot *slot; 1284 - 1285 - slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 1286 - return __mmu_gfn_lpage_is_disallowed(gfn, level, slot); 1287 - } 1288 - 1289 - static int host_mapping_level(struct kvm *kvm, gfn_t gfn) 1290 - { 1291 - unsigned long page_size; 1292 - int i, ret = 0; 1293 - 1294 - page_size = kvm_host_page_size(kvm, gfn); 1295 - 1296 - for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { 1297 - if (page_size >= KVM_HPAGE_SIZE(i)) 1298 - ret = i; 1299 - else 1300 - break; 1301 - } 1302 - 1303 - return ret; 1304 - } 1305 - 1306 - static inline bool memslot_valid_for_gpte(struct kvm_memory_slot *slot, 1307 - bool no_dirty_log) 1308 - { 1309 - if (!slot || 
slot->flags & KVM_MEMSLOT_INVALID) 1310 - return false; 1311 - if (no_dirty_log && slot->dirty_bitmap) 1312 - return false; 1313 - 1314 - return true; 1315 - } 1316 - 1317 1263 static struct kvm_memory_slot * 1318 1264 gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn, 1319 1265 bool no_dirty_log) ··· 1271 1317 struct kvm_memory_slot *slot; 1272 1318 1273 1319 slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 1274 - if (!memslot_valid_for_gpte(slot, no_dirty_log)) 1275 - slot = NULL; 1320 + if (!slot || slot->flags & KVM_MEMSLOT_INVALID) 1321 + return NULL; 1322 + if (no_dirty_log && slot->dirty_bitmap) 1323 + return NULL; 1276 1324 1277 1325 return slot; 1278 - } 1279 - 1280 - static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn, 1281 - bool *force_pt_level) 1282 - { 1283 - int host_level, level, max_level; 1284 - struct kvm_memory_slot *slot; 1285 - 1286 - if (unlikely(*force_pt_level)) 1287 - return PT_PAGE_TABLE_LEVEL; 1288 - 1289 - slot = kvm_vcpu_gfn_to_memslot(vcpu, large_gfn); 1290 - *force_pt_level = !memslot_valid_for_gpte(slot, true); 1291 - if (unlikely(*force_pt_level)) 1292 - return PT_PAGE_TABLE_LEVEL; 1293 - 1294 - host_level = host_mapping_level(vcpu->kvm, large_gfn); 1295 - 1296 - if (host_level == PT_PAGE_TABLE_LEVEL) 1297 - return host_level; 1298 - 1299 - max_level = min(kvm_x86_ops->get_lpage_level(), host_level); 1300 - 1301 - for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level) 1302 - if (__mmu_gfn_lpage_is_disallowed(large_gfn, level, slot)) 1303 - break; 1304 - 1305 - return level - 1; 1306 1326 } 1307 1327 1308 1328 /* ··· 1338 1410 if (j != 0) 1339 1411 return; 1340 1412 if (!prev_desc && !desc->more) 1341 - rmap_head->val = (unsigned long)desc->sptes[0]; 1413 + rmap_head->val = 0; 1342 1414 else 1343 1415 if (prev_desc) 1344 1416 prev_desc->more = desc->more; ··· 1453 1525 /* 1454 1526 * Iteration must be started by this function. 
This should also be used after 1455 1527 * removing/dropping sptes from the rmap link because in such cases the 1456 - * information in the itererator may not be valid. 1528 + * information in the iterator may not be valid. 1457 1529 * 1458 1530 * Returns sptep if found, NULL otherwise. 1459 1531 */ ··· 2827 2899 return kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); 2828 2900 } 2829 2901 2902 + static int make_mmu_pages_available(struct kvm_vcpu *vcpu) 2903 + { 2904 + LIST_HEAD(invalid_list); 2905 + 2906 + if (likely(kvm_mmu_available_pages(vcpu->kvm) >= KVM_MIN_FREE_MMU_PAGES)) 2907 + return 0; 2908 + 2909 + while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES) { 2910 + if (!prepare_zap_oldest_mmu_page(vcpu->kvm, &invalid_list)) 2911 + break; 2912 + 2913 + ++vcpu->kvm->stat.mmu_recycled; 2914 + } 2915 + kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); 2916 + 2917 + if (!kvm_mmu_available_pages(vcpu->kvm)) 2918 + return -ENOSPC; 2919 + return 0; 2920 + } 2921 + 2830 2922 /* 2831 2923 * Changing the number of mmu pages allocated to the vm 2832 2924 * Note: if goal_nr_mmu_pages is too small, you will get a deadlock ··· 3047 3099 spte |= (u64)pfn << PAGE_SHIFT; 3048 3100 3049 3101 if (pte_access & ACC_WRITE_MASK) { 3050 - 3051 - /* 3052 - * Other vcpu creates new sp in the window between 3053 - * mapping_level() and acquiring mmu-lock. We can 3054 - * allow guest to retry the access, the mapping can 3055 - * be fixed if guest refault. 
3056 - */ 3057 - if (level > PT_PAGE_TABLE_LEVEL && 3058 - mmu_gfn_lpage_is_disallowed(vcpu, gfn, level)) 3059 - goto done; 3060 - 3061 3102 spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE; 3062 3103 3063 3104 /* ··· 3078 3141 set_pte: 3079 3142 if (mmu_spte_update(sptep, spte)) 3080 3143 ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH; 3081 - done: 3082 3144 return ret; 3083 3145 } 3084 3146 ··· 3230 3294 __direct_pte_prefetch(vcpu, sp, sptep); 3231 3295 } 3232 3296 3297 + static int host_pfn_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn, 3298 + kvm_pfn_t pfn, struct kvm_memory_slot *slot) 3299 + { 3300 + unsigned long hva; 3301 + pte_t *pte; 3302 + int level; 3303 + 3304 + BUILD_BUG_ON(PT_PAGE_TABLE_LEVEL != (int)PG_LEVEL_4K || 3305 + PT_DIRECTORY_LEVEL != (int)PG_LEVEL_2M || 3306 + PT_PDPE_LEVEL != (int)PG_LEVEL_1G); 3307 + 3308 + if (!PageCompound(pfn_to_page(pfn)) && !kvm_is_zone_device_pfn(pfn)) 3309 + return PT_PAGE_TABLE_LEVEL; 3310 + 3311 + /* 3312 + * Note, using the already-retrieved memslot and __gfn_to_hva_memslot() 3313 + * is not solely for performance, it's also necessary to avoid the 3314 + * "writable" check in __gfn_to_hva_many(), which will always fail on 3315 + * read-only memslots due to gfn_to_hva() assuming writes. Earlier 3316 + * page fault steps have already verified the guest isn't writing a 3317 + * read-only memslot. 
3318 + */ 3319 + hva = __gfn_to_hva_memslot(slot, gfn); 3320 + 3321 + pte = lookup_address_in_mm(vcpu->kvm->mm, hva, &level); 3322 + if (unlikely(!pte)) 3323 + return PT_PAGE_TABLE_LEVEL; 3324 + 3325 + return level; 3326 + } 3327 + 3328 + static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, 3329 + int max_level, kvm_pfn_t *pfnp) 3330 + { 3331 + struct kvm_memory_slot *slot; 3332 + struct kvm_lpage_info *linfo; 3333 + kvm_pfn_t pfn = *pfnp; 3334 + kvm_pfn_t mask; 3335 + int level; 3336 + 3337 + if (unlikely(max_level == PT_PAGE_TABLE_LEVEL)) 3338 + return PT_PAGE_TABLE_LEVEL; 3339 + 3340 + if (is_error_noslot_pfn(pfn) || kvm_is_reserved_pfn(pfn)) 3341 + return PT_PAGE_TABLE_LEVEL; 3342 + 3343 + slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, true); 3344 + if (!slot) 3345 + return PT_PAGE_TABLE_LEVEL; 3346 + 3347 + max_level = min(max_level, kvm_x86_ops->get_lpage_level()); 3348 + for ( ; max_level > PT_PAGE_TABLE_LEVEL; max_level--) { 3349 + linfo = lpage_info_slot(gfn, slot, max_level); 3350 + if (!linfo->disallow_lpage) 3351 + break; 3352 + } 3353 + 3354 + if (max_level == PT_PAGE_TABLE_LEVEL) 3355 + return PT_PAGE_TABLE_LEVEL; 3356 + 3357 + level = host_pfn_mapping_level(vcpu, gfn, pfn, slot); 3358 + if (level == PT_PAGE_TABLE_LEVEL) 3359 + return level; 3360 + 3361 + level = min(level, max_level); 3362 + 3363 + /* 3364 + * mmu_notifier_retry() was successful and mmu_lock is held, so 3365 + * the pmd can't be split from under us. 
3366 + */ 3367 + mask = KVM_PAGES_PER_HPAGE(level) - 1; 3368 + VM_BUG_ON((gfn & mask) != (pfn & mask)); 3369 + *pfnp = pfn & ~mask; 3370 + 3371 + return level; 3372 + } 3373 + 3233 3374 static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it, 3234 3375 gfn_t gfn, kvm_pfn_t *pfnp, int *levelp) 3235 3376 { ··· 3331 3318 } 3332 3319 3333 3320 static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, 3334 - int map_writable, int level, kvm_pfn_t pfn, 3335 - bool prefault, bool lpage_disallowed) 3321 + int map_writable, int max_level, kvm_pfn_t pfn, 3322 + bool prefault, bool account_disallowed_nx_lpage) 3336 3323 { 3337 3324 struct kvm_shadow_walk_iterator it; 3338 3325 struct kvm_mmu_page *sp; 3339 - int ret; 3326 + int level, ret; 3340 3327 gfn_t gfn = gpa >> PAGE_SHIFT; 3341 3328 gfn_t base_gfn = gfn; 3342 3329 3343 - if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) 3330 + if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa))) 3344 3331 return RET_PF_RETRY; 3332 + 3333 + level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn); 3345 3334 3346 3335 trace_kvm_mmu_spte_requested(gpa, level, pfn); 3347 3336 for_each_shadow_entry(vcpu, gpa, it) { ··· 3363 3348 it.level - 1, true, ACC_ALL); 3364 3349 3365 3350 link_shadow_page(vcpu, it.sptep, sp); 3366 - if (lpage_disallowed) 3351 + if (account_disallowed_nx_lpage) 3367 3352 account_huge_nx_page(vcpu->kvm, sp); 3368 3353 } 3369 3354 } ··· 3397 3382 } 3398 3383 3399 3384 return -EFAULT; 3400 - } 3401 - 3402 - static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, 3403 - gfn_t gfn, kvm_pfn_t *pfnp, 3404 - int *levelp) 3405 - { 3406 - kvm_pfn_t pfn = *pfnp; 3407 - int level = *levelp; 3408 - 3409 - /* 3410 - * Check if it's a transparent hugepage. If this would be an 3411 - * hugetlbfs page, level wouldn't be set to 3412 - * PT_PAGE_TABLE_LEVEL and there would be no adjustment done 3413 - * here. 
3414 - */ 3415 - if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) && 3416 - !kvm_is_zone_device_pfn(pfn) && level == PT_PAGE_TABLE_LEVEL && 3417 - PageTransCompoundMap(pfn_to_page(pfn)) && 3418 - !mmu_gfn_lpage_is_disallowed(vcpu, gfn, PT_DIRECTORY_LEVEL)) { 3419 - unsigned long mask; 3420 - /* 3421 - * mmu_notifier_retry was successful and we hold the 3422 - * mmu_lock here, so the pmd can't become splitting 3423 - * from under us, and in turn 3424 - * __split_huge_page_refcount() can't run from under 3425 - * us and we can safely transfer the refcount from 3426 - * PG_tail to PG_head as we switch the pfn to tail to 3427 - * head. 3428 - */ 3429 - *levelp = level = PT_DIRECTORY_LEVEL; 3430 - mask = KVM_PAGES_PER_HPAGE(level) - 1; 3431 - VM_BUG_ON((gfn & mask) != (pfn & mask)); 3432 - if (pfn & mask) { 3433 - kvm_release_pfn_clean(pfn); 3434 - pfn &= ~mask; 3435 - kvm_get_pfn(pfn); 3436 - *pfnp = pfn; 3437 - } 3438 - } 3439 3385 } 3440 3386 3441 3387 static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, ··· 3504 3528 * - true: let the vcpu to access on the same address again. 3505 3529 * - false: let the real page fault path to fix it. 
3506 3530 */ 3507 - static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, 3531 + static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, 3508 3532 u32 error_code) 3509 3533 { 3510 3534 struct kvm_shadow_walk_iterator iterator; ··· 3512 3536 bool fault_handled = false; 3513 3537 u64 spte = 0ull; 3514 3538 uint retry_count = 0; 3515 - 3516 - if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) 3517 - return false; 3518 3539 3519 3540 if (!page_fault_can_be_fast(error_code)) 3520 3541 return false; ··· 3521 3548 do { 3522 3549 u64 new_spte; 3523 3550 3524 - for_each_shadow_entry_lockless(vcpu, gva, iterator, spte) 3525 - if (!is_shadow_present_pte(spte) || 3526 - iterator.level < level) 3551 + for_each_shadow_entry_lockless(vcpu, cr2_or_gpa, iterator, spte) 3552 + if (!is_shadow_present_pte(spte)) 3527 3553 break; 3528 3554 3529 3555 sp = page_header(__pa(iterator.sptep)); ··· 3598 3626 3599 3627 } while (true); 3600 3628 3601 - trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep, 3629 + trace_fast_page_fault(vcpu, cr2_or_gpa, error_code, iterator.sptep, 3602 3630 spte, fault_handled); 3603 3631 walk_shadow_page_lockless_end(vcpu); 3604 3632 3605 3633 return fault_handled; 3606 - } 3607 - 3608 - static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, 3609 - gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable); 3610 - static int make_mmu_pages_available(struct kvm_vcpu *vcpu); 3611 - 3612 - static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, 3613 - gfn_t gfn, bool prefault) 3614 - { 3615 - int r; 3616 - int level; 3617 - bool force_pt_level; 3618 - kvm_pfn_t pfn; 3619 - unsigned long mmu_seq; 3620 - bool map_writable, write = error_code & PFERR_WRITE_MASK; 3621 - bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) && 3622 - is_nx_huge_page_enabled(); 3623 - 3624 - force_pt_level = lpage_disallowed; 3625 - level = mapping_level(vcpu, gfn, &force_pt_level); 3626 - if (likely(!force_pt_level)) { 
3627 - /* 3628 - * This path builds a PAE pagetable - so we can map 3629 - * 2mb pages at maximum. Therefore check if the level 3630 - * is larger than that. 3631 - */ 3632 - if (level > PT_DIRECTORY_LEVEL) 3633 - level = PT_DIRECTORY_LEVEL; 3634 - 3635 - gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); 3636 - } 3637 - 3638 - if (fast_page_fault(vcpu, v, level, error_code)) 3639 - return RET_PF_RETRY; 3640 - 3641 - mmu_seq = vcpu->kvm->mmu_notifier_seq; 3642 - smp_rmb(); 3643 - 3644 - if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable)) 3645 - return RET_PF_RETRY; 3646 - 3647 - if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r)) 3648 - return r; 3649 - 3650 - r = RET_PF_RETRY; 3651 - spin_lock(&vcpu->kvm->mmu_lock); 3652 - if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) 3653 - goto out_unlock; 3654 - if (make_mmu_pages_available(vcpu) < 0) 3655 - goto out_unlock; 3656 - if (likely(!force_pt_level)) 3657 - transparent_hugepage_adjust(vcpu, gfn, &pfn, &level); 3658 - r = __direct_map(vcpu, v, write, map_writable, level, pfn, 3659 - prefault, false); 3660 - out_unlock: 3661 - spin_unlock(&vcpu->kvm->mmu_lock); 3662 - kvm_release_pfn_clean(pfn); 3663 - return r; 3664 3634 } 3665 3635 3666 3636 static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, ··· 3895 3981 } 3896 3982 EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots); 3897 3983 3898 - static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr, 3984 + static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gpa_t vaddr, 3899 3985 u32 access, struct x86_exception *exception) 3900 3986 { 3901 3987 if (exception) ··· 3903 3989 return vaddr; 3904 3990 } 3905 3991 3906 - static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr, 3992 + static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gpa_t vaddr, 3907 3993 u32 access, 3908 3994 struct x86_exception *exception) 3909 3995 { ··· 3915 4001 static bool 3916 4002 __is_rsvd_bits_set(struct rsvd_bits_validate *rsvd_check, u64 
pte, int level) 3917 4003 { 3918 - int bit7 = (pte >> 7) & 1, low6 = pte & 0x3f; 4004 + int bit7 = (pte >> 7) & 1; 3919 4005 3920 - return (pte & rsvd_check->rsvd_bits_mask[bit7][level-1]) | 3921 - ((rsvd_check->bad_mt_xwr & (1ull << low6)) != 0); 4006 + return pte & rsvd_check->rsvd_bits_mask[bit7][level-1]; 3922 4007 } 3923 4008 3924 - static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level) 4009 + static bool __is_bad_mt_xwr(struct rsvd_bits_validate *rsvd_check, u64 pte) 3925 4010 { 3926 - return __is_rsvd_bits_set(&mmu->guest_rsvd_check, gpte, level); 3927 - } 3928 - 3929 - static bool is_shadow_zero_bits_set(struct kvm_mmu *mmu, u64 spte, int level) 3930 - { 3931 - return __is_rsvd_bits_set(&mmu->shadow_zero_check, spte, level); 4011 + return rsvd_check->bad_mt_xwr & BIT_ULL(pte & 0x3f); 3932 4012 } 3933 4013 3934 4014 static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct) ··· 3946 4038 { 3947 4039 struct kvm_shadow_walk_iterator iterator; 3948 4040 u64 sptes[PT64_ROOT_MAX_LEVEL], spte = 0ull; 4041 + struct rsvd_bits_validate *rsvd_check; 3949 4042 int root, leaf; 3950 4043 bool reserved = false; 3951 4044 3952 - if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) 3953 - goto exit; 4045 + rsvd_check = &vcpu->arch.mmu->shadow_zero_check; 3954 4046 3955 4047 walk_shadow_page_lockless_begin(vcpu); 3956 4048 ··· 3966 4058 if (!is_shadow_present_pte(spte)) 3967 4059 break; 3968 4060 3969 - reserved |= is_shadow_zero_bits_set(vcpu->arch.mmu, spte, 3970 - iterator.level); 4061 + /* 4062 + * Use a bitwise-OR instead of a logical-OR to aggregate the 4063 + * reserved bit and EPT's invalid memtype/XWR checks to avoid 4064 + * adding a Jcc in the loop. 
4065 + */ 4066 + reserved |= __is_bad_mt_xwr(rsvd_check, spte) | 4067 + __is_rsvd_bits_set(rsvd_check, spte, iterator.level); 3971 4068 } 3972 4069 3973 4070 walk_shadow_page_lockless_end(vcpu); ··· 3986 4073 root--; 3987 4074 } 3988 4075 } 3989 - exit: 4076 + 3990 4077 *sptep = spte; 3991 4078 return reserved; 3992 4079 } ··· 4050 4137 struct kvm_shadow_walk_iterator iterator; 4051 4138 u64 spte; 4052 4139 4053 - if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) 4054 - return; 4055 - 4056 4140 walk_shadow_page_lockless_begin(vcpu); 4057 4141 for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) { 4058 4142 clear_sp_write_flooding_count(iterator.sptep); ··· 4059 4149 walk_shadow_page_lockless_end(vcpu); 4060 4150 } 4061 4151 4062 - static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, 4063 - u32 error_code, bool prefault) 4064 - { 4065 - gfn_t gfn = gva >> PAGE_SHIFT; 4066 - int r; 4067 - 4068 - pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code); 4069 - 4070 - if (page_fault_handle_page_track(vcpu, error_code, gfn)) 4071 - return RET_PF_EMULATE; 4072 - 4073 - r = mmu_topup_memory_caches(vcpu); 4074 - if (r) 4075 - return r; 4076 - 4077 - MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)); 4078 - 4079 - 4080 - return nonpaging_map(vcpu, gva & PAGE_MASK, 4081 - error_code, gfn, prefault); 4082 - } 4083 - 4084 - static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) 4152 + static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, 4153 + gfn_t gfn) 4085 4154 { 4086 4155 struct kvm_arch_async_pf arch; 4087 4156 ··· 4069 4180 arch.direct_map = vcpu->arch.mmu->direct_map; 4070 4181 arch.cr3 = vcpu->arch.mmu->get_cr3(vcpu); 4071 4182 4072 - return kvm_setup_async_pf(vcpu, gva, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch); 4183 + return kvm_setup_async_pf(vcpu, cr2_or_gpa, 4184 + kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch); 4073 4185 } 4074 4186 4075 4187 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, 
gfn_t gfn, 4076 - gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable) 4188 + gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write, 4189 + bool *writable) 4077 4190 { 4078 4191 struct kvm_memory_slot *slot; 4079 4192 bool async; ··· 4095 4204 return false; /* *pfn has correct page already */ 4096 4205 4097 4206 if (!prefault && kvm_can_do_async_pf(vcpu)) { 4098 - trace_kvm_try_async_get_page(gva, gfn); 4207 + trace_kvm_try_async_get_page(cr2_or_gpa, gfn); 4099 4208 if (kvm_find_async_pf_gfn(vcpu, gfn)) { 4100 - trace_kvm_async_pf_doublefault(gva, gfn); 4209 + trace_kvm_async_pf_doublefault(cr2_or_gpa, gfn); 4101 4210 kvm_make_request(KVM_REQ_APF_HALT, vcpu); 4102 4211 return true; 4103 - } else if (kvm_arch_setup_async_pf(vcpu, gva, gfn)) 4212 + } else if (kvm_arch_setup_async_pf(vcpu, cr2_or_gpa, gfn)) 4104 4213 return true; 4105 4214 } 4106 4215 ··· 4108 4217 return false; 4109 4218 } 4110 4219 4220 + static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, 4221 + bool prefault, int max_level, bool is_tdp) 4222 + { 4223 + bool write = error_code & PFERR_WRITE_MASK; 4224 + bool exec = error_code & PFERR_FETCH_MASK; 4225 + bool lpage_disallowed = exec && is_nx_huge_page_enabled(); 4226 + bool map_writable; 4227 + 4228 + gfn_t gfn = gpa >> PAGE_SHIFT; 4229 + unsigned long mmu_seq; 4230 + kvm_pfn_t pfn; 4231 + int r; 4232 + 4233 + if (page_fault_handle_page_track(vcpu, error_code, gfn)) 4234 + return RET_PF_EMULATE; 4235 + 4236 + r = mmu_topup_memory_caches(vcpu); 4237 + if (r) 4238 + return r; 4239 + 4240 + if (lpage_disallowed) 4241 + max_level = PT_PAGE_TABLE_LEVEL; 4242 + 4243 + if (fast_page_fault(vcpu, gpa, error_code)) 4244 + return RET_PF_RETRY; 4245 + 4246 + mmu_seq = vcpu->kvm->mmu_notifier_seq; 4247 + smp_rmb(); 4248 + 4249 + if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable)) 4250 + return RET_PF_RETRY; 4251 + 4252 + if (handle_abnormal_pfn(vcpu, is_tdp ? 
0 : gpa, gfn, pfn, ACC_ALL, &r)) 4253 + return r; 4254 + 4255 + r = RET_PF_RETRY; 4256 + spin_lock(&vcpu->kvm->mmu_lock); 4257 + if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) 4258 + goto out_unlock; 4259 + if (make_mmu_pages_available(vcpu) < 0) 4260 + goto out_unlock; 4261 + r = __direct_map(vcpu, gpa, write, map_writable, max_level, pfn, 4262 + prefault, is_tdp && lpage_disallowed); 4263 + 4264 + out_unlock: 4265 + spin_unlock(&vcpu->kvm->mmu_lock); 4266 + kvm_release_pfn_clean(pfn); 4267 + return r; 4268 + } 4269 + 4270 + static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, 4271 + u32 error_code, bool prefault) 4272 + { 4273 + pgprintk("%s: gva %lx error %x\n", __func__, gpa, error_code); 4274 + 4275 + /* This path builds a PAE pagetable, we can map 2mb pages at maximum. */ 4276 + return direct_page_fault(vcpu, gpa & PAGE_MASK, error_code, prefault, 4277 + PT_DIRECTORY_LEVEL, false); 4278 + } 4279 + 4111 4280 int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, 4112 4281 u64 fault_address, char *insn, int insn_len) 4113 4282 { 4114 4283 int r = 1; 4284 + 4285 + #ifndef CONFIG_X86_64 4286 + /* A 64-bit CR2 should be impossible on 32-bit KVM. 
*/ 4287 + if (WARN_ON_ONCE(fault_address >> 32)) 4288 + return -EFAULT; 4289 + #endif 4115 4290 4116 4291 vcpu->arch.l1tf_flush_l1d = true; 4117 4292 switch (vcpu->arch.apf.host_apf_reason) { ··· 4206 4249 } 4207 4250 EXPORT_SYMBOL_GPL(kvm_handle_page_fault); 4208 4251 4209 - static bool 4210 - check_hugepage_cache_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int level) 4211 - { 4212 - int page_num = KVM_PAGES_PER_HPAGE(level); 4213 - 4214 - gfn &= ~(page_num - 1); 4215 - 4216 - return kvm_mtrr_check_gfn_range_consistency(vcpu, gfn, page_num); 4217 - } 4218 - 4219 - static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, 4252 + static int tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, 4220 4253 bool prefault) 4221 4254 { 4222 - kvm_pfn_t pfn; 4223 - int r; 4224 - int level; 4225 - bool force_pt_level; 4226 - gfn_t gfn = gpa >> PAGE_SHIFT; 4227 - unsigned long mmu_seq; 4228 - int write = error_code & PFERR_WRITE_MASK; 4229 - bool map_writable; 4230 - bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) && 4231 - is_nx_huge_page_enabled(); 4255 + int max_level; 4232 4256 4233 - MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)); 4257 + for (max_level = PT_MAX_HUGEPAGE_LEVEL; 4258 + max_level > PT_PAGE_TABLE_LEVEL; 4259 + max_level--) { 4260 + int page_num = KVM_PAGES_PER_HPAGE(max_level); 4261 + gfn_t base = (gpa >> PAGE_SHIFT) & ~(page_num - 1); 4234 4262 4235 - if (page_fault_handle_page_track(vcpu, error_code, gfn)) 4236 - return RET_PF_EMULATE; 4237 - 4238 - r = mmu_topup_memory_caches(vcpu); 4239 - if (r) 4240 - return r; 4241 - 4242 - force_pt_level = 4243 - lpage_disallowed || 4244 - !check_hugepage_cache_consistency(vcpu, gfn, PT_DIRECTORY_LEVEL); 4245 - level = mapping_level(vcpu, gfn, &force_pt_level); 4246 - if (likely(!force_pt_level)) { 4247 - if (level > PT_DIRECTORY_LEVEL && 4248 - !check_hugepage_cache_consistency(vcpu, gfn, level)) 4249 - level = PT_DIRECTORY_LEVEL; 4250 - gfn &= ~(KVM_PAGES_PER_HPAGE(level) 
- 1); 4263 + if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num)) 4264 + break; 4251 4265 } 4252 4266 4253 - if (fast_page_fault(vcpu, gpa, level, error_code)) 4254 - return RET_PF_RETRY; 4255 - 4256 - mmu_seq = vcpu->kvm->mmu_notifier_seq; 4257 - smp_rmb(); 4258 - 4259 - if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable)) 4260 - return RET_PF_RETRY; 4261 - 4262 - if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r)) 4263 - return r; 4264 - 4265 - r = RET_PF_RETRY; 4266 - spin_lock(&vcpu->kvm->mmu_lock); 4267 - if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) 4268 - goto out_unlock; 4269 - if (make_mmu_pages_available(vcpu) < 0) 4270 - goto out_unlock; 4271 - if (likely(!force_pt_level)) 4272 - transparent_hugepage_adjust(vcpu, gfn, &pfn, &level); 4273 - r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, 4274 - prefault, lpage_disallowed); 4275 - out_unlock: 4276 - spin_unlock(&vcpu->kvm->mmu_lock); 4277 - kvm_release_pfn_clean(pfn); 4278 - return r; 4267 + return direct_page_fault(vcpu, gpa, error_code, prefault, 4268 + max_level, true); 4279 4269 } 4280 4270 4281 4271 static void nonpaging_init_context(struct kvm_vcpu *vcpu, ··· 5400 5496 } 5401 5497 EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt); 5402 5498 5403 - static int make_mmu_pages_available(struct kvm_vcpu *vcpu) 5404 - { 5405 - LIST_HEAD(invalid_list); 5406 - 5407 - if (likely(kvm_mmu_available_pages(vcpu->kvm) >= KVM_MIN_FREE_MMU_PAGES)) 5408 - return 0; 5409 - 5410 - while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES) { 5411 - if (!prepare_zap_oldest_mmu_page(vcpu->kvm, &invalid_list)) 5412 - break; 5413 - 5414 - ++vcpu->kvm->stat.mmu_recycled; 5415 - } 5416 - kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); 5417 - 5418 - if (!kvm_mmu_available_pages(vcpu->kvm)) 5419 - return -ENOSPC; 5420 - return 0; 5421 - } 5422 - 5423 - int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code, 5499 + int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, 
gpa_t cr2_or_gpa, u64 error_code, 5424 5500 void *insn, int insn_len) 5425 5501 { 5426 5502 int r, emulation_type = 0; 5427 5503 bool direct = vcpu->arch.mmu->direct_map; 5428 5504 5505 + if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa))) 5506 + return RET_PF_RETRY; 5507 + 5429 5508 /* With shadow page tables, fault_address contains a GVA or nGPA. */ 5430 5509 if (vcpu->arch.mmu->direct_map) { 5431 5510 vcpu->arch.gpa_available = true; 5432 - vcpu->arch.gpa_val = cr2; 5511 + vcpu->arch.gpa_val = cr2_or_gpa; 5433 5512 } 5434 5513 5435 5514 r = RET_PF_INVALID; 5436 5515 if (unlikely(error_code & PFERR_RSVD_MASK)) { 5437 - r = handle_mmio_page_fault(vcpu, cr2, direct); 5516 + r = handle_mmio_page_fault(vcpu, cr2_or_gpa, direct); 5438 5517 if (r == RET_PF_EMULATE) 5439 5518 goto emulate; 5440 5519 } 5441 5520 5442 5521 if (r == RET_PF_INVALID) { 5443 - r = vcpu->arch.mmu->page_fault(vcpu, cr2, 5522 + r = vcpu->arch.mmu->page_fault(vcpu, cr2_or_gpa, 5444 5523 lower_32_bits(error_code), 5445 5524 false); 5446 5525 WARN_ON(r == RET_PF_INVALID); ··· 5443 5556 */ 5444 5557 if (vcpu->arch.mmu->direct_map && 5445 5558 (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) { 5446 - kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2)); 5559 + kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa)); 5447 5560 return 1; 5448 5561 } 5449 5562 ··· 5458 5571 * explicitly shadowing L1's page tables, i.e. unprotecting something 5459 5572 * for L1 isn't going to magically fix whatever issue caused L2 to fail. 
5460 5573 */ 5461 - if (!mmio_info_in_cache(vcpu, cr2, direct) && !is_guest_mode(vcpu)) 5574 + if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu)) 5462 5575 emulation_type = EMULTYPE_ALLOW_RETRY; 5463 5576 emulate: 5464 5577 /* ··· 5473 5586 return 1; 5474 5587 } 5475 5588 5476 - return x86_emulate_instruction(vcpu, cr2, emulation_type, insn, 5589 + return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn, 5477 5590 insn_len); 5478 5591 } 5479 5592 EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); ··· 5902 6015 * mapping if the indirect sp has level = 1. 5903 6016 */ 5904 6017 if (sp->role.direct && !kvm_is_reserved_pfn(pfn) && 5905 - !kvm_is_zone_device_pfn(pfn) && 5906 - PageTransCompoundMap(pfn_to_page(pfn))) { 6018 + (kvm_is_zone_device_pfn(pfn) || 6019 + PageCompound(pfn_to_page(pfn)))) { 5907 6020 pte_list_remove(rmap_head, sptep); 5908 6021 5909 6022 if (kvm_available_flush_tlb_with_range()) ··· 6136 6249 * If reserved bit is not supported, clear the present bit to disable 6137 6250 * mmio page fault. 6138 6251 */ 6139 - if (IS_ENABLED(CONFIG_X86_64) && shadow_phys_bits == 52) 6252 + if (shadow_phys_bits == 52) 6140 6253 mask &= ~1ull; 6141 6254 6142 6255 kvm_mmu_set_mmio_spte_mask(mask, mask, ACC_WRITE_MASK | ACC_USER_MASK);
+49 -39
arch/x86/kvm/mmu/paging_tmpl.h
··· 128 128 #endif 129 129 } 130 130 131 + static bool FNAME(is_bad_mt_xwr)(struct rsvd_bits_validate *rsvd_check, u64 gpte) 132 + { 133 + #if PTTYPE != PTTYPE_EPT 134 + return false; 135 + #else 136 + return __is_bad_mt_xwr(rsvd_check, gpte); 137 + #endif 138 + } 139 + 140 + static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level) 141 + { 142 + return __is_rsvd_bits_set(&mmu->guest_rsvd_check, gpte, level) || 143 + FNAME(is_bad_mt_xwr)(&mmu->guest_rsvd_check, gpte); 144 + } 145 + 131 146 static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, 132 147 pt_element_t __user *ptep_user, unsigned index, 133 148 pt_element_t orig_pte, pt_element_t new_pte) ··· 190 175 struct kvm_mmu_page *sp, u64 *spte, 191 176 u64 gpte) 192 177 { 193 - if (is_rsvd_bits_set(vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) 194 - goto no_present; 195 - 196 178 if (!FNAME(is_present_gpte)(gpte)) 197 179 goto no_present; 198 180 199 181 /* if accessed bit is not supported prefetch non accessed gpte */ 200 182 if (PT_HAVE_ACCESSED_DIRTY(vcpu->arch.mmu) && 201 183 !(gpte & PT_GUEST_ACCESSED_MASK)) 184 + goto no_present; 185 + 186 + if (FNAME(is_rsvd_bits_set)(vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) 202 187 goto no_present; 203 188 204 189 return false; ··· 306 291 } 307 292 308 293 /* 309 - * Fetch a guest pte for a guest virtual address 294 + * Fetch a guest pte for a guest virtual address, or for an L2's GPA. 
310 295 */ 311 296 static int FNAME(walk_addr_generic)(struct guest_walker *walker, 312 297 struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, 313 - gva_t addr, u32 access) 298 + gpa_t addr, u32 access) 314 299 { 315 300 int ret; 316 301 pt_element_t pte; ··· 415 400 if (unlikely(!FNAME(is_present_gpte)(pte))) 416 401 goto error; 417 402 418 - if (unlikely(is_rsvd_bits_set(mmu, pte, walker->level))) { 403 + if (unlikely(FNAME(is_rsvd_bits_set)(mmu, pte, walker->level))) { 419 404 errcode = PFERR_RSVD_MASK | PFERR_PRESENT_MASK; 420 405 goto error; 421 406 } ··· 511 496 } 512 497 513 498 static int FNAME(walk_addr)(struct guest_walker *walker, 514 - struct kvm_vcpu *vcpu, gva_t addr, u32 access) 499 + struct kvm_vcpu *vcpu, gpa_t addr, u32 access) 515 500 { 516 501 return FNAME(walk_addr_generic)(walker, vcpu, vcpu->arch.mmu, addr, 517 502 access); ··· 626 611 * If the guest tries to write a write-protected page, we need to 627 612 * emulate this operation, return 1 to indicate this case. 628 613 */ 629 - static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, 614 + static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, 630 615 struct guest_walker *gw, 631 - int write_fault, int hlevel, 616 + int write_fault, int max_level, 632 617 kvm_pfn_t pfn, bool map_writable, bool prefault, 633 618 bool lpage_disallowed) 634 619 { 635 620 struct kvm_mmu_page *sp = NULL; 636 621 struct kvm_shadow_walk_iterator it; 637 622 unsigned direct_access, access = gw->pt_access; 638 - int top_level, ret; 639 - gfn_t gfn, base_gfn; 623 + int top_level, hlevel, ret; 624 + gfn_t base_gfn = gw->gfn; 640 625 641 626 direct_access = gw->pte_access; 642 627 ··· 652 637 if (FNAME(gpte_changed)(vcpu, gw, top_level)) 653 638 goto out_gpte_changed; 654 639 655 - if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) 640 + if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa))) 656 641 goto out_gpte_changed; 657 642 658 643 for (shadow_walk_init(&it, vcpu, addr); ··· 681 666 link_shadow_page(vcpu, it.sptep, sp); 682 
667 } 683 668 684 - /* 685 - * FNAME(page_fault) might have clobbered the bottom bits of 686 - * gw->gfn, restore them from the virtual address. 687 - */ 688 - gfn = gw->gfn | ((addr & PT_LVL_OFFSET_MASK(gw->level)) >> PAGE_SHIFT); 689 - base_gfn = gfn; 669 + hlevel = kvm_mmu_hugepage_adjust(vcpu, gw->gfn, max_level, &pfn); 690 670 691 671 trace_kvm_mmu_spte_requested(addr, gw->level, pfn); 692 672 ··· 692 682 * We cannot overwrite existing page tables with an NX 693 683 * large page, as the leaf could be executable. 694 684 */ 695 - disallowed_hugepage_adjust(it, gfn, &pfn, &hlevel); 685 + disallowed_hugepage_adjust(it, gw->gfn, &pfn, &hlevel); 696 686 697 - base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); 687 + base_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); 698 688 if (it.level == hlevel) 699 689 break; 700 690 ··· 775 765 * Returns: 1 if we need to emulate the instruction, 0 otherwise, or 776 766 * a negative value on error. 777 767 */ 778 - static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, 768 + static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, 779 769 bool prefault) 780 770 { 781 771 int write_fault = error_code & PFERR_WRITE_MASK; ··· 783 773 struct guest_walker walker; 784 774 int r; 785 775 kvm_pfn_t pfn; 786 - int level = PT_PAGE_TABLE_LEVEL; 787 776 unsigned long mmu_seq; 788 777 bool map_writable, is_self_change_mapping; 789 778 bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) && 790 779 is_nx_huge_page_enabled(); 791 - bool force_pt_level = lpage_disallowed; 780 + int max_level; 792 781 793 782 pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); 794 783 ··· 827 818 is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu, 828 819 &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable); 829 820 830 - if (walker.level >= PT_DIRECTORY_LEVEL && !is_self_change_mapping) { 831 - level = mapping_level(vcpu, walker.gfn, &force_pt_level); 832 - if 
(likely(!force_pt_level)) { 833 - level = min(walker.level, level); 834 - walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); 835 - } 836 - } else 837 - force_pt_level = true; 821 + if (lpage_disallowed || is_self_change_mapping) 822 + max_level = PT_PAGE_TABLE_LEVEL; 823 + else 824 + max_level = walker.level; 838 825 839 826 mmu_seq = vcpu->kvm->mmu_notifier_seq; 840 827 smp_rmb(); ··· 870 865 kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); 871 866 if (make_mmu_pages_available(vcpu) < 0) 872 867 goto out_unlock; 873 - if (!force_pt_level) 874 - transparent_hugepage_adjust(vcpu, walker.gfn, &pfn, &level); 875 - r = FNAME(fetch)(vcpu, addr, &walker, write_fault, 876 - level, pfn, map_writable, prefault, lpage_disallowed); 868 + r = FNAME(fetch)(vcpu, addr, &walker, write_fault, max_level, pfn, 869 + map_writable, prefault, lpage_disallowed); 877 870 kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); 878 871 879 872 out_unlock: ··· 948 945 spin_unlock(&vcpu->kvm->mmu_lock); 949 946 } 950 947 951 - static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, 948 + /* Note, @addr is a GPA when gva_to_gpa() translates an L2 GPA to an L1 GPA. */ 949 + static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gpa_t addr, u32 access, 952 950 struct x86_exception *exception) 953 951 { 954 952 struct guest_walker walker; 955 953 gpa_t gpa = UNMAPPED_GVA; 956 954 int r; 957 955 958 - r = FNAME(walk_addr)(&walker, vcpu, vaddr, access); 956 + r = FNAME(walk_addr)(&walker, vcpu, addr, access); 959 957 960 958 if (r) { 961 959 gpa = gfn_to_gpa(walker.gfn); 962 - gpa |= vaddr & ~PAGE_MASK; 960 + gpa |= addr & ~PAGE_MASK; 963 961 } else if (exception) 964 962 *exception = walker.fault; 965 963 ··· 968 964 } 969 965 970 966 #if PTTYPE != PTTYPE_EPT 971 - static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, 967 + /* Note, gva_to_gpa_nested() is only used to translate L2 GVAs. 
*/ 968 + static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gpa_t vaddr, 972 969 u32 access, 973 970 struct x86_exception *exception) 974 971 { 975 972 struct guest_walker walker; 976 973 gpa_t gpa = UNMAPPED_GVA; 977 974 int r; 975 + 976 + #ifndef CONFIG_X86_64 977 + /* A 64-bit GVA should be impossible on 32-bit KVM. */ 978 + WARN_ON_ONCE(vaddr >> 32); 979 + #endif 978 980 979 981 r = FNAME(walk_addr_nested)(&walker, vcpu, vaddr, access); 980 982
+6 -6
arch/x86/kvm/mmutrace.h
··· 249 249 250 250 TRACE_EVENT( 251 251 fast_page_fault, 252 - TP_PROTO(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code, 252 + TP_PROTO(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u32 error_code, 253 253 u64 *sptep, u64 old_spte, bool retry), 254 - TP_ARGS(vcpu, gva, error_code, sptep, old_spte, retry), 254 + TP_ARGS(vcpu, cr2_or_gpa, error_code, sptep, old_spte, retry), 255 255 256 256 TP_STRUCT__entry( 257 257 __field(int, vcpu_id) 258 - __field(gva_t, gva) 258 + __field(gpa_t, cr2_or_gpa) 259 259 __field(u32, error_code) 260 260 __field(u64 *, sptep) 261 261 __field(u64, old_spte) ··· 265 265 266 266 TP_fast_assign( 267 267 __entry->vcpu_id = vcpu->vcpu_id; 268 - __entry->gva = gva; 268 + __entry->cr2_or_gpa = cr2_or_gpa; 269 269 __entry->error_code = error_code; 270 270 __entry->sptep = sptep; 271 271 __entry->old_spte = old_spte; ··· 273 273 __entry->retry = retry; 274 274 ), 275 275 276 - TP_printk("vcpu %d gva %lx error_code %s sptep %p old %#llx" 276 + TP_printk("vcpu %d gva %llx error_code %s sptep %p old %#llx" 277 277 " new %llx spurious %d fixed %d", __entry->vcpu_id, 278 - __entry->gva, __print_flags(__entry->error_code, "|", 278 + __entry->cr2_or_gpa, __print_flags(__entry->error_code, "|", 279 279 kvm_mmu_trace_pferr_flags), __entry->sptep, 280 280 __entry->old_spte, __entry->new_spte, 281 281 __spte_satisfied(old_spte), __spte_satisfied(new_spte)
+6 -2
arch/x86/kvm/mtrr.c
··· 192 192 break; 193 193 case MSR_MTRRfix16K_80000 ... MSR_MTRRfix16K_A0000: 194 194 *seg = 1; 195 - *unit = msr - MSR_MTRRfix16K_80000; 195 + *unit = array_index_nospec( 196 + msr - MSR_MTRRfix16K_80000, 197 + MSR_MTRRfix16K_A0000 - MSR_MTRRfix16K_80000 + 1); 196 198 break; 197 199 case MSR_MTRRfix4K_C0000 ... MSR_MTRRfix4K_F8000: 198 200 *seg = 2; 199 - *unit = msr - MSR_MTRRfix4K_C0000; 201 + *unit = array_index_nospec( 202 + msr - MSR_MTRRfix4K_C0000, 203 + MSR_MTRRfix4K_F8000 - MSR_MTRRfix4K_C0000 + 1); 200 204 break; 201 205 default: 202 206 return false;
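The mtrr.c and pmu.h changes above apply the same Spectre-v1 hardening pattern mentioned in the pull description: clamp an attacker-influenced array index with `array_index_nospec()` so the CPU cannot speculatively read out of bounds even when the bounds check is mispredicted. Below is a userspace sketch of the idea, modeled on the kernel's generic fallback (x86 actually uses an asm implementation); the function names and standalone form here are illustrative, not the kernel's exact code.

```c
#include <assert.h>
#include <stdint.h>

/* All ones when index < size, all zeroes otherwise, computed without a
 * branch the CPU could mispredict: size - 1 - index underflows (setting
 * the top bit) exactly when index >= size. Assumes size <= 2^63. */
static inline uint64_t index_mask_nospec(uint64_t index, uint64_t size)
{
	return ~(int64_t)(index | (size - 1 - index)) >> 63;
}

/* Clamp an out-of-range index to 0 even under speculative execution. */
static inline uint64_t index_nospec(uint64_t index, uint64_t size)
{
	return index & index_mask_nospec(index, size);
}
```

In the mtrr.c hunk this is exactly how `msr - MSR_MTRRfix16K_80000` is bounded by the number of MSRs in the range before it is used as an array index.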
+14 -4
arch/x86/kvm/pmu.h
··· 2 2 #ifndef __KVM_X86_PMU_H 3 3 #define __KVM_X86_PMU_H 4 4 5 + #include <linux/nospec.h> 6 + 5 7 #define vcpu_to_pmu(vcpu) (&(vcpu)->arch.pmu) 6 8 #define pmu_to_vcpu(pmu) (container_of((pmu), struct kvm_vcpu, arch.pmu)) 7 9 #define pmc_to_pmu(pmc) (&(pmc)->vcpu->arch.pmu) ··· 104 102 static inline struct kvm_pmc *get_gp_pmc(struct kvm_pmu *pmu, u32 msr, 105 103 u32 base) 106 104 { 107 - if (msr >= base && msr < base + pmu->nr_arch_gp_counters) 108 - return &pmu->gp_counters[msr - base]; 105 + if (msr >= base && msr < base + pmu->nr_arch_gp_counters) { 106 + u32 index = array_index_nospec(msr - base, 107 + pmu->nr_arch_gp_counters); 108 + 109 + return &pmu->gp_counters[index]; 110 + } 109 111 110 112 return NULL; 111 113 } ··· 119 113 { 120 114 int base = MSR_CORE_PERF_FIXED_CTR0; 121 115 122 - if (msr >= base && msr < base + pmu->nr_arch_fixed_counters) 123 - return &pmu->fixed_counters[msr - base]; 116 + if (msr >= base && msr < base + pmu->nr_arch_fixed_counters) { 117 + u32 index = array_index_nospec(msr - base, 118 + pmu->nr_arch_fixed_counters); 119 + 120 + return &pmu->fixed_counters[index]; 121 + } 124 122 125 123 return NULL; 126 124 }
+75 -59
arch/x86/kvm/svm.c
··· 1307 1307 } 1308 1308 } 1309 1309 1310 + /* 1311 + * The default MMIO mask is a single bit (excluding the present bit), 1312 + * which could conflict with the memory encryption bit. Check for 1313 + * memory encryption support and override the default MMIO mask if 1314 + * memory encryption is enabled. 1315 + */ 1316 + static __init void svm_adjust_mmio_mask(void) 1317 + { 1318 + unsigned int enc_bit, mask_bit; 1319 + u64 msr, mask; 1320 + 1321 + /* If there is no memory encryption support, use existing mask */ 1322 + if (cpuid_eax(0x80000000) < 0x8000001f) 1323 + return; 1324 + 1325 + /* If memory encryption is not enabled, use existing mask */ 1326 + rdmsrl(MSR_K8_SYSCFG, msr); 1327 + if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT)) 1328 + return; 1329 + 1330 + enc_bit = cpuid_ebx(0x8000001f) & 0x3f; 1331 + mask_bit = boot_cpu_data.x86_phys_bits; 1332 + 1333 + /* Increment the mask bit if it is the same as the encryption bit */ 1334 + if (enc_bit == mask_bit) 1335 + mask_bit++; 1336 + 1337 + /* 1338 + * If the mask bit location is below 52, then some bits above the 1339 + * physical addressing limit will always be reserved, so use the 1340 + * rsvd_bits() function to generate the mask. This mask, along with 1341 + * the present bit, will be used to generate a page fault with 1342 + * PFER.RSV = 1. 1343 + * 1344 + * If the mask bit location is 52 (or above), then clear the mask. 1345 + */ 1346 + mask = (mask_bit < 52) ? 
rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0; 1347 + 1348 + kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK); 1349 + } 1350 + 1310 1351 static __init int svm_hardware_setup(void) 1311 1352 { 1312 1353 int cpu; ··· 1401 1360 sev = false; 1402 1361 } 1403 1362 } 1363 + 1364 + svm_adjust_mmio_mask(); 1404 1365 1405 1366 for_each_possible_cpu(cpu) { 1406 1367 r = svm_cpu_init(cpu); ··· 2187 2144 return ret; 2188 2145 } 2189 2146 2190 - static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) 2147 + static int svm_create_vcpu(struct kvm_vcpu *vcpu) 2191 2148 { 2192 2149 struct vcpu_svm *svm; 2193 2150 struct page *page; ··· 2196 2153 struct page *nested_msrpm_pages; 2197 2154 int err; 2198 2155 2199 - BUILD_BUG_ON_MSG(offsetof(struct vcpu_svm, vcpu) != 0, 2200 - "struct kvm_vcpu must be at offset 0 for arch usercopy region"); 2201 - 2202 - svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT); 2203 - if (!svm) { 2204 - err = -ENOMEM; 2205 - goto out; 2206 - } 2207 - 2208 - svm->vcpu.arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache, 2209 - GFP_KERNEL_ACCOUNT); 2210 - if (!svm->vcpu.arch.user_fpu) { 2211 - printk(KERN_ERR "kvm: failed to allocate kvm userspace's fpu\n"); 2212 - err = -ENOMEM; 2213 - goto free_partial_svm; 2214 - } 2215 - 2216 - svm->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, 2217 - GFP_KERNEL_ACCOUNT); 2218 - if (!svm->vcpu.arch.guest_fpu) { 2219 - printk(KERN_ERR "kvm: failed to allocate vcpu's fpu\n"); 2220 - err = -ENOMEM; 2221 - goto free_user_fpu; 2222 - } 2223 - 2224 - err = kvm_vcpu_init(&svm->vcpu, kvm, id); 2225 - if (err) 2226 - goto free_svm; 2156 + BUILD_BUG_ON(offsetof(struct vcpu_svm, vcpu) != 0); 2157 + svm = to_svm(vcpu); 2227 2158 2228 2159 err = -ENOMEM; 2229 2160 page = alloc_page(GFP_KERNEL_ACCOUNT); 2230 2161 if (!page) 2231 - goto uninit; 2162 + goto out; 2232 2163 2233 2164 msrpm_pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER); 2234 2165 if (!msrpm_pages) ··· 2239 
2222 svm->asid_generation = 0; 2240 2223 init_vmcb(svm); 2241 2224 2242 - svm_init_osvw(&svm->vcpu); 2225 + svm_init_osvw(vcpu); 2243 2226 2244 - return &svm->vcpu; 2227 + return 0; 2245 2228 2246 2229 free_page4: 2247 2230 __free_page(hsave_page); ··· 2251 2234 __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER); 2252 2235 free_page1: 2253 2236 __free_page(page); 2254 - uninit: 2255 - kvm_vcpu_uninit(&svm->vcpu); 2256 - free_svm: 2257 - kmem_cache_free(x86_fpu_cache, svm->vcpu.arch.guest_fpu); 2258 - free_user_fpu: 2259 - kmem_cache_free(x86_fpu_cache, svm->vcpu.arch.user_fpu); 2260 - free_partial_svm: 2261 - kmem_cache_free(kvm_vcpu_cache, svm); 2262 2237 out: 2263 - return ERR_PTR(err); 2238 + return err; 2264 2239 } 2265 2240 2266 2241 static void svm_clear_current_vmcb(struct vmcb *vmcb) ··· 2278 2269 __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER); 2279 2270 __free_page(virt_to_page(svm->nested.hsave)); 2280 2271 __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER); 2281 - kvm_vcpu_uninit(vcpu); 2282 - kmem_cache_free(x86_fpu_cache, svm->vcpu.arch.user_fpu); 2283 - kmem_cache_free(x86_fpu_cache, svm->vcpu.arch.guest_fpu); 2284 - kmem_cache_free(kvm_vcpu_cache, svm); 2285 2272 } 2286 2273 2287 2274 static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) ··· 4286 4281 !guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD)) 4287 4282 return 1; 4288 4283 4289 - /* The STIBP bit doesn't fault even if it's not advertised */ 4290 - if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD)) 4284 + if (data & ~kvm_spec_ctrl_valid_bits(vcpu)) 4291 4285 return 1; 4292 4286 4293 4287 svm->spec_ctrl = data; 4294 - 4295 4288 if (!data) 4296 4289 break; 4297 4290 ··· 4313 4310 4314 4311 if (data & ~PRED_CMD_IBPB) 4315 4312 return 1; 4316 - 4313 + if (!boot_cpu_has(X86_FEATURE_AMD_IBPB)) 4314 + return 1; 4317 4315 if (!data) 4318 4316 break; 4319 4317 4320 4318 wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB); 4321 - if (is_guest_mode(vcpu)) 4322 - break; 4323 4319 
set_msr_interception(svm->msrpm, MSR_IA32_PRED_CMD, 0, 1); 4324 4320 break; 4325 4321 case MSR_AMD64_VIRT_SPEC_CTRL: ··· 4521 4519 */ 4522 4520 kvm_for_each_vcpu(i, vcpu, kvm) { 4523 4521 bool m = kvm_apic_match_dest(vcpu, apic, 4524 - icrl & KVM_APIC_SHORT_MASK, 4522 + icrl & APIC_SHORT_MASK, 4525 4523 GET_APIC_DEST_FIELD(icrh), 4526 - icrl & KVM_APIC_DEST_MASK); 4524 + icrl & APIC_DEST_MASK); 4527 4525 4528 4526 if (m && !avic_vcpu_is_running(vcpu)) 4529 4527 kvm_vcpu_wake_up(vcpu); ··· 4937 4935 *info2 = control->exit_info_2; 4938 4936 } 4939 4937 4940 - static int handle_exit(struct kvm_vcpu *vcpu) 4938 + static int handle_exit(struct kvm_vcpu *vcpu, 4939 + enum exit_fastpath_completion exit_fastpath) 4941 4940 { 4942 4941 struct vcpu_svm *svm = to_svm(vcpu); 4943 4942 struct kvm_run *kvm_run = vcpu->run; ··· 4996 4993 __func__, svm->vmcb->control.exit_int_info, 4997 4994 exit_code); 4998 4995 4999 - if (exit_code >= ARRAY_SIZE(svm_exit_handlers) 4996 + if (exit_fastpath == EXIT_FASTPATH_SKIP_EMUL_INS) { 4997 + kvm_skip_emulated_instruction(vcpu); 4998 + return 1; 4999 + } else if (exit_code >= ARRAY_SIZE(svm_exit_handlers) 5000 5000 || !svm_exit_handlers[exit_code]) { 5001 5001 vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%x\n", exit_code); 5002 5002 dump_vmcb(vcpu); ··· 5919 5913 struct vcpu_svm *svm = to_svm(vcpu); 5920 5914 5921 5915 vcpu->arch.xsaves_enabled = guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && 5916 + boot_cpu_has(X86_FEATURE_XSAVE) && 5922 5917 boot_cpu_has(X86_FEATURE_XSAVES); 5923 5918 5924 5919 /* Update nrips enabled cache */ ··· 5931 5924 guest_cpuid_clear(vcpu, X86_FEATURE_X2APIC); 5932 5925 } 5933 5926 5934 - #define F(x) bit(X86_FEATURE_##x) 5927 + #define F feature_bit 5935 5928 5936 5929 static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) 5937 5930 { 5938 5931 switch (func) { 5939 5932 case 0x1: 5940 5933 if (avic) 5941 - entry->ecx &= ~bit(X86_FEATURE_X2APIC); 5934 + entry->ecx &= ~F(X2APIC); 5942 5935 
break; 5943 5936 case 0x80000001: 5944 5937 if (nested) ··· 6006 5999 static bool svm_has_wbinvd_exit(void) 6007 6000 { 6008 6001 return true; 6002 + } 6003 + 6004 + static bool svm_pku_supported(void) 6005 + { 6006 + return false; 6009 6007 } 6010 6008 6011 6009 #define PRE_EX(exit) { .exit_code = (exit), \ ··· 6198 6186 return ret; 6199 6187 } 6200 6188 6201 - static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu) 6189 + static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu, 6190 + enum exit_fastpath_completion *exit_fastpath) 6202 6191 { 6203 - 6192 + if (!is_guest_mode(vcpu) && 6193 + to_svm(vcpu)->vmcb->control.exit_code == EXIT_REASON_MSR_WRITE) 6194 + *exit_fastpath = handle_fastpath_set_msr_irqoff(vcpu); 6204 6195 } 6205 6196 6206 6197 static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu) ··· 7356 7341 .xsaves_supported = svm_xsaves_supported, 7357 7342 .umip_emulated = svm_umip_emulated, 7358 7343 .pt_supported = svm_pt_supported, 7344 + .pku_supported = svm_pku_supported, 7359 7345 7360 7346 .set_supported_cpuid = svm_set_supported_cpuid, 7361 7347
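The new `svm_adjust_mmio_mask()` above picks an MMIO SPTE mask that will not collide with the SEV memory-encryption bit. The sketch below isolates just the mask computation, with the CPUID/MSR probing replaced by plain parameters (`enc_bit`, `phys_bits`) so it can run standalone; the helper names mirror the patch but this is an illustration, not the kernel code.

```c
#include <assert.h>
#include <stdint.h>

#define PT_PRESENT_MASK 1ULL	/* bit 0 of an x86 PTE */

/* Set bits s..e inclusive, mirroring the kernel's rsvd_bits() helper. */
static uint64_t rsvd_bits(unsigned int s, unsigned int e)
{
	return ((1ULL << (e - s + 1)) - 1) << s;
}

static uint64_t mmio_mask(unsigned int enc_bit, unsigned int phys_bits)
{
	unsigned int mask_bit = phys_bits;

	/* Don't let the MMIO mask bit collide with the encryption bit. */
	if (enc_bit == mask_bit)
		mask_bit++;

	/* Bits mask_bit..51 are reserved only when mask_bit < 52; a
	 * reserved bit plus the present bit yields PFERR.RSVD = 1. */
	return mask_bit < 52 ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0;
}
```

For example, with a 48-bit physical address space and the C-bit at 47, the mask covers bits 48..51 plus present; if the C-bit sits at 48, the mask shifts up one bit; at 52 physical bits and beyond, no mask is usable and it is cleared.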
+5
arch/x86/kvm/vmx/capabilities.h
··· 145 145 SECONDARY_EXEC_DESC; 146 146 } 147 147 148 + static inline bool vmx_pku_supported(void) 149 + { 150 + return boot_cpu_has(X86_FEATURE_PKU); 151 + } 152 + 148 153 static inline bool cpu_has_vmx_rdtscp(void) 149 154 { 150 155 return vmcs_config.cpu_based_2nd_exec_ctrl &
-5
arch/x86/kvm/vmx/evmcs.c
··· 350 350 uint16_t *vmcs_version) 351 351 { 352 352 struct vcpu_vmx *vmx = to_vmx(vcpu); 353 - bool evmcs_already_enabled = vmx->nested.enlightened_vmcs_enabled; 354 353 355 354 vmx->nested.enlightened_vmcs_enabled = true; 356 355 357 356 if (vmcs_version) 358 357 *vmcs_version = nested_get_evmcs_version(vcpu); 359 - 360 - /* We don't support disabling the feature for simplicity. */ 361 - if (evmcs_already_enabled) 362 - return 0; 363 358 364 359 vmx->nested.msrs.pinbased_ctls_high &= ~EVMCS1_UNSUPPORTED_PINCTRL; 365 360 vmx->nested.msrs.entry_ctls_high &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL;
+86 -105
arch/x86/kvm/vmx/nested.c
··· 28 28 failed; \ 29 29 }) 30 30 31 - #define SET_MSR_OR_WARN(vcpu, idx, data) \ 32 - ({ \ 33 - bool failed = kvm_set_msr(vcpu, idx, data); \ 34 - if (failed) \ 35 - pr_warn_ratelimited( \ 36 - "%s cannot write MSR (0x%x, 0x%llx)\n", \ 37 - __func__, idx, data); \ 38 - failed; \ 39 - }) 40 - 41 31 /* 42 32 * Hyper-V requires all of these, so mark them as supported even though 43 33 * they are just treated the same as all-context. ··· 2162 2172 * EXEC CONTROLS 2163 2173 */ 2164 2174 exec_control = vmx_exec_control(vmx); /* L0's desires */ 2165 - exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; 2166 - exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; 2175 + exec_control &= ~CPU_BASED_INTR_WINDOW_EXITING; 2176 + exec_control &= ~CPU_BASED_NMI_WINDOW_EXITING; 2167 2177 exec_control &= ~CPU_BASED_TPR_SHADOW; 2168 2178 exec_control |= vmcs12->cpu_based_vm_exec_control; 2169 2179 ··· 2540 2550 vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested; 2541 2551 2542 2552 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && 2543 - SET_MSR_OR_WARN(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, 2544 - vmcs12->guest_ia32_perf_global_ctrl)) 2553 + WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, 2554 + vmcs12->guest_ia32_perf_global_ctrl))) 2545 2555 return -EINVAL; 2546 2556 2547 2557 kvm_rsp_write(vcpu, vmcs12->guest_rsp); ··· 2556 2566 return -EINVAL; 2557 2567 2558 2568 if (CC(!nested_cpu_has_virtual_nmis(vmcs12) && 2559 - nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING))) 2569 + nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING))) 2560 2570 return -EINVAL; 2561 2571 2562 2572 return 0; ··· 2813 2823 CC(vmcs12->host_ss_selector == 0 && !ia32e)) 2814 2824 return -EINVAL; 2815 2825 2816 - #ifdef CONFIG_X86_64 2817 2826 if (CC(is_noncanonical_address(vmcs12->host_fs_base, vcpu)) || 2818 2827 CC(is_noncanonical_address(vmcs12->host_gs_base, vcpu)) || 2819 2828 CC(is_noncanonical_address(vmcs12->host_gdtr_base, vcpu)) || ··· 2820 2831 
CC(is_noncanonical_address(vmcs12->host_tr_base, vcpu)) || 2821 2832 CC(is_noncanonical_address(vmcs12->host_rip, vcpu))) 2822 2833 return -EINVAL; 2823 - #endif 2824 2834 2825 2835 /* 2826 2836 * If the load IA32_EFER VM-exit control is 1, bits reserved in the ··· 2885 2897 2886 2898 if (CC(!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0)) || 2887 2899 CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))) 2900 + return -EINVAL; 2901 + 2902 + if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) && 2903 + CC(!kvm_dr7_valid(vmcs12->guest_dr7))) 2888 2904 return -EINVAL; 2889 2905 2890 2906 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) && ··· 3040 3048 return 0; 3041 3049 } 3042 3050 3043 - static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, 3044 - struct vmcs12 *vmcs12); 3045 - 3046 3051 static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) 3047 3052 { 3048 3053 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); ··· 3172 3183 u32 exit_qual; 3173 3184 3174 3185 evaluate_pending_interrupts = exec_controls_get(vmx) & 3175 - (CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING); 3186 + (CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING); 3176 3187 if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu)) 3177 3188 evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu); 3178 3189 ··· 3219 3230 } 3220 3231 3221 3232 enter_guest_mode(vcpu); 3222 - if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) 3233 + if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING) 3223 3234 vcpu->arch.tsc_offset += vmcs12->tsc_offset; 3224 3235 3225 3236 if (prepare_vmcs02(vcpu, vmcs12, &exit_qual)) ··· 3283 3294 * 26.7 "VM-entry failures during or after loading guest state". 
3284 3295 */ 3285 3296 vmentry_fail_vmexit_guest_mode: 3286 - if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) 3297 + if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING) 3287 3298 vcpu->arch.tsc_offset -= vmcs12->tsc_offset; 3288 3299 leave_guest_mode(vcpu); 3289 3300 ··· 3396 3407 */ 3397 3408 if ((vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) && 3398 3409 !(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) && 3399 - !(vmcs12->cpu_based_vm_exec_control & CPU_BASED_VIRTUAL_NMI_PENDING) && 3400 - !((vmcs12->cpu_based_vm_exec_control & CPU_BASED_VIRTUAL_INTR_PENDING) && 3410 + !(vmcs12->cpu_based_vm_exec_control & CPU_BASED_NMI_WINDOW_EXITING) && 3411 + !((vmcs12->cpu_based_vm_exec_control & CPU_BASED_INTR_WINDOW_EXITING) && 3401 3412 (vmcs12->guest_rflags & X86_EFLAGS_IF))) { 3402 3413 vmx->nested.nested_run_pending = 0; 3403 3414 return kvm_vcpu_halt(vcpu); ··· 3416 3427 3417 3428 /* 3418 3429 * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date 3419 - * because L2 may have changed some cr0 bits directly (CRO_GUEST_HOST_MASK). 3430 + * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK). 3420 3431 * This function returns the new value we should put in vmcs12.guest_cr0. 3421 3432 * It's not enough to just return the vmcs02 GUEST_CR0. Rather, 3422 3433 * 1. 
Bits that neither L0 nor L1 trapped, were set directly by L2 and are now ··· 3988 3999 vcpu->arch.pat = vmcs12->host_ia32_pat; 3989 4000 } 3990 4001 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) 3991 - SET_MSR_OR_WARN(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, 3992 - vmcs12->host_ia32_perf_global_ctrl); 4002 + WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, 4003 + vmcs12->host_ia32_perf_global_ctrl)); 3993 4004 3994 4005 /* Set L1 segment info according to Intel SDM 3995 4006 27.5.2 Loading Host Segment and Descriptor-Table Registers */ ··· 4198 4209 if (nested_cpu_has_preemption_timer(vmcs12)) 4199 4210 hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); 4200 4211 4201 - if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) 4212 + if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING) 4202 4213 vcpu->arch.tsc_offset -= vmcs12->tsc_offset; 4203 4214 4204 4215 if (likely(!vmx->fail)) { ··· 4740 4751 4741 4752 static int handle_vmread(struct kvm_vcpu *vcpu) 4742 4753 { 4743 - unsigned long field; 4744 - u64 field_value; 4754 + struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) 4755 + : get_vmcs12(vcpu); 4745 4756 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 4746 - u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 4747 - int len; 4748 - gva_t gva = 0; 4749 - struct vmcs12 *vmcs12; 4757 + u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 4758 + struct vcpu_vmx *vmx = to_vmx(vcpu); 4750 4759 struct x86_exception e; 4760 + unsigned long field; 4761 + u64 value; 4762 + gva_t gva = 0; 4751 4763 short offset; 4764 + int len; 4752 4765 4753 4766 if (!nested_vmx_check_permission(vcpu)) 4754 4767 return 1; 4755 4768 4756 - if (to_vmx(vcpu)->nested.current_vmptr == -1ull) 4769 + /* 4770 + * In VMX non-root operation, when the VMCS-link pointer is -1ull, 4771 + * any VMREAD sets the ALU flags for VMfailInvalid. 
4772 + */ 4773 + if (vmx->nested.current_vmptr == -1ull || 4774 + (is_guest_mode(vcpu) && 4775 + get_vmcs12(vcpu)->vmcs_link_pointer == -1ull)) 4757 4776 return nested_vmx_failInvalid(vcpu); 4758 4777 4759 - if (!is_guest_mode(vcpu)) 4760 - vmcs12 = get_vmcs12(vcpu); 4761 - else { 4762 - /* 4763 - * When vmcs->vmcs_link_pointer is -1ull, any VMREAD 4764 - * to shadowed-field sets the ALU flags for VMfailInvalid. 4765 - */ 4766 - if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) 4767 - return nested_vmx_failInvalid(vcpu); 4768 - vmcs12 = get_shadow_vmcs12(vcpu); 4769 - } 4770 - 4771 4778 /* Decode instruction info and find the field to read */ 4772 - field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); 4779 + field = kvm_register_readl(vcpu, (((instr_info) >> 28) & 0xf)); 4773 4780 4774 4781 offset = vmcs_field_to_offset(field); 4775 4782 if (offset < 0) ··· 4775 4790 if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field)) 4776 4791 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 4777 4792 4778 - /* Read the field, zero-extended to a u64 field_value */ 4779 - field_value = vmcs12_read_any(vmcs12, field, offset); 4793 + /* Read the field, zero-extended to a u64 value */ 4794 + value = vmcs12_read_any(vmcs12, field, offset); 4780 4795 4781 4796 /* 4782 4797 * Now copy part of this value to register or memory, as requested. 4783 4798 * Note that the number of bits actually copied is 32 or 64 depending 4784 4799 * on the guest's mode (32 or 64 bit), not on the given field's length. 4785 4800 */ 4786 - if (vmx_instruction_info & (1u << 10)) { 4787 - kvm_register_writel(vcpu, (((vmx_instruction_info) >> 3) & 0xf), 4788 - field_value); 4801 + if (instr_info & BIT(10)) { 4802 + kvm_register_writel(vcpu, (((instr_info) >> 3) & 0xf), value); 4789 4803 } else { 4790 4804 len = is_64_bit_mode(vcpu) ? 
8 : 4; 4791 4805 if (get_vmx_mem_address(vcpu, exit_qualification, 4792 - vmx_instruction_info, true, len, &gva)) 4806 + instr_info, true, len, &gva)) 4793 4807 return 1; 4794 4808 /* _system ok, nested_vmx_check_permission has verified cpl=0 */ 4795 - if (kvm_write_guest_virt_system(vcpu, gva, &field_value, len, &e)) 4809 + if (kvm_write_guest_virt_system(vcpu, gva, &value, len, &e)) { 4796 4810 kvm_inject_page_fault(vcpu, &e); 4811 + return 1; 4812 + } 4797 4813 } 4798 4814 4799 4815 return nested_vmx_succeed(vcpu); ··· 4826 4840 4827 4841 static int handle_vmwrite(struct kvm_vcpu *vcpu) 4828 4842 { 4829 - unsigned long field; 4830 - int len; 4831 - gva_t gva; 4832 - struct vcpu_vmx *vmx = to_vmx(vcpu); 4843 + struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) 4844 + : get_vmcs12(vcpu); 4833 4845 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 4834 - u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 4846 + u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 4847 + struct vcpu_vmx *vmx = to_vmx(vcpu); 4848 + struct x86_exception e; 4849 + unsigned long field; 4850 + short offset; 4851 + gva_t gva; 4852 + int len; 4835 4853 4836 - /* The value to write might be 32 or 64 bits, depending on L1's long 4854 + /* 4855 + * The value to write might be 32 or 64 bits, depending on L1's long 4837 4856 * mode, and eventually we need to write that into a field of several 4838 4857 * possible lengths. The code below first zero-extends the value to 64 4839 - * bit (field_value), and then copies only the appropriate number of 4858 + * bit (value), and then copies only the appropriate number of 4840 4859 * bits into the vmcs12 field. 
4841 4860 */ 4842 - u64 field_value = 0; 4843 - struct x86_exception e; 4844 - struct vmcs12 *vmcs12; 4845 - short offset; 4861 + u64 value = 0; 4846 4862 4847 4863 if (!nested_vmx_check_permission(vcpu)) 4848 4864 return 1; 4849 4865 4850 - if (vmx->nested.current_vmptr == -1ull) 4866 + /* 4867 + * In VMX non-root operation, when the VMCS-link pointer is -1ull, 4868 + * any VMWRITE sets the ALU flags for VMfailInvalid. 4869 + */ 4870 + if (vmx->nested.current_vmptr == -1ull || 4871 + (is_guest_mode(vcpu) && 4872 + get_vmcs12(vcpu)->vmcs_link_pointer == -1ull)) 4851 4873 return nested_vmx_failInvalid(vcpu); 4852 4874 4853 - if (vmx_instruction_info & (1u << 10)) 4854 - field_value = kvm_register_readl(vcpu, 4855 - (((vmx_instruction_info) >> 3) & 0xf)); 4875 + if (instr_info & BIT(10)) 4876 + value = kvm_register_readl(vcpu, (((instr_info) >> 3) & 0xf)); 4856 4877 else { 4857 4878 len = is_64_bit_mode(vcpu) ? 8 : 4; 4858 4879 if (get_vmx_mem_address(vcpu, exit_qualification, 4859 - vmx_instruction_info, false, len, &gva)) 4880 + instr_info, false, len, &gva)) 4860 4881 return 1; 4861 - if (kvm_read_guest_virt(vcpu, gva, &field_value, len, &e)) { 4882 + if (kvm_read_guest_virt(vcpu, gva, &value, len, &e)) { 4862 4883 kvm_inject_page_fault(vcpu, &e); 4863 4884 return 1; 4864 4885 } 4865 4886 } 4866 4887 4888 + field = kvm_register_readl(vcpu, (((instr_info) >> 28) & 0xf)); 4867 4889 4868 - field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); 4890 + offset = vmcs_field_to_offset(field); 4891 + if (offset < 0) 4892 + return nested_vmx_failValid(vcpu, 4893 + VMXERR_UNSUPPORTED_VMCS_COMPONENT); 4894 + 4869 4895 /* 4870 4896 * If the vCPU supports "VMWRITE to any supported field in the 4871 4897 * VMCS," then the "read-only" fields are actually read/write. 
··· 4887 4889 return nested_vmx_failValid(vcpu, 4888 4890 VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); 4889 4891 4890 - if (!is_guest_mode(vcpu)) { 4891 - vmcs12 = get_vmcs12(vcpu); 4892 - 4893 - /* 4894 - * Ensure vmcs12 is up-to-date before any VMWRITE that dirties 4895 - * vmcs12, else we may crush a field or consume a stale value. 4896 - */ 4897 - if (!is_shadow_field_rw(field)) 4898 - copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 4899 - } else { 4900 - /* 4901 - * When vmcs->vmcs_link_pointer is -1ull, any VMWRITE 4902 - * to shadowed-field sets the ALU flags for VMfailInvalid. 4903 - */ 4904 - if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) 4905 - return nested_vmx_failInvalid(vcpu); 4906 - vmcs12 = get_shadow_vmcs12(vcpu); 4907 - } 4908 - 4909 - offset = vmcs_field_to_offset(field); 4910 - if (offset < 0) 4911 - return nested_vmx_failValid(vcpu, 4912 - VMXERR_UNSUPPORTED_VMCS_COMPONENT); 4892 + /* 4893 + * Ensure vmcs12 is up-to-date before any VMWRITE that dirties 4894 + * vmcs12, else we may crush a field or consume a stale value. 4895 + */ 4896 + if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) 4897 + copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 4913 4898 4914 4899 /* 4915 4900 * Some Intel CPUs intentionally drop the reserved bits of the AR byte ··· 4903 4922 * the stripped down value, L2 sees the full value as stored by KVM). 
4904 4923 */ 4905 4924 if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES) 4906 - field_value &= 0x1f0ff; 4925 + value &= 0x1f0ff; 4907 4926 4908 - vmcs12_write_any(vmcs12, field, offset, field_value); 4927 + vmcs12_write_any(vmcs12, field, offset, value); 4909 4928 4910 4929 /* 4911 4930 * Do not track vmcs12 dirty-state if in guest-mode as we actually ··· 4922 4941 preempt_disable(); 4923 4942 vmcs_load(vmx->vmcs01.shadow_vmcs); 4924 4943 4925 - __vmcs_writel(field, field_value); 4944 + __vmcs_writel(field, value); 4926 4945 4927 4946 vmcs_clear(vmx->vmcs01.shadow_vmcs); 4928 4947 vmcs_load(vmx->loaded_vmcs->vmcs); ··· 5505 5524 return false; 5506 5525 case EXIT_REASON_TRIPLE_FAULT: 5507 5526 return true; 5508 - case EXIT_REASON_PENDING_INTERRUPT: 5509 - return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_INTR_PENDING); 5527 + case EXIT_REASON_INTERRUPT_WINDOW: 5528 + return nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING); 5510 5529 case EXIT_REASON_NMI_WINDOW: 5511 - return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING); 5530 + return nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING); 5512 5531 case EXIT_REASON_TASK_SWITCH: 5513 5532 return true; 5514 5533 case EXIT_REASON_CPUID: ··· 5996 6015 msrs->procbased_ctls_low = 5997 6016 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 5998 6017 msrs->procbased_ctls_high &= 5999 - CPU_BASED_VIRTUAL_INTR_PENDING | 6000 - CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING | 6018 + CPU_BASED_INTR_WINDOW_EXITING | 6019 + CPU_BASED_NMI_WINDOW_EXITING | CPU_BASED_USE_TSC_OFFSETTING | 6001 6020 CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING | 6002 6021 CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING | 6003 6022 CPU_BASED_CR3_STORE_EXITING |
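The reworked `handle_vmread()`/`handle_vmwrite()` above both decode the same VMX-instruction-information field: bit 10 selects a register (rather than memory) data operand, bits 3..6 name that register, and bits 28..31 name the register holding the VMCS field encoding. A minimal sketch of that decode, with an illustrative struct and function name (the kernel does the shifts inline):

```c
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

struct vmx_instr_operands {
	bool reg_operand;	/* bit 10: 1 = register, 0 = memory */
	unsigned int reg2;	/* bits 3..6: data register */
	unsigned int reg1;	/* bits 28..31: register holding the field */
};

static struct vmx_instr_operands decode_instr_info(uint32_t instr_info)
{
	struct vmx_instr_operands op = {
		.reg_operand = instr_info & (1u << 10),
		.reg2 = (instr_info >> 3) & 0xf,
		.reg1 = (instr_info >> 28) & 0xf,
	};
	return op;
}
```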
+16 -8
arch/x86/kvm/vmx/pmu_intel.c
···
 
 static unsigned intel_find_fixed_event(int idx)
 {
-	if (idx >= ARRAY_SIZE(fixed_pmc_events))
+	u32 event;
+	size_t size = ARRAY_SIZE(fixed_pmc_events);
+
+	if (idx >= size)
 		return PERF_COUNT_HW_MAX;
 
-	return intel_arch_events[fixed_pmc_events[idx]].event_type;
+	event = fixed_pmc_events[array_index_nospec(idx, size)];
+	return intel_arch_events[event].event_type;
 }
 
 /* check if a PMC is enabled by comparing it with globl_ctrl bits. */
···
 	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
 	bool fixed = idx & (1u << 30);
 	struct kvm_pmc *counters;
+	unsigned int num_counters;
 
 	idx &= ~(3u << 30);
-	if (!fixed && idx >= pmu->nr_arch_gp_counters)
+	if (fixed) {
+		counters = pmu->fixed_counters;
+		num_counters = pmu->nr_arch_fixed_counters;
+	} else {
+		counters = pmu->gp_counters;
+		num_counters = pmu->nr_arch_gp_counters;
+	}
+	if (idx >= num_counters)
 		return NULL;
-	if (fixed && idx >= pmu->nr_arch_fixed_counters)
-		return NULL;
-	counters = fixed ? pmu->fixed_counters : pmu->gp_counters;
 	*mask &= pmu->counter_bitmask[fixed ? KVM_PMC_FIXED : KVM_PMC_GP];
-
-	return &counters[idx];
+	return &counters[array_index_nospec(idx, num_counters)];
 }
 
 static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
+2 -2
arch/x86/kvm/vmx/vmcs_shadow_fields.h
···
  *
  * When adding or removing fields here, note that shadowed
  * fields must always be synced by prepare_vmcs02, not just
- * prepare_vmcs02_full.
+ * prepare_vmcs02_rare.
  */
 
 /*
  * Keeping the fields ordered by size is an attempt at improving
- * branch prediction in vmcs_read_any and vmcs_write_any.
+ * branch prediction in vmcs12_read_any and vmcs12_write_any.
  */
 
 /* 16-bits */
+142 -158
arch/x86/kvm/vmx/vmx.c
···
 }
 #endif
 
+static inline bool pt_can_write_msr(struct vcpu_vmx *vmx)
+{
+	return (pt_mode == PT_MODE_HOST_GUEST) &&
+	       !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
+}
+
 static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range)
 {
 	u32 i;
···
 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 
 	if (is_guest_mode(vcpu) &&
-	    (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING))
+	    (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING))
 		return vcpu->arch.tsc_offset - vmcs12->tsc_offset;
 
 	return vcpu->arch.tsc_offset;
···
 	 * to the newly set TSC to get L2's TSC.
 	 */
 	if (is_guest_mode(vcpu) &&
-	    (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING))
+	    (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING))
 		g_tsc_offset = vmcs12->tsc_offset;
 
 	trace_kvm_write_tsc_offset(vcpu->vcpu_id,
···
 	default:
 		return 1;
 	}
-
-	return 0;
 }
 
 /*
···
 }
 
 /*
- * Writes msr value into into the appropriate "register".
+ * Writes msr value into the appropriate "register".
  * Returns 0 on success, non-0 otherwise.
  * Assumes vcpu_load() was already called.
  */
···
 	    !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
 		return 1;
 
-	/* The STIBP bit doesn't fault even if it's not advertised */
-	if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD))
+	if (data & ~kvm_spec_ctrl_valid_bits(vcpu))
 		return 1;
 
 	vmx->spec_ctrl = data;
-
 	if (!data)
 		break;
···
 	 *
 	 * For nested:
 	 * The handling of the MSR bitmap for L2 guests is done in
-	 * nested_vmx_merge_msr_bitmap. We should not touch the
+	 * nested_vmx_prepare_msr_bitmap. We should not touch the
 	 * vmcs02.msr_bitmap here since it gets completely overwritten
 	 * in the merging. We update the vmcs01 here for L1 as well
 	 * since it will end up touching the MSR anyway now.
···
 
 	if (data & ~PRED_CMD_IBPB)
 		return 1;
-
+	if (!boot_cpu_has(X86_FEATURE_SPEC_CTRL))
+		return 1;
 	if (!data)
 		break;
···
 	 *
 	 * For nested:
 	 * The handling of the MSR bitmap for L2 guests is done in
-	 * nested_vmx_merge_msr_bitmap. We should not touch the
+	 * nested_vmx_prepare_msr_bitmap. We should not touch the
 	 * vmcs02.msr_bitmap here since it gets completely overwritten
 	 * in the merging.
 	 */
···
 		pt_update_intercept_for_msr(vmx);
 		break;
 	case MSR_IA32_RTIT_STATUS:
-		if ((pt_mode != PT_MODE_HOST_GUEST) ||
-			(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) ||
-			(data & MSR_IA32_RTIT_STATUS_MASK))
+		if (!pt_can_write_msr(vmx))
+			return 1;
+		if (data & MSR_IA32_RTIT_STATUS_MASK)
 			return 1;
 		vmx->pt_desc.guest.status = data;
 		break;
 	case MSR_IA32_RTIT_CR3_MATCH:
-		if ((pt_mode != PT_MODE_HOST_GUEST) ||
-			(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) ||
-			!intel_pt_validate_cap(vmx->pt_desc.caps,
-						PT_CAP_cr3_filtering))
+		if (!pt_can_write_msr(vmx))
+			return 1;
+		if (!intel_pt_validate_cap(vmx->pt_desc.caps,
+					   PT_CAP_cr3_filtering))
 			return 1;
 		vmx->pt_desc.guest.cr3_match = data;
 		break;
 	case MSR_IA32_RTIT_OUTPUT_BASE:
-		if ((pt_mode != PT_MODE_HOST_GUEST) ||
-			(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) ||
-			(!intel_pt_validate_cap(vmx->pt_desc.caps,
-					PT_CAP_topa_output) &&
-			 !intel_pt_validate_cap(vmx->pt_desc.caps,
-					PT_CAP_single_range_output)) ||
-			(data & MSR_IA32_RTIT_OUTPUT_BASE_MASK))
+		if (!pt_can_write_msr(vmx))
+			return 1;
+		if (!intel_pt_validate_cap(vmx->pt_desc.caps,
+					   PT_CAP_topa_output) &&
+		    !intel_pt_validate_cap(vmx->pt_desc.caps,
+					   PT_CAP_single_range_output))
+			return 1;
+		if (data & MSR_IA32_RTIT_OUTPUT_BASE_MASK)
 			return 1;
 		vmx->pt_desc.guest.output_base = data;
 		break;
 	case MSR_IA32_RTIT_OUTPUT_MASK:
-		if ((pt_mode != PT_MODE_HOST_GUEST) ||
-			(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) ||
-			(!intel_pt_validate_cap(vmx->pt_desc.caps,
-					PT_CAP_topa_output) &&
-			 !intel_pt_validate_cap(vmx->pt_desc.caps,
-					PT_CAP_single_range_output)))
+		if (!pt_can_write_msr(vmx))
+			return 1;
+		if (!intel_pt_validate_cap(vmx->pt_desc.caps,
+					   PT_CAP_topa_output) &&
+		    !intel_pt_validate_cap(vmx->pt_desc.caps,
+					   PT_CAP_single_range_output))
 			return 1;
 		vmx->pt_desc.guest.output_mask = data;
 		break;
 	case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
+		if (!pt_can_write_msr(vmx))
+			return 1;
 		index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
-		if ((pt_mode != PT_MODE_HOST_GUEST) ||
-			(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) ||
-			(index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps,
-					PT_CAP_num_address_ranges)))
+		if (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps,
+						       PT_CAP_num_address_ranges))
+			return 1;
+		if (is_noncanonical_address(data, vcpu))
 			return 1;
 		if (index % 2)
 			vmx->pt_desc.guest.addr_b[index / 2] = data;
···
 		CPU_BASED_CR3_STORE_EXITING |
 		CPU_BASED_UNCOND_IO_EXITING |
 		CPU_BASED_MOV_DR_EXITING |
-		CPU_BASED_USE_TSC_OFFSETING |
+		CPU_BASED_USE_TSC_OFFSETTING |
 		CPU_BASED_MWAIT_EXITING |
 		CPU_BASED_MONITOR_EXITING |
 		CPU_BASED_INVLPG_EXITING |
···
 	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
 
 	vmx->rmode.vm86_active = 0;
-
-	vmx_segment_cache_clear(vmx);
 
 	vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
···
 static int init_rmode_identity_map(struct kvm *kvm)
 {
 	struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
-	int i, idx, r = 0;
+	int i, r = 0;
 	kvm_pfn_t identity_map_pfn;
 	u32 tmp;
···
 	mutex_lock(&kvm->slots_lock);
 
 	if (likely(kvm_vmx->ept_identity_pagetable_done))
-		goto out2;
+		goto out;
 
 	if (!kvm_vmx->ept_identity_map_addr)
 		kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR;
···
 	r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
 				    kvm_vmx->ept_identity_map_addr, PAGE_SIZE);
 	if (r < 0)
-		goto out2;
+		goto out;
 
-	idx = srcu_read_lock(&kvm->srcu);
 	r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE);
 	if (r < 0)
 		goto out;
···
 	kvm_vmx->ept_identity_pagetable_done = true;
 
 out:
-	srcu_read_unlock(&kvm->srcu, idx);
-
-out2:
 	mutex_unlock(&kvm->slots_lock);
 	return r;
 }
···
 	if (vmx_xsaves_supported()) {
 		/* Exposing XSAVES only when XSAVE is exposed */
 		bool xsaves_enabled =
+			boot_cpu_has(X86_FEATURE_XSAVE) &&
 			guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
 			guest_cpuid_has(vcpu, X86_FEATURE_XSAVES);
···
 
 static void enable_irq_window(struct kvm_vcpu *vcpu)
 {
-	exec_controls_setbit(to_vmx(vcpu), CPU_BASED_VIRTUAL_INTR_PENDING);
+	exec_controls_setbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);
 }
 
 static void enable_nmi_window(struct kvm_vcpu *vcpu)
···
 		return;
 	}
 
-	exec_controls_setbit(to_vmx(vcpu), CPU_BASED_VIRTUAL_NMI_PENDING);
+	exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
 }
 
 static void vmx_inject_irq(struct kvm_vcpu *vcpu)
···
 	if (enable_unrestricted_guest)
 		return 0;
 
-	ret = x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr,
-				    PAGE_SIZE * 3);
+	mutex_lock(&kvm->slots_lock);
+	ret = __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr,
+				      PAGE_SIZE * 3);
+	mutex_unlock(&kvm->slots_lock);
+
 	if (ret)
 		return ret;
 	to_kvm_vmx(kvm)->tss_addr = addr;
···
 
 static int handle_interrupt_window(struct kvm_vcpu *vcpu)
 {
-	exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_VIRTUAL_INTR_PENDING);
+	exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);
 
 	kvm_make_request(KVM_REQ_EVENT, vcpu);
···
 static int handle_nmi_window(struct kvm_vcpu *vcpu)
 {
 	WARN_ON_ONCE(!enable_vnmi);
-	exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_VIRTUAL_NMI_PENDING);
+	exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
 	++vcpu->stat.nmi_window_exits;
 	kvm_make_request(KVM_REQ_EVENT, vcpu);
···
 	WARN_ON_ONCE(vmx->emulation_required && vmx->nested.nested_run_pending);
 
 	intr_window_requested = exec_controls_get(vmx) &
-				CPU_BASED_VIRTUAL_INTR_PENDING;
+				CPU_BASED_INTR_WINDOW_EXITING;
 
 	while (vmx->emulation_required && count-- != 0) {
 		if (intr_window_requested && vmx_interrupt_allowed(vcpu))
···
 	[EXIT_REASON_CPUID]		      = kvm_emulate_cpuid,
 	[EXIT_REASON_MSR_READ]		      = kvm_emulate_rdmsr,
 	[EXIT_REASON_MSR_WRITE]		      = kvm_emulate_wrmsr,
-	[EXIT_REASON_PENDING_INTERRUPT]	      = handle_interrupt_window,
+	[EXIT_REASON_INTERRUPT_WINDOW]	      = handle_interrupt_window,
 	[EXIT_REASON_HLT]		      = kvm_emulate_halt,
 	[EXIT_REASON_INVD]		      = handle_invd,
 	[EXIT_REASON_INVLPG]		      = handle_invlpg,
···
  * The guest has exited.  See if we can fix it or if we need userspace
  * assistance.
  */
-static int vmx_handle_exit(struct kvm_vcpu *vcpu)
+static int vmx_handle_exit(struct kvm_vcpu *vcpu,
+	enum exit_fastpath_completion exit_fastpath)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	u32 exit_reason = vmx->exit_reason;
···
 		}
 	}
 
-	if (exit_reason < kvm_vmx_max_exit_handlers
-	    && kvm_vmx_exit_handlers[exit_reason]) {
-#ifdef CONFIG_RETPOLINE
-		if (exit_reason == EXIT_REASON_MSR_WRITE)
-			return kvm_emulate_wrmsr(vcpu);
-		else if (exit_reason == EXIT_REASON_PREEMPTION_TIMER)
-			return handle_preemption_timer(vcpu);
-		else if (exit_reason == EXIT_REASON_PENDING_INTERRUPT)
-			return handle_interrupt_window(vcpu);
-		else if (exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
-			return handle_external_interrupt(vcpu);
-		else if (exit_reason == EXIT_REASON_HLT)
-			return kvm_emulate_halt(vcpu);
-		else if (exit_reason == EXIT_REASON_EPT_MISCONFIG)
-			return handle_ept_misconfig(vcpu);
-#endif
-		return kvm_vmx_exit_handlers[exit_reason](vcpu);
-	} else {
-		vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n",
-				exit_reason);
-		dump_vmcs();
-		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
-		vcpu->run->internal.suberror =
-			KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
-		vcpu->run->internal.ndata = 1;
-		vcpu->run->internal.data[0] = exit_reason;
-		return 0;
+	if (exit_fastpath == EXIT_FASTPATH_SKIP_EMUL_INS) {
+		kvm_skip_emulated_instruction(vcpu);
+		return 1;
 	}
+
+	if (exit_reason >= kvm_vmx_max_exit_handlers)
+		goto unexpected_vmexit;
+#ifdef CONFIG_RETPOLINE
+	if (exit_reason == EXIT_REASON_MSR_WRITE)
+		return kvm_emulate_wrmsr(vcpu);
+	else if (exit_reason == EXIT_REASON_PREEMPTION_TIMER)
+		return handle_preemption_timer(vcpu);
+	else if (exit_reason == EXIT_REASON_INTERRUPT_WINDOW)
+		return handle_interrupt_window(vcpu);
+	else if (exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
+		return handle_external_interrupt(vcpu);
+	else if (exit_reason == EXIT_REASON_HLT)
+		return kvm_emulate_halt(vcpu);
+	else if (exit_reason == EXIT_REASON_EPT_MISCONFIG)
+		return handle_ept_misconfig(vcpu);
+#endif
+
+	exit_reason = array_index_nospec(exit_reason,
+					 kvm_vmx_max_exit_handlers);
+	if (!kvm_vmx_exit_handlers[exit_reason])
+		goto unexpected_vmexit;
+
+	return kvm_vmx_exit_handlers[exit_reason](vcpu);
+
+unexpected_vmexit:
+	vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n", exit_reason);
+	dump_vmcs();
+	vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+	vcpu->run->internal.suberror =
+			KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
+	vcpu->run->internal.ndata = 1;
+	vcpu->run->internal.data[0] = exit_reason;
+	return 0;
 }
 
 /*
···
 }
 STACK_FRAME_NON_STANDARD(handle_external_interrupt_irqoff);
 
-static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
+static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu,
+	enum exit_fastpath_completion *exit_fastpath)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
···
 		handle_external_interrupt_irqoff(vcpu);
 	else if (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI)
 		handle_exception_nmi_irqoff(vmx);
+	else if (!is_guest_mode(vcpu) &&
+		vmx->exit_reason == EXIT_REASON_MSR_WRITE)
+		*exit_fastpath = handle_fastpath_set_msr_irqoff(vcpu);
 }
 
 static bool vmx_has_emulated_msr(int index)
···
 	free_vpid(vmx->vpid);
 	nested_vmx_free_vcpu(vcpu);
 	free_loaded_vmcs(vmx->loaded_vmcs);
-	kvm_vcpu_uninit(vcpu);
-	kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.user_fpu);
-	kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.guest_fpu);
-	kmem_cache_free(kvm_vcpu_cache, vmx);
 }
 
-static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
+static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
 {
-	int err;
 	struct vcpu_vmx *vmx;
 	unsigned long *msr_bitmap;
-	int i, cpu;
+	int i, cpu, err;
 
-	BUILD_BUG_ON_MSG(offsetof(struct vcpu_vmx, vcpu) != 0,
-		"struct kvm_vcpu must be at offset 0 for arch usercopy region");
-
-	vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT);
-	if (!vmx)
-		return ERR_PTR(-ENOMEM);
-
-	vmx->vcpu.arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache,
-			GFP_KERNEL_ACCOUNT);
-	if (!vmx->vcpu.arch.user_fpu) {
-		printk(KERN_ERR "kvm: failed to allocate kvm userspace's fpu\n");
-		err = -ENOMEM;
-		goto free_partial_vcpu;
-	}
-
-	vmx->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache,
-			GFP_KERNEL_ACCOUNT);
-	if (!vmx->vcpu.arch.guest_fpu) {
-		printk(KERN_ERR "kvm: failed to allocate vcpu's fpu\n");
-		err = -ENOMEM;
-		goto free_user_fpu;
-	}
-
-	vmx->vpid = allocate_vpid();
-
-	err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
-	if (err)
-		goto free_vcpu;
+	BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0);
+	vmx = to_vmx(vcpu);
 
 	err = -ENOMEM;
+
+	vmx->vpid = allocate_vpid();
 
 	/*
 	 * If PML is turned on, failure on enabling PML just results in failure
 	 * of creating the vcpu, therefore we can simplify PML logic (by
 	 * avoiding dealing with cases, such as enabling PML partially on vcpus
-	 * for the guest, etc.
+	 * for the guest), etc.
 	 */
 	if (enable_pml) {
 		vmx->pml_pg = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
 		if (!vmx->pml_pg)
-			goto uninit_vcpu;
+			goto free_vpid;
 	}
 
 	BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) != NR_SHARED_MSRS);
···
 	vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
 	vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
 	vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
-	if (kvm_cstate_in_guest(kvm)) {
+	if (kvm_cstate_in_guest(vcpu->kvm)) {
 		vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C1_RES, MSR_TYPE_R);
 		vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R);
 		vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
···
 
 	vmx->loaded_vmcs = &vmx->vmcs01;
 	cpu = get_cpu();
-	vmx_vcpu_load(&vmx->vcpu, cpu);
-	vmx->vcpu.cpu = cpu;
+	vmx_vcpu_load(vcpu, cpu);
+	vcpu->cpu = cpu;
 	init_vmcs(vmx);
-	vmx_vcpu_put(&vmx->vcpu);
+	vmx_vcpu_put(vcpu);
 	put_cpu();
-	if (cpu_need_virtualize_apic_accesses(&vmx->vcpu)) {
-		err = alloc_apic_access_page(kvm);
+	if (cpu_need_virtualize_apic_accesses(vcpu)) {
+		err = alloc_apic_access_page(vcpu->kvm);
 		if (err)
 			goto free_vmcs;
 	}
 
 	if (enable_ept && !enable_unrestricted_guest) {
-		err = init_rmode_identity_map(kvm);
+		err = init_rmode_identity_map(vcpu->kvm);
 		if (err)
 			goto free_vmcs;
 	}
···
 	if (nested)
 		nested_vmx_setup_ctls_msrs(&vmx->nested.msrs,
 					   vmx_capability.ept,
-					   kvm_vcpu_apicv_active(&vmx->vcpu));
+					   kvm_vcpu_apicv_active(vcpu));
 	else
 		memset(&vmx->nested.msrs, 0, sizeof(vmx->nested.msrs));
···
 
 	vmx->ept_pointer = INVALID_PAGE;
 
-	return &vmx->vcpu;
+	return 0;
 
 free_vmcs:
 	free_loaded_vmcs(vmx->loaded_vmcs);
 free_pml:
 	vmx_destroy_pml_buffer(vmx);
-uninit_vcpu:
-	kvm_vcpu_uninit(&vmx->vcpu);
-free_vcpu:
+free_vpid:
 	free_vpid(vmx->vpid);
-	kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.guest_fpu);
-free_user_fpu:
-	kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.user_fpu);
-free_partial_vcpu:
-	kmem_cache_free(kvm_vcpu_cache, vmx);
-	return ERR_PTR(err);
+	return err;
 }
 
 #define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
···
 	} while (0)
 
 	entry = kvm_find_cpuid_entry(vcpu, 0x1, 0);
-	cr4_fixed1_update(X86_CR4_VME,        edx, bit(X86_FEATURE_VME));
-	cr4_fixed1_update(X86_CR4_PVI,        edx, bit(X86_FEATURE_VME));
-	cr4_fixed1_update(X86_CR4_TSD,        edx, bit(X86_FEATURE_TSC));
-	cr4_fixed1_update(X86_CR4_DE,         edx, bit(X86_FEATURE_DE));
-	cr4_fixed1_update(X86_CR4_PSE,        edx, bit(X86_FEATURE_PSE));
-	cr4_fixed1_update(X86_CR4_PAE,        edx, bit(X86_FEATURE_PAE));
-	cr4_fixed1_update(X86_CR4_MCE,        edx, bit(X86_FEATURE_MCE));
-	cr4_fixed1_update(X86_CR4_PGE,        edx, bit(X86_FEATURE_PGE));
-	cr4_fixed1_update(X86_CR4_OSFXSR,     edx, bit(X86_FEATURE_FXSR));
-	cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, bit(X86_FEATURE_XMM));
-	cr4_fixed1_update(X86_CR4_VMXE,       ecx, bit(X86_FEATURE_VMX));
-	cr4_fixed1_update(X86_CR4_SMXE,       ecx, bit(X86_FEATURE_SMX));
-	cr4_fixed1_update(X86_CR4_PCIDE,      ecx, bit(X86_FEATURE_PCID));
-	cr4_fixed1_update(X86_CR4_OSXSAVE,    ecx, bit(X86_FEATURE_XSAVE));
+	cr4_fixed1_update(X86_CR4_VME,        edx, feature_bit(VME));
+	cr4_fixed1_update(X86_CR4_PVI,        edx, feature_bit(VME));
+	cr4_fixed1_update(X86_CR4_TSD,        edx, feature_bit(TSC));
+	cr4_fixed1_update(X86_CR4_DE,         edx, feature_bit(DE));
+	cr4_fixed1_update(X86_CR4_PSE,        edx, feature_bit(PSE));
+	cr4_fixed1_update(X86_CR4_PAE,        edx, feature_bit(PAE));
+	cr4_fixed1_update(X86_CR4_MCE,        edx, feature_bit(MCE));
+	cr4_fixed1_update(X86_CR4_PGE,        edx, feature_bit(PGE));
+	cr4_fixed1_update(X86_CR4_OSFXSR,     edx, feature_bit(FXSR));
+	cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, feature_bit(XMM));
+	cr4_fixed1_update(X86_CR4_VMXE,       ecx, feature_bit(VMX));
+	cr4_fixed1_update(X86_CR4_SMXE,       ecx, feature_bit(SMX));
+	cr4_fixed1_update(X86_CR4_PCIDE,      ecx, feature_bit(PCID));
+	cr4_fixed1_update(X86_CR4_OSXSAVE,    ecx, feature_bit(XSAVE));
 
 	entry = kvm_find_cpuid_entry(vcpu, 0x7, 0);
-	cr4_fixed1_update(X86_CR4_FSGSBASE,   ebx, bit(X86_FEATURE_FSGSBASE));
-	cr4_fixed1_update(X86_CR4_SMEP,       ebx, bit(X86_FEATURE_SMEP));
-	cr4_fixed1_update(X86_CR4_SMAP,       ebx, bit(X86_FEATURE_SMAP));
-	cr4_fixed1_update(X86_CR4_PKE,        ecx, bit(X86_FEATURE_PKU));
-	cr4_fixed1_update(X86_CR4_UMIP,       ecx, bit(X86_FEATURE_UMIP));
-	cr4_fixed1_update(X86_CR4_LA57,       ecx, bit(X86_FEATURE_LA57));
+	cr4_fixed1_update(X86_CR4_FSGSBASE,   ebx, feature_bit(FSGSBASE));
+	cr4_fixed1_update(X86_CR4_SMEP,       ebx, feature_bit(SMEP));
+	cr4_fixed1_update(X86_CR4_SMAP,       ebx, feature_bit(SMAP));
+	cr4_fixed1_update(X86_CR4_PKE,        ecx, feature_bit(PKU));
+	cr4_fixed1_update(X86_CR4_UMIP,       ecx, feature_bit(UMIP));
+	cr4_fixed1_update(X86_CR4_LA57,       ecx, feature_bit(LA57));
 
 #undef cr4_fixed1_update
 }
···
 static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
 {
 	if (func == 1 && nested)
-		entry->ecx |= bit(X86_FEATURE_VMX);
+		entry->ecx |= feature_bit(VMX);
 }
 
 static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu)
···
 	.xsaves_supported = vmx_xsaves_supported,
 	.umip_emulated = vmx_umip_emulated,
 	.pt_supported = vmx_pt_supported,
+	.pku_supported = vmx_pku_supported,
 
 	.request_immediate_exit = vmx_request_immediate_exit,
+358 -223
arch/x86/kvm/x86.c
···
 static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
 #endif
 
+static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS;
+
 #define VM_STAT(x, ...) offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__
 #define VCPU_STAT(x, ...) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__
···
 }
 EXPORT_SYMBOL_GPL(kvm_set_xcr);
 
+#define __cr4_reserved_bits(__cpu_has, __c)		\
+({							\
+	u64 __reserved_bits = CR4_RESERVED_BITS;	\
+							\
+	if (!__cpu_has(__c, X86_FEATURE_XSAVE))		\
+		__reserved_bits |= X86_CR4_OSXSAVE;	\
+	if (!__cpu_has(__c, X86_FEATURE_SMEP))		\
+		__reserved_bits |= X86_CR4_SMEP;	\
+	if (!__cpu_has(__c, X86_FEATURE_SMAP))		\
+		__reserved_bits |= X86_CR4_SMAP;	\
+	if (!__cpu_has(__c, X86_FEATURE_FSGSBASE))	\
+		__reserved_bits |= X86_CR4_FSGSBASE;	\
+	if (!__cpu_has(__c, X86_FEATURE_PKU))		\
+		__reserved_bits |= X86_CR4_PKE;		\
+	if (!__cpu_has(__c, X86_FEATURE_LA57))		\
+		__reserved_bits |= X86_CR4_LA57;	\
+	__reserved_bits;				\
+})
+
+static u64 kvm_host_cr4_reserved_bits(struct cpuinfo_x86 *c)
+{
+	u64 reserved_bits = __cr4_reserved_bits(cpu_has, c);
+
+	if (cpuid_ecx(0x7) & feature_bit(LA57))
+		reserved_bits &= ~X86_CR4_LA57;
+
+	if (kvm_x86_ops->umip_emulated())
+		reserved_bits &= ~X86_CR4_UMIP;
+
+	return reserved_bits;
+}
+
 static int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
-	if (cr4 & CR4_RESERVED_BITS)
+	if (cr4 & cr4_reserved_bits)
 		return -EINVAL;
 
-	if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && (cr4 & X86_CR4_OSXSAVE))
-		return -EINVAL;
-
-	if (!guest_cpuid_has(vcpu, X86_FEATURE_SMEP) && (cr4 & X86_CR4_SMEP))
-		return -EINVAL;
-
-	if (!guest_cpuid_has(vcpu, X86_FEATURE_SMAP) && (cr4 & X86_CR4_SMAP))
-		return -EINVAL;
-
-	if (!guest_cpuid_has(vcpu, X86_FEATURE_FSGSBASE) && (cr4 & X86_CR4_FSGSBASE))
-		return -EINVAL;
-
-	if (!guest_cpuid_has(vcpu, X86_FEATURE_PKU) && (cr4 & X86_CR4_PKE))
-		return -EINVAL;
-
-	if (!guest_cpuid_has(vcpu, X86_FEATURE_LA57) && (cr4 & X86_CR4_LA57))
-		return -EINVAL;
-
-	if (!guest_cpuid_has(vcpu, X86_FEATURE_UMIP) && (cr4 & X86_CR4_UMIP))
+	if (cr4 & __cr4_reserved_bits(guest_cpuid_has, vcpu))
 		return -EINVAL;
 
 	return 0;
···
 
 static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
 {
+	size_t size = ARRAY_SIZE(vcpu->arch.db);
+
 	switch (dr) {
 	case 0 ... 3:
-		vcpu->arch.db[dr] = val;
+		vcpu->arch.db[array_index_nospec(dr, size)] = val;
 		if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
 			vcpu->arch.eff_db[dr] = val;
 		break;
···
 	case 5:
 		/* fall through */
 	default: /* 7 */
-		if (val & 0xffffffff00000000ULL)
+		if (!kvm_dr7_valid(val))
 			return -1; /* #GP */
 		vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
 		kvm_update_dr7(vcpu);
···
 
 int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
 {
+	size_t size = ARRAY_SIZE(vcpu->arch.db);
+
 	switch (dr) {
 	case 0 ... 3:
-		*val = vcpu->arch.db[dr];
+		*val = vcpu->arch.db[array_index_nospec(dr, size)];
 		break;
 	case 4:
 		/* fall through */
···
 	MSR_MISC_FEATURES_ENABLES,
 	MSR_AMD64_VIRT_SPEC_CTRL,
 	MSR_IA32_POWER_CTL,
+	MSR_IA32_UCODE_REV,
 
 	/*
 	 * The following list leaves out MSRs whose values are determined
···
 	return kvm_skip_emulated_instruction(vcpu);
 }
 EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr);
+
+/*
+ * The fast path for frequent and performance sensitive wrmsr emulation,
+ * i.e. the sending of IPI, sending IPI early in the VM-Exit flow reduces
+ * the latency of virtual IPI by avoiding the expensive bits of transitioning
+ * from guest to host, e.g. reacquiring KVM's SRCU lock. In contrast to the
+ * other cases which must be called after interrupts are enabled on the host.
+ */
+static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data)
+{
+	if (lapic_in_kernel(vcpu) && apic_x2apic_mode(vcpu->arch.apic) &&
+		((data & APIC_DEST_MASK) == APIC_DEST_PHYSICAL) &&
+		((data & APIC_MODE_MASK) == APIC_DM_FIXED)) {
+
+		kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR2, (u32)(data >> 32));
+		return kvm_lapic_reg_write(vcpu->arch.apic, APIC_ICR, (u32)data);
+	}
+
+	return 1;
+}
+
+enum exit_fastpath_completion handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu)
+{
+	u32 msr = kvm_rcx_read(vcpu);
+	u64 data = kvm_read_edx_eax(vcpu);
+	int ret = 0;
+
+	switch (msr) {
+	case APIC_BASE_MSR + (APIC_ICR >> 4):
+		ret = handle_fastpath_set_x2apic_icr_irqoff(vcpu, data);
+		break;
+	default:
+		return EXIT_FASTPATH_NONE;
+	}
+
+	if (!ret) {
+		trace_kvm_msr_write(msr, data);
+		return EXIT_FASTPATH_SKIP_EMUL_INS;
+	}
+
+	return EXIT_FASTPATH_NONE;
+}
+EXPORT_SYMBOL_GPL(handle_fastpath_set_msr_irqoff);
 
 /*
  * Adapt set_msr() to msr_io()'s calling convention
···
 	default:
 		if (msr >= MSR_IA32_MC0_CTL &&
 		    msr < MSR_IA32_MCx_CTL(bank_num)) {
-			u32 offset = msr - MSR_IA32_MC0_CTL;
+			u32 offset = array_index_nospec(
+				msr - MSR_IA32_MC0_CTL,
+				MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL);
+
 			/* only 0 or all 1s can be written to IA32_MCi_CTL
 			 * some Linux kernels though clear bit 10 in bank 4 to
 			 * workaround a BIOS/GART TBL issue on AMD K8s, ignore
···
 
 static void record_steal_time(struct kvm_vcpu *vcpu)
 {
+	struct kvm_host_map map;
+	struct kvm_steal_time *st;
+
 	if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
 		return;
 
-	if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
-		&vcpu->arch.st.steal, sizeof(struct kvm_steal_time))))
+	/* -EAGAIN is returned in atomic context so we can just return. */
+	if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT,
+			&map, &vcpu->arch.st.cache, false))
 		return;
+
+	st = map.hva +
+		offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
 
 	/*
 	 * Doing a TLB flush here, on the guest's behalf, can avoid
 	 * expensive IPIs.
 	 */
 	trace_kvm_pv_tlb_flush(vcpu->vcpu_id,
-		vcpu->arch.st.steal.preempted & KVM_VCPU_FLUSH_TLB);
-	if (xchg(&vcpu->arch.st.steal.preempted, 0) & KVM_VCPU_FLUSH_TLB)
+		st->preempted & KVM_VCPU_FLUSH_TLB);
+	if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB)
 		kvm_vcpu_flush_tlb(vcpu, false);
 
-	if (vcpu->arch.st.steal.version & 1)
-		vcpu->arch.st.steal.version += 1;  /* first time write, random junk */
+	vcpu->arch.st.preempted = 0;
 
-	vcpu->arch.st.steal.version += 1;
+	if (st->version & 1)
+		st->version += 1;  /* first time write, random junk */
 
-	kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
-		&vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
+	st->version += 1;
 
 	smp_wmb();
 
-	vcpu->arch.st.steal.steal += current->sched_info.run_delay -
+	st->steal += current->sched_info.run_delay -
 		vcpu->arch.st.last_steal;
 	vcpu->arch.st.last_steal = current->sched_info.run_delay;
 
-	kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
-		&vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
-
 	smp_wmb();
 
-	vcpu->arch.st.steal.version += 1;
+	st->version += 1;
 
-	kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
-		&vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
+	kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, false);
 }
 
 int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
···
 		if (data & KVM_STEAL_RESERVED_MASK)
 			return 1;
 
-		if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime,
-						data & KVM_STEAL_VALID_BITS,
-						sizeof(struct kvm_steal_time)))
-			return 1;
-
 		vcpu->arch.st.msr_val = data;
 
 		if (!(data & KVM_MSR_ENABLED))
···
 	default:
 		if (msr >= MSR_IA32_MC0_CTL &&
 		    msr < MSR_IA32_MCx_CTL(bank_num)) {
-			u32 offset = msr - MSR_IA32_MC0_CTL;
+			u32 offset = array_index_nospec(
+				msr - MSR_IA32_MC0_CTL,
+				MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL);
+
 			data = vcpu->arch.mce_banks[offset];
 			break;
 		}
···
 
 	kvm_x86_ops->vcpu_load(vcpu, cpu);
 
-	fpregs_assert_state_consistent();
-	if (test_thread_flag(TIF_NEED_FPU_LOAD))
-		switch_fpu_return();
-
 	/* Apply any externally detected TSC adjustments (due to suspend) */
 	if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
 		adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment);
···
 
 static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
 {
+	struct kvm_host_map map;
+	struct kvm_steal_time *st;
+
 	if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
 		return;
 
-	vcpu->arch.st.steal.preempted = KVM_VCPU_PREEMPTED;
+	if (vcpu->arch.st.preempted)
+		return;
 
-	kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.st.stime,
-			&vcpu->arch.st.steal.preempted,
-			offsetof(struct kvm_steal_time, preempted),
-			sizeof(vcpu->arch.st.steal.preempted));
+	if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, &map,
+			&vcpu->arch.st.cache, true))
+		return;
+
+	st = map.hva +
+		offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
+
+	st->preempted = vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
+
+	kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, true);
 }
 
 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
···
 {
 	struct kvm_pit *pit = kvm->arch.vpit;
 
-	if (!pit)
-		return -ENXIO;
-
 	/* pit->pit_state.lock was overloaded to prevent userspace from getting
 	 * an inconsistent state after running multiple KVM_REINJECT_CONTROL
 	 * ioctls in parallel.  Use a separate lock if that ioctl isn't rare.
···
 		struct kvm_reinject_control control;
 		r =  -EFAULT;
 		if (copy_from_user(&control, argp, sizeof(control)))
 			goto out;
+		r = -ENXIO;
+		if (!kvm->arch.vpit)
+			goto out;
 		r = kvm_vm_ioctl_reinject(kvm, &control);
 		break;
···
 	return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, check_limit);
 }
 
+static bool emulator_guest_has_long_mode(struct x86_emulate_ctxt *ctxt)
+{
+	return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_LM);
+}
+
+static bool emulator_guest_has_movbe(struct x86_emulate_ctxt *ctxt)
+{
+	return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_MOVBE);
+}
+
+static bool emulator_guest_has_fxsr(struct x86_emulate_ctxt *ctxt)
+{
+	return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_FXSR);
+}
+
 static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg)
 {
 	return kvm_register_read(emul_to_vcpu(ctxt), reg);
···
 	.fix_hypercall       = emulator_fix_hypercall,
 	.intercept           = emulator_intercept,
 	.get_cpuid           = emulator_get_cpuid,
+	.guest_has_long_mode = emulator_guest_has_long_mode,
+	.guest_has_movbe     = emulator_guest_has_movbe,
+	.guest_has_fxsr      = emulator_guest_has_fxsr,
 	.set_nmi_mask        = emulator_set_nmi_mask,
 	.get_hflags          = emulator_get_hflags,
 	.set_hflags          = emulator_set_hflags,
···
 		return 1;
 }
 
-static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2,
+static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 				  bool write_fault_to_shadow_pgtable,
 				  int emulation_type)
 {
-	gpa_t gpa = cr2;
+	gpa_t gpa = cr2_or_gpa;
 	kvm_pfn_t pfn;
6479 6388 6480 6389 if (!(emulation_type & EMULTYPE_ALLOW_RETRY)) ··· 6488 6397 * Write permission should be allowed since only 6489 6398 * write access need to be emulated. 6490 6399 */ 6491 - gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL); 6400 + gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL); 6492 6401 6493 6402 /* 6494 6403 * If the mapping is invalid in guest, let cpu retry ··· 6545 6454 } 6546 6455 6547 6456 static bool retry_instruction(struct x86_emulate_ctxt *ctxt, 6548 - unsigned long cr2, int emulation_type) 6457 + gpa_t cr2_or_gpa, int emulation_type) 6549 6458 { 6550 6459 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); 6551 - unsigned long last_retry_eip, last_retry_addr, gpa = cr2; 6460 + unsigned long last_retry_eip, last_retry_addr, gpa = cr2_or_gpa; 6552 6461 6553 6462 last_retry_eip = vcpu->arch.last_retry_eip; 6554 6463 last_retry_addr = vcpu->arch.last_retry_addr; ··· 6577 6486 if (x86_page_table_writing_insn(ctxt)) 6578 6487 return false; 6579 6488 6580 - if (ctxt->eip == last_retry_eip && last_retry_addr == cr2) 6489 + if (ctxt->eip == last_retry_eip && last_retry_addr == cr2_or_gpa) 6581 6490 return false; 6582 6491 6583 6492 vcpu->arch.last_retry_eip = ctxt->eip; 6584 - vcpu->arch.last_retry_addr = cr2; 6493 + vcpu->arch.last_retry_addr = cr2_or_gpa; 6585 6494 6586 6495 if (!vcpu->arch.mmu->direct_map) 6587 - gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL); 6496 + gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL); 6588 6497 6589 6498 kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)); 6590 6499 ··· 6730 6639 return false; 6731 6640 } 6732 6641 6733 - int x86_emulate_instruction(struct kvm_vcpu *vcpu, 6734 - unsigned long cr2, 6735 - int emulation_type, 6736 - void *insn, 6737 - int insn_len) 6642 + int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, 6643 + int emulation_type, void *insn, int insn_len) 6738 6644 { 6739 6645 int r; 6740 6646 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; ··· 6777 6689 
kvm_queue_exception(vcpu, UD_VECTOR); 6778 6690 return 1; 6779 6691 } 6780 - if (reexecute_instruction(vcpu, cr2, write_fault_to_spt, 6781 - emulation_type)) 6692 + if (reexecute_instruction(vcpu, cr2_or_gpa, 6693 + write_fault_to_spt, 6694 + emulation_type)) 6782 6695 return 1; 6783 6696 if (ctxt->have_exception) { 6784 6697 /* ··· 6813 6724 return 1; 6814 6725 } 6815 6726 6816 - if (retry_instruction(ctxt, cr2, emulation_type)) 6727 + if (retry_instruction(ctxt, cr2_or_gpa, emulation_type)) 6817 6728 return 1; 6818 6729 6819 6730 /* this is needed for vmware backdoor interface to work since it ··· 6825 6736 6826 6737 restart: 6827 6738 /* Save the faulting GPA (cr2) in the address field */ 6828 - ctxt->exception.address = cr2; 6739 + ctxt->exception.address = cr2_or_gpa; 6829 6740 6830 6741 r = x86_emulate_insn(ctxt); 6831 6742 ··· 6833 6744 return 1; 6834 6745 6835 6746 if (r == EMULATION_FAILED) { 6836 - if (reexecute_instruction(vcpu, cr2, write_fault_to_spt, 6747 + if (reexecute_instruction(vcpu, cr2_or_gpa, write_fault_to_spt, 6837 6748 emulation_type)) 6838 6749 return 1; 6839 6750 ··· 7446 7357 { 7447 7358 struct kvm_lapic_irq lapic_irq; 7448 7359 7449 - lapic_irq.shorthand = 0; 7450 - lapic_irq.dest_mode = 0; 7360 + lapic_irq.shorthand = APIC_DEST_NOSHORT; 7361 + lapic_irq.dest_mode = APIC_DEST_PHYSICAL; 7451 7362 lapic_irq.level = 0; 7452 7363 lapic_irq.dest_id = apicid; 7453 7364 lapic_irq.msi_redir_hint = false; ··· 8086 7997 bool req_int_win = 8087 7998 dm_request_for_irq_injection(vcpu) && 8088 7999 kvm_cpu_accept_dm_intr(vcpu); 8000 + enum exit_fastpath_completion exit_fastpath = EXIT_FASTPATH_NONE; 8089 8001 8090 8002 bool req_immediate_exit = false; 8091 8003 ··· 8288 8198 trace_kvm_entry(vcpu->vcpu_id); 8289 8199 guest_enter_irqoff(); 8290 8200 8291 - /* The preempt notifier should have taken care of the FPU already. 
*/ 8292 - WARN_ON_ONCE(test_thread_flag(TIF_NEED_FPU_LOAD)); 8201 + fpregs_assert_state_consistent(); 8202 + if (test_thread_flag(TIF_NEED_FPU_LOAD)) 8203 + switch_fpu_return(); 8293 8204 8294 8205 if (unlikely(vcpu->arch.switch_db_regs)) { 8295 8206 set_debugreg(0, 7); ··· 8334 8243 vcpu->mode = OUTSIDE_GUEST_MODE; 8335 8244 smp_wmb(); 8336 8245 8337 - kvm_x86_ops->handle_exit_irqoff(vcpu); 8246 + kvm_x86_ops->handle_exit_irqoff(vcpu, &exit_fastpath); 8338 8247 8339 8248 /* 8340 8249 * Consume any pending interrupts, including the possible source of ··· 8378 8287 kvm_lapic_sync_from_vapic(vcpu); 8379 8288 8380 8289 vcpu->arch.gpa_available = false; 8381 - r = kvm_x86_ops->handle_exit(vcpu); 8290 + r = kvm_x86_ops->handle_exit(vcpu, exit_fastpath); 8382 8291 return r; 8383 8292 8384 8293 cancel_injection: ··· 8562 8471 return 0; 8563 8472 } 8564 8473 8474 + static void kvm_save_current_fpu(struct fpu *fpu) 8475 + { 8476 + /* 8477 + * If the target FPU state is not resident in the CPU registers, just 8478 + * memcpy() from current, else save CPU state directly to the target. 8479 + */ 8480 + if (test_thread_flag(TIF_NEED_FPU_LOAD)) 8481 + memcpy(&fpu->state, &current->thread.fpu.state, 8482 + fpu_kernel_xstate_size); 8483 + else 8484 + copy_fpregs_to_fpstate(fpu); 8485 + } 8486 + 8565 8487 /* Swap (qemu) user FPU context for the guest FPU context. */ 8566 8488 static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) 8567 8489 { 8568 8490 fpregs_lock(); 8569 8491 8570 - copy_fpregs_to_fpstate(vcpu->arch.user_fpu); 8492 + kvm_save_current_fpu(vcpu->arch.user_fpu); 8493 + 8571 8494 /* PKRU is separately restored in kvm_x86_ops->run. 
*/ 8572 8495 __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu->state, 8573 8496 ~XFEATURE_MASK_PKRU); ··· 8597 8492 { 8598 8493 fpregs_lock(); 8599 8494 8600 - copy_fpregs_to_fpstate(vcpu->arch.guest_fpu); 8495 + kvm_save_current_fpu(vcpu->arch.guest_fpu); 8496 + 8601 8497 copy_kernel_to_fpregs(&vcpu->arch.user_fpu->state); 8602 8498 8603 8499 fpregs_mark_activate(); ··· 8820 8714 struct kvm_mp_state *mp_state) 8821 8715 { 8822 8716 vcpu_load(vcpu); 8717 + if (kvm_mpx_supported()) 8718 + kvm_load_guest_fpu(vcpu); 8823 8719 8824 8720 kvm_apic_accept_events(vcpu); 8825 8721 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED && ··· 8830 8722 else 8831 8723 mp_state->mp_state = vcpu->arch.mp_state; 8832 8724 8725 + if (kvm_mpx_supported()) 8726 + kvm_put_guest_fpu(vcpu); 8833 8727 vcpu_put(vcpu); 8834 8728 return 0; 8835 8729 } ··· 9192 9082 vcpu->arch.cr0 |= X86_CR0_ET; 9193 9083 } 9194 9084 9195 - void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) 9085 + int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) 9196 9086 { 9197 - void *wbinvd_dirty_mask = vcpu->arch.wbinvd_dirty_mask; 9198 - 9199 - kvmclock_reset(vcpu); 9200 - 9201 - kvm_x86_ops->vcpu_free(vcpu); 9202 - free_cpumask_var(wbinvd_dirty_mask); 9203 - } 9204 - 9205 - struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, 9206 - unsigned int id) 9207 - { 9208 - struct kvm_vcpu *vcpu; 9209 - 9210 9087 if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0) 9211 - printk_once(KERN_WARNING 9212 - "kvm: SMP vm created on host with unstable TSC; " 9213 - "guest TSC will not be reliable\n"); 9088 + pr_warn_once("kvm: SMP vm created on host with unstable TSC; " 9089 + "guest TSC will not be reliable\n"); 9214 9090 9215 - vcpu = kvm_x86_ops->vcpu_create(kvm, id); 9216 - 9217 - return vcpu; 9091 + return 0; 9218 9092 } 9219 9093 9220 - int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) 9094 + int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) 9221 9095 { 9096 + struct page *page; 9097 + int r; 9098 + 9099 + 
vcpu->arch.emulate_ctxt.ops = &emulate_ops; 9100 + if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu)) 9101 + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 9102 + else 9103 + vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED; 9104 + 9105 + kvm_set_tsc_khz(vcpu, max_tsc_khz); 9106 + 9107 + r = kvm_mmu_create(vcpu); 9108 + if (r < 0) 9109 + return r; 9110 + 9111 + if (irqchip_in_kernel(vcpu->kvm)) { 9112 + vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu->kvm); 9113 + r = kvm_create_lapic(vcpu, lapic_timer_advance_ns); 9114 + if (r < 0) 9115 + goto fail_mmu_destroy; 9116 + } else 9117 + static_key_slow_inc(&kvm_no_apic_vcpu); 9118 + 9119 + r = -ENOMEM; 9120 + 9121 + page = alloc_page(GFP_KERNEL | __GFP_ZERO); 9122 + if (!page) 9123 + goto fail_free_lapic; 9124 + vcpu->arch.pio_data = page_address(page); 9125 + 9126 + vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4, 9127 + GFP_KERNEL_ACCOUNT); 9128 + if (!vcpu->arch.mce_banks) 9129 + goto fail_free_pio_data; 9130 + vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; 9131 + 9132 + if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, 9133 + GFP_KERNEL_ACCOUNT)) 9134 + goto fail_free_mce_banks; 9135 + 9136 + vcpu->arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache, 9137 + GFP_KERNEL_ACCOUNT); 9138 + if (!vcpu->arch.user_fpu) { 9139 + pr_err("kvm: failed to allocate userspace's fpu\n"); 9140 + goto free_wbinvd_dirty_mask; 9141 + } 9142 + 9143 + vcpu->arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, 9144 + GFP_KERNEL_ACCOUNT); 9145 + if (!vcpu->arch.guest_fpu) { 9146 + pr_err("kvm: failed to allocate vcpu's fpu\n"); 9147 + goto free_user_fpu; 9148 + } 9149 + fx_init(vcpu); 9150 + 9151 + vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; 9152 + 9153 + vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); 9154 + 9155 + vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT; 9156 + 9157 + kvm_async_pf_hash_reset(vcpu); 9158 + kvm_pmu_init(vcpu); 9159 + 9160 + vcpu->arch.pending_external_vector 
= -1; 9161 + vcpu->arch.preempted_in_kernel = false; 9162 + 9163 + kvm_hv_vcpu_init(vcpu); 9164 + 9165 + r = kvm_x86_ops->vcpu_create(vcpu); 9166 + if (r) 9167 + goto free_guest_fpu; 9168 + 9222 9169 vcpu->arch.arch_capabilities = kvm_get_arch_capabilities(); 9223 9170 vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT; 9224 9171 kvm_vcpu_mtrr_init(vcpu); ··· 9284 9117 kvm_init_mmu(vcpu, false); 9285 9118 vcpu_put(vcpu); 9286 9119 return 0; 9120 + 9121 + free_guest_fpu: 9122 + kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu); 9123 + free_user_fpu: 9124 + kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); 9125 + free_wbinvd_dirty_mask: 9126 + free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); 9127 + fail_free_mce_banks: 9128 + kfree(vcpu->arch.mce_banks); 9129 + fail_free_pio_data: 9130 + free_page((unsigned long)vcpu->arch.pio_data); 9131 + fail_free_lapic: 9132 + kvm_free_lapic(vcpu); 9133 + fail_mmu_destroy: 9134 + kvm_mmu_destroy(vcpu); 9135 + return r; 9287 9136 } 9288 9137 9289 9138 void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) ··· 9332 9149 9333 9150 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) 9334 9151 { 9335 - vcpu->arch.apf.msr_val = 0; 9152 + struct gfn_to_pfn_cache *cache = &vcpu->arch.st.cache; 9153 + int idx; 9336 9154 9337 - vcpu_load(vcpu); 9338 - kvm_mmu_unload(vcpu); 9339 - vcpu_put(vcpu); 9155 + kvm_release_pfn(cache->pfn, cache->dirty, cache); 9156 + 9157 + kvmclock_reset(vcpu); 9340 9158 9341 9159 kvm_x86_ops->vcpu_free(vcpu); 9160 + 9161 + free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); 9162 + kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); 9163 + kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu); 9164 + 9165 + kvm_hv_vcpu_uninit(vcpu); 9166 + kvm_pmu_destroy(vcpu); 9167 + kfree(vcpu->arch.mce_banks); 9168 + kvm_free_lapic(vcpu); 9169 + idx = srcu_read_lock(&vcpu->kvm->srcu); 9170 + kvm_mmu_destroy(vcpu); 9171 + srcu_read_unlock(&vcpu->kvm->srcu, idx); 9172 + free_page((unsigned long)vcpu->arch.pio_data); 9173 
+ if (!lapic_in_kernel(vcpu)) 9174 + static_key_slow_dec(&kvm_no_apic_vcpu); 9342 9175 } 9343 9176 9344 9177 void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) ··· 9370 9171 vcpu->arch.nmi_injected = false; 9371 9172 kvm_clear_interrupt_queue(vcpu); 9372 9173 kvm_clear_exception_queue(vcpu); 9373 - vcpu->arch.exception.pending = false; 9374 9174 9375 9175 memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db)); 9376 9176 kvm_update_dr0123(vcpu); ··· 9545 9347 if (r != 0) 9546 9348 return r; 9547 9349 9350 + cr4_reserved_bits = kvm_host_cr4_reserved_bits(&boot_cpu_data); 9351 + 9548 9352 if (kvm_has_tsc_control) { 9549 9353 /* 9550 9354 * Make sure the user can only configure tsc_khz values that ··· 9575 9375 9576 9376 int kvm_arch_check_processor_compat(void) 9577 9377 { 9378 + struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); 9379 + 9380 + WARN_ON(!irqs_disabled()); 9381 + 9382 + if (kvm_host_cr4_reserved_bits(c) != cr4_reserved_bits) 9383 + return -EIO; 9384 + 9578 9385 return kvm_x86_ops->check_processor_compatibility(); 9579 9386 } 9580 9387 ··· 9598 9391 9599 9392 struct static_key kvm_no_apic_vcpu __read_mostly; 9600 9393 EXPORT_SYMBOL_GPL(kvm_no_apic_vcpu); 9601 - 9602 - int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) 9603 - { 9604 - struct page *page; 9605 - int r; 9606 - 9607 - vcpu->arch.emulate_ctxt.ops = &emulate_ops; 9608 - if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu)) 9609 - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 9610 - else 9611 - vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED; 9612 - 9613 - page = alloc_page(GFP_KERNEL | __GFP_ZERO); 9614 - if (!page) { 9615 - r = -ENOMEM; 9616 - goto fail; 9617 - } 9618 - vcpu->arch.pio_data = page_address(page); 9619 - 9620 - kvm_set_tsc_khz(vcpu, max_tsc_khz); 9621 - 9622 - r = kvm_mmu_create(vcpu); 9623 - if (r < 0) 9624 - goto fail_free_pio_data; 9625 - 9626 - if (irqchip_in_kernel(vcpu->kvm)) { 9627 - vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu->kvm); 9628 - r = 
kvm_create_lapic(vcpu, lapic_timer_advance_ns); 9629 - if (r < 0) 9630 - goto fail_mmu_destroy; 9631 - } else 9632 - static_key_slow_inc(&kvm_no_apic_vcpu); 9633 - 9634 - vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4, 9635 - GFP_KERNEL_ACCOUNT); 9636 - if (!vcpu->arch.mce_banks) { 9637 - r = -ENOMEM; 9638 - goto fail_free_lapic; 9639 - } 9640 - vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; 9641 - 9642 - if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, 9643 - GFP_KERNEL_ACCOUNT)) { 9644 - r = -ENOMEM; 9645 - goto fail_free_mce_banks; 9646 - } 9647 - 9648 - fx_init(vcpu); 9649 - 9650 - vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; 9651 - 9652 - vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); 9653 - 9654 - vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT; 9655 - 9656 - kvm_async_pf_hash_reset(vcpu); 9657 - kvm_pmu_init(vcpu); 9658 - 9659 - vcpu->arch.pending_external_vector = -1; 9660 - vcpu->arch.preempted_in_kernel = false; 9661 - 9662 - kvm_hv_vcpu_init(vcpu); 9663 - 9664 - return 0; 9665 - 9666 - fail_free_mce_banks: 9667 - kfree(vcpu->arch.mce_banks); 9668 - fail_free_lapic: 9669 - kvm_free_lapic(vcpu); 9670 - fail_mmu_destroy: 9671 - kvm_mmu_destroy(vcpu); 9672 - fail_free_pio_data: 9673 - free_page((unsigned long)vcpu->arch.pio_data); 9674 - fail: 9675 - return r; 9676 - } 9677 - 9678 - void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) 9679 - { 9680 - int idx; 9681 - 9682 - kvm_hv_vcpu_uninit(vcpu); 9683 - kvm_pmu_destroy(vcpu); 9684 - kfree(vcpu->arch.mce_banks); 9685 - kvm_free_lapic(vcpu); 9686 - idx = srcu_read_lock(&vcpu->kvm->srcu); 9687 - kvm_mmu_destroy(vcpu); 9688 - srcu_read_unlock(&vcpu->kvm->srcu, idx); 9689 - free_page((unsigned long)vcpu->arch.pio_data); 9690 - if (!lapic_in_kernel(vcpu)) 9691 - static_key_slow_dec(&kvm_no_apic_vcpu); 9692 - } 9693 9394 9694 9395 void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) 9695 9396 { ··· 9673 9558 kvm_unload_vcpu_mmu(vcpu); 9674 9559 } 9675 9560 
kvm_for_each_vcpu(i, vcpu, kvm) 9676 - kvm_arch_vcpu_free(vcpu); 9561 + kvm_vcpu_destroy(vcpu); 9677 9562 9678 9563 mutex_lock(&kvm->lock); 9679 9564 for (i = 0; i < atomic_read(&kvm->online_vcpus); i++) ··· 9742 9627 } 9743 9628 EXPORT_SYMBOL_GPL(__x86_set_memory_region); 9744 9629 9745 - int x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size) 9746 - { 9747 - int r; 9748 - 9749 - mutex_lock(&kvm->slots_lock); 9750 - r = __x86_set_memory_region(kvm, id, gpa, size); 9751 - mutex_unlock(&kvm->slots_lock); 9752 - 9753 - return r; 9754 - } 9755 - EXPORT_SYMBOL_GPL(x86_set_memory_region); 9756 - 9757 9630 void kvm_arch_pre_destroy_vm(struct kvm *kvm) 9758 9631 { 9759 9632 kvm_mmu_pre_destroy_vm(kvm); ··· 9755 9652 * unless the the memory map has changed due to process exit 9756 9653 * or fd copying. 9757 9654 */ 9758 - x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, 0, 0); 9759 - x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, 0, 0); 9760 - x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0); 9655 + mutex_lock(&kvm->slots_lock); 9656 + __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, 9657 + 0, 0); 9658 + __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, 9659 + 0, 0); 9660 + __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0); 9661 + mutex_unlock(&kvm->slots_lock); 9761 9662 } 9762 9663 if (kvm_x86_ops->vm_destroy) 9763 9664 kvm_x86_ops->vm_destroy(kvm); ··· 9865 9758 9866 9759 void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) 9867 9760 { 9761 + struct kvm_vcpu *vcpu; 9762 + int i; 9763 + 9868 9764 /* 9869 9765 * memslots->generation has been incremented. 9870 9766 * mmio generation may have reached its maximum value. 
9871 9767 */ 9872 9768 kvm_mmu_invalidate_mmio_sptes(kvm, gen); 9769 + 9770 + /* Force re-initialization of steal_time cache */ 9771 + kvm_for_each_vcpu(i, vcpu, kvm) 9772 + kvm_vcpu_kick(vcpu); 9873 9773 } 9874 9774 9875 9775 int kvm_arch_prepare_memory_region(struct kvm *kvm, ··· 9906 9792 * 9907 9793 * The reason is, in case of PML, we need to set D-bit for any slots 9908 9794 * with dirty logging disabled in order to eliminate unnecessary GPA 9909 - * logging in PML buffer (and potential PML buffer full VMEXT). This 9795 + * logging in PML buffer (and potential PML buffer full VMEXIT). This 9910 9796 * guarantees leaving PML enabled during guest's lifetime won't have 9911 9797 * any additional overhead from PML when guest is running with dirty 9912 9798 * logging disabled for memory slots. ··· 10128 10014 work->arch.cr3 != vcpu->arch.mmu->get_cr3(vcpu)) 10129 10015 return; 10130 10016 10131 - vcpu->arch.mmu->page_fault(vcpu, work->gva, 0, true); 10017 + vcpu->arch.mmu->page_fault(vcpu, work->cr2_or_gpa, 0, true); 10132 10018 } 10133 10019 10134 10020 static inline u32 kvm_async_pf_hash_fn(gfn_t gfn) ··· 10241 10127 { 10242 10128 struct x86_exception fault; 10243 10129 10244 - trace_kvm_async_pf_not_present(work->arch.token, work->gva); 10130 + trace_kvm_async_pf_not_present(work->arch.token, work->cr2_or_gpa); 10245 10131 kvm_add_async_pf_gfn(vcpu, work->arch.gfn); 10246 10132 10247 10133 if (kvm_can_deliver_async_pf(vcpu) && ··· 10276 10162 work->arch.token = ~0; /* broadcast wakeup */ 10277 10163 else 10278 10164 kvm_del_async_pf_gfn(vcpu, work->arch.gfn); 10279 - trace_kvm_async_pf_ready(work->arch.token, work->gva); 10165 + trace_kvm_async_pf_ready(work->arch.token, work->cr2_or_gpa); 10280 10166 10281 10167 if (vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED && 10282 10168 !apf_get_user(vcpu, &val)) { ··· 10398 10284 { 10399 10285 return vector_hashing; 10400 10286 } 10401 - EXPORT_SYMBOL_GPL(kvm_vector_hashing_enabled); 10402 10287 10403 10288 bool 
kvm_arch_no_poll(struct kvm_vcpu *vcpu) 10404 10289 { ··· 10405 10292 } 10406 10293 EXPORT_SYMBOL_GPL(kvm_arch_no_poll); 10407 10294 10295 + u64 kvm_spec_ctrl_valid_bits(struct kvm_vcpu *vcpu) 10296 + { 10297 + uint64_t bits = SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD; 10298 + 10299 + /* The STIBP bit doesn't fault even if it's not advertised */ 10300 + if (!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) && 10301 + !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS)) 10302 + bits &= ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP); 10303 + if (!boot_cpu_has(X86_FEATURE_SPEC_CTRL) && 10304 + !boot_cpu_has(X86_FEATURE_AMD_IBRS)) 10305 + bits &= ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP); 10306 + 10307 + if (!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL_SSBD) && 10308 + !guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD)) 10309 + bits &= ~SPEC_CTRL_SSBD; 10310 + if (!boot_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) && 10311 + !boot_cpu_has(X86_FEATURE_AMD_SSBD)) 10312 + bits &= ~SPEC_CTRL_SSBD; 10313 + 10314 + return bits; 10315 + } 10316 + EXPORT_SYMBOL_GPL(kvm_spec_ctrl_valid_bits); 10408 10317 10409 10318 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); 10410 10319 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
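The `MSR_IA32_MC0_CTL` hunk above is one instance of the Spectre-v1/L1TF hardening called out in the merge message: a guest-controlled MSR number selects an array element, so the index is clamped with `array_index_nospec()` before the load. The kernel's helper is asm-optimized; a minimal user-space sketch of the same branchless masking idea (assuming arithmetic right shift of signed longs, as the kernel's generic fallback does) looks like this:

```c
#include <assert.h>
#include <limits.h>

#define BITS_PER_LONG (sizeof(long) * CHAR_BIT)

/*
 * Generic (non-asm) form of array_index_mask_nospec(): evaluates to
 * ~0UL when 0 <= index < size and 0UL otherwise, computed without a
 * data-dependent branch that the CPU could speculate past.
 */
static unsigned long array_index_mask_nospec(unsigned long index,
					     unsigned long size)
{
	/* (size - 1 - index) underflows iff index >= size */
	return ~(long)(index | (size - 1UL - index)) >> (BITS_PER_LONG - 1);
}

/* Clamp a possibly attacker-controlled index to [0, size). */
static unsigned long array_index_nospec(unsigned long index,
					unsigned long size)
{
	return index & array_index_mask_nospec(index, size);
}
```

Out-of-range indices collapse to 0 even under misspeculation, so the subsequent `mce_banks[offset]` load can never bring attacker-chosen out-of-bounds data into the cache.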
+9 -14
arch/x86/kvm/x86.h
···
 	return !is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu);
 }
 
-static inline u32 bit(int bitno)
-{
-	return 1 << (bitno & 31);
-}
-
 static inline u8 vcpu_virt_addr_bits(struct kvm_vcpu *vcpu)
 {
 	return kvm_read_cr4_bits(vcpu, X86_CR4_LA57) ? 57 : 48;
···
 
 static inline bool is_noncanonical_address(u64 la, struct kvm_vcpu *vcpu)
 {
-#ifdef CONFIG_X86_64
 	return get_canonical(la, vcpu_virt_addr_bits(vcpu)) != la;
-#else
-	return false;
-#endif
 }
 
 static inline bool emul_is_noncanonical_address(u64 la,
 						struct x86_emulate_ctxt *ctxt)
 {
-#ifdef CONFIG_X86_64
 	return get_canonical(la, ctxt_virt_addr_bits(ctxt)) != la;
-#else
-	return false;
-#endif
 }
 
 static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu,
···
 bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn,
 					  int page_num);
 bool kvm_vector_hashing_enabled(void);
-int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2,
+int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 			    int emulation_type, void *insn, int insn_len);
+enum exit_fastpath_completion handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu);
 
 #define KVM_SUPPORTED_XCR0	(XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
 				| XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \
···
 	return (data | ((data & 0x0202020202020202ull) << 1)) == data;
 }
 
+static inline bool kvm_dr7_valid(unsigned long data)
+{
+	/* Bits [63:32] are reserved */
+	return !(data >> 32);
+}
+
 void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu);
 void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu);
+u64 kvm_spec_ctrl_valid_bits(struct kvm_vcpu *vcpu);
 
 #endif
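The `kvm_dr7_valid()` helper added to x86.h above centralizes a check the DR emulation paths need: on x86-64 the upper half of DR7 is reserved, and writing a value with any of bits [63:32] set must be rejected. A user-space copy of the helper (the constant `0x400` below is the architectural DR7 reset value, bit 10 always reads as 1) is trivially testable:

```c
#include <assert.h>
#include <stdbool.h>

/*
 * Same body as the kvm_dr7_valid() added in this hunk: a DR7 value is
 * acceptable only if its reserved bits [63:32] are all zero.
 */
static bool kvm_dr7_valid(unsigned long long data)
{
	/* Bits [63:32] are reserved */
	return !(data >> 32);
}
```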
+11
arch/x86/mm/pat/set_memory.c
···
 }
 EXPORT_SYMBOL_GPL(lookup_address);
 
+/*
+ * Lookup the page table entry for a virtual address in a given mm. Return a
+ * pointer to the entry and the level of the mapping.
+ */
+pte_t *lookup_address_in_mm(struct mm_struct *mm, unsigned long address,
+			    unsigned int *level)
+{
+	return lookup_address_in_pgd(pgd_offset(mm, address), address, level);
+}
+EXPORT_SYMBOL_GPL(lookup_address_in_mm);
+
 static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address,
 				  unsigned int *level)
 {
-9
include/linux/context_tracking.h
···
 }
 #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
 
-static inline void guest_enter(void)
-{
-	unsigned long flags;
-
-	local_irq_save(flags);
-	guest_enter_irqoff();
-	local_irq_restore(flags);
-}
-
 static inline void guest_exit(void)
 {
 	unsigned long flags;
+6
include/linux/huge_mm.h
···
 
 extern void prep_transhuge_page(struct page *page);
 extern void free_transhuge_page(struct page *page);
+bool is_transparent_hugepage(struct page *page);
 
 bool can_split_huge_page(struct page *page, int *pextra_pins);
 int split_huge_page_to_list(struct page *page, struct list_head *list);
···
 }
 
 static inline void prep_transhuge_page(struct page *page) {}
+
+static inline bool is_transparent_hugepage(struct page *page)
+{
+	return false;
+}
 
 #define transparent_hugepage_flags 0UL
 
+21 -19
include/linux/kvm_host.h
···
 #define KVM_USERSPACE_IRQ_SOURCE_ID		0
 #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID	1
 
-extern struct kmem_cache *kvm_vcpu_cache;
-
 extern struct mutex kvm_lock;
 extern struct list_head vm_list;
···
 	struct list_head queue;
 	struct kvm_vcpu *vcpu;
 	struct mm_struct *mm;
-	gva_t gva;
+	gpa_t cr2_or_gpa;
 	unsigned long addr;
 	struct kvm_arch_async_pf arch;
 	bool wakeup_all;
···
 
 void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu);
 void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu);
-int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, unsigned long hva,
-		       struct kvm_arch_async_pf *arch);
+int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+		       unsigned long hva, struct kvm_arch_async_pf *arch);
 int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
 #endif
···
 	     memslot < slots->memslots + KVM_MEM_SLOTS_NUM && memslot->npages;\
 	     memslot++)
 
-int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id);
-void kvm_vcpu_uninit(struct kvm_vcpu *vcpu);
+void kvm_vcpu_destroy(struct kvm_vcpu *vcpu);
 
 void vcpu_load(struct kvm_vcpu *vcpu);
 void vcpu_put(struct kvm_vcpu *vcpu);
···
 void kvm_set_pfn_accessed(kvm_pfn_t pfn);
 void kvm_get_pfn(kvm_pfn_t pfn);
 
+void kvm_release_pfn(kvm_pfn_t pfn, bool dirty, struct gfn_to_pfn_cache *cache);
 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
 			int len);
-int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
-			  unsigned long len);
 int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len);
 int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
 			  void *data, unsigned long len);
···
 int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len);
 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn);
 bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn);
-unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn);
+unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn);
 void mark_page_dirty(struct kvm *kvm, gfn_t gfn);
 
 struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu *vcpu);
···
 kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn);
 kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn);
 int kvm_vcpu_map(struct kvm_vcpu *vcpu, gpa_t gpa, struct kvm_host_map *map);
+int kvm_map_gfn(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map,
+		struct gfn_to_pfn_cache *cache, bool atomic);
 struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn);
 void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty);
+int kvm_unmap_gfn(struct kvm_vcpu *vcpu, struct kvm_host_map *map,
+		  struct gfn_to_pfn_cache *cache, bool dirty, bool atomic);
 unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn);
 unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable);
 int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, int offset,
···
 int kvm_arch_init(void *opaque);
 void kvm_arch_exit(void);
 
-int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu);
-void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu);
-
 void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu);
 
-void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu);
 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu);
-struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id);
-int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu);
+int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id);
+int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu);
 void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu);
 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu);
···
 void kvm_arch_sync_events(struct kvm *kvm);
 
 int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu);
-void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
 
 bool kvm_is_reserved_pfn(kvm_pfn_t pfn);
 bool kvm_is_zone_device_pfn(kvm_pfn_t pfn);
+bool kvm_is_transparent_hugepage(kvm_pfn_t pfn);
 
 struct kvm_irq_ack_notifier {
 	struct hlist_node link;
···
 };
 
 struct kvm_stat_data {
-	int offset;
-	int mode;
 	struct kvm *kvm;
+	struct kvm_stats_debugfs_item *dbgfs_item;
 };
 
 struct kvm_stats_debugfs_item {
···
 	enum kvm_stat_kind kind;
 	int mode;
 };
+
+#define KVM_DBGFS_GET_MODE(dbgfs_item) \
+	((dbgfs_item)->mode ? (dbgfs_item)->mode : 0644)
+
 extern struct kvm_stats_debugfs_item debugfs_entries[];
 extern struct dentry *kvm_debugfs_dir;
···
 {
 }
 #endif /* CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */
+
+struct kvm_vcpu *kvm_get_running_vcpu(void);
+struct kvm_vcpu __percpu **kvm_get_running_vcpus(void);
 
 #ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
 bool kvm_arch_has_irq_bypass(void);
+8 -1
include/linux/kvm_types.h
··· 18 18 19 19 enum kvm_mr_change; 20 20 21 - #include <asm/types.h> 21 + #include <linux/types.h> 22 22 23 23 /* 24 24 * Address types: ··· 49 49 unsigned long hva; 50 50 unsigned long len; 51 51 struct kvm_memory_slot *memslot; 52 + }; 53 + 54 + struct gfn_to_pfn_cache { 55 + u64 generation; 56 + gfn_t gfn; 57 + kvm_pfn_t pfn; 58 + bool dirty; 52 59 }; 53 60 54 61 #endif /* __KVM_TYPES_H__ */
+11
mm/huge_memory.c
··· 527 527 set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR); 528 528 } 529 529 530 + bool is_transparent_hugepage(struct page *page) 531 + { 532 + if (!PageCompound(page)) 533 + return 0; 534 + 535 + page = compound_head(page); 536 + return is_huge_zero_page(page) || 537 + page[1].compound_dtor == TRANSHUGE_PAGE_DTOR; 538 + } 539 + EXPORT_SYMBOL_GPL(is_transparent_hugepage); 540 + 530 541 static unsigned long __thp_get_unmapped_area(struct file *filp, 531 542 unsigned long addr, unsigned long len, 532 543 loff_t off, unsigned long flags, unsigned long size)
+2 -2
tools/arch/x86/include/uapi/asm/vmx.h
··· 33 33 #define EXIT_REASON_TRIPLE_FAULT 2 34 34 #define EXIT_REASON_INIT_SIGNAL 3 35 35 36 - #define EXIT_REASON_PENDING_INTERRUPT 7 36 + #define EXIT_REASON_INTERRUPT_WINDOW 7 37 37 #define EXIT_REASON_NMI_WINDOW 8 38 38 #define EXIT_REASON_TASK_SWITCH 9 39 39 #define EXIT_REASON_CPUID 10 ··· 94 94 { EXIT_REASON_EXTERNAL_INTERRUPT, "EXTERNAL_INTERRUPT" }, \ 95 95 { EXIT_REASON_TRIPLE_FAULT, "TRIPLE_FAULT" }, \ 96 96 { EXIT_REASON_INIT_SIGNAL, "INIT_SIGNAL" }, \ 97 - { EXIT_REASON_PENDING_INTERRUPT, "PENDING_INTERRUPT" }, \ 97 + { EXIT_REASON_INTERRUPT_WINDOW, "INTERRUPT_WINDOW" }, \ 98 98 { EXIT_REASON_NMI_WINDOW, "NMI_WINDOW" }, \ 99 99 { EXIT_REASON_TASK_SWITCH, "TASK_SWITCH" }, \ 100 100 { EXIT_REASON_CPUID, "CPUID" }, \
+6 -2
tools/kvm/kvm_stat/kvm_stat
··· 270 270 def __init__(self, exit_reasons): 271 271 self.sc_perf_evt_open = 298 272 272 self.ioctl_numbers = IOCTL_NUMBERS 273 + self.exit_reason_field = 'exit_reason' 273 274 self.exit_reasons = exit_reasons 274 275 275 276 def debugfs_is_child(self, field): ··· 290 289 # numbers depend on the wordsize. 291 290 char_ptr_size = ctypes.sizeof(ctypes.c_char_p) 292 291 self.ioctl_numbers['SET_FILTER'] = 0x80002406 | char_ptr_size << 16 292 + self.exit_reason_field = 'exit_nr' 293 293 self.exit_reasons = {} 294 294 295 295 def debugfs_is_child(self, field): ··· 302 300 def __init__(self): 303 301 self.sc_perf_evt_open = 241 304 302 self.ioctl_numbers = IOCTL_NUMBERS 303 + self.exit_reason_field = 'esr_ec' 305 304 self.exit_reasons = AARCH64_EXIT_REASONS 306 305 307 306 def debugfs_is_child(self, field): ··· 314 311 def __init__(self): 315 312 self.sc_perf_evt_open = 331 316 313 self.ioctl_numbers = IOCTL_NUMBERS 314 + self.exit_reason_field = None 317 315 self.exit_reasons = None 318 316 319 317 def debugfs_is_child(self, field): ··· 545 541 """ 546 542 filters = {} 547 543 filters['kvm_userspace_exit'] = ('reason', USERSPACE_EXIT_REASONS) 548 - if ARCH.exit_reasons: 549 - filters['kvm_exit'] = ('exit_reason', ARCH.exit_reasons) 544 + if ARCH.exit_reason_field and ARCH.exit_reasons: 545 + filters['kvm_exit'] = (ARCH.exit_reason_field, ARCH.exit_reasons) 550 546 return filters 551 547 552 548 def _get_available_fields(self):
+4 -4
tools/testing/selftests/kvm/include/x86_64/vmx.h
··· 18 18 /* 19 19 * Definitions of Primary Processor-Based VM-Execution Controls. 20 20 */ 21 - #define CPU_BASED_VIRTUAL_INTR_PENDING 0x00000004 22 - #define CPU_BASED_USE_TSC_OFFSETING 0x00000008 21 + #define CPU_BASED_INTR_WINDOW_EXITING 0x00000004 22 + #define CPU_BASED_USE_TSC_OFFSETTING 0x00000008 23 23 #define CPU_BASED_HLT_EXITING 0x00000080 24 24 #define CPU_BASED_INVLPG_EXITING 0x00000200 25 25 #define CPU_BASED_MWAIT_EXITING 0x00000400 ··· 30 30 #define CPU_BASED_CR8_LOAD_EXITING 0x00080000 31 31 #define CPU_BASED_CR8_STORE_EXITING 0x00100000 32 32 #define CPU_BASED_TPR_SHADOW 0x00200000 33 - #define CPU_BASED_VIRTUAL_NMI_PENDING 0x00400000 33 + #define CPU_BASED_NMI_WINDOW_EXITING 0x00400000 34 34 #define CPU_BASED_MOV_DR_EXITING 0x00800000 35 35 #define CPU_BASED_UNCOND_IO_EXITING 0x01000000 36 36 #define CPU_BASED_USE_IO_BITMAPS 0x02000000 ··· 103 103 #define EXIT_REASON_EXCEPTION_NMI 0 104 104 #define EXIT_REASON_EXTERNAL_INTERRUPT 1 105 105 #define EXIT_REASON_TRIPLE_FAULT 2 106 - #define EXIT_REASON_PENDING_INTERRUPT 7 106 + #define EXIT_REASON_INTERRUPT_WINDOW 7 107 107 #define EXIT_REASON_NMI_WINDOW 8 108 108 #define EXIT_REASON_TASK_SWITCH 9 109 109 #define EXIT_REASON_CPUID 10
+1 -1
tools/testing/selftests/kvm/x86_64/vmx_tsc_adjust_test.c
··· 98 98 prepare_vmcs(vmx_pages, l2_guest_code, 99 99 &l2_guest_stack[L2_GUEST_STACK_SIZE]); 100 100 control = vmreadz(CPU_BASED_VM_EXEC_CONTROL); 101 - control |= CPU_BASED_USE_MSR_BITMAPS | CPU_BASED_USE_TSC_OFFSETING; 101 + control |= CPU_BASED_USE_MSR_BITMAPS | CPU_BASED_USE_TSC_OFFSETTING; 102 102 vmwrite(CPU_BASED_VM_EXEC_CONTROL, control); 103 103 vmwrite(TSC_OFFSET, TSC_OFFSET_VALUE); 104 104
+114 -17
virt/kvm/arm/aarch32.c
··· 10 10 * Author: Christoffer Dall <c.dall@virtualopensystems.com> 11 11 */ 12 12 13 + #include <linux/bits.h> 13 14 #include <linux/kvm_host.h> 14 15 #include <asm/kvm_emulate.h> 15 16 #include <asm/kvm_hyp.h> 17 + 18 + #define DFSR_FSC_EXTABT_LPAE 0x10 19 + #define DFSR_FSC_EXTABT_nLPAE 0x08 20 + #define DFSR_LPAE BIT(9) 16 21 17 22 /* 18 23 * Table taken from ARMv8 ARM DDI0487B-B, table G1-10. ··· 33 28 [7] = { 4, 4 }, /* FIQ, unused */ 34 29 }; 35 30 31 + /* 32 + * When an exception is taken, most CPSR fields are left unchanged in the 33 + * handler. However, some are explicitly overridden (e.g. M[4:0]). 34 + * 35 + * The SPSR/SPSR_ELx layouts differ, and the below is intended to work with 36 + * either format. Note: SPSR.J bit doesn't exist in SPSR_ELx, but this bit was 37 + * obsoleted by the ARMv7 virtualization extensions and is RES0. 38 + * 39 + * For the SPSR layout seen from AArch32, see: 40 + * - ARM DDI 0406C.d, page B1-1148 41 + * - ARM DDI 0487E.a, page G8-6264 42 + * 43 + * For the SPSR_ELx layout for AArch32 seen from AArch64, see: 44 + * - ARM DDI 0487E.a, page C5-426 45 + * 46 + * Here we manipulate the fields in order of the AArch32 SPSR_ELx layout, from 47 + * MSB to LSB. 
48 + */ 49 + static unsigned long get_except32_cpsr(struct kvm_vcpu *vcpu, u32 mode) 50 + { 51 + u32 sctlr = vcpu_cp15(vcpu, c1_SCTLR); 52 + unsigned long old, new; 53 + 54 + old = *vcpu_cpsr(vcpu); 55 + new = 0; 56 + 57 + new |= (old & PSR_AA32_N_BIT); 58 + new |= (old & PSR_AA32_Z_BIT); 59 + new |= (old & PSR_AA32_C_BIT); 60 + new |= (old & PSR_AA32_V_BIT); 61 + new |= (old & PSR_AA32_Q_BIT); 62 + 63 + // CPSR.IT[7:0] are set to zero upon any exception 64 + // See ARM DDI 0487E.a, section G1.12.3 65 + // See ARM DDI 0406C.d, section B1.8.3 66 + 67 + new |= (old & PSR_AA32_DIT_BIT); 68 + 69 + // CPSR.SSBS is set to SCTLR.DSSBS upon any exception 70 + // See ARM DDI 0487E.a, page G8-6244 71 + if (sctlr & BIT(31)) 72 + new |= PSR_AA32_SSBS_BIT; 73 + 74 + // CPSR.PAN is unchanged unless SCTLR.SPAN == 0b0 75 + // SCTLR.SPAN is RES1 when ARMv8.1-PAN is not implemented 76 + // See ARM DDI 0487E.a, page G8-6246 77 + new |= (old & PSR_AA32_PAN_BIT); 78 + if (!(sctlr & BIT(23))) 79 + new |= PSR_AA32_PAN_BIT; 80 + 81 + // SS does not exist in AArch32, so ignore 82 + 83 + // CPSR.IL is set to zero upon any exception 84 + // See ARM DDI 0487E.a, page G1-5527 85 + 86 + new |= (old & PSR_AA32_GE_MASK); 87 + 88 + // CPSR.IT[7:0] are set to zero upon any exception 89 + // See prior comment above 90 + 91 + // CPSR.E is set to SCTLR.EE upon any exception 92 + // See ARM DDI 0487E.a, page G8-6245 93 + // See ARM DDI 0406C.d, page B4-1701 94 + if (sctlr & BIT(25)) 95 + new |= PSR_AA32_E_BIT; 96 + 97 + // CPSR.A is unchanged upon an exception to Undefined, Supervisor 98 + // CPSR.A is set upon an exception to other modes 99 + // See ARM DDI 0487E.a, pages G1-5515 to G1-5516 100 + // See ARM DDI 0406C.d, page B1-1182 101 + new |= (old & PSR_AA32_A_BIT); 102 + if (mode != PSR_AA32_MODE_UND && mode != PSR_AA32_MODE_SVC) 103 + new |= PSR_AA32_A_BIT; 104 + 105 + // CPSR.I is set upon any exception 106 + // See ARM DDI 0487E.a, pages G1-5515 to G1-5516 107 + // See ARM DDI 0406C.d, page 
B1-1182 108 + new |= PSR_AA32_I_BIT; 109 + 110 + // CPSR.F is set upon an exception to FIQ 111 + // CPSR.F is unchanged upon an exception to other modes 112 + // See ARM DDI 0487E.a, pages G1-5515 to G1-5516 113 + // See ARM DDI 0406C.d, page B1-1182 114 + new |= (old & PSR_AA32_F_BIT); 115 + if (mode == PSR_AA32_MODE_FIQ) 116 + new |= PSR_AA32_F_BIT; 117 + 118 + // CPSR.T is set to SCTLR.TE upon any exception 119 + // See ARM DDI 0487E.a, page G8-5514 120 + // See ARM DDI 0406C.d, page B1-1181 121 + if (sctlr & BIT(30)) 122 + new |= PSR_AA32_T_BIT; 123 + 124 + new |= mode; 125 + 126 + return new; 127 + } 128 + 36 129 static void prepare_fault32(struct kvm_vcpu *vcpu, u32 mode, u32 vect_offset) 37 130 { 38 - unsigned long cpsr; 39 - unsigned long new_spsr_value = *vcpu_cpsr(vcpu); 40 - bool is_thumb = (new_spsr_value & PSR_AA32_T_BIT); 131 + unsigned long spsr = *vcpu_cpsr(vcpu); 132 + bool is_thumb = (spsr & PSR_AA32_T_BIT); 41 133 u32 return_offset = return_offsets[vect_offset >> 2][is_thumb]; 42 134 u32 sctlr = vcpu_cp15(vcpu, c1_SCTLR); 43 135 44 - cpsr = mode | PSR_AA32_I_BIT; 45 - 46 - if (sctlr & (1 << 30)) 47 - cpsr |= PSR_AA32_T_BIT; 48 - if (sctlr & (1 << 25)) 49 - cpsr |= PSR_AA32_E_BIT; 50 - 51 - *vcpu_cpsr(vcpu) = cpsr; 136 + *vcpu_cpsr(vcpu) = get_except32_cpsr(vcpu, mode); 52 137 53 138 /* Note: These now point to the banked copies */ 54 - vcpu_write_spsr(vcpu, new_spsr_value); 139 + vcpu_write_spsr(vcpu, host_spsr_to_spsr32(spsr)); 55 140 *vcpu_reg32(vcpu, 14) = *vcpu_pc(vcpu) + return_offset; 56 141 57 142 /* Branch to exception vector */ ··· 179 84 fsr = &vcpu_cp15(vcpu, c5_DFSR); 180 85 } 181 86 182 - prepare_fault32(vcpu, PSR_AA32_MODE_ABT | PSR_AA32_A_BIT, vect_offset); 87 + prepare_fault32(vcpu, PSR_AA32_MODE_ABT, vect_offset); 183 88 184 89 *far = addr; 185 90 186 91 /* Give the guest an IMPLEMENTATION DEFINED exception */ 187 92 is_lpae = (vcpu_cp15(vcpu, c2_TTBCR) >> 31); 188 - if (is_lpae) 189 - *fsr = 1 << 9 | 0x34; 190 - else 191 - *fsr 
= 0x14; 93 + if (is_lpae) { 94 + *fsr = DFSR_LPAE | DFSR_FSC_EXTABT_LPAE; 95 + } else { 96 + /* no need to shuffle FS[4] into DFSR[10] as its 0 */ 97 + *fsr = DFSR_FSC_EXTABT_nLPAE; 98 + } 192 99 } 193 100 194 101 void kvm_inject_dabt32(struct kvm_vcpu *vcpu, unsigned long addr)
+3 -2
virt/kvm/arm/arch_timer.c
··· 805 805 switch (treg) { 806 806 case TIMER_REG_TVAL: 807 807 val = timer->cnt_cval - kvm_phys_timer_read() + timer->cntvoff; 808 + val &= lower_32_bits(val); 808 809 break; 809 810 810 811 case TIMER_REG_CTL: ··· 851 850 { 852 851 switch (treg) { 853 852 case TIMER_REG_TVAL: 854 - timer->cnt_cval = kvm_phys_timer_read() - timer->cntvoff + val; 853 + timer->cnt_cval = kvm_phys_timer_read() - timer->cntvoff + (s32)val; 855 854 break; 856 855 857 856 case TIMER_REG_CTL: ··· 1023 1022 1024 1023 bool kvm_arch_timer_get_input_level(int vintid) 1025 1024 { 1026 - struct kvm_vcpu *vcpu = kvm_arm_get_running_vcpu(); 1025 + struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); 1027 1026 struct arch_timer_context *timer; 1028 1027 1029 1028 if (vintid == vcpu_vtimer(vcpu)->irq.irq)
+29 -86
virt/kvm/arm/arm.c
··· 20 20 #include <linux/irqbypass.h> 21 21 #include <linux/sched/stat.h> 22 22 #include <trace/events/kvm.h> 23 - #include <kvm/arm_pmu.h> 24 - #include <kvm/arm_psci.h> 25 23 26 24 #define CREATE_TRACE_POINTS 27 25 #include "trace.h" ··· 49 51 DEFINE_PER_CPU(kvm_host_data_t, kvm_host_data); 50 52 static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page); 51 53 52 - /* Per-CPU variable containing the currently running vcpu. */ 53 - static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_arm_running_vcpu); 54 - 55 54 /* The VMID used in the VTTBR */ 56 55 static atomic64_t kvm_vmid_gen = ATOMIC64_INIT(1); 57 56 static u32 kvm_next_vmid; ··· 57 62 static bool vgic_present; 58 63 59 64 static DEFINE_PER_CPU(unsigned char, kvm_arm_hardware_enabled); 60 - 61 - static void kvm_arm_set_running_vcpu(struct kvm_vcpu *vcpu) 62 - { 63 - __this_cpu_write(kvm_arm_running_vcpu, vcpu); 64 - } 65 - 66 65 DEFINE_STATIC_KEY_FALSE(userspace_irqchip_in_use); 67 - 68 - /** 69 - * kvm_arm_get_running_vcpu - get the vcpu running on the current CPU. 70 - * Must be called from non-preemptible context 71 - */ 72 - struct kvm_vcpu *kvm_arm_get_running_vcpu(void) 73 - { 74 - return __this_cpu_read(kvm_arm_running_vcpu); 75 - } 76 - 77 - /** 78 - * kvm_arm_get_running_vcpus - get the per-CPU array of currently running vcpus. 
79 - */ 80 - struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void) 81 - { 82 - return &kvm_arm_running_vcpu; 83 - } 84 66 85 67 int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) 86 68 { ··· 166 194 167 195 for (i = 0; i < KVM_MAX_VCPUS; ++i) { 168 196 if (kvm->vcpus[i]) { 169 - kvm_arch_vcpu_free(kvm->vcpus[i]); 197 + kvm_vcpu_destroy(kvm->vcpus[i]); 170 198 kvm->vcpus[i] = NULL; 171 199 } 172 200 } ··· 251 279 vfree(kvm); 252 280 } 253 281 254 - struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) 282 + int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) 283 + { 284 + if (irqchip_in_kernel(kvm) && vgic_initialized(kvm)) 285 + return -EBUSY; 286 + 287 + if (id >= kvm->arch.max_vcpus) 288 + return -EINVAL; 289 + 290 + return 0; 291 + } 292 + 293 + int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) 255 294 { 256 295 int err; 257 - struct kvm_vcpu *vcpu; 258 296 259 - if (irqchip_in_kernel(kvm) && vgic_initialized(kvm)) { 260 - err = -EBUSY; 261 - goto out; 262 - } 297 + /* Force users to call KVM_ARM_VCPU_INIT */ 298 + vcpu->arch.target = -1; 299 + bitmap_zero(vcpu->arch.features, KVM_VCPU_MAX_FEATURES); 263 300 264 - if (id >= kvm->arch.max_vcpus) { 265 - err = -EINVAL; 266 - goto out; 267 - } 301 + /* Set up the timer */ 302 + kvm_timer_vcpu_init(vcpu); 268 303 269 - vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); 270 - if (!vcpu) { 271 - err = -ENOMEM; 272 - goto out; 273 - } 304 + kvm_pmu_vcpu_init(vcpu); 274 305 275 - err = kvm_vcpu_init(vcpu, kvm, id); 306 + kvm_arm_reset_debug_ptr(vcpu); 307 + 308 + kvm_arm_pvtime_vcpu_init(&vcpu->arch); 309 + 310 + err = kvm_vgic_vcpu_init(vcpu); 276 311 if (err) 277 - goto free_vcpu; 312 + return err; 278 313 279 - err = create_hyp_mappings(vcpu, vcpu + 1, PAGE_HYP); 280 - if (err) 281 - goto vcpu_uninit; 282 - 283 - return vcpu; 284 - vcpu_uninit: 285 - kvm_vcpu_uninit(vcpu); 286 - free_vcpu: 287 - kmem_cache_free(kvm_vcpu_cache, vcpu); 288 - out: 289 - return ERR_PTR(err); 314 + return 
create_hyp_mappings(vcpu, vcpu + 1, PAGE_HYP); 290 315 } 291 316 292 317 void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) 293 318 { 294 319 } 295 320 296 - void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) 321 + void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) 297 322 { 298 323 if (vcpu->arch.has_run_once && unlikely(!irqchip_in_kernel(vcpu->kvm))) 299 324 static_branch_dec(&userspace_irqchip_in_use); ··· 298 329 kvm_mmu_free_memory_caches(vcpu); 299 330 kvm_timer_vcpu_terminate(vcpu); 300 331 kvm_pmu_vcpu_destroy(vcpu); 301 - kvm_vcpu_uninit(vcpu); 302 - kmem_cache_free(kvm_vcpu_cache, vcpu); 303 - } 304 332 305 - void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) 306 - { 307 - kvm_arch_vcpu_free(vcpu); 333 + kvm_arm_vcpu_destroy(vcpu); 308 334 } 309 335 310 336 int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) ··· 332 368 preempt_enable(); 333 369 } 334 370 335 - int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) 336 - { 337 - /* Force users to call KVM_ARM_VCPU_INIT */ 338 - vcpu->arch.target = -1; 339 - bitmap_zero(vcpu->arch.features, KVM_VCPU_MAX_FEATURES); 340 - 341 - /* Set up the timer */ 342 - kvm_timer_vcpu_init(vcpu); 343 - 344 - kvm_pmu_vcpu_init(vcpu); 345 - 346 - kvm_arm_reset_debug_ptr(vcpu); 347 - 348 - kvm_arm_pvtime_vcpu_init(&vcpu->arch); 349 - 350 - return kvm_vgic_vcpu_init(vcpu); 351 - } 352 - 353 371 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 354 372 { 355 373 int *last_ran; ··· 352 406 vcpu->cpu = cpu; 353 407 vcpu->arch.host_cpu_context = &cpu_data->host_ctxt; 354 408 355 - kvm_arm_set_running_vcpu(vcpu); 356 409 kvm_vgic_load(vcpu); 357 410 kvm_timer_vcpu_load(vcpu); 358 411 kvm_vcpu_load_sysregs(vcpu); ··· 377 432 kvm_vcpu_pmu_restore_host(vcpu); 378 433 379 434 vcpu->cpu = -1; 380 - 381 - kvm_arm_set_running_vcpu(NULL); 382 435 } 383 436 384 437 static void vcpu_power_off(struct kvm_vcpu *vcpu) ··· 1480 1537 free_hyp_pgds(); 1481 1538 for_each_possible_cpu(cpu) 1482 1539 free_page(per_cpu(kvm_arm_hyp_stack_page, cpu)); 1483 
- hyp_cpu_pm_exit(); 1484 1540 } 1485 1541 1486 1542 /** ··· 1693 1751 return 0; 1694 1752 1695 1753 out_hyp: 1754 + hyp_cpu_pm_exit(); 1696 1755 if (!in_hyp_mode) 1697 1756 teardown_hyp_mode(); 1698 1757 out_err:
+24 -44
virt/kvm/arm/mmio.c
··· 5 5 */ 6 6 7 7 #include <linux/kvm_host.h> 8 - #include <asm/kvm_mmio.h> 9 8 #include <asm/kvm_emulate.h> 10 9 #include <trace/events/kvm.h> 11 10 ··· 91 92 92 93 vcpu->mmio_needed = 0; 93 94 94 - if (!run->mmio.is_write) { 95 - len = run->mmio.len; 96 - if (len > sizeof(unsigned long)) 97 - return -EINVAL; 98 - 95 + if (!kvm_vcpu_dabt_iswrite(vcpu)) { 96 + len = kvm_vcpu_dabt_get_as(vcpu); 99 97 data = kvm_mmio_read_buf(run->mmio.data, len); 100 98 101 - if (vcpu->arch.mmio_decode.sign_extend && 99 + if (kvm_vcpu_dabt_issext(vcpu) && 102 100 len < sizeof(unsigned long)) { 103 101 mask = 1U << ((len * 8) - 1); 104 102 data = (data ^ mask) - mask; 105 103 } 106 104 105 + if (!kvm_vcpu_dabt_issf(vcpu)) 106 + data = data & 0xffffffff; 107 + 107 108 trace_kvm_mmio(KVM_TRACE_MMIO_READ, len, run->mmio.phys_addr, 108 109 &data); 109 110 data = vcpu_data_host_to_guest(vcpu, data, len); 110 - vcpu_set_reg(vcpu, vcpu->arch.mmio_decode.rt, data); 111 + vcpu_set_reg(vcpu, kvm_vcpu_dabt_get_rd(vcpu), data); 111 112 } 112 113 113 114 /* ··· 115 116 * in the guest. 
116 117 */ 117 118 kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); 118 - 119 - return 0; 120 - } 121 - 122 - static int decode_hsr(struct kvm_vcpu *vcpu, bool *is_write, int *len) 123 - { 124 - unsigned long rt; 125 - int access_size; 126 - bool sign_extend; 127 - 128 - if (kvm_vcpu_dabt_iss1tw(vcpu)) { 129 - /* page table accesses IO mem: tell guest to fix its TTBR */ 130 - kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu)); 131 - return 1; 132 - } 133 - 134 - access_size = kvm_vcpu_dabt_get_as(vcpu); 135 - if (unlikely(access_size < 0)) 136 - return access_size; 137 - 138 - *is_write = kvm_vcpu_dabt_iswrite(vcpu); 139 - sign_extend = kvm_vcpu_dabt_issext(vcpu); 140 - rt = kvm_vcpu_dabt_get_rd(vcpu); 141 - 142 - *len = access_size; 143 - vcpu->arch.mmio_decode.sign_extend = sign_extend; 144 - vcpu->arch.mmio_decode.rt = rt; 145 119 146 120 return 0; 147 121 } ··· 130 158 u8 data_buf[8]; 131 159 132 160 /* 133 - * Prepare MMIO operation. First decode the syndrome data we get 134 - * from the CPU. Then try if some in-kernel emulation feels 135 - * responsible, otherwise let user space do its magic. 161 + * No valid syndrome? Ask userspace for help if it has 162 + * volunteered to do so, and bail out otherwise. 136 163 */ 137 - if (kvm_vcpu_dabt_isvalid(vcpu)) { 138 - ret = decode_hsr(vcpu, &is_write, &len); 139 - if (ret) 140 - return ret; 141 - } else { 164 + if (!kvm_vcpu_dabt_isvalid(vcpu)) { 142 165 if (vcpu->kvm->arch.return_nisv_io_abort_to_user) { 143 166 run->exit_reason = KVM_EXIT_ARM_NISV; 144 167 run->arm_nisv.esr_iss = kvm_vcpu_dabt_iss_nisv_sanitized(vcpu); ··· 145 178 return -ENOSYS; 146 179 }
Then try if some in-kernel emulation feels 190 + * responsible, otherwise let user space do its magic. 191 + */ 192 + is_write = kvm_vcpu_dabt_iswrite(vcpu); 193 + len = kvm_vcpu_dabt_get_as(vcpu); 194 + rt = kvm_vcpu_dabt_get_rd(vcpu); 149 195 150 196 if (is_write) { 151 197 data = vcpu_data_guest_to_host(vcpu, vcpu_get_reg(vcpu, rt),
+12 -20
virt/kvm/arm/mmu.c
··· 14 14 #include <asm/cacheflush.h> 15 15 #include <asm/kvm_arm.h> 16 16 #include <asm/kvm_mmu.h> 17 - #include <asm/kvm_mmio.h> 18 17 #include <asm/kvm_ras.h> 19 18 #include <asm/kvm_asm.h> 20 19 #include <asm/kvm_emulate.h> ··· 1376 1377 { 1377 1378 kvm_pfn_t pfn = *pfnp; 1378 1379 gfn_t gfn = *ipap >> PAGE_SHIFT; 1379 - struct page *page = pfn_to_page(pfn); 1380 1380 1381 - /* 1382 - * PageTransCompoundMap() returns true for THP and 1383 - * hugetlbfs. Make sure the adjustment is done only for THP 1384 - * pages. 1385 - */ 1386 - if (!PageHuge(page) && PageTransCompoundMap(page)) { 1381 + if (kvm_is_transparent_hugepage(pfn)) { 1387 1382 unsigned long mask; 1388 1383 /* 1389 1384 * The address we faulted on is backed by a transparent huge ··· 1589 1596 __invalidate_icache_guest_page(pfn, size); 1590 1597 } 1591 1598 1592 - static void kvm_send_hwpoison_signal(unsigned long address, 1593 - struct vm_area_struct *vma) 1599 + static void kvm_send_hwpoison_signal(unsigned long address, short lsb) 1594 1600 { 1595 - short lsb; 1596 - 1597 - if (is_vm_hugetlb_page(vma)) 1598 - lsb = huge_page_shift(hstate_vma(vma)); 1599 - else 1600 - lsb = PAGE_SHIFT; 1601 - 1602 1601 send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current); 1603 1602 } 1604 1603 ··· 1663 1678 struct kvm *kvm = vcpu->kvm; 1664 1679 struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache; 1665 1680 struct vm_area_struct *vma; 1681 + short vma_shift; 1666 1682 kvm_pfn_t pfn; 1667 1683 pgprot_t mem_type = PAGE_S2; 1668 1684 bool logging_active = memslot_is_logging(memslot); ··· 1687 1701 return -EFAULT; 1688 1702 } 1689 1703 1690 - vma_pagesize = vma_kernel_pagesize(vma); 1704 + if (is_vm_hugetlb_page(vma)) 1705 + vma_shift = huge_page_shift(hstate_vma(vma)); 1706 + else 1707 + vma_shift = PAGE_SHIFT; 1708 + 1709 + vma_pagesize = 1ULL << vma_shift; 1691 1710 if (logging_active || 1692 1711 (vma->vm_flags & VM_PFNMAP) || 1693 1712 !fault_supports_stage2_huge_mapping(memslot, hva, 
vma_pagesize)) { ··· 1732 1741 1733 1742 pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable); 1734 1743 if (pfn == KVM_PFN_ERR_HWPOISON) { 1735 - kvm_send_hwpoison_signal(hva, vma); 1744 + kvm_send_hwpoison_signal(hva, vma_shift); 1736 1745 return 0; 1737 1746 } 1738 1747 if (is_error_noslot_pfn(pfn)) ··· 2138 2147 if (!kvm->arch.pgd) 2139 2148 return 0; 2140 2149 trace_kvm_test_age_hva(hva); 2141 - return handle_hva_to_gpa(kvm, hva, hva, kvm_test_age_hva_handler, NULL); 2150 + return handle_hva_to_gpa(kvm, hva, hva + PAGE_SIZE, 2151 + kvm_test_age_hva_handler, NULL); 2142 2152 } 2143 2153 2144 2154 void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu)
+3 -3
virt/kvm/arm/perf.c
··· 13 13 14 14 static int kvm_is_in_guest(void) 15 15 { 16 - return kvm_arm_get_running_vcpu() != NULL; 16 + return kvm_get_running_vcpu() != NULL; 17 17 } 18 18 19 19 static int kvm_is_user_mode(void) 20 20 { 21 21 struct kvm_vcpu *vcpu; 22 22 23 - vcpu = kvm_arm_get_running_vcpu(); 23 + vcpu = kvm_get_running_vcpu(); 24 24 25 25 if (vcpu) 26 26 return !vcpu_mode_priv(vcpu); ··· 32 32 { 33 33 struct kvm_vcpu *vcpu; 34 34 35 - vcpu = kvm_arm_get_running_vcpu(); 35 + vcpu = kvm_get_running_vcpu(); 36 36 37 37 if (vcpu) 38 38 return *vcpu_pc(vcpu);
+69 -45
virt/kvm/arm/pmu.c
··· 15 15 #include <kvm/arm_vgic.h> 16 16 17 17 static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx); 18 + static void kvm_pmu_update_pmc_chained(struct kvm_vcpu *vcpu, u64 select_idx); 19 + static void kvm_pmu_stop_counter(struct kvm_vcpu *vcpu, struct kvm_pmc *pmc); 18 20 19 21 #define PERF_ATTR_CFG1_KVM_PMU_CHAINED 0x1 20 22 ··· 76 74 return pmc - 1; 77 75 78 76 return pmc; 77 + } 78 + static struct kvm_pmc *kvm_pmu_get_alternate_pmc(struct kvm_pmc *pmc) 79 + { 80 + if (kvm_pmu_idx_is_high_counter(pmc->idx)) 81 + return pmc - 1; 82 + else 83 + return pmc + 1; 79 84 } 80 85 81 86 /** ··· 247 238 */ 248 239 void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu) 249 240 { 250 - int i; 241 + unsigned long mask = kvm_pmu_valid_counter_mask(vcpu); 251 242 struct kvm_pmu *pmu = &vcpu->arch.pmu; 243 + int i; 252 244 253 - for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) 245 + for_each_set_bit(i, &mask, 32) 254 246 kvm_pmu_stop_counter(vcpu, &pmu->pmc[i]); 255 247 256 248 bitmap_zero(vcpu->arch.pmu.chained, ARMV8_PMU_MAX_COUNTER_PAIRS); ··· 304 294 305 295 pmc = &pmu->pmc[i]; 306 296 307 - /* 308 - * For high counters of chained events we must recreate the 309 - * perf event with the long (64bit) attribute set. 310 - */ 311 - if (kvm_pmu_pmc_is_chained(pmc) && 312 - kvm_pmu_idx_is_high_counter(i)) { 313 - kvm_pmu_create_perf_event(vcpu, i); 314 - continue; 315 - } 297 + /* A change in the enable state may affect the chain state */ 298 + kvm_pmu_update_pmc_chained(vcpu, i); 299 + kvm_pmu_create_perf_event(vcpu, i); 316 300 317 301 /* At this point, pmc must be the canonical */ 318 302 if (pmc->perf_event) { ··· 339 335 340 336 pmc = &pmu->pmc[i]; 341 337 342 - /* 343 - * For high counters of chained events we must recreate the 344 - * perf event with the long (64bit) attribute unset. 
345 - */ 346 - if (kvm_pmu_pmc_is_chained(pmc) && 347 - kvm_pmu_idx_is_high_counter(i)) { 348 - kvm_pmu_create_perf_event(vcpu, i); 349 - continue; 350 - } 338 + /* A change in the enable state may affect the chain state */ 339 + kvm_pmu_update_pmc_chained(vcpu, i); 340 + kvm_pmu_create_perf_event(vcpu, i); 351 341 352 342 /* At this point, pmc must be the canonical */ 353 343 if (pmc->perf_event) ··· 478 480 */ 479 481 void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val) 480 482 { 483 + struct kvm_pmu *pmu = &vcpu->arch.pmu; 481 484 int i; 482 - u64 type, enable, reg; 483 485 484 - if (val == 0) 486 + if (!(__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E)) 485 487 return; 486 488 487 - enable = __vcpu_sys_reg(vcpu, PMCNTENSET_EL0); 489 + /* Weed out disabled counters */ 490 + val &= __vcpu_sys_reg(vcpu, PMCNTENSET_EL0); 491 + 488 492 for (i = 0; i < ARMV8_PMU_CYCLE_IDX; i++) { 493 + u64 type, reg; 494 + 489 495 if (!(val & BIT(i))) 490 496 continue; 491 - type = __vcpu_sys_reg(vcpu, PMEVTYPER0_EL0 + i) 492 - & ARMV8_PMU_EVTYPE_EVENT; 493 - if ((type == ARMV8_PMUV3_PERFCTR_SW_INCR) 494 - && (enable & BIT(i))) { 495 - reg = __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i) + 1; 497 + 498 + /* PMSWINC only applies to ... SW_INC! 
*/ 499 + type = __vcpu_sys_reg(vcpu, PMEVTYPER0_EL0 + i); 500 + type &= ARMV8_PMU_EVTYPE_EVENT; 501 + if (type != ARMV8_PMUV3_PERFCTR_SW_INCR) 502 + continue; 503 + 504 + /* increment this even SW_INC counter */ 505 + reg = __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i) + 1; 506 + reg = lower_32_bits(reg); 507 + __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i) = reg; 508 + 509 + if (reg) /* no overflow on the low part */ 510 + continue; 511 + 512 + if (kvm_pmu_pmc_is_chained(&pmu->pmc[i])) { 513 + /* increment the high counter */ 514 + reg = __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i + 1) + 1; 496 515 reg = lower_32_bits(reg); 497 - __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i) = reg; 498 - if (!reg) 499 - __vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(i); 516 + __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i + 1) = reg; 517 + if (!reg) /* mark overflow on the high counter */ 518 + __vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(i + 1); 519 + } else { 520 + /* mark overflow on low counter */ 521 + __vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(i); 500 522 } 501 523 } 502 524 } ··· 528 510 */ 529 511 void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val) 530 512 { 531 - u64 mask; 513 + unsigned long mask = kvm_pmu_valid_counter_mask(vcpu); 532 514 int i; 533 515 534 - mask = kvm_pmu_valid_counter_mask(vcpu); 535 516 if (val & ARMV8_PMU_PMCR_E) { 536 517 kvm_pmu_enable_counter_mask(vcpu, 537 518 __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & mask); ··· 542 525 kvm_pmu_set_counter_value(vcpu, ARMV8_PMU_CYCLE_IDX, 0); 543 526 544 527 if (val & ARMV8_PMU_PMCR_P) { 545 - for (i = 0; i < ARMV8_PMU_CYCLE_IDX; i++) 528 + for_each_set_bit(i, &mask, 32) 546 529 kvm_pmu_set_counter_value(vcpu, i, 0); 547 530 } 548 531 } ··· 599 582 600 583 counter = kvm_pmu_get_pair_counter_value(vcpu, pmc); 601 584 602 - if (kvm_pmu_idx_has_chain_evtype(vcpu, pmc->idx)) { 585 + if (kvm_pmu_pmc_is_chained(pmc)) { 603 586 /** 604 587 * The initial sample period (overflow count) of an event. 
For 605 588 * chained counters we only support overflow interrupts on the 606 589 * high counter. 607 590 */ 608 591 attr.sample_period = (-counter) & GENMASK(63, 0); 609 - if (kvm_pmu_counter_is_enabled(vcpu, pmc->idx + 1)) 610 - attr.config1 |= PERF_ATTR_CFG1_KVM_PMU_CHAINED; 592 + attr.config1 |= PERF_ATTR_CFG1_KVM_PMU_CHAINED; 611 593 612 594 event = perf_event_create_kernel_counter(&attr, -1, current, 613 595 kvm_pmu_perf_overflow, ··· 637 621 * @select_idx: The number of selected counter 638 622 * 639 623 * Update the chained bitmap based on the event type written in the 640 - * typer register. 624 + * typer register and the enable state of the odd register. 641 625 */ 642 626 static void kvm_pmu_update_pmc_chained(struct kvm_vcpu *vcpu, u64 select_idx) 643 627 { 644 628 struct kvm_pmu *pmu = &vcpu->arch.pmu; 645 - struct kvm_pmc *pmc = &pmu->pmc[select_idx]; 629 + struct kvm_pmc *pmc = &pmu->pmc[select_idx], *canonical_pmc; 630 + bool new_state, old_state; 646 631 647 - if (kvm_pmu_idx_has_chain_evtype(vcpu, pmc->idx)) { 632 + old_state = kvm_pmu_pmc_is_chained(pmc); 633 + new_state = kvm_pmu_idx_has_chain_evtype(vcpu, pmc->idx) && 634 + kvm_pmu_counter_is_enabled(vcpu, pmc->idx | 0x1); 635 + 636 + if (old_state == new_state) 637 + return; 638 + 639 + canonical_pmc = kvm_pmu_get_canonical_pmc(pmc); 640 + kvm_pmu_stop_counter(vcpu, canonical_pmc); 641 + if (new_state) { 648 642 /* 649 643 * During promotion from !chained to chained we must ensure 650 644 * the adjacent counter is stopped and its event destroyed 651 645 */ 652 - if (!kvm_pmu_pmc_is_chained(pmc)) 653 - kvm_pmu_stop_counter(vcpu, pmc); 654 - 646 + kvm_pmu_stop_counter(vcpu, kvm_pmu_get_alternate_pmc(pmc)); 655 647 set_bit(pmc->idx >> 1, vcpu->arch.pmu.chained); 656 - } else { 657 - clear_bit(pmc->idx >> 1, vcpu->arch.pmu.chained); 648 + return; 658 649 } 650 + clear_bit(pmc->idx >> 1, vcpu->arch.pmu.chained); 659 651 } 660 652 661 653 /**
+3 -3
virt/kvm/arm/vgic/vgic-its.c
··· 839 839 u32 event_id = its_cmd_get_id(its_cmd); 840 840 struct its_ite *ite; 841 841 842 - 843 842 ite = find_ite(its, device_id, event_id); 844 - if (ite && ite->collection) { 843 + if (ite && its_is_collection_mapped(ite->collection)) { 845 844 /* 846 845 * Though the spec talks about removing the pending state, we 847 846 * don't bother here since we clear the ITTE anyway and the ··· 2474 2475 target_addr = (u32)(val >> KVM_ITS_CTE_RDBASE_SHIFT); 2475 2476 coll_id = val & KVM_ITS_CTE_ICID_MASK; 2476 2477 2477 - if (target_addr >= atomic_read(&kvm->online_vcpus)) 2478 + if (target_addr != COLLECTION_NOT_MAPPED && 2479 + target_addr >= atomic_read(&kvm->online_vcpus)) 2478 2480 return -EINVAL; 2479 2481 2480 2482 collection = find_collection(its, coll_id);
+4 -1
virt/kvm/arm/vgic/vgic-mmio-v3.c
··· 414 414 gpa_t addr, unsigned int len) 415 415 { 416 416 struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; 417 + u64 value = vgic_cpu->pendbaser; 417 418 418 - return extract_bytes(vgic_cpu->pendbaser, addr & 7, len); 419 + value &= ~GICR_PENDBASER_PTZ; 420 + 421 + return extract_bytes(value, addr & 7, len); 419 422 } 420 423 421 424 static void vgic_mmio_write_pendbase(struct kvm_vcpu *vcpu,
+3 -12
virt/kvm/arm/vgic/vgic-mmio.c
··· 190 190 * value later will give us the same value as we update the per-CPU variable 191 191 * in the preempt notifier handlers. 192 192 */ 193 - static struct kvm_vcpu *vgic_get_mmio_requester_vcpu(void) 194 - { 195 - struct kvm_vcpu *vcpu; 196 - 197 - preempt_disable(); 198 - vcpu = kvm_arm_get_running_vcpu(); 199 - preempt_enable(); 200 - return vcpu; 201 - } 202 193 203 194 /* Must be called with irq->irq_lock held */ 204 195 static void vgic_hw_irq_spending(struct kvm_vcpu *vcpu, struct vgic_irq *irq, ··· 212 221 gpa_t addr, unsigned int len, 213 222 unsigned long val) 214 223 { 215 - bool is_uaccess = !vgic_get_mmio_requester_vcpu(); 224 + bool is_uaccess = !kvm_get_running_vcpu(); 216 225 u32 intid = VGIC_ADDR_TO_INTID(addr, 1); 217 226 int i; 218 227 unsigned long flags; ··· 265 274 gpa_t addr, unsigned int len, 266 275 unsigned long val) 267 276 { 268 - bool is_uaccess = !vgic_get_mmio_requester_vcpu(); 277 + bool is_uaccess = !kvm_get_running_vcpu(); 269 278 u32 intid = VGIC_ADDR_TO_INTID(addr, 1); 270 279 int i; 271 280 unsigned long flags; ··· 326 335 bool active) 327 336 { 328 337 unsigned long flags; 329 - struct kvm_vcpu *requester_vcpu = vgic_get_mmio_requester_vcpu(); 338 + struct kvm_vcpu *requester_vcpu = kvm_get_running_vcpu(); 330 339 331 340 raw_spin_lock_irqsave(&irq->irq_lock, flags); 332 341
-5
virt/kvm/arm/vgic/vgic-mmio.h
··· 98 98 .uaccess_write = uwr, \ 99 99 } 100 100 101 - int kvm_vgic_register_mmio_region(struct kvm *kvm, struct kvm_vcpu *vcpu, 102 - struct vgic_register_region *reg_desc, 103 - struct vgic_io_device *region, 104 - int nr_irqs, bool offset_private); 105 - 106 101 unsigned long vgic_data_mmio_bus_to_host(const void *val, unsigned int len); 107 102 108 103 void vgic_data_host_to_mmio_bus(void *buf, unsigned int len,
+9 -22
virt/kvm/async_pf.c
··· 17 17 #include "async_pf.h" 18 18 #include <trace/events/kvm.h> 19 19 20 - static inline void kvm_async_page_present_sync(struct kvm_vcpu *vcpu, 21 - struct kvm_async_pf *work) 22 - { 23 - #ifdef CONFIG_KVM_ASYNC_PF_SYNC 24 - kvm_arch_async_page_present(vcpu, work); 25 - #endif 26 - } 27 - static inline void kvm_async_page_present_async(struct kvm_vcpu *vcpu, 28 - struct kvm_async_pf *work) 29 - { 30 - #ifndef CONFIG_KVM_ASYNC_PF_SYNC 31 - kvm_arch_async_page_present(vcpu, work); 32 - #endif 33 - } 34 - 35 20 static struct kmem_cache *async_pf_cache; 36 21 37 22 int kvm_async_pf_init(void) ··· 49 64 struct mm_struct *mm = apf->mm; 50 65 struct kvm_vcpu *vcpu = apf->vcpu; 51 66 unsigned long addr = apf->addr; 52 - gva_t gva = apf->gva; 67 + gpa_t cr2_or_gpa = apf->cr2_or_gpa; 53 68 int locked = 1; 54 69 55 70 might_sleep(); ··· 65 80 if (locked) 66 81 up_read(&mm->mmap_sem); 67 82 68 - kvm_async_page_present_sync(vcpu, apf); 83 + if (IS_ENABLED(CONFIG_KVM_ASYNC_PF_SYNC)) 84 + kvm_arch_async_page_present(vcpu, apf); 69 85 70 86 spin_lock(&vcpu->async_pf.lock); 71 87 list_add_tail(&apf->link, &vcpu->async_pf.done); ··· 78 92 * this point 79 93 */ 80 94 81 - trace_kvm_async_pf_completed(addr, gva); 95 + trace_kvm_async_pf_completed(addr, cr2_or_gpa); 82 96 83 97 if (swq_has_sleeper(&vcpu->wq)) 84 98 swake_up_one(&vcpu->wq); ··· 143 157 spin_unlock(&vcpu->async_pf.lock); 144 158 145 159 kvm_arch_async_page_ready(vcpu, work); 146 - kvm_async_page_present_async(vcpu, work); 160 + if (!IS_ENABLED(CONFIG_KVM_ASYNC_PF_SYNC)) 161 + kvm_arch_async_page_present(vcpu, work); 147 162 148 163 list_del(&work->queue); 149 164 vcpu->async_pf.queued--; ··· 152 165 } 153 166 } 154 167 155 - int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, unsigned long hva, 156 - struct kvm_arch_async_pf *arch) 168 + int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, 169 + unsigned long hva, struct kvm_arch_async_pf *arch) 157 170 { 158 171 struct kvm_async_pf *work; 159 172 
··· 172 185 173 186 work->wakeup_all = false; 174 187 work->vcpu = vcpu; 175 - work->gva = gva; 188 + work->cr2_or_gpa = cr2_or_gpa; 176 189 work->addr = hva; 177 190 work->arch = *arch; 178 191 work->mm = current->mm;
+270 -167
virt/kvm/kvm_main.c
··· 104 104 static int kvm_usage_count; 105 105 static atomic_t hardware_enable_failed; 106 106 107 - struct kmem_cache *kvm_vcpu_cache; 108 - EXPORT_SYMBOL_GPL(kvm_vcpu_cache); 107 + static struct kmem_cache *kvm_vcpu_cache; 109 108 110 109 static __read_mostly struct preempt_ops kvm_preempt_ops; 110 + static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_running_vcpu); 111 111 112 112 struct dentry *kvm_debugfs_dir; 113 113 EXPORT_SYMBOL_GPL(kvm_debugfs_dir); 114 114 115 115 static int kvm_debugfs_num_entries; 116 - static const struct file_operations *stat_fops_per_vm[]; 116 + static const struct file_operations stat_fops_per_vm; 117 117 118 118 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, 119 119 unsigned long arg); ··· 191 191 return true; 192 192 } 193 193 194 + bool kvm_is_transparent_hugepage(kvm_pfn_t pfn) 195 + { 196 + struct page *page = pfn_to_page(pfn); 197 + 198 + if (!PageTransCompoundMap(page)) 199 + return false; 200 + 201 + return is_transparent_hugepage(compound_head(page)); 202 + } 203 + 194 204 /* 195 205 * Switches to specified vcpu, until a matching vcpu_put() 196 206 */ 197 207 void vcpu_load(struct kvm_vcpu *vcpu) 198 208 { 199 209 int cpu = get_cpu(); 210 + 211 + __this_cpu_write(kvm_running_vcpu, vcpu); 200 212 preempt_notifier_register(&vcpu->preempt_notifier); 201 213 kvm_arch_vcpu_load(vcpu, cpu); 202 214 put_cpu(); ··· 220 208 preempt_disable(); 221 209 kvm_arch_vcpu_put(vcpu); 222 210 preempt_notifier_unregister(&vcpu->preempt_notifier); 211 + __this_cpu_write(kvm_running_vcpu, NULL); 223 212 preempt_enable(); 224 213 } 225 214 EXPORT_SYMBOL_GPL(vcpu_put); ··· 335 322 kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD); 336 323 } 337 324 338 - int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) 325 + static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) 339 326 { 340 - struct page *page; 341 - int r; 342 - 343 327 mutex_init(&vcpu->mutex); 344 328 vcpu->cpu = -1; 345 329 
vcpu->kvm = kvm; ··· 348 338 vcpu->pre_pcpu = -1; 349 339 INIT_LIST_HEAD(&vcpu->blocked_vcpu_list); 350 340 351 - page = alloc_page(GFP_KERNEL | __GFP_ZERO); 352 - if (!page) { 353 - r = -ENOMEM; 354 - goto fail; 355 - } 356 - vcpu->run = page_address(page); 357 - 358 341 kvm_vcpu_set_in_spin_loop(vcpu, false); 359 342 kvm_vcpu_set_dy_eligible(vcpu, false); 360 343 vcpu->preempted = false; 361 344 vcpu->ready = false; 362 - 363 - r = kvm_arch_vcpu_init(vcpu); 364 - if (r < 0) 365 - goto fail_free_run; 366 - return 0; 367 - 368 - fail_free_run: 369 - free_page((unsigned long)vcpu->run); 370 - fail: 371 - return r; 345 + preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); 372 346 } 373 - EXPORT_SYMBOL_GPL(kvm_vcpu_init); 374 347 375 - void kvm_vcpu_uninit(struct kvm_vcpu *vcpu) 348 + void kvm_vcpu_destroy(struct kvm_vcpu *vcpu) 376 349 { 350 + kvm_arch_vcpu_destroy(vcpu); 351 + 377 352 /* 378 - * no need for rcu_read_lock as VCPU_RUN is the only place that 379 - * will change the vcpu->pid pointer and on uninit all file 380 - * descriptors are already gone. 353 + * No need for rcu_read_lock as VCPU_RUN is the only place that changes 354 + * the vcpu->pid pointer, and at destruction time all file descriptors 355 + * are already gone. 381 356 */ 382 357 put_pid(rcu_dereference_protected(vcpu->pid, 1)); 383 - kvm_arch_vcpu_uninit(vcpu); 358 + 384 359 free_page((unsigned long)vcpu->run); 360 + kmem_cache_free(kvm_vcpu_cache, vcpu); 385 361 } 386 - EXPORT_SYMBOL_GPL(kvm_vcpu_uninit); 362 + EXPORT_SYMBOL_GPL(kvm_vcpu_destroy); 387 363 388 364 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) 389 365 static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn) ··· 646 650 return -ENOMEM; 647 651 648 652 stat_data->kvm = kvm; 649 - stat_data->offset = p->offset; 650 - stat_data->mode = p->mode ? 
p->mode : 0644; 653 + stat_data->dbgfs_item = p; 651 654 kvm->debugfs_stat_data[p - debugfs_entries] = stat_data; 652 - debugfs_create_file(p->name, stat_data->mode, kvm->debugfs_dentry, 653 - stat_data, stat_fops_per_vm[p->kind]); 655 + debugfs_create_file(p->name, KVM_DBGFS_GET_MODE(p), 656 + kvm->debugfs_dentry, stat_data, 657 + &stat_fops_per_vm); 654 658 } 655 659 return 0; 656 660 } ··· 960 964 961 965 /* 962 966 * Increment the new memslot generation a second time, dropping the 963 - * update in-progress flag and incrementing then generation based on 967 + * update in-progress flag and incrementing the generation based on 964 968 * the number of address spaces. This provides a unique and easily 965 969 * identifiable generation number while the memslots are in flux. 966 970 */ ··· 1113 1117 * 1114 1118 * validation of sp->gfn happens in: 1115 1119 * - gfn_to_hva (kvm_read_guest, gfn_to_pfn) 1116 - * - kvm_is_visible_gfn (mmu_check_roots) 1120 + * - kvm_is_visible_gfn (mmu_check_root) 1117 1121 */ 1118 1122 kvm_arch_flush_shadow_memslot(kvm, slot); 1119 1123 ··· 1402 1406 } 1403 1407 EXPORT_SYMBOL_GPL(kvm_is_visible_gfn); 1404 1408 1405 - unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn) 1409 + unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn) 1406 1410 { 1407 1411 struct vm_area_struct *vma; 1408 1412 unsigned long addr, size; 1409 1413 1410 1414 size = PAGE_SIZE; 1411 1415 1412 - addr = gfn_to_hva(kvm, gfn); 1416 + addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gfn, NULL); 1413 1417 if (kvm_is_error_hva(addr)) 1414 1418 return PAGE_SIZE; 1415 1419 ··· 1515 1519 /* 1516 1520 * The fast path to get the writable pfn which will be stored in @pfn, 1517 1521 * true indicates success, otherwise false is returned. It's also the 1518 - * only part that runs if we can are in atomic context. 1522 + * only part that runs if we can in atomic context. 
1519 1523 */ 1520 1524 static bool hva_to_pfn_fast(unsigned long addr, bool write_fault, 1521 1525 bool *writable, kvm_pfn_t *pfn) ··· 1817 1821 } 1818 1822 EXPORT_SYMBOL_GPL(gfn_to_page); 1819 1823 1820 - static int __kvm_map_gfn(struct kvm_memory_slot *slot, gfn_t gfn, 1821 - struct kvm_host_map *map) 1824 + void kvm_release_pfn(kvm_pfn_t pfn, bool dirty, struct gfn_to_pfn_cache *cache) 1825 + { 1826 + if (pfn == 0) 1827 + return; 1828 + 1829 + if (cache) 1830 + cache->pfn = cache->gfn = 0; 1831 + 1832 + if (dirty) 1833 + kvm_release_pfn_dirty(pfn); 1834 + else 1835 + kvm_release_pfn_clean(pfn); 1836 + } 1837 + 1838 + static void kvm_cache_gfn_to_pfn(struct kvm_memory_slot *slot, gfn_t gfn, 1839 + struct gfn_to_pfn_cache *cache, u64 gen) 1840 + { 1841 + kvm_release_pfn(cache->pfn, cache->dirty, cache); 1842 + 1843 + cache->pfn = gfn_to_pfn_memslot(slot, gfn); 1844 + cache->gfn = gfn; 1845 + cache->dirty = false; 1846 + cache->generation = gen; 1847 + } 1848 + 1849 + static int __kvm_map_gfn(struct kvm_memslots *slots, gfn_t gfn, 1850 + struct kvm_host_map *map, 1851 + struct gfn_to_pfn_cache *cache, 1852 + bool atomic) 1822 1853 { 1823 1854 kvm_pfn_t pfn; 1824 1855 void *hva = NULL; 1825 1856 struct page *page = KVM_UNMAPPED_PAGE; 1857 + struct kvm_memory_slot *slot = __gfn_to_memslot(slots, gfn); 1858 + u64 gen = slots->generation; 1826 1859 1827 1860 if (!map) 1828 1861 return -EINVAL; 1829 1862 1830 - pfn = gfn_to_pfn_memslot(slot, gfn); 1863 + if (cache) { 1864 + if (!cache->pfn || cache->gfn != gfn || 1865 + cache->generation != gen) { 1866 + if (atomic) 1867 + return -EAGAIN; 1868 + kvm_cache_gfn_to_pfn(slot, gfn, cache, gen); 1869 + } 1870 + pfn = cache->pfn; 1871 + } else { 1872 + if (atomic) 1873 + return -EAGAIN; 1874 + pfn = gfn_to_pfn_memslot(slot, gfn); 1875 + } 1831 1876 if (is_error_noslot_pfn(pfn)) 1832 1877 return -EINVAL; 1833 1878 1834 1879 if (pfn_valid(pfn)) { 1835 1880 page = pfn_to_page(pfn); 1836 - hva = kmap(page); 1881 + if (atomic) 1882 
+ hva = kmap_atomic(page); 1883 + else 1884 + hva = kmap(page); 1837 1885 #ifdef CONFIG_HAS_IOMEM 1838 - } else { 1886 + } else if (!atomic) { 1839 1887 hva = memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB); 1888 + } else { 1889 + return -EINVAL; 1840 1890 #endif 1841 1891 } 1842 1892 ··· 1897 1855 return 0; 1898 1856 } 1899 1857 1858 + int kvm_map_gfn(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map, 1859 + struct gfn_to_pfn_cache *cache, bool atomic) 1860 + { 1861 + return __kvm_map_gfn(kvm_memslots(vcpu->kvm), gfn, map, 1862 + cache, atomic); 1863 + } 1864 + EXPORT_SYMBOL_GPL(kvm_map_gfn); 1865 + 1900 1866 int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map) 1901 1867 { 1902 - return __kvm_map_gfn(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, map); 1868 + return __kvm_map_gfn(kvm_vcpu_memslots(vcpu), gfn, map, 1869 + NULL, false); 1903 1870 } 1904 1871 EXPORT_SYMBOL_GPL(kvm_vcpu_map); 1905 1872 1906 - void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, 1907 - bool dirty) 1873 + static void __kvm_unmap_gfn(struct kvm_memory_slot *memslot, 1874 + struct kvm_host_map *map, 1875 + struct gfn_to_pfn_cache *cache, 1876 + bool dirty, bool atomic) 1908 1877 { 1909 1878 if (!map) 1910 1879 return; ··· 1923 1870 if (!map->hva) 1924 1871 return; 1925 1872 1926 - if (map->page != KVM_UNMAPPED_PAGE) 1927 - kunmap(map->page); 1873 + if (map->page != KVM_UNMAPPED_PAGE) { 1874 + if (atomic) 1875 + kunmap_atomic(map->hva); 1876 + else 1877 + kunmap(map->page); 1878 + } 1928 1879 #ifdef CONFIG_HAS_IOMEM 1929 - else 1880 + else if (!atomic) 1930 1881 memunmap(map->hva); 1882 + else 1883 + WARN_ONCE(1, "Unexpected unmapping in atomic context"); 1931 1884 #endif 1932 1885 1933 - if (dirty) { 1934 - kvm_vcpu_mark_page_dirty(vcpu, map->gfn); 1935 - kvm_release_pfn_dirty(map->pfn); 1936 - } else { 1937 - kvm_release_pfn_clean(map->pfn); 1938 - } 1886 + if (dirty) 1887 + mark_page_dirty_in_slot(memslot, map->gfn); 1888 + 1889 + if (cache) 
1890 + cache->dirty |= dirty; 1891 + else 1892 + kvm_release_pfn(map->pfn, dirty, NULL); 1939 1893 1940 1894 map->hva = NULL; 1941 1895 map->page = NULL; 1896 + } 1897 + 1898 + int kvm_unmap_gfn(struct kvm_vcpu *vcpu, struct kvm_host_map *map, 1899 + struct gfn_to_pfn_cache *cache, bool dirty, bool atomic) 1900 + { 1901 + __kvm_unmap_gfn(gfn_to_memslot(vcpu->kvm, map->gfn), map, 1902 + cache, dirty, atomic); 1903 + return 0; 1904 + } 1905 + EXPORT_SYMBOL_GPL(kvm_unmap_gfn); 1906 + 1907 + void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty) 1908 + { 1909 + __kvm_unmap_gfn(kvm_vcpu_gfn_to_memslot(vcpu, map->gfn), map, NULL, 1910 + dirty, false); 1942 1911 } 1943 1912 EXPORT_SYMBOL_GPL(kvm_vcpu_unmap); 1944 1913 ··· 2006 1931 2007 1932 void kvm_set_pfn_dirty(kvm_pfn_t pfn) 2008 1933 { 2009 - if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn)) { 2010 - struct page *page = pfn_to_page(pfn); 2011 - 2012 - SetPageDirty(page); 2013 - } 1934 + if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn)) 1935 + SetPageDirty(pfn_to_page(pfn)); 2014 1936 } 2015 1937 EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty); 2016 1938 ··· 2123 2051 return 0; 2124 2052 } 2125 2053 2126 - int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data, 2127 - unsigned long len) 2128 - { 2129 - gfn_t gfn = gpa >> PAGE_SHIFT; 2130 - struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); 2131 - int offset = offset_in_page(gpa); 2132 - 2133 - return __kvm_read_guest_atomic(slot, gfn, data, offset, len); 2134 - } 2135 - EXPORT_SYMBOL_GPL(kvm_read_guest_atomic); 2136 - 2137 2054 int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa, 2138 2055 void *data, unsigned long len) 2139 2056 { ··· 2219 2158 gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT; 2220 2159 gfn_t nr_pages_needed = end_gfn - start_gfn + 1; 2221 2160 gfn_t nr_pages_avail; 2222 - int r = start_gfn <= end_gfn ? 
0 : -EINVAL; 2223 2161 2224 - ghc->gpa = gpa; 2162 + /* Update ghc->generation before performing any error checks. */ 2225 2163 ghc->generation = slots->generation; 2226 - ghc->len = len; 2227 - ghc->hva = KVM_HVA_ERR_BAD; 2164 + 2165 + if (start_gfn > end_gfn) { 2166 + ghc->hva = KVM_HVA_ERR_BAD; 2167 + return -EINVAL; 2168 + } 2228 2169 2229 2170 /* 2230 2171 * If the requested region crosses two memslots, we still 2231 2172 * verify that the entire region is valid here. 2232 2173 */ 2233 - while (!r && start_gfn <= end_gfn) { 2174 + for ( ; start_gfn <= end_gfn; start_gfn += nr_pages_avail) { 2234 2175 ghc->memslot = __gfn_to_memslot(slots, start_gfn); 2235 2176 ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn, 2236 2177 &nr_pages_avail); 2237 2178 if (kvm_is_error_hva(ghc->hva)) 2238 - r = -EFAULT; 2239 - start_gfn += nr_pages_avail; 2179 + return -EFAULT; 2240 2180 } 2241 2181 2242 2182 /* Use the slow path for cross page reads and writes. */ 2243 - if (!r && nr_pages_needed == 1) 2183 + if (nr_pages_needed == 1) 2244 2184 ghc->hva += offset; 2245 2185 else 2246 2186 ghc->memslot = NULL; 2247 2187 2248 - return r; 2188 + ghc->gpa = gpa; 2189 + ghc->len = len; 2190 + return 0; 2249 2191 } 2250 2192 2251 2193 int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, ··· 2269 2205 2270 2206 BUG_ON(len + offset > ghc->len); 2271 2207 2272 - if (slots->generation != ghc->generation) 2273 - __kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len); 2274 - 2275 - if (unlikely(!ghc->memslot)) 2276 - return kvm_write_guest(kvm, gpa, data, len); 2208 + if (slots->generation != ghc->generation) { 2209 + if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len)) 2210 + return -EFAULT; 2211 + } 2277 2212 2278 2213 if (kvm_is_error_hva(ghc->hva)) 2279 2214 return -EFAULT; 2215 + 2216 + if (unlikely(!ghc->memslot)) 2217 + return kvm_write_guest(kvm, gpa, data, len); 2280 2218 2281 2219 r = __copy_to_user((void __user *)ghc->hva + offset, data, 
len); 2282 2220 if (r) ··· 2304 2238 2305 2239 BUG_ON(len > ghc->len); 2306 2240 2307 - if (slots->generation != ghc->generation) 2308 - __kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len); 2309 - 2310 - if (unlikely(!ghc->memslot)) 2311 - return kvm_read_guest(kvm, ghc->gpa, data, len); 2241 + if (slots->generation != ghc->generation) { 2242 + if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len)) 2243 + return -EFAULT; 2244 + } 2312 2245 2313 2246 if (kvm_is_error_hva(ghc->hva)) 2314 2247 return -EFAULT; 2248 + 2249 + if (unlikely(!ghc->memslot)) 2250 + return kvm_read_guest(kvm, ghc->gpa, data, len); 2315 2251 2316 2252 r = __copy_from_user(data, (void __user *)ghc->hva, len); 2317 2253 if (r) ··· 2786 2718 { 2787 2719 int r; 2788 2720 struct kvm_vcpu *vcpu; 2721 + struct page *page; 2789 2722 2790 2723 if (id >= KVM_MAX_VCPU_ID) 2791 2724 return -EINVAL; ··· 2800 2731 kvm->created_vcpus++; 2801 2732 mutex_unlock(&kvm->lock); 2802 2733 2803 - vcpu = kvm_arch_vcpu_create(kvm, id); 2804 - if (IS_ERR(vcpu)) { 2805 - r = PTR_ERR(vcpu); 2734 + r = kvm_arch_vcpu_precreate(kvm, id); 2735 + if (r) 2736 + goto vcpu_decrement; 2737 + 2738 + vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); 2739 + if (!vcpu) { 2740 + r = -ENOMEM; 2806 2741 goto vcpu_decrement; 2807 2742 } 2808 2743 2809 - preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); 2744 + BUILD_BUG_ON(sizeof(struct kvm_run) > PAGE_SIZE); 2745 + page = alloc_page(GFP_KERNEL | __GFP_ZERO); 2746 + if (!page) { 2747 + r = -ENOMEM; 2748 + goto vcpu_free; 2749 + } 2750 + vcpu->run = page_address(page); 2810 2751 2811 - r = kvm_arch_vcpu_setup(vcpu); 2752 + kvm_vcpu_init(vcpu, kvm, id); 2753 + 2754 + r = kvm_arch_vcpu_create(vcpu); 2812 2755 if (r) 2813 - goto vcpu_destroy; 2756 + goto vcpu_free_run_page; 2814 2757 2815 2758 kvm_create_vcpu_debugfs(vcpu); 2816 2759 ··· 2859 2778 unlock_vcpu_destroy: 2860 2779 mutex_unlock(&kvm->lock); 2861 2780 
debugfs_remove_recursive(vcpu->debugfs_dentry); 2862 - vcpu_destroy: 2863 2781 kvm_arch_vcpu_destroy(vcpu); 2782 + vcpu_free_run_page: 2783 + free_page((unsigned long)vcpu->run); 2784 + vcpu_free: 2785 + kmem_cache_free(kvm_vcpu_cache, vcpu); 2864 2786 vcpu_decrement: 2865 2787 mutex_lock(&kvm->lock); 2866 2788 kvm->created_vcpus--; ··· 4097 4013 return -ENOENT; 4098 4014 4099 4015 if (simple_attr_open(inode, file, get, 4100 - stat_data->mode & S_IWUGO ? set : NULL, 4101 - fmt)) { 4016 + KVM_DBGFS_GET_MODE(stat_data->dbgfs_item) & 0222 4017 + ? set : NULL, 4018 + fmt)) { 4102 4019 kvm_put_kvm(stat_data->kvm); 4103 4020 return -ENOMEM; 4104 4021 } ··· 4118 4033 return 0; 4119 4034 } 4120 4035 4121 - static int vm_stat_get_per_vm(void *data, u64 *val) 4036 + static int kvm_get_stat_per_vm(struct kvm *kvm, size_t offset, u64 *val) 4122 4037 { 4123 - struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data; 4124 - 4125 - *val = *(ulong *)((void *)stat_data->kvm + stat_data->offset); 4038 + *val = *(ulong *)((void *)kvm + offset); 4126 4039 4127 4040 return 0; 4128 4041 } 4129 4042 4130 - static int vm_stat_clear_per_vm(void *data, u64 val) 4043 + static int kvm_clear_stat_per_vm(struct kvm *kvm, size_t offset) 4131 4044 { 4132 - struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data; 4133 - 4134 - if (val) 4135 - return -EINVAL; 4136 - 4137 - *(ulong *)((void *)stat_data->kvm + stat_data->offset) = 0; 4045 + *(ulong *)((void *)kvm + offset) = 0; 4138 4046 4139 4047 return 0; 4140 4048 } 4141 4049 4142 - static int vm_stat_get_per_vm_open(struct inode *inode, struct file *file) 4143 - { 4144 - __simple_attr_check_format("%llu\n", 0ull); 4145 - return kvm_debugfs_open(inode, file, vm_stat_get_per_vm, 4146 - vm_stat_clear_per_vm, "%llu\n"); 4147 - } 4148 - 4149 - static const struct file_operations vm_stat_get_per_vm_fops = { 4150 - .owner = THIS_MODULE, 4151 - .open = vm_stat_get_per_vm_open, 4152 - .release = kvm_debugfs_release, 4153 - .read = 
simple_attr_read, 4154 - .write = simple_attr_write, 4155 - .llseek = no_llseek, 4156 - }; 4157 - 4158 - static int vcpu_stat_get_per_vm(void *data, u64 *val) 4050 + static int kvm_get_stat_per_vcpu(struct kvm *kvm, size_t offset, u64 *val) 4159 4051 { 4160 4052 int i; 4161 - struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data; 4162 4053 struct kvm_vcpu *vcpu; 4163 4054 4164 4055 *val = 0; 4165 4056 4166 - kvm_for_each_vcpu(i, vcpu, stat_data->kvm) 4167 - *val += *(u64 *)((void *)vcpu + stat_data->offset); 4057 + kvm_for_each_vcpu(i, vcpu, kvm) 4058 + *val += *(u64 *)((void *)vcpu + offset); 4168 4059 4169 4060 return 0; 4170 4061 } 4171 4062 4172 - static int vcpu_stat_clear_per_vm(void *data, u64 val) 4063 + static int kvm_clear_stat_per_vcpu(struct kvm *kvm, size_t offset) 4173 4064 { 4174 4065 int i; 4175 - struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data; 4176 4066 struct kvm_vcpu *vcpu; 4067 + 4068 + kvm_for_each_vcpu(i, vcpu, kvm) 4069 + *(u64 *)((void *)vcpu + offset) = 0; 4070 + 4071 + return 0; 4072 + } 4073 + 4074 + static int kvm_stat_data_get(void *data, u64 *val) 4075 + { 4076 + int r = -EFAULT; 4077 + struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data; 4078 + 4079 + switch (stat_data->dbgfs_item->kind) { 4080 + case KVM_STAT_VM: 4081 + r = kvm_get_stat_per_vm(stat_data->kvm, 4082 + stat_data->dbgfs_item->offset, val); 4083 + break; 4084 + case KVM_STAT_VCPU: 4085 + r = kvm_get_stat_per_vcpu(stat_data->kvm, 4086 + stat_data->dbgfs_item->offset, val); 4087 + break; 4088 + } 4089 + 4090 + return r; 4091 + } 4092 + 4093 + static int kvm_stat_data_clear(void *data, u64 val) 4094 + { 4095 + int r = -EFAULT; 4096 + struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data; 4177 4097 4178 4098 if (val) 4179 4099 return -EINVAL; 4180 4100 4181 - kvm_for_each_vcpu(i, vcpu, stat_data->kvm) 4182 - *(u64 *)((void *)vcpu + stat_data->offset) = 0; 4101 + switch (stat_data->dbgfs_item->kind) { 4102 + case KVM_STAT_VM: 4103 + 
r = kvm_clear_stat_per_vm(stat_data->kvm, 4104 + stat_data->dbgfs_item->offset); 4105 + break; 4106 + case KVM_STAT_VCPU: 4107 + r = kvm_clear_stat_per_vcpu(stat_data->kvm, 4108 + stat_data->dbgfs_item->offset); 4109 + break; 4110 + } 4183 4111 4184 - return 0; 4112 + return r; 4185 4113 } 4186 4114 4187 - static int vcpu_stat_get_per_vm_open(struct inode *inode, struct file *file) 4115 + static int kvm_stat_data_open(struct inode *inode, struct file *file) 4188 4116 { 4189 4117 __simple_attr_check_format("%llu\n", 0ull); 4190 - return kvm_debugfs_open(inode, file, vcpu_stat_get_per_vm, 4191 - vcpu_stat_clear_per_vm, "%llu\n"); 4118 + return kvm_debugfs_open(inode, file, kvm_stat_data_get, 4119 + kvm_stat_data_clear, "%llu\n"); 4192 4120 } 4193 4121 4194 - static const struct file_operations vcpu_stat_get_per_vm_fops = { 4195 - .owner = THIS_MODULE, 4196 - .open = vcpu_stat_get_per_vm_open, 4122 + static const struct file_operations stat_fops_per_vm = { 4123 + .owner = THIS_MODULE, 4124 + .open = kvm_stat_data_open, 4197 4125 .release = kvm_debugfs_release, 4198 - .read = simple_attr_read, 4199 - .write = simple_attr_write, 4200 - .llseek = no_llseek, 4201 - }; 4202 - 4203 - static const struct file_operations *stat_fops_per_vm[] = { 4204 - [KVM_STAT_VCPU] = &vcpu_stat_get_per_vm_fops, 4205 - [KVM_STAT_VM] = &vm_stat_get_per_vm_fops, 4126 + .read = simple_attr_read, 4127 + .write = simple_attr_write, 4128 + .llseek = no_llseek, 4206 4129 }; 4207 4130 4208 4131 static int vm_stat_get(void *_offset, u64 *val) 4209 4132 { 4210 4133 unsigned offset = (long)_offset; 4211 4134 struct kvm *kvm; 4212 - struct kvm_stat_data stat_tmp = {.offset = offset}; 4213 4135 u64 tmp_val; 4214 4136 4215 4137 *val = 0; 4216 4138 mutex_lock(&kvm_lock); 4217 4139 list_for_each_entry(kvm, &vm_list, vm_list) { 4218 - stat_tmp.kvm = kvm; 4219 - vm_stat_get_per_vm((void *)&stat_tmp, &tmp_val); 4140 + kvm_get_stat_per_vm(kvm, offset, &tmp_val); 4220 4141 *val += tmp_val; 4221 4142 } 4222 4143 
mutex_unlock(&kvm_lock); ··· 4233 4142 { 4234 4143 unsigned offset = (long)_offset; 4235 4144 struct kvm *kvm; 4236 - struct kvm_stat_data stat_tmp = {.offset = offset}; 4237 4145 4238 4146 if (val) 4239 4147 return -EINVAL; 4240 4148 4241 4149 mutex_lock(&kvm_lock); 4242 4150 list_for_each_entry(kvm, &vm_list, vm_list) { 4243 - stat_tmp.kvm = kvm; 4244 - vm_stat_clear_per_vm((void *)&stat_tmp, 0); 4151 + kvm_clear_stat_per_vm(kvm, offset); 4245 4152 } 4246 4153 mutex_unlock(&kvm_lock); 4247 4154 ··· 4252 4163 { 4253 4164 unsigned offset = (long)_offset; 4254 4165 struct kvm *kvm; 4255 - struct kvm_stat_data stat_tmp = {.offset = offset}; 4256 4166 u64 tmp_val; 4257 4167 4258 4168 *val = 0; 4259 4169 mutex_lock(&kvm_lock); 4260 4170 list_for_each_entry(kvm, &vm_list, vm_list) { 4261 - stat_tmp.kvm = kvm; 4262 - vcpu_stat_get_per_vm((void *)&stat_tmp, &tmp_val); 4171 + kvm_get_stat_per_vcpu(kvm, offset, &tmp_val); 4263 4172 *val += tmp_val; 4264 4173 } 4265 4174 mutex_unlock(&kvm_lock); ··· 4268 4181 { 4269 4182 unsigned offset = (long)_offset; 4270 4183 struct kvm *kvm; 4271 - struct kvm_stat_data stat_tmp = {.offset = offset}; 4272 4184 4273 4185 if (val) 4274 4186 return -EINVAL; 4275 4187 4276 4188 mutex_lock(&kvm_lock); 4277 4189 list_for_each_entry(kvm, &vm_list, vm_list) { 4278 - stat_tmp.kvm = kvm; 4279 - vcpu_stat_clear_per_vm((void *)&stat_tmp, 0); 4190 + kvm_clear_stat_per_vcpu(kvm, offset); 4280 4191 } 4281 4192 mutex_unlock(&kvm_lock); 4282 4193 ··· 4347 4262 4348 4263 kvm_debugfs_num_entries = 0; 4349 4264 for (p = debugfs_entries; p->name; ++p, kvm_debugfs_num_entries++) { 4350 - int mode = p->mode ? 
p->mode : 0644; 4351 - debugfs_create_file(p->name, mode, kvm_debugfs_dir, 4352 - (void *)(long)p->offset, 4265 + debugfs_create_file(p->name, KVM_DBGFS_GET_MODE(p), 4266 + kvm_debugfs_dir, (void *)(long)p->offset, 4353 4267 stat_fops[p->kind]); 4354 4268 } 4355 4269 } ··· 4388 4304 WRITE_ONCE(vcpu->preempted, false); 4389 4305 WRITE_ONCE(vcpu->ready, false); 4390 4306 4307 + __this_cpu_write(kvm_running_vcpu, vcpu); 4391 4308 kvm_arch_sched_in(vcpu, cpu); 4392 - 4393 4309 kvm_arch_vcpu_load(vcpu, cpu); 4394 4310 } 4395 4311 ··· 4403 4319 WRITE_ONCE(vcpu->ready, true); 4404 4320 } 4405 4321 kvm_arch_vcpu_put(vcpu); 4322 + __this_cpu_write(kvm_running_vcpu, NULL); 4323 + } 4324 + 4325 + /** 4326 + * kvm_get_running_vcpu - get the vcpu running on the current CPU. 4327 + * Thanks to preempt notifiers, this can also be called from 4328 + * preemptible context. 4329 + */ 4330 + struct kvm_vcpu *kvm_get_running_vcpu(void) 4331 + { 4332 + return __this_cpu_read(kvm_running_vcpu); 4333 + } 4334 + 4335 + /** 4336 + * kvm_get_running_vcpus - get the per-CPU array of currently running vcpus. 4337 + */ 4338 + struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void) 4339 + { 4340 + return &kvm_running_vcpu; 4406 4341 } 4407 4342 4408 4343 static void check_processor_compat(void *rtn)
+20 -18
virt/lib/irqbypass.c
··· 85 85 { 86 86 struct irq_bypass_producer *tmp; 87 87 struct irq_bypass_consumer *consumer; 88 + int ret; 88 89 89 90 if (!producer->token) 90 91 return -EINVAL; ··· 99 98 100 99 list_for_each_entry(tmp, &producers, node) { 101 100 if (tmp->token == producer->token) { 102 - mutex_unlock(&lock); 103 - module_put(THIS_MODULE); 104 - return -EBUSY; 101 + ret = -EBUSY; 102 + goto out_err; 105 103 } 106 104 } 107 105 108 106 list_for_each_entry(consumer, &consumers, node) { 109 107 if (consumer->token == producer->token) { 110 - int ret = __connect(producer, consumer); 111 - if (ret) { 112 - mutex_unlock(&lock); 113 - module_put(THIS_MODULE); 114 - return ret; 115 - } 108 + ret = __connect(producer, consumer); 109 + if (ret) 110 + goto out_err; 116 111 break; 117 112 } 118 113 } ··· 118 121 mutex_unlock(&lock); 119 122 120 123 return 0; 124 + out_err: 125 + mutex_unlock(&lock); 126 + module_put(THIS_MODULE); 127 + return ret; 121 128 } 122 129 EXPORT_SYMBOL_GPL(irq_bypass_register_producer); 123 130 ··· 180 179 { 181 180 struct irq_bypass_consumer *tmp; 182 181 struct irq_bypass_producer *producer; 182 + int ret; 183 183 184 184 if (!consumer->token || 185 185 !consumer->add_producer || !consumer->del_producer) ··· 195 193 196 194 list_for_each_entry(tmp, &consumers, node) { 197 195 if (tmp->token == consumer->token || tmp == consumer) { 198 - mutex_unlock(&lock); 199 - module_put(THIS_MODULE); 200 - return -EBUSY; 196 + ret = -EBUSY; 197 + goto out_err; 201 198 } 202 199 } 203 200 204 201 list_for_each_entry(producer, &producers, node) { 205 202 if (producer->token == consumer->token) { 206 - int ret = __connect(producer, consumer); 207 - if (ret) { 208 - mutex_unlock(&lock); 209 - module_put(THIS_MODULE); 210 - return ret; 211 - } 203 + ret = __connect(producer, consumer); 204 + if (ret) 205 + goto out_err; 212 206 break; 213 207 } 214 208 } ··· 214 216 mutex_unlock(&lock); 215 217 216 218 return 0; 219 + out_err: 220 + mutex_unlock(&lock); 221 + 
module_put(THIS_MODULE); 222 + return ret; 217 223 } 218 224 EXPORT_SYMBOL_GPL(irq_bypass_register_consumer); 219 225