Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge tag 'kvm-4.14-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull KVM updates from Radim Krčmář:
"First batch of KVM changes for 4.14

Common:
- improve heuristic for boosting preempted spinlocks by ignoring
VCPUs in user mode

ARM:
- fix for decoding external abort types from guests

- added support for migrating the active priority of interrupts when
running a GICv2 guest on a GICv3 host

- minor cleanup

PPC:
- expose storage keys to userspace

- merge kvm-ppc-fixes with a fix that missed 4.13 because of
vacations

- fixes

s390:
- merge of kvm/master to avoid conflicts with additional sthyi fixes

- wire up the no-dat enhancements in KVM

- multiple epoch facility (z14 feature)

- Configuration z/Architecture Mode

- more sthyi fixes

- gdb server range checking fix

- small code cleanups

x86:
- emulate Hyper-V TSC frequency MSRs

- add nested INVPCID

- emulate EPTP switching VMFUNC

- support Virtual GIF

- support 5 level page tables

- speedup nested VM exits by packing byte operations

- speedup MMIO by using hardware provided physical address

- a lot of fixes and cleanups, especially nested"

* tag 'kvm-4.14-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (67 commits)
KVM: arm/arm64: Support uaccess of GICC_APRn
KVM: arm/arm64: Extract GICv3 max APRn index calculation
KVM: arm/arm64: vITS: Drop its_ite->lpi field
KVM: arm/arm64: vgic: constify seq_operations and file_operations
KVM: arm/arm64: Fix guest external abort matching
KVM: PPC: Book3S HV: Fix memory leak in kvm_vm_ioctl_get_htab_fd
KVM: s390: vsie: cleanup mcck reinjection
KVM: s390: use WARN_ON_ONCE only for checking
KVM: s390: guestdbg: fix range check
KVM: PPC: Book3S HV: Report storage key support to userspace
KVM: PPC: Book3S HV: Fix case where HDEC is treated as 32-bit on POWER9
KVM: PPC: Book3S HV: Fix invalid use of register expression
KVM: PPC: Book3S HV: Fix H_REGISTER_VPA VPA size validation
KVM: PPC: Book3S HV: Fix setting of storage key in H_ENTER
KVM: PPC: e500mc: Fix a NULL dereference
KVM: PPC: e500: Fix some NULL dereferences on error
KVM: PPC: Book3S HV: Protect updates to spapr_tce_tables list
KVM: s390: we are always in czam mode
KVM: s390: expose no-DAT to guest and migration support
KVM: s390: sthyi: remove invalid guest write access
...

+1491 -780
+5
Documentation/virtual/kvm/devices/arm-vgic.txt
···
   Bits for undefined preemption levels are RAZ/WI.
 
+  Note that this differs from a CPU's view of the APRs on hardware in which
+  a GIC without the security extensions expose group 0 and group 1 active
+  priorities in separate register groups, whereas we show a combined view
+  similar to GICv2's GICH_APR.
+
   For historical reasons and to provide ABI compatibility with userspace we
   export the GICC_PMR register in the format of the GICH_VMCR.VMPriMask
   field in the lower 5 bits of a word, meaning that userspace must always
+13 -1
Documentation/virtual/kvm/devices/vm.txt
···
 
 3.1. ATTRIBUTE: KVM_S390_VM_TOD_HIGH
 
-Allows user space to set/get the TOD clock extension (u8).
+Allows user space to set/get the TOD clock extension (u8) (superseded by
+KVM_S390_VM_TOD_EXT).
 
 Parameters: address of a buffer in user space to store the data (u8) to
 Returns:    -EFAULT if the given address is not accessible from kernel space
···
 
 Parameters: address of a buffer in user space to store the data (u64) to
 Returns:    -EFAULT if the given address is not accessible from kernel space
+
+3.3. ATTRIBUTE: KVM_S390_VM_TOD_EXT
+Allows user space to set/get bits 0-63 of the TOD clock register as defined in
+the POP (u64). If the guest CPU model supports the TOD clock extension (u8), it
+also allows user space to get/set it. If the guest CPU model does not support
+it, it is stored as 0 and not allowed to be set to a value != 0.
+
+Parameters: address of a buffer in user space to store the data
+            (kvm_s390_vm_tod_clock) to
+Returns:    -EFAULT if the given address is not accessible from kernel space
+            -EINVAL if setting the TOD clock extension to != 0 is not supported
 
 4. GROUP: KVM_S390_VM_CRYPTO
 Architectures: s390
+26 -11
MAINTAINERS
···
 W:	http://www.linux-kvm.org
 T:	git git://git.kernel.org/pub/scm/virt/kvm/kvm.git
 S:	Supported
-F:	Documentation/*/kvm*.txt
 F:	Documentation/virtual/kvm/
-F:	arch/*/kvm/
-F:	arch/x86/kernel/kvm.c
-F:	arch/x86/kernel/kvmclock.c
-F:	arch/*/include/asm/kvm*
-F:	include/linux/kvm*
+F:	include/trace/events/kvm.h
+F:	include/uapi/asm-generic/kvm*
 F:	include/uapi/linux/kvm*
-F:	virt/kvm/
+F:	include/asm-generic/kvm*
+F:	include/linux/kvm*
+F:	include/kvm/iodev.h
+F:	virt/kvm/*
 F:	tools/kvm/
 
-KERNEL VIRTUAL MACHINE (KVM) FOR AMD-V
+KERNEL VIRTUAL MACHINE FOR X86 (KVM/x86)
+M:	Paolo Bonzini <pbonzini@redhat.com>
+M:	Radim Krčmář <rkrcmar@redhat.com>
+L:	kvm@vger.kernel.org
+W:	http://www.linux-kvm.org
+T:	git git://git.kernel.org/pub/scm/virt/kvm/kvm.git
+S:	Supported
+F:	arch/x86/kvm/
+F:	arch/x86/include/uapi/asm/kvm*
+F:	arch/x86/include/asm/kvm*
+F:	arch/x86/kernel/kvm.c
+F:	arch/x86/kernel/kvmclock.c
+
+KERNEL VIRTUAL MACHINE FOR AMD-V (KVM/amd)
 M:	Joerg Roedel <joro@8bytes.org>
 L:	kvm@vger.kernel.org
 W:	http://www.linux-kvm.org/
···
 F:	arch/x86/include/asm/svm.h
 F:	arch/x86/kvm/svm.c
 
-KERNEL VIRTUAL MACHINE (KVM) FOR ARM
+KERNEL VIRTUAL MACHINE FOR ARM (KVM/arm)
 M:	Christoffer Dall <christoffer.dall@linaro.org>
 M:	Marc Zyngier <marc.zyngier@arm.com>
 L:	linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
···
 F:	virt/kvm/arm/
 F:	include/kvm/arm_*
 
-KERNEL VIRTUAL MACHINE (KVM) FOR POWERPC
+KERNEL VIRTUAL MACHINE FOR POWERPC (KVM/powerpc)
 M:	Alexander Graf <agraf@suse.com>
 L:	kvm-ppc@vger.kernel.org
 W:	http://www.linux-kvm.org/
 T:	git git://github.com/agraf/linux-2.6.git
 S:	Supported
+F:	arch/powerpc/include/uapi/asm/kvm*
 F:	arch/powerpc/include/asm/kvm*
 F:	arch/powerpc/kvm/
+F:	arch/powerpc/kernel/kvm*
 
 KERNEL VIRTUAL MACHINE FOR ARM64 (KVM/arm64)
 M:	Christoffer Dall <christoffer.dall@linaro.org>
···
 W:	http://www.ibm.com/developerworks/linux/linux390/
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/kvms390/linux.git
 S:	Supported
-F:	Documentation/s390/kvm.txt
+F:	arch/s390/include/uapi/asm/kvm*
+F:	arch/s390/include/asm/gmap.h
 F:	arch/s390/include/asm/kvm*
 F:	arch/s390/kvm/
 F:	arch/s390/mm/gmap.c
-1
arch/arm/include/asm/kvm_arm.h
···
 
 #define HSR_DABT_S1PTW		(_AC(1, UL) << 7)
 #define HSR_DABT_CM		(_AC(1, UL) << 8)
-#define HSR_DABT_EA		(_AC(1, UL) << 9)
 
 #define kvm_arm_exception_type	\
 	{0, "RESET" }, \
+19 -5
arch/arm/include/asm/kvm_emulate.h
···
 	return (kvm_vcpu_get_hsr(vcpu) & HSR_SRT_MASK) >> HSR_SRT_SHIFT;
 }
 
-static inline bool kvm_vcpu_dabt_isextabt(struct kvm_vcpu *vcpu)
-{
-	return kvm_vcpu_get_hsr(vcpu) & HSR_DABT_EA;
-}
-
 static inline bool kvm_vcpu_dabt_iss1tw(struct kvm_vcpu *vcpu)
 {
 	return kvm_vcpu_get_hsr(vcpu) & HSR_DABT_S1PTW;
···
 static inline u8 kvm_vcpu_trap_get_fault_type(struct kvm_vcpu *vcpu)
 {
 	return kvm_vcpu_get_hsr(vcpu) & HSR_FSC_TYPE;
+}
+
+static inline bool kvm_vcpu_dabt_isextabt(struct kvm_vcpu *vcpu)
+{
+	switch (kvm_vcpu_trap_get_fault_type(vcpu)) {
+	case FSC_SEA:
+	case FSC_SEA_TTW0:
+	case FSC_SEA_TTW1:
+	case FSC_SEA_TTW2:
+	case FSC_SEA_TTW3:
+	case FSC_SECC:
+	case FSC_SECC_TTW0:
+	case FSC_SECC_TTW1:
+	case FSC_SECC_TTW2:
+	case FSC_SECC_TTW3:
+		return true;
+	default:
+		return false;
+	}
 }
 
 static inline u32 kvm_vcpu_hvc_get_imm(struct kvm_vcpu *vcpu)
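The ARM fix above stops trusting the HSR/ESR "EA" bit and instead classifies an abort as external by its fault status code. A user-space sketch of the same decoding, using the architectural FSC values for synchronous external aborts and parity/ECC errors (the numeric values below are an assumption for illustration, matching the kernel's FSC_SEA/FSC_SECC definitions as I understand them):

```c
#include <stdbool.h>

/* Fault status codes (low 6 bits of ESR_ELx/HSR) for external aborts:
 * 0x10 = external abort not on a translation table walk,
 * 0x14-0x17 = external abort on table walk, levels 0-3,
 * 0x18 = parity/ECC error, 0x1c-0x1f = parity/ECC on table walk. */
static bool fault_is_external_abort(unsigned int fsc)
{
	switch (fsc) {
	case 0x10:                                  /* FSC_SEA */
	case 0x14: case 0x15: case 0x16: case 0x17: /* FSC_SEA_TTW0..3 */
	case 0x18:                                  /* FSC_SECC */
	case 0x1c: case 0x1d: case 0x1e: case 0x1f: /* FSC_SECC_TTW0..3 */
		return true;
	default:
		return false;
	}
}
```

Note how values such as 0x11-0x13 fall through to `false`: the encodings are not one contiguous range, which is why the kernel uses an explicit switch rather than a range check.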
+1 -1
arch/arm/kvm/handle_exit.c
···
 	if (kvm_vcpu_get_hsr(vcpu) & HSR_WFI_IS_WFE) {
 		trace_kvm_wfx(*vcpu_pc(vcpu), true);
 		vcpu->stat.wfe_exit_stat++;
-		kvm_vcpu_on_spin(vcpu);
+		kvm_vcpu_on_spin(vcpu, vcpu_mode_priv(vcpu));
 	} else {
 		trace_kvm_wfx(*vcpu_pc(vcpu), false);
 		vcpu->stat.wfi_exit_stat++;
+19 -5
arch/arm64/include/asm/kvm_emulate.h
···
 	return (kvm_vcpu_get_hsr(vcpu) & ESR_ELx_SRT_MASK) >> ESR_ELx_SRT_SHIFT;
 }
 
-static inline bool kvm_vcpu_dabt_isextabt(const struct kvm_vcpu *vcpu)
-{
-	return !!(kvm_vcpu_get_hsr(vcpu) & ESR_ELx_EA);
-}
-
 static inline bool kvm_vcpu_dabt_iss1tw(const struct kvm_vcpu *vcpu)
 {
 	return !!(kvm_vcpu_get_hsr(vcpu) & ESR_ELx_S1PTW);
···
 static inline u8 kvm_vcpu_trap_get_fault_type(const struct kvm_vcpu *vcpu)
 {
 	return kvm_vcpu_get_hsr(vcpu) & ESR_ELx_FSC_TYPE;
+}
+
+static inline bool kvm_vcpu_dabt_isextabt(const struct kvm_vcpu *vcpu)
+{
+	switch (kvm_vcpu_trap_get_fault_type(vcpu)) {
+	case FSC_SEA:
+	case FSC_SEA_TTW0:
+	case FSC_SEA_TTW1:
+	case FSC_SEA_TTW2:
+	case FSC_SEA_TTW3:
+	case FSC_SECC:
+	case FSC_SECC_TTW0:
+	case FSC_SECC_TTW1:
+	case FSC_SECC_TTW2:
+	case FSC_SECC_TTW3:
+		return true;
+	default:
+		return false;
+	}
 }
 
 static inline int kvm_vcpu_sys_get_rt(struct kvm_vcpu *vcpu)
+1 -1
arch/arm64/kvm/handle_exit.c
···
 	if (kvm_vcpu_get_hsr(vcpu) & ESR_ELx_WFx_ISS_WFE) {
 		trace_kvm_wfx_arm64(*vcpu_pc(vcpu), true);
 		vcpu->stat.wfe_exit_stat++;
-		kvm_vcpu_on_spin(vcpu);
+		kvm_vcpu_on_spin(vcpu, vcpu_mode_priv(vcpu));
 	} else {
 		trace_kvm_wfx_arm64(*vcpu_pc(vcpu), false);
 		vcpu->stat.wfi_exit_stat++;
+3 -20
arch/arm64/kvm/vgic-sys-reg-v3.c
···
 static bool access_gic_aprn(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
 			    const struct sys_reg_desc *r, u8 apr)
 {
-	struct vgic_cpu *vgic_v3_cpu = &vcpu->arch.vgic_cpu;
 	u8 idx = r->Op2 & 3;
 
-	/*
-	 * num_pri_bits are initialized with HW supported values.
-	 * We can rely safely on num_pri_bits even if VM has not
-	 * restored ICC_CTLR_EL1 before restoring APnR registers.
-	 */
-	switch (vgic_v3_cpu->num_pri_bits) {
-	case 7:
-		vgic_v3_access_apr_reg(vcpu, p, apr, idx);
-		break;
-	case 6:
-		if (idx > 1)
-			goto err;
-		vgic_v3_access_apr_reg(vcpu, p, apr, idx);
-		break;
-	default:
-		if (idx > 0)
-			goto err;
-		vgic_v3_access_apr_reg(vcpu, p, apr, idx);
-	}
+	if (idx > vgic_v3_max_apr_idx(vcpu))
+		goto err;
 
+	vgic_v3_access_apr_reg(vcpu, p, apr, idx);
 	return true;
 err:
 	if (!p->is_write)
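The extracted `vgic_v3_max_apr_idx()` helper presumably maps the number of implemented priority bits to the highest valid active-priority register index, as the removed switch did. A minimal sketch of that mapping, assuming the same 7/6/default cases as the original code:

```c
/* 7 priority bits -> AP[0..3] usable, 6 bits -> AP[0..1], otherwise only
 * AP[0]. This mirrors the switch statement the refactoring replaced. */
static unsigned int max_apr_idx(unsigned int num_pri_bits)
{
	switch (num_pri_bits) {
	case 7:
		return 3;
	case 6:
		return 1;
	default:
		return 0;
	}
}
```

Centralizing the bound in one helper lets both the sysreg accessor and the new uaccess path share the check instead of duplicating the switch.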
+5
arch/mips/kvm/mips.c
···
 	return !!(vcpu->arch.pending_exceptions);
 }
 
+bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
+{
+	return false;
+}
+
 int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
 {
 	return 1;
+1
arch/powerpc/include/asm/book3s/64/mmu-hash.h
···
 #define HPTE_R_C		ASM_CONST(0x0000000000000080)
 #define HPTE_R_R		ASM_CONST(0x0000000000000100)
 #define HPTE_R_KEY_LO		ASM_CONST(0x0000000000000e00)
+#define HPTE_R_KEY		(HPTE_R_KEY_LO | HPTE_R_KEY_HI)
 
 #define HPTE_V_1TB_SEG		ASM_CONST(0x4000000000000000)
 #define HPTE_V_VRMA_MASK	ASM_CONST(0x4001ffffff000000)
+1
arch/powerpc/kvm/book3s_64_mmu_hv.c
···
 	rwflag = (ghf->flags & KVM_GET_HTAB_WRITE) ? O_WRONLY : O_RDONLY;
 	ret = anon_inode_getfd("kvm-htab", &kvm_htab_fops, ctx, rwflag | O_CLOEXEC);
 	if (ret < 0) {
+		kfree(ctx);
 		kvm_put_kvm(kvm);
 		return ret;
 	}
+10 -11
arch/powerpc/kvm/book3s_64_vio.c
···
 {
 	struct kvmppc_spapr_tce_table *stt = filp->private_data;
 	struct kvmppc_spapr_tce_iommu_table *stit, *tmp;
+	struct kvm *kvm = stt->kvm;
 
+	mutex_lock(&kvm->lock);
 	list_del_rcu(&stt->list);
+	mutex_unlock(&kvm->lock);
 
 	list_for_each_entry_safe(stit, tmp, &stt->iommu_tables, next) {
 		WARN_ON(!kref_read(&stit->kref));
···
 	unsigned long npages, size;
 	int ret = -ENOMEM;
 	int i;
-	int fd = -1;
 
 	if (!args->size)
 		return -EINVAL;
···
 		goto fail;
 	}
 
-	ret = fd = anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops,
-				    stt, O_RDWR | O_CLOEXEC);
-	if (ret < 0)
-		goto fail;
-
 	mutex_lock(&kvm->lock);
 
 	/* Check this LIOBN hasn't been previously allocated */
···
 		}
 	}
 
-	if (!ret) {
+	if (!ret)
+		ret = anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops,
+				       stt, O_RDWR | O_CLOEXEC);
+
+	if (ret >= 0) {
 		list_add_rcu(&stt->list, &kvm->arch.spapr_tce_tables);
 		kvm_get_kvm(kvm);
 	}
 
 	mutex_unlock(&kvm->lock);
 
-	if (!ret)
-		return fd;
-
-	put_unused_fd(fd);
+	if (ret >= 0)
+		return ret;
 
 fail:
 	for (i = 0; i < npages; i++)
+15 -1
arch/powerpc/kvm/book3s_hv.c
···
 
 	switch (subfunc) {
 	case H_VPA_REG_VPA:		/* register VPA */
-		if (len < sizeof(struct lppaca))
+		/*
+		 * The size of our lppaca is 1kB because of the way we align
+		 * it for the guest to avoid crossing a 4kB boundary. We only
+		 * use 640 bytes of the structure though, so we should accept
+		 * clients that set a size of 640.
+		 */
+		if (len < 640)
 			break;
 		vpap = &tvcpu->arch.vpa;
 		err = 0;
···
 	 */
 	if (radix_enabled())
 		return -EINVAL;
+
+	/*
+	 * POWER7, POWER8 and POWER9 all support 32 storage keys for data.
+	 * POWER7 doesn't support keys for instruction accesses,
+	 * POWER8 and POWER9 do.
+	 */
+	info->data_keys = 32;
+	info->instr_keys = cpu_has_feature(CPU_FTR_ARCH_207S) ? 32 : 0;
 
 	info->flags = KVM_PPC_PAGE_SIZES_REAL;
 	if (mmu_has_feature(MMU_FTR_1T_SEGMENT))
+1 -1
arch/powerpc/kvm/book3s_hv_rm_mmu.c
···
 	if (!realmode)
 		local_irq_restore(irq_flags);
 
-	ptel &= ~(HPTE_R_PP0 - psize);
+	ptel &= HPTE_R_KEY | HPTE_R_PP0 | (psize-1);
 	ptel |= pa;
 
 	if (pa)
+3 -2
arch/powerpc/kvm/book3s_hv_rmhandlers.S
···
 #ifdef CONFIG_KVM_XICS
 	/* We are entering the guest on that thread, push VCPU to XIVE */
 	ld	r10, HSTATE_XIVE_TIMA_PHYS(r13)
-	cmpldi	cr0, r10, r0
+	cmpldi	cr0, r10, 0
 	beq	no_xive
 	ld	r11, VCPU_XIVE_SAVED_STATE(r4)
 	li	r9, TM_QW1_OS
···
 	cmpwi	r12,BOOK3S_INTERRUPT_HV_DECREMENTER
 	bne	2f
 	mfspr	r3,SPRN_HDEC
-	cmpwi	r3,0
+	EXTEND_HDEC(r3)
+	cmpdi	r3,0
 	mr	r4,r9
 	bge	fast_guest_return
 2:
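The HDEC fix above replaces a 32-bit compare (`cmpwi`) with a sign extension from the register's implemented width followed by a 64-bit compare: on POWER9 the hypervisor decrementer is wider than 32 bits, so looking only at the low word can misread a large positive value as negative. A C sketch of sign-extending from an arbitrary width, analogous to what `EXTEND_HDEC` does (the choice of width passed in is the caller's; the macro's actual width on POWER9 is not shown in this diff):

```c
#include <stdint.h>

/* Sign-extend the low 'bits' bits of 'raw' to a full 64-bit signed value,
 * using the classic xor/subtract trick: flip the sign bit, then subtract
 * the sign-bit weight so values >= 2^(bits-1) come out negative. */
static int64_t extend_hdec(uint64_t raw, unsigned int bits)
{
	uint64_t sign = 1ULL << (bits - 1);
	return (int64_t)((raw ^ sign) - sign);
}
```

Only after this extension is a `>= 0` test (the assembly's `bge`) meaningful for the wide decrementer.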
+6 -2
arch/powerpc/kvm/e500.c
···
 	if (err)
 		goto free_vcpu;
 
-	if (kvmppc_e500_id_table_alloc(vcpu_e500) == NULL)
+	if (kvmppc_e500_id_table_alloc(vcpu_e500) == NULL) {
+		err = -ENOMEM;
 		goto uninit_vcpu;
+	}
 
 	err = kvmppc_e500_tlb_init(vcpu_e500);
 	if (err)
 		goto uninit_id;
 
 	vcpu->arch.shared = (void*)__get_free_page(GFP_KERNEL|__GFP_ZERO);
-	if (!vcpu->arch.shared)
+	if (!vcpu->arch.shared) {
+		err = -ENOMEM;
 		goto uninit_tlb;
+	}
 
 	return vcpu;
+3 -1
arch/powerpc/kvm/e500mc.c
···
 		goto uninit_vcpu;
 
 	vcpu->arch.shared = (void *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
-	if (!vcpu->arch.shared)
+	if (!vcpu->arch.shared) {
+		err = -ENOMEM;
 		goto uninit_tlb;
+	}
 
 	return vcpu;
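The e500/e500mc NULL-dereference fixes above are instances of a common bug class in goto-unwind constructors: an allocation fails, the code jumps to the cleanup label, but `err` was never set, so the caller sees stale success and dereferences a half-built object. A self-contained illustration of the corrected pattern (the `widget` type and names are hypothetical, not from the kernel):

```c
#include <stdlib.h>
#include <errno.h>

struct widget {
	int *table;
	int *shared;
};

/* Returns 0 on success, negative errno on failure. Every failure path
 * must assign 'err' before jumping to its cleanup label. */
static int widget_init(struct widget *w)
{
	int err = 0;

	w->table = malloc(16 * sizeof(int));
	if (!w->table) {
		err = -ENOMEM;	/* the assignment the original bug omitted */
		goto out;
	}

	w->shared = malloc(16 * sizeof(int));
	if (!w->shared) {
		err = -ENOMEM;
		goto free_table;
	}
	return 0;

free_table:
	free(w->table);
out:
	return err;
}
```

The unwind labels free resources in reverse order of acquisition, so each failure point jumps to exactly the cleanup it needs.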
+5
arch/powerpc/kvm/powerpc.c
···
 	return !!(v->arch.pending_exceptions) || kvm_request_pending(v);
 }
 
+bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
+{
+	return false;
+}
+
 int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
 {
 	return 1;
+5 -1
arch/s390/include/asm/kvm_host.h
···
 #define ECB3_RI 0x01
 	__u8	ecb3;			/* 0x0063 */
 	__u32	scaol;			/* 0x0064 */
-	__u8	reserved68[4];		/* 0x0068 */
+	__u8	reserved68;		/* 0x0068 */
+	__u8	epdx;			/* 0x0069 */
+	__u8	reserved6a[2];		/* 0x006a */
 	__u32	todpr;			/* 0x006c */
 	__u8	reserved70[16];		/* 0x0070 */
 	__u64	mso;			/* 0x0080 */
···
 	__u64	cbrlo;			/* 0x01b8 */
 	__u8	reserved1c0[8];		/* 0x01c0 */
 #define ECD_HOSTREGMGMT	0x20000000
+#define ECD_MEF		0x08000000
 	__u32	ecd;			/* 0x01c8 */
 	__u8	reserved1cc[18];	/* 0x01cc */
 	__u64	pp;			/* 0x01de */
···
 	struct kvm_s390_cpu_model model;
 	struct kvm_s390_crypto crypto;
 	struct kvm_s390_vsie vsie;
+	u8 epdx;
 	u64 epoch;
 	struct kvm_s390_migration_state *migration_state;
 	/* subset of available cpu features enabled by user space */
+1 -1
arch/s390/include/asm/page-states.h
···
 #define ESSA_SET_STABLE_IF_RESIDENT	6
 #define ESSA_SET_STABLE_NODAT		7
 
-#define ESSA_MAX	ESSA_SET_STABLE_IF_RESIDENT
+#define ESSA_MAX	ESSA_SET_STABLE_NODAT
 
 #endif
+6
arch/s390/include/uapi/asm/kvm.h
···
 /* kvm attributes for KVM_S390_VM_TOD */
 #define KVM_S390_VM_TOD_LOW		0
 #define KVM_S390_VM_TOD_HIGH		1
+#define KVM_S390_VM_TOD_EXT		2
+
+struct kvm_s390_vm_tod_clock {
+	__u8 epoch_idx;
+	__u64 tod;
+};
 
 /* kvm attributes for KVM_S390_VM_CPU_MODEL */
 /* processor related attributes are r/w */
+1 -1
arch/s390/kvm/diag.c
···
 {
 	VCPU_EVENT(vcpu, 5, "%s", "diag time slice end");
 	vcpu->stat.diagnose_44++;
-	kvm_vcpu_on_spin(vcpu);
+	kvm_vcpu_on_spin(vcpu, true);
 	return 0;
 }
 
+1 -1
arch/s390/kvm/guestdbg.c
···
 		return (addr >= a) && (addr <= b);
 	else
 		/* "overflowing" interval */
-		return (addr <= a) && (addr >= b);
+		return (addr >= a) || (addr <= b);
 }
 
 #define end_of_range(bp_info) (bp_info->addr + bp_info->len - 1)
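The guestdbg fix above concerns a breakpoint range `[a, b]` that may wrap around the top of the address space (`b < a`): with `&&`, no address can ever satisfy a wrapped interval, so the correct connective is `||`. A standalone version of the corrected check:

```c
#include <stdbool.h>
#include <stdint.h>

/* Is 'addr' inside the inclusive range [a, b], where the range is allowed
 * to wrap past the maximum address? For a wrapped ("overflowing") interval
 * the member addresses lie above a OR below b, never both. */
static bool in_addr_range(uint64_t addr, uint64_t a, uint64_t b)
{
	if (a <= b)
		return (addr >= a) && (addr <= b);
	/* "overflowing" interval */
	return (addr >= a) || (addr <= b);
}
```

With the broken `&&` variant, a wrapped range like `[0xff00, 0x10]` would match nothing, silently disabling the hardware breakpoint.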
+4 -2
arch/s390/kvm/interrupt.c
···
 	struct kvm_s390_mchk_info *mchk;
 	union mci mci;
 	__u64 cr14 = 0;		/* upper bits are not used */
+	int rc;
 
 	mci.val = mcck_info->mcic;
 	if (mci.sr)
···
 	if (mci.ck) {
 		/* Inject the floating machine check */
 		inti.type = KVM_S390_MCHK;
-		WARN_ON_ONCE(__inject_vm(vcpu->kvm, &inti));
+		rc = __inject_vm(vcpu->kvm, &inti);
 	} else {
 		/* Inject the machine check to specified vcpu */
 		irq.type = KVM_S390_MCHK;
-		WARN_ON_ONCE(kvm_s390_inject_vcpu(vcpu, &irq));
+		rc = kvm_s390_inject_vcpu(vcpu, &irq);
 	}
+	WARN_ON_ONCE(rc);
 }
 
 int kvm_set_routing_entry(struct kvm *kvm,
+116 -2
arch/s390/kvm/kvm-s390.c
···
 	{ NULL }
 };
 
+struct kvm_s390_tod_clock_ext {
+	__u8 epoch_idx;
+	__u64 tod;
+	__u8 reserved[7];
+} __packed;
+
 /* allow nested virtualization in KVM (if enabled by user space) */
 static int nested;
 module_param(nested, int, S_IRUGO);
···
 	return 0;
 }
 
+static int kvm_s390_set_tod_ext(struct kvm *kvm, struct kvm_device_attr *attr)
+{
+	struct kvm_s390_vm_tod_clock gtod;
+
+	if (copy_from_user(&gtod, (void __user *)attr->addr, sizeof(gtod)))
+		return -EFAULT;
+
+	if (test_kvm_facility(kvm, 139))
+		kvm_s390_set_tod_clock_ext(kvm, &gtod);
+	else if (gtod.epoch_idx == 0)
+		kvm_s390_set_tod_clock(kvm, gtod.tod);
+	else
+		return -EINVAL;
+
+	VM_EVENT(kvm, 3, "SET: TOD extension: 0x%x, TOD base: 0x%llx",
+		 gtod.epoch_idx, gtod.tod);
+
+	return 0;
+}
+
 static int kvm_s390_set_tod_high(struct kvm *kvm, struct kvm_device_attr *attr)
 {
 	u8 gtod_high;
···
 		return -EINVAL;
 
 	switch (attr->attr) {
+	case KVM_S390_VM_TOD_EXT:
+		ret = kvm_s390_set_tod_ext(kvm, attr);
+		break;
 	case KVM_S390_VM_TOD_HIGH:
 		ret = kvm_s390_set_tod_high(kvm, attr);
 		break;
···
 		break;
 	}
 	return ret;
+}
+
+static void kvm_s390_get_tod_clock_ext(struct kvm *kvm,
+				       struct kvm_s390_vm_tod_clock *gtod)
+{
+	struct kvm_s390_tod_clock_ext htod;
+
+	preempt_disable();
+
+	get_tod_clock_ext((char *)&htod);
+
+	gtod->tod = htod.tod + kvm->arch.epoch;
+	gtod->epoch_idx = htod.epoch_idx + kvm->arch.epdx;
+
+	if (gtod->tod < htod.tod)
+		gtod->epoch_idx += 1;
+
+	preempt_enable();
+}
+
+static int kvm_s390_get_tod_ext(struct kvm *kvm, struct kvm_device_attr *attr)
+{
+	struct kvm_s390_vm_tod_clock gtod;
+
+	memset(&gtod, 0, sizeof(gtod));
+
+	if (test_kvm_facility(kvm, 139))
+		kvm_s390_get_tod_clock_ext(kvm, &gtod);
+	else
+		gtod.tod = kvm_s390_get_tod_clock_fast(kvm);
+
+	if (copy_to_user((void __user *)attr->addr, &gtod, sizeof(gtod)))
+		return -EFAULT;
+
+	VM_EVENT(kvm, 3, "QUERY: TOD extension: 0x%x, TOD base: 0x%llx",
+		 gtod.epoch_idx, gtod.tod);
+	return 0;
 }
 
 static int kvm_s390_get_tod_high(struct kvm *kvm, struct kvm_device_attr *attr)
···
 		return -EINVAL;
 
 	switch (attr->attr) {
+	case KVM_S390_VM_TOD_EXT:
+		ret = kvm_s390_get_tod_ext(kvm, attr);
+		break;
 	case KVM_S390_VM_TOD_HIGH:
 		ret = kvm_s390_get_tod_high(kvm, attr);
 		break;
···
 		if (r < 0)
 			pgstev = 0;
 		/* save the value */
-		res[i++] = (pgstev >> 24) & 0x3;
+		res[i++] = (pgstev >> 24) & 0x43;
 		/*
 		 * if the next bit is too far away, stop.
 		 * if we reached the previous "next", find the next one
···
 
 		pgstev = bits[i];
 		pgstev = pgstev << 24;
-		mask &= _PGSTE_GPS_USAGE_MASK;
+		mask &= _PGSTE_GPS_USAGE_MASK | _PGSTE_GPS_NODAT;
 		set_pgste_bits(kvm->mm, hva, mask, pgstev);
 	}
 	srcu_read_unlock(&kvm->srcu, srcu_idx);
···
 	memcpy(kvm->arch.model.fac_list, kvm->arch.model.fac_mask,
 	       S390_ARCH_FAC_LIST_SIZE_BYTE);
 
+	/* we are always in czam mode - even on pre z14 machines */
+	set_kvm_facility(kvm->arch.model.fac_mask, 138);
+	set_kvm_facility(kvm->arch.model.fac_list, 138);
+	/* we emulate STHYI in kvm */
 	set_kvm_facility(kvm->arch.model.fac_mask, 74);
 	set_kvm_facility(kvm->arch.model.fac_list, 74);
+	if (MACHINE_HAS_TLB_GUEST) {
+		set_kvm_facility(kvm->arch.model.fac_mask, 147);
+		set_kvm_facility(kvm->arch.model.fac_list, 147);
+	}
 
 	kvm->arch.model.cpuid = kvm_s390_get_initial_cpuid();
 	kvm->arch.model.ibc = sclp.ibc & 0x0fff;
···
 		vcpu->arch.sie_block->eca |= ECA_VX;
 		vcpu->arch.sie_block->ecd |= ECD_HOSTREGMGMT;
 	}
+	if (test_kvm_facility(vcpu->kvm, 139))
+		vcpu->arch.sie_block->ecd |= ECD_MEF;
+
 	vcpu->arch.sie_block->sdnxo = ((unsigned long) &vcpu->run->s.regs.sdnx)
 					| SDNXC;
 	vcpu->arch.sie_block->riccbd = (unsigned long) &vcpu->run->s.regs.riccb;
···
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 {
 	return kvm_s390_vcpu_has_irq(vcpu, 0);
+}
+
+bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
+{
+	return !(vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE);
 }
 
 void kvm_s390_vcpu_block(struct kvm_vcpu *vcpu)
···
 	kvm_clear_request(KVM_REQ_UNHALT, vcpu);
 
 	return 0;
+}
+
+void kvm_s390_set_tod_clock_ext(struct kvm *kvm,
+				const struct kvm_s390_vm_tod_clock *gtod)
+{
+	struct kvm_vcpu *vcpu;
+	struct kvm_s390_tod_clock_ext htod;
+	int i;
+
+	mutex_lock(&kvm->lock);
+	preempt_disable();
+
+	get_tod_clock_ext((char *)&htod);
+
+	kvm->arch.epoch = gtod->tod - htod.tod;
+	kvm->arch.epdx = gtod->epoch_idx - htod.epoch_idx;
+
+	if (kvm->arch.epoch > gtod->tod)
+		kvm->arch.epdx -= 1;
+
+	kvm_s390_vcpu_block_all(kvm);
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		vcpu->arch.sie_block->epoch = kvm->arch.epoch;
+		vcpu->arch.sie_block->epdx = kvm->arch.epdx;
+	}
+
+	kvm_s390_vcpu_unblock_all(kvm);
+	preempt_enable();
+	mutex_unlock(&kvm->lock);
 }
 
 void kvm_s390_set_tod_clock(struct kvm *kvm, u64 tod)
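The multiple-epoch logic in `kvm_s390_get_tod_clock_ext()` treats the TOD clock as a pair (epoch index, 64-bit TOD): when adding the VM's epoch offset wraps the 64-bit TOD value, the carry must propagate into the epoch index. A compact user-space sketch of that carry handling:

```c
#include <stdint.h>

/* (epoch_idx, tod) models the s390 extended TOD clock: epoch_idx holds
 * the bits above the classic 64-bit TOD register. */
struct tod_ext {
	uint8_t epoch_idx;
	uint64_t tod;
};

/* Guest view = host clock + per-VM epoch offset (epoch, epdx). If the
 * 64-bit addition wraps, carry one into the epoch index - the same
 * "if (gtod->tod < htod.tod) gtod->epoch_idx += 1" check as the patch. */
static struct tod_ext guest_tod(struct tod_ext host, uint64_t epoch,
				uint8_t epdx)
{
	struct tod_ext g;

	g.tod = host.tod + epoch;
	g.epoch_idx = (uint8_t)(host.epoch_idx + epdx);
	if (g.tod < host.tod)	/* unsigned wrap-around => carry out */
		g.epoch_idx++;
	return g;
}
```

The symmetric borrow appears in `kvm_s390_set_tod_clock_ext()`, which subtracts the host clock and decrements `epdx` when the subtraction underflows.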
+2
arch/s390/kvm/kvm-s390.h
···
 int handle_sthyi(struct kvm_vcpu *vcpu);
 
 /* implemented in kvm-s390.c */
+void kvm_s390_set_tod_clock_ext(struct kvm *kvm,
+				const struct kvm_s390_vm_tod_clock *gtod);
 void kvm_s390_set_tod_clock(struct kvm *kvm, u64 tod);
 long kvm_arch_fault_in_page(struct kvm_vcpu *vcpu, gpa_t gpa, int writable);
 int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long addr);
+5 -1
arch/s390/kvm/priv.c
···
 		if (pgstev & _PGSTE_GPS_ZERO)
 			res |= 1;
 	}
+	if (pgstev & _PGSTE_GPS_NODAT)
+		res |= 0x20;
 	vcpu->run->s.regs.gprs[r1] = res;
 	/*
 	 * It is possible that all the normal 511 slots were full, in which case
···
 		return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
 	/* Check for invalid operation request code */
 	orc = (vcpu->arch.sie_block->ipb & 0xf0000000) >> 28;
-	if (orc > ESSA_MAX)
+	/* ORCs 0-6 are always valid */
+	if (orc > (test_kvm_facility(vcpu->kvm, 147) ? ESSA_SET_STABLE_NODAT
+						: ESSA_SET_STABLE_IF_RESIDENT))
 		return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
 
 	if (likely(!vcpu->kvm->arch.migration_state)) {
+17 -19
arch/s390/kvm/sigp.c
···
 	return rc;
 }
 
-static int __sigp_set_arch(struct kvm_vcpu *vcpu, u32 parameter)
+static int __sigp_set_arch(struct kvm_vcpu *vcpu, u32 parameter,
+			   u64 *status_reg)
 {
-	int rc;
 	unsigned int i;
 	struct kvm_vcpu *v;
+	bool all_stopped = true;
 
-	switch (parameter & 0xff) {
-	case 0:
-		rc = SIGP_CC_NOT_OPERATIONAL;
-		break;
-	case 1:
-	case 2:
-		kvm_for_each_vcpu(i, v, vcpu->kvm) {
-			v->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID;
-			kvm_clear_async_pf_completion_queue(v);
-		}
-
-		rc = SIGP_CC_ORDER_CODE_ACCEPTED;
-		break;
-	default:
-		rc = -EOPNOTSUPP;
+	kvm_for_each_vcpu(i, v, vcpu->kvm) {
+		if (v == vcpu)
+			continue;
+		if (!is_vcpu_stopped(v))
+			all_stopped = false;
 	}
-	return rc;
+
+	*status_reg &= 0xffffffff00000000UL;
+
+	/* Reject set arch order, with czam we're always in z/Arch mode. */
+	*status_reg |= (all_stopped ? SIGP_STATUS_INVALID_PARAMETER :
+					SIGP_STATUS_INCORRECT_STATE);
+	return SIGP_CC_STATUS_STORED;
 }
 
 static int __sigp_set_prefix(struct kvm_vcpu *vcpu, struct kvm_vcpu *dst_vcpu,
···
 	switch (order_code) {
 	case SIGP_SET_ARCHITECTURE:
 		vcpu->stat.instruction_sigp_arch++;
-		rc = __sigp_set_arch(vcpu, parameter);
+		rc = __sigp_set_arch(vcpu, parameter,
+				     &vcpu->run->s.regs.gprs[r1]);
 		break;
 	default:
 		rc = handle_sigp_dst(vcpu, order_code, cpu_addr,
-8
arch/s390/kvm/sthyi.c
···
 	if (addr & ~PAGE_MASK)
 		return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
 
-	/*
-	 * If the page has not yet been faulted in, we want to do that
-	 * now and not after all the expensive calculations.
-	 */
-	r = write_guest(vcpu, addr, reg2, &cc, 1);
-	if (r)
-		return kvm_s390_inject_prog_cond(vcpu, r);
-
 	sctns = (void *)get_zeroed_page(GFP_KERNEL);
 	if (!sctns)
 		return -ENOMEM;
+11 -5
arch/s390/kvm/vsie.c
···
 	scb_s->eca |= scb_o->eca & ECA_IB;
 	if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_CEI))
 		scb_s->eca |= scb_o->eca & ECA_CEI;
+	/* Epoch Extension */
+	if (test_kvm_facility(vcpu->kvm, 139))
+		scb_s->ecd |= scb_o->ecd & ECD_MEF;
 
 	prepare_ibc(vcpu, vsie_page);
 	rc = shadow_crycb(vcpu, vsie_page);
···
 {
 	struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
 	struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
-	struct mcck_volatile_info *mcck_info;
-	struct sie_page *sie_page;
 	int rc;
 
 	handle_last_fault(vcpu, vsie_page);
···
 
 	if (rc == -EINTR) {
 		VCPU_EVENT(vcpu, 3, "%s", "machine check");
-		sie_page = container_of(scb_s, struct sie_page, sie_block);
-		mcck_info = &sie_page->mcck_info;
-		kvm_s390_reinject_machine_check(vcpu, mcck_info);
+		kvm_s390_reinject_machine_check(vcpu, &vsie_page->mcck_info);
 		return 0;
 	}
···
 	 */
 	preempt_disable();
 	scb_s->epoch += vcpu->kvm->arch.epoch;
+
+	if (scb_s->ecd & ECD_MEF) {
+		scb_s->epdx += vcpu->kvm->arch.epdx;
+		if (scb_s->epoch < vcpu->kvm->arch.epoch)
+			scb_s->epdx += 1;
+	}
+
 	preempt_enable();
 }
+5 -1
arch/s390/mm/pgtable.c
···
 	case ESSA_GET_STATE:
 		break;
 	case ESSA_SET_STABLE:
-		pgstev &= ~_PGSTE_GPS_USAGE_MASK;
+		pgstev &= ~(_PGSTE_GPS_USAGE_MASK | _PGSTE_GPS_NODAT);
 		pgstev |= _PGSTE_GPS_USAGE_STABLE;
 		break;
 	case ESSA_SET_UNUSED:
···
 			pgstev &= ~_PGSTE_GPS_USAGE_MASK;
 			pgstev |= _PGSTE_GPS_USAGE_STABLE;
 		}
 		break;
+	case ESSA_SET_STABLE_NODAT:
+		pgstev &= ~_PGSTE_GPS_USAGE_MASK;
+		pgstev |= _PGSTE_GPS_USAGE_STABLE | _PGSTE_GPS_NODAT;
+		break;
 	default:
 		/* we should never get here! */
+1
arch/s390/tools/gen_facilities.c
···
 		78, /* enhanced-DAT 2 */
 		130, /* instruction-execution-protection */
 		131, /* enhanced-SOP 2 and side-effect */
+		139, /* multiple epoch facility */
 		146, /* msa extension 8 */
 		-1  /* END */
 	}
+1
arch/x86/include/asm/cpufeatures.h
···
 #define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */
 #define X86_FEATURE_AVIC	(15*32+13) /* Virtual Interrupt Controller */
 #define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */
+#define X86_FEATURE_VGIF	(15*32+16) /* Virtual GIF */
 
 /* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx), word 16 */
 #define X86_FEATURE_AVX512VBMI	(16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/
+2 -2
arch/x86/include/asm/kvm_emulate.h
···
 			      struct x86_instruction_info *info,
 			      enum x86_intercept_stage stage);
 
-	void (*get_cpuid)(struct x86_emulate_ctxt *ctxt,
-			  u32 *eax, u32 *ebx, u32 *ecx, u32 *edx);
+	bool (*get_cpuid)(struct x86_emulate_ctxt *ctxt, u32 *eax, u32 *ebx,
+			  u32 *ecx, u32 *edx, bool check_limit);
 	void (*set_nmi_mask)(struct x86_emulate_ctxt *ctxt, bool masked);
 
 	unsigned (*get_hflags)(struct x86_emulate_ctxt *ctxt);
+15 -25
arch/x86/include/asm/kvm_host.h
··· 79 79 | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \ 80 80 | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG)) 81 81 82 - #define CR3_L_MODE_RESERVED_BITS 0xFFFFFF0000000000ULL 83 82 #define CR3_PCID_INVD BIT_64(63) 84 83 #define CR4_RESERVED_BITS \ 85 84 (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ 86 85 | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ 87 86 | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR | X86_CR4_PCIDE \ 88 87 | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_FSGSBASE \ 89 - | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE | X86_CR4_SMAP \ 90 - | X86_CR4_PKE)) 88 + | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_VMXE \ 89 + | X86_CR4_SMAP | X86_CR4_PKE)) 91 90 92 91 #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) 93 92 ··· 203 204 #define PFERR_GUEST_PAGE_MASK (1ULL << PFERR_GUEST_PAGE_BIT) 204 205 205 206 #define PFERR_NESTED_GUEST_PAGE (PFERR_GUEST_PAGE_MASK | \ 206 - PFERR_USER_MASK | \ 207 207 PFERR_WRITE_MASK | \ 208 208 PFERR_PRESENT_MASK) 209 209 ··· 315 317 int size; 316 318 }; 317 319 320 + #define PT64_ROOT_MAX_LEVEL 5 321 + 318 322 struct rsvd_bits_validate { 319 - u64 rsvd_bits_mask[2][4]; 323 + u64 rsvd_bits_mask[2][PT64_ROOT_MAX_LEVEL]; 320 324 u64 bad_mt_xwr; 321 325 }; 322 326 323 327 /* 324 - * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level 325 - * 32-bit). The kvm_mmu structure abstracts the details of the current mmu 326 - * mode. 328 + * x86 supports 4 paging modes (5-level 64-bit, 4-level 64-bit, 3-level 32-bit, 329 + * and 2-level 32-bit). The kvm_mmu structure abstracts the details of the 330 + * current mmu mode. 
327 331 */ 328 332 struct kvm_mmu { 329 333 void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root); ··· 548 548 549 549 struct kvm_queued_exception { 550 550 bool pending; 551 + bool injected; 551 552 bool has_error_code; 552 - bool reinject; 553 553 u8 nr; 554 554 u32 error_code; 555 555 u8 nested_apf; ··· 687 687 int pending_ioapic_eoi; 688 688 int pending_external_vector; 689 689 690 - /* GPA available (AMD only) */ 690 + /* GPA available */ 691 691 bool gpa_available; 692 + gpa_t gpa_val; 693 + 694 + /* be preempted when it's in kernel-mode(cpl=0) */ 695 + bool preempted_in_kernel; 692 696 }; 693 697 694 698 struct kvm_lpage_info { ··· 983 979 void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector); 984 980 int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu); 985 981 int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); 986 - int (*get_tdp_level)(void); 982 + int (*get_tdp_level)(struct kvm_vcpu *vcpu); 987 983 u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); 988 984 int (*get_lpage_level)(void); 989 985 bool (*rdtscp_supported)(void); ··· 1299 1295 static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code) 1300 1296 { 1301 1297 kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); 1302 - } 1303 - 1304 - static inline u64 get_canonical(u64 la) 1305 - { 1306 - return ((int64_t)la << 16) >> 16; 1307 - } 1308 - 1309 - static inline bool is_noncanonical_address(u64 la) 1310 - { 1311 - #ifdef CONFIG_X86_64 1312 - return get_canonical(la) != la; 1313 - #else 1314 - return false; 1315 - #endif 1316 1298 } 1317 1299 1318 1300 #define TSS_IOPB_BASE_OFFSET 0x66
+6
arch/x86/include/asm/svm.h
··· 107 107 #define V_IRQ_SHIFT 8 108 108 #define V_IRQ_MASK (1 << V_IRQ_SHIFT) 109 109 110 + #define V_GIF_SHIFT 9 111 + #define V_GIF_MASK (1 << V_GIF_SHIFT) 112 + 110 113 #define V_INTR_PRIO_SHIFT 16 111 114 #define V_INTR_PRIO_MASK (0x0f << V_INTR_PRIO_SHIFT) 112 115 ··· 118 115 119 116 #define V_INTR_MASKING_SHIFT 24 120 117 #define V_INTR_MASKING_MASK (1 << V_INTR_MASKING_SHIFT) 118 + 119 + #define V_GIF_ENABLE_SHIFT 25 120 + #define V_GIF_ENABLE_MASK (1 << V_GIF_ENABLE_SHIFT) 121 121 122 122 #define AVIC_ENABLE_SHIFT 31 123 123 #define AVIC_ENABLE_MASK (1 << AVIC_ENABLE_SHIFT)
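With the two new `int_ctl` bits above, a vGIF-capable CPU tracks the guest's global interrupt flag in the VMCB itself: `V_GIF_ENABLE` says the feature is in use, `V_GIF` holds the current value. A hedged sketch of how svm.c's gif helpers use them (the software-flag fallback for non-vGIF hardware is simplified away here):

```c
#include <assert.h>
#include <stdint.h>

#define V_GIF_MASK        (1u << 9)   /* current virtual GIF value */
#define V_GIF_ENABLE_MASK (1u << 25)  /* vGIF is in use for this VMCB */

static void enable_gif(uint32_t *int_ctl)  { *int_ctl |= V_GIF_MASK; }
static void disable_gif(uint32_t *int_ctl) { *int_ctl &= ~V_GIF_MASK; }

static int gif_set(uint32_t int_ctl)
{
    if (int_ctl & V_GIF_ENABLE_MASK)
        return !!(int_ctl & V_GIF_MASK);
    return 1; /* simplification: treat GIF as set when vGIF is unused;
               * the kernel consults a software flag instead */
}
```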
+17 -5
arch/x86/include/asm/vmx.h
··· 72 72 #define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400 73 73 #define SECONDARY_EXEC_RDRAND 0x00000800 74 74 #define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000 75 + #define SECONDARY_EXEC_ENABLE_VMFUNC 0x00002000 75 76 #define SECONDARY_EXEC_SHADOW_VMCS 0x00004000 76 77 #define SECONDARY_EXEC_RDSEED 0x00010000 77 78 #define SECONDARY_EXEC_ENABLE_PML 0x00020000 ··· 114 113 #define VMX_MISC_PREEMPTION_TIMER_RATE_MASK 0x0000001f 115 114 #define VMX_MISC_SAVE_EFER_LMA 0x00000020 116 115 #define VMX_MISC_ACTIVITY_HLT 0x00000040 116 + 117 + /* VMFUNC functions */ 118 + #define VMX_VMFUNC_EPTP_SWITCHING 0x00000001 119 + #define VMFUNC_EPTP_ENTRIES 512 117 120 118 121 static inline u32 vmx_basic_vmcs_revision_id(u64 vmx_basic) 119 122 { ··· 192 187 APIC_ACCESS_ADDR_HIGH = 0x00002015, 193 188 POSTED_INTR_DESC_ADDR = 0x00002016, 194 189 POSTED_INTR_DESC_ADDR_HIGH = 0x00002017, 190 + VM_FUNCTION_CONTROL = 0x00002018, 191 + VM_FUNCTION_CONTROL_HIGH = 0x00002019, 195 192 EPT_POINTER = 0x0000201a, 196 193 EPT_POINTER_HIGH = 0x0000201b, 197 194 EOI_EXIT_BITMAP0 = 0x0000201c, ··· 204 197 EOI_EXIT_BITMAP2_HIGH = 0x00002021, 205 198 EOI_EXIT_BITMAP3 = 0x00002022, 206 199 EOI_EXIT_BITMAP3_HIGH = 0x00002023, 200 + EPTP_LIST_ADDRESS = 0x00002024, 201 + EPTP_LIST_ADDRESS_HIGH = 0x00002025, 207 202 VMREAD_BITMAP = 0x00002026, 208 203 VMWRITE_BITMAP = 0x00002028, 209 204 XSS_EXIT_BITMAP = 0x0000202C, ··· 453 444 454 445 #define VMX_EPT_EXECUTE_ONLY_BIT (1ull) 455 446 #define VMX_EPT_PAGE_WALK_4_BIT (1ull << 6) 447 + #define VMX_EPT_PAGE_WALK_5_BIT (1ull << 7) 456 448 #define VMX_EPTP_UC_BIT (1ull << 8) 457 449 #define VMX_EPTP_WB_BIT (1ull << 14) 458 450 #define VMX_EPT_2MB_PAGE_BIT (1ull << 16) ··· 469 459 #define VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT (1ull << 10) /* (42 - 32) */ 470 460 #define VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT (1ull << 11) /* (43 - 32) */ 471 461 472 - #define VMX_EPT_DEFAULT_GAW 3 473 - #define VMX_EPT_MAX_GAW 0x4 474 462 #define VMX_EPT_MT_EPTE_SHIFT 3 475 - 
#define VMX_EPT_GAW_EPTP_SHIFT 3 476 - #define VMX_EPT_AD_ENABLE_BIT (1ull << 6) 477 - #define VMX_EPT_DEFAULT_MT 0x6ull 463 + #define VMX_EPTP_PWL_MASK 0x38ull 464 + #define VMX_EPTP_PWL_4 0x18ull 465 + #define VMX_EPTP_PWL_5 0x20ull 466 + #define VMX_EPTP_AD_ENABLE_BIT (1ull << 6) 467 + #define VMX_EPTP_MT_MASK 0x7ull 468 + #define VMX_EPTP_MT_WB 0x6ull 469 + #define VMX_EPTP_MT_UC 0x0ull 478 470 #define VMX_EPT_READABLE_MASK 0x1ull 479 471 #define VMX_EPT_WRITABLE_MASK 0x2ull 480 472 #define VMX_EPT_EXECUTABLE_MASK 0x4ull
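The renamed `VMX_EPTP_*` constants above decompose the EPT pointer: memory type in bits 2:0, page-walk length in bits 5:3 (encoded as levels minus one, so 0x18 for 4 levels and 0x20 for the new 5-level walk), accessed/dirty enable in bit 6, then the 4KiB-aligned root table address. An illustrative composer (`build_eptp` is an assumed name; vmx.c's `construct_eptp` does the real equivalent):

```c
#include <assert.h>
#include <stdint.h>

#define VMX_EPTP_PWL_4         0x18ull
#define VMX_EPTP_PWL_5         0x20ull
#define VMX_EPTP_AD_ENABLE_BIT (1ull << 6)
#define VMX_EPTP_MT_WB         0x6ull

/* Compose an EPT pointer with write-back memory type, the requested
 * page-walk length, optional A/D tracking, and the root address. */
static uint64_t build_eptp(uint64_t root_pa, int levels, int enable_ad)
{
    uint64_t eptp = VMX_EPTP_MT_WB;

    eptp |= (levels == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;
    if (enable_ad)
        eptp |= VMX_EPTP_AD_ENABLE_BIT;
    return eptp | (root_pa & ~0xfffull);  /* root is 4KiB-aligned */
}
```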
+25 -11
arch/x86/kvm/cpuid.c
··· 126 126 best->ebx = xstate_required_size(vcpu->arch.xcr0, true); 127 127 128 128 /* 129 - * The existing code assumes virtual address is 48-bit in the canonical 130 - * address checks; exit if it is ever changed. 129 + * The existing code assumes virtual address is 48-bit or 57-bit in the 130 + * canonical address checks; exit if it is ever changed. 131 131 */ 132 132 best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); 133 - if (best && ((best->eax & 0xff00) >> 8) != 48 && 134 - ((best->eax & 0xff00) >> 8) != 0) 135 - return -EINVAL; 133 + if (best) { 134 + int vaddr_bits = (best->eax & 0xff00) >> 8; 135 + 136 + if (vaddr_bits != 48 && vaddr_bits != 57 && vaddr_bits != 0) 137 + return -EINVAL; 138 + } 136 139 137 140 /* Update physical-address width */ 138 141 vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); 142 + kvm_mmu_reset_context(vcpu); 139 143 140 144 kvm_pmu_refresh(vcpu); 141 145 return 0; ··· 387 383 388 384 /* cpuid 7.0.ecx*/ 389 385 const u32 kvm_cpuid_7_0_ecx_x86_features = 390 - F(AVX512VBMI) | F(PKU) | 0 /*OSPKE*/ | F(AVX512_VPOPCNTDQ); 386 + F(AVX512VBMI) | F(LA57) | F(PKU) | 387 + 0 /*OSPKE*/ | F(AVX512_VPOPCNTDQ); 391 388 392 389 /* cpuid 7.0.edx*/ 393 390 const u32 kvm_cpuid_7_0_edx_x86_features = ··· 858 853 return kvm_find_cpuid_entry(vcpu, maxlevel->eax, index); 859 854 } 860 855 861 - void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx) 856 + bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, 857 + u32 *ecx, u32 *edx, bool check_limit) 862 858 { 863 859 u32 function = *eax, index = *ecx; 864 860 struct kvm_cpuid_entry2 *best; 861 + bool entry_found = true; 865 862 866 863 best = kvm_find_cpuid_entry(vcpu, function, index); 867 864 868 - if (!best) 869 - best = check_cpuid_limit(vcpu, function, index); 865 + if (!best) { 866 + entry_found = false; 867 + if (!check_limit) 868 + goto out; 870 869 870 + best = check_cpuid_limit(vcpu, function, index); 871 + } 872 + 873 + out: 871 874 if (best) { 872 875 *eax 
= best->eax; 873 876 *ebx = best->ebx; ··· 883 870 *edx = best->edx; 884 871 } else 885 872 *eax = *ebx = *ecx = *edx = 0; 886 - trace_kvm_cpuid(function, *eax, *ebx, *ecx, *edx); 873 + trace_kvm_cpuid(function, *eax, *ebx, *ecx, *edx, entry_found); 874 + return entry_found; 887 875 } 888 876 EXPORT_SYMBOL_GPL(kvm_cpuid); 889 877 ··· 897 883 898 884 eax = kvm_register_read(vcpu, VCPU_REGS_RAX); 899 885 ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); 900 - kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx); 886 + kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx, true); 901 887 kvm_register_write(vcpu, VCPU_REGS_RAX, eax); 902 888 kvm_register_write(vcpu, VCPU_REGS_RBX, ebx); 903 889 kvm_register_write(vcpu, VCPU_REGS_RCX, ecx);
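The cpuid.c change above gives `kvm_cpuid()` a return value and a `check_limit` flag: the return value says whether the exact leaf existed, and `check_limit` selects whether an out-of-range basic leaf is clamped to the highest advertised one (as real CPUs behave) or simply reads as zero. A toy model of that contract, with an illustrative table and only EAX returned:

```c
#include <assert.h>
#include <stdint.h>

struct cpuid_entry { uint32_t function; uint32_t eax; };

/* Toy CPUID table; leaf 0's EAX advertises the max basic leaf (0x2). */
static const struct cpuid_entry table[] = {
    { 0x0, 0x2 },
    { 0x1, 0x000306c3 },
    { 0x2, 0x76035a01 },
};

static const struct cpuid_entry *find_entry(uint32_t fn)
{
    unsigned i;

    for (i = 0; i < sizeof(table) / sizeof(table[0]); i++)
        if (table[i].function == fn)
            return &table[i];
    return 0;
}

/* Returns whether the exact leaf was found; with check_limit, misses
 * fall back to the highest basic leaf instead of reading as zero. */
static int do_cpuid(uint32_t fn, uint32_t *eax, int check_limit)
{
    const struct cpuid_entry *e = find_entry(fn);
    int found = (e != 0);

    if (!e && check_limit)
        e = find_entry(table[0].eax);  /* clamp to max basic leaf */
    *eax = e ? e->eax : 0;
    return found;
}
```

This is why the emulator call sites above pass `false` for internal feature probes (they want a hard miss) while the guest-visible `CPUID` path passes `true`.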
+77 -135
arch/x86/kvm/cpuid.h
··· 3 3 4 4 #include "x86.h" 5 5 #include <asm/cpu.h> 6 + #include <asm/processor.h> 6 7 7 8 int kvm_update_cpuid(struct kvm_vcpu *vcpu); 8 9 bool kvm_mpx_supported(void); ··· 21 20 int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, 22 21 struct kvm_cpuid2 *cpuid, 23 22 struct kvm_cpuid_entry2 __user *entries); 24 - void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx); 23 + bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, 24 + u32 *ecx, u32 *edx, bool check_limit); 25 25 26 26 int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu); 27 27 ··· 31 29 return vcpu->arch.maxphyaddr; 32 30 } 33 31 34 - static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu) 35 - { 36 - struct kvm_cpuid_entry2 *best; 32 + struct cpuid_reg { 33 + u32 function; 34 + u32 index; 35 + int reg; 36 + }; 37 37 38 - if (!static_cpu_has(X86_FEATURE_XSAVE)) 38 + static const struct cpuid_reg reverse_cpuid[] = { 39 + [CPUID_1_EDX] = { 1, 0, CPUID_EDX}, 40 + [CPUID_8000_0001_EDX] = {0x80000001, 0, CPUID_EDX}, 41 + [CPUID_8086_0001_EDX] = {0x80860001, 0, CPUID_EDX}, 42 + [CPUID_1_ECX] = { 1, 0, CPUID_ECX}, 43 + [CPUID_C000_0001_EDX] = {0xc0000001, 0, CPUID_EDX}, 44 + [CPUID_8000_0001_ECX] = {0xc0000001, 0, CPUID_ECX}, 45 + [CPUID_7_0_EBX] = { 7, 0, CPUID_EBX}, 46 + [CPUID_D_1_EAX] = { 0xd, 1, CPUID_EAX}, 47 + [CPUID_F_0_EDX] = { 0xf, 0, CPUID_EDX}, 48 + [CPUID_F_1_EDX] = { 0xf, 1, CPUID_EDX}, 49 + [CPUID_8000_0008_EBX] = {0x80000008, 0, CPUID_EBX}, 50 + [CPUID_6_EAX] = { 6, 0, CPUID_EAX}, 51 + [CPUID_8000_000A_EDX] = {0x8000000a, 0, CPUID_EDX}, 52 + [CPUID_7_ECX] = { 7, 0, CPUID_ECX}, 53 + [CPUID_8000_0007_EBX] = {0x80000007, 0, CPUID_EBX}, 54 + }; 55 + 56 + static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned x86_feature) 57 + { 58 + unsigned x86_leaf = x86_feature / 32; 59 + 60 + BUILD_BUG_ON(!__builtin_constant_p(x86_leaf)); 61 + BUILD_BUG_ON(x86_leaf >= ARRAY_SIZE(reverse_cpuid)); 62 + BUILD_BUG_ON(reverse_cpuid[x86_leaf].function == 0); 63 + 64 + 
return reverse_cpuid[x86_leaf]; 65 + } 66 + 67 + static __always_inline int *guest_cpuid_get_register(struct kvm_vcpu *vcpu, unsigned x86_feature) 68 + { 69 + struct kvm_cpuid_entry2 *entry; 70 + const struct cpuid_reg cpuid = x86_feature_cpuid(x86_feature); 71 + 72 + entry = kvm_find_cpuid_entry(vcpu, cpuid.function, cpuid.index); 73 + if (!entry) 74 + return NULL; 75 + 76 + switch (cpuid.reg) { 77 + case CPUID_EAX: 78 + return &entry->eax; 79 + case CPUID_EBX: 80 + return &entry->ebx; 81 + case CPUID_ECX: 82 + return &entry->ecx; 83 + case CPUID_EDX: 84 + return &entry->edx; 85 + default: 86 + BUILD_BUG(); 87 + return NULL; 88 + } 89 + } 90 + 91 + static __always_inline bool guest_cpuid_has(struct kvm_vcpu *vcpu, unsigned x86_feature) 92 + { 93 + int *reg; 94 + 95 + if (x86_feature == X86_FEATURE_XSAVE && 96 + !static_cpu_has(X86_FEATURE_XSAVE)) 39 97 return false; 40 98 41 - best = kvm_find_cpuid_entry(vcpu, 1, 0); 42 - return best && (best->ecx & bit(X86_FEATURE_XSAVE)); 99 + reg = guest_cpuid_get_register(vcpu, x86_feature); 100 + if (!reg) 101 + return false; 102 + 103 + return *reg & bit(x86_feature); 43 104 } 44 105 45 - static inline bool guest_cpuid_has_mtrr(struct kvm_vcpu *vcpu) 106 + static __always_inline void guest_cpuid_clear(struct kvm_vcpu *vcpu, unsigned x86_feature) 46 107 { 47 - struct kvm_cpuid_entry2 *best; 108 + int *reg; 48 109 49 - best = kvm_find_cpuid_entry(vcpu, 1, 0); 50 - return best && (best->edx & bit(X86_FEATURE_MTRR)); 51 - } 52 - 53 - static inline bool guest_cpuid_has_tsc_adjust(struct kvm_vcpu *vcpu) 54 - { 55 - struct kvm_cpuid_entry2 *best; 56 - 57 - best = kvm_find_cpuid_entry(vcpu, 7, 0); 58 - return best && (best->ebx & bit(X86_FEATURE_TSC_ADJUST)); 59 - } 60 - 61 - static inline bool guest_cpuid_has_smep(struct kvm_vcpu *vcpu) 62 - { 63 - struct kvm_cpuid_entry2 *best; 64 - 65 - best = kvm_find_cpuid_entry(vcpu, 7, 0); 66 - return best && (best->ebx & bit(X86_FEATURE_SMEP)); 67 - } 68 - 69 - static inline bool 
guest_cpuid_has_smap(struct kvm_vcpu *vcpu) 70 - { 71 - struct kvm_cpuid_entry2 *best; 72 - 73 - best = kvm_find_cpuid_entry(vcpu, 7, 0); 74 - return best && (best->ebx & bit(X86_FEATURE_SMAP)); 75 - } 76 - 77 - static inline bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu) 78 - { 79 - struct kvm_cpuid_entry2 *best; 80 - 81 - best = kvm_find_cpuid_entry(vcpu, 7, 0); 82 - return best && (best->ebx & bit(X86_FEATURE_FSGSBASE)); 83 - } 84 - 85 - static inline bool guest_cpuid_has_pku(struct kvm_vcpu *vcpu) 86 - { 87 - struct kvm_cpuid_entry2 *best; 88 - 89 - best = kvm_find_cpuid_entry(vcpu, 7, 0); 90 - return best && (best->ecx & bit(X86_FEATURE_PKU)); 91 - } 92 - 93 - static inline bool guest_cpuid_has_longmode(struct kvm_vcpu *vcpu) 94 - { 95 - struct kvm_cpuid_entry2 *best; 96 - 97 - best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); 98 - return best && (best->edx & bit(X86_FEATURE_LM)); 99 - } 100 - 101 - static inline bool guest_cpuid_has_osvw(struct kvm_vcpu *vcpu) 102 - { 103 - struct kvm_cpuid_entry2 *best; 104 - 105 - best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); 106 - return best && (best->ecx & bit(X86_FEATURE_OSVW)); 107 - } 108 - 109 - static inline bool guest_cpuid_has_pcid(struct kvm_vcpu *vcpu) 110 - { 111 - struct kvm_cpuid_entry2 *best; 112 - 113 - best = kvm_find_cpuid_entry(vcpu, 1, 0); 114 - return best && (best->ecx & bit(X86_FEATURE_PCID)); 115 - } 116 - 117 - static inline bool guest_cpuid_has_x2apic(struct kvm_vcpu *vcpu) 118 - { 119 - struct kvm_cpuid_entry2 *best; 120 - 121 - best = kvm_find_cpuid_entry(vcpu, 1, 0); 122 - return best && (best->ecx & bit(X86_FEATURE_X2APIC)); 110 + reg = guest_cpuid_get_register(vcpu, x86_feature); 111 + if (reg) 112 + *reg &= ~bit(x86_feature); 123 113 } 124 114 125 115 static inline bool guest_cpuid_is_amd(struct kvm_vcpu *vcpu) ··· 121 127 best = kvm_find_cpuid_entry(vcpu, 0, 0); 122 128 return best && best->ebx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx; 123 129 } 124 - 125 - static inline bool 
guest_cpuid_has_gbpages(struct kvm_vcpu *vcpu) 126 - { 127 - struct kvm_cpuid_entry2 *best; 128 - 129 - best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); 130 - return best && (best->edx & bit(X86_FEATURE_GBPAGES)); 131 - } 132 - 133 - static inline bool guest_cpuid_has_rtm(struct kvm_vcpu *vcpu) 134 - { 135 - struct kvm_cpuid_entry2 *best; 136 - 137 - best = kvm_find_cpuid_entry(vcpu, 7, 0); 138 - return best && (best->ebx & bit(X86_FEATURE_RTM)); 139 - } 140 - 141 - static inline bool guest_cpuid_has_mpx(struct kvm_vcpu *vcpu) 142 - { 143 - struct kvm_cpuid_entry2 *best; 144 - 145 - best = kvm_find_cpuid_entry(vcpu, 7, 0); 146 - return best && (best->ebx & bit(X86_FEATURE_MPX)); 147 - } 148 - 149 - static inline bool guest_cpuid_has_rdtscp(struct kvm_vcpu *vcpu) 150 - { 151 - struct kvm_cpuid_entry2 *best; 152 - 153 - best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); 154 - return best && (best->edx & bit(X86_FEATURE_RDTSCP)); 155 - } 156 - 157 - /* 158 - * NRIPS is provided through cpuidfn 0x8000000a.edx bit 3 159 - */ 160 - #define BIT_NRIPS 3 161 - 162 - static inline bool guest_cpuid_has_nrips(struct kvm_vcpu *vcpu) 163 - { 164 - struct kvm_cpuid_entry2 *best; 165 - 166 - best = kvm_find_cpuid_entry(vcpu, 0x8000000a, 0); 167 - 168 - /* 169 - * NRIPS is a scattered cpuid feature, so we can't use 170 - * X86_FEATURE_NRIPS here (X86_FEATURE_NRIPS would be bit 171 - * position 8, not 3). 172 - */ 173 - return best && (best->edx & bit(BIT_NRIPS)); 174 - } 175 - #undef BIT_NRIPS 176 130 177 131 static inline int guest_cpuid_family(struct kvm_vcpu *vcpu) 178 132 {
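The rewrite above replaces a dozen per-feature `guest_cpuid_has_*()` helpers with one table-driven lookup: an `X86_FEATURE_*` value encodes `word * 32 + bit`, so dividing by 32 recovers the word that names the defining CPUID leaf and register, and the low five bits give the mask within it. A two-entry sketch of the mechanism (only the table rows needed for the test are modeled):

```c
#include <assert.h>
#include <stdint.h>

enum { CPUID_EAX, CPUID_EBX, CPUID_ECX, CPUID_EDX };

struct cpuid_reg { uint32_t function; uint32_t index; int reg; };

/* Two rows of the reverse-CPUID table: word 9 is CPUID.(7,0).EBX and
 * word 16 is CPUID.(7,0).ECX in the kernel's cpufeatures layout. */
static const struct cpuid_reg reverse_cpuid[] = {
    [9]  = { 7, 0, CPUID_EBX },   /* CPUID_7_0_EBX */
    [16] = { 7, 0, CPUID_ECX },   /* CPUID_7_ECX */
};

static struct cpuid_reg x86_feature_cpuid(unsigned x86_feature)
{
    return reverse_cpuid[x86_feature / 32];
}

static uint32_t feature_mask(unsigned x86_feature)
{
    return 1u << (x86_feature & 31);
}
```

The kernel version does the same split at compile time, with `BUILD_BUG_ON`s guaranteeing the feature constant maps to a populated table row.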
+27 -15
arch/x86/kvm/emulate.c
··· 28 28 29 29 #include "x86.h" 30 30 #include "tss.h" 31 + #include "mmu.h" 31 32 32 33 /* 33 34 * Operand types ··· 689 688 ulong la; 690 689 u32 lim; 691 690 u16 sel; 691 + u8 va_bits; 692 692 693 693 la = seg_base(ctxt, addr.seg) + addr.ea; 694 694 *max_size = 0; 695 695 switch (mode) { 696 696 case X86EMUL_MODE_PROT64: 697 697 *linear = la; 698 - if (is_noncanonical_address(la)) 698 + va_bits = ctxt_virt_addr_bits(ctxt); 699 + if (get_canonical(la, va_bits) != la) 699 700 goto bad; 700 701 701 - *max_size = min_t(u64, ~0u, (1ull << 48) - la); 702 + *max_size = min_t(u64, ~0u, (1ull << va_bits) - la); 702 703 if (size > *max_size) 703 704 goto bad; 704 705 break; ··· 1751 1748 sizeof(base3), &ctxt->exception); 1752 1749 if (ret != X86EMUL_CONTINUE) 1753 1750 return ret; 1754 - if (is_noncanonical_address(get_desc_base(&seg_desc) | 1755 - ((u64)base3 << 32))) 1751 + if (emul_is_noncanonical_address(get_desc_base(&seg_desc) | 1752 + ((u64)base3 << 32), ctxt)) 1756 1753 return emulate_gp(ctxt, 0); 1757 1754 } 1758 1755 load: ··· 2336 2333 2337 2334 eax = 0x80000001; 2338 2335 ecx = 0; 2339 - ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx); 2336 + ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false); 2340 2337 return edx & bit(X86_FEATURE_LM); 2341 2338 } 2342 2339 ··· 2639 2636 u32 eax, ebx, ecx, edx; 2640 2637 2641 2638 eax = ecx = 0; 2642 - ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx); 2639 + ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false); 2643 2640 return ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx 2644 2641 && ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx 2645 2642 && edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx; ··· 2659 2656 2660 2657 eax = 0x00000000; 2661 2658 ecx = 0x00000000; 2662 - ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx); 2659 + ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false); 2663 2660 /* 2664 2661 * Intel ("GenuineIntel") 2665 2662 * remark: Intel CPUs only support "syscall" in 64bit ··· 2843 2840 ss_sel = cs_sel + 
8; 2844 2841 cs.d = 0; 2845 2842 cs.l = 1; 2846 - if (is_noncanonical_address(rcx) || 2847 - is_noncanonical_address(rdx)) 2843 + if (emul_is_noncanonical_address(rcx, ctxt) || 2844 + emul_is_noncanonical_address(rdx, ctxt)) 2848 2845 return emulate_gp(ctxt, 0); 2849 2846 break; 2850 2847 } ··· 3554 3551 /* 3555 3552 * Check MOVBE is set in the guest-visible CPUID leaf. 3556 3553 */ 3557 - ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx); 3554 + ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false); 3558 3555 if (!(ecx & FFL(MOVBE))) 3559 3556 return emulate_ud(ctxt); 3560 3557 ··· 3759 3756 if (rc != X86EMUL_CONTINUE) 3760 3757 return rc; 3761 3758 if (ctxt->mode == X86EMUL_MODE_PROT64 && 3762 - is_noncanonical_address(desc_ptr.address)) 3759 + emul_is_noncanonical_address(desc_ptr.address, ctxt)) 3763 3760 return emulate_gp(ctxt, 0); 3764 3761 if (lgdt) 3765 3762 ctxt->ops->set_gdt(ctxt, &desc_ptr); ··· 3868 3865 3869 3866 eax = reg_read(ctxt, VCPU_REGS_RAX); 3870 3867 ecx = reg_read(ctxt, VCPU_REGS_RCX); 3871 - ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx); 3868 + ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, true); 3872 3869 *reg_write(ctxt, VCPU_REGS_RAX) = eax; 3873 3870 *reg_write(ctxt, VCPU_REGS_RBX) = ebx; 3874 3871 *reg_write(ctxt, VCPU_REGS_RCX) = ecx; ··· 3927 3924 { 3928 3925 u32 eax = 1, ebx, ecx = 0, edx; 3929 3926 3930 - ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx); 3927 + ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false); 3931 3928 if (!(edx & FFL(FXSR))) 3932 3929 return emulate_ud(ctxt); 3933 3930 ··· 4100 4097 u64 rsvd = 0; 4101 4098 4102 4099 ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); 4103 - if (efer & EFER_LMA) 4104 - rsvd = CR3_L_MODE_RESERVED_BITS & ~CR3_PCID_INVD; 4100 + if (efer & EFER_LMA) { 4101 + u64 maxphyaddr; 4102 + u32 eax = 0x80000008; 4103 + 4104 + if (ctxt->ops->get_cpuid(ctxt, &eax, NULL, NULL, 4105 + NULL, false)) 4106 + maxphyaddr = eax & 0xff; 4107 + else 4108 + maxphyaddr = 36; 4109 + rsvd = 
rsvd_bits(maxphyaddr, 62); 4110 + } 4105 4111 4106 4112 if (new_val & rsvd) 4107 4113 return emulate_gp(ctxt, 0);
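The emulate.c hunks above stop hard-coding 48-bit virtual addresses: `get_canonical()` now takes the guest's address width (48, or 57 with LA57), and an address is canonical when sign-extending from bit `va_bits - 1` reproduces it unchanged. A self-contained version of that check:

```c
#include <assert.h>
#include <stdint.h>

/* Sign-extend la from bit (va_bits - 1): shift the sign bit up to
 * bit 63, then arithmetic-shift back down. */
static uint64_t get_canonical(uint64_t la, uint8_t va_bits)
{
    return (uint64_t)(((int64_t)la << (64 - va_bits)) >> (64 - va_bits));
}

static int is_noncanonical(uint64_t la, uint8_t va_bits)
{
    return get_canonical(la, va_bits) != la;
}
```

Note the asymmetry this change handles: `0x0000800000000000` faults on a 48-bit guest but is a perfectly valid user address once LA57 widens the canonical hole.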
+7 -1
arch/x86/kvm/hyperv.c
··· 1160 1160 return stimer_get_count(vcpu_to_stimer(vcpu, timer_index), 1161 1161 pdata); 1162 1162 } 1163 + case HV_X64_MSR_TSC_FREQUENCY: 1164 + data = (u64)vcpu->arch.virtual_tsc_khz * 1000; 1165 + break; 1166 + case HV_X64_MSR_APIC_FREQUENCY: 1167 + data = APIC_BUS_FREQUENCY; 1168 + break; 1163 1169 default: 1164 1170 vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); 1165 1171 return 1; ··· 1274 1268 1275 1269 switch (code) { 1276 1270 case HVCALL_NOTIFY_LONG_SPIN_WAIT: 1277 - kvm_vcpu_on_spin(vcpu); 1271 + kvm_vcpu_on_spin(vcpu, true); 1278 1272 break; 1279 1273 case HVCALL_POST_MESSAGE: 1280 1274 case HVCALL_SIGNAL_EVENT:
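The two newly emulated Hyper-V MSRs report frequencies in Hz: KVM tracks the guest TSC in kHz, and its APIC timer model ticks once per nanosecond of bus time, giving a fixed 1 GHz bus frequency. A sketch of both conversions (helper names assumed):

```c
#include <assert.h>
#include <stdint.h>

#define APIC_BUS_CYCLE_NS 1

/* HV_X64_MSR_TSC_FREQUENCY: kHz -> Hz */
static uint64_t hv_tsc_frequency(uint32_t virtual_tsc_khz)
{
    return (uint64_t)virtual_tsc_khz * 1000;
}

/* HV_X64_MSR_APIC_FREQUENCY: one tick per APIC_BUS_CYCLE_NS */
static uint64_t hv_apic_frequency(void)
{
    return 1000000000ull / APIC_BUS_CYCLE_NS;
}
```

This is also why the `APIC_BUS_CYCLE_NS` definition moves from lapic.c to lapic.h in the hunks below: hyperv.c now needs the derived `APIC_BUS_FREQUENCY` constant.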
+1 -1
arch/x86/kvm/kvm_cache_regs.h
··· 4 4 #define KVM_POSSIBLE_CR0_GUEST_BITS X86_CR0_TS 5 5 #define KVM_POSSIBLE_CR4_GUEST_BITS \ 6 6 (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ 7 - | X86_CR4_OSXMMEXCPT | X86_CR4_PGE) 7 + | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_PGE) 8 8 9 9 static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu, 10 10 enum kvm_reg reg)
-2
arch/x86/kvm/lapic.c
··· 54 54 #define PRIu64 "u" 55 55 #define PRIo64 "o" 56 56 57 - #define APIC_BUS_CYCLE_NS 1 58 - 59 57 /* #define apic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) */ 60 58 #define apic_debug(fmt, arg...) 61 59
+3
arch/x86/kvm/lapic.h
··· 12 12 #define KVM_APIC_SHORT_MASK 0xc0000 13 13 #define KVM_APIC_DEST_MASK 0x800 14 14 15 + #define APIC_BUS_CYCLE_NS 1 16 + #define APIC_BUS_FREQUENCY (1000000000ULL / APIC_BUS_CYCLE_NS) 17 + 15 18 struct kvm_timer { 16 19 struct hrtimer timer; 17 20 s64 period; /* unit: ns */
+171 -92
arch/x86/kvm/mmu.c
··· 2169 2169 } 2170 2170 2171 2171 struct mmu_page_path { 2172 - struct kvm_mmu_page *parent[PT64_ROOT_LEVEL]; 2173 - unsigned int idx[PT64_ROOT_LEVEL]; 2172 + struct kvm_mmu_page *parent[PT64_ROOT_MAX_LEVEL]; 2173 + unsigned int idx[PT64_ROOT_MAX_LEVEL]; 2174 2174 }; 2175 2175 2176 2176 #define for_each_sp(pvec, sp, parents, i) \ ··· 2385 2385 iterator->shadow_addr = vcpu->arch.mmu.root_hpa; 2386 2386 iterator->level = vcpu->arch.mmu.shadow_root_level; 2387 2387 2388 - if (iterator->level == PT64_ROOT_LEVEL && 2389 - vcpu->arch.mmu.root_level < PT64_ROOT_LEVEL && 2388 + if (iterator->level == PT64_ROOT_4LEVEL && 2389 + vcpu->arch.mmu.root_level < PT64_ROOT_4LEVEL && 2390 2390 !vcpu->arch.mmu.direct_map) 2391 2391 --iterator->level; 2392 2392 ··· 2610 2610 2611 2611 sp = list_last_entry(&kvm->arch.active_mmu_pages, 2612 2612 struct kvm_mmu_page, link); 2613 - kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); 2614 - 2615 - return true; 2613 + return kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); 2616 2614 } 2617 2615 2618 2616 /* ··· 3260 3262 3261 3263 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, 3262 3264 gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable); 3263 - static void make_mmu_pages_available(struct kvm_vcpu *vcpu); 3265 + static int make_mmu_pages_available(struct kvm_vcpu *vcpu); 3264 3266 3265 3267 static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, 3266 3268 gfn_t gfn, bool prefault) ··· 3300 3302 spin_lock(&vcpu->kvm->mmu_lock); 3301 3303 if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) 3302 3304 goto out_unlock; 3303 - make_mmu_pages_available(vcpu); 3305 + if (make_mmu_pages_available(vcpu) < 0) 3306 + goto out_unlock; 3304 3307 if (likely(!force_pt_level)) 3305 3308 transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); 3306 3309 r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault); ··· 3325 3326 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) 3326 3327 return; 3327 3328 3328 - if 
(vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL && 3329 - (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL || 3329 + if (vcpu->arch.mmu.shadow_root_level >= PT64_ROOT_4LEVEL && 3330 + (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL || 3330 3331 vcpu->arch.mmu.direct_map)) { 3331 3332 hpa_t root = vcpu->arch.mmu.root_hpa; 3332 3333 ··· 3378 3379 struct kvm_mmu_page *sp; 3379 3380 unsigned i; 3380 3381 3381 - if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { 3382 + if (vcpu->arch.mmu.shadow_root_level >= PT64_ROOT_4LEVEL) { 3382 3383 spin_lock(&vcpu->kvm->mmu_lock); 3383 - make_mmu_pages_available(vcpu); 3384 - sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_LEVEL, 1, ACC_ALL); 3384 + if(make_mmu_pages_available(vcpu) < 0) { 3385 + spin_unlock(&vcpu->kvm->mmu_lock); 3386 + return 1; 3387 + } 3388 + sp = kvm_mmu_get_page(vcpu, 0, 0, 3389 + vcpu->arch.mmu.shadow_root_level, 1, ACC_ALL); 3385 3390 ++sp->root_count; 3386 3391 spin_unlock(&vcpu->kvm->mmu_lock); 3387 3392 vcpu->arch.mmu.root_hpa = __pa(sp->spt); ··· 3395 3392 3396 3393 MMU_WARN_ON(VALID_PAGE(root)); 3397 3394 spin_lock(&vcpu->kvm->mmu_lock); 3398 - make_mmu_pages_available(vcpu); 3395 + if (make_mmu_pages_available(vcpu) < 0) { 3396 + spin_unlock(&vcpu->kvm->mmu_lock); 3397 + return 1; 3398 + } 3399 3399 sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT), 3400 3400 i << 30, PT32_ROOT_LEVEL, 1, ACC_ALL); 3401 3401 root = __pa(sp->spt); ··· 3429 3423 * Do we shadow a long mode page table? If so we need to 3430 3424 * write-protect the guests page table root. 
3431 3425 */ 3432 - if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { 3426 + if (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) { 3433 3427 hpa_t root = vcpu->arch.mmu.root_hpa; 3434 3428 3435 3429 MMU_WARN_ON(VALID_PAGE(root)); 3436 3430 3437 3431 spin_lock(&vcpu->kvm->mmu_lock); 3438 - make_mmu_pages_available(vcpu); 3439 - sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL, 3440 - 0, ACC_ALL); 3432 + if (make_mmu_pages_available(vcpu) < 0) { 3433 + spin_unlock(&vcpu->kvm->mmu_lock); 3434 + return 1; 3435 + } 3436 + sp = kvm_mmu_get_page(vcpu, root_gfn, 0, 3437 + vcpu->arch.mmu.shadow_root_level, 0, ACC_ALL); 3441 3438 root = __pa(sp->spt); 3442 3439 ++sp->root_count; 3443 3440 spin_unlock(&vcpu->kvm->mmu_lock); ··· 3454 3445 * the shadow page table may be a PAE or a long mode page table. 3455 3446 */ 3456 3447 pm_mask = PT_PRESENT_MASK; 3457 - if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) 3448 + if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL) 3458 3449 pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK; 3459 3450 3460 3451 for (i = 0; i < 4; ++i) { ··· 3472 3463 return 1; 3473 3464 } 3474 3465 spin_lock(&vcpu->kvm->mmu_lock); 3475 - make_mmu_pages_available(vcpu); 3466 + if (make_mmu_pages_available(vcpu) < 0) { 3467 + spin_unlock(&vcpu->kvm->mmu_lock); 3468 + return 1; 3469 + } 3476 3470 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL, 3477 3471 0, ACC_ALL); 3478 3472 root = __pa(sp->spt); ··· 3490 3478 * If we shadow a 32 bit page table with a long mode page 3491 3479 * table we enter this path. 
3492 3480 */ 3493 - if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { 3481 + if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL) { 3494 3482 if (vcpu->arch.mmu.lm_root == NULL) { 3495 3483 /* 3496 3484 * The additional page necessary for this is only ··· 3535 3523 3536 3524 vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY); 3537 3525 kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); 3538 - if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { 3526 + if (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) { 3539 3527 hpa_t root = vcpu->arch.mmu.root_hpa; 3540 3528 sp = page_header(root); 3541 3529 mmu_sync_children(vcpu, sp); ··· 3600 3588 3601 3589 static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct) 3602 3590 { 3591 + /* 3592 + * A nested guest cannot use the MMIO cache if it is using nested 3593 + * page tables, because cr2 is a nGPA while the cache stores GPAs. 3594 + */ 3595 + if (mmu_is_nested(vcpu)) 3596 + return false; 3597 + 3603 3598 if (direct) 3604 3599 return vcpu_match_mmio_gpa(vcpu, addr); 3605 3600 ··· 3618 3599 walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) 3619 3600 { 3620 3601 struct kvm_shadow_walk_iterator iterator; 3621 - u64 sptes[PT64_ROOT_LEVEL], spte = 0ull; 3602 + u64 sptes[PT64_ROOT_MAX_LEVEL], spte = 0ull; 3622 3603 int root, leaf; 3623 3604 bool reserved = false; 3624 3605 ··· 3659 3640 return reserved; 3660 3641 } 3661 3642 3662 - int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct) 3643 + /* 3644 + * Return values of handle_mmio_page_fault: 3645 + * RET_MMIO_PF_EMULATE: it is a real mmio page fault, emulate the instruction 3646 + * directly. 3647 + * RET_MMIO_PF_INVALID: invalid spte is detected then let the real page 3648 + * fault path update the mmio spte. 3649 + * RET_MMIO_PF_RETRY: let CPU fault again on the address. 3650 + * RET_MMIO_PF_BUG: a bug was detected (and a WARN was printed). 
3651 + */ 3652 + enum { 3653 + RET_MMIO_PF_EMULATE = 1, 3654 + RET_MMIO_PF_INVALID = 2, 3655 + RET_MMIO_PF_RETRY = 0, 3656 + RET_MMIO_PF_BUG = -1 3657 + }; 3658 + 3659 + static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct) 3663 3660 { 3664 3661 u64 spte; 3665 3662 bool reserved; ··· 3907 3872 spin_lock(&vcpu->kvm->mmu_lock); 3908 3873 if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) 3909 3874 goto out_unlock; 3910 - make_mmu_pages_available(vcpu); 3875 + if (make_mmu_pages_available(vcpu) < 0) 3876 + goto out_unlock; 3911 3877 if (likely(!force_pt_level)) 3912 3878 transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); 3913 3879 r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault); ··· 4061 4025 rsvd_check->rsvd_bits_mask[1][0] = 4062 4026 rsvd_check->rsvd_bits_mask[0][0]; 4063 4027 break; 4064 - case PT64_ROOT_LEVEL: 4028 + case PT64_ROOT_5LEVEL: 4029 + rsvd_check->rsvd_bits_mask[0][4] = exb_bit_rsvd | 4030 + nonleaf_bit8_rsvd | rsvd_bits(7, 7) | 4031 + rsvd_bits(maxphyaddr, 51); 4032 + rsvd_check->rsvd_bits_mask[1][4] = 4033 + rsvd_check->rsvd_bits_mask[0][4]; 4034 + case PT64_ROOT_4LEVEL: 4065 4035 rsvd_check->rsvd_bits_mask[0][3] = exb_bit_rsvd | 4066 4036 nonleaf_bit8_rsvd | rsvd_bits(7, 7) | 4067 4037 rsvd_bits(maxphyaddr, 51); ··· 4097 4055 { 4098 4056 __reset_rsvds_bits_mask(vcpu, &context->guest_rsvd_check, 4099 4057 cpuid_maxphyaddr(vcpu), context->root_level, 4100 - context->nx, guest_cpuid_has_gbpages(vcpu), 4058 + context->nx, 4059 + guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES), 4101 4060 is_pse(vcpu), guest_cpuid_is_amd(vcpu)); 4102 4061 } 4103 4062 ··· 4108 4065 { 4109 4066 u64 bad_mt_xwr; 4110 4067 4068 + rsvd_check->rsvd_bits_mask[0][4] = 4069 + rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7); 4111 4070 rsvd_check->rsvd_bits_mask[0][3] = 4112 4071 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7); 4113 4072 rsvd_check->rsvd_bits_mask[0][2] = ··· 4119 4074 rsvd_check->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51); 4120 
 		/* large page */
+		rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4];
 		rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3];
 		rsvd_check->rsvd_bits_mask[1][2] =
 			rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29);
···
 	__reset_rsvds_bits_mask(vcpu, shadow_zero_check,
 				boot_cpu_data.x86_phys_bits,
 				context->shadow_root_level, uses_nx,
-				guest_cpuid_has_gbpages(vcpu), is_pse(vcpu),
-				true);
+				guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES),
+				is_pse(vcpu), true);
 
 	if (!shadow_me_mask)
 		return;
···
 				    boot_cpu_data.x86_phys_bits, execonly);
 }
 
+#define BYTE_MASK(access) \
+	((1 & (access) ? 2 : 0) | \
+	 (2 & (access) ? 4 : 0) | \
+	 (3 & (access) ? 8 : 0) | \
+	 (4 & (access) ? 16 : 0) | \
+	 (5 & (access) ? 32 : 0) | \
+	 (6 & (access) ? 64 : 0) | \
+	 (7 & (access) ? 128 : 0))
+
+
 static void update_permission_bitmask(struct kvm_vcpu *vcpu,
 				      struct kvm_mmu *mmu, bool ept)
 {
-	unsigned bit, byte, pfec;
-	u8 map;
-	bool fault, x, w, u, wf, uf, ff, smapf, cr4_smap, cr4_smep, smap = 0;
+	unsigned byte;
 
-	cr4_smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
-	cr4_smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP);
+	const u8 x = BYTE_MASK(ACC_EXEC_MASK);
+	const u8 w = BYTE_MASK(ACC_WRITE_MASK);
+	const u8 u = BYTE_MASK(ACC_USER_MASK);
+
+	bool cr4_smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP) != 0;
+	bool cr4_smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP) != 0;
+	bool cr0_wp = is_write_protection(vcpu);
+
 	for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) {
-		pfec = byte << 1;
-		map = 0;
-		wf = pfec & PFERR_WRITE_MASK;
-		uf = pfec & PFERR_USER_MASK;
-		ff = pfec & PFERR_FETCH_MASK;
+		unsigned pfec = byte << 1;
+
 		/*
-		 * PFERR_RSVD_MASK bit is set in PFEC if the access is not
-		 * subject to SMAP restrictions, and cleared otherwise. The
-		 * bit is only meaningful if the SMAP bit is set in CR4.
+		 * Each "*f" variable has a 1 bit for each UWX value
+		 * that causes a fault with the given PFEC.
 		 */
-		smapf = !(pfec & PFERR_RSVD_MASK);
-		for (bit = 0; bit < 8; ++bit) {
-			x = bit & ACC_EXEC_MASK;
-			w = bit & ACC_WRITE_MASK;
-			u = bit & ACC_USER_MASK;
 
-			if (!ept) {
-				/* Not really needed: !nx will cause pte.nx to fault */
-				x |= !mmu->nx;
-				/* Allow supervisor writes if !cr0.wp */
-				w |= !is_write_protection(vcpu) && !uf;
-				/* Disallow supervisor fetches of user code if cr4.smep */
-				x &= !(cr4_smep && u && !uf);
+		/* Faults from writes to non-writable pages */
+		u8 wf = (pfec & PFERR_WRITE_MASK) ? ~w : 0;
+		/* Faults from user mode accesses to supervisor pages */
+		u8 uf = (pfec & PFERR_USER_MASK) ? ~u : 0;
+		/* Faults from fetches of non-executable pages */
+		u8 ff = (pfec & PFERR_FETCH_MASK) ? ~x : 0;
+		/* Faults from kernel mode fetches of user pages */
+		u8 smepf = 0;
+		/* Faults from kernel mode accesses of user pages */
+		u8 smapf = 0;
 
-				/*
-				 * SMAP: kernel-mode data accesses from user-mode
-				 * mappings should fault. A fault is considered
-				 * as a SMAP violation if all of the following
-				 * conditions are true:
-				 * - X86_CR4_SMAP is set in CR4
-				 * - A user page is accessed
-				 * - Page fault in kernel mode
-				 * - if CPL = 3 or X86_EFLAGS_AC is clear
-				 *
-				 * Here, we cover the first three conditions.
-				 * The fourth is computed dynamically in
-				 * permission_fault() and is in smapf.
-				 *
-				 * Also, SMAP does not affect instruction
-				 * fetches, add the !ff check here to make it
-				 * clearer.
-				 */
-				smap = cr4_smap && u && !uf && !ff;
-			}
+		if (!ept) {
+			/* Faults from kernel mode accesses to user pages */
+			u8 kf = (pfec & PFERR_USER_MASK) ? 0 : u;
 
-			fault = (ff && !x) || (uf && !u) || (wf && !w) ||
-				(smapf && smap);
-			map |= fault << bit;
+			/* Not really needed: !nx will cause pte.nx to fault */
+			if (!mmu->nx)
+				ff = 0;
+
+			/* Allow supervisor writes if !cr0.wp */
+			if (!cr0_wp)
+				wf = (pfec & PFERR_USER_MASK) ? wf : 0;
+
+			/* Disallow supervisor fetches of user code if cr4.smep */
+			if (cr4_smep)
+				smepf = (pfec & PFERR_FETCH_MASK) ? kf : 0;
+
+			/*
+			 * SMAP: kernel-mode data accesses from user-mode
+			 * mappings should fault. A fault is considered
+			 * as a SMAP violation if all of the following
+			 * conditions are true:
+			 * - X86_CR4_SMAP is set in CR4
+			 * - A user page is accessed
+			 * - The access is not a fetch
+			 * - Page fault in kernel mode
+			 * - if CPL = 3 or X86_EFLAGS_AC is clear
+			 *
+			 * Here, we cover the first three conditions.
+			 * The fourth is computed dynamically in permission_fault();
+			 * PFERR_RSVD_MASK bit will be set in PFEC if the access is
+			 * *not* subject to SMAP restrictions.
+			 */
+			if (cr4_smap)
+				smapf = (pfec & (PFERR_RSVD_MASK|PFERR_FETCH_MASK)) ? 0 : kf;
 		}
-		mmu->permissions[byte] = map;
+
+		mmu->permissions[byte] = ff | uf | wf | smepf | smapf;
 	}
 }
···
 static void paging64_init_context(struct kvm_vcpu *vcpu,
 				  struct kvm_mmu *context)
 {
-	paging64_init_context_common(vcpu, context, PT64_ROOT_LEVEL);
+	int root_level = is_la57_mode(vcpu) ?
+			 PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
+
+	paging64_init_context_common(vcpu, context, root_level);
 }
 
 static void paging32_init_context(struct kvm_vcpu *vcpu,
···
 	context->sync_page = nonpaging_sync_page;
 	context->invlpg = nonpaging_invlpg;
 	context->update_pte = nonpaging_update_pte;
-	context->shadow_root_level = kvm_x86_ops->get_tdp_level();
+	context->shadow_root_level = kvm_x86_ops->get_tdp_level(vcpu);
 	context->root_hpa = INVALID_PAGE;
 	context->direct_map = true;
 	context->set_cr3 = kvm_x86_ops->set_tdp_cr3;
···
 		context->root_level = 0;
 	} else if (is_long_mode(vcpu)) {
 		context->nx = is_nx(vcpu);
-		context->root_level = PT64_ROOT_LEVEL;
+		context->root_level = is_la57_mode(vcpu) ?
+				PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
 		reset_rsvds_bits_mask(vcpu, context);
 		context->gva_to_gpa = paging64_gva_to_gpa;
 	} else if (is_pae(vcpu)) {
···
 
 	MMU_WARN_ON(VALID_PAGE(context->root_hpa));
 
-	context->shadow_root_level = kvm_x86_ops->get_tdp_level();
+	context->shadow_root_level = PT64_ROOT_4LEVEL;
 
 	context->nx = true;
 	context->ept_ad = accessed_dirty;
···
 	context->sync_page = ept_sync_page;
 	context->invlpg = ept_invlpg;
 	context->update_pte = ept_update_pte;
-	context->root_level = context->shadow_root_level;
+	context->root_level = PT64_ROOT_4LEVEL;
 	context->root_hpa = INVALID_PAGE;
 	context->direct_map = false;
 	context->base_role.ad_disabled = !accessed_dirty;
···
 		g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested;
 	} else if (is_long_mode(vcpu)) {
 		g_context->nx = is_nx(vcpu);
-		g_context->root_level = PT64_ROOT_LEVEL;
+		g_context->root_level = is_la57_mode(vcpu) ?
+				PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
 		reset_rsvds_bits_mask(vcpu, g_context);
 		g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
 	} else if (is_pae(vcpu)) {
···
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
 
-static void make_mmu_pages_available(struct kvm_vcpu *vcpu)
+static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
 {
 	LIST_HEAD(invalid_list);
 
 	if (likely(kvm_mmu_available_pages(vcpu->kvm) >= KVM_MIN_FREE_MMU_PAGES))
-		return;
+		return 0;
 
 	while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES) {
 		if (!prepare_zap_oldest_mmu_page(vcpu->kvm, &invalid_list))
···
 		++vcpu->kvm->stat.mmu_recycled;
 	}
 	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+
+	if (!kvm_mmu_available_pages(vcpu->kvm))
+		return -ENOSPC;
+	return 0;
 }
 
 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
···
 {
 	int r, emulation_type = EMULTYPE_RETRY;
 	enum emulation_result er;
-	bool direct = vcpu->arch.mmu.direct_map || mmu_is_nested(vcpu);
+	bool direct = vcpu->arch.mmu.direct_map;
+
+	/* With shadow page tables, fault_address contains a GVA or nGPA. */
+	if (vcpu->arch.mmu.direct_map) {
+		vcpu->arch.gpa_available = true;
+		vcpu->arch.gpa_val = cr2;
+	}
 
 	if (unlikely(error_code & PFERR_RSVD_MASK)) {
 		r = handle_mmio_page_fault(vcpu, cr2, direct);
···
 			return 1;
 		if (r < 0)
 			return r;
+		/* Must be RET_MMIO_PF_INVALID. */
 	}
 
 	r = vcpu->arch.mmu.page_fault(vcpu, cr2, lower_32_bits(error_code),
···
 	 * This can occur when using nested virtualization with nested
 	 * paging in both guests. If true, we simply unprotect the page
 	 * and resume the guest.
-	 *
-	 * Note: AMD only (since it supports the PFERR_GUEST_PAGE_MASK used
-	 * in PFERR_NEXT_GUEST_PAGE)
 	 */
-	if (error_code == PFERR_NESTED_GUEST_PAGE) {
+	if (vcpu->arch.mmu.direct_map &&
+	    (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
 		kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2));
 		return 1;
 	}
+5 -18
arch/x86/kvm/mmu.h
···
 #define PT32_DIR_PSE36_MASK \
 	(((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)
 
-#define PT64_ROOT_LEVEL 4
+#define PT64_ROOT_5LEVEL 5
+#define PT64_ROOT_4LEVEL 4
 #define PT32_ROOT_LEVEL 2
 #define PT32E_ROOT_LEVEL 3
···
 
 static inline u64 rsvd_bits(int s, int e)
 {
+	if (e < s)
+		return 0;
+
 	return ((1ULL << (e - s + 1)) - 1) << s;
 }
···
 void
 reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
 
-/*
- * Return values of handle_mmio_page_fault:
- * RET_MMIO_PF_EMULATE: it is a real mmio page fault, emulate the instruction
- *			directly.
- * RET_MMIO_PF_INVALID: invalid spte is detected then let the real page
- *			fault path update the mmio spte.
- * RET_MMIO_PF_RETRY: let CPU fault again on the address.
- * RET_MMIO_PF_BUG: a bug was detected (and a WARN was printed).
- */
-enum {
-	RET_MMIO_PF_EMULATE = 1,
-	RET_MMIO_PF_INVALID = 2,
-	RET_MMIO_PF_RETRY = 0,
-	RET_MMIO_PF_BUG = -1
-};
-
-int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct);
 void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu);
 void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
 			     bool accessed_dirty);
+2 -2
arch/x86/kvm/mmu_audit.c
···
 	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
 		return;
 
-	if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
+	if (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) {
 		hpa_t root = vcpu->arch.mmu.root_hpa;
 
 		sp = page_header(root);
-		__mmu_spte_walk(vcpu, sp, fn, PT64_ROOT_LEVEL);
+		__mmu_spte_walk(vcpu, sp, fn, vcpu->arch.mmu.root_level);
 		return;
 	}
+1 -1
arch/x86/kvm/mtrr.c
···
 	 * enable MTRRs and it is obviously undesirable to run the
 	 * guest entirely with UC memory and we use WB.
 	 */
-	if (guest_cpuid_has_mtrr(vcpu))
+	if (guest_cpuid_has(vcpu, X86_FEATURE_MTRR))
 		return MTRR_TYPE_UNCACHABLE;
 	else
 		return MTRR_TYPE_WRBACK;
+3 -3
arch/x86/kvm/paging_tmpl.h
···
 			 &map_writable))
 		return 0;
 
-	if (handle_abnormal_pfn(vcpu, mmu_is_nested(vcpu) ? 0 : addr,
-				walker.gfn, pfn, walker.pte_access, &r))
+	if (handle_abnormal_pfn(vcpu, addr, walker.gfn, pfn, walker.pte_access, &r))
 		return r;
 
 	/*
···
 		goto out_unlock;
 
 	kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
-	make_mmu_pages_available(vcpu);
+	if (make_mmu_pages_available(vcpu) < 0)
+		goto out_unlock;
 	if (!force_pt_level)
 		transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
 	r = FNAME(fetch)(vcpu, addr, &walker, write_fault,
+81 -58
arch/x86/kvm/svm.c
··· 280 280 static int vls = true; 281 281 module_param(vls, int, 0444); 282 282 283 - /* AVIC VM ID bit masks and lock */ 284 - static DECLARE_BITMAP(avic_vm_id_bitmap, AVIC_VM_ID_NR); 285 - static DEFINE_SPINLOCK(avic_vm_id_lock); 283 + /* enable/disable Virtual GIF */ 284 + static int vgif = true; 285 + module_param(vgif, int, 0444); 286 286 287 287 static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); 288 288 static void svm_flush_tlb(struct kvm_vcpu *vcpu); ··· 479 479 recalc_intercepts(svm); 480 480 } 481 481 482 + static inline bool vgif_enabled(struct vcpu_svm *svm) 483 + { 484 + return !!(svm->vmcb->control.int_ctl & V_GIF_ENABLE_MASK); 485 + } 486 + 482 487 static inline void enable_gif(struct vcpu_svm *svm) 483 488 { 484 - svm->vcpu.arch.hflags |= HF_GIF_MASK; 489 + if (vgif_enabled(svm)) 490 + svm->vmcb->control.int_ctl |= V_GIF_MASK; 491 + else 492 + svm->vcpu.arch.hflags |= HF_GIF_MASK; 485 493 } 486 494 487 495 static inline void disable_gif(struct vcpu_svm *svm) 488 496 { 489 - svm->vcpu.arch.hflags &= ~HF_GIF_MASK; 497 + if (vgif_enabled(svm)) 498 + svm->vmcb->control.int_ctl &= ~V_GIF_MASK; 499 + else 500 + svm->vcpu.arch.hflags &= ~HF_GIF_MASK; 490 501 } 491 502 492 503 static inline bool gif_set(struct vcpu_svm *svm) 493 504 { 494 - return !!(svm->vcpu.arch.hflags & HF_GIF_MASK); 505 + if (vgif_enabled(svm)) 506 + return !!(svm->vmcb->control.int_ctl & V_GIF_MASK); 507 + else 508 + return !!(svm->vcpu.arch.hflags & HF_GIF_MASK); 495 509 } 496 510 497 511 static unsigned long iopm_base; ··· 581 567 asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid)); 582 568 } 583 569 584 - static int get_npt_level(void) 570 + static int get_npt_level(struct kvm_vcpu *vcpu) 585 571 { 586 572 #ifdef CONFIG_X86_64 587 - return PT64_ROOT_LEVEL; 573 + return PT64_ROOT_4LEVEL; 588 574 #else 589 575 return PT32E_ROOT_LEVEL; 590 576 #endif ··· 655 641 struct vcpu_svm *svm = to_svm(vcpu); 656 642 unsigned nr = vcpu->arch.exception.nr; 657 643 bool 
has_error_code = vcpu->arch.exception.has_error_code; 658 - bool reinject = vcpu->arch.exception.reinject; 644 + bool reinject = vcpu->arch.exception.injected; 659 645 u32 error_code = vcpu->arch.exception.error_code; 660 646 661 647 /* ··· 987 973 static void disable_nmi_singlestep(struct vcpu_svm *svm) 988 974 { 989 975 svm->nmi_singlestep = false; 976 + 990 977 if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) { 991 978 /* Clear our flags if they were not set by the guest */ 992 979 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF)) ··· 1004 989 */ 1005 990 #define SVM_VM_DATA_HASH_BITS 8 1006 991 static DEFINE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS); 992 + static u32 next_vm_id = 0; 993 + static bool next_vm_id_wrapped = 0; 1007 994 static DEFINE_SPINLOCK(svm_vm_data_hash_lock); 1008 995 1009 996 /* Note: ··· 1123 1106 } else { 1124 1107 pr_info("Virtual VMLOAD VMSAVE supported\n"); 1125 1108 } 1109 + } 1110 + 1111 + if (vgif) { 1112 + if (!boot_cpu_has(X86_FEATURE_VGIF)) 1113 + vgif = false; 1114 + else 1115 + pr_info("Virtual GIF supported\n"); 1126 1116 } 1127 1117 1128 1118 return 0; ··· 1329 1305 svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; 1330 1306 } 1331 1307 1308 + if (vgif) { 1309 + clr_intercept(svm, INTERCEPT_STGI); 1310 + clr_intercept(svm, INTERCEPT_CLGI); 1311 + svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK; 1312 + } 1313 + 1332 1314 mark_all_dirty(svm->vmcb); 1333 1315 1334 1316 enable_gif(svm); ··· 1417 1387 return 0; 1418 1388 } 1419 1389 1420 - static inline int avic_get_next_vm_id(void) 1421 - { 1422 - int id; 1423 - 1424 - spin_lock(&avic_vm_id_lock); 1425 - 1426 - /* AVIC VM ID is one-based. 
*/ 1427 - id = find_next_zero_bit(avic_vm_id_bitmap, AVIC_VM_ID_NR, 1); 1428 - if (id <= AVIC_VM_ID_MASK) 1429 - __set_bit(id, avic_vm_id_bitmap); 1430 - else 1431 - id = -EAGAIN; 1432 - 1433 - spin_unlock(&avic_vm_id_lock); 1434 - return id; 1435 - } 1436 - 1437 - static inline int avic_free_vm_id(int id) 1438 - { 1439 - if (id <= 0 || id > AVIC_VM_ID_MASK) 1440 - return -EINVAL; 1441 - 1442 - spin_lock(&avic_vm_id_lock); 1443 - __clear_bit(id, avic_vm_id_bitmap); 1444 - spin_unlock(&avic_vm_id_lock); 1445 - return 0; 1446 - } 1447 - 1448 1390 static void avic_vm_destroy(struct kvm *kvm) 1449 1391 { 1450 1392 unsigned long flags; ··· 1424 1422 1425 1423 if (!avic) 1426 1424 return; 1427 - 1428 - avic_free_vm_id(vm_data->avic_vm_id); 1429 1425 1430 1426 if (vm_data->avic_logical_id_table_page) 1431 1427 __free_page(vm_data->avic_logical_id_table_page); ··· 1438 1438 static int avic_vm_init(struct kvm *kvm) 1439 1439 { 1440 1440 unsigned long flags; 1441 - int vm_id, err = -ENOMEM; 1441 + int err = -ENOMEM; 1442 1442 struct kvm_arch *vm_data = &kvm->arch; 1443 1443 struct page *p_page; 1444 1444 struct page *l_page; 1445 + struct kvm_arch *ka; 1446 + u32 vm_id; 1445 1447 1446 1448 if (!avic) 1447 1449 return 0; 1448 - 1449 - vm_id = avic_get_next_vm_id(); 1450 - if (vm_id < 0) 1451 - return vm_id; 1452 - vm_data->avic_vm_id = (u32)vm_id; 1453 1450 1454 1451 /* Allocating physical APIC ID table (4KB) */ 1455 1452 p_page = alloc_page(GFP_KERNEL); ··· 1465 1468 clear_page(page_address(l_page)); 1466 1469 1467 1470 spin_lock_irqsave(&svm_vm_data_hash_lock, flags); 1471 + again: 1472 + vm_id = next_vm_id = (next_vm_id + 1) & AVIC_VM_ID_MASK; 1473 + if (vm_id == 0) { /* id is 1-based, zero is not okay */ 1474 + next_vm_id_wrapped = 1; 1475 + goto again; 1476 + } 1477 + /* Is it still in use? 
Only possible if wrapped at least once */ 1478 + if (next_vm_id_wrapped) { 1479 + hash_for_each_possible(svm_vm_data_hash, ka, hnode, vm_id) { 1480 + struct kvm *k2 = container_of(ka, struct kvm, arch); 1481 + struct kvm_arch *vd2 = &k2->arch; 1482 + if (vd2->avic_vm_id == vm_id) 1483 + goto again; 1484 + } 1485 + } 1486 + vm_data->avic_vm_id = vm_id; 1468 1487 hash_add(svm_vm_data_hash, &vm_data->hnode, vm_data->avic_vm_id); 1469 1488 spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags); 1470 1489 ··· 1593 1580 } 1594 1581 init_vmcb(svm); 1595 1582 1596 - kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy); 1583 + kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, true); 1597 1584 kvm_register_write(vcpu, VCPU_REGS_RDX, eax); 1598 1585 1599 1586 if (kvm_vcpu_apicv_active(vcpu) && !init_event) ··· 2397 2384 vcpu->arch.mmu.get_cr3 = nested_svm_get_tdp_cr3; 2398 2385 vcpu->arch.mmu.get_pdptr = nested_svm_get_tdp_pdptr; 2399 2386 vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit; 2400 - vcpu->arch.mmu.shadow_root_level = get_npt_level(); 2387 + vcpu->arch.mmu.shadow_root_level = get_npt_level(vcpu); 2401 2388 reset_shadow_zero_bits_mask(vcpu, &vcpu->arch.mmu); 2402 2389 vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; 2403 2390 } ··· 3160 3147 if (nested_svm_check_permissions(svm)) 3161 3148 return 1; 3162 3149 3150 + /* 3151 + * If VGIF is enabled, the STGI intercept is only added to 3152 + * detect the opening of the NMI window; remove it now. 
3153 + */ 3154 + if (vgif_enabled(svm)) 3155 + clr_intercept(svm, INTERCEPT_STGI); 3156 + 3163 3157 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 3164 3158 ret = kvm_skip_emulated_instruction(&svm->vcpu); 3165 3159 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); ··· 3764 3744 3765 3745 static int pause_interception(struct vcpu_svm *svm) 3766 3746 { 3767 - kvm_vcpu_on_spin(&(svm->vcpu)); 3747 + struct kvm_vcpu *vcpu = &svm->vcpu; 3748 + bool in_kernel = (svm_get_cpl(vcpu) == 0); 3749 + 3750 + kvm_vcpu_on_spin(vcpu, in_kernel); 3768 3751 return 1; 3769 3752 } 3770 3753 ··· 4251 4228 4252 4229 trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM); 4253 4230 4254 - vcpu->arch.gpa_available = (exit_code == SVM_EXIT_NPF); 4255 - 4256 4231 if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE)) 4257 4232 vcpu->arch.cr0 = svm->vmcb->save.cr0; 4258 4233 if (npt_enabled) ··· 4703 4682 * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes 4704 4683 * 1, because that's a separate STGI/VMRUN intercept. The next time we 4705 4684 * get that intercept, this function will be called again though and 4706 - * we'll get the vintr intercept. 4685 + * we'll get the vintr intercept. However, if the vGIF feature is 4686 + * enabled, the STGI interception will not occur. Enable the irq 4687 + * window under the assumption that the hardware will set the GIF. 
4707 4688 */ 4708 - if (gif_set(svm) && nested_svm_intr(svm)) { 4689 + if ((vgif_enabled(svm) || gif_set(svm)) && nested_svm_intr(svm)) { 4709 4690 svm_set_vintr(svm); 4710 4691 svm_inject_irq(svm, 0x0); 4711 4692 } ··· 4721 4698 == HF_NMI_MASK) 4722 4699 return; /* IRET will cause a vm exit */ 4723 4700 4724 - if ((svm->vcpu.arch.hflags & HF_GIF_MASK) == 0) 4701 + if (!gif_set(svm)) { 4702 + if (vgif_enabled(svm)) 4703 + set_intercept(svm, INTERCEPT_STGI); 4725 4704 return; /* STGI will cause a vm exit */ 4705 + } 4726 4706 4727 4707 if (svm->nested.exit_required) 4728 4708 return; /* we're not going to run the guest yet */ ··· 5097 5071 static void svm_cpuid_update(struct kvm_vcpu *vcpu) 5098 5072 { 5099 5073 struct vcpu_svm *svm = to_svm(vcpu); 5100 - struct kvm_cpuid_entry2 *entry; 5101 5074 5102 5075 /* Update nrips enabled cache */ 5103 - svm->nrips_enabled = !!guest_cpuid_has_nrips(&svm->vcpu); 5076 + svm->nrips_enabled = !!guest_cpuid_has(&svm->vcpu, X86_FEATURE_NRIPS); 5104 5077 5105 5078 if (!kvm_vcpu_apicv_active(vcpu)) 5106 5079 return; 5107 5080 5108 - entry = kvm_find_cpuid_entry(vcpu, 1, 0); 5109 - if (entry) 5110 - entry->ecx &= ~bit(X86_FEATURE_X2APIC); 5081 + guest_cpuid_clear(vcpu, X86_FEATURE_X2APIC); 5111 5082 } 5112 5083 5113 5084 static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
+7 -4
arch/x86/kvm/trace.h
···
  */
 TRACE_EVENT(kvm_cpuid,
 	TP_PROTO(unsigned int function, unsigned long rax, unsigned long rbx,
-		 unsigned long rcx, unsigned long rdx),
-	TP_ARGS(function, rax, rbx, rcx, rdx),
+		 unsigned long rcx, unsigned long rdx, bool found),
+	TP_ARGS(function, rax, rbx, rcx, rdx, found),
 
 	TP_STRUCT__entry(
 		__field( unsigned int, function )
···
 		__field( unsigned long, rbx )
 		__field( unsigned long, rcx )
 		__field( unsigned long, rdx )
+		__field( bool, found )
 	),
 
 	TP_fast_assign(
···
 		__entry->rbx = rbx;
 		__entry->rcx = rcx;
 		__entry->rdx = rdx;
+		__entry->found = found;
 	),
 
-	TP_printk("func %x rax %lx rbx %lx rcx %lx rdx %lx",
+	TP_printk("func %x rax %lx rbx %lx rcx %lx rdx %lx, cpuid entry %s",
 		  __entry->function, __entry->rax,
-		  __entry->rbx, __entry->rcx, __entry->rdx)
+		  __entry->rbx, __entry->rcx, __entry->rdx,
+		  __entry->found ? "found" : "not found")
 );
 
 #define AREG(x) { APIC_##x, "APIC_" #x }
+451 -169
arch/x86/kvm/vmx.c
··· 122 122 (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) 123 123 #define KVM_CR4_GUEST_OWNED_BITS \ 124 124 (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ 125 - | X86_CR4_OSXMMEXCPT | X86_CR4_TSD) 125 + | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_TSD) 126 126 127 127 #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) 128 128 #define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE) ··· 243 243 u64 virtual_apic_page_addr; 244 244 u64 apic_access_addr; 245 245 u64 posted_intr_desc_addr; 246 + u64 vm_function_control; 246 247 u64 ept_pointer; 247 248 u64 eoi_exit_bitmap0; 248 249 u64 eoi_exit_bitmap1; 249 250 u64 eoi_exit_bitmap2; 250 251 u64 eoi_exit_bitmap3; 252 + u64 eptp_list_address; 251 253 u64 xss_exit_bitmap; 252 254 u64 guest_physical_address; 253 255 u64 vmcs_link_pointer; ··· 483 481 u64 nested_vmx_cr4_fixed0; 484 482 u64 nested_vmx_cr4_fixed1; 485 483 u64 nested_vmx_vmcs_enum; 484 + u64 nested_vmx_vmfunc_controls; 486 485 }; 487 486 488 487 #define POSTED_INTR_ON 0 ··· 576 573 #endif 577 574 u32 vm_entry_controls_shadow; 578 575 u32 vm_exit_controls_shadow; 576 + u32 secondary_exec_control; 577 + 579 578 /* 580 579 * loaded_vmcs points to the VMCS currently used in this vcpu. For a 581 580 * non-nested (L1) guest, it always points to vmcs01. 
For a nested ··· 766 761 FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr), 767 762 FIELD64(APIC_ACCESS_ADDR, apic_access_addr), 768 763 FIELD64(POSTED_INTR_DESC_ADDR, posted_intr_desc_addr), 764 + FIELD64(VM_FUNCTION_CONTROL, vm_function_control), 769 765 FIELD64(EPT_POINTER, ept_pointer), 770 766 FIELD64(EOI_EXIT_BITMAP0, eoi_exit_bitmap0), 771 767 FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1), 772 768 FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2), 773 769 FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3), 770 + FIELD64(EPTP_LIST_ADDRESS, eptp_list_address), 774 771 FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap), 775 772 FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address), 776 773 FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer), ··· 894 887 static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu) 895 888 { 896 889 return to_vmx(vcpu)->nested.cached_vmcs12; 897 - } 898 - 899 - static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr) 900 - { 901 - struct page *page = kvm_vcpu_gfn_to_page(vcpu, addr >> PAGE_SHIFT); 902 - if (is_error_page(page)) 903 - return NULL; 904 - 905 - return page; 906 - } 907 - 908 - static void nested_release_page(struct page *page) 909 - { 910 - kvm_release_page_dirty(page); 911 - } 912 - 913 - static void nested_release_page_clean(struct page *page) 914 - { 915 - kvm_release_page_clean(page); 916 890 } 917 891 918 892 static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu); ··· 1200 1212 return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT; 1201 1213 } 1202 1214 1215 + static inline bool cpu_has_vmx_ept_mt_wb(void) 1216 + { 1217 + return vmx_capability.ept & VMX_EPTP_WB_BIT; 1218 + } 1219 + 1220 + static inline bool cpu_has_vmx_ept_5levels(void) 1221 + { 1222 + return vmx_capability.ept & VMX_EPT_PAGE_WALK_5_BIT; 1223 + } 1224 + 1203 1225 static inline bool cpu_has_vmx_ept_ad_bits(void) 1204 1226 { 1205 1227 return vmx_capability.ept & VMX_EPT_AD_BIT; ··· 1315 1317 SECONDARY_EXEC_TSC_SCALING; 1316 1318 } 1317 1319 1320 + 
static inline bool cpu_has_vmx_vmfunc(void) 1321 + { 1322 + return vmcs_config.cpu_based_2nd_exec_ctrl & 1323 + SECONDARY_EXEC_ENABLE_VMFUNC; 1324 + } 1325 + 1318 1326 static inline bool report_flexpriority(void) 1319 1327 { 1320 1328 return flexpriority_enabled; ··· 1361 1357 1362 1358 static inline bool nested_cpu_has_xsaves(struct vmcs12 *vmcs12) 1363 1359 { 1364 - return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES) && 1365 - vmx_xsaves_supported(); 1360 + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); 1366 1361 } 1367 1362 1368 1363 static inline bool nested_cpu_has_pml(struct vmcs12 *vmcs12) ··· 1392 1389 static inline bool nested_cpu_has_posted_intr(struct vmcs12 *vmcs12) 1393 1390 { 1394 1391 return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR; 1392 + } 1393 + 1394 + static inline bool nested_cpu_has_vmfunc(struct vmcs12 *vmcs12) 1395 + { 1396 + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VMFUNC); 1397 + } 1398 + 1399 + static inline bool nested_cpu_has_eptp_switching(struct vmcs12 *vmcs12) 1400 + { 1401 + return nested_cpu_has_vmfunc(vmcs12) && 1402 + (vmcs12->vm_function_control & 1403 + VMX_VMFUNC_EPTP_SWITCHING); 1395 1404 } 1396 1405 1397 1406 static inline bool is_nmi(u32 intr_info) ··· 2465 2450 * KVM wants to inject page-faults which it got to the guest. This function 2466 2451 * checks whether in a nested guest, we need to inject them to L1 or L2. 
2467 2452 */ 2468 - static int nested_vmx_check_exception(struct kvm_vcpu *vcpu) 2453 + static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit_qual) 2469 2454 { 2470 2455 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 2471 2456 unsigned int nr = vcpu->arch.exception.nr; 2472 2457 2473 2458 if (nr == PF_VECTOR) { 2474 2459 if (vcpu->arch.exception.nested_apf) { 2475 - nested_vmx_inject_exception_vmexit(vcpu, 2476 - vcpu->arch.apf.nested_apf_token); 2460 + *exit_qual = vcpu->arch.apf.nested_apf_token; 2477 2461 return 1; 2478 2462 } 2479 2463 /* ··· 2486 2472 */ 2487 2473 if (nested_vmx_is_page_fault_vmexit(vmcs12, 2488 2474 vcpu->arch.exception.error_code)) { 2489 - nested_vmx_inject_exception_vmexit(vcpu, vcpu->arch.cr2); 2475 + *exit_qual = vcpu->arch.cr2; 2490 2476 return 1; 2491 2477 } 2492 2478 } else { 2493 - unsigned long exit_qual = 0; 2494 - if (nr == DB_VECTOR) 2495 - exit_qual = vcpu->arch.dr6; 2496 - 2497 2479 if (vmcs12->exception_bitmap & (1u << nr)) { 2498 - nested_vmx_inject_exception_vmexit(vcpu, exit_qual); 2480 + if (nr == DB_VECTOR) 2481 + *exit_qual = vcpu->arch.dr6; 2482 + else 2483 + *exit_qual = 0; 2499 2484 return 1; 2500 2485 } 2501 2486 } ··· 2507 2494 struct vcpu_vmx *vmx = to_vmx(vcpu); 2508 2495 unsigned nr = vcpu->arch.exception.nr; 2509 2496 bool has_error_code = vcpu->arch.exception.has_error_code; 2510 - bool reinject = vcpu->arch.exception.reinject; 2511 2497 u32 error_code = vcpu->arch.exception.error_code; 2512 2498 u32 intr_info = nr | INTR_INFO_VALID_MASK; 2513 - 2514 - if (!reinject && is_guest_mode(vcpu) && 2515 - nested_vmx_check_exception(vcpu)) 2516 - return; 2517 2499 2518 2500 if (has_error_code) { 2519 2501 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); ··· 2608 2600 if (index >= 0) 2609 2601 move_msr_up(vmx, index, save_nmsrs++); 2610 2602 index = __find_msr_index(vmx, MSR_TSC_AUX); 2611 - if (index >= 0 && guest_cpuid_has_rdtscp(&vmx->vcpu)) 2603 + if (index >= 0 && 
guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP)) 2612 2604 move_msr_up(vmx, index, save_nmsrs++); 2613 2605 /* 2614 2606 * MSR_STAR is only needed on long mode guests, and only ··· 2668 2660 } 2669 2661 } 2670 2662 2671 - static bool guest_cpuid_has_vmx(struct kvm_vcpu *vcpu) 2672 - { 2673 - struct kvm_cpuid_entry2 *best = kvm_find_cpuid_entry(vcpu, 1, 0); 2674 - return best && (best->ecx & (1 << (X86_FEATURE_VMX & 31))); 2675 - } 2676 - 2677 2663 /* 2678 2664 * nested_vmx_allowed() checks whether a guest should be allowed to use VMX 2679 2665 * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for ··· 2676 2674 */ 2677 2675 static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu) 2678 2676 { 2679 - return nested && guest_cpuid_has_vmx(vcpu); 2677 + return nested && guest_cpuid_has(vcpu, X86_FEATURE_VMX); 2680 2678 } 2681 2679 2682 2680 /* ··· 2799 2797 vmx->nested.nested_vmx_procbased_ctls_low &= 2800 2798 ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING); 2801 2799 2802 - /* secondary cpu-based controls */ 2800 + /* 2801 + * secondary cpu-based controls. Do not include those that 2802 + * depend on CPUID bits, they are added later by vmx_cpuid_update. 
+	 */
 	rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
 		vmx->nested.nested_vmx_secondary_ctls_low,
 		vmx->nested.nested_vmx_secondary_ctls_high);
 	vmx->nested.nested_vmx_secondary_ctls_low = 0;
 	vmx->nested.nested_vmx_secondary_ctls_high &=
-		SECONDARY_EXEC_RDRAND | SECONDARY_EXEC_RDSEED |
 		SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
-		SECONDARY_EXEC_RDTSCP |
 		SECONDARY_EXEC_DESC |
 		SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
 		SECONDARY_EXEC_APIC_REGISTER_VIRT |
 		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
-		SECONDARY_EXEC_WBINVD_EXITING |
-		SECONDARY_EXEC_XSAVES;
+		SECONDARY_EXEC_WBINVD_EXITING;

 	if (enable_ept) {
 		/* nested EPT: emulate EPT also to L1 */
···
 		}
 	} else
 		vmx->nested.nested_vmx_ept_caps = 0;
+
+	if (cpu_has_vmx_vmfunc()) {
+		vmx->nested.nested_vmx_secondary_ctls_high |=
+			SECONDARY_EXEC_ENABLE_VMFUNC;
+		/*
+		 * Advertise EPTP switching unconditionally
+		 * since we emulate it
+		 */
+		vmx->nested.nested_vmx_vmfunc_controls =
+			VMX_VMFUNC_EPTP_SWITCHING;
+	}

 	/*
 	 * Old versions of KVM use the single-context version without
···
 		*pdata = vmx->nested.nested_vmx_ept_caps |
 			((u64)vmx->nested.nested_vmx_vpid_caps << 32);
 		break;
+	case MSR_IA32_VMX_VMFUNC:
+		*pdata = vmx->nested.nested_vmx_vmfunc_controls;
+		break;
 	default:
 		return 1;
 	}
···
 		break;
 	case MSR_IA32_BNDCFGS:
 		if (!kvm_mpx_supported() ||
-		    (!msr_info->host_initiated && !guest_cpuid_has_mpx(vcpu)))
+		    (!msr_info->host_initiated &&
+		     !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
 			return 1;
 		msr_info->data = vmcs_read64(GUEST_BNDCFGS);
 		break;
···
 		msr_info->data = vcpu->arch.ia32_xss;
 		break;
 	case MSR_TSC_AUX:
-		if (!guest_cpuid_has_rdtscp(vcpu) && !msr_info->host_initiated)
+		if (!msr_info->host_initiated &&
+		    !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
 			return 1;
 		/* Otherwise falls through */
 	default:
···
 		break;
 	case MSR_IA32_BNDCFGS:
 		if (!kvm_mpx_supported() ||
-		    (!msr_info->host_initiated && !guest_cpuid_has_mpx(vcpu)))
+		    (!msr_info->host_initiated &&
+		     !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
 			return 1;
-		if (is_noncanonical_address(data & PAGE_MASK) ||
+		if (is_noncanonical_address(data & PAGE_MASK, vcpu) ||
 		    (data & MSR_IA32_BNDCFGS_RSVD))
 			return 1;
 		vmcs_write64(GUEST_BNDCFGS, data);
···
 			clear_atomic_switch_msr(vmx, MSR_IA32_XSS);
 		break;
 	case MSR_TSC_AUX:
-		if (!guest_cpuid_has_rdtscp(vcpu) && !msr_info->host_initiated)
+		if (!msr_info->host_initiated &&
+		    !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
 			return 1;
 		/* Check reserved bit, higher 32 bits should be zero */
 		if ((data >> 32) != 0)
···
 			SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
 			SECONDARY_EXEC_SHADOW_VMCS |
 			SECONDARY_EXEC_XSAVES |
+			SECONDARY_EXEC_RDSEED |
+			SECONDARY_EXEC_RDRAND |
 			SECONDARY_EXEC_ENABLE_PML |
-			SECONDARY_EXEC_TSC_SCALING;
+			SECONDARY_EXEC_TSC_SCALING |
+			SECONDARY_EXEC_ENABLE_VMFUNC;
 		if (adjust_vmx_controls(min2, opt2,
 					MSR_IA32_VMX_PROCBASED_CTLS2,
 					&_cpu_based_2nd_exec_control) < 0)
···
 	vmx->emulation_required = emulation_required(vcpu);
 }

+static int get_ept_level(struct kvm_vcpu *vcpu)
+{
+	if (cpu_has_vmx_ept_5levels() && (cpuid_maxphyaddr(vcpu) > 48))
+		return 5;
+	return 4;
+}
+
 static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa)
 {
-	u64 eptp;
+	u64 eptp = VMX_EPTP_MT_WB;

-	/* TODO write the value reading from MSR */
-	eptp = VMX_EPT_DEFAULT_MT |
-		VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT;
+	eptp |= (get_ept_level(vcpu) == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;
+
 	if (enable_ept_ad_bits &&
 	    (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu)))
-		eptp |= VMX_EPT_AD_ENABLE_BIT;
+		eptp |= VMX_EPTP_AD_ENABLE_BIT;
 	eptp |= (root_hpa & PAGE_MASK);

 	return eptp;
···
 	return exec_control;
 }

-static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
+static bool vmx_rdrand_supported(void)
 {
+	return vmcs_config.cpu_based_2nd_exec_ctrl &
+		SECONDARY_EXEC_RDRAND;
+}
+
+static bool vmx_rdseed_supported(void)
+{
+	return vmcs_config.cpu_based_2nd_exec_ctrl &
+		SECONDARY_EXEC_RDSEED;
+}
+
+static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
+{
+	struct kvm_vcpu *vcpu = &vmx->vcpu;
+
 	u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
-	if (!cpu_need_virtualize_apic_accesses(&vmx->vcpu))
+	if (!cpu_need_virtualize_apic_accesses(vcpu))
 		exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
 	if (vmx->vpid == 0)
 		exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
···
 		exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
 	if (!ple_gap)
 		exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
-	if (!kvm_vcpu_apicv_active(&vmx->vcpu))
+	if (!kvm_vcpu_apicv_active(vcpu))
 		exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
 				  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
 	exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
···
 	if (!enable_pml)
 		exec_control &= ~SECONDARY_EXEC_ENABLE_PML;

-	return exec_control;
+	if (vmx_xsaves_supported()) {
+		/* Exposing XSAVES only when XSAVE is exposed */
+		bool xsaves_enabled =
+			guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
+			guest_cpuid_has(vcpu, X86_FEATURE_XSAVES);
+
+		if (!xsaves_enabled)
+			exec_control &= ~SECONDARY_EXEC_XSAVES;
+
+		if (nested) {
+			if (xsaves_enabled)
+				vmx->nested.nested_vmx_secondary_ctls_high |=
+					SECONDARY_EXEC_XSAVES;
+			else
+				vmx->nested.nested_vmx_secondary_ctls_high &=
+					~SECONDARY_EXEC_XSAVES;
+		}
+	}
+
+	if (vmx_rdtscp_supported()) {
+		bool rdtscp_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP);
+		if (!rdtscp_enabled)
+			exec_control &= ~SECONDARY_EXEC_RDTSCP;
+
+		if (nested) {
+			if (rdtscp_enabled)
+				vmx->nested.nested_vmx_secondary_ctls_high |=
+					SECONDARY_EXEC_RDTSCP;
+			else
+				vmx->nested.nested_vmx_secondary_ctls_high &=
+					~SECONDARY_EXEC_RDTSCP;
+		}
+	}
+
+	if (vmx_invpcid_supported()) {
+		/* Exposing INVPCID only when PCID is exposed */
+		bool invpcid_enabled =
+			guest_cpuid_has(vcpu, X86_FEATURE_INVPCID) &&
+			guest_cpuid_has(vcpu, X86_FEATURE_PCID);
+
+		if (!invpcid_enabled) {
+			exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID;
+			guest_cpuid_clear(vcpu, X86_FEATURE_INVPCID);
+		}
+
+		if (nested) {
+			if (invpcid_enabled)
+				vmx->nested.nested_vmx_secondary_ctls_high |=
+					SECONDARY_EXEC_ENABLE_INVPCID;
+			else
+				vmx->nested.nested_vmx_secondary_ctls_high &=
+					~SECONDARY_EXEC_ENABLE_INVPCID;
+		}
+	}
+
+	if (vmx_rdrand_supported()) {
+		bool rdrand_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDRAND);
+		if (rdrand_enabled)
+			exec_control &= ~SECONDARY_EXEC_RDRAND;
+
+		if (nested) {
+			if (rdrand_enabled)
+				vmx->nested.nested_vmx_secondary_ctls_high |=
+					SECONDARY_EXEC_RDRAND;
+			else
+				vmx->nested.nested_vmx_secondary_ctls_high &=
+					~SECONDARY_EXEC_RDRAND;
+		}
+	}
+
+	if (vmx_rdseed_supported()) {
+		bool rdseed_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDSEED);
+		if (rdseed_enabled)
+			exec_control &= ~SECONDARY_EXEC_RDSEED;
+
+		if (nested) {
+			if (rdseed_enabled)
+				vmx->nested.nested_vmx_secondary_ctls_high |=
+					SECONDARY_EXEC_RDSEED;
+			else
+				vmx->nested.nested_vmx_secondary_ctls_high &=
+					~SECONDARY_EXEC_RDSEED;
+		}
+	}
+
+	vmx->secondary_exec_control = exec_control;
 }

 static void ept_set_mmio_spte_mask(void)
···
 	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx));

 	if (cpu_has_secondary_exec_ctrls()) {
+		vmx_compute_secondary_exec_control(vmx);
 		vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
-			     vmx_secondary_exec_control(vmx));
+			     vmx->secondary_exec_control);
 	}

 	if (kvm_vcpu_apicv_active(&vmx->vcpu)) {
···
 	vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
 	vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
 #endif

+	if (cpu_has_vmx_vmfunc())
+		vmcs_write64(VM_FUNCTION_CONTROL, 0);

 	vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
 	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
···
 static int handle_triple_fault(struct kvm_vcpu *vcpu)
 {
 	vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
+	vcpu->mmio_needed = 0;
 	return 0;
 }
···
 {
 	unsigned long exit_qualification;
 	gpa_t gpa;
-	u32 error_code;
+	u64 error_code;

 	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
···
 			EPT_VIOLATION_EXECUTABLE))
 		      ? PFERR_PRESENT_MASK : 0;

-	vcpu->arch.gpa_available = true;
-	vcpu->arch.exit_qualification = exit_qualification;
+	error_code |= (exit_qualification & 0x100) != 0 ?
+	       PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;

+	vcpu->arch.exit_qualification = exit_qualification;
 	return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
 }
···
 	int ret;
 	gpa_t gpa;

+	/*
+	 * A nested guest cannot optimize MMIO vmexits, because we have an
+	 * nGPA here instead of the required GPA.
+	 */
 	gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
-	if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
+	if (!is_guest_mode(vcpu) &&
+	    !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
 		trace_kvm_fast_mmio(gpa);
 		return kvm_skip_emulated_instruction(vcpu);
 	}

-	ret = handle_mmio_page_fault(vcpu, gpa, true);
-	vcpu->arch.gpa_available = true;
-	if (likely(ret == RET_MMIO_PF_EMULATE))
-		return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) ==
-					      EMULATE_DONE;
-
-	if (unlikely(ret == RET_MMIO_PF_INVALID))
-		return kvm_mmu_page_fault(vcpu, gpa, 0, NULL, 0);
-
-	if (unlikely(ret == RET_MMIO_PF_RETRY))
-		return 1;
+	ret = kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0);
+	if (ret >= 0)
+		return ret;

 	/* It is the real ept misconfig */
 	WARN_ON(1);
···
 	init_vmcs_shadow_fields();

 	if (!cpu_has_vmx_ept() ||
-	    !cpu_has_vmx_ept_4levels()) {
+	    !cpu_has_vmx_ept_4levels() ||
+	    !cpu_has_vmx_ept_mt_wb()) {
 		enable_ept = 0;
 		enable_unrestricted_guest = 0;
 		enable_ept_ad_bits = 0;
···
 	if (ple_gap)
 		grow_ple_window(vcpu);

-	kvm_vcpu_on_spin(vcpu);
+	/*
+	 * Intel sdm vol3 ch-25.1.3 says: The "PAUSE-loop exiting"
+	 * VM-execution control is ignored if CPL > 0. OTOH, KVM
+	 * never set PAUSE_EXITING and just set PLE if supported,
+	 * so the vcpu must be CPL=0 if it gets a PAUSE exit.
+	 */
+	kvm_vcpu_on_spin(vcpu, true);
 	return kvm_skip_emulated_instruction(vcpu);
 }
···
 {
 	printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n");
 	return handle_nop(vcpu);
+}
+
+static int handle_invalid_op(struct kvm_vcpu *vcpu)
+{
+	kvm_queue_exception(vcpu, UD_VECTOR);
+	return 1;
 }

 static int handle_monitor_trap(struct kvm_vcpu *vcpu)
···
 		 * non-canonical form. This is the only check on the memory
 		 * destination for long mode!
 		 */
-		exn = is_noncanonical_address(*ret);
+		exn = is_noncanonical_address(*ret, vcpu);
 	} else if (is_protmode(vcpu)) {
 		/* Protected mode: apply checks for segment validity in the
 		 * following order:
···
 		return kvm_skip_emulated_instruction(vcpu);
 	}

-	page = nested_get_page(vcpu, vmptr);
-	if (page == NULL) {
+	page = kvm_vcpu_gpa_to_page(vcpu, vmptr);
+	if (is_error_page(page)) {
 		nested_vmx_failInvalid(vcpu);
 		return kvm_skip_emulated_instruction(vcpu);
 	}
 	if (*(u32 *)kmap(page) != VMCS12_REVISION) {
 		kunmap(page);
-		nested_release_page_clean(page);
+		kvm_release_page_clean(page);
 		nested_vmx_failInvalid(vcpu);
 		return kvm_skip_emulated_instruction(vcpu);
 	}
 	kunmap(page);
-	nested_release_page_clean(page);
+	kvm_release_page_clean(page);

 	vmx->nested.vmxon_ptr = vmptr;
 	ret = enter_vmx_operation(vcpu);
···
 	kfree(vmx->nested.cached_vmcs12);
 	/* Unpin physical memory we referred to in current vmcs02 */
 	if (vmx->nested.apic_access_page) {
-		nested_release_page(vmx->nested.apic_access_page);
+		kvm_release_page_dirty(vmx->nested.apic_access_page);
 		vmx->nested.apic_access_page = NULL;
 	}
 	if (vmx->nested.virtual_apic_page) {
-		nested_release_page(vmx->nested.virtual_apic_page);
+		kvm_release_page_dirty(vmx->nested.virtual_apic_page);
 		vmx->nested.virtual_apic_page = NULL;
 	}
 	if (vmx->nested.pi_desc_page) {
 		kunmap(vmx->nested.pi_desc_page);
-		nested_release_page(vmx->nested.pi_desc_page);
+		kvm_release_page_dirty(vmx->nested.pi_desc_page);
 		vmx->nested.pi_desc_page = NULL;
 		vmx->nested.pi_desc = NULL;
 	}
···
 	if (vmx->nested.current_vmptr != vmptr) {
 		struct vmcs12 *new_vmcs12;
 		struct page *page;
-		page = nested_get_page(vcpu, vmptr);
-		if (page == NULL) {
+		page = kvm_vcpu_gpa_to_page(vcpu, vmptr);
+		if (is_error_page(page)) {
 			nested_vmx_failInvalid(vcpu);
 			return kvm_skip_emulated_instruction(vcpu);
 		}
 		new_vmcs12 = kmap(page);
 		if (new_vmcs12->revision_id != VMCS12_REVISION) {
 			kunmap(page);
-			nested_release_page_clean(page);
+			kvm_release_page_clean(page);
 			nested_vmx_failValid(vcpu,
 				VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
 			return kvm_skip_emulated_instruction(vcpu);
···
 		 */
 		memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE);
 		kunmap(page);
-		nested_release_page_clean(page);
+		kvm_release_page_clean(page);

 		set_current_vmptr(vmx, vmptr);
 	}
···
 	switch (type) {
 	case VMX_VPID_EXTENT_INDIVIDUAL_ADDR:
-		if (is_noncanonical_address(operand.gla)) {
+		if (is_noncanonical_address(operand.gla, vcpu)) {
 			nested_vmx_failValid(vcpu,
 				VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
 			return kvm_skip_emulated_instruction(vcpu);
···
 	return 1;
 }

+static bool valid_ept_address(struct kvm_vcpu *vcpu, u64 address)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	int maxphyaddr = cpuid_maxphyaddr(vcpu);
+
+	/* Check for memory type validity */
+	switch (address & VMX_EPTP_MT_MASK) {
+	case VMX_EPTP_MT_UC:
+		if (!(vmx->nested.nested_vmx_ept_caps & VMX_EPTP_UC_BIT))
+			return false;
+		break;
+	case VMX_EPTP_MT_WB:
+		if (!(vmx->nested.nested_vmx_ept_caps & VMX_EPTP_WB_BIT))
+			return false;
+		break;
+	default:
+		return false;
+	}
+
+	/* only 4 levels page-walk length are valid */
+	if ((address & VMX_EPTP_PWL_MASK) != VMX_EPTP_PWL_4)
+		return false;
+
+	/* Reserved bits should not be set */
+	if (address >> maxphyaddr || ((address >> 7) & 0x1f))
+		return false;
+
+	/* AD, if set, should be supported */
+	if (address & VMX_EPTP_AD_ENABLE_BIT) {
+		if (!(vmx->nested.nested_vmx_ept_caps & VMX_EPT_AD_BIT))
+			return false;
+	}
+
+	return true;
+}
+
+static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
+				     struct vmcs12 *vmcs12)
+{
+	u32 index = vcpu->arch.regs[VCPU_REGS_RCX];
+	u64 address;
+	bool accessed_dirty;
+	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
+
+	if (!nested_cpu_has_eptp_switching(vmcs12) ||
+	    !nested_cpu_has_ept(vmcs12))
+		return 1;
+
+	if (index >= VMFUNC_EPTP_ENTRIES)
+		return 1;
+
+
+	if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT,
+				     &address, index * 8, 8))
+		return 1;
+
+	accessed_dirty = !!(address & VMX_EPTP_AD_ENABLE_BIT);
+
+	/*
+	 * If the (L2) guest does a vmfunc to the currently
+	 * active ept pointer, we don't have to do anything else
+	 */
+	if (vmcs12->ept_pointer != address) {
+		if (!valid_ept_address(vcpu, address))
+			return 1;
+
+		kvm_mmu_unload(vcpu);
+		mmu->ept_ad = accessed_dirty;
+		mmu->base_role.ad_disabled = !accessed_dirty;
+		vmcs12->ept_pointer = address;
+		/*
+		 * TODO: Check what's the correct approach in case
+		 * mmu reload fails. Currently, we just let the next
+		 * reload potentially fail
+		 */
+		kvm_mmu_reload(vcpu);
+	}
+
+	return 0;
+}
+
+static int handle_vmfunc(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct vmcs12 *vmcs12;
+	u32 function = vcpu->arch.regs[VCPU_REGS_RAX];
+
+	/*
+	 * VMFUNC is only supported for nested guests, but we always enable the
+	 * secondary control for simplicity; for non-nested mode, fake that we
+	 * didn't by injecting #UD.
+	 */
+	if (!is_guest_mode(vcpu)) {
+		kvm_queue_exception(vcpu, UD_VECTOR);
+		return 1;
+	}
+
+	vmcs12 = get_vmcs12(vcpu);
+	if ((vmcs12->vm_function_control & (1 << function)) == 0)
+		goto fail;
+
+	switch (function) {
+	case 0:
+		if (nested_vmx_eptp_switching(vcpu, vmcs12))
+			goto fail;
+		break;
+	default:
+		goto fail;
+	}
+	return kvm_skip_emulated_instruction(vcpu);
+
+fail:
+	nested_vmx_vmexit(vcpu, vmx->exit_reason,
+			  vmcs_read32(VM_EXIT_INTR_INFO),
+			  vmcs_readl(EXIT_QUALIFICATION));
+	return 1;
+}
+
 /*
  * The exit handlers return 1 if the exit was handled fully and guest execution
  * may resume. Otherwise they set the kvm_run parameter to indicate what needs
···
 	[EXIT_REASON_MONITOR_INSTRUCTION]     = handle_monitor,
 	[EXIT_REASON_INVEPT]                  = handle_invept,
 	[EXIT_REASON_INVVPID]                 = handle_invvpid,
+	[EXIT_REASON_RDRAND]                  = handle_invalid_op,
+	[EXIT_REASON_RDSEED]                  = handle_invalid_op,
 	[EXIT_REASON_XSAVES]                  = handle_xsaves,
 	[EXIT_REASON_XRSTORS]                 = handle_xrstors,
 	[EXIT_REASON_PML_FULL]		      = handle_pml_full,
+	[EXIT_REASON_VMFUNC]                  = handle_vmfunc,
 	[EXIT_REASON_PREEMPTION_TIMER]	      = handle_preemption_timer,
 };
···
 		 * table is L0's fault.
 		 */
 		return false;
+	case EXIT_REASON_INVPCID:
+		return
+			nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) &&
+			nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
 	case EXIT_REASON_WBINVD:
 		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
 	case EXIT_REASON_XSETBV:
···
 		return false;
 	case EXIT_REASON_PML_FULL:
 		/* We emulate PML support to L1. */
 		return false;
+	case EXIT_REASON_VMFUNC:
+		/* VM functions are emulated through L2->L0 vmexits. */
+		return false;
 	default:
 		return true;
···
 	u32 vectoring_info = vmx->idt_vectoring_info;

 	trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX);
-	vcpu->arch.gpa_available = false;

 	/*
 	 * Flush logged GPAs PML buffer, this will make dirty_bitmap more
···
 	}
 }

-static int get_ept_level(void)
-{
-	return VMX_EPT_DEFAULT_GAW + 1;
-}
-
 static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
 {
 	u8 cache;
···
 static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
 {
-	struct kvm_cpuid_entry2 *best;
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	u32 secondary_exec_ctl = vmx_secondary_exec_control(vmx);

-	if (vmx_rdtscp_supported()) {
-		bool rdtscp_enabled = guest_cpuid_has_rdtscp(vcpu);
-		if (!rdtscp_enabled)
-			secondary_exec_ctl &= ~SECONDARY_EXEC_RDTSCP;
-
-		if (nested) {
-			if (rdtscp_enabled)
-				vmx->nested.nested_vmx_secondary_ctls_high |=
-					SECONDARY_EXEC_RDTSCP;
-			else
-				vmx->nested.nested_vmx_secondary_ctls_high &=
-					~SECONDARY_EXEC_RDTSCP;
-		}
+	if (cpu_has_secondary_exec_ctrls()) {
+		vmx_compute_secondary_exec_control(vmx);
+		vmcs_set_secondary_exec_control(vmx->secondary_exec_control);
 	}
-
-	/* Exposing INVPCID only when PCID is exposed */
-	best = kvm_find_cpuid_entry(vcpu, 0x7, 0);
-	if (vmx_invpcid_supported() &&
-	    (!best || !(best->ebx & bit(X86_FEATURE_INVPCID)) ||
-	     !guest_cpuid_has_pcid(vcpu))) {
-		secondary_exec_ctl &= ~SECONDARY_EXEC_ENABLE_INVPCID;
-
-		if (best)
-			best->ebx &= ~bit(X86_FEATURE_INVPCID);
-	}
-
-	if (cpu_has_secondary_exec_ctrls())
-		vmcs_set_secondary_exec_control(secondary_exec_ctl);

 	if (nested_vmx_allowed(vcpu))
 		to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
···
 static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu)
 {
-	return nested_ept_get_cr3(vcpu) & VMX_EPT_AD_ENABLE_BIT;
+	return nested_ept_get_cr3(vcpu) & VMX_EPTP_AD_ENABLE_BIT;
 }

 /* Callbacks for nested_ept_init_mmu_context: */
···
 static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
 {
-	bool wants_ad;
-
 	WARN_ON(mmu_is_nested(vcpu));
-	wants_ad = nested_ept_ad_enabled(vcpu);
-	if (wants_ad && !enable_ept_ad_bits)
+	if (!valid_ept_address(vcpu, nested_ept_get_cr3(vcpu)))
 		return 1;

 	kvm_mmu_unload(vcpu);
 	kvm_init_shadow_ept_mmu(vcpu,
 			to_vmx(vcpu)->nested.nested_vmx_ept_caps &
 			VMX_EPT_EXECUTE_ONLY_BIT,
-			wants_ad);
+			nested_ept_ad_enabled(vcpu));
 	vcpu->arch.mmu.set_cr3           = vmx_set_cr3;
 	vcpu->arch.mmu.get_cr3           = nested_ept_get_cr3;
 	vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault;
···
 					struct vmcs12 *vmcs12)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct page *page;
 	u64 hpa;

 	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
···
 		 * physical address remains valid. We keep a reference
 		 * to it so we can release it later.
 		 */
-		if (vmx->nested.apic_access_page) /* shouldn't happen */
-			nested_release_page(vmx->nested.apic_access_page);
-		vmx->nested.apic_access_page =
-			nested_get_page(vcpu, vmcs12->apic_access_addr);
+		if (vmx->nested.apic_access_page) { /* shouldn't happen */
+			kvm_release_page_dirty(vmx->nested.apic_access_page);
+			vmx->nested.apic_access_page = NULL;
+		}
+		page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr);
 		/*
 		 * If translation failed, no matter: This feature asks
 		 * to exit when accessing the given address, and if it
 		 * can never be accessed, this feature won't do
 		 * anything anyway.
 		 */
-		if (vmx->nested.apic_access_page) {
+		if (!is_error_page(page)) {
+			vmx->nested.apic_access_page = page;
 			hpa = page_to_phys(vmx->nested.apic_access_page);
 			vmcs_write64(APIC_ACCESS_ADDR, hpa);
 		} else {
···
 	}

 	if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
-		if (vmx->nested.virtual_apic_page) /* shouldn't happen */
-			nested_release_page(vmx->nested.virtual_apic_page);
-		vmx->nested.virtual_apic_page =
-			nested_get_page(vcpu, vmcs12->virtual_apic_page_addr);
+		if (vmx->nested.virtual_apic_page) { /* shouldn't happen */
+			kvm_release_page_dirty(vmx->nested.virtual_apic_page);
+			vmx->nested.virtual_apic_page = NULL;
+		}
+		page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->virtual_apic_page_addr);

 		/*
 		 * If translation failed, VM entry will fail because
···
 		 * control. But such a configuration is useless, so
 		 * let's keep the code simple.
 		 */
-		if (vmx->nested.virtual_apic_page) {
+		if (!is_error_page(page)) {
+			vmx->nested.virtual_apic_page = page;
 			hpa = page_to_phys(vmx->nested.virtual_apic_page);
 			vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, hpa);
 		}
···
 	if (nested_cpu_has_posted_intr(vmcs12)) {
 		if (vmx->nested.pi_desc_page) { /* shouldn't happen */
 			kunmap(vmx->nested.pi_desc_page);
-			nested_release_page(vmx->nested.pi_desc_page);
+			kvm_release_page_dirty(vmx->nested.pi_desc_page);
+			vmx->nested.pi_desc_page = NULL;
 		}
-		vmx->nested.pi_desc_page =
-			nested_get_page(vcpu, vmcs12->posted_intr_desc_addr);
-		vmx->nested.pi_desc =
-			(struct pi_desc *)kmap(vmx->nested.pi_desc_page);
-		if (!vmx->nested.pi_desc) {
-			nested_release_page_clean(vmx->nested.pi_desc_page);
+		page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->posted_intr_desc_addr);
+		if (is_error_page(page))
 			return;
-		}
+		vmx->nested.pi_desc_page = page;
+		vmx->nested.pi_desc = kmap(vmx->nested.pi_desc_page);
 		vmx->nested.pi_desc =
 			(struct pi_desc *)((void *)vmx->nested.pi_desc +
 			(unsigned long)(vmcs12->posted_intr_desc_addr &
···
 	return 0;
 }

+static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu,
+						struct vmcs12 *vmcs12)
+{
+	if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
+		return 0;
+
+	if (!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr))
+		return -EINVAL;
+
+	return 0;
+}
+
 /*
  * Merge L0's and L1's MSR bitmap, return false to indicate that
  * we do not use the hardware.
···
 	if (!nested_cpu_has_virt_x2apic_mode(vmcs12))
 		return false;

-	page = nested_get_page(vcpu, vmcs12->msr_bitmap);
-	if (!page)
+	page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->msr_bitmap);
+	if (is_error_page(page))
 		return false;
 	msr_bitmap_l1 = (unsigned long *)kmap(page);
···
 		}
 	}
 	kunmap(page);
-	nested_release_page_clean(page);
+	kvm_release_page_clean(page);

 	return true;
 }
···
 		enable_ept ? vmcs12->page_fault_error_code_match : 0);

 	if (cpu_has_secondary_exec_ctrls()) {
-		exec_control = vmx_secondary_exec_control(vmx);
+		exec_control = vmx->secondary_exec_control;

 		/* Take the following fields only from vmcs12 */
 		exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
+				  SECONDARY_EXEC_ENABLE_INVPCID |
 				  SECONDARY_EXEC_RDTSCP |
+				  SECONDARY_EXEC_XSAVES |
 				  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
-				  SECONDARY_EXEC_APIC_REGISTER_VIRT);
+				  SECONDARY_EXEC_APIC_REGISTER_VIRT |
+				  SECONDARY_EXEC_ENABLE_VMFUNC);
 		if (nested_cpu_has(vmcs12,
 				   CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) {
 			vmcs12_exec_ctrl = vmcs12->secondary_vm_exec_control &
 				~SECONDARY_EXEC_ENABLE_PML;
 			exec_control |= vmcs12_exec_ctrl;
 		}

+		/* All VMFUNCs are currently emulated through L0 vmexits.  */
+		if (exec_control & SECONDARY_EXEC_ENABLE_VMFUNC)
+			vmcs_write64(VM_FUNCTION_CONTROL, 0);

 		if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) {
 			vmcs_write64(EOI_EXIT_BITMAP0,
···
 	if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12))
 		return VMXERR_ENTRY_INVALID_CONTROL_FIELD;

+	if (nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12))
+		return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
 	if (nested_vmx_check_apicv_controls(vcpu, vmcs12))
 		return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
···
 				vmx->nested.nested_vmx_entry_ctls_low,
 				vmx->nested.nested_vmx_entry_ctls_high))
 		return VMXERR_ENTRY_INVALID_CONTROL_FIELD;

+	if (nested_cpu_has_vmfunc(vmcs12)) {
+		if (vmcs12->vm_function_control &
+		    ~vmx->nested.nested_vmx_vmfunc_controls)
+			return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
+		if (nested_cpu_has_eptp_switching(vmcs12)) {
+			if (!nested_cpu_has_ept(vmcs12) ||
+			    !page_address_valid(vcpu, vmcs12->eptp_list_address))
+				return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+		}
+	}

 	if (vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu))
 		return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
···
 	u32 idt_vectoring;
 	unsigned int nr;

-	if (vcpu->arch.exception.pending && vcpu->arch.exception.reinject) {
+	if (vcpu->arch.exception.injected) {
 		nr = vcpu->arch.exception.nr;
 		idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
···
 static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	unsigned long exit_qual;

-	if (vcpu->arch.exception.pending ||
-		vcpu->arch.nmi_injected ||
-		vcpu->arch.interrupt.pending)
+	if (kvm_event_needs_reinjection(vcpu))
 		return -EBUSY;
+
+	if (vcpu->arch.exception.pending &&
+		nested_vmx_check_exception(vcpu, &exit_qual)) {
+		if (vmx->nested.nested_run_pending)
+			return -EBUSY;
+		nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
+		vcpu->arch.exception.pending = false;
+		return 0;
+	}

 	if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) &&
 	    vmx->nested.preemption_timer_expired) {
···
 	/* Unpin physical memory we referred to in vmcs02 */
 	if (vmx->nested.apic_access_page) {
-		nested_release_page(vmx->nested.apic_access_page);
+		kvm_release_page_dirty(vmx->nested.apic_access_page);
 		vmx->nested.apic_access_page = NULL;
 	}
 	if (vmx->nested.virtual_apic_page) {
-		nested_release_page(vmx->nested.virtual_apic_page);
+		kvm_release_page_dirty(vmx->nested.virtual_apic_page);
 		vmx->nested.virtual_apic_page = NULL;
 	}
 	if (vmx->nested.pi_desc_page) {
 		kunmap(vmx->nested.pi_desc_page);
-		nested_release_page(vmx->nested.pi_desc_page);
+		kvm_release_page_dirty(vmx->nested.pi_desc_page);
 		vmx->nested.pi_desc_page = NULL;
 		vmx->nested.pi_desc = NULL;
 	}
···
 		gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS) & ~0xFFFull;

-		page = nested_get_page(vcpu, vmcs12->pml_address);
-		if (!page)
+		page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->pml_address);
+		if (is_error_page(page))
 			return 0;

 		pml_address = kmap(page);
 		pml_address[vmcs12->guest_pml_index--] = gpa;
 		kunmap(page);
-		nested_release_page_clean(page);
+		kvm_release_page_clean(page);
 	}

 	return 0;
arch/x86/kvm/x86.c  (+127 −86)
···
 			     (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE);
 	u64 new_state = msr_info->data &
 			(MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE);
-	u64 reserved_bits = ((~0ULL) << cpuid_maxphyaddr(vcpu)) |
-		0x2ff | (guest_cpuid_has_x2apic(vcpu) ? 0 : X2APIC_ENABLE);
+	u64 reserved_bits = ((~0ULL) << cpuid_maxphyaddr(vcpu)) | 0x2ff |
+		(guest_cpuid_has(vcpu, X86_FEATURE_X2APIC) ? 0 : X2APIC_ENABLE);

+	if ((msr_info->data & reserved_bits) || new_state == X2APIC_ENABLE)
+		return 1;
 	if (!msr_info->host_initiated &&
-	    ((msr_info->data & reserved_bits) != 0 ||
-	     new_state == X2APIC_ENABLE ||
-	     (new_state == MSR_IA32_APICBASE_ENABLE &&
+	    ((new_state == MSR_IA32_APICBASE_ENABLE &&
 	      old_state == (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE)) ||
 	     (new_state == (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE) &&
 	      old_state == 0)))
···
 	kvm_make_request(KVM_REQ_EVENT, vcpu);

-	if (!vcpu->arch.exception.pending) {
+	if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) {
 	queue:
 		if (has_error && !is_protmode(vcpu))
 			has_error = false;
-		vcpu->arch.exception.pending = true;
+		if (reinject) {
+			/*
+			 * On vmentry, vcpu->arch.exception.pending is only
+			 * true if an event injection was blocked by
+			 * nested_run_pending.  In that case, however,
+			 * vcpu_enter_guest requests an immediate exit,
+			 * and the guest shouldn't proceed far enough to
+			 * need reinjection.
+			 */
+			WARN_ON_ONCE(vcpu->arch.exception.pending);
+			vcpu->arch.exception.injected = true;
+		} else {
+			vcpu->arch.exception.pending = true;
+			vcpu->arch.exception.injected = false;
+		}
 		vcpu->arch.exception.has_error_code = has_error;
 		vcpu->arch.exception.nr = nr;
 		vcpu->arch.exception.error_code = error_code;
-		vcpu->arch.exception.reinject = reinject;
 		return;
 	}
···
 	class2 = exception_class(nr);
 	if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY)
 		|| (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
-		/* generate double fault per SDM Table 5-5 */
+		/*
+		 * Generate double fault per SDM Table 5-5.  Set
+		 * exception.pending = true so that the double fault
+		 * can trigger a nested vmexit.
+		 */
 		vcpu->arch.exception.pending = true;
+		vcpu->arch.exception.injected = false;
 		vcpu->arch.exception.has_error_code = true;
 		vcpu->arch.exception.nr = DF_VECTOR;
 		vcpu->arch.exception.error_code = 0;
···
 	if (cr4 & CR4_RESERVED_BITS)
 		return 1;

-	if (!guest_cpuid_has_xsave(vcpu) && (cr4 & X86_CR4_OSXSAVE))
+	if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && (cr4 & X86_CR4_OSXSAVE))
 		return 1;

-	if (!guest_cpuid_has_smep(vcpu) && (cr4 & X86_CR4_SMEP))
+	if (!guest_cpuid_has(vcpu, X86_FEATURE_SMEP) && (cr4 & X86_CR4_SMEP))
 		return 1;

-	if (!guest_cpuid_has_smap(vcpu) && (cr4 & X86_CR4_SMAP))
+	if (!guest_cpuid_has(vcpu, X86_FEATURE_SMAP) && (cr4 & X86_CR4_SMAP))
 		return 1;

-	if (!guest_cpuid_has_fsgsbase(vcpu) && (cr4 & X86_CR4_FSGSBASE))
+	if (!guest_cpuid_has(vcpu, X86_FEATURE_FSGSBASE) && (cr4 & X86_CR4_FSGSBASE))
 		return 1;

-	if (!guest_cpuid_has_pku(vcpu) && (cr4 & X86_CR4_PKE))
+	if (!guest_cpuid_has(vcpu, X86_FEATURE_PKU) && (cr4 & X86_CR4_PKE))
+		return 1;
+
+	if (!guest_cpuid_has(vcpu, X86_FEATURE_LA57) && (cr4 & X86_CR4_LA57))
 		return 1;

 	if (is_long_mode(vcpu)) {
···
 			return 1;

 		if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) {
-			if (!guest_cpuid_has_pcid(vcpu))
+			if (!guest_cpuid_has(vcpu, X86_FEATURE_PCID))
 				return 1;

 			/* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */
···
 		return 0;
 	}

-	if (is_long_mode(vcpu)) {
-		if (cr3 & CR3_L_MODE_RESERVED_BITS)
-			return 1;
-	} else if (is_pae(vcpu) && is_paging(vcpu) &&
+	if (is_long_mode(vcpu) &&
+	    (cr3 & rsvd_bits(cpuid_maxphyaddr(vcpu), 62)))
+		return 1;
+	else if (is_pae(vcpu) && is_paging(vcpu) &&
 		   !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
 		return 1;
···
 {
 	u64 fixed = DR6_FIXED_1;

-	if (!guest_cpuid_has_rtm(vcpu))
+	if (!guest_cpuid_has(vcpu, X86_FEATURE_RTM))
 		fixed |= DR6_RTM;
 	return fixed;
 }
···
 	MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
 	HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
 	HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
+	HV_X64_MSR_TSC_FREQUENCY, HV_X64_MSR_APIC_FREQUENCY,
 	HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2,
 	HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL,
 	HV_X64_MSR_RESET,
···
 	if (efer & efer_reserved_bits)
 		return false;

-	if (efer & EFER_FFXSR) {
-		struct kvm_cpuid_entry2 *feat;
-
-		feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
-		if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT)))
+	if (efer & EFER_FFXSR && !guest_cpuid_has(vcpu, X86_FEATURE_FXSR_OPT))
 			return false;
-	}

-	if (efer & EFER_SVME) {
-		struct kvm_cpuid_entry2 *feat;
-
-		feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
-		if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM)))
+	if (efer & EFER_SVME && !guest_cpuid_has(vcpu, X86_FEATURE_SVM))
 			return false;
-	}

 	return true;
 }
···
 	case MSR_KERNEL_GS_BASE:
 	case MSR_CSTAR:
 	case MSR_LSTAR:
-		if (is_noncanonical_address(msr->data))
+		if (is_noncanonical_address(msr->data, vcpu))
 			return 1;
 		break;
 	case MSR_IA32_SYSENTER_EIP:
···
 		 * value, and that something deterministic happens if the guest
 		 * invokes 64-bit SYSENTER.
 		 */
-		msr->data = get_canonical(msr->data);
+		msr->data = get_canonical(msr->data, vcpu_virt_addr_bits(vcpu));
 	}
 	return kvm_x86_ops->set_msr(vcpu, msr);
 }
···
 	vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
 	vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;

-	if (guest_cpuid_has_tsc_adjust(vcpu) && !msr->host_initiated)
+	if (!msr->host_initiated && guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST))
 		update_ia32_tsc_adjust_msr(vcpu, offset);
+
 	kvm_vcpu_write_tsc_offset(vcpu, offset);
 	raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
···
 		kvm_set_lapic_tscdeadline_msr(vcpu, data);
 		break;
 	case MSR_IA32_TSC_ADJUST:
-		if (guest_cpuid_has_tsc_adjust(vcpu)) {
+		if (guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST)) {
 			if (!msr_info->host_initiated) {
 				s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr;
 				adjust_tsc_offset_guest(vcpu, adj);
···
 		vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data 0x%llx\n", msr, data);
 		break;
 	case MSR_AMD64_OSVW_ID_LENGTH:
-		if (!guest_cpuid_has_osvw(vcpu))
+		if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
 			return 1;
 		vcpu->arch.osvw.length = data;
 		break;
 	case MSR_AMD64_OSVW_STATUS:
-		if
(!guest_cpuid_has_osvw(vcpu)) 2315 + if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW)) 2329 2316 return 1; 2330 2317 vcpu->arch.osvw.status = data; 2331 2318 break; ··· 2550 2537 msr_info->data = 0xbe702111; 2551 2538 break; 2552 2539 case MSR_AMD64_OSVW_ID_LENGTH: 2553 - if (!guest_cpuid_has_osvw(vcpu)) 2540 + if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW)) 2554 2541 return 1; 2555 2542 msr_info->data = vcpu->arch.osvw.length; 2556 2543 break; 2557 2544 case MSR_AMD64_OSVW_STATUS: 2558 - if (!guest_cpuid_has_osvw(vcpu)) 2545 + if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW)) 2559 2546 return 1; 2560 2547 msr_info->data = vcpu->arch.osvw.status; 2561 2548 break; ··· 2895 2882 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) 2896 2883 { 2897 2884 int idx; 2885 + 2886 + if (vcpu->preempted) 2887 + vcpu->arch.preempted_in_kernel = !kvm_x86_ops->get_cpl(vcpu); 2888 + 2898 2889 /* 2899 2890 * Disable page faults because we're in atomic context here. 2900 2891 * kvm_write_guest_offset_cached() would call might_fault() ··· 3091 3074 struct kvm_vcpu_events *events) 3092 3075 { 3093 3076 process_nmi(vcpu); 3077 + /* 3078 + * FIXME: pass injected and pending separately. This is only 3079 + * needed for nested virtualization, whose state cannot be 3080 + * migrated yet. For now we can combine them. 
3081 + */ 3094 3082 events->exception.injected = 3095 - vcpu->arch.exception.pending && 3083 + (vcpu->arch.exception.pending || 3084 + vcpu->arch.exception.injected) && 3096 3085 !kvm_exception_is_soft(vcpu->arch.exception.nr); 3097 3086 events->exception.nr = vcpu->arch.exception.nr; 3098 3087 events->exception.has_error_code = vcpu->arch.exception.has_error_code; ··· 3153 3130 return -EINVAL; 3154 3131 3155 3132 process_nmi(vcpu); 3133 + vcpu->arch.exception.injected = false; 3156 3134 vcpu->arch.exception.pending = events->exception.injected; 3157 3135 vcpu->arch.exception.nr = events->exception.nr; 3158 3136 vcpu->arch.exception.has_error_code = events->exception.has_error_code; ··· 4695 4671 */ 4696 4672 if (vcpu->arch.gpa_available && 4697 4673 emulator_can_use_gpa(ctxt) && 4698 - vcpu_is_mmio_gpa(vcpu, addr, exception->address, write) && 4699 - (addr & ~PAGE_MASK) == (exception->address & ~PAGE_MASK)) { 4700 - gpa = exception->address; 4701 - goto mmio; 4674 + (addr & ~PAGE_MASK) == (vcpu->arch.gpa_val & ~PAGE_MASK)) { 4675 + gpa = vcpu->arch.gpa_val; 4676 + ret = vcpu_is_mmio_gpa(vcpu, addr, gpa, write); 4677 + } else { 4678 + ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write); 4679 + if (ret < 0) 4680 + return X86EMUL_PROPAGATE_FAULT; 4702 4681 } 4703 4682 4704 - ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write); 4705 - 4706 - if (ret < 0) 4707 - return X86EMUL_PROPAGATE_FAULT; 4708 - 4709 - /* For APIC access vmexit */ 4710 - if (ret) 4711 - goto mmio; 4712 - 4713 - if (ops->read_write_emulate(vcpu, gpa, val, bytes)) 4683 + if (!ret && ops->read_write_emulate(vcpu, gpa, val, bytes)) 4714 4684 return X86EMUL_CONTINUE; 4715 4685 4716 - mmio: 4717 4686 /* 4718 4687 * Is this MMIO handled locally? 
4719 4688 */ ··· 5244 5227 return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage); 5245 5228 } 5246 5229 5247 - static void emulator_get_cpuid(struct x86_emulate_ctxt *ctxt, 5248 - u32 *eax, u32 *ebx, u32 *ecx, u32 *edx) 5230 + static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt, 5231 + u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, bool check_limit) 5249 5232 { 5250 - kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx); 5233 + return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, check_limit); 5251 5234 } 5252 5235 5253 5236 static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg) ··· 6379 6362 int r; 6380 6363 6381 6364 /* try to reinject previous events if any */ 6365 + if (vcpu->arch.exception.injected) { 6366 + kvm_x86_ops->queue_exception(vcpu); 6367 + return 0; 6368 + } 6369 + 6370 + /* 6371 + * Exceptions must be injected immediately, or the exception 6372 + * frame will have the address of the NMI or interrupt handler. 6373 + */ 6374 + if (!vcpu->arch.exception.pending) { 6375 + if (vcpu->arch.nmi_injected) { 6376 + kvm_x86_ops->set_nmi(vcpu); 6377 + return 0; 6378 + } 6379 + 6380 + if (vcpu->arch.interrupt.pending) { 6381 + kvm_x86_ops->set_irq(vcpu); 6382 + return 0; 6383 + } 6384 + } 6385 + 6386 + if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) { 6387 + r = kvm_x86_ops->check_nested_events(vcpu, req_int_win); 6388 + if (r != 0) 6389 + return r; 6390 + } 6391 + 6392 + /* try to inject new event if pending */ 6382 6393 if (vcpu->arch.exception.pending) { 6383 6394 trace_kvm_inj_exception(vcpu->arch.exception.nr, 6384 6395 vcpu->arch.exception.has_error_code, 6385 6396 vcpu->arch.exception.error_code); 6397 + 6398 + vcpu->arch.exception.pending = false; 6399 + vcpu->arch.exception.injected = true; 6386 6400 6387 6401 if (exception_type(vcpu->arch.exception.nr) == EXCPT_FAULT) 6388 6402 __kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) | ··· 6426 6378 } 6427 6379 6428 6380 kvm_x86_ops->queue_exception(vcpu); 6429 - 
return 0; 6430 - } 6431 - 6432 - if (vcpu->arch.nmi_injected) { 6433 - kvm_x86_ops->set_nmi(vcpu); 6434 - return 0; 6435 - } 6436 - 6437 - if (vcpu->arch.interrupt.pending) { 6438 - kvm_x86_ops->set_irq(vcpu); 6439 - return 0; 6440 - } 6441 - 6442 - if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) { 6443 - r = kvm_x86_ops->check_nested_events(vcpu, req_int_win); 6444 - if (r != 0) 6445 - return r; 6446 - } 6447 - 6448 - /* try to inject new event if pending */ 6449 - if (vcpu->arch.smi_pending && !is_smm(vcpu)) { 6381 + } else if (vcpu->arch.smi_pending && !is_smm(vcpu)) { 6450 6382 vcpu->arch.smi_pending = false; 6451 6383 enter_smm(vcpu); 6452 6384 } else if (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu)) { ··· 6643 6615 trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, true); 6644 6616 vcpu->arch.hflags |= HF_SMM_MASK; 6645 6617 memset(buf, 0, 512); 6646 - if (guest_cpuid_has_longmode(vcpu)) 6618 + if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) 6647 6619 enter_smm_save_state_64(vcpu, buf); 6648 6620 else 6649 6621 enter_smm_save_state_32(vcpu, buf); ··· 6695 6667 kvm_set_segment(vcpu, &ds, VCPU_SREG_GS); 6696 6668 kvm_set_segment(vcpu, &ds, VCPU_SREG_SS); 6697 6669 6698 - if (guest_cpuid_has_longmode(vcpu)) 6670 + if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) 6699 6671 kvm_x86_ops->set_efer(vcpu, 0); 6700 6672 6701 6673 kvm_update_cpuid(vcpu); ··· 6802 6774 } 6803 6775 if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) { 6804 6776 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; 6777 + vcpu->mmio_needed = 0; 6805 6778 r = 0; 6806 6779 goto out; 6807 6780 } ··· 6891 6862 kvm_x86_ops->enable_nmi_window(vcpu); 6892 6863 if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win) 6893 6864 kvm_x86_ops->enable_irq_window(vcpu); 6865 + WARN_ON(vcpu->arch.exception.pending); 6894 6866 } 6895 6867 6896 6868 if (kvm_lapic_enabled(vcpu)) { ··· 7034 7004 if (vcpu->arch.apic_attention) 7035 7005 kvm_lapic_sync_from_vapic(vcpu); 7036 7006 7007 + 
vcpu->arch.gpa_available = false; 7037 7008 r = kvm_x86_ops->handle_exit(vcpu); 7038 7009 return r; 7039 7010 ··· 7453 7422 int pending_vec, max_bits, idx; 7454 7423 struct desc_ptr dt; 7455 7424 7456 - if (!guest_cpuid_has_xsave(vcpu) && (sregs->cr4 & X86_CR4_OSXSAVE)) 7425 + if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && 7426 + (sregs->cr4 & X86_CR4_OSXSAVE)) 7427 + return -EINVAL; 7428 + 7429 + apic_base_msr.data = sregs->apic_base; 7430 + apic_base_msr.host_initiated = true; 7431 + if (kvm_set_apic_base(vcpu, &apic_base_msr)) 7457 7432 return -EINVAL; 7458 7433 7459 7434 dt.size = sregs->idt.limit; ··· 7478 7441 7479 7442 mmu_reset_needed |= vcpu->arch.efer != sregs->efer; 7480 7443 kvm_x86_ops->set_efer(vcpu, sregs->efer); 7481 - apic_base_msr.data = sregs->apic_base; 7482 - apic_base_msr.host_initiated = true; 7483 - kvm_set_apic_base(vcpu, &apic_base_msr); 7484 7444 7485 7445 mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0; 7486 7446 kvm_x86_ops->set_cr0(vcpu, sregs->cr0); ··· 7768 7734 vcpu->arch.nmi_injected = false; 7769 7735 kvm_clear_interrupt_queue(vcpu); 7770 7736 kvm_clear_exception_queue(vcpu); 7737 + vcpu->arch.exception.pending = false; 7771 7738 7772 7739 memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db)); 7773 7740 kvm_update_dr0123(vcpu); ··· 8028 7993 kvm_pmu_init(vcpu); 8029 7994 8030 7995 vcpu->arch.pending_external_vector = -1; 7996 + vcpu->arch.preempted_in_kernel = false; 8031 7997 8032 7998 kvm_hv_vcpu_init(vcpu); 8033 7999 ··· 8474 8438 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) 8475 8439 { 8476 8440 return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu); 8441 + } 8442 + 8443 + bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu) 8444 + { 8445 + return vcpu->arch.preempted_in_kernel; 8477 8446 } 8478 8447 8479 8448 int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
+51 -3
arch/x86/kvm/x86.h
··· 11 11 12 12 static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu) 13 13 { 14 - vcpu->arch.exception.pending = false; 14 + vcpu->arch.exception.injected = false; 15 15 } 16 16 17 17 static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector, ··· 29 29 30 30 static inline bool kvm_event_needs_reinjection(struct kvm_vcpu *vcpu) 31 31 { 32 - return vcpu->arch.exception.pending || vcpu->arch.interrupt.pending || 32 + return vcpu->arch.exception.injected || vcpu->arch.interrupt.pending || 33 33 vcpu->arch.nmi_injected; 34 34 } 35 35 ··· 62 62 return cs_l; 63 63 } 64 64 65 + static inline bool is_la57_mode(struct kvm_vcpu *vcpu) 66 + { 67 + #ifdef CONFIG_X86_64 68 + return (vcpu->arch.efer & EFER_LMA) && 69 + kvm_read_cr4_bits(vcpu, X86_CR4_LA57); 70 + #else 71 + return 0; 72 + #endif 73 + } 74 + 65 75 static inline bool mmu_is_nested(struct kvm_vcpu *vcpu) 66 76 { 67 77 return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu; ··· 97 87 return 1 << (bitno & 31); 98 88 } 99 89 90 + static inline u8 vcpu_virt_addr_bits(struct kvm_vcpu *vcpu) 91 + { 92 + return kvm_read_cr4_bits(vcpu, X86_CR4_LA57) ? 57 : 48; 93 + } 94 + 95 + static inline u8 ctxt_virt_addr_bits(struct x86_emulate_ctxt *ctxt) 96 + { 97 + return (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_LA57) ? 
57 : 48; 98 + } 99 + 100 + static inline u64 get_canonical(u64 la, u8 vaddr_bits) 101 + { 102 + return ((int64_t)la << (64 - vaddr_bits)) >> (64 - vaddr_bits); 103 + } 104 + 105 + static inline bool is_noncanonical_address(u64 la, struct kvm_vcpu *vcpu) 106 + { 107 + #ifdef CONFIG_X86_64 108 + return get_canonical(la, vcpu_virt_addr_bits(vcpu)) != la; 109 + #else 110 + return false; 111 + #endif 112 + } 113 + 114 + static inline bool emul_is_noncanonical_address(u64 la, 115 + struct x86_emulate_ctxt *ctxt) 116 + { 117 + #ifdef CONFIG_X86_64 118 + return get_canonical(la, ctxt_virt_addr_bits(ctxt)) != la; 119 + #else 120 + return false; 121 + #endif 122 + } 123 + 100 124 static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu, 101 125 gva_t gva, gfn_t gfn, unsigned access) 102 126 { 103 - vcpu->arch.mmio_gva = gva & PAGE_MASK; 127 + /* 128 + * If this is a shadow nested page table, the "GVA" is 129 + * actually a nGPA. 130 + */ 131 + vcpu->arch.mmio_gva = mmu_is_nested(vcpu) ? 0 : gva & PAGE_MASK; 104 132 vcpu->arch.access = access; 105 133 vcpu->arch.mmio_gfn = gfn; 106 134 vcpu->arch.mmio_gen = kvm_memslots(vcpu->kvm)->generation;
+8 -1
include/linux/kvm_host.h
··· 720 720 bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu); 721 721 void kvm_vcpu_kick(struct kvm_vcpu *vcpu); 722 722 int kvm_vcpu_yield_to(struct kvm_vcpu *target); 723 - void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu); 723 + void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu, bool usermode_vcpu_not_eligible); 724 724 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu); 725 725 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu); 726 726 ··· 800 800 void kvm_arch_hardware_unsetup(void); 801 801 void kvm_arch_check_processor_compat(void *rtn); 802 802 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu); 803 + bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu); 803 804 int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu); 804 805 805 806 #ifndef __KVM_HAVE_ARCH_VM_ALLOC ··· 984 983 static inline hpa_t pfn_to_hpa(kvm_pfn_t pfn) 985 984 { 986 985 return (hpa_t)pfn << PAGE_SHIFT; 986 + } 987 + 988 + static inline struct page *kvm_vcpu_gpa_to_page(struct kvm_vcpu *vcpu, 989 + gpa_t gpa) 990 + { 991 + return kvm_vcpu_gfn_to_page(vcpu, gpa_to_gfn(gpa)); 987 992 } 988 993 989 994 static inline bool kvm_is_error_gpa(struct kvm *kvm, gpa_t gpa)
+2 -1
include/uapi/linux/kvm.h
···
711 711 struct kvm_ppc_smmu_info {
712 712 	__u64 flags;
713 713 	__u32 slb_size;
714 -   	__u32 pad;
714 +   	__u16 data_keys;  /* # storage keys supported for data */
715 +   	__u16 instr_keys; /* # storage keys supported for instructions */
715 716 	struct kvm_ppc_one_seg_page_size sps[KVM_PPC_PAGE_SIZES_MAX_SZ];
716 717 };
+5
virt/kvm/arm/arm.c
···
416 416 		&& !v->arch.power_off && !v->arch.pause);
417 417 }
418 418
419 +   bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
420 +   {
421 +   	return vcpu_mode_priv(vcpu);
422 +   }
423 +
419 424 /* Just ensure a guest exit from a particular CPU */
420 425 static void exit_vm_noop(void *info)
421 426 {
+11 -29
virt/kvm/arm/mmu.c
··· 1454 1454 kvm_set_pfn_accessed(pfn); 1455 1455 } 1456 1456 1457 - static bool is_abort_sea(unsigned long fault_status) 1458 - { 1459 - switch (fault_status) { 1460 - case FSC_SEA: 1461 - case FSC_SEA_TTW0: 1462 - case FSC_SEA_TTW1: 1463 - case FSC_SEA_TTW2: 1464 - case FSC_SEA_TTW3: 1465 - case FSC_SECC: 1466 - case FSC_SECC_TTW0: 1467 - case FSC_SECC_TTW1: 1468 - case FSC_SECC_TTW2: 1469 - case FSC_SECC_TTW3: 1470 - return true; 1471 - default: 1472 - return false; 1473 - } 1474 - } 1475 - 1476 1457 /** 1477 1458 * kvm_handle_guest_abort - handles all 2nd stage aborts 1478 1459 * @vcpu: the VCPU pointer ··· 1479 1498 fault_status = kvm_vcpu_trap_get_fault_type(vcpu); 1480 1499 1481 1500 fault_ipa = kvm_vcpu_get_fault_ipa(vcpu); 1501 + is_iabt = kvm_vcpu_trap_is_iabt(vcpu); 1482 1502 1483 - /* 1484 - * The host kernel will handle the synchronous external abort. There 1485 - * is no need to pass the error into the guest. 1486 - */ 1487 - if (is_abort_sea(fault_status)) { 1503 + /* Synchronous External Abort? */ 1504 + if (kvm_vcpu_dabt_isextabt(vcpu)) { 1505 + /* 1506 + * For RAS the host kernel may handle this abort. 1507 + * There is no need to pass the error into the guest. 1508 + */ 1488 1509 if (!handle_guest_sea(fault_ipa, kvm_vcpu_get_hsr(vcpu))) 1489 1510 return 1; 1490 - } 1491 1511 1492 - is_iabt = kvm_vcpu_trap_is_iabt(vcpu); 1493 - if (unlikely(!is_iabt && kvm_vcpu_dabt_isextabt(vcpu))) { 1494 - kvm_inject_vabt(vcpu); 1495 - return 1; 1512 + if (unlikely(!is_iabt)) { 1513 + kvm_inject_vabt(vcpu); 1514 + return 1; 1515 + } 1496 1516 } 1497 1517 1498 1518 trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_hsr(vcpu),
+2 -2
virt/kvm/arm/vgic/vgic-debug.c
···
234 234 	return 0;
235 235 }
236 236
237 -   static struct seq_operations vgic_debug_seq_ops = {
237 +   static const struct seq_operations vgic_debug_seq_ops = {
238 238 	.start = vgic_debug_start,
239 239 	.next = vgic_debug_next,
240 240 	.stop = vgic_debug_stop,
···
255 255 	return ret;
256 256 };
257 257
258 -   static struct file_operations vgic_debug_fops = {
258 +   static const struct file_operations vgic_debug_fops = {
259 259 	.owner = THIS_MODULE,
260 260 	.open = debug_open,
261 261 	.read = seq_read,
+4 -6
virt/kvm/arm/vgic/vgic-its.c
··· 144 144 145 145 struct vgic_irq *irq; 146 146 struct its_collection *collection; 147 - u32 lpi; 148 147 u32 event_id; 149 148 }; 150 149 ··· 812 813 /* Must be called with its_lock mutex held */ 813 814 static struct its_ite *vgic_its_alloc_ite(struct its_device *device, 814 815 struct its_collection *collection, 815 - u32 lpi_id, u32 event_id) 816 + u32 event_id) 816 817 { 817 818 struct its_ite *ite; 818 819 ··· 822 823 823 824 ite->event_id = event_id; 824 825 ite->collection = collection; 825 - ite->lpi = lpi_id; 826 826 827 827 list_add_tail(&ite->ite_list, &device->itt_head); 828 828 return ite; ··· 871 873 new_coll = collection; 872 874 } 873 875 874 - ite = vgic_its_alloc_ite(device, collection, lpi_nr, event_id); 876 + ite = vgic_its_alloc_ite(device, collection, event_id); 875 877 if (IS_ERR(ite)) { 876 878 if (new_coll) 877 879 vgic_its_free_collection(its, coll_id); ··· 1846 1848 1847 1849 next_offset = compute_next_eventid_offset(&dev->itt_head, ite); 1848 1850 val = ((u64)next_offset << KVM_ITS_ITE_NEXT_SHIFT) | 1849 - ((u64)ite->lpi << KVM_ITS_ITE_PINTID_SHIFT) | 1851 + ((u64)ite->irq->intid << KVM_ITS_ITE_PINTID_SHIFT) | 1850 1852 ite->collection->collection_id; 1851 1853 val = cpu_to_le64(val); 1852 1854 return kvm_write_guest(kvm, gpa, &val, ite_esz); ··· 1893 1895 if (!collection) 1894 1896 return -EINVAL; 1895 1897 1896 - ite = vgic_its_alloc_ite(dev, collection, lpi_id, event_id); 1898 + ite = vgic_its_alloc_ite(dev, collection, event_id); 1897 1899 if (IS_ERR(ite)) 1898 1900 return PTR_ERR(ite); 1899 1901
+46 -1
virt/kvm/arm/vgic/vgic-mmio-v2.c
··· 303 303 vgic_set_vmcr(vcpu, &vmcr); 304 304 } 305 305 306 + static unsigned long vgic_mmio_read_apr(struct kvm_vcpu *vcpu, 307 + gpa_t addr, unsigned int len) 308 + { 309 + int n; /* which APRn is this */ 310 + 311 + n = (addr >> 2) & 0x3; 312 + 313 + if (kvm_vgic_global_state.type == VGIC_V2) { 314 + /* GICv2 hardware systems support max. 32 groups */ 315 + if (n != 0) 316 + return 0; 317 + return vcpu->arch.vgic_cpu.vgic_v2.vgic_apr; 318 + } else { 319 + struct vgic_v3_cpu_if *vgicv3 = &vcpu->arch.vgic_cpu.vgic_v3; 320 + 321 + if (n > vgic_v3_max_apr_idx(vcpu)) 322 + return 0; 323 + /* GICv3 only uses ICH_AP1Rn for memory mapped (GICv2) guests */ 324 + return vgicv3->vgic_ap1r[n]; 325 + } 326 + } 327 + 328 + static void vgic_mmio_write_apr(struct kvm_vcpu *vcpu, 329 + gpa_t addr, unsigned int len, 330 + unsigned long val) 331 + { 332 + int n; /* which APRn is this */ 333 + 334 + n = (addr >> 2) & 0x3; 335 + 336 + if (kvm_vgic_global_state.type == VGIC_V2) { 337 + /* GICv2 hardware systems support max. 32 groups */ 338 + if (n != 0) 339 + return; 340 + vcpu->arch.vgic_cpu.vgic_v2.vgic_apr = val; 341 + } else { 342 + struct vgic_v3_cpu_if *vgicv3 = &vcpu->arch.vgic_cpu.vgic_v3; 343 + 344 + if (n > vgic_v3_max_apr_idx(vcpu)) 345 + return; 346 + /* GICv3 only uses ICH_AP1Rn for memory mapped (GICv2) guests */ 347 + vgicv3->vgic_ap1r[n] = val; 348 + } 349 + } 350 + 306 351 static const struct vgic_register_region vgic_v2_dist_registers[] = { 307 352 REGISTER_DESC_WITH_LENGTH(GIC_DIST_CTRL, 308 353 vgic_mmio_read_v2_misc, vgic_mmio_write_v2_misc, 12, ··· 409 364 vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4, 410 365 VGIC_ACCESS_32bit), 411 366 REGISTER_DESC_WITH_LENGTH(GIC_CPU_ACTIVEPRIO, 412 - vgic_mmio_read_raz, vgic_mmio_write_wi, 16, 367 + vgic_mmio_read_apr, vgic_mmio_write_apr, 16, 413 368 VGIC_ACCESS_32bit), 414 369 REGISTER_DESC_WITH_LENGTH(GIC_CPU_IDENT, 415 370 vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4,
+16
virt/kvm/arm/vgic/vgic.h
···
220 220 bool lock_all_vcpus(struct kvm *kvm);
221 221 void unlock_all_vcpus(struct kvm *kvm);
222 222
223 +   static inline int vgic_v3_max_apr_idx(struct kvm_vcpu *vcpu)
224 +   {
225 +   	struct vgic_cpu *cpu_if = &vcpu->arch.vgic_cpu;
226 +
227 +   	/*
228 +   	 * num_pri_bits are initialized with HW supported values.
229 +   	 * We can rely safely on num_pri_bits even if VM has not
230 +   	 * restored ICC_CTLR_EL1 before restoring APnR registers.
231 +   	 */
232 +   	switch (cpu_if->num_pri_bits) {
233 +   	case 7: return 3;
234 +   	case 6: return 1;
235 +   	default: return 0;
236 +   	}
237 +   }
238 +
223 239 #endif
+5 -2
virt/kvm/kvm_main.c
··· 1609 1609 struct page **pages, int nr_pages) 1610 1610 { 1611 1611 unsigned long addr; 1612 - gfn_t entry; 1612 + gfn_t entry = 0; 1613 1613 1614 1614 addr = gfn_to_hva_many(slot, gfn, &entry); 1615 1615 if (kvm_is_error_hva(addr)) ··· 1928 1928 * verify that the entire region is valid here. 1929 1929 */ 1930 1930 while (start_gfn <= end_gfn) { 1931 + nr_pages_avail = 0; 1931 1932 ghc->memslot = __gfn_to_memslot(slots, start_gfn); 1932 1933 ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn, 1933 1934 &nr_pages_avail); ··· 2276 2275 #endif 2277 2276 } 2278 2277 2279 - void kvm_vcpu_on_spin(struct kvm_vcpu *me) 2278 + void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode) 2280 2279 { 2281 2280 struct kvm *kvm = me->kvm; 2282 2281 struct kvm_vcpu *vcpu; ··· 2306 2305 if (vcpu == me) 2307 2306 continue; 2308 2307 if (swait_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu)) 2308 + continue; 2309 + if (yield_to_kernel_mode && !kvm_arch_vcpu_in_kernel(vcpu)) 2309 2310 continue; 2310 2311 if (!kvm_vcpu_eligible_for_directed_yield(vcpu)) 2311 2312 continue;