Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge tag 'kvm-x86-cet-6.18' of https://github.com/kvm-x86/linux into HEAD

KVM x86 CET virtualization support for 6.18

Add support for virtualizing Control-flow Enforcement Technology (CET) on
Intel (Shadow Stacks and Indirect Branch Tracking) and AMD (Shadow Stacks).

CET comprises two distinct features, Shadow Stacks (SHSTK) and Indirect
Branch Tracking (IBT), that can be utilized by software to help provide
Control-flow integrity (CFI). SHSTK defends against backward-edge attacks
(a.k.a. Return-oriented programming (ROP)), while IBT defends against
forward-edge attacks (a.k.a. CALL/JMP-oriented programming (COP/JOP)).

Attackers commonly use ROP and COP/JOP methodologies to redirect the control
flow to unauthorized targets in order to execute small snippets of code,
a.k.a. gadgets, of the attacker's choice. By chaining together several gadgets,
an attacker can perform arbitrary operations and circumvent the system's
defenses.

SHSTK defends against backward-edge attacks, which execute gadgets by modifying
the stack to branch to the attacker's target via RET, by providing a second
stack that is used exclusively to track control transfer operations. The
shadow stack is separate from the data/normal stack, and can be enabled
independently in user and kernel mode.

When SHSTK is enabled, CALL instructions push the return address on both the
data and shadow stack. RET then pops the return address from both stacks and
compares the addresses. If the return addresses from the two stacks do not
match, the CPU generates a Control Protection (#CP) exception.

IBT defends against forward-edge attacks, which branch to gadgets by executing
indirect CALL and JMP instructions with attacker-controlled register or memory
state, by requiring the target of indirect branches to start with a special
marker instruction, ENDBRANCH. If an indirect branch is executed and the next
instruction is not an ENDBRANCH, the CPU generates a #CP. Note, ENDBRANCH
behaves as a NOP if IBT is disabled or unsupported.

From a virtualization perspective, CET presents several problems. While SHSTK
and IBT have two layers of enabling, a global control in the form of a CR4 bit,
and a per-feature control in user and kernel (supervisor) MSRs (U_CET and S_CET
respectively), the {S,U}_CET MSRs can be context switched via XSAVES/XRSTORS.
Practically speaking, intercepting and emulating XSAVES/XRSTORS is not a viable
option due to complexity, and outright disallowing use of XSTATE to context
switch SHSTK/IBT state would render the features unusable to most guests.

To limit the overall complexity without sacrificing performance or usability,
simply ignore the potential virtualization hole, but ensure that all paths in
KVM treat SHSTK/IBT as usable by the guest if the feature is supported in
hardware, and the guest has access to at least one of SHSTK or IBT. I.e. allow
userspace to advertise one of SHSTK or IBT if both are supported in hardware,
even though doing so would allow a misbehaving guest to use the unadvertised
feature.

Fully emulating SHSTK and IBT would also require significant complexity, e.g.
to track and update branch state for IBT, and shadow stack state for SHSTK.
Given that emulating large swaths of the guest code stream isn't necessary on
modern CPUs, punt on emulating instructions that meaningfully impact or consume
SHSTK or IBT. However, instead of doing nothing, explicitly reject emulation
of such instructions so that KVM's emulator can't be abused to circumvent CET.
Disable support for SHSTK and IBT if KVM is configured such that emulation of
arbitrary guest instructions may be required, specifically if Unrestricted
Guest (Intel only) is disabled, or if KVM will emulate a guest.MAXPHYADDR that
is smaller than host.MAXPHYADDR.

Lastly, disable SHSTK support if shadow paging is enabled, as the protections
for the shadow stack are novel (shadow stacks require Writable=0,Dirty=1, so
that they can't be directly modified by software), i.e. would require
non-trivial support in the Shadow MMU.

Note, AMD CPUs currently only support SHSTK. Explicitly disable IBT support
so that KVM doesn't over-advertise IBT if AMD CPUs add support for it, as
virtualizing IBT on SVM would require KVM modifications.

+1563 -95
+13 -1
Documentation/virt/kvm/api.rst
··· 2908 2908 2909 2909 0x9030 0000 0002 <reg:16> 2910 2910 2911 + x86 MSR registers have the following id bit patterns:: 2912 + 0x2030 0002 <msr number:32> 2913 + 2914 + Following are the KVM-defined registers for x86: 2915 + 2916 + ======================= ========= ============================================= 2917 + Encoding Register Description 2918 + ======================= ========= ============================================= 2919 + 0x2030 0003 0000 0000 SSP Shadow Stack Pointer 2920 + ======================= ========= ============================================= 2911 2921 2912 2922 4.69 KVM_GET_ONE_REG 2913 2923 -------------------- ··· 3598 3588 --------------------- 3599 3589 3600 3590 :Capability: basic 3601 - :Architectures: arm64, mips, riscv 3591 + :Architectures: arm64, mips, riscv, x86 (if KVM_CAP_ONE_REG) 3602 3592 :Type: vcpu ioctl 3603 3593 :Parameters: struct kvm_reg_list (in/out) 3604 3594 :Returns: 0 on success; -1 on error ··· 3641 3631 3642 3632 - KVM_REG_S390_GBEA 3643 3633 3634 + Note, for x86, all MSRs enumerated by KVM_GET_MSR_INDEX_LIST are supported as 3635 + type KVM_X86_REG_TYPE_MSR, but are NOT enumerated via KVM_GET_REG_LIST. 3644 3636 3645 3637 4.85 KVM_ARM_SET_DEVICE_ADDR (deprecated) 3646 3638 -----------------------------------------
+4 -2
arch/x86/include/asm/kvm_host.h
··· 142 142 | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_FSGSBASE \ 143 143 | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_VMXE \ 144 144 | X86_CR4_SMAP | X86_CR4_PKE | X86_CR4_UMIP \ 145 - | X86_CR4_LAM_SUP)) 145 + | X86_CR4_LAM_SUP | X86_CR4_CET)) 146 146 147 147 #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) 148 148 ··· 267 267 #define PFERR_RSVD_MASK BIT(3) 268 268 #define PFERR_FETCH_MASK BIT(4) 269 269 #define PFERR_PK_MASK BIT(5) 270 + #define PFERR_SS_MASK BIT(6) 270 271 #define PFERR_SGX_MASK BIT(15) 271 272 #define PFERR_GUEST_RMP_MASK BIT_ULL(31) 272 273 #define PFERR_GUEST_FINAL_MASK BIT_ULL(32) ··· 816 815 bool at_instruction_boundary; 817 816 bool tpr_access_reporting; 818 817 bool xfd_no_write_intercept; 819 - u64 ia32_xss; 820 818 u64 microcode_version; 821 819 u64 arch_capabilities; 822 820 u64 perf_capabilities; ··· 876 876 877 877 u64 xcr0; 878 878 u64 guest_supported_xcr0; 879 + u64 ia32_xss; 880 + u64 guest_supported_xss; 879 881 880 882 struct kvm_pio_request pio; 881 883 void *pio_data;
+9
arch/x86/include/asm/vmx.h
··· 106 106 #define VM_EXIT_CLEAR_BNDCFGS 0x00800000 107 107 #define VM_EXIT_PT_CONCEAL_PIP 0x01000000 108 108 #define VM_EXIT_CLEAR_IA32_RTIT_CTL 0x02000000 109 + #define VM_EXIT_LOAD_CET_STATE 0x10000000 109 110 110 111 #define VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR 0x00036dff 111 112 ··· 120 119 #define VM_ENTRY_LOAD_BNDCFGS 0x00010000 121 120 #define VM_ENTRY_PT_CONCEAL_PIP 0x00020000 122 121 #define VM_ENTRY_LOAD_IA32_RTIT_CTL 0x00040000 122 + #define VM_ENTRY_LOAD_CET_STATE 0x00100000 123 123 124 124 #define VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR 0x000011ff 125 125 ··· 134 132 #define VMX_BASIC_DUAL_MONITOR_TREATMENT BIT_ULL(49) 135 133 #define VMX_BASIC_INOUT BIT_ULL(54) 136 134 #define VMX_BASIC_TRUE_CTLS BIT_ULL(55) 135 + #define VMX_BASIC_NO_HW_ERROR_CODE_CC BIT_ULL(56) 137 136 138 137 static inline u32 vmx_basic_vmcs_revision_id(u64 vmx_basic) 139 138 { ··· 372 369 GUEST_PENDING_DBG_EXCEPTIONS = 0x00006822, 373 370 GUEST_SYSENTER_ESP = 0x00006824, 374 371 GUEST_SYSENTER_EIP = 0x00006826, 372 + GUEST_S_CET = 0x00006828, 373 + GUEST_SSP = 0x0000682a, 374 + GUEST_INTR_SSP_TABLE = 0x0000682c, 375 375 HOST_CR0 = 0x00006c00, 376 376 HOST_CR3 = 0x00006c02, 377 377 HOST_CR4 = 0x00006c04, ··· 387 381 HOST_IA32_SYSENTER_EIP = 0x00006c12, 388 382 HOST_RSP = 0x00006c14, 389 383 HOST_RIP = 0x00006c16, 384 + HOST_S_CET = 0x00006c18, 385 + HOST_SSP = 0x00006c1a, 386 + HOST_INTR_SSP_TABLE = 0x00006c1c 390 387 }; 391 388 392 389 /*
+34
arch/x86/include/uapi/asm/kvm.h
··· 35 35 #define MC_VECTOR 18 36 36 #define XM_VECTOR 19 37 37 #define VE_VECTOR 20 38 + #define CP_VECTOR 21 39 + 40 + #define HV_VECTOR 28 41 + #define VC_VECTOR 29 42 + #define SX_VECTOR 30 38 43 39 44 /* Select x86 specific features in <linux/kvm.h> */ 40 45 #define __KVM_HAVE_PIT ··· 415 410 struct kvm_xcr xcrs[KVM_MAX_XCRS]; 416 411 __u64 padding[16]; 417 412 }; 413 + 414 + #define KVM_X86_REG_TYPE_MSR 2 415 + #define KVM_X86_REG_TYPE_KVM 3 416 + 417 + #define KVM_X86_KVM_REG_SIZE(reg) \ 418 + ({ \ 419 + reg == KVM_REG_GUEST_SSP ? KVM_REG_SIZE_U64 : 0; \ 420 + }) 421 + 422 + #define KVM_X86_REG_TYPE_SIZE(type, reg) \ 423 + ({ \ 424 + __u64 type_size = (__u64)type << 32; \ 425 + \ 426 + type_size |= type == KVM_X86_REG_TYPE_MSR ? KVM_REG_SIZE_U64 : \ 427 + type == KVM_X86_REG_TYPE_KVM ? KVM_X86_KVM_REG_SIZE(reg) : \ 428 + 0; \ 429 + type_size; \ 430 + }) 431 + 432 + #define KVM_X86_REG_ID(type, index) \ 433 + (KVM_REG_X86 | KVM_X86_REG_TYPE_SIZE(type, index) | index) 434 + 435 + #define KVM_X86_REG_MSR(index) \ 436 + KVM_X86_REG_ID(KVM_X86_REG_TYPE_MSR, index) 437 + #define KVM_X86_REG_KVM(index) \ 438 + KVM_X86_REG_ID(KVM_X86_REG_TYPE_KVM, index) 439 + 440 + /* KVM-defined registers starting from 0 */ 441 + #define KVM_REG_GUEST_SSP 0 418 442 419 443 #define KVM_SYNC_X86_REGS (1UL << 0) 420 444 #define KVM_SYNC_X86_SREGS (1UL << 1)
+34 -1
arch/x86/kvm/cpuid.c
··· 263 263 return (best->eax | ((u64)best->edx << 32)) & kvm_caps.supported_xcr0; 264 264 } 265 265 266 + static u64 cpuid_get_supported_xss(struct kvm_vcpu *vcpu) 267 + { 268 + struct kvm_cpuid_entry2 *best; 269 + 270 + best = kvm_find_cpuid_entry_index(vcpu, 0xd, 1); 271 + if (!best) 272 + return 0; 273 + 274 + return (best->ecx | ((u64)best->edx << 32)) & kvm_caps.supported_xss; 275 + } 276 + 266 277 static __always_inline void kvm_update_feature_runtime(struct kvm_vcpu *vcpu, 267 278 struct kvm_cpuid_entry2 *entry, 268 279 unsigned int x86_feature, ··· 316 305 best = kvm_find_cpuid_entry_index(vcpu, 0xD, 1); 317 306 if (best && (cpuid_entry_has(best, X86_FEATURE_XSAVES) || 318 307 cpuid_entry_has(best, X86_FEATURE_XSAVEC))) 319 - best->ebx = xstate_required_size(vcpu->arch.xcr0, true); 308 + best->ebx = xstate_required_size(vcpu->arch.xcr0 | 309 + vcpu->arch.ia32_xss, true); 320 310 } 321 311 322 312 static bool kvm_cpuid_has_hyperv(struct kvm_vcpu *vcpu) ··· 436 424 } 437 425 438 426 vcpu->arch.guest_supported_xcr0 = cpuid_get_supported_xcr0(vcpu); 427 + vcpu->arch.guest_supported_xss = cpuid_get_supported_xss(vcpu); 439 428 440 429 vcpu->arch.pv_cpuid.features = kvm_apply_cpuid_pv_features_quirk(vcpu); 441 430 ··· 946 933 VENDOR_F(WAITPKG), 947 934 F(SGX_LC), 948 935 F(BUS_LOCK_DETECT), 936 + X86_64_F(SHSTK), 949 937 ); 950 938 951 939 /* ··· 955 941 */ 956 942 if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE)) 957 943 kvm_cpu_cap_clear(X86_FEATURE_PKU); 944 + 945 + /* 946 + * Shadow Stacks aren't implemented in the Shadow MMU. Shadow Stack 947 + * accesses require "magic" Writable=0,Dirty=1 protection, which KVM 948 + * doesn't know how to emulate or map. 
949 + */ 950 + if (!tdp_enabled) 951 + kvm_cpu_cap_clear(X86_FEATURE_SHSTK); 958 952 959 953 kvm_cpu_cap_init(CPUID_7_EDX, 960 954 F(AVX512_4VNNIW), ··· 981 959 F(AMX_INT8), 982 960 F(AMX_BF16), 983 961 F(FLUSH_L1D), 962 + F(IBT), 984 963 ); 964 + 965 + /* 966 + * Disable support for IBT and SHSTK if KVM is configured to emulate 967 + * accesses to reserved GPAs, as KVM's emulator doesn't support IBT or 968 + * SHSTK, nor does KVM handle Shadow Stack #PFs (see above). 969 + */ 970 + if (allow_smaller_maxphyaddr) { 971 + kvm_cpu_cap_clear(X86_FEATURE_SHSTK); 972 + kvm_cpu_cap_clear(X86_FEATURE_IBT); 973 + } 985 974 986 975 if (boot_cpu_has(X86_FEATURE_AMD_IBPB_RET) && 987 976 boot_cpu_has(X86_FEATURE_AMD_IBPB) &&
+137 -13
arch/x86/kvm/emulate.c
··· 178 178 #define IncSP ((u64)1 << 54) /* SP is incremented before ModRM calc */ 179 179 #define TwoMemOp ((u64)1 << 55) /* Instruction has two memory operand */ 180 180 #define IsBranch ((u64)1 << 56) /* Instruction is considered a branch. */ 181 + #define ShadowStack ((u64)1 << 57) /* Instruction affects Shadow Stacks. */ 181 182 182 183 #define DstXacc (DstAccLo | SrcAccHi | SrcWrite) 183 184 ··· 1554 1553 return linear_write_system(ctxt, addr, desc, sizeof(*desc)); 1555 1554 } 1556 1555 1556 + static bool emulator_is_ssp_invalid(struct x86_emulate_ctxt *ctxt, u8 cpl) 1557 + { 1558 + const u32 MSR_IA32_X_CET = cpl == 3 ? MSR_IA32_U_CET : MSR_IA32_S_CET; 1559 + u64 efer = 0, cet = 0, ssp = 0; 1560 + 1561 + if (!(ctxt->ops->get_cr(ctxt, 4) & X86_CR4_CET)) 1562 + return false; 1563 + 1564 + if (ctxt->ops->get_msr(ctxt, MSR_EFER, &efer)) 1565 + return true; 1566 + 1567 + /* SSP is guaranteed to be valid if the vCPU was already in 32-bit mode. */ 1568 + if (!(efer & EFER_LMA)) 1569 + return false; 1570 + 1571 + if (ctxt->ops->get_msr(ctxt, MSR_IA32_X_CET, &cet)) 1572 + return true; 1573 + 1574 + if (!(cet & CET_SHSTK_EN)) 1575 + return false; 1576 + 1577 + if (ctxt->ops->get_msr(ctxt, MSR_KVM_INTERNAL_GUEST_SSP, &ssp)) 1578 + return true; 1579 + 1580 + /* 1581 + * On transfer from 64-bit mode to compatibility mode, SSP[63:32] must 1582 + * be 0, i.e. SSP must be a 32-bit value outside of 64-bit mode. 
1583 + */ 1584 + return ssp >> 32; 1585 + } 1586 + 1557 1587 static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, 1558 1588 u16 selector, int seg, u8 cpl, 1559 1589 enum x86_transfer_type transfer, ··· 1724 1692 ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); 1725 1693 if (efer & EFER_LMA) 1726 1694 goto exception; 1695 + } 1696 + if (!seg_desc.l && emulator_is_ssp_invalid(ctxt, cpl)) { 1697 + err_code = 0; 1698 + goto exception; 1727 1699 } 1728 1700 1729 1701 /* CS(RPL) <- CPL */ ··· 4104 4068 static const struct opcode group5[] = { 4105 4069 F(DstMem | SrcNone | Lock, em_inc), 4106 4070 F(DstMem | SrcNone | Lock, em_dec), 4107 - I(SrcMem | NearBranch | IsBranch, em_call_near_abs), 4108 - I(SrcMemFAddr | ImplicitOps | IsBranch, em_call_far), 4071 + I(SrcMem | NearBranch | IsBranch | ShadowStack, em_call_near_abs), 4072 + I(SrcMemFAddr | ImplicitOps | IsBranch | ShadowStack, em_call_far), 4109 4073 I(SrcMem | NearBranch | IsBranch, em_jmp_abs), 4110 4074 I(SrcMemFAddr | ImplicitOps | IsBranch, em_jmp_far), 4111 4075 I(SrcMem | Stack | TwoMemOp, em_push), D(Undefined), ··· 4340 4304 DI(SrcAcc | DstReg, pause), X7(D(SrcAcc | DstReg)), 4341 4305 /* 0x98 - 0x9F */ 4342 4306 D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd), 4343 - I(SrcImmFAddr | No64 | IsBranch, em_call_far), N, 4307 + I(SrcImmFAddr | No64 | IsBranch | ShadowStack, em_call_far), N, 4344 4308 II(ImplicitOps | Stack, em_pushf, pushf), 4345 4309 II(ImplicitOps | Stack, em_popf, popf), 4346 4310 I(ImplicitOps, em_sahf), I(ImplicitOps, em_lahf), ··· 4360 4324 X8(I(DstReg | SrcImm64 | Mov, em_mov)), 4361 4325 /* 0xC0 - 0xC7 */ 4362 4326 G(ByteOp | Src2ImmByte, group2), G(Src2ImmByte, group2), 4363 - I(ImplicitOps | NearBranch | SrcImmU16 | IsBranch, em_ret_near_imm), 4364 - I(ImplicitOps | NearBranch | IsBranch, em_ret), 4327 + I(ImplicitOps | NearBranch | SrcImmU16 | IsBranch | ShadowStack, em_ret_near_imm), 4328 + I(ImplicitOps | NearBranch | IsBranch | ShadowStack, em_ret), 4365 4329 
I(DstReg | SrcMemFAddr | ModRM | No64 | Src2ES, em_lseg), 4366 4330 I(DstReg | SrcMemFAddr | ModRM | No64 | Src2DS, em_lseg), 4367 4331 G(ByteOp, group11), G(0, group11), 4368 4332 /* 0xC8 - 0xCF */ 4369 4333 I(Stack | SrcImmU16 | Src2ImmByte, em_enter), 4370 4334 I(Stack, em_leave), 4371 - I(ImplicitOps | SrcImmU16 | IsBranch, em_ret_far_imm), 4372 - I(ImplicitOps | IsBranch, em_ret_far), 4373 - D(ImplicitOps | IsBranch), DI(SrcImmByte | IsBranch, intn), 4335 + I(ImplicitOps | SrcImmU16 | IsBranch | ShadowStack, em_ret_far_imm), 4336 + I(ImplicitOps | IsBranch | ShadowStack, em_ret_far), 4337 + D(ImplicitOps | IsBranch), DI(SrcImmByte | IsBranch | ShadowStack, intn), 4374 4338 D(ImplicitOps | No64 | IsBranch), 4375 - II(ImplicitOps | IsBranch, em_iret, iret), 4339 + II(ImplicitOps | IsBranch | ShadowStack, em_iret, iret), 4376 4340 /* 0xD0 - 0xD7 */ 4377 4341 G(Src2One | ByteOp, group2), G(Src2One, group2), 4378 4342 G(Src2CL | ByteOp, group2), G(Src2CL, group2), ··· 4388 4352 I2bvIP(SrcImmUByte | DstAcc, em_in, in, check_perm_in), 4389 4353 I2bvIP(SrcAcc | DstImmUByte, em_out, out, check_perm_out), 4390 4354 /* 0xE8 - 0xEF */ 4391 - I(SrcImm | NearBranch | IsBranch, em_call), 4355 + I(SrcImm | NearBranch | IsBranch | ShadowStack, em_call), 4392 4356 D(SrcImm | ImplicitOps | NearBranch | IsBranch), 4393 4357 I(SrcImmFAddr | No64 | IsBranch, em_jmp_far), 4394 4358 D(SrcImmByte | ImplicitOps | NearBranch | IsBranch), ··· 4407 4371 static const struct opcode twobyte_table[256] = { 4408 4372 /* 0x00 - 0x0F */ 4409 4373 G(0, group6), GD(0, &group7), N, N, 4410 - N, I(ImplicitOps | EmulateOnUD | IsBranch, em_syscall), 4374 + N, I(ImplicitOps | EmulateOnUD | IsBranch | ShadowStack, em_syscall), 4411 4375 II(ImplicitOps | Priv, em_clts, clts), N, 4412 4376 DI(ImplicitOps | Priv, invd), DI(ImplicitOps | Priv, wbinvd), N, N, 4413 4377 N, D(ImplicitOps | ModRM | SrcMem | NoAccess), N, N, ··· 4438 4402 IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc), 4439 4403 II(ImplicitOps | 
Priv, em_rdmsr, rdmsr), 4440 4404 IIP(ImplicitOps, em_rdpmc, rdpmc, check_rdpmc), 4441 - I(ImplicitOps | EmulateOnUD | IsBranch, em_sysenter), 4442 - I(ImplicitOps | Priv | EmulateOnUD | IsBranch, em_sysexit), 4405 + I(ImplicitOps | EmulateOnUD | IsBranch | ShadowStack, em_sysenter), 4406 + I(ImplicitOps | Priv | EmulateOnUD | IsBranch | ShadowStack, em_sysexit), 4443 4407 N, N, 4444 4408 N, N, N, N, N, N, N, N, 4445 4409 /* 0x40 - 0x4F */ ··· 4549 4513 #undef I2bv 4550 4514 #undef I2bvIP 4551 4515 #undef I6ALU 4516 + 4517 + static bool is_shstk_instruction(struct x86_emulate_ctxt *ctxt) 4518 + { 4519 + return ctxt->d & ShadowStack; 4520 + } 4521 + 4522 + static bool is_ibt_instruction(struct x86_emulate_ctxt *ctxt) 4523 + { 4524 + u64 flags = ctxt->d; 4525 + 4526 + if (!(flags & IsBranch)) 4527 + return false; 4528 + 4529 + /* 4530 + * All far JMPs and CALLs (including SYSCALL, SYSENTER, and INTn) are 4531 + * indirect and thus affect IBT state. All far RETs (including SYSEXIT 4532 + * and IRET) are protected via Shadow Stacks and thus don't affect IBT 4533 + * state. IRET #GPs when returning to virtual-8086 and IBT or SHSTK is 4534 + * enabled, but that should be handled by IRET emulation (in the very 4535 + * unlikely scenario that KVM adds support for fully emulating IRET). 4536 + */ 4537 + if (!(flags & NearBranch)) 4538 + return ctxt->execute != em_iret && 4539 + ctxt->execute != em_ret_far && 4540 + ctxt->execute != em_ret_far_imm && 4541 + ctxt->execute != em_sysexit; 4542 + 4543 + switch (flags & SrcMask) { 4544 + case SrcReg: 4545 + case SrcMem: 4546 + case SrcMem16: 4547 + case SrcMem32: 4548 + return true; 4549 + case SrcMemFAddr: 4550 + case SrcImmFAddr: 4551 + /* Far branches should be handled above. */ 4552 + WARN_ON_ONCE(1); 4553 + return true; 4554 + case SrcNone: 4555 + case SrcImm: 4556 + case SrcImmByte: 4557 + /* 4558 + * Note, ImmU16 is used only for the stack adjustment operand on ENTER 4559 + * and RET instructions. 
ENTER isn't a branch and RET FAR is handled 4560 + * by the NearBranch check above. RET itself isn't an indirect branch. 4561 + */ 4562 + case SrcImmU16: 4563 + return false; 4564 + default: 4565 + WARN_ONCE(1, "Unexpected Src operand '%llx' on branch", 4566 + flags & SrcMask); 4567 + return false; 4568 + } 4569 + } 4552 4570 4553 4571 static unsigned imm_size(struct x86_emulate_ctxt *ctxt) 4554 4572 { ··· 5032 4942 return EMULATION_FAILED; 5033 4943 5034 4944 ctxt->execute = opcode.u.execute; 4945 + 4946 + /* 4947 + * Reject emulation if KVM might need to emulate shadow stack updates 4948 + * and/or indirect branch tracking enforcement, which the emulator 4949 + * doesn't support. 4950 + */ 4951 + if ((is_ibt_instruction(ctxt) || is_shstk_instruction(ctxt)) && 4952 + ctxt->ops->get_cr(ctxt, 4) & X86_CR4_CET) { 4953 + u64 u_cet = 0, s_cet = 0; 4954 + 4955 + /* 4956 + * Check both User and Supervisor on far transfers as inter- 4957 + * privilege level transfers are impacted by CET at the target 4958 + * privilege level, and that is not known at this time. The 4959 + * expectation is that the guest will not require emulation of 4960 + * any CET-affected instructions at any privilege level. 4961 + */ 4962 + if (!(ctxt->d & NearBranch)) 4963 + u_cet = s_cet = CET_SHSTK_EN | CET_ENDBR_EN; 4964 + else if (ctxt->ops->cpl(ctxt) == 3) 4965 + u_cet = CET_SHSTK_EN | CET_ENDBR_EN; 4966 + else 4967 + s_cet = CET_SHSTK_EN | CET_ENDBR_EN; 4968 + 4969 + if ((u_cet && ctxt->ops->get_msr(ctxt, MSR_IA32_U_CET, &u_cet)) || 4970 + (s_cet && ctxt->ops->get_msr(ctxt, MSR_IA32_S_CET, &s_cet))) 4971 + return EMULATION_FAILED; 4972 + 4973 + if ((u_cet | s_cet) & CET_SHSTK_EN && is_shstk_instruction(ctxt)) 4974 + return EMULATION_FAILED; 4975 + 4976 + if ((u_cet | s_cet) & CET_ENDBR_EN && is_ibt_instruction(ctxt)) 4977 + return EMULATION_FAILED; 4978 + } 5035 4979 5036 4980 if (unlikely(emulation_type & EMULTYPE_TRAP_UD) && 5037 4981 likely(!(ctxt->d & EmulateOnUD)))
+2 -1
arch/x86/kvm/kvm_cache_regs.h
··· 7 7 #define KVM_POSSIBLE_CR0_GUEST_BITS (X86_CR0_TS | X86_CR0_WP) 8 8 #define KVM_POSSIBLE_CR4_GUEST_BITS \ 9 9 (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ 10 - | X86_CR4_OSXMMEXCPT | X86_CR4_PGE | X86_CR4_TSD | X86_CR4_FSGSBASE) 10 + | X86_CR4_OSXMMEXCPT | X86_CR4_PGE | X86_CR4_TSD | X86_CR4_FSGSBASE \ 11 + | X86_CR4_CET) 11 12 12 13 #define X86_CR0_PDPTR_BITS (X86_CR0_CD | X86_CR0_NW | X86_CR0_PG) 13 14 #define X86_CR4_TLBFLUSH_BITS (X86_CR4_PGE | X86_CR4_PCIDE | X86_CR4_PAE | X86_CR4_SMEP)
+1 -1
arch/x86/kvm/mmu.h
··· 212 212 213 213 fault = (mmu->permissions[index] >> pte_access) & 1; 214 214 215 - WARN_ON(pfec & (PFERR_PK_MASK | PFERR_RSVD_MASK)); 215 + WARN_ON_ONCE(pfec & (PFERR_PK_MASK | PFERR_SS_MASK | PFERR_RSVD_MASK)); 216 216 if (unlikely(mmu->pkru_mask)) { 217 217 u32 pkru_bits, offset; 218 218
+3
arch/x86/kvm/mmu/mmutrace.h
··· 51 51 { PFERR_PRESENT_MASK, "P" }, \ 52 52 { PFERR_WRITE_MASK, "W" }, \ 53 53 { PFERR_USER_MASK, "U" }, \ 54 + { PFERR_PK_MASK, "PK" }, \ 55 + { PFERR_SS_MASK, "SS" }, \ 56 + { PFERR_SGX_MASK, "SGX" }, \ 54 57 { PFERR_RSVD_MASK, "RSVD" }, \ 55 58 { PFERR_FETCH_MASK, "F" } 56 59
+8
arch/x86/kvm/smm.c
··· 269 269 enter_smm_save_seg_64(vcpu, &smram->gs, VCPU_SREG_GS); 270 270 271 271 smram->int_shadow = kvm_x86_call(get_interrupt_shadow)(vcpu); 272 + 273 + if (guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) && 274 + kvm_msr_read(vcpu, MSR_KVM_INTERNAL_GUEST_SSP, &smram->ssp)) 275 + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 272 276 } 273 277 #endif 274 278 ··· 561 557 562 558 kvm_x86_call(set_interrupt_shadow)(vcpu, 0); 563 559 ctxt->interruptibility = (u8)smstate->int_shadow; 560 + 561 + if (guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) && 562 + kvm_msr_write(vcpu, MSR_KVM_INTERNAL_GUEST_SSP, smstate->ssp)) 563 + return X86EMUL_UNHANDLEABLE; 564 564 565 565 return X86EMUL_CONTINUE; 566 566 }
+1 -1
arch/x86/kvm/smm.h
··· 116 116 u32 smbase; 117 117 u32 reserved4[5]; 118 118 119 - /* ssp and svm_* fields below are not implemented by KVM */ 120 119 u64 ssp; 120 + /* svm_* fields below are not implemented by KVM */ 121 121 u64 svm_guest_pat; 122 122 u64 svm_host_efer; 123 123 u64 svm_host_cr4;
+20
arch/x86/kvm/svm/nested.c
··· 636 636 vmcb_mark_dirty(vmcb02, VMCB_DT); 637 637 } 638 638 639 + if (guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) && 640 + (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_CET)))) { 641 + vmcb02->save.s_cet = vmcb12->save.s_cet; 642 + vmcb02->save.isst_addr = vmcb12->save.isst_addr; 643 + vmcb02->save.ssp = vmcb12->save.ssp; 644 + vmcb_mark_dirty(vmcb02, VMCB_CET); 645 + } 646 + 639 647 kvm_set_rflags(vcpu, vmcb12->save.rflags | X86_EFLAGS_FIXED); 640 648 641 649 svm_set_efer(vcpu, svm->nested.save.efer); ··· 1052 1044 to_save->rsp = from_save->rsp; 1053 1045 to_save->rip = from_save->rip; 1054 1046 to_save->cpl = 0; 1047 + 1048 + if (kvm_cpu_cap_has(X86_FEATURE_SHSTK)) { 1049 + to_save->s_cet = from_save->s_cet; 1050 + to_save->isst_addr = from_save->isst_addr; 1051 + to_save->ssp = from_save->ssp; 1052 + } 1055 1053 } 1056 1054 1057 1055 void svm_copy_vmloadsave_state(struct vmcb *to_vmcb, struct vmcb *from_vmcb) ··· 1124 1110 vmcb12->save.dr7 = vmcb02->save.dr7; 1125 1111 vmcb12->save.dr6 = svm->vcpu.arch.dr6; 1126 1112 vmcb12->save.cpl = vmcb02->save.cpl; 1113 + 1114 + if (guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) { 1115 + vmcb12->save.s_cet = vmcb02->save.s_cet; 1116 + vmcb12->save.isst_addr = vmcb02->save.isst_addr; 1117 + vmcb12->save.ssp = vmcb02->save.ssp; 1118 + } 1127 1119 1128 1120 vmcb12->control.int_state = vmcb02->control.int_state; 1129 1121 vmcb12->control.exit_code = vmcb02->control.exit_code;
+3
arch/x86/kvm/svm/sev.c
··· 3354 3354 if (kvm_ghcb_xcr0_is_valid(svm)) 3355 3355 __kvm_set_xcr(vcpu, 0, kvm_ghcb_get_xcr0(svm)); 3356 3356 3357 + if (kvm_ghcb_xss_is_valid(svm)) 3358 + __kvm_emulate_msr_write(vcpu, MSR_IA32_XSS, kvm_ghcb_get_xss(svm)); 3359 + 3357 3360 /* Copy the GHCB exit information into the VMCB fields */ 3358 3361 exit_code = kvm_ghcb_get_sw_exit_code(svm); 3359 3362 control->exit_code = lower_32_bits(exit_code);
+62 -18
arch/x86/kvm/svm/svm.c
··· 775 775 svm_disable_intercept_for_msr(vcpu, MSR_IA32_MPERF, MSR_TYPE_R); 776 776 } 777 777 778 + if (kvm_cpu_cap_has(X86_FEATURE_SHSTK)) { 779 + bool shstk_enabled = guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK); 780 + 781 + svm_set_intercept_for_msr(vcpu, MSR_IA32_U_CET, MSR_TYPE_RW, !shstk_enabled); 782 + svm_set_intercept_for_msr(vcpu, MSR_IA32_S_CET, MSR_TYPE_RW, !shstk_enabled); 783 + svm_set_intercept_for_msr(vcpu, MSR_IA32_PL0_SSP, MSR_TYPE_RW, !shstk_enabled); 784 + svm_set_intercept_for_msr(vcpu, MSR_IA32_PL1_SSP, MSR_TYPE_RW, !shstk_enabled); 785 + svm_set_intercept_for_msr(vcpu, MSR_IA32_PL2_SSP, MSR_TYPE_RW, !shstk_enabled); 786 + svm_set_intercept_for_msr(vcpu, MSR_IA32_PL3_SSP, MSR_TYPE_RW, !shstk_enabled); 787 + } 788 + 778 789 if (sev_es_guest(vcpu->kvm)) 779 790 sev_es_recalc_msr_intercepts(vcpu); 780 791 ··· 2652 2641 static bool sev_es_prevent_msr_access(struct kvm_vcpu *vcpu, 2653 2642 struct msr_data *msr_info) 2654 2643 { 2655 - return sev_es_guest(vcpu->kvm) && 2656 - vcpu->arch.guest_state_protected && 2644 + return sev_es_guest(vcpu->kvm) && vcpu->arch.guest_state_protected && 2645 + msr_info->index != MSR_IA32_XSS && 2657 2646 !msr_write_intercepted(vcpu, msr_info->index); 2658 2647 } 2659 2648 ··· 2708 2697 msr_info->data = svm->vmcb01.ptr->save.sysenter_esp; 2709 2698 if (guest_cpuid_is_intel_compatible(vcpu)) 2710 2699 msr_info->data |= (u64)svm->sysenter_esp_hi << 32; 2700 + break; 2701 + case MSR_IA32_S_CET: 2702 + msr_info->data = svm->vmcb->save.s_cet; 2703 + break; 2704 + case MSR_IA32_INT_SSP_TAB: 2705 + msr_info->data = svm->vmcb->save.isst_addr; 2706 + break; 2707 + case MSR_KVM_INTERNAL_GUEST_SSP: 2708 + msr_info->data = svm->vmcb->save.ssp; 2711 2709 break; 2712 2710 case MSR_TSC_AUX: 2713 2711 msr_info->data = svm->tsc_aux; ··· 2949 2929 case MSR_IA32_SYSENTER_ESP: 2950 2930 svm->vmcb01.ptr->save.sysenter_esp = (u32)data; 2951 2931 svm->sysenter_esp_hi = guest_cpuid_is_intel_compatible(vcpu) ? 
(data >> 32) : 0; 2932 + break; 2933 + case MSR_IA32_S_CET: 2934 + svm->vmcb->save.s_cet = data; 2935 + vmcb_mark_dirty(svm->vmcb01.ptr, VMCB_CET); 2936 + break; 2937 + case MSR_IA32_INT_SSP_TAB: 2938 + svm->vmcb->save.isst_addr = data; 2939 + vmcb_mark_dirty(svm->vmcb01.ptr, VMCB_CET); 2940 + break; 2941 + case MSR_KVM_INTERNAL_GUEST_SSP: 2942 + svm->vmcb->save.ssp = data; 2943 + vmcb_mark_dirty(svm->vmcb01.ptr, VMCB_CET); 2952 2944 break; 2953 2945 case MSR_TSC_AUX: 2954 2946 /* ··· 3352 3320 pr_err("%-15s %016llx %-13s %016llx\n", 3353 3321 "rsp:", save->rsp, "rax:", save->rax); 3354 3322 pr_err("%-15s %016llx %-13s %016llx\n", 3323 + "s_cet:", save->s_cet, "ssp:", save->ssp); 3324 + pr_err("%-15s %016llx\n", 3325 + "isst_addr:", save->isst_addr); 3326 + pr_err("%-15s %016llx %-13s %016llx\n", 3355 3327 "star:", save01->star, "lstar:", save01->lstar); 3356 3328 pr_err("%-15s %016llx %-13s %016llx\n", 3357 3329 "cstar:", save01->cstar, "sfmask:", save01->sfmask); ··· 3378 3342 3379 3343 pr_err("%-15s %016llx\n", 3380 3344 "sev_features", vmsa->sev_features); 3345 + 3346 + pr_err("%-15s %016llx %-13s %016llx\n", 3347 + "pl0_ssp:", vmsa->pl0_ssp, "pl1_ssp:", vmsa->pl1_ssp); 3348 + pr_err("%-15s %016llx %-13s %016llx\n", 3349 + "pl2_ssp:", vmsa->pl2_ssp, "pl3_ssp:", vmsa->pl3_ssp); 3350 + pr_err("%-15s %016llx\n", 3351 + "u_cet:", vmsa->u_cet); 3381 3352 3382 3353 pr_err("%-15s %016llx %-13s %016llx\n", 3383 3354 "rax:", vmsa->rax, "rbx:", vmsa->rbx); ··· 5192 5149 kvm_set_cpu_caps(); 5193 5150 5194 5151 kvm_caps.supported_perf_cap = 0; 5195 - kvm_caps.supported_xss = 0; 5152 + 5153 + kvm_cpu_cap_clear(X86_FEATURE_IBT); 5196 5154 5197 5155 /* CPUID 0x80000001 and 0x8000000A (SVM features) */ 5198 5156 if (nested) { ··· 5343 5299 get_npt_level(), PG_LEVEL_1G); 5344 5300 pr_info("Nested Paging %s\n", str_enabled_disabled(npt_enabled)); 5345 5301 5302 + /* 5303 + * It seems that on AMD processors PTE's accessed bit is 5304 + * being set by the CPU hardware before the 
NPF vmexit. 5305 + * This is not expected behaviour and our tests fail because 5306 + * of it. 5307 + * A workaround here is to disable support for 5308 + * GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled. 5309 + * In this case userspace can know if there is support using 5310 + * KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to handle 5311 + * it 5312 + * If future AMD CPU models change the behaviour described above, 5313 + * this variable can be changed accordingly 5314 + */ 5315 + allow_smaller_maxphyaddr = !npt_enabled; 5316 + 5346 5317 /* Setup shadow_me_value and shadow_me_mask */ 5347 5318 kvm_mmu_set_me_spte_mask(sme_me_mask, sme_me_mask); 5348 5319 ··· 5433 5374 pr_info("PMU virtualization is disabled\n"); 5434 5375 5435 5376 svm_set_cpu_caps(); 5436 - 5437 - /* 5438 - * It seems that on AMD processors PTE's accessed bit is 5439 - * being set by the CPU hardware before the NPF vmexit. 5440 - * This is not expected behaviour and our tests fail because 5441 - * of it. 5442 - * A workaround here is to disable support for 5443 - * GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled. 5444 - * In this case userspace can know if there is support using 5445 - * KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to handle 5446 - * it 5447 - * If future AMD CPU models change the behaviour described above, 5448 - * this variable can be changed accordingly 5449 - */ 5450 - allow_smaller_maxphyaddr = !npt_enabled; 5451 5377 5452 5378 kvm_caps.inapplicable_quirks &= ~KVM_X86_QUIRK_CD_NW_CLEARED; 5453 5379 return 0;
+3 -1
arch/x86/kvm/svm/svm.h
··· 77 77 * AVIC PHYSICAL_TABLE pointer, 78 78 * AVIC LOGICAL_TABLE pointer 79 79 */ 80 + VMCB_CET, /* S_CET, SSP, ISST_ADDR */ 80 81 VMCB_SW = 31, /* Reserved for hypervisor/software use */ 81 82 }; 82 83 ··· 86 85 (1U << VMCB_ASID) | (1U << VMCB_INTR) | \ 87 86 (1U << VMCB_NPT) | (1U << VMCB_CR) | (1U << VMCB_DR) | \ 88 87 (1U << VMCB_DT) | (1U << VMCB_SEG) | (1U << VMCB_CR2) | \ 89 - (1U << VMCB_LBR) | (1U << VMCB_AVIC) | \ 88 + (1U << VMCB_LBR) | (1U << VMCB_AVIC) | (1U << VMCB_CET) | \ 90 89 (1U << VMCB_SW)) 91 90 92 91 /* TPR and CR2 are always written before VMRUN */ ··· 943 942 DEFINE_KVM_GHCB_ACCESSORS(sw_exit_info_2) 944 943 DEFINE_KVM_GHCB_ACCESSORS(sw_scratch) 945 944 DEFINE_KVM_GHCB_ACCESSORS(xcr0) 945 + DEFINE_KVM_GHCB_ACCESSORS(xss) 946 946 947 947 #endif
+3 -2
arch/x86/kvm/trace.h
··· 461 461 462 462 #define kvm_trace_sym_exc \ 463 463 EXS(DE), EXS(DB), EXS(BP), EXS(OF), EXS(BR), EXS(UD), EXS(NM), \ 464 - EXS(DF), EXS(TS), EXS(NP), EXS(SS), EXS(GP), EXS(PF), \ 465 - EXS(MF), EXS(AC), EXS(MC) 464 + EXS(DF), EXS(TS), EXS(NP), EXS(SS), EXS(GP), EXS(PF), EXS(MF), \ 465 + EXS(AC), EXS(MC), EXS(XM), EXS(VE), EXS(CP), \ 466 + EXS(HV), EXS(VC), EXS(SX) 466 467 467 468 /* 468 469 * Tracepoint for kvm interrupt injection:
+9
arch/x86/kvm/vmx/capabilities.h
··· 73 73 return vmcs_config.basic & VMX_BASIC_INOUT; 74 74 } 75 75 76 + static inline bool cpu_has_vmx_basic_no_hw_errcode_cc(void) 77 + { 78 + return vmcs_config.basic & VMX_BASIC_NO_HW_ERROR_CODE_CC; 79 + } 80 + 76 81 static inline bool cpu_has_virtual_nmis(void) 77 82 { 78 83 return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS && ··· 105 100 return vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; 106 101 } 107 102 103 + static inline bool cpu_has_load_cet_ctrl(void) 104 + { 105 + return (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_CET_STATE); 106 + } 108 107 static inline bool cpu_has_vmx_mpx(void) 109 108 { 110 109 return vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS;
+169 -17
arch/x86/kvm/vmx/nested.c
··· 721 721 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 722 722 MSR_IA32_MPERF, MSR_TYPE_R); 723 723 724 + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 725 + MSR_IA32_U_CET, MSR_TYPE_RW); 726 + 727 + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 728 + MSR_IA32_S_CET, MSR_TYPE_RW); 729 + 730 + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 731 + MSR_IA32_PL0_SSP, MSR_TYPE_RW); 732 + 733 + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 734 + MSR_IA32_PL1_SSP, MSR_TYPE_RW); 735 + 736 + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 737 + MSR_IA32_PL2_SSP, MSR_TYPE_RW); 738 + 739 + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 740 + MSR_IA32_PL3_SSP, MSR_TYPE_RW); 741 + 724 742 kvm_vcpu_unmap(vcpu, &map); 725 743 726 744 vmx->nested.force_msr_bitmap_recalc = false; ··· 1290 1272 { 1291 1273 const u64 feature_bits = VMX_BASIC_DUAL_MONITOR_TREATMENT | 1292 1274 VMX_BASIC_INOUT | 1293 - VMX_BASIC_TRUE_CTLS; 1275 + VMX_BASIC_TRUE_CTLS | 1276 + VMX_BASIC_NO_HW_ERROR_CODE_CC; 1294 1277 1295 - const u64 reserved_bits = GENMASK_ULL(63, 56) | 1278 + const u64 reserved_bits = GENMASK_ULL(63, 57) | 1296 1279 GENMASK_ULL(47, 45) | 1297 1280 BIT_ULL(31); 1298 1281 ··· 2539 2520 } 2540 2521 } 2541 2522 2523 + static void vmcs_read_cet_state(struct kvm_vcpu *vcpu, u64 *s_cet, 2524 + u64 *ssp, u64 *ssp_tbl) 2525 + { 2526 + if (guest_cpu_cap_has(vcpu, X86_FEATURE_IBT) || 2527 + guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) 2528 + *s_cet = vmcs_readl(GUEST_S_CET); 2529 + 2530 + if (guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) { 2531 + *ssp = vmcs_readl(GUEST_SSP); 2532 + *ssp_tbl = vmcs_readl(GUEST_INTR_SSP_TABLE); 2533 + } 2534 + } 2535 + 2536 + static void vmcs_write_cet_state(struct kvm_vcpu *vcpu, u64 s_cet, 2537 + u64 ssp, u64 ssp_tbl) 2538 + { 2539 + if (guest_cpu_cap_has(vcpu, X86_FEATURE_IBT) || 2540 + guest_cpu_cap_has(vcpu, 
X86_FEATURE_SHSTK)) 2541 + vmcs_writel(GUEST_S_CET, s_cet); 2542 + 2543 + if (guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) { 2544 + vmcs_writel(GUEST_SSP, ssp); 2545 + vmcs_writel(GUEST_INTR_SSP_TABLE, ssp_tbl); 2546 + } 2547 + } 2548 + 2542 2549 static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) 2543 2550 { 2544 2551 struct hv_enlightened_vmcs *hv_evmcs = nested_vmx_evmcs(vmx); ··· 2681 2636 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 2682 2637 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 2683 2638 2639 + if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE) 2640 + vmcs_write_cet_state(&vmx->vcpu, vmcs12->guest_s_cet, 2641 + vmcs12->guest_ssp, vmcs12->guest_ssp_tbl); 2642 + 2684 2643 set_cr4_guest_host_mask(vmx); 2685 2644 } 2686 2645 ··· 2724 2675 kvm_set_dr(vcpu, 7, vcpu->arch.dr7); 2725 2676 vmx_guest_debugctl_write(vcpu, vmx->nested.pre_vmenter_debugctl); 2726 2677 } 2678 + 2679 + if (!vmx->nested.nested_run_pending || 2680 + !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE)) 2681 + vmcs_write_cet_state(vcpu, vmx->nested.pre_vmenter_s_cet, 2682 + vmx->nested.pre_vmenter_ssp, 2683 + vmx->nested.pre_vmenter_ssp_tbl); 2684 + 2727 2685 if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending || 2728 2686 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) 2729 2687 vmcs_write64(GUEST_BNDCFGS, vmx->nested.pre_vmenter_bndcfgs); ··· 3005 2949 u8 vector = intr_info & INTR_INFO_VECTOR_MASK; 3006 2950 u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK; 3007 2951 bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK; 3008 - bool should_have_error_code; 3009 2952 bool urg = nested_cpu_has2(vmcs12, 3010 2953 SECONDARY_EXEC_UNRESTRICTED_GUEST); 3011 2954 bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE; ··· 3021 2966 CC(intr_type == INTR_TYPE_OTHER_EVENT && vector != 0)) 3022 2967 return -EINVAL; 3023 2968 3024 - /* VM-entry interruption-info field: deliver error code */ 3025 
- should_have_error_code = 3026 - intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode && 3027 - x86_exception_has_error_code(vector); 3028 - if (CC(has_error_code != should_have_error_code)) 3029 - return -EINVAL; 2969 + /* 2970 + * Cannot deliver error code in real mode or if the interrupt 2971 + * type is not hardware exception. For other cases, do the 2972 + * consistency check only if the vCPU doesn't enumerate 2973 + * VMX_BASIC_NO_HW_ERROR_CODE_CC. 2974 + */ 2975 + if (!prot_mode || intr_type != INTR_TYPE_HARD_EXCEPTION) { 2976 + if (CC(has_error_code)) 2977 + return -EINVAL; 2978 + } else if (!nested_cpu_has_no_hw_errcode_cc(vcpu)) { 2979 + if (CC(has_error_code != x86_exception_has_error_code(vector))) 2980 + return -EINVAL; 2981 + } 3030 2982 3031 2983 /* VM-entry exception error code */ 3032 2984 if (CC(has_error_code && ··· 3100 3038 return !__is_canonical_address(la, l1_address_bits_on_exit); 3101 3039 } 3102 3040 3041 + static int nested_vmx_check_cet_state_common(struct kvm_vcpu *vcpu, u64 s_cet, 3042 + u64 ssp, u64 ssp_tbl) 3043 + { 3044 + if (CC(!kvm_is_valid_u_s_cet(vcpu, s_cet)) || CC(!IS_ALIGNED(ssp, 4)) || 3045 + CC(is_noncanonical_msr_address(ssp_tbl, vcpu))) 3046 + return -EINVAL; 3047 + 3048 + return 0; 3049 + } 3050 + 3103 3051 static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, 3104 3052 struct vmcs12 *vmcs12) 3105 3053 { ··· 3118 3046 if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) || 3119 3047 CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) || 3120 3048 CC(!kvm_vcpu_is_legal_cr3(vcpu, vmcs12->host_cr3))) 3049 + return -EINVAL; 3050 + 3051 + if (CC(vmcs12->host_cr4 & X86_CR4_CET && !(vmcs12->host_cr0 & X86_CR0_WP))) 3121 3052 return -EINVAL; 3122 3053 3123 3054 if (CC(is_noncanonical_msr_address(vmcs12->host_ia32_sysenter_esp, vcpu)) || ··· 3177 3102 CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA)) || 3178 3103 CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))) 3179 3104 return -EINVAL; 3105 + } 3106 + 3107 + if 
(vmcs12->vm_exit_controls & VM_EXIT_LOAD_CET_STATE) { 3108 + if (nested_vmx_check_cet_state_common(vcpu, vmcs12->host_s_cet, 3109 + vmcs12->host_ssp, 3110 + vmcs12->host_ssp_tbl)) 3111 + return -EINVAL; 3112 + 3113 + /* 3114 + * IA32_S_CET and SSP must be canonical if the host will 3115 + * enter 64-bit mode after VM-exit; otherwise, higher 3116 + * 32-bits must be all 0s. 3117 + */ 3118 + if (ia32e) { 3119 + if (CC(is_noncanonical_msr_address(vmcs12->host_s_cet, vcpu)) || 3120 + CC(is_noncanonical_msr_address(vmcs12->host_ssp, vcpu))) 3121 + return -EINVAL; 3122 + } else { 3123 + if (CC(vmcs12->host_s_cet >> 32) || CC(vmcs12->host_ssp >> 32)) 3124 + return -EINVAL; 3125 + } 3180 3126 } 3181 3127 3182 3128 return 0; ··· 3258 3162 CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))) 3259 3163 return -EINVAL; 3260 3164 3165 + if (CC(vmcs12->guest_cr4 & X86_CR4_CET && !(vmcs12->guest_cr0 & X86_CR0_WP))) 3166 + return -EINVAL; 3167 + 3261 3168 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) && 3262 3169 (CC(!kvm_dr7_valid(vmcs12->guest_dr7)) || 3263 3170 CC(!vmx_is_valid_debugctl(vcpu, vmcs12->guest_ia32_debugctl, false)))) ··· 3309 3210 (CC(is_noncanonical_msr_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) || 3310 3211 CC((vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD)))) 3311 3212 return -EINVAL; 3213 + 3214 + if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE) { 3215 + if (nested_vmx_check_cet_state_common(vcpu, vmcs12->guest_s_cet, 3216 + vmcs12->guest_ssp, 3217 + vmcs12->guest_ssp_tbl)) 3218 + return -EINVAL; 3219 + 3220 + /* 3221 + * Guest SSP must have 63:N bits identical, rather than 3222 + * be canonical (i.e., 63:N-1 bits identical), where N is 3223 + * the CPU's maximum linear-address width. Similar to 3224 + * is_noncanonical_msr_address(), use the host's 3225 + * linear-address width. 
3226 + */ 3227 + if (CC(!__is_canonical_address(vmcs12->guest_ssp, max_host_virt_addr_bits() + 1))) 3228 + return -EINVAL; 3229 + } 3312 3230 3313 3231 if (nested_check_guest_non_reg_state(vmcs12)) 3314 3232 return -EINVAL; ··· 3659 3543 (!vmx->nested.nested_run_pending || 3660 3544 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) 3661 3545 vmx->nested.pre_vmenter_bndcfgs = vmcs_read64(GUEST_BNDCFGS); 3546 + 3547 + if (!vmx->nested.nested_run_pending || 3548 + !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE)) 3549 + vmcs_read_cet_state(vcpu, &vmx->nested.pre_vmenter_s_cet, 3550 + &vmx->nested.pre_vmenter_ssp, 3551 + &vmx->nested.pre_vmenter_ssp_tbl); 3662 3552 3663 3553 /* 3664 3554 * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and* ··· 4749 4627 4750 4628 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) 4751 4629 vmcs12->guest_ia32_efer = vcpu->arch.efer; 4630 + 4631 + vmcs_read_cet_state(&vmx->vcpu, &vmcs12->guest_s_cet, 4632 + &vmcs12->guest_ssp, 4633 + &vmcs12->guest_ssp_tbl); 4752 4634 } 4753 4635 4754 4636 /* ··· 4877 4751 /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */ 4878 4752 if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS) 4879 4753 vmcs_write64(GUEST_BNDCFGS, 0); 4754 + 4755 + /* 4756 + * Load CET state from host state if VM_EXIT_LOAD_CET_STATE is set. 4757 + * otherwise CET state should be retained across VM-exit, i.e., 4758 + * guest values should be propagated from vmcs12 to vmcs01. 
4759 + */ 4760 + if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_CET_STATE) 4761 + vmcs_write_cet_state(vcpu, vmcs12->host_s_cet, vmcs12->host_ssp, 4762 + vmcs12->host_ssp_tbl); 4763 + else 4764 + vmcs_write_cet_state(vcpu, vmcs12->guest_s_cet, vmcs12->guest_ssp, 4765 + vmcs12->guest_ssp_tbl); 4880 4766 4881 4767 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) { 4882 4768 vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); ··· 6708 6570 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); 6709 6571 case EXIT_REASON_XSETBV: 6710 6572 return true; 6711 - case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS: 6573 + case EXIT_REASON_XSAVES: 6574 + case EXIT_REASON_XRSTORS: 6712 6575 /* 6713 - * This should never happen, since it is not possible to 6714 - * set XSS to a non-zero value---neither in L1 nor in L2. 6715 - * If if it were, XSS would have to be checked against 6716 - * the XSS exit bitmap in vmcs12. 6576 + * Always forward XSAVES/XRSTORS to L1 as KVM doesn't utilize 6577 + * XSS-bitmap, and always loads vmcs02 with vmcs12's XSS-bitmap 6578 + * verbatim, i.e. any exit is due to L1's bitmap. WARN if 6579 + * XSAVES isn't enabled, as the CPU is supposed to inject #UD 6580 + * in that case, before consulting the XSS-bitmap. 
6717 6581 */ 6718 - return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_XSAVES); 6582 + WARN_ON_ONCE(!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_XSAVES)); 6583 + return true; 6719 6584 case EXIT_REASON_UMWAIT: 6720 6585 case EXIT_REASON_TPAUSE: 6721 6586 return nested_cpu_has2(vmcs12, ··· 7179 7038 VM_EXIT_HOST_ADDR_SPACE_SIZE | 7180 7039 #endif 7181 7040 VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT | 7182 - VM_EXIT_CLEAR_BNDCFGS; 7041 + VM_EXIT_CLEAR_BNDCFGS | VM_EXIT_LOAD_CET_STATE; 7183 7042 msrs->exit_ctls_high |= 7184 7043 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | 7185 7044 VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | 7186 7045 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT | 7187 7046 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; 7047 + 7048 + if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) && 7049 + !kvm_cpu_cap_has(X86_FEATURE_IBT)) 7050 + msrs->exit_ctls_high &= ~VM_EXIT_LOAD_CET_STATE; 7188 7051 7189 7052 /* We support free control of debug control saving. */ 7190 7053 msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS; ··· 7205 7060 #ifdef CONFIG_X86_64 7206 7061 VM_ENTRY_IA32E_MODE | 7207 7062 #endif 7208 - VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS; 7063 + VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS | 7064 + VM_ENTRY_LOAD_CET_STATE; 7209 7065 msrs->entry_ctls_high |= 7210 7066 (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER | 7211 7067 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL); 7068 + 7069 + if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) && 7070 + !kvm_cpu_cap_has(X86_FEATURE_IBT)) 7071 + msrs->entry_ctls_high &= ~VM_ENTRY_LOAD_CET_STATE; 7212 7072 7213 7073 /* We support free control of debug control loading. 
*/ 7214 7074 msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS; ··· 7364 7214 msrs->basic |= VMX_BASIC_TRUE_CTLS; 7365 7215 if (cpu_has_vmx_basic_inout()) 7366 7216 msrs->basic |= VMX_BASIC_INOUT; 7217 + if (cpu_has_vmx_basic_no_hw_errcode_cc()) 7218 + msrs->basic |= VMX_BASIC_NO_HW_ERROR_CODE_CC; 7367 7219 } 7368 7220 7369 7221 static void nested_vmx_setup_cr_fixed(struct nested_vmx_msrs *msrs)
+5
arch/x86/kvm/vmx/nested.h
··· 309 309 __kvm_is_valid_cr4(vcpu, val); 310 310 } 311 311 312 + static inline bool nested_cpu_has_no_hw_errcode_cc(struct kvm_vcpu *vcpu) 313 + { 314 + return to_vmx(vcpu)->nested.msrs.basic & VMX_BASIC_NO_HW_ERROR_CODE_CC; 315 + } 316 + 312 317 /* No difference in the restrictions on guest and host CR4 in VMX operation. */ 313 318 #define nested_guest_cr4_valid nested_cr4_valid 314 319 #define nested_host_cr4_valid nested_cr4_valid
+6
arch/x86/kvm/vmx/vmcs12.c
··· 139 139 FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions), 140 140 FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp), 141 141 FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip), 142 + FIELD(GUEST_S_CET, guest_s_cet), 143 + FIELD(GUEST_SSP, guest_ssp), 144 + FIELD(GUEST_INTR_SSP_TABLE, guest_ssp_tbl), 142 145 FIELD(HOST_CR0, host_cr0), 143 146 FIELD(HOST_CR3, host_cr3), 144 147 FIELD(HOST_CR4, host_cr4), ··· 154 151 FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip), 155 152 FIELD(HOST_RSP, host_rsp), 156 153 FIELD(HOST_RIP, host_rip), 154 + FIELD(HOST_S_CET, host_s_cet), 155 + FIELD(HOST_SSP, host_ssp), 156 + FIELD(HOST_INTR_SSP_TABLE, host_ssp_tbl), 157 157 }; 158 158 const unsigned int nr_vmcs12_fields = ARRAY_SIZE(vmcs12_field_offsets);
+13 -1
arch/x86/kvm/vmx/vmcs12.h
··· 117 117 natural_width host_ia32_sysenter_eip; 118 118 natural_width host_rsp; 119 119 natural_width host_rip; 120 - natural_width paddingl[8]; /* room for future expansion */ 120 + natural_width host_s_cet; 121 + natural_width host_ssp; 122 + natural_width host_ssp_tbl; 123 + natural_width guest_s_cet; 124 + natural_width guest_ssp; 125 + natural_width guest_ssp_tbl; 126 + natural_width paddingl[2]; /* room for future expansion */ 121 127 u32 pin_based_vm_exec_control; 122 128 u32 cpu_based_vm_exec_control; 123 129 u32 exception_bitmap; ··· 300 294 CHECK_OFFSET(host_ia32_sysenter_eip, 656); 301 295 CHECK_OFFSET(host_rsp, 664); 302 296 CHECK_OFFSET(host_rip, 672); 297 + CHECK_OFFSET(host_s_cet, 680); 298 + CHECK_OFFSET(host_ssp, 688); 299 + CHECK_OFFSET(host_ssp_tbl, 696); 300 + CHECK_OFFSET(guest_s_cet, 704); 301 + CHECK_OFFSET(guest_ssp, 712); 302 + CHECK_OFFSET(guest_ssp_tbl, 720); 303 303 CHECK_OFFSET(pin_based_vm_exec_control, 744); 304 304 CHECK_OFFSET(cpu_based_vm_exec_control, 748); 305 305 CHECK_OFFSET(exception_bitmap, 752);
+98 -11
arch/x86/kvm/vmx/vmx.c
··· 2106 2106 else 2107 2107 msr_info->data = vmx->pt_desc.guest.addr_a[index / 2]; 2108 2108 break; 2109 + case MSR_IA32_S_CET: 2110 + msr_info->data = vmcs_readl(GUEST_S_CET); 2111 + break; 2112 + case MSR_KVM_INTERNAL_GUEST_SSP: 2113 + msr_info->data = vmcs_readl(GUEST_SSP); 2114 + break; 2115 + case MSR_IA32_INT_SSP_TAB: 2116 + msr_info->data = vmcs_readl(GUEST_INTR_SSP_TABLE); 2117 + break; 2109 2118 case MSR_IA32_DEBUGCTLMSR: 2110 2119 msr_info->data = vmx_guest_debugctl_read(); 2111 2120 break; ··· 2433 2424 else 2434 2425 vmx->pt_desc.guest.addr_a[index / 2] = data; 2435 2426 break; 2427 + case MSR_IA32_S_CET: 2428 + vmcs_writel(GUEST_S_CET, data); 2429 + break; 2430 + case MSR_KVM_INTERNAL_GUEST_SSP: 2431 + vmcs_writel(GUEST_SSP, data); 2432 + break; 2433 + case MSR_IA32_INT_SSP_TAB: 2434 + vmcs_writel(GUEST_INTR_SSP_TABLE, data); 2435 + break; 2436 2436 case MSR_IA32_PERF_CAPABILITIES: 2437 2437 if (data & PERF_CAP_LBR_FMT) { 2438 2438 if ((data & PERF_CAP_LBR_FMT) != ··· 2615 2597 { VM_ENTRY_LOAD_IA32_EFER, VM_EXIT_LOAD_IA32_EFER }, 2616 2598 { VM_ENTRY_LOAD_BNDCFGS, VM_EXIT_CLEAR_BNDCFGS }, 2617 2599 { VM_ENTRY_LOAD_IA32_RTIT_CTL, VM_EXIT_CLEAR_IA32_RTIT_CTL }, 2600 + { VM_ENTRY_LOAD_CET_STATE, VM_EXIT_LOAD_CET_STATE }, 2618 2601 }; 2619 2602 2620 2603 memset(vmcs_conf, 0, sizeof(*vmcs_conf)); ··· 4102 4083 4103 4084 static void vmx_recalc_msr_intercepts(struct kvm_vcpu *vcpu) 4104 4085 { 4086 + bool intercept; 4087 + 4105 4088 if (!cpu_has_vmx_msr_bitmap()) 4106 4089 return; 4107 4090 ··· 4148 4127 if (cpu_feature_enabled(X86_FEATURE_FLUSH_L1D)) 4149 4128 vmx_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W, 4150 4129 !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D)); 4130 + 4131 + if (kvm_cpu_cap_has(X86_FEATURE_SHSTK)) { 4132 + intercept = !guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK); 4133 + 4134 + vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL0_SSP, MSR_TYPE_RW, intercept); 4135 + vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL1_SSP, MSR_TYPE_RW, 
intercept); 4136 + vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL2_SSP, MSR_TYPE_RW, intercept); 4137 + vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL3_SSP, MSR_TYPE_RW, intercept); 4138 + } 4139 + 4140 + if (kvm_cpu_cap_has(X86_FEATURE_SHSTK) || kvm_cpu_cap_has(X86_FEATURE_IBT)) { 4141 + intercept = !guest_cpu_cap_has(vcpu, X86_FEATURE_IBT) && 4142 + !guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK); 4143 + 4144 + vmx_set_intercept_for_msr(vcpu, MSR_IA32_U_CET, MSR_TYPE_RW, intercept); 4145 + vmx_set_intercept_for_msr(vcpu, MSR_IA32_S_CET, MSR_TYPE_RW, intercept); 4146 + } 4151 4147 4152 4148 /* 4153 4149 * x2APIC and LBR MSR intercepts are modified on-demand and cannot be ··· 4326 4288 4327 4289 if (cpu_has_load_ia32_efer()) 4328 4290 vmcs_write64(HOST_IA32_EFER, kvm_host.efer); 4291 + 4292 + /* 4293 + * Supervisor shadow stack is not enabled on host side, i.e., 4294 + * host IA32_S_CET.SHSTK_EN bit is guaranteed to 0 now, per SDM 4295 + * description(RDSSP instruction), SSP is not readable in CPL0, 4296 + * so resetting the two registers to 0s at VM-Exit does no harm 4297 + * to kernel execution. When execution flow exits to userspace, 4298 + * SSP is reloaded from IA32_PL3_SSP. Check SDM Vol.2A/B Chapter 4299 + * 3 and 4 for details. 
4300 + */ 4301 + if (cpu_has_load_cet_ctrl()) { 4302 + vmcs_writel(HOST_S_CET, kvm_host.s_cet); 4303 + vmcs_writel(HOST_SSP, 0); 4304 + vmcs_writel(HOST_INTR_SSP_TABLE, 0); 4305 + } 4329 4306 } 4330 4307 4331 4308 void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) ··· 4881 4828 vmcs_write64(GUEST_BNDCFGS, 0); 4882 4829 4883 4830 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ 4831 + 4832 + if (kvm_cpu_cap_has(X86_FEATURE_SHSTK)) { 4833 + vmcs_writel(GUEST_SSP, 0); 4834 + vmcs_writel(GUEST_INTR_SSP_TABLE, 0); 4835 + } 4836 + if (kvm_cpu_cap_has(X86_FEATURE_IBT) || 4837 + kvm_cpu_cap_has(X86_FEATURE_SHSTK)) 4838 + vmcs_writel(GUEST_S_CET, 0); 4884 4839 4885 4840 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); 4886 4841 ··· 6364 6303 if (vmcs_read32(VM_EXIT_MSR_STORE_COUNT) > 0) 6365 6304 vmx_dump_msrs("guest autostore", &vmx->msr_autostore.guest); 6366 6305 6306 + if (vmentry_ctl & VM_ENTRY_LOAD_CET_STATE) 6307 + pr_err("S_CET = 0x%016lx, SSP = 0x%016lx, SSP TABLE = 0x%016lx\n", 6308 + vmcs_readl(GUEST_S_CET), vmcs_readl(GUEST_SSP), 6309 + vmcs_readl(GUEST_INTR_SSP_TABLE)); 6367 6310 pr_err("*** Host State ***\n"); 6368 6311 pr_err("RIP = 0x%016lx RSP = 0x%016lx\n", 6369 6312 vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP)); ··· 6398 6333 vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL)); 6399 6334 if (vmcs_read32(VM_EXIT_MSR_LOAD_COUNT) > 0) 6400 6335 vmx_dump_msrs("host autoload", &vmx->msr_autoload.host); 6336 + if (vmexit_ctl & VM_EXIT_LOAD_CET_STATE) 6337 + pr_err("S_CET = 0x%016lx, SSP = 0x%016lx, SSP TABLE = 0x%016lx\n", 6338 + vmcs_readl(HOST_S_CET), vmcs_readl(HOST_SSP), 6339 + vmcs_readl(HOST_INTR_SSP_TABLE)); 6401 6340 6402 6341 pr_err("*** Control State ***\n"); 6403 6342 pr_err("CPUBased=0x%08x SecondaryExec=0x%08x TertiaryExec=0x%016llx\n", ··· 7755 7686 cr4_fixed1_update(X86_CR4_PKE, ecx, feature_bit(PKU)); 7756 7687 cr4_fixed1_update(X86_CR4_UMIP, ecx, feature_bit(UMIP)); 7757 7688 cr4_fixed1_update(X86_CR4_LA57, ecx, feature_bit(LA57)); 7689 + 
cr4_fixed1_update(X86_CR4_CET, ecx, feature_bit(SHSTK)); 7690 + cr4_fixed1_update(X86_CR4_CET, edx, feature_bit(IBT)); 7758 7691 7759 7692 entry = kvm_find_cpuid_entry_index(vcpu, 0x7, 1); 7760 7693 cr4_fixed1_update(X86_CR4_LAM_SUP, eax, feature_bit(LAM)); ··· 7985 7914 kvm_cpu_cap_set(X86_FEATURE_UMIP); 7986 7915 7987 7916 /* CPUID 0xD.1 */ 7988 - kvm_caps.supported_xss = 0; 7989 7917 if (!cpu_has_vmx_xsaves()) 7990 7918 kvm_cpu_cap_clear(X86_FEATURE_XSAVES); 7991 7919 ··· 7996 7926 7997 7927 if (cpu_has_vmx_waitpkg()) 7998 7928 kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG); 7929 + 7930 + /* 7931 + * Disable CET if unrestricted_guest is unsupported as KVM doesn't 7932 + * enforce CET HW behaviors in emulator. On platforms with 7933 + * VMX_BASIC[bit56] == 0, inject #CP at VMX entry with error code 7934 + * fails, so disable CET in this case too. 7935 + */ 7936 + if (!cpu_has_load_cet_ctrl() || !enable_unrestricted_guest || 7937 + !cpu_has_vmx_basic_no_hw_errcode_cc()) { 7938 + kvm_cpu_cap_clear(X86_FEATURE_SHSTK); 7939 + kvm_cpu_cap_clear(X86_FEATURE_IBT); 7940 + } 7999 7941 } 8000 7942 8001 7943 static bool vmx_is_io_intercepted(struct kvm_vcpu *vcpu, ··· 8486 8404 return -EOPNOTSUPP; 8487 8405 } 8488 8406 8407 + /* 8408 + * Shadow paging doesn't have a (further) performance penalty 8409 + * from GUEST_MAXPHYADDR < HOST_MAXPHYADDR so enable it 8410 + * by default 8411 + */ 8412 + if (!enable_ept) 8413 + allow_smaller_maxphyaddr = true; 8414 + 8489 8415 if (!cpu_has_vmx_ept_ad_bits() || !enable_ept) 8490 8416 enable_ept_ad_bits = 0; 8491 8417 ··· 8619 8529 8620 8530 setup_default_sgx_lepubkeyhash(); 8621 8531 8532 + vmx_set_cpu_caps(); 8533 + 8534 + /* 8535 + * Configure nested capabilities after core CPU capabilities so that 8536 + * nested support can be conditional on base support, e.g. so that KVM 8537 + * can hide/show features based on kvm_cpu_cap_has(). 
8538 + */ 8622 8539 if (nested) { 8623 8540 nested_vmx_setup_ctls_msrs(&vmcs_config, vmx_capability.ept); 8624 8541 ··· 8633 8536 if (r) 8634 8537 return r; 8635 8538 } 8636 - 8637 - vmx_set_cpu_caps(); 8638 8539 8639 8540 r = alloc_kvm_area(); 8640 8541 if (r && nested) ··· 8729 8634 } 8730 8635 8731 8636 vmx_check_vmcs12_offsets(); 8732 - 8733 - /* 8734 - * Shadow paging doesn't have a (further) performance penalty 8735 - * from GUEST_MAXPHYADDR < HOST_MAXPHYADDR so enable it 8736 - * by default 8737 - */ 8738 - if (!enable_ept) 8739 - allow_smaller_maxphyaddr = true; 8740 8637 8741 8638 return 0; 8742 8639
+7 -2
arch/x86/kvm/vmx/vmx.h
··· 181 181 */ 182 182 u64 pre_vmenter_debugctl; 183 183 u64 pre_vmenter_bndcfgs; 184 + u64 pre_vmenter_s_cet; 185 + u64 pre_vmenter_ssp; 186 + u64 pre_vmenter_ssp_tbl; 184 187 185 188 /* to migrate it to L1 if L2 writes to L1's CR8 directly */ 186 189 int l1_tpr_threshold; ··· 487 484 VM_ENTRY_LOAD_IA32_EFER | \ 488 485 VM_ENTRY_LOAD_BNDCFGS | \ 489 486 VM_ENTRY_PT_CONCEAL_PIP | \ 490 - VM_ENTRY_LOAD_IA32_RTIT_CTL) 487 + VM_ENTRY_LOAD_IA32_RTIT_CTL | \ 488 + VM_ENTRY_LOAD_CET_STATE) 491 489 492 490 #define __KVM_REQUIRED_VMX_VM_EXIT_CONTROLS \ 493 491 (VM_EXIT_SAVE_DEBUG_CONTROLS | \ ··· 510 506 VM_EXIT_LOAD_IA32_EFER | \ 511 507 VM_EXIT_CLEAR_BNDCFGS | \ 512 508 VM_EXIT_PT_CONCEAL_PIP | \ 513 - VM_EXIT_CLEAR_IA32_RTIT_CTL) 509 + VM_EXIT_CLEAR_IA32_RTIT_CTL | \ 510 + VM_EXIT_LOAD_CET_STATE) 514 511 515 512 #define KVM_REQUIRED_VMX_PIN_BASED_VM_EXEC_CONTROL \ 516 513 (PIN_BASED_EXT_INTR_MASK | \
+387 -23
arch/x86/kvm/x86.c
··· 136 136 static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2); 137 137 138 138 static DEFINE_MUTEX(vendor_module_lock); 139 + static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu); 140 + static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu); 141 + 139 142 struct kvm_x86_ops kvm_x86_ops __read_mostly; 140 143 141 144 #define KVM_X86_OP(func) \ ··· 219 216 | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \ 220 217 | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \ 221 218 | XFEATURE_MASK_PKRU | XFEATURE_MASK_XTILE) 219 + 220 + #define XFEATURE_MASK_CET_ALL (XFEATURE_MASK_CET_USER | XFEATURE_MASK_CET_KERNEL) 221 + /* 222 + * Note, KVM supports exposing PT to the guest, but does not support context 223 + * switching PT via XSTATE (KVM's PT virtualization relies on perf; swapping 224 + * PT via guest XSTATE would clobber perf state), i.e. KVM doesn't support 225 + * IA32_XSS[bit 8] (guests can/must use RDMSR/WRMSR to save/restore PT MSRs). 226 + */ 227 + #define KVM_SUPPORTED_XSS (XFEATURE_MASK_CET_ALL) 222 228 223 229 bool __read_mostly allow_smaller_maxphyaddr = 0; 224 230 EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr); ··· 344 332 MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B, 345 333 MSR_IA32_UMWAIT_CONTROL, 346 334 347 - MSR_IA32_XFD, MSR_IA32_XFD_ERR, 335 + MSR_IA32_XFD, MSR_IA32_XFD_ERR, MSR_IA32_XSS, 336 + 337 + MSR_IA32_U_CET, MSR_IA32_S_CET, 338 + MSR_IA32_PL0_SSP, MSR_IA32_PL1_SSP, MSR_IA32_PL2_SSP, 339 + MSR_IA32_PL3_SSP, MSR_IA32_INT_SSP_TAB, 348 340 }; 349 341 350 342 static const u32 msrs_to_save_pmu[] = { ··· 1187 1171 (is_64_bit_mode(vcpu) || kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE))) 1188 1172 return 1; 1189 1173 1174 + if (!(cr0 & X86_CR0_WP) && kvm_is_cr4_bit_set(vcpu, X86_CR4_CET)) 1175 + return 1; 1176 + 1190 1177 kvm_x86_call(set_cr0)(vcpu, cr0); 1191 1178 1192 1179 kvm_post_set_cr0(vcpu, old_cr0, cr0); ··· 1389 1370 if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu)) 1390 1371 return 1; 1391 1372 } 1373 + 1374 + if 
((cr4 & X86_CR4_CET) && !kvm_is_cr0_bit_set(vcpu, X86_CR0_WP)) 1375 + return 1; 1392 1376 1393 1377 kvm_x86_call(set_cr4)(vcpu, cr4); 1394 1378 ··· 1897 1875 1898 1876 data = (u32)data; 1899 1877 break; 1878 + case MSR_IA32_U_CET: 1879 + case MSR_IA32_S_CET: 1880 + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) && 1881 + !guest_cpu_cap_has(vcpu, X86_FEATURE_IBT)) 1882 + return KVM_MSR_RET_UNSUPPORTED; 1883 + if (!kvm_is_valid_u_s_cet(vcpu, data)) 1884 + return 1; 1885 + break; 1886 + case MSR_KVM_INTERNAL_GUEST_SSP: 1887 + if (!host_initiated) 1888 + return 1; 1889 + fallthrough; 1890 + /* 1891 + * Note that the MSR emulation here is flawed when a vCPU 1892 + * doesn't support the Intel 64 architecture. The expected 1893 + * architectural behavior in this case is that the upper 32 1894 + * bits do not exist and should always read '0'. However, 1895 + * because the actual hardware on which the virtual CPU is 1896 + * running does support Intel 64, XRSTORS/XSAVES in the 1897 + * guest could observe behavior that violates the 1898 + * architecture. Intercepting XRSTORS/XSAVES for this 1899 + * special case isn't deemed worthwhile. 1900 + */ 1901 + case MSR_IA32_PL0_SSP ... MSR_IA32_INT_SSP_TAB: 1902 + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) 1903 + return KVM_MSR_RET_UNSUPPORTED; 1904 + /* 1905 + * MSR_IA32_INT_SSP_TAB is not present on processors that do 1906 + * not support Intel 64 architecture. 
1907 + */ 1908 + if (index == MSR_IA32_INT_SSP_TAB && !guest_cpu_cap_has(vcpu, X86_FEATURE_LM)) 1909 + return KVM_MSR_RET_UNSUPPORTED; 1910 + if (is_noncanonical_msr_address(data, vcpu)) 1911 + return 1; 1912 + /* All SSP MSRs except MSR_IA32_INT_SSP_TAB must be 4-byte aligned */ 1913 + if (index != MSR_IA32_INT_SSP_TAB && !IS_ALIGNED(data, 4)) 1914 + return 1; 1915 + break; 1900 1916 } 1901 1917 1902 1918 msr.data = data; ··· 1978 1918 !guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) && 1979 1919 !guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID)) 1980 1920 return 1; 1921 + break; 1922 + case MSR_IA32_U_CET: 1923 + case MSR_IA32_S_CET: 1924 + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) && 1925 + !guest_cpu_cap_has(vcpu, X86_FEATURE_IBT)) 1926 + return KVM_MSR_RET_UNSUPPORTED; 1927 + break; 1928 + case MSR_KVM_INTERNAL_GUEST_SSP: 1929 + if (!host_initiated) 1930 + return 1; 1931 + fallthrough; 1932 + case MSR_IA32_PL0_SSP ... MSR_IA32_INT_SSP_TAB: 1933 + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) 1934 + return KVM_MSR_RET_UNSUPPORTED; 1981 1935 break; 1982 1936 } 1983 1937 ··· 3873 3799 mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa)); 3874 3800 } 3875 3801 3802 + /* 3803 + * Returns true if the MSR in question is managed via XSTATE, i.e. is context 3804 + * switched with the rest of guest FPU state. Note! S_CET is _not_ context 3805 + * switched via XSTATE even though it _is_ saved/restored via XSAVES/XRSTORS. 3806 + * Because S_CET is loaded on VM-Enter and VM-Exit via dedicated VMCS fields, 3807 + * the value saved/restored via XSTATE is always the host's value. That detail 3808 + * is _extremely_ important, as the guest's S_CET must _never_ be resident in 3809 + * hardware while executing in the host. Loading guest values for U_CET and 3810 + * PL[0-3]_SSP while executing in the kernel is safe, as U_CET is specific to 3811 + * userspace, and PL[0-3]_SSP are only consumed when transitioning to lower 3812 + * privilege levels, i.e. 
are effectively only consumed by userspace as well. 3813 + */ 3814 + static bool is_xstate_managed_msr(struct kvm_vcpu *vcpu, u32 msr) 3815 + { 3816 + if (!vcpu) 3817 + return false; 3818 + 3819 + switch (msr) { 3820 + case MSR_IA32_U_CET: 3821 + return guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) || 3822 + guest_cpu_cap_has(vcpu, X86_FEATURE_IBT); 3823 + case MSR_IA32_PL0_SSP ... MSR_IA32_PL3_SSP: 3824 + return guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK); 3825 + default: 3826 + return false; 3827 + } 3828 + } 3829 + 3830 + /* 3831 + * Lock (and if necessary, re-load) the guest FPU, i.e. XSTATE, and access an 3832 + * MSR that is managed via XSTATE. Note, the caller is responsible for doing 3833 + * the initial FPU load, this helper only ensures that guest state is resident 3834 + * in hardware (the kernel can load its FPU state in IRQ context). 3835 + */ 3836 + static __always_inline void kvm_access_xstate_msr(struct kvm_vcpu *vcpu, 3837 + struct msr_data *msr_info, 3838 + int access) 3839 + { 3840 + BUILD_BUG_ON(access != MSR_TYPE_R && access != MSR_TYPE_W); 3841 + 3842 + KVM_BUG_ON(!is_xstate_managed_msr(vcpu, msr_info->index), vcpu->kvm); 3843 + KVM_BUG_ON(!vcpu->arch.guest_fpu.fpstate->in_use, vcpu->kvm); 3844 + 3845 + kvm_fpu_get(); 3846 + if (access == MSR_TYPE_R) 3847 + rdmsrq(msr_info->index, msr_info->data); 3848 + else 3849 + wrmsrq(msr_info->index, msr_info->data); 3850 + kvm_fpu_put(); 3851 + } 3852 + 3853 + static void kvm_set_xstate_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) 3854 + { 3855 + kvm_access_xstate_msr(vcpu, msr_info, MSR_TYPE_W); 3856 + } 3857 + 3858 + static void kvm_get_xstate_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) 3859 + { 3860 + kvm_access_xstate_msr(vcpu, msr_info, MSR_TYPE_R); 3861 + } 3862 + 3876 3863 int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) 3877 3864 { 3878 3865 u32 msr = msr_info->index; ··· 4125 3990 } 4126 3991 break; 4127 3992 case MSR_IA32_XSS: 4128 - if 
(!msr_info->host_initiated && 4129 - !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)) 3993 + if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)) 3994 + return KVM_MSR_RET_UNSUPPORTED; 3995 + 3996 + if (data & ~vcpu->arch.guest_supported_xss) 4130 3997 return 1; 4131 - /* 4132 - * KVM supports exposing PT to the guest, but does not support 4133 - * IA32_XSS[bit 8]. Guests have to use RDMSR/WRMSR rather than 4134 - * XSAVES/XRSTORS to save/restore PT MSRs. 4135 - */ 4136 - if (data & ~kvm_caps.supported_xss) 4137 - return 1; 3998 + if (vcpu->arch.ia32_xss == data) 3999 + break; 4138 4000 vcpu->arch.ia32_xss = data; 4139 4001 vcpu->arch.cpuid_dynamic_bits_dirty = true; 4140 4002 break; ··· 4315 4183 vcpu->arch.guest_fpu.xfd_err = data; 4316 4184 break; 4317 4185 #endif 4186 + case MSR_IA32_U_CET: 4187 + case MSR_IA32_PL0_SSP ... MSR_IA32_PL3_SSP: 4188 + kvm_set_xstate_msr(vcpu, msr_info); 4189 + break; 4318 4190 default: 4319 4191 if (kvm_pmu_is_valid_msr(vcpu, msr)) 4320 4192 return kvm_pmu_set_msr(vcpu, msr_info); ··· 4668 4532 msr_info->data = vcpu->arch.guest_fpu.xfd_err; 4669 4533 break; 4670 4534 #endif 4535 + case MSR_IA32_U_CET: 4536 + case MSR_IA32_PL0_SSP ... MSR_IA32_PL3_SSP: 4537 + kvm_get_xstate_msr(vcpu, msr_info); 4538 + break; 4671 4539 default: 4672 4540 if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) 4673 4541 return kvm_pmu_get_msr(vcpu, msr_info); ··· 4692 4552 int (*do_msr)(struct kvm_vcpu *vcpu, 4693 4553 unsigned index, u64 *data)) 4694 4554 { 4555 + bool fpu_loaded = false; 4695 4556 int i; 4696 4557 4697 - for (i = 0; i < msrs->nmsrs; ++i) 4558 + for (i = 0; i < msrs->nmsrs; ++i) { 4559 + /* 4560 + * If userspace is accessing one or more XSTATE-managed MSRs, 4561 + * temporarily load the guest's FPU state so that the guest's 4562 + * MSR value(s) is resident in hardware and thus can be accessed 4563 + * via RDMSR/WRMSR. 
4564 + */ 4565 + if (!fpu_loaded && is_xstate_managed_msr(vcpu, entries[i].index)) { 4566 + kvm_load_guest_fpu(vcpu); 4567 + fpu_loaded = true; 4568 + } 4698 4569 if (do_msr(vcpu, entries[i].index, &entries[i].data)) 4699 4570 break; 4571 + } 4572 + if (fpu_loaded) 4573 + kvm_put_guest_fpu(vcpu); 4700 4574 4701 4575 return i; 4702 4576 } ··· 4895 4741 case KVM_CAP_IRQFD_RESAMPLE: 4896 4742 case KVM_CAP_MEMORY_FAULT_INFO: 4897 4743 case KVM_CAP_X86_GUEST_MODE: 4744 + case KVM_CAP_ONE_REG: 4898 4745 r = 1; 4899 4746 break; 4900 4747 case KVM_CAP_PRE_FAULT_MEMORY: ··· 6074 5919 } 6075 5920 } 6076 5921 5922 + struct kvm_x86_reg_id { 5923 + __u32 index; 5924 + __u8 type; 5925 + __u8 rsvd1; 5926 + __u8 rsvd2:4; 5927 + __u8 size:4; 5928 + __u8 x86; 5929 + }; 5930 + 5931 + static int kvm_translate_kvm_reg(struct kvm_vcpu *vcpu, 5932 + struct kvm_x86_reg_id *reg) 5933 + { 5934 + switch (reg->index) { 5935 + case KVM_REG_GUEST_SSP: 5936 + /* 5937 + * FIXME: If host-initiated accesses are ever exempted from 5938 + * ignore_msrs (in kvm_do_msr_access()), drop this manual check 5939 + * and rely on KVM's standard checks to reject accesses to regs 5940 + * that don't exist. 
5941 + */ 5942 + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) 5943 + return -EINVAL; 5944 + 5945 + reg->type = KVM_X86_REG_TYPE_MSR; 5946 + reg->index = MSR_KVM_INTERNAL_GUEST_SSP; 5947 + break; 5948 + default: 5949 + return -EINVAL; 5950 + } 5951 + return 0; 5952 + } 5953 + 5954 + static int kvm_get_one_msr(struct kvm_vcpu *vcpu, u32 msr, u64 __user *user_val) 5955 + { 5956 + u64 val; 5957 + 5958 + if (do_get_msr(vcpu, msr, &val)) 5959 + return -EINVAL; 5960 + 5961 + if (put_user(val, user_val)) 5962 + return -EFAULT; 5963 + 5964 + return 0; 5965 + } 5966 + 5967 + static int kvm_set_one_msr(struct kvm_vcpu *vcpu, u32 msr, u64 __user *user_val) 5968 + { 5969 + u64 val; 5970 + 5971 + if (get_user(val, user_val)) 5972 + return -EFAULT; 5973 + 5974 + if (do_set_msr(vcpu, msr, &val)) 5975 + return -EINVAL; 5976 + 5977 + return 0; 5978 + } 5979 + 5980 + static int kvm_get_set_one_reg(struct kvm_vcpu *vcpu, unsigned int ioctl, 5981 + void __user *argp) 5982 + { 5983 + struct kvm_one_reg one_reg; 5984 + struct kvm_x86_reg_id *reg; 5985 + u64 __user *user_val; 5986 + bool load_fpu; 5987 + int r; 5988 + 5989 + if (copy_from_user(&one_reg, argp, sizeof(one_reg))) 5990 + return -EFAULT; 5991 + 5992 + if ((one_reg.id & KVM_REG_ARCH_MASK) != KVM_REG_X86) 5993 + return -EINVAL; 5994 + 5995 + reg = (struct kvm_x86_reg_id *)&one_reg.id; 5996 + if (reg->rsvd1 || reg->rsvd2) 5997 + return -EINVAL; 5998 + 5999 + if (reg->type == KVM_X86_REG_TYPE_KVM) { 6000 + r = kvm_translate_kvm_reg(vcpu, reg); 6001 + if (r) 6002 + return r; 6003 + } 6004 + 6005 + if (reg->type != KVM_X86_REG_TYPE_MSR) 6006 + return -EINVAL; 6007 + 6008 + if ((one_reg.id & KVM_REG_SIZE_MASK) != KVM_REG_SIZE_U64) 6009 + return -EINVAL; 6010 + 6011 + guard(srcu)(&vcpu->kvm->srcu); 6012 + 6013 + load_fpu = is_xstate_managed_msr(vcpu, reg->index); 6014 + if (load_fpu) 6015 + kvm_load_guest_fpu(vcpu); 6016 + 6017 + user_val = u64_to_user_ptr(one_reg.addr); 6018 + if (ioctl == KVM_GET_ONE_REG) 6019 + r = 
kvm_get_one_msr(vcpu, reg->index, user_val); 6020 + else 6021 + r = kvm_set_one_msr(vcpu, reg->index, user_val); 6022 + 6023 + if (load_fpu) 6024 + kvm_put_guest_fpu(vcpu); 6025 + return r; 6026 + } 6027 + 6028 + static int kvm_get_reg_list(struct kvm_vcpu *vcpu, 6029 + struct kvm_reg_list __user *user_list) 6030 + { 6031 + u64 nr_regs = guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) ? 1 : 0; 6032 + u64 user_nr_regs; 6033 + 6034 + if (get_user(user_nr_regs, &user_list->n)) 6035 + return -EFAULT; 6036 + 6037 + if (put_user(nr_regs, &user_list->n)) 6038 + return -EFAULT; 6039 + 6040 + if (user_nr_regs < nr_regs) 6041 + return -E2BIG; 6042 + 6043 + if (nr_regs && 6044 + put_user(KVM_X86_REG_KVM(KVM_REG_GUEST_SSP), &user_list->reg[0])) 6045 + return -EFAULT; 6046 + 6047 + return 0; 6048 + } 6049 + 6077 6050 long kvm_arch_vcpu_ioctl(struct file *filp, 6078 6051 unsigned int ioctl, unsigned long arg) 6079 6052 { ··· 6318 6035 srcu_read_unlock(&vcpu->kvm->srcu, idx); 6319 6036 break; 6320 6037 } 6038 + case KVM_GET_ONE_REG: 6039 + case KVM_SET_ONE_REG: 6040 + r = kvm_get_set_one_reg(vcpu, ioctl, argp); 6041 + break; 6042 + case KVM_GET_REG_LIST: 6043 + r = kvm_get_reg_list(vcpu, argp); 6044 + break; 6321 6045 case KVM_TPR_ACCESS_REPORTING: { 6322 6046 struct kvm_tpr_access_ctl tac; 6323 6047 ··· 7699 7409 if (!(kvm_get_arch_capabilities() & ARCH_CAP_TSX_CTRL_MSR)) 7700 7410 return; 7701 7411 break; 7412 + case MSR_IA32_XSS: 7413 + if (!kvm_caps.supported_xss) 7414 + return; 7415 + break; 7416 + case MSR_IA32_U_CET: 7417 + case MSR_IA32_S_CET: 7418 + if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) && 7419 + !kvm_cpu_cap_has(X86_FEATURE_IBT)) 7420 + return; 7421 + break; 7422 + case MSR_IA32_INT_SSP_TAB: 7423 + if (!kvm_cpu_cap_has(X86_FEATURE_LM)) 7424 + return; 7425 + fallthrough; 7426 + case MSR_IA32_PL0_SSP ... 
MSR_IA32_PL3_SSP: 7427 + if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK)) 7428 + return; 7429 + break; 7702 7430 default: 7703 7431 break; 7704 7432 } ··· 8748 8440 static int emulator_get_msr(struct x86_emulate_ctxt *ctxt, 8749 8441 u32 msr_index, u64 *pdata) 8750 8442 { 8443 + /* 8444 + * Treat emulator accesses to the current shadow stack pointer as host- 8445 + * initiated, as they aren't true MSR accesses (SSP is just a "reg"), 8446 + * and this API is used only for implicit accesses, i.e. not RDMSR, and 8447 + * so the index is fully KVM-controlled. 8448 + */ 8449 + if (unlikely(msr_index == MSR_KVM_INTERNAL_GUEST_SSP)) 8450 + return kvm_msr_read(emul_to_vcpu(ctxt), msr_index, pdata); 8451 + 8751 8452 return __kvm_emulate_msr_read(emul_to_vcpu(ctxt), msr_index, pdata); 8752 8453 } 8753 8454 ··· 10013 9696 return -EIO; 10014 9697 } 10015 9698 9699 + if (boot_cpu_has(X86_FEATURE_SHSTK) || boot_cpu_has(X86_FEATURE_IBT)) { 9700 + rdmsrq(MSR_IA32_S_CET, kvm_host.s_cet); 9701 + /* 9702 + * Linux doesn't yet support supervisor shadow stacks (SSS), so 9703 + * KVM doesn't save/restore the associated MSRs, i.e. KVM may 9704 + * clobber the host values. Yell and refuse to load if SSS is 9705 + unexpectedly enabled, e.g. to avoid crashing the host.
9706 + */ 9707 + if (WARN_ON_ONCE(kvm_host.s_cet & CET_SHSTK_EN)) 9708 + return -EIO; 9709 + } 9710 + 10016 9711 memset(&kvm_caps, 0, sizeof(kvm_caps)); 10017 9712 10018 9713 x86_emulator_cache = kvm_alloc_emulator_cache(); ··· 10052 9723 kvm_host.xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); 10053 9724 kvm_caps.supported_xcr0 = kvm_host.xcr0 & KVM_SUPPORTED_XCR0; 10054 9725 } 9726 + 9727 + if (boot_cpu_has(X86_FEATURE_XSAVES)) { 9728 + rdmsrq(MSR_IA32_XSS, kvm_host.xss); 9729 + kvm_caps.supported_xss = kvm_host.xss & KVM_SUPPORTED_XSS; 9730 + } 9731 + 10055 9732 kvm_caps.supported_quirks = KVM_X86_VALID_QUIRKS; 10056 9733 kvm_caps.inapplicable_quirks = KVM_X86_CONDITIONAL_QUIRKS; 10057 9734 10058 9735 rdmsrq_safe(MSR_EFER, &kvm_host.efer); 10059 - 10060 - if (boot_cpu_has(X86_FEATURE_XSAVES)) 10061 - rdmsrq(MSR_IA32_XSS, kvm_host.xss); 10062 9736 10063 9737 kvm_init_pmu_capability(ops->pmu_ops); 10064 9738 ··· 10110 9778 10111 9779 if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES)) 10112 9780 kvm_caps.supported_xss = 0; 9781 + 9782 + if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) && 9783 + !kvm_cpu_cap_has(X86_FEATURE_IBT)) 9784 + kvm_caps.supported_xss &= ~XFEATURE_MASK_CET_ALL; 9785 + 9786 + if ((kvm_caps.supported_xss & XFEATURE_MASK_CET_ALL) != XFEATURE_MASK_CET_ALL) { 9787 + kvm_cpu_cap_clear(X86_FEATURE_SHSTK); 9788 + kvm_cpu_cap_clear(X86_FEATURE_IBT); 9789 + kvm_caps.supported_xss &= ~XFEATURE_MASK_CET_ALL; 9790 + } 10113 9791 10114 9792 if (kvm_caps.has_tsc_control) { 10115 9793 /* ··· 12213 11871 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; 12214 11872 int ret; 12215 11873 11874 + if (kvm_is_cr4_bit_set(vcpu, X86_CR4_CET)) { 11875 + u64 u_cet, s_cet; 11876 + 11877 + /* 11878 + * Check both User and Supervisor on task switches as inter- 11879 + * privilege level task switches are impacted by CET at both 11880 + * the current privilege level and the new privilege level, and 11881 + * that information is not known at this time. 
The expectation 11882 + * is that the guest won't require emulation of task switches 11883 + * while using IBT or Shadow Stacks. 11884 + */ 11885 + if (__kvm_emulate_msr_read(vcpu, MSR_IA32_U_CET, &u_cet) || 11886 + __kvm_emulate_msr_read(vcpu, MSR_IA32_S_CET, &s_cet)) 11887 + goto unhandled_task_switch; 11888 + 11889 + if ((u_cet | s_cet) & (CET_ENDBR_EN | CET_SHSTK_EN)) 11890 + goto unhandled_task_switch; 11891 + } 11892 + 12216 11893 init_emulate_ctxt(vcpu); 12217 11894 12218 11895 ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason, ··· 12241 11880 * Report an error userspace if MMIO is needed, as KVM doesn't support 12242 11881 * MMIO during a task switch (or any other complex operation). 12243 11882 */ 12244 - if (ret || vcpu->mmio_needed) { 12245 - vcpu->mmio_needed = false; 12246 - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 12247 - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; 12248 - vcpu->run->internal.ndata = 0; 12249 - return 0; 12250 - } 11883 + if (ret || vcpu->mmio_needed) 11884 + goto unhandled_task_switch; 12251 11885 12252 11886 kvm_rip_write(vcpu, ctxt->eip); 12253 11887 kvm_set_rflags(vcpu, ctxt->eflags); 12254 11888 return 1; 11889 + 11890 + unhandled_task_switch: 11891 + vcpu->mmio_needed = false; 11892 + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 11893 + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; 11894 + vcpu->run->internal.ndata = 0; 11895 + return 0; 12255 11896 } 12256 11897 EXPORT_SYMBOL_GPL(kvm_task_switch); 12257 11898 ··· 12801 12438 /* 12802 12439 * On INIT, only select XSTATE components are zeroed, most components 12803 12440 * are unchanged. Currently, the only components that are zeroed and 12804 - * supported by KVM are MPX related. 12441 + * supported by KVM are MPX and CET related. 
12805 12442 */ 12806 12443 xfeatures_mask = (kvm_caps.supported_xcr0 | kvm_caps.supported_xss) & 12807 - (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR); 12444 + (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR | 12445 + XFEATURE_MASK_CET_ALL); 12808 12446 if (!xfeatures_mask) 12809 12447 return; 12810 12448
+37
arch/x86/kvm/x86.h
··· 50 50 u64 efer; 51 51 u64 xcr0; 52 52 u64 xss; 53 + u64 s_cet; 53 54 u64 arch_capabilities; 54 55 }; 55 56 ··· 101 100 #define KVM_VMX_DEFAULT_PLE_WINDOW_MAX UINT_MAX 102 101 #define KVM_SVM_DEFAULT_PLE_WINDOW_MAX USHRT_MAX 103 102 #define KVM_SVM_DEFAULT_PLE_WINDOW 3000 103 + 104 + /* 105 + * KVM's internal, non-ABI indices for synthetic MSRs. The values themselves 106 + * are arbitrary and have no meaning, the only requirement is that they don't 107 + * conflict with "real" MSRs that KVM supports. Use values at the upper end 108 + * of KVM's reserved paravirtual MSR range to minimize churn, i.e. these values 109 + * will be usable until KVM exhausts its supply of paravirtual MSR indices. 110 + */ 111 + 112 + #define MSR_KVM_INTERNAL_GUEST_SSP 0x4b564dff 104 113 105 114 static inline unsigned int __grow_ple_window(unsigned int val, 106 115 unsigned int base, unsigned int modifier, unsigned int max) ··· 680 669 __reserved_bits |= X86_CR4_PCIDE; \ 681 670 if (!__cpu_has(__c, X86_FEATURE_LAM)) \ 682 671 __reserved_bits |= X86_CR4_LAM_SUP; \ 672 + if (!__cpu_has(__c, X86_FEATURE_SHSTK) && \ 673 + !__cpu_has(__c, X86_FEATURE_IBT)) \ 674 + __reserved_bits |= X86_CR4_CET; \ 683 675 __reserved_bits; \ 684 676 }) 685 677 ··· 714 700 715 701 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); 716 702 703 + #define CET_US_RESERVED_BITS GENMASK(9, 6) 704 + #define CET_US_SHSTK_MASK_BITS GENMASK(1, 0) 705 + #define CET_US_IBT_MASK_BITS (GENMASK_ULL(5, 2) | GENMASK_ULL(63, 10)) 706 + #define CET_US_LEGACY_BITMAP_BASE(data) ((data) >> 12) 707 + 708 + static inline bool kvm_is_valid_u_s_cet(struct kvm_vcpu *vcpu, u64 data) 709 + { 710 + if (data & CET_US_RESERVED_BITS) 711 + return false; 712 + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) && 713 + (data & CET_US_SHSTK_MASK_BITS)) 714 + return false; 715 + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_IBT) && 716 + (data & CET_US_IBT_MASK_BITS)) 717 + return false; 718 + if (!IS_ALIGNED(CET_US_LEGACY_BITMAP_BASE(data), 4)) 719 + 
return false; 720 + /* IBT can be suppressed iff the TRACKER isn't WAIT_ENDBR. */ 721 + if ((data & CET_SUPPRESS) && (data & CET_WAIT_ENDBR)) 722 + return false; 723 + 724 + return true; 725 + } 717 726 #endif
+1
tools/testing/selftests/kvm/Makefile.kvm
··· 87 87 TEST_GEN_PROGS_x86 += x86/kvm_pv_test 88 88 TEST_GEN_PROGS_x86 += x86/kvm_buslock_test 89 89 TEST_GEN_PROGS_x86 += x86/monitor_mwait_test 90 + TEST_GEN_PROGS_x86 += x86/msrs_test 90 91 TEST_GEN_PROGS_x86 += x86/nested_emulation_test 91 92 TEST_GEN_PROGS_x86 += x86/nested_exceptions_test 92 93 TEST_GEN_PROGS_x86 += x86/platform_info_test
+5
tools/testing/selftests/kvm/include/x86/processor.h
··· 1362 1362 return get_kvm_intel_param_bool("unrestricted_guest"); 1363 1363 } 1364 1364 1365 + static inline bool kvm_is_ignore_msrs(void) 1366 + { 1367 + return get_kvm_param_bool("ignore_msrs"); 1368 + } 1369 + 1365 1370 uint64_t *__vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr, 1366 1371 int *level); 1367 1372 uint64_t *vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr);
+489
tools/testing/selftests/kvm/x86/msrs_test.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + #include <asm/msr-index.h> 3 + 4 + #include <stdint.h> 5 + 6 + #include "kvm_util.h" 7 + #include "processor.h" 8 + 9 + /* Use HYPERVISOR for MSRs that are emulated unconditionally (as is HYPERVISOR). */ 10 + #define X86_FEATURE_NONE X86_FEATURE_HYPERVISOR 11 + 12 + struct kvm_msr { 13 + const struct kvm_x86_cpu_feature feature; 14 + const struct kvm_x86_cpu_feature feature2; 15 + const char *name; 16 + const u64 reset_val; 17 + const u64 write_val; 18 + const u64 rsvd_val; 19 + const u32 index; 20 + const bool is_kvm_defined; 21 + }; 22 + 23 + #define ____MSR_TEST(msr, str, val, rsvd, reset, feat, f2, is_kvm) \ 24 + { \ 25 + .index = msr, \ 26 + .name = str, \ 27 + .write_val = val, \ 28 + .rsvd_val = rsvd, \ 29 + .reset_val = reset, \ 30 + .feature = X86_FEATURE_ ##feat, \ 31 + .feature2 = X86_FEATURE_ ##f2, \ 32 + .is_kvm_defined = is_kvm, \ 33 + } 34 + 35 + #define __MSR_TEST(msr, str, val, rsvd, reset, feat) \ 36 + ____MSR_TEST(msr, str, val, rsvd, reset, feat, feat, false) 37 + 38 + #define MSR_TEST_NON_ZERO(msr, val, rsvd, reset, feat) \ 39 + __MSR_TEST(msr, #msr, val, rsvd, reset, feat) 40 + 41 + #define MSR_TEST(msr, val, rsvd, feat) \ 42 + __MSR_TEST(msr, #msr, val, rsvd, 0, feat) 43 + 44 + #define MSR_TEST2(msr, val, rsvd, feat, f2) \ 45 + ____MSR_TEST(msr, #msr, val, rsvd, 0, feat, f2, false) 46 + 47 + /* 48 + * Note, use a page aligned value for the canonical value so that the value 49 + * is compatible with MSRs that use bits 11:0 for things other than addresses. 50 + */ 51 + static const u64 canonical_val = 0x123456789000ull; 52 + 53 + /* 54 + * Arbitrary value with bits set in every byte, but not all bits set. This is 55 + * also a non-canonical value, but that's coincidental (any 64-bit value with 56 + * an alternating 0s/1s pattern will be non-canonical). 
57 + */ 58 + static const u64 u64_val = 0xaaaa5555aaaa5555ull; 59 + 60 + #define MSR_TEST_CANONICAL(msr, feat) \ 61 + __MSR_TEST(msr, #msr, canonical_val, NONCANONICAL, 0, feat) 62 + 63 + #define MSR_TEST_KVM(msr, val, rsvd, feat) \ 64 + ____MSR_TEST(KVM_REG_ ##msr, #msr, val, rsvd, 0, feat, feat, true) 65 + 66 + /* 67 + * The main struct must be scoped to a function due to the use of structures to 68 + * define features. For the global structure, allocate enough space for the 69 + * foreseeable future without getting too ridiculous, to minimize maintenance 70 + * costs (bumping the array size every time an MSR is added is really annoying). 71 + */ 72 + static struct kvm_msr msrs[128]; 73 + static int idx; 74 + 75 + static bool ignore_unsupported_msrs; 76 + 77 + static u64 fixup_rdmsr_val(u32 msr, u64 want) 78 + { 79 + /* 80 + * AMD CPUs drop bits 63:32 on some MSRs that Intel CPUs support. KVM 81 + * is supposed to emulate that behavior based on guest vendor model 82 + * (which is the same as the host vendor model for this test). 
83 + */ 84 + if (!host_cpu_is_amd) 85 + return want; 86 + 87 + switch (msr) { 88 + case MSR_IA32_SYSENTER_ESP: 89 + case MSR_IA32_SYSENTER_EIP: 90 + case MSR_TSC_AUX: 91 + return want & GENMASK_ULL(31, 0); 92 + default: 93 + return want; 94 + } 95 + } 96 + 97 + static void __rdmsr(u32 msr, u64 want) 98 + { 99 + u64 val; 100 + u8 vec; 101 + 102 + vec = rdmsr_safe(msr, &val); 103 + __GUEST_ASSERT(!vec, "Unexpected %s on RDMSR(0x%x)", ex_str(vec), msr); 104 + 105 + __GUEST_ASSERT(val == want, "Wanted 0x%lx from RDMSR(0x%x), got 0x%lx", 106 + want, msr, val); 107 + } 108 + 109 + static void __wrmsr(u32 msr, u64 val) 110 + { 111 + u8 vec; 112 + 113 + vec = wrmsr_safe(msr, val); 114 + __GUEST_ASSERT(!vec, "Unexpected %s on WRMSR(0x%x, 0x%lx)", 115 + ex_str(vec), msr, val); 116 + __rdmsr(msr, fixup_rdmsr_val(msr, val)); 117 + } 118 + 119 + static void guest_test_supported_msr(const struct kvm_msr *msr) 120 + { 121 + __rdmsr(msr->index, msr->reset_val); 122 + __wrmsr(msr->index, msr->write_val); 123 + GUEST_SYNC(fixup_rdmsr_val(msr->index, msr->write_val)); 124 + 125 + __rdmsr(msr->index, msr->reset_val); 126 + } 127 + 128 + static void guest_test_unsupported_msr(const struct kvm_msr *msr) 129 + { 130 + u64 val; 131 + u8 vec; 132 + 133 + /* 134 + * KVM's ABI with respect to ignore_msrs is a mess and largely beyond 135 + * repair, just skip the unsupported MSR tests. 136 + */ 137 + if (ignore_unsupported_msrs) 138 + goto skip_wrmsr_gp; 139 + 140 + /* 141 + * {S,U}_CET exist if IBT or SHSTK is supported, but with bits that are 142 + * writable only if their associated feature is supported. Skip the 143 + * RDMSR #GP test if the secondary feature is supported, but perform 144 + * the WRMSR #GP test as the to-be-written value is tied to the primary 145 + * feature. For all other MSRs, simply do nothing. 
146 + */ 147 + if (this_cpu_has(msr->feature2)) { 148 + if (msr->index != MSR_IA32_U_CET && 149 + msr->index != MSR_IA32_S_CET) 150 + goto skip_wrmsr_gp; 151 + 152 + goto skip_rdmsr_gp; 153 + } 154 + 155 + vec = rdmsr_safe(msr->index, &val); 156 + __GUEST_ASSERT(vec == GP_VECTOR, "Wanted #GP on RDMSR(0x%x), got %s", 157 + msr->index, ex_str(vec)); 158 + 159 + skip_rdmsr_gp: 160 + vec = wrmsr_safe(msr->index, msr->write_val); 161 + __GUEST_ASSERT(vec == GP_VECTOR, "Wanted #GP on WRMSR(0x%x, 0x%lx), got %s", 162 + msr->index, msr->write_val, ex_str(vec)); 163 + 164 + skip_wrmsr_gp: 165 + GUEST_SYNC(0); 166 + } 167 + 168 + void guest_test_reserved_val(const struct kvm_msr *msr) 169 + { 170 + /* Skip reserved value checks as well, ignore_msrs is truly a mess. */ 171 + if (ignore_unsupported_msrs) 172 + return; 173 + 174 + /* 175 + * If the CPU will truncate the written value (e.g. SYSENTER on AMD), 176 + * expect success and a truncated value, not #GP. 177 + */ 178 + if (!this_cpu_has(msr->feature) || 179 + msr->rsvd_val == fixup_rdmsr_val(msr->index, msr->rsvd_val)) { 180 + u8 vec = wrmsr_safe(msr->index, msr->rsvd_val); 181 + 182 + __GUEST_ASSERT(vec == GP_VECTOR, 183 + "Wanted #GP on WRMSR(0x%x, 0x%lx), got %s", 184 + msr->index, msr->rsvd_val, ex_str(vec)); 185 + } else { 186 + __wrmsr(msr->index, msr->rsvd_val); 187 + __wrmsr(msr->index, msr->reset_val); 188 + } 189 + } 190 + 191 + static void guest_main(void) 192 + { 193 + for (;;) { 194 + const struct kvm_msr *msr = &msrs[READ_ONCE(idx)]; 195 + 196 + if (this_cpu_has(msr->feature)) 197 + guest_test_supported_msr(msr); 198 + else 199 + guest_test_unsupported_msr(msr); 200 + 201 + if (msr->rsvd_val) 202 + guest_test_reserved_val(msr); 203 + 204 + GUEST_SYNC(msr->reset_val); 205 + } 206 + } 207 + 208 + static bool has_one_reg; 209 + static bool use_one_reg; 210 + 211 + #define KVM_X86_MAX_NR_REGS 1 212 + 213 + static bool vcpu_has_reg(struct kvm_vcpu *vcpu, u64 reg) 214 + { 215 + struct { 216 + struct kvm_reg_list
list; 217 + u64 regs[KVM_X86_MAX_NR_REGS]; 218 + } regs = {}; 219 + int r, i; 220 + 221 + /* 222 + * If KVM_GET_REG_LIST succeeds with n=0, i.e. there are no supported 223 + * regs, then the vCPU obviously doesn't support the reg. 224 + */ 225 + r = __vcpu_ioctl(vcpu, KVM_GET_REG_LIST, &regs.list); 226 + if (!r) 227 + return false; 228 + 229 + TEST_ASSERT_EQ(errno, E2BIG); 230 + 231 + /* 232 + * KVM x86 is expected to support enumerating a relatively small number 233 + * of regs. The majority of registers supported by KVM_{G,S}ET_ONE_REG 234 + * are enumerated via other ioctls, e.g. KVM_GET_MSR_INDEX_LIST. For 235 + * simplicity, hardcode the maximum number of regs and manually update 236 + * the test as necessary. 237 + */ 238 + TEST_ASSERT(regs.list.n <= KVM_X86_MAX_NR_REGS, 239 + "KVM reports %llu regs, test expects at most %u regs, stale test?", 240 + regs.list.n, KVM_X86_MAX_NR_REGS); 241 + 242 + vcpu_ioctl(vcpu, KVM_GET_REG_LIST, &regs.list); 243 + for (i = 0; i < regs.list.n; i++) { 244 + if (regs.regs[i] == reg) 245 + return true; 246 + } 247 + 248 + return false; 249 + } 250 + 251 + static void host_test_kvm_reg(struct kvm_vcpu *vcpu) 252 + { 253 + bool has_reg = vcpu_cpuid_has(vcpu, msrs[idx].feature); 254 + u64 reset_val = msrs[idx].reset_val; 255 + u64 write_val = msrs[idx].write_val; 256 + u64 rsvd_val = msrs[idx].rsvd_val; 257 + u32 reg = msrs[idx].index; 258 + u64 val; 259 + int r; 260 + 261 + if (!use_one_reg) 262 + return; 263 + 264 + TEST_ASSERT_EQ(vcpu_has_reg(vcpu, KVM_X86_REG_KVM(reg)), has_reg); 265 + 266 + if (!has_reg) { 267 + r = __vcpu_get_reg(vcpu, KVM_X86_REG_KVM(reg), &val); 268 + TEST_ASSERT(r && errno == EINVAL, 269 + "Expected failure on get_reg(0x%x)", reg); 270 + rsvd_val = 0; 271 + goto out; 272 + } 273 + 274 + val = vcpu_get_reg(vcpu, KVM_X86_REG_KVM(reg)); 275 + TEST_ASSERT(val == reset_val, "Wanted 0x%lx from get_reg(0x%x), got 0x%lx", 276 + reset_val, reg, val); 277 + 278 + vcpu_set_reg(vcpu, KVM_X86_REG_KVM(reg), write_val);
279 + val = vcpu_get_reg(vcpu, KVM_X86_REG_KVM(reg)); 280 + TEST_ASSERT(val == write_val, "Wanted 0x%lx from get_reg(0x%x), got 0x%lx", 281 + write_val, reg, val); 282 + 283 + out: 284 + r = __vcpu_set_reg(vcpu, KVM_X86_REG_KVM(reg), rsvd_val); 285 + TEST_ASSERT(r, "Expected failure on set_reg(0x%x, 0x%lx)", reg, rsvd_val); 286 + } 287 + 288 + static void host_test_msr(struct kvm_vcpu *vcpu, u64 guest_val) 289 + { 290 + u64 reset_val = msrs[idx].reset_val; 291 + u32 msr = msrs[idx].index; 292 + u64 val; 293 + 294 + if (!kvm_cpu_has(msrs[idx].feature)) 295 + return; 296 + 297 + val = vcpu_get_msr(vcpu, msr); 298 + TEST_ASSERT(val == guest_val, "Wanted 0x%lx from get_msr(0x%x), got 0x%lx", 299 + guest_val, msr, val); 300 + 301 + if (use_one_reg) 302 + vcpu_set_reg(vcpu, KVM_X86_REG_MSR(msr), reset_val); 303 + else 304 + vcpu_set_msr(vcpu, msr, reset_val); 305 + 306 + val = vcpu_get_msr(vcpu, msr); 307 + TEST_ASSERT(val == reset_val, "Wanted 0x%lx from get_msr(0x%x), got 0x%lx", 308 + reset_val, msr, val); 309 + 310 + if (!has_one_reg) 311 + return; 312 + 313 + val = vcpu_get_reg(vcpu, KVM_X86_REG_MSR(msr)); 314 + TEST_ASSERT(val == reset_val, "Wanted 0x%lx from get_reg(0x%x), got 0x%lx", 315 + reset_val, msr, val); 316 + } 317 + 318 + static void do_vcpu_run(struct kvm_vcpu *vcpu) 319 + { 320 + struct ucall uc; 321 + 322 + for (;;) { 323 + vcpu_run(vcpu); 324 + 325 + switch (get_ucall(vcpu, &uc)) { 326 + case UCALL_SYNC: 327 + host_test_msr(vcpu, uc.args[1]); 328 + return; 329 + case UCALL_PRINTF: 330 + pr_info("%s", uc.buffer); 331 + break; 332 + case UCALL_ABORT: 333 + REPORT_GUEST_ASSERT(uc); 334 + case UCALL_DONE: 335 + TEST_FAIL("Unexpected UCALL_DONE"); 336 + default: 337 + TEST_FAIL("Unexpected ucall: %lu", uc.cmd); 338 + } 339 + } 340 + } 341 + 342 + static void vcpus_run(struct kvm_vcpu **vcpus, const int NR_VCPUS) 343 + { 344 + int i; 345 + 346 + for (i = 0; i < NR_VCPUS; i++) 347 + do_vcpu_run(vcpus[i]); 348 + } 349 + 350 + #define MISC_ENABLES_RESET_VAL 
(MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL | MSR_IA32_MISC_ENABLE_BTS_UNAVAIL) 351 + 352 + static void test_msrs(void) 353 + { 354 + const struct kvm_msr __msrs[] = { 355 + MSR_TEST_NON_ZERO(MSR_IA32_MISC_ENABLE, 356 + MISC_ENABLES_RESET_VAL | MSR_IA32_MISC_ENABLE_FAST_STRING, 357 + MSR_IA32_MISC_ENABLE_FAST_STRING, MISC_ENABLES_RESET_VAL, NONE), 358 + MSR_TEST_NON_ZERO(MSR_IA32_CR_PAT, 0x07070707, 0, 0x7040600070406, NONE), 359 + 360 + /* 361 + * TSC_AUX is supported if RDTSCP *or* RDPID is supported. Add 362 + * entries for each feature so that TSC_AUX doesn't exist for 363 + * the "unsupported" vCPU, and obviously to test both cases. 364 + */ 365 + MSR_TEST2(MSR_TSC_AUX, 0x12345678, u64_val, RDTSCP, RDPID), 366 + MSR_TEST2(MSR_TSC_AUX, 0x12345678, u64_val, RDPID, RDTSCP), 367 + 368 + MSR_TEST(MSR_IA32_SYSENTER_CS, 0x1234, 0, NONE), 369 + /* 370 + * Writes of non-canonical values to SYSENTER_{ESP,EIP} technically 371 + * #GP on Intel, but KVM doesn't emulate that behavior on emulated 372 + * writes, i.e. this test will observe different behavior if the MSR 373 + * writes are handled by hardware vs. KVM. KVM's behavior is 374 + * intended (though far from ideal), so don't bother testing 375 + * non-canonical values.
376 + */ 377 + MSR_TEST(MSR_IA32_SYSENTER_ESP, canonical_val, 0, NONE), 378 + MSR_TEST(MSR_IA32_SYSENTER_EIP, canonical_val, 0, NONE), 379 + 380 + MSR_TEST_CANONICAL(MSR_FS_BASE, LM), 381 + MSR_TEST_CANONICAL(MSR_GS_BASE, LM), 382 + MSR_TEST_CANONICAL(MSR_KERNEL_GS_BASE, LM), 383 + MSR_TEST_CANONICAL(MSR_LSTAR, LM), 384 + MSR_TEST_CANONICAL(MSR_CSTAR, LM), 385 + MSR_TEST(MSR_SYSCALL_MASK, 0xffffffff, 0, LM), 386 + 387 + MSR_TEST2(MSR_IA32_S_CET, CET_SHSTK_EN, CET_RESERVED, SHSTK, IBT), 388 + MSR_TEST2(MSR_IA32_S_CET, CET_ENDBR_EN, CET_RESERVED, IBT, SHSTK), 389 + MSR_TEST2(MSR_IA32_U_CET, CET_SHSTK_EN, CET_RESERVED, SHSTK, IBT), 390 + MSR_TEST2(MSR_IA32_U_CET, CET_ENDBR_EN, CET_RESERVED, IBT, SHSTK), 391 + MSR_TEST_CANONICAL(MSR_IA32_PL0_SSP, SHSTK), 392 + MSR_TEST(MSR_IA32_PL0_SSP, canonical_val, canonical_val | 1, SHSTK), 393 + MSR_TEST_CANONICAL(MSR_IA32_PL1_SSP, SHSTK), 394 + MSR_TEST(MSR_IA32_PL1_SSP, canonical_val, canonical_val | 1, SHSTK), 395 + MSR_TEST_CANONICAL(MSR_IA32_PL2_SSP, SHSTK), 396 + MSR_TEST(MSR_IA32_PL2_SSP, canonical_val, canonical_val | 1, SHSTK), 397 + MSR_TEST_CANONICAL(MSR_IA32_PL3_SSP, SHSTK), 398 + MSR_TEST(MSR_IA32_PL3_SSP, canonical_val, canonical_val | 1, SHSTK), 399 + 400 + MSR_TEST_KVM(GUEST_SSP, canonical_val, NONCANONICAL, SHSTK), 401 + }; 402 + 403 + const struct kvm_x86_cpu_feature feat_none = X86_FEATURE_NONE; 404 + const struct kvm_x86_cpu_feature feat_lm = X86_FEATURE_LM; 405 + 406 + /* 407 + * Create three vCPUs, but run them on the same task, to validate KVM's 408 + * context switching of MSR state. Don't pin the task to a pCPU to 409 + * also validate KVM's handling of cross-pCPU migration. Use the full 410 + * set of features for the first two vCPUs, but clear all features in 411 + * the third vCPU in order to test both positive and negative paths.
412 + */ 413 + const int NR_VCPUS = 3; 414 + struct kvm_vcpu *vcpus[NR_VCPUS]; 415 + struct kvm_vm *vm; 416 + int i; 417 + 418 + kvm_static_assert(sizeof(__msrs) <= sizeof(msrs)); 419 + kvm_static_assert(ARRAY_SIZE(__msrs) <= ARRAY_SIZE(msrs)); 420 + memcpy(msrs, __msrs, sizeof(__msrs)); 421 + 422 + ignore_unsupported_msrs = kvm_is_ignore_msrs(); 423 + 424 + vm = vm_create_with_vcpus(NR_VCPUS, guest_main, vcpus); 425 + 426 + sync_global_to_guest(vm, msrs); 427 + sync_global_to_guest(vm, ignore_unsupported_msrs); 428 + 429 + /* 430 + * Clear features in the "unsupported features" vCPU. This needs to be 431 + * done before the first vCPU run as KVM's ABI is that guest CPUID is 432 + * immutable once the vCPU has been run. 433 + */ 434 + for (idx = 0; idx < ARRAY_SIZE(__msrs); idx++) { 435 + /* 436 + * Don't clear LM; selftests are 64-bit only, and KVM doesn't 437 + * honor LM=0 for MSRs that are supposed to exist if and only 438 + * if the vCPU is a 64-bit model. Ditto for NONE; clearing a 439 + * fake feature flag will result in false failures. 440 + */ 441 + if (memcmp(&msrs[idx].feature, &feat_lm, sizeof(feat_lm)) && 442 + memcmp(&msrs[idx].feature, &feat_none, sizeof(feat_none))) 443 + vcpu_clear_cpuid_feature(vcpus[2], msrs[idx].feature); 444 + } 445 + 446 + for (idx = 0; idx < ARRAY_SIZE(__msrs); idx++) { 447 + struct kvm_msr *msr = &msrs[idx]; 448 + 449 + if (msr->is_kvm_defined) { 450 + for (i = 0; i < NR_VCPUS; i++) 451 + host_test_kvm_reg(vcpus[i]); 452 + continue; 453 + } 454 + 455 + /* 456 + * Verify KVM_GET_SUPPORTED_CPUID and KVM_GET_MSR_INDEX_LIST 457 + * are consistent with respect to MSRs whose existence is 458 + * enumerated via CPUID. Skip the check for FS/GS.base MSRs, 459 + * as they aren't reported in the save/restore list since their 460 + * state is managed via SREGS. 
461 + */ 462 + TEST_ASSERT(msr->index == MSR_FS_BASE || msr->index == MSR_GS_BASE || 463 + kvm_msr_is_in_save_restore_list(msr->index) == 464 + (kvm_cpu_has(msr->feature) || kvm_cpu_has(msr->feature2)), 465 + "%s %s in save/restore list, but %s according to CPUID", msr->name, 466 + kvm_msr_is_in_save_restore_list(msr->index) ? "is" : "isn't", 467 + (kvm_cpu_has(msr->feature) || kvm_cpu_has(msr->feature2)) ? 468 + "supported" : "unsupported"); 469 + 470 + sync_global_to_guest(vm, idx); 471 + 472 + vcpus_run(vcpus, NR_VCPUS); 473 + vcpus_run(vcpus, NR_VCPUS); 474 + } 475 + 476 + kvm_vm_free(vm); 477 + } 478 + 479 + int main(void) 480 + { 481 + has_one_reg = kvm_has_cap(KVM_CAP_ONE_REG); 482 + 483 + test_msrs(); 484 + 485 + if (has_one_reg) { 486 + use_one_reg = true; 487 + test_msrs(); 488 + } 489 + }