Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

KVM: nVMX: Synthesize nested VM-Exit for supported emulation intercepts

When emulating an instruction on behalf of L2 that L1 wants to intercept,
generate a nested VM-Exit instead of injecting a #UD into L2. Now that
(most of) the necessary information is available, synthesizing a VM-Exit
isn't terribly difficult.

Punt on decoding the ModR/M for descriptor table exits for now. There is
no evidence that any hypervisor intercepts descriptor table accesses *and*
uses the EXIT_QUALIFICATION to expedite emulation, i.e. it's not worth
delaying basic support for.

To avoid doing more harm than good, e.g. by putting L2 into an infinite
loop or effectively corrupting its code stream, inject #UD if the
instruction length is nonsensical.

Link: https://lore.kernel.org/r/20250201015518.689704-11-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>

+56 -14
+56 -14
arch/x86/kvm/vmx/vmx.c
··· 8008 8008 } 8009 8009 8010 8010 static bool vmx_is_io_intercepted(struct kvm_vcpu *vcpu, 8011 - struct x86_instruction_info *info) 8011 + struct x86_instruction_info *info, 8012 + unsigned long *exit_qualification) 8012 8013 { 8013 8014 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 8014 8015 unsigned short port; 8015 8016 int size; 8016 - 8017 - if (info->intercept == x86_intercept_in || 8018 - info->intercept == x86_intercept_ins) { 8019 - port = info->src_val; 8020 - size = info->dst_bytes; 8021 - } else { 8022 - port = info->dst_val; 8023 - size = info->src_bytes; 8024 - } 8017 + bool imm; 8025 8018 8026 8019 /* 8027 8020 * If the 'use IO bitmaps' VM-execution control is 0, IO instruction ··· 8026 8033 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) 8027 8034 return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); 8028 8035 8036 + if (info->intercept == x86_intercept_in || 8037 + info->intercept == x86_intercept_ins) { 8038 + port = info->src_val; 8039 + size = info->dst_bytes; 8040 + imm = info->src_type == OP_IMM; 8041 + } else { 8042 + port = info->dst_val; 8043 + size = info->src_bytes; 8044 + imm = info->dst_type == OP_IMM; 8045 + } 8046 + 8047 + 8048 + *exit_qualification = ((unsigned long)port << 16) | (size - 1); 8049 + 8050 + if (info->intercept == x86_intercept_ins || 8051 + info->intercept == x86_intercept_outs) 8052 + *exit_qualification |= BIT(4); 8053 + 8054 + if (info->rep_prefix) 8055 + *exit_qualification |= BIT(5); 8056 + 8057 + if (imm) 8058 + *exit_qualification |= BIT(6); 8059 + 8029 8060 return nested_vmx_check_io_bitmaps(vcpu, port, size); 8030 8061 } 8031 8062 ··· 8059 8042 struct x86_exception *exception) 8060 8043 { 8061 8044 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 8045 + unsigned long exit_qualification = 0; 8046 + u32 vm_exit_reason; 8047 + u64 exit_insn_len; 8062 8048 8063 8049 switch (info->intercept) { 8064 8050 case x86_intercept_rdpid: ··· 8082 8062 case x86_intercept_ins: 8083 8063 case x86_intercept_out: 8084 8064 
case x86_intercept_outs: 8085 - if (!vmx_is_io_intercepted(vcpu, info)) 8065 + if (!vmx_is_io_intercepted(vcpu, info, &exit_qualification)) 8086 8066 return X86EMUL_CONTINUE; 8067 + 8068 + vm_exit_reason = EXIT_REASON_IO_INSTRUCTION; 8087 8069 break; 8088 8070 8089 8071 case x86_intercept_lgdt: ··· 8098 8076 case x86_intercept_str: 8099 8077 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC)) 8100 8078 return X86EMUL_CONTINUE; 8079 + 8080 + if (info->intercept == x86_intercept_lldt || 8081 + info->intercept == x86_intercept_ltr || 8082 + info->intercept == x86_intercept_sldt || 8083 + info->intercept == x86_intercept_str) 8084 + vm_exit_reason = EXIT_REASON_LDTR_TR; 8085 + else 8086 + vm_exit_reason = EXIT_REASON_GDTR_IDTR; 8087 + /* 8088 + * FIXME: Decode the ModR/M to generate the correct exit 8089 + * qualification for memory operands. 8090 + */ 8101 8091 break; 8102 8092 8103 8093 case x86_intercept_hlt: 8104 8094 if (!nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING)) 8105 8095 return X86EMUL_CONTINUE; 8096 + 8097 + vm_exit_reason = EXIT_REASON_HLT; 8106 8098 break; 8107 8099 8108 8100 case x86_intercept_pause: ··· 8132 8096 !nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING)) 8133 8097 return X86EMUL_CONTINUE; 8134 8098 8099 + vm_exit_reason = EXIT_REASON_PAUSE_INSTRUCTION; 8135 8100 break; 8136 8101 8137 8102 /* TODO: check more intercepts... */ 8138 8103 default: 8139 - break; 8104 + return X86EMUL_UNHANDLEABLE; 8140 8105 } 8141 8106 8142 - /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. */ 8143 - return X86EMUL_UNHANDLEABLE; 8107 + exit_insn_len = abs_diff((s64)info->next_rip, (s64)info->rip); 8108 + if (!exit_insn_len || exit_insn_len > X86_MAX_INSTRUCTION_LENGTH) 8109 + return X86EMUL_UNHANDLEABLE; 8110 + 8111 + __nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification, 8112 + exit_insn_len); 8113 + return X86EMUL_INTERCEPTED; 8144 8114 } 8145 8115 8146 8116 #ifdef CONFIG_X86_64