Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'kvm-x86-svm-6.16' of https://github.com/kvm-x86/linux into HEAD

KVM SVM changes for 6.16:

- Wait for target vCPU to acknowledge KVM_REQ_UPDATE_PROTECTED_GUEST_STATE to
fix a race between AP destroy and VMRUN.

- Decrypt and dump the VMSA in dump_vmcb() if debugging is enabled for the VM.

- Add support for ALLOWED_SEV_FEATURES.

- Add #VMGEXIT to the set of handlers special cased for CONFIG_RETPOLINE=y.

- Treat DEBUGCTL[5:2] as reserved to pave the way for virtualizing features
that utilize those bits.

- Don't account temporary allocations in sev_send_update_data().

- Add support for KVM_CAP_X86_BUS_LOCK_EXIT on SVM, via Bus Lock Threshold.

+469 -31
+5
Documentation/virt/kvm/api.rst
··· 8001 8001 KVM_RUN_X86_BUS_LOCK in vcpu-run->flags, and conditionally sets the exit_reason 8002 8002 to KVM_EXIT_X86_BUS_LOCK. 8003 8003 8004 + Due to differences in the underlying hardware implementation, the vCPU's RIP at 8005 + the time of exit diverges between Intel and AMD. On Intel hosts, RIP points at 8006 + the next instruction, i.e. the exit is trap-like. On AMD hosts, RIP points at 8007 + the offending instruction, i.e. the exit is fault-like. 8008 + 8004 8009 Note! Detected bus locks may be coincident with other exits to userspace, i.e. 8005 8010 KVM_RUN_X86_BUS_LOCK should be checked regardless of the primary exit reason if 8006 8011 userspace wants to take action on all detected bus locks.
+2
arch/x86/include/asm/cpufeatures.h
··· 379 379 #define X86_FEATURE_V_SPEC_CTRL (15*32+20) /* "v_spec_ctrl" Virtual SPEC_CTRL */ 380 380 #define X86_FEATURE_VNMI (15*32+25) /* "vnmi" Virtual NMI */ 381 381 #define X86_FEATURE_SVME_ADDR_CHK (15*32+28) /* SVME addr check */ 382 + #define X86_FEATURE_BUS_LOCK_THRESHOLD (15*32+29) /* Bus lock threshold */ 382 383 #define X86_FEATURE_IDLE_HLT (15*32+30) /* IDLE HLT intercept */ 383 384 384 385 /* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */ ··· 448 447 #define X86_FEATURE_DEBUG_SWAP (19*32+14) /* "debug_swap" SEV-ES full debug state swap support */ 449 448 #define X86_FEATURE_RMPREAD (19*32+21) /* RMPREAD instruction */ 450 449 #define X86_FEATURE_SEGMENTED_RMP (19*32+23) /* Segmented RMP support */ 450 + #define X86_FEATURE_ALLOWED_SEV_FEATURES (19*32+27) /* Allowed SEV Features */ 451 451 #define X86_FEATURE_SVSM (19*32+28) /* "svsm" SVSM present */ 452 452 #define X86_FEATURE_HV_INUSE_WR_ALLOWED (19*32+30) /* Allow Write to in-use hypervisor-owned pages */ 453 453
+3 -2
arch/x86/include/asm/kvm_host.h
··· 125 125 KVM_ARCH_REQ_FLAGS(31, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) 126 126 #define KVM_REQ_HV_TLB_FLUSH \ 127 127 KVM_ARCH_REQ_FLAGS(32, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) 128 - #define KVM_REQ_UPDATE_PROTECTED_GUEST_STATE KVM_ARCH_REQ(34) 128 + #define KVM_REQ_UPDATE_PROTECTED_GUEST_STATE \ 129 + KVM_ARCH_REQ_FLAGS(34, KVM_REQUEST_WAIT) 129 130 130 131 #define CR0_RESERVED_BITS \ 131 132 (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ ··· 412 411 }; 413 412 414 413 struct kvm_pio_request { 415 - unsigned long linear_rip; 416 414 unsigned long count; 417 415 int in; 418 416 int port; ··· 917 917 bool emulate_regs_need_sync_to_vcpu; 918 918 bool emulate_regs_need_sync_from_vcpu; 919 919 int (*complete_userspace_io)(struct kvm_vcpu *vcpu); 920 + unsigned long cui_linear_rip; 920 921 921 922 gpa_t time; 922 923 s8 pvclock_tsc_shift;
+9 -1
arch/x86/include/asm/svm.h
··· 116 116 INTERCEPT_INVPCID, 117 117 INTERCEPT_MCOMMIT, 118 118 INTERCEPT_TLBSYNC, 119 + INTERCEPT_BUSLOCK, 119 120 INTERCEPT_IDLE_HLT = 166, 120 121 }; 121 122 ··· 160 159 u64 avic_physical_id; /* Offset 0xf8 */ 161 160 u8 reserved_7[8]; 162 161 u64 vmsa_pa; /* Used for an SEV-ES guest */ 163 - u8 reserved_8[720]; 162 + u8 reserved_8[16]; 163 + u16 bus_lock_counter; /* Offset 0x120 */ 164 + u8 reserved_9[22]; 165 + u64 allowed_sev_features; /* Offset 0x138 */ 166 + u64 guest_sev_features; /* Offset 0x140 */ 167 + u8 reserved_10[664]; 164 168 /* 165 169 * Offset 0x3e0, 32 bytes reserved 166 170 * for use by hypervisor/software. ··· 296 290 #define SVM_SEV_FEAT_RESTRICTED_INJECTION BIT(3) 297 291 #define SVM_SEV_FEAT_ALTERNATE_INJECTION BIT(4) 298 292 #define SVM_SEV_FEAT_DEBUG_SWAP BIT(5) 293 + 294 + #define VMCB_ALLOWED_SEV_FEATURES_VALID BIT_ULL(63) 299 295 300 296 struct vmcb_seg { 301 297 u16 selector;
+2
arch/x86/include/uapi/asm/svm.h
··· 95 95 #define SVM_EXIT_CR14_WRITE_TRAP 0x09e 96 96 #define SVM_EXIT_CR15_WRITE_TRAP 0x09f 97 97 #define SVM_EXIT_INVPCID 0x0a2 98 + #define SVM_EXIT_BUS_LOCK 0x0a5 98 99 #define SVM_EXIT_IDLE_HLT 0x0a6 99 100 #define SVM_EXIT_NPF 0x400 100 101 #define SVM_EXIT_AVIC_INCOMPLETE_IPI 0x401 ··· 226 225 { SVM_EXIT_CR4_WRITE_TRAP, "write_cr4_trap" }, \ 227 226 { SVM_EXIT_CR8_WRITE_TRAP, "write_cr8_trap" }, \ 228 227 { SVM_EXIT_INVPCID, "invpcid" }, \ 228 + { SVM_EXIT_BUS_LOCK, "buslock" }, \ 229 229 { SVM_EXIT_IDLE_HLT, "idle-halt" }, \ 230 230 { SVM_EXIT_NPF, "npf" }, \ 231 231 { SVM_EXIT_AVIC_INCOMPLETE_IPI, "avic_incomplete_ipi" }, \
+34
arch/x86/kvm/svm/nested.c
··· 678 678 vmcb02->control.iopm_base_pa = vmcb01->control.iopm_base_pa; 679 679 vmcb02->control.msrpm_base_pa = vmcb01->control.msrpm_base_pa; 680 680 681 + /* 682 + * Stash vmcb02's counter if the guest hasn't moved past the guilty 683 + * instruction; otherwise, reset the counter to '0'. 684 + * 685 + * In order to detect if L2 has made forward progress or not, track the 686 + * RIP at which a bus lock has occurred on a per-vmcb12 basis. If RIP 687 + * has changed, the guest has clearly made forward progress, bus_lock_counter 688 + * still remained '1', so reset bus_lock_counter to '0'. E.g. in the 689 + * scenario where a bus lock happened in L1 before VMRUN, the bus lock 690 + * firmly happened on an instruction in the past. Even if vmcb01's 691 + * counter is still '1', (because the guilty instruction got patched), 692 + * the vCPU has clearly made forward progress and so KVM should reset 693 + * vmcb02's counter to '0'. 694 + * 695 + * If the RIP hasn't changed, stash the bus lock counter at nested VMRUN 696 + * to prevent the same guilty instruction from triggering a VM-Exit. E.g. 697 + * if userspace rate-limits the vCPU, then it's entirely possible that 698 + * L1's tick interrupt is pending by the time userspace re-runs the 699 + * vCPU. If KVM unconditionally clears the counter on VMRUN, then when 700 + * L1 re-enters L2, the same instruction will trigger a VM-Exit and the 701 + * entire cycle starts over. 702 + */ 703 + if (vmcb02->save.rip && (svm->nested.ctl.bus_lock_rip == vmcb02->save.rip)) 704 + vmcb02->control.bus_lock_counter = 1; 705 + else 706 + vmcb02->control.bus_lock_counter = 0; 707 + 681 708 /* Done at vmrun: asid. */ 682 709 683 710 /* Also overwritten later if necessary. */ ··· 1065 1038 vmcb_mark_dirty(vmcb01, VMCB_INTERCEPTS); 1066 1039 1067 1040 } 1041 + 1042 + /* 1043 + * Invalidate bus_lock_rip unless KVM is still waiting for the guest 1044 + * to make forward progress before re-enabling bus lock detection. 
1045 + */ 1046 + if (!vmcb02->control.bus_lock_counter) 1047 + svm->nested.ctl.bus_lock_rip = INVALID_GPA; 1068 1048 1069 1049 nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr); 1070 1050
+107 -6
arch/x86/kvm/svm/sev.c
··· 560 560 if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params))) 561 561 return -EFAULT; 562 562 563 + sev->policy = params.policy; 564 + 563 565 memset(&start, 0, sizeof(start)); 564 566 565 567 dh_blob = NULL; ··· 1594 1592 1595 1593 /* allocate memory for header and transport buffer */ 1596 1594 ret = -ENOMEM; 1597 - hdr = kzalloc(params.hdr_len, GFP_KERNEL_ACCOUNT); 1595 + hdr = kzalloc(params.hdr_len, GFP_KERNEL); 1598 1596 if (!hdr) 1599 1597 goto e_unpin; 1600 1598 1601 - trans_data = kzalloc(params.trans_len, GFP_KERNEL_ACCOUNT); 1599 + trans_data = kzalloc(params.trans_len, GFP_KERNEL); 1602 1600 if (!trans_data) 1603 1601 goto e_free_hdr; 1604 1602 ··· 2200 2198 2201 2199 if (params.policy & SNP_POLICY_MASK_SINGLE_SOCKET) 2202 2200 return -EINVAL; 2201 + 2202 + sev->policy = params.policy; 2203 2203 2204 2204 sev->snp_context = snp_context_create(kvm, argp); 2205 2205 if (!sev->snp_context) ··· 3998 3994 * Unless Creation is deferred until INIT, signal the vCPU to update 3999 3995 * its state. 
4000 3996 */ 4001 - if (request != SVM_VMGEXIT_AP_CREATE_ON_INIT) { 4002 - kvm_make_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, target_vcpu); 4003 - kvm_vcpu_kick(target_vcpu); 4004 - } 3997 + if (request != SVM_VMGEXIT_AP_CREATE_ON_INIT) 3998 + kvm_make_request_and_kick(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, target_vcpu); 4005 3999 4006 4000 return 0; 4007 4001 } ··· 4457 4455 4458 4456 static void sev_es_init_vmcb(struct vcpu_svm *svm) 4459 4457 { 4458 + struct kvm_sev_info *sev = to_kvm_sev_info(svm->vcpu.kvm); 4460 4459 struct vmcb *vmcb = svm->vmcb01.ptr; 4461 4460 struct kvm_vcpu *vcpu = &svm->vcpu; 4462 4461 ··· 4472 4469 */ 4473 4470 if (svm->sev_es.vmsa && !svm->sev_es.snp_has_guest_vmsa) 4474 4471 svm->vmcb->control.vmsa_pa = __pa(svm->sev_es.vmsa); 4472 + 4473 + if (cpu_feature_enabled(X86_FEATURE_ALLOWED_SEV_FEATURES)) 4474 + svm->vmcb->control.allowed_sev_features = sev->vmsa_features | 4475 + VMCB_ALLOWED_SEV_FEATURES_VALID; 4475 4476 4476 4477 /* Can't intercept CR register access, HV can't modify CR registers */ 4477 4478 svm_clr_intercept(svm, INTERCEPT_CR0_READ); ··· 4936 4929 return PG_LEVEL_4K; 4937 4930 4938 4931 return level; 4932 + } 4933 + 4934 + struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu) 4935 + { 4936 + struct vcpu_svm *svm = to_svm(vcpu); 4937 + struct vmcb_save_area *vmsa; 4938 + struct kvm_sev_info *sev; 4939 + int error = 0; 4940 + int ret; 4941 + 4942 + if (!sev_es_guest(vcpu->kvm)) 4943 + return NULL; 4944 + 4945 + /* 4946 + * If the VMSA has not yet been encrypted, return a pointer to the 4947 + * current un-encrypted VMSA. 
4948 + */ 4949 + if (!vcpu->arch.guest_state_protected) 4950 + return (struct vmcb_save_area *)svm->sev_es.vmsa; 4951 + 4952 + sev = to_kvm_sev_info(vcpu->kvm); 4953 + 4954 + /* Check if the SEV policy allows debugging */ 4955 + if (sev_snp_guest(vcpu->kvm)) { 4956 + if (!(sev->policy & SNP_POLICY_DEBUG)) 4957 + return NULL; 4958 + } else { 4959 + if (sev->policy & SEV_POLICY_NODBG) 4960 + return NULL; 4961 + } 4962 + 4963 + if (sev_snp_guest(vcpu->kvm)) { 4964 + struct sev_data_snp_dbg dbg = {0}; 4965 + 4966 + vmsa = snp_alloc_firmware_page(__GFP_ZERO); 4967 + if (!vmsa) 4968 + return NULL; 4969 + 4970 + dbg.gctx_paddr = __psp_pa(sev->snp_context); 4971 + dbg.src_addr = svm->vmcb->control.vmsa_pa; 4972 + dbg.dst_addr = __psp_pa(vmsa); 4973 + 4974 + ret = sev_do_cmd(SEV_CMD_SNP_DBG_DECRYPT, &dbg, &error); 4975 + 4976 + /* 4977 + * Return the target page to a hypervisor page no matter what. 4978 + * If this fails, the page can't be used, so leak it and don't 4979 + * try to use it. 4980 + */ 4981 + if (snp_page_reclaim(vcpu->kvm, PHYS_PFN(__pa(vmsa)))) 4982 + return NULL; 4983 + 4984 + if (ret) { 4985 + pr_err("SEV: SNP_DBG_DECRYPT failed ret=%d, fw_error=%d (%#x)\n", 4986 + ret, error, error); 4987 + free_page((unsigned long)vmsa); 4988 + 4989 + return NULL; 4990 + } 4991 + } else { 4992 + struct sev_data_dbg dbg = {0}; 4993 + struct page *vmsa_page; 4994 + 4995 + vmsa_page = alloc_page(GFP_KERNEL); 4996 + if (!vmsa_page) 4997 + return NULL; 4998 + 4999 + vmsa = page_address(vmsa_page); 5000 + 5001 + dbg.handle = sev->handle; 5002 + dbg.src_addr = svm->vmcb->control.vmsa_pa; 5003 + dbg.dst_addr = __psp_pa(vmsa); 5004 + dbg.len = PAGE_SIZE; 5005 + 5006 + ret = sev_do_cmd(SEV_CMD_DBG_DECRYPT, &dbg, &error); 5007 + if (ret) { 5008 + pr_err("SEV: SEV_CMD_DBG_DECRYPT failed ret=%d, fw_error=%d (0x%x)\n", 5009 + ret, error, error); 5010 + __free_page(vmsa_page); 5011 + 5012 + return NULL; 5013 + } 5014 + } 5015 + 5016 + return vmsa; 5017 + } 5018 + 5019 + void 
sev_free_decrypted_vmsa(struct kvm_vcpu *vcpu, struct vmcb_save_area *vmsa) 5020 + { 5021 + /* If the VMSA has not yet been encrypted, nothing was allocated */ 5022 + if (!vcpu->arch.guest_state_protected || !vmsa) 5023 + return; 5024 + 5025 + free_page((unsigned long)vmsa); 4939 5026 }
+122 -13
arch/x86/kvm/svm/svm.c
··· 29 29 #include <linux/cc_platform.h> 30 30 #include <linux/smp.h> 31 31 #include <linux/string_choices.h> 32 + #include <linux/mutex.h> 32 33 33 34 #include <asm/apic.h> 34 35 #include <asm/perf_event.h> ··· 251 250 static unsigned long iopm_base; 252 251 253 252 DEFINE_PER_CPU(struct svm_cpu_data, svm_data); 253 + 254 + static DEFINE_MUTEX(vmcb_dump_mutex); 254 255 255 256 /* 256 257 * Only MSR_TSC_AUX is switched via the user return hook. EFER is switched via ··· 1379 1376 svm_clr_intercept(svm, INTERCEPT_CLGI); 1380 1377 svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK; 1381 1378 } 1379 + 1380 + if (vcpu->kvm->arch.bus_lock_detection_enabled) 1381 + svm_set_intercept(svm, INTERCEPT_BUSLOCK); 1382 1382 1383 1383 if (sev_guest(vcpu->kvm)) 1384 1384 sev_init_vmcb(svm); ··· 3217 3211 } 3218 3212 3219 3213 /* 3220 - * AMD changed the architectural behavior of bits 5:2. On CPUs 3221 - * without BusLockTrap, bits 5:2 control "external pins", but 3222 - * on CPUs that support BusLockDetect, bit 2 enables BusLockTrap 3223 - * and bits 5:3 are reserved-to-zero. Sadly, old KVM allowed 3224 - * the guest to set bits 5:2 despite not actually virtualizing 3225 - * Performance-Monitoring/Breakpoint external pins. Drop bits 3226 - * 5:2 for backwards compatibility. 3227 - */ 3228 - data &= ~GENMASK(5, 2); 3229 - 3230 - /* 3231 3214 * Suppress BTF as KVM doesn't virtualize BTF, but there's no 3232 3215 * way to communicate lack of support to the guest. 3233 3216 */ ··· 3346 3351 return kvm_handle_invpcid(vcpu, type, gva); 3347 3352 } 3348 3353 3354 + static inline int complete_userspace_buslock(struct kvm_vcpu *vcpu) 3355 + { 3356 + struct vcpu_svm *svm = to_svm(vcpu); 3357 + 3358 + /* 3359 + * If userspace has NOT changed RIP, then KVM's ABI is to let the guest 3360 + * execute the bus-locking instruction. Set the bus lock counter to '1' 3361 + * to effectively step past the bus lock. 
3362 + */ 3363 + if (kvm_is_linear_rip(vcpu, vcpu->arch.cui_linear_rip)) 3364 + svm->vmcb->control.bus_lock_counter = 1; 3365 + 3366 + return 1; 3367 + } 3368 + 3369 + static int bus_lock_exit(struct kvm_vcpu *vcpu) 3370 + { 3371 + struct vcpu_svm *svm = to_svm(vcpu); 3372 + 3373 + vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK; 3374 + vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK; 3375 + 3376 + vcpu->arch.cui_linear_rip = kvm_get_linear_rip(vcpu); 3377 + vcpu->arch.complete_userspace_io = complete_userspace_buslock; 3378 + 3379 + if (is_guest_mode(vcpu)) 3380 + svm->nested.ctl.bus_lock_rip = vcpu->arch.cui_linear_rip; 3381 + 3382 + return 0; 3383 + } 3384 + 3349 3385 static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = { 3350 3386 [SVM_EXIT_READ_CR0] = cr_interception, 3351 3387 [SVM_EXIT_READ_CR3] = cr_interception, ··· 3446 3420 [SVM_EXIT_INVPCID] = invpcid_interception, 3447 3421 [SVM_EXIT_IDLE_HLT] = kvm_emulate_halt, 3448 3422 [SVM_EXIT_NPF] = npf_interception, 3423 + [SVM_EXIT_BUS_LOCK] = bus_lock_exit, 3449 3424 [SVM_EXIT_RSM] = rsm_interception, 3450 3425 [SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception, 3451 3426 [SVM_EXIT_AVIC_UNACCELERATED_ACCESS] = avic_unaccelerated_access_interception, ··· 3461 3434 struct vmcb_control_area *control = &svm->vmcb->control; 3462 3435 struct vmcb_save_area *save = &svm->vmcb->save; 3463 3436 struct vmcb_save_area *save01 = &svm->vmcb01.ptr->save; 3437 + char *vm_type; 3464 3438 3465 3439 if (!dump_invalid_vmcb) { 3466 3440 pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n"); 3467 3441 return; 3468 3442 } 3469 3443 3470 - pr_err("VMCB %p, last attempted VMRUN on CPU %d\n", 3471 - svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu); 3444 + guard(mutex)(&vmcb_dump_mutex); 3445 + 3446 + vm_type = sev_snp_guest(vcpu->kvm) ? "SEV-SNP" : 3447 + sev_es_guest(vcpu->kvm) ? "SEV-ES" : 3448 + sev_guest(vcpu->kvm) ? 
"SEV" : "SVM"; 3449 + 3450 + pr_err("%s vCPU%u VMCB %p, last attempted VMRUN on CPU %d\n", 3451 + vm_type, vcpu->vcpu_id, svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu); 3472 3452 pr_err("VMCB Control Area:\n"); 3473 3453 pr_err("%-20s%04x\n", "cr_read:", control->intercepts[INTERCEPT_CR] & 0xffff); 3474 3454 pr_err("%-20s%04x\n", "cr_write:", control->intercepts[INTERCEPT_CR] >> 16); ··· 3513 3479 pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id); 3514 3480 pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id); 3515 3481 pr_err("%-20s%016llx\n", "vmsa_pa:", control->vmsa_pa); 3482 + pr_err("%-20s%016llx\n", "allowed_sev_features:", control->allowed_sev_features); 3483 + pr_err("%-20s%016llx\n", "guest_sev_features:", control->guest_sev_features); 3484 + 3485 + if (sev_es_guest(vcpu->kvm)) { 3486 + save = sev_decrypt_vmsa(vcpu); 3487 + if (!save) 3488 + goto no_vmsa; 3489 + 3490 + save01 = save; 3491 + } 3492 + 3516 3493 pr_err("VMCB State Save Area:\n"); 3517 3494 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 3518 3495 "es:", ··· 3594 3549 pr_err("%-15s %016llx %-13s %016llx\n", 3595 3550 "excp_from:", save->last_excp_from, 3596 3551 "excp_to:", save->last_excp_to); 3552 + 3553 + if (sev_es_guest(vcpu->kvm)) { 3554 + struct sev_es_save_area *vmsa = (struct sev_es_save_area *)save; 3555 + 3556 + pr_err("%-15s %016llx\n", 3557 + "sev_features", vmsa->sev_features); 3558 + 3559 + pr_err("%-15s %016llx %-13s %016llx\n", 3560 + "rax:", vmsa->rax, "rbx:", vmsa->rbx); 3561 + pr_err("%-15s %016llx %-13s %016llx\n", 3562 + "rcx:", vmsa->rcx, "rdx:", vmsa->rdx); 3563 + pr_err("%-15s %016llx %-13s %016llx\n", 3564 + "rsi:", vmsa->rsi, "rdi:", vmsa->rdi); 3565 + pr_err("%-15s %016llx %-13s %016llx\n", 3566 + "rbp:", vmsa->rbp, "rsp:", vmsa->rsp); 3567 + pr_err("%-15s %016llx %-13s %016llx\n", 3568 + "r8:", vmsa->r8, "r9:", vmsa->r9); 3569 + pr_err("%-15s %016llx %-13s %016llx\n", 3570 + "r10:", vmsa->r10, "r11:", vmsa->r11); 
3571 + pr_err("%-15s %016llx %-13s %016llx\n", 3572 + "r12:", vmsa->r12, "r13:", vmsa->r13); 3573 + pr_err("%-15s %016llx %-13s %016llx\n", 3574 + "r14:", vmsa->r14, "r15:", vmsa->r15); 3575 + pr_err("%-15s %016llx %-13s %016llx\n", 3576 + "xcr0:", vmsa->xcr0, "xss:", vmsa->xss); 3577 + } else { 3578 + pr_err("%-15s %016llx %-13s %016lx\n", 3579 + "rax:", save->rax, "rbx:", 3580 + vcpu->arch.regs[VCPU_REGS_RBX]); 3581 + pr_err("%-15s %016lx %-13s %016lx\n", 3582 + "rcx:", vcpu->arch.regs[VCPU_REGS_RCX], 3583 + "rdx:", vcpu->arch.regs[VCPU_REGS_RDX]); 3584 + pr_err("%-15s %016lx %-13s %016lx\n", 3585 + "rsi:", vcpu->arch.regs[VCPU_REGS_RSI], 3586 + "rdi:", vcpu->arch.regs[VCPU_REGS_RDI]); 3587 + pr_err("%-15s %016lx %-13s %016llx\n", 3588 + "rbp:", vcpu->arch.regs[VCPU_REGS_RBP], 3589 + "rsp:", save->rsp); 3590 + #ifdef CONFIG_X86_64 3591 + pr_err("%-15s %016lx %-13s %016lx\n", 3592 + "r8:", vcpu->arch.regs[VCPU_REGS_R8], 3593 + "r9:", vcpu->arch.regs[VCPU_REGS_R9]); 3594 + pr_err("%-15s %016lx %-13s %016lx\n", 3595 + "r10:", vcpu->arch.regs[VCPU_REGS_R10], 3596 + "r11:", vcpu->arch.regs[VCPU_REGS_R11]); 3597 + pr_err("%-15s %016lx %-13s %016lx\n", 3598 + "r12:", vcpu->arch.regs[VCPU_REGS_R12], 3599 + "r13:", vcpu->arch.regs[VCPU_REGS_R13]); 3600 + pr_err("%-15s %016lx %-13s %016lx\n", 3601 + "r14:", vcpu->arch.regs[VCPU_REGS_R14], 3602 + "r15:", vcpu->arch.regs[VCPU_REGS_R15]); 3603 + #endif 3604 + } 3605 + 3606 + no_vmsa: 3607 + if (sev_es_guest(vcpu->kvm)) 3608 + sev_free_decrypted_vmsa(vcpu, save); 3597 3609 } 3598 3610 3599 3611 static bool svm_check_exit_valid(u64 exit_code) ··· 3687 3585 return kvm_emulate_halt(vcpu); 3688 3586 else if (exit_code == SVM_EXIT_NPF) 3689 3587 return npf_interception(vcpu); 3588 + #ifdef CONFIG_KVM_AMD_SEV 3589 + else if (exit_code == SVM_EXIT_VMGEXIT) 3590 + return sev_handle_vmgexit(vcpu); 3591 + #endif 3690 3592 #endif 3691 3593 return svm_exit_handlers[exit_code](vcpu); 3692 3594 } ··· 5451 5345 /* Nested VM can receive 
#VMEXIT instead of triggering #GP */ 5452 5346 kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK); 5453 5347 } 5348 + 5349 + if (cpu_feature_enabled(X86_FEATURE_BUS_LOCK_THRESHOLD)) 5350 + kvm_caps.has_bus_lock_exit = true; 5454 5351 5455 5352 /* CPUID 0x80000008 */ 5456 5353 if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
+12
arch/x86/kvm/svm/svm.h
··· 98 98 unsigned int asid; /* ASID used for this guest */ 99 99 unsigned int handle; /* SEV firmware handle */ 100 100 int fd; /* SEV device fd */ 101 + unsigned long policy; 101 102 unsigned long pages_locked; /* Number of pages locked */ 102 103 struct list_head regions_list; /* List of registered regions */ 103 104 u64 ap_jump_table; /* SEV-ES AP Jump Table address */ ··· 114 113 void *guest_resp_buf; /* Bounce buffer for SNP Guest Request output */ 115 114 struct mutex guest_req_mutex; /* Must acquire before using bounce buffers */ 116 115 }; 116 + 117 + #define SEV_POLICY_NODBG BIT_ULL(0) 118 + #define SNP_POLICY_DEBUG BIT_ULL(19) 117 119 118 120 struct kvm_svm { 119 121 struct kvm kvm; ··· 173 169 u64 nested_cr3; 174 170 u64 virt_ext; 175 171 u32 clean; 172 + u64 bus_lock_rip; 176 173 union { 177 174 #if IS_ENABLED(CONFIG_HYPERV) || IS_ENABLED(CONFIG_KVM_HYPERV) 178 175 struct hv_vmcb_enlightenments hv_enlightenments; ··· 788 783 int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order); 789 784 void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end); 790 785 int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn); 786 + struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu); 787 + void sev_free_decrypted_vmsa(struct kvm_vcpu *vcpu, struct vmcb_save_area *vmsa); 791 788 #else 792 789 static inline struct page *snp_safe_alloc_page_node(int node, gfp_t gfp) 793 790 { ··· 821 814 return 0; 822 815 } 823 816 817 + static inline struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu) 818 + { 819 + return NULL; 820 + } 821 + static inline void sev_free_decrypted_vmsa(struct kvm_vcpu *vcpu, struct vmcb_save_area *vmsa) {} 824 822 #endif 825 823 826 824 /* vmenter.S */
+4 -4
arch/x86/kvm/x86.c
··· 9382 9382 { 9383 9383 vcpu->arch.pio.count = 0; 9384 9384 9385 - if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.pio.linear_rip))) 9385 + if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.cui_linear_rip))) 9386 9386 return 1; 9387 9387 9388 9388 return kvm_skip_emulated_instruction(vcpu); ··· 9407 9407 complete_fast_pio_out_port_0x7e; 9408 9408 kvm_skip_emulated_instruction(vcpu); 9409 9409 } else { 9410 - vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu); 9410 + vcpu->arch.cui_linear_rip = kvm_get_linear_rip(vcpu); 9411 9411 vcpu->arch.complete_userspace_io = complete_fast_pio_out; 9412 9412 } 9413 9413 return 0; ··· 9420 9420 /* We should only ever be called with arch.pio.count equal to 1 */ 9421 9421 BUG_ON(vcpu->arch.pio.count != 1); 9422 9422 9423 - if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.pio.linear_rip))) { 9423 + if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.cui_linear_rip))) { 9424 9424 vcpu->arch.pio.count = 0; 9425 9425 return 1; 9426 9426 } ··· 9449 9449 return ret; 9450 9450 } 9451 9451 9452 - vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu); 9452 + vcpu->arch.cui_linear_rip = kvm_get_linear_rip(vcpu); 9453 9453 vcpu->arch.complete_userspace_io = complete_fast_pio_in; 9454 9454 9455 9455 return 0;
+18 -1
include/linux/kvm_host.h
··· 1505 1505 void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu); 1506 1506 void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu); 1507 1507 bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu); 1508 - void kvm_vcpu_kick(struct kvm_vcpu *vcpu); 1508 + 1509 + #ifndef CONFIG_S390 1510 + void __kvm_vcpu_kick(struct kvm_vcpu *vcpu, bool wait); 1511 + 1512 + static inline void kvm_vcpu_kick(struct kvm_vcpu *vcpu) 1513 + { 1514 + __kvm_vcpu_kick(vcpu, false); 1515 + } 1516 + #endif 1517 + 1509 1518 int kvm_vcpu_yield_to(struct kvm_vcpu *target); 1510 1519 void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu, bool yield_to_kernel_mode); 1511 1520 ··· 2261 2252 2262 2253 __kvm_make_request(req, vcpu); 2263 2254 } 2255 + 2256 + #ifndef CONFIG_S390 2257 + static inline void kvm_make_request_and_kick(int req, struct kvm_vcpu *vcpu) 2258 + { 2259 + kvm_make_request(req, vcpu); 2260 + __kvm_vcpu_kick(vcpu, req & KVM_REQUEST_WAIT); 2261 + } 2262 + #endif 2264 2263 2265 2264 static inline bool kvm_request_pending(struct kvm_vcpu *vcpu) 2266 2265 {
+1
tools/testing/selftests/kvm/Makefile.kvm
··· 84 84 TEST_GEN_PROGS_x86 += x86/hyperv_tlb_flush 85 85 TEST_GEN_PROGS_x86 += x86/kvm_clock_test 86 86 TEST_GEN_PROGS_x86 += x86/kvm_pv_test 87 + TEST_GEN_PROGS_x86 += x86/kvm_buslock_test 87 88 TEST_GEN_PROGS_x86 += x86/monitor_mwait_test 88 89 TEST_GEN_PROGS_x86 += x86/nested_emulation_test 89 90 TEST_GEN_PROGS_x86 += x86/nested_exceptions_test
+135
tools/testing/selftests/kvm/x86/kvm_buslock_test.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + /* 3 + * Copyright (C) 2024 Advanced Micro Devices, Inc. 4 + */ 5 + #include <linux/atomic.h> 6 + 7 + #include "kvm_util.h" 8 + #include "processor.h" 9 + #include "svm_util.h" 10 + #include "vmx.h" 11 + #include "test_util.h" 12 + 13 + #define NR_BUS_LOCKS_PER_LEVEL 100 14 + #define CACHE_LINE_SIZE 64 15 + 16 + /* 17 + * To generate a bus lock, carve out a buffer that precisely occupies two cache 18 + * lines and perform an atomic access that splits the two lines. 19 + */ 20 + static u8 buffer[CACHE_LINE_SIZE * 2] __aligned(CACHE_LINE_SIZE); 21 + static atomic_t *val = (void *)&buffer[CACHE_LINE_SIZE - (sizeof(*val) / 2)]; 22 + 23 + static void guest_generate_buslocks(void) 24 + { 25 + for (int i = 0; i < NR_BUS_LOCKS_PER_LEVEL; i++) 26 + atomic_inc(val); 27 + } 28 + 29 + #define L2_GUEST_STACK_SIZE 64 30 + 31 + static void l2_guest_code(void) 32 + { 33 + guest_generate_buslocks(); 34 + GUEST_DONE(); 35 + } 36 + 37 + static void l1_svm_code(struct svm_test_data *svm) 38 + { 39 + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; 40 + struct vmcb *vmcb = svm->vmcb; 41 + 42 + generic_svm_setup(svm, l2_guest_code, &l2_guest_stack[L2_GUEST_STACK_SIZE]); 43 + run_guest(vmcb, svm->vmcb_gpa); 44 + } 45 + 46 + static void l1_vmx_code(struct vmx_pages *vmx) 47 + { 48 + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; 49 + 50 + GUEST_ASSERT_EQ(prepare_for_vmx_operation(vmx), true); 51 + GUEST_ASSERT_EQ(load_vmcs(vmx), true); 52 + 53 + prepare_vmcs(vmx, NULL, &l2_guest_stack[L2_GUEST_STACK_SIZE]); 54 + 55 + GUEST_ASSERT(!vmwrite(GUEST_RIP, (u64)l2_guest_code)); 56 + GUEST_ASSERT(!vmlaunch()); 57 + } 58 + 59 + static void guest_code(void *test_data) 60 + { 61 + guest_generate_buslocks(); 62 + 63 + if (this_cpu_has(X86_FEATURE_SVM)) 64 + l1_svm_code(test_data); 65 + else if (this_cpu_has(X86_FEATURE_VMX)) 66 + l1_vmx_code(test_data); 67 + else 68 + GUEST_DONE(); 69 + 70 + TEST_FAIL("L2 should have signaled 'done'"); 71 + } 
72 + 73 + int main(int argc, char *argv[]) 74 + { 75 + const bool has_nested = kvm_cpu_has(X86_FEATURE_SVM) || kvm_cpu_has(X86_FEATURE_VMX); 76 + vm_vaddr_t nested_test_data_gva; 77 + struct kvm_vcpu *vcpu; 78 + struct kvm_run *run; 79 + struct kvm_vm *vm; 80 + int i, bus_locks = 0; 81 + 82 + TEST_REQUIRE(kvm_has_cap(KVM_CAP_X86_BUS_LOCK_EXIT)); 83 + 84 + vm = vm_create(1); 85 + vm_enable_cap(vm, KVM_CAP_X86_BUS_LOCK_EXIT, KVM_BUS_LOCK_DETECTION_EXIT); 86 + vcpu = vm_vcpu_add(vm, 0, guest_code); 87 + 88 + if (kvm_cpu_has(X86_FEATURE_SVM)) 89 + vcpu_alloc_svm(vm, &nested_test_data_gva); 90 + else 91 + vcpu_alloc_vmx(vm, &nested_test_data_gva); 92 + 93 + vcpu_args_set(vcpu, 1, nested_test_data_gva); 94 + 95 + run = vcpu->run; 96 + 97 + for (i = 0; i <= NR_BUS_LOCKS_PER_LEVEL * (1 + has_nested); i++) { 98 + struct ucall uc; 99 + 100 + vcpu_run(vcpu); 101 + 102 + if (run->exit_reason == KVM_EXIT_IO) { 103 + switch (get_ucall(vcpu, &uc)) { 104 + case UCALL_ABORT: 105 + REPORT_GUEST_ASSERT(uc); 106 + goto done; 107 + case UCALL_SYNC: 108 + continue; 109 + case UCALL_DONE: 110 + goto done; 111 + default: 112 + TEST_FAIL("Unknown ucall 0x%lx.", uc.cmd); 113 + } 114 + } 115 + 116 + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_X86_BUS_LOCK); 117 + 118 + /* 119 + * Verify the counter is actually getting incremented, e.g. that 120 + * KVM isn't skipping the instruction. On Intel, the exit is 121 + * trap-like, i.e. the counter should already have been 122 + * incremented. On AMD, it's fault-like, i.e. the counter will 123 + * be incremented when the guest re-executes the instruction. 124 + */ 125 + sync_global_from_guest(vm, *val); 126 + TEST_ASSERT_EQ(atomic_read(val), bus_locks + host_cpu_is_intel); 127 + 128 + bus_locks++; 129 + } 130 + TEST_FAIL("Didn't receive UCALL_DONE, took %u bus lock exits\n", bus_locks); 131 + done: 132 + TEST_ASSERT_EQ(i, bus_locks); 133 + kvm_vm_free(vm); 134 + return 0; 135 + }
+15 -4
virt/kvm/kvm_main.c
··· 3739 3739 /* 3740 3740 * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode. 3741 3741 */ 3742 - void kvm_vcpu_kick(struct kvm_vcpu *vcpu) 3742 + void __kvm_vcpu_kick(struct kvm_vcpu *vcpu, bool wait) 3743 3743 { 3744 3744 int me, cpu; 3745 3745 ··· 3768 3768 */ 3769 3769 if (kvm_arch_vcpu_should_kick(vcpu)) { 3770 3770 cpu = READ_ONCE(vcpu->cpu); 3771 - if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) 3772 - smp_send_reschedule(cpu); 3771 + if (cpu != me && (unsigned int)cpu < nr_cpu_ids && cpu_online(cpu)) { 3772 + /* 3773 + * Use a reschedule IPI to kick the vCPU if the caller 3774 + * doesn't need to wait for a response, as KVM allows 3775 + * kicking vCPUs while IRQs are disabled, but using the 3776 + * SMP function call framework with IRQs disabled can 3777 + * deadlock due to taking cross-CPU locks. 3778 + */ 3779 + if (wait) 3780 + smp_call_function_single(cpu, ack_kick, NULL, wait); 3781 + else 3782 + smp_send_reschedule(cpu); 3783 + } 3773 3784 } 3774 3785 out: 3775 3786 put_cpu(); 3776 3787 } 3777 - EXPORT_SYMBOL_GPL(kvm_vcpu_kick); 3788 + EXPORT_SYMBOL_GPL(__kvm_vcpu_kick); 3778 3789 #endif /* !CONFIG_S390 */ 3779 3790 3780 3791 int kvm_vcpu_yield_to(struct kvm_vcpu *target)