Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'kvm-x86-sev-6.17' of https://github.com/kvm-x86/linux into HEAD

KVM SEV cache maintenance changes for 6.17

- Drop a superfluous WBINVD (on all CPUs!) when destroying a VM.

- Use WBNOINVD instead of WBINVD when possible, for SEV cache maintenance,
e.g. to minimize collateral damage when reclaiming memory from an SEV guest.

- When reclaiming memory from an SEV guest, only do cache flushes on CPUs that
have ever run a vCPU for the guest, i.e. don't flush the caches for CPUs
that can't possibly have cache lines with dirty, encrypted data.

+84 -35
+82 -28
arch/x86/kvm/svm/sev.c
··· 117 117 */ 118 118 down_write(&sev_deactivate_lock); 119 119 120 + /* SNP firmware requires use of WBINVD for ASID recycling. */ 120 121 wbinvd_on_all_cpus(); 121 122 122 123 if (sev_snp_enabled) ··· 447 446 init_args.probe = false; 448 447 ret = sev_platform_init(&init_args); 449 448 if (ret) 450 - goto e_free; 449 + goto e_free_asid; 450 + 451 + if (!zalloc_cpumask_var(&sev->have_run_cpus, GFP_KERNEL_ACCOUNT)) { 452 + ret = -ENOMEM; 453 + goto e_free_asid; 454 + } 451 455 452 456 /* This needs to happen after SEV/SNP firmware initialization. */ 453 457 if (vm_type == KVM_X86_SNP_VM) { ··· 470 464 return 0; 471 465 472 466 e_free: 467 + free_cpumask_var(sev->have_run_cpus); 468 + e_free_asid: 473 469 argp->error = init_args.error; 474 470 sev_asid_free(sev); 475 471 sev->asid = 0; ··· 714 706 kunmap_local(page_virtual); 715 707 cond_resched(); 716 708 } 709 + } 710 + 711 + static void sev_writeback_caches(struct kvm *kvm) 712 + { 713 + /* 714 + * Note, the caller is responsible for ensuring correctness if the mask 715 + * can be modified, e.g. if a CPU could be doing VMRUN. 716 + */ 717 + if (cpumask_empty(to_kvm_sev_info(kvm)->have_run_cpus)) 718 + return; 719 + 720 + /* 721 + * Ensure that all dirty guest tagged cache entries are written back 722 + * before releasing the pages back to the system for use. CLFLUSH will 723 + * not do this without SME_COHERENT, and flushing many cache lines 724 + * individually is slower than blasting WBINVD for large VMs, so issue 725 + * WBNOINVD (or WBINVD if the "no invalidate" variant is unsupported) 726 + * on CPUs that have done VMRUN, i.e. may have dirtied data using the 727 + * VM's ASID. 728 + * 729 + * For simplicity, never remove CPUs from the bitmap. Ideally, KVM 730 + * would clear the mask when flushing caches, but doing so requires 731 + * serializing multiple calls and having responding CPUs (to the IPI) 732 + * mark themselves as still running if they are running (or about to 733 + * run) a vCPU for the VM. 
734 + */ 735 + wbnoinvd_on_cpus_mask(to_kvm_sev_info(kvm)->have_run_cpus); 717 736 } 718 737 719 738 static unsigned long get_num_contig_pages(unsigned long idx, ··· 2072 2037 if (ret) 2073 2038 goto out_source_vcpu; 2074 2039 2040 + /* 2041 + * Allocate a new have_run_cpus for the destination, i.e. don't copy 2042 + * the set of CPUs from the source. If a CPU was used to run a vCPU in 2043 + * the source VM but is never used for the destination VM, then the CPU 2044 + * can only have cached memory that was accessible to the source VM. 2045 + */ 2046 + if (!zalloc_cpumask_var(&dst_sev->have_run_cpus, GFP_KERNEL_ACCOUNT)) { 2047 + ret = -ENOMEM; 2048 + goto out_source_vcpu; 2049 + } 2050 + 2075 2051 sev_migrate_from(kvm, source_kvm); 2076 2052 kvm_vm_dead(source_kvm); 2077 2053 cg_cleanup_sev = src_sev; ··· 2740 2694 goto failed; 2741 2695 } 2742 2696 2743 - /* 2744 - * Ensure that all guest tagged cache entries are flushed before 2745 - * releasing the pages back to the system for use. CLFLUSH will 2746 - * not do this, so issue a WBINVD. 
2747 - */ 2748 - wbinvd_on_all_cpus(); 2697 + sev_writeback_caches(kvm); 2749 2698 2750 2699 __unregister_enc_region_locked(kvm, region); 2751 2700 ··· 2782 2741 goto e_unlock; 2783 2742 } 2784 2743 2744 + mirror_sev = to_kvm_sev_info(kvm); 2745 + if (!zalloc_cpumask_var(&mirror_sev->have_run_cpus, GFP_KERNEL_ACCOUNT)) { 2746 + ret = -ENOMEM; 2747 + goto e_unlock; 2748 + } 2749 + 2785 2750 /* 2786 2751 * The mirror kvm holds an enc_context_owner ref so its asid can't 2787 2752 * disappear until we're done with it 2788 2753 */ 2789 2754 source_sev = to_kvm_sev_info(source_kvm); 2790 2755 kvm_get_kvm(source_kvm); 2791 - mirror_sev = to_kvm_sev_info(kvm); 2792 2756 list_add_tail(&mirror_sev->mirror_entry, &source_sev->mirror_vms); 2793 2757 2794 2758 /* Set enc_context_owner and copy its encryption context over */ ··· 2855 2809 2856 2810 WARN_ON(!list_empty(&sev->mirror_vms)); 2857 2811 2858 - /* If this is a mirror_kvm release the enc_context_owner and skip sev cleanup */ 2812 + free_cpumask_var(sev->have_run_cpus); 2813 + 2814 + /* 2815 + * If this is a mirror VM, remove it from the owner's list of mirrors 2816 + * and skip ASID cleanup (the ASID is tied to the lifetime of the owner). 2817 + * Note, mirror VMs don't support registering encrypted regions. 2818 + */ 2859 2819 if (is_mirroring_enc_context(kvm)) { 2860 2820 struct kvm *owner_kvm = sev->enc_context_owner; 2861 2821 ··· 2872 2820 return; 2873 2821 } 2874 2822 2875 - /* 2876 - * Ensure that all guest tagged cache entries are flushed before 2877 - * releasing the pages back to the system for use. CLFLUSH will 2878 - * not do this, so issue a WBINVD. 2879 - */ 2880 - wbinvd_on_all_cpus(); 2881 2823 2882 2824 /* 2883 2825 * if userspace was terminated before unregistering the memory regions ··· 3141 3095 3142 3096 /* 3143 3097 * VM Page Flush takes a host virtual address and a guest ASID.
Fall 3144 - * back to WBINVD if this faults so as not to make any problems worse 3145 - * by leaving stale encrypted data in the cache. 3098 + * back to full writeback of caches if this faults so as not to make 3099 + * any problems worse by leaving stale encrypted data in the cache. 3146 3100 */ 3147 3101 if (WARN_ON_ONCE(wrmsrq_safe(MSR_AMD64_VM_PAGE_FLUSH, addr | asid))) 3148 - goto do_wbinvd; 3102 + goto do_sev_writeback_caches; 3149 3103 3150 3104 return; 3151 3105 3152 - do_wbinvd: 3153 - wbinvd_on_all_cpus(); 3106 + do_sev_writeback_caches: 3107 + sev_writeback_caches(vcpu->kvm); 3154 3108 } 3155 3109 3156 3110 void sev_guest_memory_reclaimed(struct kvm *kvm) 3157 3111 { 3158 3112 /* 3159 3113 * With SNP+gmem, private/encrypted memory is unreachable via the 3160 - * hva-based mmu notifiers, so these events are only actually 3161 - * pertaining to shared pages where there is no need to perform 3162 - * the WBINVD to flush associated caches. 3114 + * hva-based mmu notifiers, i.e. these events are explicitly scoped to 3115 + * shared pages, where there's no need to flush caches. 3163 3116 */ 3164 3117 if (!sev_guest(kvm) || sev_snp_guest(kvm)) 3165 3118 return; 3166 3119 3167 - wbinvd_on_all_cpus(); 3120 + sev_writeback_caches(kvm); 3168 3121 } 3169 3122 3170 3123 void sev_free_vcpu(struct kvm_vcpu *vcpu) ··· 3494 3449 */ 3495 3450 if (sev_es_guest(kvm) && !VALID_PAGE(svm->vmcb->control.vmsa_pa)) 3496 3451 return -EINVAL; 3452 + 3453 + /* 3454 + * To optimize cache flushes when memory is reclaimed from an SEV VM, 3455 + * track physical CPUs that enter the guest for SEV VMs and thus can 3456 + * have encrypted, dirty data in the cache, and flush caches only for 3457 + * CPUs that have entered the guest. 
3458 + */ 3459 + if (!cpumask_test_cpu(cpu, to_kvm_sev_info(kvm)->have_run_cpus)) 3460 + cpumask_set_cpu(cpu, to_kvm_sev_info(kvm)->have_run_cpus); 3497 3461 3498 3462 /* Assign the asid allocated with this SEV guest */ 3499 3463 svm->asid = asid; ··· 3936 3882 * From this point forward, the VMSA will always be a guest-mapped page 3937 3883 * rather than the initial one allocated by KVM in svm->sev_es.vmsa. In 3938 3884 * theory, svm->sev_es.vmsa could be free'd and cleaned up here, but 3939 - * that involves cleanups like wbinvd_on_all_cpus() which would ideally 3940 - * be handled during teardown rather than guest boot. Deferring that 3941 - * also allows the existing logic for SEV-ES VMSAs to be re-used with 3885 + * that involves cleanups like flushing caches, which would ideally be 3886 + * handled during teardown rather than guest boot. Deferring that also 3887 + * allows the existing logic for SEV-ES VMSAs to be re-used with 3942 3888 * minimal SNP-specific changes. 3943 3889 */ 3944 3890 svm->sev_es.snp_has_guest_vmsa = true; ··· 4929 4875 4930 4876 /* 4931 4877 * SEV-ES avoids host/guest cache coherency issues through 4932 - * WBINVD hooks issued via MMU notifiers during run-time, and 4878 + * WBNOINVD hooks issued via MMU notifiers during run-time, and 4933 4879 * KVM's VM destroy path at shutdown. Those MMU notifier events 4934 4880 * don't cover gmem since there is no requirement to map pages 4935 4881 * to a HVA in order to use them for a running guest. While the
+1
arch/x86/kvm/svm/svm.h
··· 110 110 void *guest_req_buf; /* Bounce buffer for SNP Guest Request input */ 111 111 void *guest_resp_buf; /* Bounce buffer for SNP Guest Request output */ 112 112 struct mutex guest_req_mutex; /* Must acquire before using bounce buffers */ 113 + cpumask_var_t have_run_cpus; /* CPUs that have done VMRUN for this VM. */ 113 114 }; 114 115 115 116 #define SEV_POLICY_NODBG BIT_ULL(0)
+1 -7
arch/x86/kvm/x86.c
··· 4994 4994 return r; 4995 4995 } 4996 4996 4997 - static void wbinvd_ipi(void *garbage) 4998 - { 4999 - wbinvd(); 5000 - } 5001 - 5002 4997 static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu) 5003 4998 { 5004 4999 return kvm_arch_has_noncoherent_dma(vcpu->kvm); ··· 5017 5022 if (kvm_x86_call(has_wbinvd_exit)()) 5018 5023 cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask); 5019 5024 else if (vcpu->cpu != -1 && vcpu->cpu != cpu) 5020 - smp_call_function_single(vcpu->cpu, 5021 - wbinvd_ipi, NULL, 1); 5025 + wbinvd_on_cpu(vcpu->cpu); 5022 5026 } 5023 5027 5024 5028 kvm_x86_call(vcpu_load)(vcpu, cpu);