Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull kvm fixes from Paolo Bonzini:
"KVM GUEST_MEMFD fixes for 6.8:

- Make KVM_MEM_GUEST_MEMFD mutually exclusive with KVM_MEM_READONLY
to avoid creating an inconsistent ABI (KVM_MEM_GUEST_MEMFD is not
writable from userspace, so there would be no way to write to a
read-only guest_memfd); see the userspace sketch after this list.

- Update documentation for KVM_SW_PROTECTED_VM to make it abundantly
clear that such VMs are purely for development and testing.

- Limit KVM_SW_PROTECTED_VM guests to the TDP MMU, as the long term
plan is to support confidential VMs with deterministic private
memory (SNP and TDX) only in the TDP MMU.

- Fix a bug in a GUEST_MEMFD dirty logging test that caused false
passes.

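To illustrate the first item, a minimal userspace sketch of the new
behavior (the ioctl and struct names are from the 6.8 uAPI headers;
the helper name, GPA and size values are illustrative, and error
handling is omitted):

  #include <assert.h>
  #include <errno.h>
  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  /* 'vm_fd' is assumed to come from KVM_CREATE_VM on /dev/kvm. */
  static void check_guest_memfd_is_not_readonly(int vm_fd)
  {
          struct kvm_create_guest_memfd gmem = { .size = 0x1000 };
          int gmem_fd = ioctl(vm_fd, KVM_CREATE_GUEST_MEMFD, &gmem);

          struct kvm_userspace_memory_region2 region = {
                  .slot = 0,
                  .flags = KVM_MEM_GUEST_MEMFD | KVM_MEM_READONLY,
                  .guest_phys_addr = 0,
                  .memory_size = 0x1000,
                  .guest_memfd = gmem_fd,
                  .guest_memfd_offset = 0,
          };

          /* Now rejected up front: guest_memfd memslots can't be read-only. */
          assert(ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION2, &region) < 0 &&
                 errno == EINVAL);
  }

The selftest change below exercises exactly this combination.
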
x86 fixes:

- Fix missing marking of a guest page as dirty when emulating an
atomic access.

- Check for mmu_notifier invalidation events before faulting in the
pfn, and before acquiring mmu_lock, to avoid unnecessary work and
lock contention with preemptible kernels (including
CONFIG_PREEMPT_DYNAMIC in non-preemptible mode).

- Disable AMD DebugSwap by default; it breaks VMSA signing and will
be re-enabled with a better VM creation API in 6.10 (a sketch for
checking the new default is included after this list).

- Do the cache flush of converted pages in svm_register_enc_region()
before dropping kvm->lock, to avoid a race with unregistering of
the same region and the consequent use-after-free issue"
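
The debug_swap module parameter itself is unchanged, so the new default is
visible through sysfs. A small sketch, assuming the usual
/sys/module/kvm_amd/parameters layout for kvm-amd module parameters:

  #include <stdio.h>

  /* The bool param reads back as 'N' with the new default, or 'Y' if
   * kvm_amd.debug_swap=1 was set at module load time. */
  int main(void)
  {
          FILE *f = fopen("/sys/module/kvm_amd/parameters/debug_swap", "r");
          int c = f ? fgetc(f) : '?';

          if (f)
                  fclose(f);
          printf("SEV-ES DebugSwap enabled: %c\n", c);
          return 0;
  }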

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm:
SEV: disable SEV-ES DebugSwap by default
KVM: x86/mmu: Retry fault before acquiring mmu_lock if mapping is changing
KVM: SVM: Flush pages under kvm->lock to fix UAF in svm_register_enc_region()
KVM: selftests: Add a testcase to verify GUEST_MEMFD and READONLY are exclusive
KVM: selftests: Create GUEST_MEMFD for relevant invalid flags testcases
KVM: x86/mmu: Restrict KVM_SW_PROTECTED_VM to the TDP MMU
KVM: x86: Update KVM_SW_PROTECTED_VM docs to make it clear they're a WIP
KVM: Make KVM_MEM_GUEST_MEMFD mutually exclusive with KVM_MEM_READONLY
KVM: x86: Mark target gfn of emulated atomic instruction as dirty

Changed files: +121 -16
Documentation/virt/kvm/api.rst (+5)
···
 #define KVM_X86_DEFAULT_VM	0
 #define KVM_X86_SW_PROTECTED_VM	1

+Note, KVM_X86_SW_PROTECTED_VM is currently only for development and testing.
+Do not use KVM_X86_SW_PROTECTED_VM for "real" VMs, and especially not in
+production. The behavior and effective ABI for software-protected VMs is
+unstable.
+
 9. Known KVM API problems
 =========================
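
For reference, the VM type documented above is the argument to KVM_CREATE_VM.
A minimal sketch, assuming /dev/kvm is accessible, that the uAPI headers
export KVM_X86_SW_PROTECTED_VM, and with an illustrative helper name:

  #include <fcntl.h>
  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  /* Development/testing only, per the documentation note above. */
  static int create_sw_protected_vm(void)
  {
          int kvm_fd = open("/dev/kvm", O_RDWR);

          /*
           * Fails with -1/errno unless the kernel was built with
           * CONFIG_KVM_SW_PROTECTED_VM=y and, after this series, the TDP MMU
           * is in use.
           */
          return ioctl(kvm_fd, KVM_CREATE_VM, KVM_X86_SW_PROTECTED_VM);
  }
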
arch/x86/kvm/Kconfig (+4 -3)
···
 	depends on KVM && X86_64
 	select KVM_GENERIC_PRIVATE_MEM
 	help
-	  Enable support for KVM software-protected VMs. Currently "protected"
-	  means the VM can be backed with memory provided by
-	  KVM_CREATE_GUEST_MEMFD.
+	  Enable support for KVM software-protected VMs. Currently, software-
+	  protected VMs are purely a development and testing vehicle for
+	  KVM_CREATE_GUEST_MEMFD. Attempting to run a "real" VM workload as a
+	  software-protected VM will fail miserably.

 	  If unsure, say "N".
arch/x86/kvm/mmu/mmu.c (+42)
···
 	fault->mmu_seq = vcpu->kvm->mmu_invalidate_seq;
 	smp_rmb();

+	/*
+	 * Check for a relevant mmu_notifier invalidation event before getting
+	 * the pfn from the primary MMU, and before acquiring mmu_lock.
+	 *
+	 * For mmu_lock, if there is an in-progress invalidation and the kernel
+	 * allows preemption, the invalidation task may drop mmu_lock and yield
+	 * in response to mmu_lock being contended, which is *very* counter-
+	 * productive as this vCPU can't actually make forward progress until
+	 * the invalidation completes.
+	 *
+	 * Retrying now can also avoid unnecessary lock contention in the primary
+	 * MMU, as the primary MMU doesn't necessarily hold a single lock for
+	 * the duration of the invalidation, i.e. faulting in a conflicting pfn
+	 * can cause the invalidation to take longer by holding locks that are
+	 * needed to complete the invalidation.
+	 *
+	 * Do the pre-check even for non-preemptible kernels, i.e. even if KVM
+	 * will never yield mmu_lock in response to contention, as this vCPU is
+	 * *guaranteed* to need to retry, i.e. waiting until mmu_lock is held
+	 * to detect retry guarantees the worst case latency for the vCPU.
+	 */
+	if (fault->slot &&
+	    mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn))
+		return RET_PF_RETRY;
+
 	ret = __kvm_faultin_pfn(vcpu, fault);
 	if (ret != RET_PF_CONTINUE)
 		return ret;
···

 	if (unlikely(!fault->slot))
 		return kvm_handle_noslot_fault(vcpu, fault, access);
+
+	/*
+	 * Check again for a relevant mmu_notifier invalidation event purely to
+	 * avoid contending mmu_lock. Most invalidations will be detected by
+	 * the previous check, but checking is extremely cheap relative to the
+	 * overall cost of failing to detect the invalidation until after
+	 * mmu_lock is acquired.
+	 */
+	if (mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn)) {
+		kvm_release_pfn_clean(fault->pfn);
+		return RET_PF_RETRY;
+	}

 	return RET_PF_CONTINUE;
 }
···
 	if (!sp && kvm_test_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu))
 		return true;

+	/*
+	 * Check for a relevant mmu_notifier invalidation event one last time
+	 * now that mmu_lock is held, as the "unsafe" checks performed without
+	 * holding mmu_lock can get false negatives.
+	 */
 	return fault->slot &&
 	       mmu_invalidate_retry_gfn(vcpu->kvm, fault->mmu_seq, fault->gfn);
 }
arch/x86/kvm/svm/sev.c (+15 -10)
···
 module_param_named(sev_es, sev_es_enabled, bool, 0444);

 /* enable/disable SEV-ES DebugSwap support */
-static bool sev_es_debug_swap_enabled = true;
+static bool sev_es_debug_swap_enabled = false;
 module_param_named(debug_swap, sev_es_debug_swap_enabled, bool, 0444);
 #else
 #define sev_enabled false
···
 	save->xss = svm->vcpu.arch.ia32_xss;
 	save->dr6 = svm->vcpu.arch.dr6;

-	if (sev_es_debug_swap_enabled)
+	if (sev_es_debug_swap_enabled) {
 		save->sev_features |= SVM_SEV_FEAT_DEBUG_SWAP;
+		pr_warn_once("Enabling DebugSwap with KVM_SEV_ES_INIT. "
+			     "This will not work starting with Linux 6.10\n");
+	}

 	pr_debug("Virtual Machine Save Area (VMSA):\n");
 	print_hex_dump_debug("", DUMP_PREFIX_NONE, 16, 1, save, sizeof(*save), false);
···
 		goto e_free;
 	}

+	/*
+	 * The guest may change the memory encryption attribute from C=0 -> C=1
+	 * or vice versa for this memory range. Lets make sure caches are
+	 * flushed to ensure that guest data gets written into memory with
+	 * correct C-bit. Note, this must be done before dropping kvm->lock,
+	 * as region and its array of pages can be freed by a different task
+	 * once kvm->lock is released.
+	 */
+	sev_clflush_pages(region->pages, region->npages);
+
 	region->uaddr = range->addr;
 	region->size = range->size;

 	list_add_tail(&region->list, &sev->regions_list);
 	mutex_unlock(&kvm->lock);
-
-	/*
-	 * The guest may change the memory encryption attribute from C=0 -> C=1
-	 * or vice versa for this memory range. Lets make sure caches are
-	 * flushed to ensure that guest data gets written into memory with
-	 * correct C-bit.
-	 */
-	sev_clflush_pages(region->pages, region->npages);

 	return ret;
arch/x86/kvm/x86.c (+11 -1)
···
 {
 	return type == KVM_X86_DEFAULT_VM ||
 	       (type == KVM_X86_SW_PROTECTED_VM &&
-		IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) && tdp_enabled);
+		IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) && tdp_mmu_enabled);
 }

 int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
···

 	if (r < 0)
 		return X86EMUL_UNHANDLEABLE;
+
+	/*
+	 * Mark the page dirty _before_ checking whether or not the CMPXCHG was
+	 * successful, as the old value is written back on failure. Note, for
+	 * live migration, this is unnecessarily conservative as CMPXCHG writes
+	 * back the original value and the access is atomic, but KVM's ABI is
+	 * that all writes are dirty logged, regardless of the value written.
+	 */
+	kvm_vcpu_mark_page_dirty(vcpu, gpa_to_gfn(gpa));
+
 	if (r)
 		return X86EMUL_CMPXCHG_FAILED;
include/linux/kvm_host.h (+26)
···
 		return 1;
 	return 0;
 }
+
+/*
+ * This lockless version of the range-based retry check *must* be paired with a
+ * call to the locked version after acquiring mmu_lock, i.e. this is safe to
+ * use only as a pre-check to avoid contending mmu_lock. This version *will*
+ * get false negatives and false positives.
+ */
+static inline bool mmu_invalidate_retry_gfn_unsafe(struct kvm *kvm,
+						   unsigned long mmu_seq,
+						   gfn_t gfn)
+{
+	/*
+	 * Use READ_ONCE() to ensure the in-progress flag and sequence counter
+	 * are always read from memory, e.g. so that checking for retry in a
+	 * loop won't result in an infinite retry loop. Don't force loads for
+	 * start+end, as the key to avoiding infinite retry loops is observing
+	 * the 1=>0 transition of in-progress, i.e. getting false negatives
+	 * due to stale start+end values is acceptable.
+	 */
+	if (unlikely(READ_ONCE(kvm->mmu_invalidate_in_progress)) &&
+	    gfn >= kvm->mmu_invalidate_range_start &&
+	    gfn < kvm->mmu_invalidate_range_end)
+		return true;
+
+	return READ_ONCE(kvm->mmu_invalidate_seq) != mmu_seq;
+}
 #endif

 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
tools/testing/selftests/kvm/set_memory_region_test.c (+11 -1)
···
 	}

 	if (supported_flags & KVM_MEM_GUEST_MEMFD) {
+		int guest_memfd = vm_create_guest_memfd(vm, MEM_REGION_SIZE, 0);
+
 		r = __vm_set_user_memory_region2(vm, 0,
 						 KVM_MEM_LOG_DIRTY_PAGES | KVM_MEM_GUEST_MEMFD,
-						 0, MEM_REGION_SIZE, NULL, 0, 0);
+						 0, MEM_REGION_SIZE, NULL, guest_memfd, 0);
 		TEST_ASSERT(r && errno == EINVAL,
 			    "KVM_SET_USER_MEMORY_REGION2 should have failed, dirty logging private memory is unsupported");
+
+		r = __vm_set_user_memory_region2(vm, 0,
+						 KVM_MEM_READONLY | KVM_MEM_GUEST_MEMFD,
+						 0, MEM_REGION_SIZE, NULL, guest_memfd, 0);
+		TEST_ASSERT(r && errno == EINVAL,
+			    "KVM_SET_USER_MEMORY_REGION2 should have failed, read-only GUEST_MEMFD memslots are unsupported");
+
+		close(guest_memfd);
 	}
 }
virt/kvm/kvm_main.c (+7 -1)
···
 		valid_flags &= ~KVM_MEM_LOG_DIRTY_PAGES;

 #ifdef __KVM_HAVE_READONLY_MEM
-	valid_flags |= KVM_MEM_READONLY;
+	/*
+	 * GUEST_MEMFD is incompatible with read-only memslots, as writes to
+	 * read-only memslots have emulated MMIO, not page fault, semantics,
+	 * and KVM doesn't allow emulated MMIO for private memory.
+	 */
+	if (!(mem->flags & KVM_MEM_GUEST_MEMFD))
+		valid_flags |= KVM_MEM_READONLY;
 #endif

 	if (mem->flags & ~valid_flags)