Merge tag 'kvm-x86-fixes-6.8-2' of https://github.com/kvm-x86/linux into HEAD

KVM x86 fixes for 6.8, round 2:

- When emulating an atomic access, mark the gfn as dirty in the memslot to
  fix a bug where KVM could fail to mark the gfn as dirty during live
  migration, ultimately resulting in guest data corruption due to a dirty
  page not being re-copied from the source to the target (a simplified
  sketch of the dirty-bitmap update follows the arch/x86/kvm/x86.c hunk
  below).

- Check for mmu_notifier invalidation events before faulting in the pfn,
  and before acquiring mmu_lock, to avoid unnecessary work and lock
  contention.  Contending mmu_lock is especially problematic on preemptible
  kernels, as KVM may yield mmu_lock in response to the contention, which
  severely degrades overall performance because the contending vCPUs make
  it difficult for the task that triggered the invalidation to make forward
  progress (a simplified model of this retry scheme is sketched below).

  Note, due to another kernel bug, the mmu_lock yielding isn't limited to
  preemptible kernels, as any kernel built with CONFIG_PREEMPT_DYNAMIC=y
  will yield contended rwlocks and spinlocks.

https://lore.kernel.org/all/20240110214723.695930-1-seanjc@google.com
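
As background for the arch/x86/kvm/mmu/mmu.c and include/linux/kvm_host.h
changes below, here is a simplified, stand-alone model of the retry scheme.
All names (inv_seq, inv_in_progress, retry_unsafe(), retry_locked(),
fault_one_page()) are invented for this sketch and are not KVM's code: an
invalidation marks a gfn range as in-progress and bumps a sequence count when
it finishes, while the fault path snapshots the sequence count, uses a cheap
lockless pre-check to bail before doing expensive work or contending the
lock, and only trusts the re-check performed with the lock held.

/*
 * Simplified, stand-alone model of the invalidation-vs-fault retry protocol
 * (illustrative only, not KVM's code or locking).  An invalidator marks a
 * gfn range as in-progress, does its work, then bumps a sequence count.  A
 * page fault snapshots the sequence count up front, uses a lockless
 * pre-check to bail before doing expensive work or contending the lock, and
 * only trusts the re-check performed with the lock held.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

typedef unsigned long gfn_t;

static pthread_mutex_t mmu_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long inv_seq;		/* bumped when an invalidation ends  */
static int inv_in_progress;		/* count of in-flight invalidations  */
static gfn_t inv_start, inv_end;	/* range covered by the invalidation */

static void invalidate_range_begin(gfn_t start, gfn_t end)
{
	pthread_mutex_lock(&mmu_lock);
	inv_in_progress++;
	inv_start = start;
	inv_end = end;
	pthread_mutex_unlock(&mmu_lock);
}

static void invalidate_range_end(void)
{
	pthread_mutex_lock(&mmu_lock);
	inv_in_progress--;
	inv_seq++;
	pthread_mutex_unlock(&mmu_lock);
}

/* Lockless pre-check: false positives/negatives are tolerated, it's a hint. */
static bool retry_unsafe(unsigned long snapshot, gfn_t gfn)
{
	if (__atomic_load_n(&inv_in_progress, __ATOMIC_RELAXED) &&
	    gfn >= inv_start && gfn < inv_end)
		return true;
	return __atomic_load_n(&inv_seq, __ATOMIC_RELAXED) != snapshot;
}

/* Authoritative check, must be called with mmu_lock held. */
static bool retry_locked(unsigned long snapshot, gfn_t gfn)
{
	if (inv_in_progress && gfn >= inv_start && gfn < inv_end)
		return true;
	return inv_seq != snapshot;
}

/* Returns true if a mapping was installed, false if the caller must retry. */
static bool fault_one_page(gfn_t gfn)
{
	unsigned long snapshot = __atomic_load_n(&inv_seq, __ATOMIC_RELAXED);

	/* Bail before the expensive work and before contending mmu_lock. */
	if (retry_unsafe(snapshot, gfn))
		return false;

	/* ... resolve the pfn from the primary MMU here (expensive) ... */

	pthread_mutex_lock(&mmu_lock);
	if (retry_locked(snapshot, gfn)) {
		pthread_mutex_unlock(&mmu_lock);
		return false;
	}
	/* ... install the mapping here ... */
	pthread_mutex_unlock(&mmu_lock);
	return true;
}

int main(void)
{
	invalidate_range_begin(0x1000, 0x2000);
	printf("fault during invalidation installs mapping: %d\n",
	       fault_one_page(0x1800));
	invalidate_range_end();
	printf("fault after invalidation installs mapping:  %d\n",
	       fault_one_page(0x1800));
	return 0;
}

The pre-check may produce false positives and false negatives; correctness
comes solely from the re-check done under the lock, which is why the new
helper added to kvm_host.h below is suffixed _unsafe and must be paired with
the locked check.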

Changed files:

 arch/x86/kvm/mmu/mmu.c   | 42 ++++++++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/x86.c       | 10 ++++++++++
 include/linux/kvm_host.h | 26 ++++++++++++++++++++++++++
 3 files changed, 78 insertions(+)

arch/x86/kvm/mmu/mmu.c

···
 	fault->mmu_seq = vcpu->kvm->mmu_invalidate_seq;
 	smp_rmb();
 
+	/*
+	 * Check for a relevant mmu_notifier invalidation event before getting
+	 * the pfn from the primary MMU, and before acquiring mmu_lock.
+	 *
+	 * For mmu_lock, if there is an in-progress invalidation and the kernel
+	 * allows preemption, the invalidation task may drop mmu_lock and yield
+	 * in response to mmu_lock being contended, which is *very* counter-
+	 * productive as this vCPU can't actually make forward progress until
+	 * the invalidation completes.
+	 *
+	 * Retrying now can also avoid unnecessary lock contention in the primary
+	 * MMU, as the primary MMU doesn't necessarily hold a single lock for
+	 * the duration of the invalidation, i.e. faulting in a conflicting pfn
+	 * can cause the invalidation to take longer by holding locks that are
+	 * needed to complete the invalidation.
+	 *
+	 * Do the pre-check even for non-preemptible kernels, i.e. even if KVM
+	 * will never yield mmu_lock in response to contention, as this vCPU is
+	 * *guaranteed* to need to retry, i.e. waiting until mmu_lock is held
+	 * to detect retry guarantees the worst case latency for the vCPU.
+	 */
+	if (fault->slot &&
+	    mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn))
+		return RET_PF_RETRY;
+
 	ret = __kvm_faultin_pfn(vcpu, fault);
 	if (ret != RET_PF_CONTINUE)
 		return ret;
···
 
 	if (unlikely(!fault->slot))
 		return kvm_handle_noslot_fault(vcpu, fault, access);
+
+	/*
+	 * Check again for a relevant mmu_notifier invalidation event purely to
+	 * avoid contending mmu_lock.  Most invalidations will be detected by
+	 * the previous check, but checking is extremely cheap relative to the
+	 * overall cost of failing to detect the invalidation until after
+	 * mmu_lock is acquired.
+	 */
+	if (mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn)) {
+		kvm_release_pfn_clean(fault->pfn);
+		return RET_PF_RETRY;
+	}
 
 	return RET_PF_CONTINUE;
 }
···
 	if (!sp && kvm_test_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu))
 		return true;
 
+	/*
+	 * Check for a relevant mmu_notifier invalidation event one last time
+	 * now that mmu_lock is held, as the "unsafe" checks performed without
+	 * holding mmu_lock can get false negatives.
+	 */
 	return fault->slot &&
 	       mmu_invalidate_retry_gfn(vcpu->kvm, fault->mmu_seq, fault->gfn);
 }

arch/x86/kvm/x86.c

···
 	if (r < 0)
 		return X86EMUL_UNHANDLEABLE;
+
+	/*
+	 * Mark the page dirty _before_ checking whether or not the CMPXCHG was
+	 * successful, as the old value is written back on failure.  Note, for
+	 * live migration, this is unnecessarily conservative as CMPXCHG writes
+	 * back the original value and the access is atomic, but KVM's ABI is
+	 * that all writes are dirty logged, regardless of the value written.
+	 */
+	kvm_vcpu_mark_page_dirty(vcpu, gpa_to_gfn(gpa));
+
 	if (r)
 		return X86EMUL_CMPXCHG_FAILED;
 
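
As context for the hunk above, the following stand-alone sketch shows what
marking a gfn dirty conceptually amounts to: setting the gfn's bit in the
owning memslot's dirty bitmap so that the next dirty-log harvest re-copies
the page.  struct demo_memslot and demo_mark_page_dirty() are invented for
illustration and are not KVM's data structures.

#include <limits.h>
#include <stdio.h>

typedef unsigned long gfn_t;

#define BITS_PER_LONG	(sizeof(unsigned long) * CHAR_BIT)

/* Invented memslot layout: a guest-physical range plus one dirty bit/page. */
struct demo_memslot {
	gfn_t base_gfn;
	unsigned long npages;
	unsigned long dirty_bitmap[4];	/* room for 4 * BITS_PER_LONG pages */
};

/* Conceptual equivalent of marking a gfn dirty in its memslot. */
static void demo_mark_page_dirty(struct demo_memslot *slot, gfn_t gfn)
{
	unsigned long rel = gfn - slot->base_gfn;

	if (rel >= slot->npages)
		return;
	slot->dirty_bitmap[rel / BITS_PER_LONG] |= 1UL << (rel % BITS_PER_LONG);
}

int main(void)
{
	struct demo_memslot slot = { .base_gfn = 0x100, .npages = 128 };

	/*
	 * An emulated CMPXCHG always writes the gpa (the old value is written
	 * back on failure), so the gfn is marked dirty regardless of whether
	 * the compare succeeded; a dirty-log pass then re-copies the page.
	 */
	demo_mark_page_dirty(&slot, 0x105);
	printf("dirty_bitmap[0] = %#lx\n", slot.dirty_bitmap[0]);
	return 0;
}

The fix above ensures the bit is set even when the CMPXCHG fails, because the
old value is still written back to the gpa.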

include/linux/kvm_host.h

···
 		return 1;
 	return 0;
 }
+
+/*
+ * This lockless version of the range-based retry check *must* be paired with a
+ * call to the locked version after acquiring mmu_lock, i.e. this is safe to
+ * use only as a pre-check to avoid contending mmu_lock.  This version *will*
+ * get false negatives and false positives.
+ */
+static inline bool mmu_invalidate_retry_gfn_unsafe(struct kvm *kvm,
+						   unsigned long mmu_seq,
+						   gfn_t gfn)
+{
+	/*
+	 * Use READ_ONCE() to ensure the in-progress flag and sequence counter
+	 * are always read from memory, e.g. so that checking for retry in a
+	 * loop won't result in an infinite retry loop.  Don't force loads for
+	 * start+end, as the key to avoiding infinite retry loops is observing
+	 * the 1=>0 transition of in-progress, i.e. getting false negatives
+	 * due to stale start+end values is acceptable.
+	 */
+	if (unlikely(READ_ONCE(kvm->mmu_invalidate_in_progress)) &&
+	    gfn >= kvm->mmu_invalidate_range_start &&
+	    gfn < kvm->mmu_invalidate_range_end)
+		return true;
+
+	return READ_ONCE(kvm->mmu_invalidate_seq) != mmu_seq;
+}
 #endif
 
 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
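
To illustrate the READ_ONCE() comment in the helper above, the stand-alone
snippet below shows why the in-progress flag must be re-read on every loop
iteration: with a plain load the compiler is allowed to hoist the read out of
a polling loop and spin forever, whereas a volatile access, which is roughly
what READ_ONCE()/WRITE_ONCE() provide, guarantees the 1=>0 transition made by
another thread is eventually observed.  The read_once()/write_once() macros
and the clear_flag() thread are invented for this sketch, not kernel code.

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

/* Roughly what READ_ONCE()/WRITE_ONCE() boil down to: volatile accesses. */
#define read_once(x)		(*(const volatile typeof(x) *)&(x))
#define write_once(x, v)	(*(volatile typeof(x) *)&(x) = (v))

static int in_progress = 1;

static void *clear_flag(void *arg)
{
	(void)arg;
	usleep(1000);
	write_once(in_progress, 0);	/* the 1=>0 transition */
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, clear_flag, NULL);

	/*
	 * With a plain "while (in_progress)" the compiler may load the flag
	 * once and hoist the test out of the loop, spinning forever.  The
	 * volatile access forces a fresh load on every iteration, so the
	 * 1=>0 transition made by the other thread is eventually observed.
	 */
	while (read_once(in_progress))
		;

	pthread_join(t, NULL);
	printf("observed in_progress 1=>0\n");
	return 0;
}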