Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

thp: mmu_notifier_test_young

For GRU and EPT, we need gup-fast to set referenced bit too (this is why
it's correct to return 0 when shadow_access_mask is zero, it requires
gup-fast to set the referenced bit). qemu-kvm access already sets the
young bit in the pte if it isn't zero-copy, if it's zero copy or a shadow
paging EPT minor fault we rely on gup-fast to signal the page is in
use...

We also need to check the young bits on the secondary pagetables for NPT
and not nested shadow mmu as the data may never get accessed again by the
primary pte.

Without this closer accuracy, we'd have to remove the heuristic that
avoids collapsing hugepages in hugepage virtual regions that have not even
a single subpage in use.

->test_young is fully backwards compatible with GRU and other usages that
don't have young bits in pagetables set by the hardware and that should
nuke the secondary mmu mappings when ->clear_flush_young runs just like
EPT does.

Removing the heuristic that checks the young bit in
khugepaged/collapse_huge_page completely isn't so bad either probably but
I thought it was worth it and this makes it reliable.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Authored by Andrea Arcangeli, committed by Linus Torvalds.
8ee53820 4b7167b9

+105 -2
+1
arch/x86/include/asm/kvm_host.h
@@ -822,6 +822,7 @@
 #define KVM_ARCH_WANT_MMU_NOTIFIER
 int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
 int kvm_age_hva(struct kvm *kvm, unsigned long hva);
+int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
 void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
 int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
 int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
+34
arch/x86/kvm/mmu.c
@@ -945,6 +945,35 @@
 	return young;
 }
 
+static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
+			      unsigned long data)
+{
+	u64 *spte;
+	int young = 0;
+
+	/*
+	 * If there's no access bit in the secondary pte set by the
+	 * hardware it's up to gup-fast/gup to set the access bit in
+	 * the primary pte or in the page structure.
+	 */
+	if (!shadow_accessed_mask)
+		goto out;
+
+	spte = rmap_next(kvm, rmapp, NULL);
+	while (spte) {
+		u64 _spte = *spte;
+		BUG_ON(!(_spte & PT_PRESENT_MASK));
+		young = _spte & PT_ACCESSED_MASK;
+		if (young) {
+			young = 1;
+			break;
+		}
+		spte = rmap_next(kvm, rmapp, spte);
+	}
+out:
+	return young;
+}
+
 #define RMAP_RECYCLE_THRESHOLD 1000
 
 static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
@@ -963,6 +992,11 @@
 int kvm_age_hva(struct kvm *kvm, unsigned long hva)
 {
 	return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp);
+}
+
+int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
+{
+	return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp);
 }
 
 #ifdef MMU_DEBUG
+3
arch/x86/mm/gup.c
@@ -8,6 +8,7 @@
 #include <linux/mm.h>
 #include <linux/vmstat.h>
 #include <linux/highmem.h>
+#include <linux/swap.h>
 
 #include <asm/pgtable.h>
 
@@ -89,6 +90,7 @@
 		VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
 		page = pte_page(pte);
 		get_page(page);
+		SetPageReferenced(page);
 		pages[*nr] = page;
 		(*nr)++;
@@ -103,6+105,7 @@
 	VM_BUG_ON(page != compound_head(page));
 	VM_BUG_ON(page_count(page) == 0);
 	atomic_add(nr, &page->_count);
+	SetPageReferenced(page);
 }
 
 static inline void get_huge_page_tail(struct page *page)
+26
include/linux/mmu_notifier.h
@@ -62,6 +62,16 @@
 			  unsigned long address);
 
 	/*
+	 * test_young is called to check the young/accessed bitflag in
+	 * the secondary pte. This is used to know if the page is
+	 * frequently used without actually clearing the flag or tearing
+	 * down the secondary mapping on the page.
+	 */
+	int (*test_young)(struct mmu_notifier *mn,
+			  struct mm_struct *mm,
+			  unsigned long address);
+
+	/*
 	 * change_pte is called in cases that pte mapping to page is changed:
 	 * for example, when ksm remaps pte to point to a new shared page.
 	 */
@@ -163,6 +173,8 @@
 extern void __mmu_notifier_release(struct mm_struct *mm);
 extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
 					  unsigned long address);
+extern int __mmu_notifier_test_young(struct mm_struct *mm,
+				     unsigned long address);
 extern void __mmu_notifier_change_pte(struct mm_struct *mm,
 				      unsigned long address, pte_t pte);
 extern void __mmu_notifier_invalidate_page(struct mm_struct *mm,
@@ -183,6 +195,14 @@
 {
 	if (mm_has_notifiers(mm))
 		return __mmu_notifier_clear_flush_young(mm, address);
+	return 0;
+}
+
+static inline int mmu_notifier_test_young(struct mm_struct *mm,
+					  unsigned long address)
+{
+	if (mm_has_notifiers(mm))
+		return __mmu_notifier_test_young(mm, address);
 	return 0;
 }
 
@@ -308,6 +328,12 @@
 }
 
 static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm,
+					  unsigned long address)
+{
+	return 0;
+}
+
+static inline int mmu_notifier_test_young(struct mm_struct *mm,
 					  unsigned long address)
 {
 	return 0;
+4 -2
mm/huge_memory.c
@@ -1632,7 +1632,8 @@
 		VM_BUG_ON(PageLRU(page));
 
 		/* If there is no mapped pte young don't collapse the page */
-		if (pte_young(pteval))
+		if (pte_young(pteval) || PageReferenced(page) ||
+		    mmu_notifier_test_young(vma->vm_mm, address))
 			referenced = 1;
 	}
 	if (unlikely(!referenced))
@@ -1892,7 +1893,8 @@
 		/* cannot use mapcount: can't collapse if there's a gup pin */
 		if (page_count(page) != 1)
 			goto out_unmap;
-		if (pte_young(pteval))
+		if (pte_young(pteval) || PageReferenced(page) ||
+		    mmu_notifier_test_young(vma->vm_mm, address))
 			referenced = 1;
 	}
 	if (referenced)
+20
mm/mmu_notifier.c
@@ -100,6 +100,26 @@
 	return young;
 }
 
+int __mmu_notifier_test_young(struct mm_struct *mm,
+			      unsigned long address)
+{
+	struct mmu_notifier *mn;
+	struct hlist_node *n;
+	int young = 0;
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
+		if (mn->ops->test_young) {
+			young = mn->ops->test_young(mn, mm, address);
+			if (young)
+				break;
+		}
+	}
+	rcu_read_unlock();
+
+	return young;
+}
+
 void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
 			       pte_t pte)
 {
+17
virt/kvm/kvm_main.c
@@ -380,6 +380,22 @@
 	return young;
 }
 
+static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
+				       struct mm_struct *mm,
+				       unsigned long address)
+{
+	struct kvm *kvm = mmu_notifier_to_kvm(mn);
+	int young, idx;
+
+	idx = srcu_read_lock(&kvm->srcu);
+	spin_lock(&kvm->mmu_lock);
+	young = kvm_test_age_hva(kvm, address);
+	spin_unlock(&kvm->mmu_lock);
+	srcu_read_unlock(&kvm->srcu, idx);
+
+	return young;
+}
+
 static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
 				     struct mm_struct *mm)
 {
@@ -396,6 +412,7 @@
 	.invalidate_range_start	= kvm_mmu_notifier_invalidate_range_start,
 	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
 	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young,
+	.test_young		= kvm_mmu_notifier_test_young,
 	.change_pte		= kvm_mmu_notifier_change_pte,
 	.release		= kvm_mmu_notifier_release,
 };