Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

mm, thp: Do not make pmd/pud dirty without a reason

Currently we make page table entries dirty all the time, regardless of
access type, and don't even consider whether the mapping is
write-protected. The reasoning is that we don't really need dirty
tracking on THP, and that making the entry dirty upfront may save some
time on the first write to the page.

Unfortunately, such an approach may result in a false positive from
can_follow_write_pmd() for the huge zero page or a read-only shmem
file.
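
For context, the check being fooled looked roughly like this at the time
(a sketch of can_follow_write_pmd() from mm/huge_memory.c of that era,
shown for illustration only; it is not part of this patch). FOLL_FORCE
gup may follow a read-only pmd only if the entry is dirty, on the
assumption that the dirty bit proves a COW break has already happened,
so an entry dirtied upfront (huge zero page, read-only shmem) passes the
check when it should not:

/*
 * Sketch for illustration: with FOLL_FORCE, gup may follow a read-only
 * pmd only if it is dirty, i.e. only if a COW break is assumed to have
 * happened already.  An entry made dirty unconditionally at setup time
 * defeats that assumption.
 */
static inline bool can_follow_write_pmd(pmd_t pmd, unsigned int flags)
{
	return pmd_write(pmd) ||
	       ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd));
}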

Let's only make the page dirty if we are about to write to it anyway
(as we do for small pages).

I've restructured the code so that the entry is made dirty inside
maybe_p[mu]d_mkwrite(), which also takes into account whether the vma
is write-protected.
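
Condensed, the new shape of the helper and its call sites is sketched
below; this only restates the diff that follows, and the caller
function names in the comments are my reading of where those hunks sit,
not part of the diff text itself. The dirty bit can only be applied
inside the VM_WRITE branch, so a write-protected vma never ends up with
a dirty huge entry, and each caller states explicitly whether it is
about to write the page.

pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma, bool dirty)
{
	if (likely(vma->vm_flags & VM_WRITE)) {
		pmd = pmd_mkwrite(pmd);
		if (dirty)	/* dirty is only ever set together with write */
			pmd = pmd_mkdirty(pmd);
	}
	return pmd;
}

/* Write-fault paths pass true; collapse/migration paths pass false: */
entry = maybe_pmd_mkwrite(entry, vma, true);	/* e.g. do_huge_pmd_wp_page() */
pmde  = maybe_pmd_mkwrite(pmde, vma, false);	/* e.g. remove_migration_pmd() */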

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Authored by Kirill A. Shutemov, committed by Linus Torvalds
152e93af a8f97366

Total: +24 -16

mm/huge_memory.c: +19 -12

···
 }
 __setup("transparent_hugepage=", setup_transparent_hugepage);

-pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
+pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma, bool dirty)
 {
-	if (likely(vma->vm_flags & VM_WRITE))
+	if (likely(vma->vm_flags & VM_WRITE)) {
 		pmd = pmd_mkwrite(pmd);
+		if (dirty)
+			pmd = pmd_mkdirty(pmd);
+	}
 	return pmd;
 }
···
 	}

 	entry = mk_huge_pmd(page, vma->vm_page_prot);
-	entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+	entry = maybe_pmd_mkwrite(entry, vma, true);
 	page_add_new_anon_rmap(page, vma, haddr, true);
 	mem_cgroup_commit_charge(page, memcg, false, true);
 	lru_cache_add_active_or_unevictable(page, vma);
···
 	if (pfn_t_devmap(pfn))
 		entry = pmd_mkdevmap(entry);
 	if (write) {
-		entry = pmd_mkyoung(pmd_mkdirty(entry));
-		entry = maybe_pmd_mkwrite(entry, vma);
+		entry = pmd_mkyoung(entry);
+		entry = maybe_pmd_mkwrite(entry, vma, true);
 	}

 	if (pgtable) {
···
 EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);

 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
-static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma)
+static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma,
+		bool dirty)
 {
-	if (likely(vma->vm_flags & VM_WRITE))
+	if (likely(vma->vm_flags & VM_WRITE)) {
 		pud = pud_mkwrite(pud);
+		if (dirty)
+			pud = pud_mkdirty(pud);
+	}
 	return pud;
 }
···
 	if (pfn_t_devmap(pfn))
 		entry = pud_mkdevmap(entry);
 	if (write) {
-		entry = pud_mkyoung(pud_mkdirty(entry));
-		entry = maybe_pud_mkwrite(entry, vma);
+		entry = pud_mkyoung(entry);
+		entry = maybe_pud_mkwrite(entry, vma, true);
 	}
 	set_pud_at(mm, addr, pud, entry);
 	update_mmu_cache_pud(vma, addr, pud);
···
 	if (reuse_swap_page(page, NULL)) {
 		pmd_t entry;
 		entry = pmd_mkyoung(orig_pmd);
-		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+		entry = maybe_pmd_mkwrite(entry, vma, true);
 		if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
 			update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
 		ret |= VM_FAULT_WRITE;
···
 	} else {
 		pmd_t entry;
 		entry = mk_huge_pmd(new_page, vma->vm_page_prot);
-		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+		entry = maybe_pmd_mkwrite(entry, vma, true);
 		pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
 		page_add_new_anon_rmap(new_page, vma, haddr, true);
 		mem_cgroup_commit_charge(new_page, memcg, false, true);
···
 	if (pmd_swp_soft_dirty(*pvmw->pmd))
 		pmde = pmd_mksoft_dirty(pmde);
 	if (is_write_migration_entry(entry))
-		pmde = maybe_pmd_mkwrite(pmde, vma);
+		pmde = maybe_pmd_mkwrite(pmde, vma, false);

 	flush_cache_range(vma, mmun_start, mmun_start + HPAGE_PMD_SIZE);
 	page_add_anon_rmap(new, vma, mmun_start, true);
mm/internal.h: +2 -1

···
 	}
 }

-extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
+extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma,
+		bool dirty);

 /*
  * At what user virtual address is page expected in @vma?
mm/khugepaged.c: +1 -1

···
 	pgtable = pmd_pgtable(_pmd);

 	_pmd = mk_huge_pmd(new_page, vma->vm_page_prot);
-	_pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
+	_pmd = maybe_pmd_mkwrite(_pmd, vma, false);

 	/*
 	 * spin_lock() below is not the equivalent of smp_wmb(), so
mm/memory.c: +1 -1

···

 	entry = mk_huge_pmd(page, vma->vm_page_prot);
 	if (write)
-		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+		entry = maybe_pmd_mkwrite(entry, vma, true);

 	add_mm_counter(vma->vm_mm, MM_FILEPAGES, HPAGE_PMD_NR);
 	page_add_file_rmap(page, true);
mm/migrate.c: +1 -1

···
 	}

 	entry = mk_huge_pmd(new_page, vma->vm_page_prot);
-	entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+	entry = maybe_pmd_mkwrite(entry, vma, false);

 	/*
 	 * Clear the old entry under pagetable lock and establish the new PTE.