Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'simplify-do_wp_page'

Merge emailed patches from Peter Xu:
"This is a small series that I picked up from Linus's suggestion to
simplify cow handling (and also make it more strict) by checking
against page refcounts rather than mapcounts.

This makes uffd-wp work again (verified by running upmapsort)"

Note: this is horrendously bad timing, and making this kind of
fundamental vm change after -rc3 is not at all how things should work.
The saving grace is that it really is a nice simplification:

8 files changed, 29 insertions(+), 120 deletions(-)

The reason for the bad timing is that it turns out that commit
17839856fd58 ("gup: document and work around 'COW can break either way'
issue") broke not just UFFD functionality (as Peter noticed), but Mikulas
Patocka also reports that it caused issues for strace when running in a
DAX environment with ext4 on a persistent memory setup.

And we can't just revert that commit without re-introducing the original
issue that is a potential security hole, so making COW stricter (and in
the process much simpler) is a step to then undoing the forced COW that
broke other uses.

Link: https://lore.kernel.org/lkml/alpine.LRH.2.02.2009031328040.6929@file01.intranet.prod.int.rdu2.redhat.com/

* emailed patches from Peter Xu <peterx@redhat.com>:
mm: Add PGREUSE counter
mm/gup: Remove enforced COW mechanism
mm/ksm: Remove reuse_ksm_page()
mm: do_wp_page() simplification

+29 -120
-8
drivers/gpu/drm/i915/gem/i915_gem_userptr.c
··· 596 596 GFP_KERNEL | 597 597 __GFP_NORETRY | 598 598 __GFP_NOWARN); 599 - /* 600 - * Using __get_user_pages_fast() with a read-only 601 - * access is questionable. A read-only page may be 602 - * COW-broken, and then this might end up giving 603 - * the wrong side of the COW.. 604 - * 605 - * We may or may not care. 606 - */ 607 599 if (pvec) { 608 600 /* defer to worker if malloc fails */ 609 601 if (!i915_gem_object_is_readonly(obj))
-7
include/linux/ksm.h
··· 53 53 54 54 void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc); 55 55 void ksm_migrate_page(struct page *newpage, struct page *oldpage); 56 - bool reuse_ksm_page(struct page *page, 57 - struct vm_area_struct *vma, unsigned long address); 58 56 59 57 #else /* !CONFIG_KSM */ 60 58 ··· 85 87 86 88 static inline void ksm_migrate_page(struct page *newpage, struct page *oldpage) 87 89 { 88 - } 89 - static inline bool reuse_ksm_page(struct page *page, 90 - struct vm_area_struct *vma, unsigned long address) 91 - { 92 - return false; 93 90 } 94 91 #endif /* CONFIG_MMU */ 95 92 #endif /* !CONFIG_KSM */
+1
include/linux/vm_event_item.h
··· 30 30 PGFAULT, PGMAJFAULT, 31 31 PGLAZYFREED, 32 32 PGREFILL, 33 + PGREUSE, 33 34 PGSTEAL_KSWAPD, 34 35 PGSTEAL_DIRECT, 35 36 PGSCAN_KSWAPD,
+5 -35
mm/gup.c
··· 381 381 } 382 382 383 383 /* 384 - * FOLL_FORCE or a forced COW break can write even to unwritable pte's, 385 - * but only after we've gone through a COW cycle and they are dirty. 384 + * FOLL_FORCE can write to even unwritable pte's, but only 385 + * after we've gone through a COW cycle and they are dirty. 386 386 */ 387 387 static inline bool can_follow_write_pte(pte_t pte, unsigned int flags) 388 388 { 389 - return pte_write(pte) || ((flags & FOLL_COW) && pte_dirty(pte)); 390 - } 391 - 392 - /* 393 - * A (separate) COW fault might break the page the other way and 394 - * get_user_pages() would return the page from what is now the wrong 395 - * VM. So we need to force a COW break at GUP time even for reads. 396 - */ 397 - static inline bool should_force_cow_break(struct vm_area_struct *vma, unsigned int flags) 398 - { 399 - return is_cow_mapping(vma->vm_flags) && (flags & (FOLL_GET | FOLL_PIN)); 389 + return pte_write(pte) || 390 + ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte)); 400 391 } 401 392 402 393 static struct page *follow_page_pte(struct vm_area_struct *vma, ··· 1058 1067 goto out; 1059 1068 } 1060 1069 if (is_vm_hugetlb_page(vma)) { 1061 - if (should_force_cow_break(vma, foll_flags)) 1062 - foll_flags |= FOLL_WRITE; 1063 1070 i = follow_hugetlb_page(mm, vma, pages, vmas, 1064 1071 &start, &nr_pages, i, 1065 - foll_flags, locked); 1072 + gup_flags, locked); 1066 1073 if (locked && *locked == 0) { 1067 1074 /* 1068 1075 * We've got a VM_FAULT_RETRY ··· 1074 1085 continue; 1075 1086 } 1076 1087 } 1077 - 1078 - if (should_force_cow_break(vma, foll_flags)) 1079 - foll_flags |= FOLL_WRITE; 1080 - 1081 1088 retry: 1082 1089 /* 1083 1090 * If we have a pending SIGKILL, don't keep faulting pages and ··· 2674 2689 return -EFAULT; 2675 2690 2676 2691 /* 2677 - * The FAST_GUP case requires FOLL_WRITE even for pure reads, 2678 - * because get_user_pages() may need to cause an early COW in 2679 - * order to avoid confusing the normal COW routines. 
So only 2680 - * targets that are already writable are safe to do by just 2681 - * looking at the page tables. 2682 - * 2683 - * NOTE! With FOLL_FAST_ONLY we allow read-only gup_fast() here, 2684 - * because there is no slow path to fall back on. But you'd 2685 - * better be careful about possible COW pages - you'll get _a_ 2686 - * COW page, but not necessarily the one you intended to get 2687 - * depending on what COW event happens after this. COW may break 2688 - * the page copy in a random direction. 2689 - * 2690 2692 * Disable interrupts. The nested form is used, in order to allow 2691 2693 * full, general purpose use of this routine. 2692 2694 * ··· 2686 2714 */ 2687 2715 if (IS_ENABLED(CONFIG_HAVE_FAST_GUP) && gup_fast_permitted(start, end)) { 2688 2716 unsigned long fast_flags = gup_flags; 2689 - if (!(gup_flags & FOLL_FAST_ONLY)) 2690 - fast_flags |= FOLL_WRITE; 2691 2717 2692 2718 local_irq_save(flags); 2693 2719 gup_pgd_range(addr, end, fast_flags, pages, &nr_pinned);
+4 -3
mm/huge_memory.c
··· 1291 1291 } 1292 1292 1293 1293 /* 1294 - * FOLL_FORCE or a forced COW break can write even to unwritable pmd's, 1295 - * but only after we've gone through a COW cycle and they are dirty. 1294 + * FOLL_FORCE can write to even unwritable pmd's, but only 1295 + * after we've gone through a COW cycle and they are dirty. 1296 1296 */ 1297 1297 static inline bool can_follow_write_pmd(pmd_t pmd, unsigned int flags) 1298 1298 { 1299 - return pmd_write(pmd) || ((flags & FOLL_COW) && pmd_dirty(pmd)); 1299 + return pmd_write(pmd) || 1300 + ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd)); 1300 1301 } 1301 1302 1302 1303 struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
-25
mm/ksm.c
··· 2661 2661 goto again; 2662 2662 } 2663 2663 2664 - bool reuse_ksm_page(struct page *page, 2665 - struct vm_area_struct *vma, 2666 - unsigned long address) 2667 - { 2668 - #ifdef CONFIG_DEBUG_VM 2669 - if (WARN_ON(is_zero_pfn(page_to_pfn(page))) || 2670 - WARN_ON(!page_mapped(page)) || 2671 - WARN_ON(!PageLocked(page))) { 2672 - dump_page(page, "reuse_ksm_page"); 2673 - return false; 2674 - } 2675 - #endif 2676 - 2677 - if (PageSwapCache(page) || !page_stable_node(page)) 2678 - return false; 2679 - /* Prohibit parallel get_ksm_page() */ 2680 - if (!page_ref_freeze(page, 1)) 2681 - return false; 2682 - 2683 - page_move_anon_rmap(page, vma); 2684 - page->index = linear_page_index(vma, address); 2685 - page_ref_unfreeze(page, 1); 2686 - 2687 - return true; 2688 - } 2689 2664 #ifdef CONFIG_MIGRATION 2690 2665 void ksm_migrate_page(struct page *newpage, struct page *oldpage) 2691 2666 {
+18 -42
mm/memory.c
··· 2622 2622 if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1)) 2623 2623 update_mmu_cache(vma, vmf->address, vmf->pte); 2624 2624 pte_unmap_unlock(vmf->pte, vmf->ptl); 2625 + count_vm_event(PGREUSE); 2625 2626 } 2626 2627 2627 2628 /* ··· 2928 2927 * not dirty accountable. 2929 2928 */ 2930 2929 if (PageAnon(vmf->page)) { 2931 - int total_map_swapcount; 2932 - if (PageKsm(vmf->page) && (PageSwapCache(vmf->page) || 2933 - page_count(vmf->page) != 1)) 2930 + struct page *page = vmf->page; 2931 + 2932 + /* PageKsm() doesn't necessarily raise the page refcount */ 2933 + if (PageKsm(page) || page_count(page) != 1) 2934 2934 goto copy; 2935 - if (!trylock_page(vmf->page)) { 2936 - get_page(vmf->page); 2937 - pte_unmap_unlock(vmf->pte, vmf->ptl); 2938 - lock_page(vmf->page); 2939 - vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, 2940 - vmf->address, &vmf->ptl); 2941 - if (!pte_same(*vmf->pte, vmf->orig_pte)) { 2942 - update_mmu_tlb(vma, vmf->address, vmf->pte); 2943 - unlock_page(vmf->page); 2944 - pte_unmap_unlock(vmf->pte, vmf->ptl); 2945 - put_page(vmf->page); 2946 - return 0; 2947 - } 2948 - put_page(vmf->page); 2935 + if (!trylock_page(page)) 2936 + goto copy; 2937 + if (PageKsm(page) || page_mapcount(page) != 1 || page_count(page) != 1) { 2938 + unlock_page(page); 2939 + goto copy; 2949 2940 } 2950 - if (PageKsm(vmf->page)) { 2951 - bool reused = reuse_ksm_page(vmf->page, vmf->vma, 2952 - vmf->address); 2953 - unlock_page(vmf->page); 2954 - if (!reused) 2955 - goto copy; 2956 - wp_page_reuse(vmf); 2957 - return VM_FAULT_WRITE; 2958 - } 2959 - if (reuse_swap_page(vmf->page, &total_map_swapcount)) { 2960 - if (total_map_swapcount == 1) { 2961 - /* 2962 - * The page is all ours. Move it to 2963 - * our anon_vma so the rmap code will 2964 - * not search our parent or siblings. 2965 - * Protected against the rmap code by 2966 - * the page lock. 
2967 - */ 2968 - page_move_anon_rmap(vmf->page, vma); 2969 - } 2970 - unlock_page(vmf->page); 2971 - wp_page_reuse(vmf); 2972 - return VM_FAULT_WRITE; 2973 - } 2974 - unlock_page(vmf->page); 2941 + /* 2942 + * Ok, we've got the only map reference, and the only 2943 + * page count reference, and the page is locked, 2944 + * it's dark out, and we're wearing sunglasses. Hit it. 2945 + */ 2946 + wp_page_reuse(vmf); 2947 + unlock_page(page); 2948 + return VM_FAULT_WRITE; 2975 2949 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == 2976 2950 (VM_WRITE|VM_SHARED))) { 2977 2951 return wp_page_shared(vmf);
+1
mm/vmstat.c
··· 1241 1241 "pglazyfreed", 1242 1242 1243 1243 "pgrefill", 1244 + "pgreuse", 1244 1245 "pgsteal_kswapd", 1245 1246 "pgsteal_direct", 1246 1247 "pgscan_kswapd",