Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

mm: fix swapin race condition

The pte_same check is reliable only if the swap entry remains pinned (by
the page lock on swapcache). We also have to ensure the swapcache isn't
removed before we take the lock, as try_to_free_swap won't care about the
page pin.

One of the possible impacts of the race fixed by this patch is that a
KSM-shared page can point to the anon_vma of another process, which could
exit before the page is freed.

This can leave a page with a pointer to a recycled anon_vma object, or
worse, a pointer to something that is no longer an anon_vma.

[riel@redhat.com: changelog help]
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Acked-by: Hugh Dickins <hughd@google.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

authored by

Andrea Arcangeli and committed by
Linus Torvalds
4969c119 7c5367f2

+43 -19
+9 -11
include/linux/ksm.h
··· 16 16 struct stable_node; 17 17 struct mem_cgroup; 18 18 19 + struct page *ksm_does_need_to_copy(struct page *page, 20 + struct vm_area_struct *vma, unsigned long address); 21 + 19 22 #ifdef CONFIG_KSM 20 23 int ksm_madvise(struct vm_area_struct *vma, unsigned long start, 21 24 unsigned long end, int advice, unsigned long *vm_flags); ··· 73 70 * We'd like to make this conditional on vma->vm_flags & VM_MERGEABLE, 74 71 * but what if the vma was unmerged while the page was swapped out? 75 72 */ 76 - struct page *ksm_does_need_to_copy(struct page *page, 77 - struct vm_area_struct *vma, unsigned long address); 78 - static inline struct page *ksm_might_need_to_copy(struct page *page, 73 + static inline int ksm_might_need_to_copy(struct page *page, 79 74 struct vm_area_struct *vma, unsigned long address) 80 75 { 81 76 struct anon_vma *anon_vma = page_anon_vma(page); 82 77 83 - if (!anon_vma || 84 - (anon_vma->root == vma->anon_vma->root && 85 - page->index == linear_page_index(vma, address))) 86 - return page; 87 - 88 - return ksm_does_need_to_copy(page, vma, address); 78 + return anon_vma && 79 + (anon_vma->root != vma->anon_vma->root || 80 + page->index != linear_page_index(vma, address)); 89 81 } 90 82 91 83 int page_referenced_ksm(struct page *page, ··· 113 115 return 0; 114 116 } 115 117 116 - static inline struct page *ksm_might_need_to_copy(struct page *page, 118 + static inline int ksm_might_need_to_copy(struct page *page, 117 119 struct vm_area_struct *vma, unsigned long address) 118 120 { 119 - return page; 121 + return 0; 120 122 } 121 123 122 124 static inline int page_referenced_ksm(struct page *page,
-3
mm/ksm.c
··· 1504 1504 { 1505 1505 struct page *new_page; 1506 1506 1507 - unlock_page(page); /* any racers will COW it, not modify it */ 1508 - 1509 1507 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); 1510 1508 if (new_page) { 1511 1509 copy_user_highpage(new_page, page, address, vma); ··· 1519 1521 add_page_to_unevictable_list(new_page); 1520 1522 } 1521 1523 1522 - page_cache_release(page); 1523 1524 return new_page; 1524 1525 } 1525 1526
+34 -5
mm/memory.c
··· 2623 2623 unsigned int flags, pte_t orig_pte) 2624 2624 { 2625 2625 spinlock_t *ptl; 2626 - struct page *page; 2626 + struct page *page, *swapcache = NULL; 2627 2627 swp_entry_t entry; 2628 2628 pte_t pte; 2629 2629 struct mem_cgroup *ptr = NULL; ··· 2679 2679 lock_page(page); 2680 2680 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 2681 2681 2682 - page = ksm_might_need_to_copy(page, vma, address); 2683 - if (!page) { 2684 - ret = VM_FAULT_OOM; 2685 - goto out; 2682 + /* 2683 + * Make sure try_to_free_swap didn't release the swapcache 2684 + * from under us. The page pin isn't enough to prevent that. 2685 + */ 2686 + if (unlikely(!PageSwapCache(page))) 2687 + goto out_page; 2688 + 2689 + if (ksm_might_need_to_copy(page, vma, address)) { 2690 + swapcache = page; 2691 + page = ksm_does_need_to_copy(page, vma, address); 2692 + 2693 + if (unlikely(!page)) { 2694 + ret = VM_FAULT_OOM; 2695 + page = swapcache; 2696 + swapcache = NULL; 2697 + goto out_page; 2698 + } 2686 2699 } 2687 2700 2688 2701 if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) { ··· 2748 2735 if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) 2749 2736 try_to_free_swap(page); 2750 2737 unlock_page(page); 2738 + if (swapcache) { 2739 + /* 2740 + * Hold the lock to avoid the swap entry to be reused 2741 + * until we take the PT lock for the pte_same() check 2742 + * (to avoid false positives from pte_same). For 2743 + * further safety release the lock after the swap_free 2744 + * so that the swap count won't change under a 2745 + * parallel locked swapcache. 
2746 + */ 2747 + unlock_page(swapcache); 2748 + page_cache_release(swapcache); 2749 + } 2751 2750 2752 2751 if (flags & FAULT_FLAG_WRITE) { 2753 2752 ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte); ··· 2781 2756 unlock_page(page); 2782 2757 out_release: 2783 2758 page_cache_release(page); 2759 + if (swapcache) { 2760 + unlock_page(swapcache); 2761 + page_cache_release(swapcache); 2762 + } 2784 2763 return ret; 2785 2764 } 2786 2765