[PATCH] fix get_user_pages bug

Checking pte_dirty instead of pte_write in __follow_page is problematic
for s390, and for copy_one_pte, which leaves the pte dirty when it
clears the write bit.
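
That is, the old test accepted a dirty but write-protected pte as if it
were writable (the removed line from the mm/memory.c hunk below; the
comment is added here for explanation):

    /* old __follow_page test: a COWed pte left dirty by copy_one_pte
     * passes this check even though it is no longer writable */
    if (write && !pte_dirty(pte))
            goto out;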

So revert __follow_page to check pte_write as before, and make
do_wp_page pass back a special extra VM_FAULT_WRITE bit to say it has
done its full job: once get_user_pages receives this value, it no longer
requires pte_write in __follow_page.
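
In get_user_pages this becomes one extra case in the fault switch
(condensed from the mm/memory.c hunk below): on VM_FAULT_WRITE the
write requirement is dropped before retrying follow_page.

    switch (__handle_mm_fault(mm, vma, start, write_access)) {
    case VM_FAULT_WRITE:
            /*
             * do_wp_page has broken COW when necessary, even if
             * maybe_mkwrite decided not to set pte_write
             */
            write_access = 0;
            /* FALLTHRU */
    case VM_FAULT_MINOR:
            tsk->min_flt++;
            break;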

But most callers of handle_mm_fault, in the various architectures, have
switch statements that do not expect this new case. To avoid changing
them all in a hurry, make an inline wrapper function (keeping the old
name) that masks off the new bit, and have get_user_pages call the
extended, double-underscored interface (__handle_mm_fault) directly.
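
The wrapper itself is small; from the include/linux/mm.h hunk below
(comment added here for explanation):

    extern int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                                 unsigned long address, int write_access);

    /* old name, old return values: existing callers never see VM_FAULT_WRITE */
    static inline int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                                      unsigned long address, int write_access)
    {
            return __handle_mm_fault(mm, vma, address, write_access) & (~VM_FAULT_WRITE);
    }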

Yes, we do have a call to do_wp_page from do_swap_page, but there is no
need to change that: in the rare case where it matters, get_user_pages
will simply fault again and the second do_wp_page will return
VM_FAULT_WRITE.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
[ Cleanups by Nick Piggin ]
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

Authored by Nick Piggin and committed by Linus Torvalds (f33ea7f4, 5cb4cc0d)

+40 -13

include/linux/mm.h   (+17 -5)
···
  * Used to decide whether a process gets delivered SIGBUS or
  * just gets major/minor fault counters bumped up.
  */
-#define VM_FAULT_OOM       (-1)
-#define VM_FAULT_SIGBUS    0
-#define VM_FAULT_MINOR     1
-#define VM_FAULT_MAJOR     2
+#define VM_FAULT_OOM       0x00
+#define VM_FAULT_SIGBUS    0x01
+#define VM_FAULT_MINOR     0x02
+#define VM_FAULT_MAJOR     0x03
+
+/*
+ * Special case for get_user_pages.
+ * Must be in a distinct bit from the above VM_FAULT_ flags.
+ */
+#define VM_FAULT_WRITE     0x10

 #define offset_in_page(p)  ((unsigned long)(p) & ~PAGE_MASK)

···
 extern pte_t *FASTCALL(pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address));
 extern int install_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot);
 extern int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long pgoff, pgprot_t prot);
-extern int handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma, unsigned long address, int write_access);
+extern int __handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma, unsigned long address, int write_access);
+
+static inline int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, int write_access)
+{
+        return __handle_mm_fault(mm, vma, address, write_access) & (~VM_FAULT_WRITE);
+}
+
 extern int make_pages_present(unsigned long addr, unsigned long end);
 extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write);
 void install_arg_page(struct vm_area_struct *, struct page *, unsigned long);

mm/memory.c   (+23 -8)
···
         pte = *ptep;
         pte_unmap(ptep);
         if (pte_present(pte)) {
-                if (write && !pte_dirty(pte))
+                if (write && !pte_write(pte))
                         goto out;
                 if (read && !pte_read(pte))
                         goto out;
                 pfn = pte_pfn(pte);
                 if (pfn_valid(pfn)) {
                         page = pfn_to_page(pfn);
-                        if (accessed)
+                        if (accessed) {
+                                if (write && !pte_dirty(pte) && !PageDirty(page))
+                                        set_page_dirty(page);
                                 mark_page_accessed(page);
+                        }
                         return page;
                 }
         }
···
                 }
                 spin_lock(&mm->page_table_lock);
                 do {
+                        int write_access = write;
                         struct page *page;

                         cond_resched_lock(&mm->page_table_lock);
-                        while (!(page = follow_page(mm, start, write))) {
+                        while (!(page = follow_page(mm, start, write_access))) {
                                 /*
                                  * Shortcut for anonymous pages. We don't want
                                  * to force the creation of pages tables for
···
                                         break;
                                 }
                                 spin_unlock(&mm->page_table_lock);
-                                switch (handle_mm_fault(mm,vma,start,write)) {
+                                switch (__handle_mm_fault(mm, vma, start,
+                                                write_access)) {
+                                case VM_FAULT_WRITE:
+                                        /*
+                                         * do_wp_page has broken COW when
+                                         * necessary, even if maybe_mkwrite
+                                         * decided not to set pte_write
+                                         */
+                                        write_access = 0;
+                                        /* FALLTHRU */
                                 case VM_FAULT_MINOR:
                                         tsk->min_flt++;
                                         break;
···
         struct page *old_page, *new_page;
         unsigned long pfn = pte_pfn(pte);
         pte_t entry;
+        int ret;

         if (unlikely(!pfn_valid(pfn))) {
                 /*
···
                         lazy_mmu_prot_update(entry);
                         pte_unmap(page_table);
                         spin_unlock(&mm->page_table_lock);
-                        return VM_FAULT_MINOR;
+                        return VM_FAULT_MINOR|VM_FAULT_WRITE;
                 }
         }
         pte_unmap(page_table);
···
         /*
          * Re-check the pte - we dropped the lock
          */
+        ret = VM_FAULT_MINOR;
         spin_lock(&mm->page_table_lock);
         page_table = pte_offset_map(pmd, address);
         if (likely(pte_same(*page_table, pte))) {
···

                 /* Free the old page.. */
                 new_page = old_page;
+                ret |= VM_FAULT_WRITE;
         }
         pte_unmap(page_table);
         page_cache_release(new_page);
         page_cache_release(old_page);
         spin_unlock(&mm->page_table_lock);
-        return VM_FAULT_MINOR;
+        return ret;

 no_new_page:
         page_cache_release(old_page);
···
         if (write_access) {
                 if (!pte_write(entry))
                         return do_wp_page(mm, vma, address, pte, pmd, entry);
-
                 entry = pte_mkdirty(entry);
         }
         entry = pte_mkyoung(entry);
···
 /*
  * By the time we get here, we already hold the mm semaphore
  */
-int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma,
+int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma,
                 unsigned long address, int write_access)
 {
         pgd_t *pgd;