Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

mm: remove the now-unnecessary mmget_still_valid() hack

The preceding patches have ensured that core dumping properly takes the
mmap_lock. Thanks to that, we can now remove mmget_still_valid() and all
its users.

Signed-off-by: Jann Horn <jannh@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: "Eric W . Biederman" <ebiederm@xmission.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Link: http://lkml.kernel.org/r/20200827114932.3572699-8-jannh@google.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Authored by Jann Horn; committed by Linus Torvalds.
4d45e75a 7f3bfab5

+32 -110
-3
drivers/infiniband/core/uverbs_main.c
··· 845 845 * will only be one mm, so no big deal. 846 846 */ 847 847 mmap_read_lock(mm); 848 - if (!mmget_still_valid(mm)) 849 - goto skip_mm; 850 848 mutex_lock(&ufile->umap_lock); 851 849 list_for_each_entry_safe (priv, next_priv, &ufile->umaps, 852 850 list) { ··· 863 865 } 864 866 } 865 867 mutex_unlock(&ufile->umap_lock); 866 - skip_mm: 867 868 mmap_read_unlock(mm); 868 869 mmput(mm); 869 870 }
+21 -23
drivers/vfio/pci/vfio_pci.c
··· 1480 1480 } else { 1481 1481 mmap_read_lock(mm); 1482 1482 } 1483 - if (mmget_still_valid(mm)) { 1484 - if (try) { 1485 - if (!mutex_trylock(&vdev->vma_lock)) { 1486 - mmap_read_unlock(mm); 1487 - mmput(mm); 1488 - return 0; 1489 - } 1490 - } else { 1491 - mutex_lock(&vdev->vma_lock); 1483 + if (try) { 1484 + if (!mutex_trylock(&vdev->vma_lock)) { 1485 + mmap_read_unlock(mm); 1486 + mmput(mm); 1487 + return 0; 1492 1488 } 1493 - list_for_each_entry_safe(mmap_vma, tmp, 1494 - &vdev->vma_list, vma_next) { 1495 - struct vm_area_struct *vma = mmap_vma->vma; 1496 - 1497 - if (vma->vm_mm != mm) 1498 - continue; 1499 - 1500 - list_del(&mmap_vma->vma_next); 1501 - kfree(mmap_vma); 1502 - 1503 - zap_vma_ptes(vma, vma->vm_start, 1504 - vma->vm_end - vma->vm_start); 1505 - } 1506 - mutex_unlock(&vdev->vma_lock); 1489 + } else { 1490 + mutex_lock(&vdev->vma_lock); 1507 1491 } 1492 + list_for_each_entry_safe(mmap_vma, tmp, 1493 + &vdev->vma_list, vma_next) { 1494 + struct vm_area_struct *vma = mmap_vma->vma; 1495 + 1496 + if (vma->vm_mm != mm) 1497 + continue; 1498 + 1499 + list_del(&mmap_vma->vma_next); 1500 + kfree(mmap_vma); 1501 + 1502 + zap_vma_ptes(vma, vma->vm_start, 1503 + vma->vm_end - vma->vm_start); 1504 + } 1505 + mutex_unlock(&vdev->vma_lock); 1508 1506 mmap_read_unlock(mm); 1509 1507 mmput(mm); 1510 1508 }
-18
fs/proc/task_mmu.c
··· 1244 1244 count = -EINTR; 1245 1245 goto out_mm; 1246 1246 } 1247 - /* 1248 - * Avoid to modify vma->vm_flags 1249 - * without locked ops while the 1250 - * coredump reads the vm_flags. 1251 - */ 1252 - if (!mmget_still_valid(mm)) { 1253 - /* 1254 - * Silently return "count" 1255 - * like if get_task_mm() 1256 - * failed. FIXME: should this 1257 - * function have returned 1258 - * -ESRCH if get_task_mm() 1259 - * failed like if 1260 - * get_proc_task() fails? 1261 - */ 1262 - mmap_write_unlock(mm); 1263 - goto out_mm; 1264 - } 1265 1247 for (vma = mm->mmap; vma; vma = vma->vm_next) { 1266 1248 vma->vm_flags &= ~VM_SOFTDIRTY; 1267 1249 vma_set_page_prot(vma);
+9 -19
fs/userfaultfd.c
··· 601 601 602 602 /* the various vma->vm_userfaultfd_ctx still points to it */ 603 603 mmap_write_lock(mm); 604 - /* no task can run (and in turn coredump) yet */ 605 - VM_WARN_ON(!mmget_still_valid(mm)); 606 604 for (vma = mm->mmap; vma; vma = vma->vm_next) 607 605 if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) { 608 606 vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; ··· 840 842 /* len == 0 means wake all */ 841 843 struct userfaultfd_wake_range range = { .len = 0, }; 842 844 unsigned long new_flags; 843 - bool still_valid; 844 845 845 846 WRITE_ONCE(ctx->released, true); 846 847 ··· 855 858 * taking the mmap_lock for writing. 856 859 */ 857 860 mmap_write_lock(mm); 858 - still_valid = mmget_still_valid(mm); 859 861 prev = NULL; 860 862 for (vma = mm->mmap; vma; vma = vma->vm_next) { 861 863 cond_resched(); ··· 865 869 continue; 866 870 } 867 871 new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP); 868 - if (still_valid) { 869 - prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end, 870 - new_flags, vma->anon_vma, 871 - vma->vm_file, vma->vm_pgoff, 872 - vma_policy(vma), 873 - NULL_VM_UFFD_CTX); 874 - if (prev) 875 - vma = prev; 876 - else 877 - prev = vma; 878 - } 872 + prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end, 873 + new_flags, vma->anon_vma, 874 + vma->vm_file, vma->vm_pgoff, 875 + vma_policy(vma), 876 + NULL_VM_UFFD_CTX); 877 + if (prev) 878 + vma = prev; 879 + else 880 + prev = vma; 879 881 vma->vm_flags = new_flags; 880 882 vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; 881 883 } ··· 1303 1309 goto out; 1304 1310 1305 1311 mmap_write_lock(mm); 1306 - if (!mmget_still_valid(mm)) 1307 - goto out_unlock; 1308 1312 vma = find_vma_prev(mm, start, &prev); 1309 1313 if (!vma) 1310 1314 goto out_unlock; ··· 1503 1511 goto out; 1504 1512 1505 1513 mmap_write_lock(mm); 1506 - if (!mmget_still_valid(mm)) 1507 - goto out_unlock; 1508 1514 vma = find_vma_prev(mm, start, &prev); 1509 1515 if (!vma) 1510 1516 goto out_unlock;
-25
include/linux/sched/mm.h
··· 49 49 __mmdrop(mm); 50 50 } 51 51 52 - /* 53 - * This has to be called after a get_task_mm()/mmget_not_zero() 54 - * followed by taking the mmap_lock for writing before modifying the 55 - * vmas or anything the coredump pretends not to change from under it. 56 - * 57 - * It also has to be called when mmgrab() is used in the context of 58 - * the process, but then the mm_count refcount is transferred outside 59 - * the context of the process to run down_write() on that pinned mm. 60 - * 61 - * NOTE: find_extend_vma() called from GUP context is the only place 62 - * that can modify the "mm" (notably the vm_start/end) under mmap_lock 63 - * for reading and outside the context of the process, so it is also 64 - * the only case that holds the mmap_lock for reading that must call 65 - * this function. Generally if the mmap_lock is hold for reading 66 - * there's no need of this check after get_task_mm()/mmget_not_zero(). 67 - * 68 - * This function can be obsoleted and the check can be removed, after 69 - * the coredump code will hold the mmap_lock for writing before 70 - * invoking the ->core_dump methods. 71 - */ 72 - static inline bool mmget_still_valid(struct mm_struct *mm) 73 - { 74 - return likely(!mm->core_state); 75 - } 76 - 77 52 /** 78 53 * mmget() - Pin the address space associated with a &struct mm_struct. 79 54 * @mm: The address space to pin.
+1 -1
mm/khugepaged.c
··· 434 434 435 435 static inline int khugepaged_test_exit(struct mm_struct *mm) 436 436 { 437 - return atomic_read(&mm->mm_users) == 0 || !mmget_still_valid(mm); 437 + return atomic_read(&mm->mm_users) == 0; 438 438 } 439 439 440 440 static bool hugepage_vma_check(struct vm_area_struct *vma,
-17
mm/madvise.c
··· 1085 1085 if (write) { 1086 1086 if (mmap_write_lock_killable(current->mm)) 1087 1087 return -EINTR; 1088 - 1089 - /* 1090 - * We may have stolen the mm from another process 1091 - * that is undergoing core dumping. 1092 - * 1093 - * Right now that's io_ring, in the future it may 1094 - * be remote process management and not "current" 1095 - * at all. 1096 - * 1097 - * We need to fix core dumping to not do this, 1098 - * but for now we have the mmget_still_valid() 1099 - * model. 1100 - */ 1101 - if (!mmget_still_valid(current->mm)) { 1102 - mmap_write_unlock(current->mm); 1103 - return -EINTR; 1104 - } 1105 1088 } else { 1106 1089 mmap_read_lock(current->mm); 1107 1090 }
+1 -4
mm/mmap.c
··· 2562 2562 if (vma && (vma->vm_start <= addr)) 2563 2563 return vma; 2564 2564 /* don't alter vm_end if the coredump is running */ 2565 - if (!prev || !mmget_still_valid(mm) || expand_stack(prev, addr)) 2565 + if (!prev || expand_stack(prev, addr)) 2566 2566 return NULL; 2567 2567 if (prev->vm_flags & VM_LOCKED) 2568 2568 populate_vma_page_range(prev, addr, prev->vm_end, NULL); ··· 2587 2587 if (vma->vm_start <= addr) 2588 2588 return vma; 2589 2589 if (!(vma->vm_flags & VM_GROWSDOWN)) 2590 - return NULL; 2591 - /* don't alter vm_start if the coredump is running */ 2592 - if (!mmget_still_valid(mm)) 2593 2590 return NULL; 2594 2591 start = vma->vm_start; 2595 2592 if (expand_stack(vma, addr))