Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

userfaultfd: don't pin the user memory in userfaultfd_file_create()

userfaultfd_file_create() increments mm->mm_users; this means that the
memory won't be unmapped/freed if mm owner exits/execs, and UFFDIO_COPY
after that can populate the orphaned mm more.

Change userfaultfd_file_create() and userfaultfd_ctx_put() to use
mm->mm_count to pin mm_struct. This means that
atomic_inc_not_zero(mm->mm_users) is needed when we are going to
actually play with this memory. Except handle_userfault() path doesn't
need this, the caller must already have a reference.

The patch adds the new trivial helper, mmget_not_zero(), it can have
more users.

Link: http://lkml.kernel.org/r/20160516172254.GA8595@redhat.com
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Authored by Oleg Nesterov; committed by Linus Torvalds.

Commit: d2005e3f (parent: cd33a76b)

Diffstat: 34 insertions(+), 14 deletions(-) across 2 files.

fs/userfaultfd.c: 28 insertions(+), 13 deletions(-)
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -137,7 +137,7 @@
 		VM_BUG_ON(waitqueue_active(&ctx->fault_wqh));
 		VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock));
 		VM_BUG_ON(waitqueue_active(&ctx->fd_wqh));
-		mmput(ctx->mm);
+		mmdrop(ctx->mm);
 		kmem_cache_free(userfaultfd_ctx_cachep, ctx);
 	}
 }
@@ -434,6 +434,9 @@
 
 	ACCESS_ONCE(ctx->released) = true;
 
+	if (!mmget_not_zero(mm))
+		goto wakeup;
+
 	/*
 	 * Flush page faults out of all CPUs. NOTE: all page faults
 	 * must be retried without returning VM_FAULT_SIGBUS if
@@ -469,7 +466,8 @@
 		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
 	}
 	up_write(&mm->mmap_sem);
-
+	mmput(mm);
+wakeup:
 	/*
 	 * After no new page faults can wait on this fault_*wqh, flush
 	 * the last page faults that may have been already waiting on
@@ -764,9 +760,11 @@
 	start = uffdio_register.range.start;
 	end = start + uffdio_register.range.len;
 
+	ret = -ENOMEM;
+	if (!mmget_not_zero(mm))
+		goto out;
+
 	down_write(&mm->mmap_sem);
 	vma = find_vma_prev(mm, start, &prev);
-
-	ret = -ENOMEM;
 	if (!vma)
 		goto out_unlock;
@@ -870,6 +864,7 @@
 	} while (vma && vma->vm_start < end);
 out_unlock:
 	up_write(&mm->mmap_sem);
+	mmput(mm);
 	if (!ret) {
 		/*
 		 * Now that we scanned all vmas we can already tell
@@ -909,9 +902,11 @@
 	start = uffdio_unregister.start;
 	end = start + uffdio_unregister.len;
 
+	ret = -ENOMEM;
+	if (!mmget_not_zero(mm))
+		goto out;
+
 	down_write(&mm->mmap_sem);
 	vma = find_vma_prev(mm, start, &prev);
-
-	ret = -ENOMEM;
 	if (!vma)
 		goto out_unlock;
@@ -1007,6 +998,7 @@
 	} while (vma && vma->vm_start < end);
 out_unlock:
 	up_write(&mm->mmap_sem);
+	mmput(mm);
 out:
 	return ret;
 }
@@ -1077,9 +1067,11 @@
 		goto out;
 	if (uffdio_copy.mode & ~UFFDIO_COPY_MODE_DONTWAKE)
 		goto out;
-
-	ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
-			   uffdio_copy.len);
+	if (mmget_not_zero(ctx->mm)) {
+		ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
+				   uffdio_copy.len);
+		mmput(ctx->mm);
+	}
 	if (unlikely(put_user(ret, &user_uffdio_copy->copy)))
 		return -EFAULT;
 	if (ret < 0)
@@ -1122,8 +1110,11 @@
 	if (uffdio_zeropage.mode & ~UFFDIO_ZEROPAGE_MODE_DONTWAKE)
 		goto out;
 
-	ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start,
-			     uffdio_zeropage.range.len);
+	if (mmget_not_zero(ctx->mm)) {
+		ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start,
+				     uffdio_zeropage.range.len);
+		mmput(ctx->mm);
+	}
 	if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage)))
 		return -EFAULT;
 	if (ret < 0)
@@ -1304,12 +1289,12 @@
 	ctx->released = false;
 	ctx->mm = current->mm;
 	/* prevent the mm struct to be freed */
-	atomic_inc(&ctx->mm->mm_users);
+	atomic_inc(&ctx->mm->mm_count);
 
 	file = anon_inode_getfile("[userfaultfd]", &userfaultfd_fops, ctx,
 				  O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS));
 	if (IS_ERR(file)) {
-		mmput(ctx->mm);
+		mmdrop(ctx->mm);
 		kmem_cache_free(userfaultfd_ctx_cachep, ctx);
 	}
 out:
include/linux/sched.h: 6 insertions(+), 1 deletion(-)

--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2723,10 +2723,15 @@
 
 /* mmdrop drops the mm and the page tables */
 extern void __mmdrop(struct mm_struct *);
-static inline void mmdrop(struct mm_struct * mm)
+static inline void mmdrop(struct mm_struct *mm)
 {
 	if (unlikely(atomic_dec_and_test(&mm->mm_count)))
 		__mmdrop(mm);
+}
+
+static inline bool mmget_not_zero(struct mm_struct *mm)
+{
+	return atomic_inc_not_zero(&mm->mm_users);
 }
 
 /* mmput gets rid of the mappings and all user-space */