
userfaultfd: add new syscall to provide memory externalization

Once a userfaultfd has been created and certain regions of the process
virtual address space have been registered with it, the thread responsible
for doing the memory externalization can manage the page faults in
userland by talking to the kernel using the userfaultfd protocol.

poll() can be used to know when there are new pending userfaults to be
read (POLLIN).
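
As an illustration only (not part of this patch), a minimal userland
sketch of that protocol could look like the following. It exercises only
the interfaces added here (the UFFDIO_API handshake, UFFDIO_REGISTER,
poll()/read() of faulting addresses, UFFDIO_WAKE); "handle_userfaults",
"area", "len" and "page_size" are hypothetical names, error handling is
omitted, and actually making the missing page present is left to the
range ioctls added later in this series:

	/*
	 * Hypothetical userland sketch, not part of this patch.
	 * __NR_userfaultfd may need to be defined by hand on
	 * kernels this new; error handling omitted.
	 */
	#include <linux/userfaultfd.h>
	#include <sys/syscall.h>
	#include <sys/ioctl.h>
	#include <fcntl.h>
	#include <poll.h>
	#include <unistd.h>

	static void handle_userfaults(void *area, unsigned long len,
				      unsigned long page_size)
	{
		int uffd = syscall(__NR_userfaultfd, O_CLOEXEC);
		struct uffdio_api api = { .api = UFFD_API };
		struct uffdio_register reg = {
			.range = { .start = (unsigned long) area, .len = len },
			.mode = UFFDIO_REGISTER_MODE_MISSING,
		};
		struct pollfd pfd = { .fd = uffd, .events = POLLIN };
		struct uffdio_range wake;
		__u64 addr;

		ioctl(uffd, UFFDIO_API, &api);		/* handshake comes first */
		ioctl(uffd, UFFDIO_REGISTER, &reg);	/* track missing pages */

		for (;;) {
			poll(&pfd, 1, -1);	/* POLLIN: pending userfaults */
			if (read(uffd, &addr, sizeof(addr)) != sizeof(addr))
				continue;
			/* low bits encode write/WP, see userfault_address() */
			addr &= ~(__u64)(page_size - 1);
			/* ... make the page present by external means ... */
			wake.start = addr;
			wake.len = page_size;
			ioctl(uffd, UFFDIO_WAKE, &wake);  /* resume faulters */
		}
	}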

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Acked-by: Pavel Emelyanov <xemul@parallels.com>
Cc: Sanidhya Kashyap <sanidhya.gatech@gmail.com>
Cc: zhang.zhanghailiang@huawei.com
Cc: "Kirill A. Shutemov" <kirill@shutemov.name>
Cc: Andres Lagar-Cavilla <andreslc@google.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Hugh Dickins <hughd@google.com>
Cc: Peter Feiner <pfeiner@google.com>
Cc: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: "Huangpeng (Peter)" <peter.huangpeng@huawei.com>
Cc: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Authored by Andrea Arcangeli and committed by Linus Torvalds
(commit 86039bd3, parent c1294d05).

 fs/userfaultfd.c | 1036 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 1036 insertions(+)
 create mode 100644 fs/userfaultfd.c
fs/userfaultfd.c:

/*
 * fs/userfaultfd.c
 *
 * Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org>
 * Copyright (C) 2008-2009 Red Hat, Inc.
 * Copyright (C) 2015 Red Hat, Inc.
 *
 * This work is licensed under the terms of the GNU GPL, version 2. See
 * the COPYING file in the top-level directory.
 *
 * Some part derived from fs/eventfd.c (anon inode setup) and
 * mm/ksm.c (mm hashing).
 */

#include <linux/hashtable.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/seq_file.h>
#include <linux/file.h>
#include <linux/bug.h>
#include <linux/anon_inodes.h>
#include <linux/syscalls.h>
#include <linux/userfaultfd_k.h>
#include <linux/mempolicy.h>
#include <linux/ioctl.h>
#include <linux/security.h>

enum userfaultfd_state {
	UFFD_STATE_WAIT_API,
	UFFD_STATE_RUNNING,
};

struct userfaultfd_ctx {
	/* pseudo fd refcounting */
	atomic_t refcount;
	/* waitqueue head for the userfaultfd page faults */
	wait_queue_head_t fault_wqh;
	/* waitqueue head for the pseudo fd to wakeup poll/read */
	wait_queue_head_t fd_wqh;
	/* userfaultfd syscall flags */
	unsigned int flags;
	/* state machine */
	enum userfaultfd_state state;
	/* released */
	bool released;
	/* mm with one or more vmas attached to this userfaultfd_ctx */
	struct mm_struct *mm;
};

struct userfaultfd_wait_queue {
	unsigned long address;
	wait_queue_t wq;
	bool pending;
	struct userfaultfd_ctx *ctx;
};

struct userfaultfd_wake_range {
	unsigned long start;
	unsigned long len;
};

static int userfaultfd_wake_function(wait_queue_t *wq, unsigned mode,
				     int wake_flags, void *key)
{
	struct userfaultfd_wake_range *range = key;
	int ret;
	struct userfaultfd_wait_queue *uwq;
	unsigned long start, len;

	uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
	ret = 0;
	/* don't wake the pending ones, so that reads don't block */
	if (uwq->pending && !ACCESS_ONCE(uwq->ctx->released))
		goto out;
	/* len == 0 means wake all */
	start = range->start;
	len = range->len;
	if (len && (start > uwq->address || start + len <= uwq->address))
		goto out;
	ret = wake_up_state(wq->private, mode);
	if (ret)
		/*
		 * Wake only once, autoremove behavior.
		 *
		 * After the effect of list_del_init is visible to the
		 * other CPUs, the waitqueue may disappear from under
		 * us, see the !list_empty_careful() in
		 * handle_userfault(). try_to_wake_up() has an
		 * implicit smp_mb__before_spinlock, and the
		 * wq->private is read before calling the extern
		 * function "wake_up_state" (which in turn calls
		 * try_to_wake_up). While the spin_lock;spin_unlock;
		 * wouldn't be enough, the smp_mb__before_spinlock is
		 * enough to avoid an explicit smp_mb() here.
		 */
		list_del_init(&wq->task_list);
out:
	return ret;
}

/**
 * userfaultfd_ctx_get - Acquires a reference to the internal userfaultfd
 * context.
 * @ctx: [in] Pointer to the userfaultfd context.
 */
static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx)
{
	if (!atomic_inc_not_zero(&ctx->refcount))
		BUG();
}

/**
 * userfaultfd_ctx_put - Releases a reference to the internal userfaultfd
 * context.
 * @ctx: [in] Pointer to userfaultfd context.
 *
 * The userfaultfd context reference must have been previously acquired either
 * with userfaultfd_ctx_get() or userfaultfd_ctx_fdget().
 */
static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx)
{
	if (atomic_dec_and_test(&ctx->refcount)) {
		VM_BUG_ON(spin_is_locked(&ctx->fault_wqh.lock));
		VM_BUG_ON(waitqueue_active(&ctx->fault_wqh));
		VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock));
		VM_BUG_ON(waitqueue_active(&ctx->fd_wqh));
		mmput(ctx->mm);
		kfree(ctx);
	}
}

static inline unsigned long userfault_address(unsigned long address,
					      unsigned int flags,
					      unsigned long reason)
{
	BUILD_BUG_ON(PAGE_SHIFT < UFFD_BITS);
	address &= PAGE_MASK;
	if (flags & FAULT_FLAG_WRITE)
		/*
		 * Encode "write" fault information in the LSB of the
		 * address read by userland, without depending on
		 * FAULT_FLAG_WRITE kernel internal value.
		 */
		address |= UFFD_BIT_WRITE;
	if (reason & VM_UFFD_WP)
		/*
		 * Encode "reason" fault information as bit number 1
		 * in the address read by userland. If bit number 1 is
		 * clear it means the reason is a VM_FAULT_MISSING
		 * fault.
		 */
		address |= UFFD_BIT_WP;
	return address;
}

/*
 * The locking rules involved in returning VM_FAULT_RETRY depending on
 * FAULT_FLAG_ALLOW_RETRY, FAULT_FLAG_RETRY_NOWAIT and
 * FAULT_FLAG_KILLABLE are not straightforward. The "Caution"
 * recommendation in __lock_page_or_retry is not an understatement.
 *
 * If FAULT_FLAG_ALLOW_RETRY is set, the mmap_sem must be released
 * before returning VM_FAULT_RETRY only if FAULT_FLAG_RETRY_NOWAIT is
 * not set.
 *
 * If FAULT_FLAG_ALLOW_RETRY is set but FAULT_FLAG_KILLABLE is not
 * set, VM_FAULT_RETRY can still be returned if and only if there are
 * fatal_signal_pending()s, and the mmap_sem must be released before
 * returning it.
 */
int handle_userfault(struct vm_area_struct *vma, unsigned long address,
		     unsigned int flags, unsigned long reason)
{
	struct mm_struct *mm = vma->vm_mm;
	struct userfaultfd_ctx *ctx;
	struct userfaultfd_wait_queue uwq;

	BUG_ON(!rwsem_is_locked(&mm->mmap_sem));

	ctx = vma->vm_userfaultfd_ctx.ctx;
	if (!ctx)
		return VM_FAULT_SIGBUS;

	BUG_ON(ctx->mm != mm);

	VM_BUG_ON(reason & ~(VM_UFFD_MISSING|VM_UFFD_WP));
	VM_BUG_ON(!(reason & VM_UFFD_MISSING) ^ !!(reason & VM_UFFD_WP));

	/*
	 * If it's already released don't get it. This avoids looping
	 * in __get_user_pages if userfaultfd_release waits on the
	 * caller of handle_userfault to release the mmap_sem.
	 */
	if (unlikely(ACCESS_ONCE(ctx->released)))
		return VM_FAULT_SIGBUS;

	/*
	 * Check that we can return VM_FAULT_RETRY.
	 *
	 * NOTE: it should become possible to return VM_FAULT_RETRY
	 * even if FAULT_FLAG_TRIED is set without leading to gup()
	 * -EBUSY failures, if the userfaultfd is to be extended for
	 * VM_UFFD_WP tracking and we intend to arm the userfault
	 * without first stopping userland access to the memory. For
	 * VM_UFFD_MISSING userfaults this is enough for now.
	 */
	if (unlikely(!(flags & FAULT_FLAG_ALLOW_RETRY))) {
		/*
		 * Validate the invariant that nowait must allow retry
		 * to be sure not to return SIGBUS erroneously on
		 * nowait invocations.
		 */
		BUG_ON(flags & FAULT_FLAG_RETRY_NOWAIT);
#ifdef CONFIG_DEBUG_VM
		if (printk_ratelimit()) {
			printk(KERN_WARNING
			       "FAULT_FLAG_ALLOW_RETRY missing %x\n", flags);
			dump_stack();
		}
#endif
		return VM_FAULT_SIGBUS;
	}

	/*
	 * Handle nowait, not much to do other than tell it to retry
	 * and wait.
	 */
	if (flags & FAULT_FLAG_RETRY_NOWAIT)
		return VM_FAULT_RETRY;

	/* take the reference before dropping the mmap_sem */
	userfaultfd_ctx_get(ctx);

	/* be gentle and immediately relinquish the mmap_sem */
	up_read(&mm->mmap_sem);

	init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
	uwq.wq.private = current;
	uwq.address = userfault_address(address, flags, reason);
	uwq.pending = true;
	uwq.ctx = ctx;

	spin_lock(&ctx->fault_wqh.lock);
	/*
	 * After the __add_wait_queue the uwq is visible to userland
	 * through poll/read().
	 */
	__add_wait_queue(&ctx->fault_wqh, &uwq.wq);
	for (;;) {
		set_current_state(TASK_KILLABLE);
		if (!uwq.pending || ACCESS_ONCE(ctx->released) ||
		    fatal_signal_pending(current))
			break;
		spin_unlock(&ctx->fault_wqh.lock);

		wake_up_poll(&ctx->fd_wqh, POLLIN);
		schedule();

		spin_lock(&ctx->fault_wqh.lock);
	}
	__remove_wait_queue(&ctx->fault_wqh, &uwq.wq);
	__set_current_state(TASK_RUNNING);
	spin_unlock(&ctx->fault_wqh.lock);

	/*
	 * ctx may go away after this if the userfault pseudo fd is
	 * already released.
	 */
	userfaultfd_ctx_put(ctx);

	return VM_FAULT_RETRY;
}

static int userfaultfd_release(struct inode *inode, struct file *file)
{
	struct userfaultfd_ctx *ctx = file->private_data;
	struct mm_struct *mm = ctx->mm;
	struct vm_area_struct *vma, *prev;
	/* len == 0 means wake all */
	struct userfaultfd_wake_range range = { .len = 0, };
	unsigned long new_flags;

	ACCESS_ONCE(ctx->released) = true;

	/*
	 * Flush page faults out of all CPUs. NOTE: all page faults
	 * must be retried without returning VM_FAULT_SIGBUS if
	 * userfaultfd_ctx_get() succeeds but vma->vm_userfaultfd_ctx
	 * changes while handle_userfault released the mmap_sem. So
	 * it's critical that released is set to true (above), before
	 * taking the mmap_sem for writing.
	 */
	down_write(&mm->mmap_sem);
	prev = NULL;
	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		cond_resched();
		BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^
		       !!(vma->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP)));
		if (vma->vm_userfaultfd_ctx.ctx != ctx) {
			prev = vma;
			continue;
		}
		new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP);
		prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end,
				 new_flags, vma->anon_vma,
				 vma->vm_file, vma->vm_pgoff,
				 vma_policy(vma),
				 NULL_VM_UFFD_CTX);
		if (prev)
			vma = prev;
		else
			prev = vma;
		vma->vm_flags = new_flags;
		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
	}
	up_write(&mm->mmap_sem);

	/*
	 * After no new page faults can wait on this fault_wqh, flush
	 * the last page faults that may have been already waiting on
	 * the fault_wqh.
	 */
	spin_lock(&ctx->fault_wqh.lock);
	__wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, 0, &range);
	spin_unlock(&ctx->fault_wqh.lock);

	wake_up_poll(&ctx->fd_wqh, POLLHUP);
	userfaultfd_ctx_put(ctx);
	return 0;
}

/* fault_wqh.lock must be held by the caller */
static inline unsigned int find_userfault(struct userfaultfd_ctx *ctx,
					  struct userfaultfd_wait_queue **uwq)
{
	wait_queue_t *wq;
	struct userfaultfd_wait_queue *_uwq;
	unsigned int ret = 0;

	VM_BUG_ON(!spin_is_locked(&ctx->fault_wqh.lock));

	list_for_each_entry(wq, &ctx->fault_wqh.task_list, task_list) {
		_uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
		if (_uwq->pending) {
			ret = POLLIN;
			if (!uwq)
				/*
				 * If there's at least one pending and
				 * we don't care which one it is,
				 * break immediately and leverage the
				 * efficiency of the LIFO walk.
				 */
				break;
			/*
			 * If we need to find which one was pending we
			 * keep walking until we find the first not
			 * pending one, so we read() them in FIFO order.
			 */
			*uwq = _uwq;
		} else
			/*
			 * break the loop at the first not pending
			 * one, there cannot be pending userfaults
			 * after the first not pending one, because
			 * all new pending ones are inserted at the
			 * head and we walk it in LIFO.
			 */
			break;
	}

	return ret;
}

static unsigned int userfaultfd_poll(struct file *file, poll_table *wait)
{
	struct userfaultfd_ctx *ctx = file->private_data;
	unsigned int ret;

	poll_wait(file, &ctx->fd_wqh, wait);

	switch (ctx->state) {
	case UFFD_STATE_WAIT_API:
		return POLLERR;
	case UFFD_STATE_RUNNING:
		spin_lock(&ctx->fault_wqh.lock);
		ret = find_userfault(ctx, NULL);
		spin_unlock(&ctx->fault_wqh.lock);
		return ret;
	default:
		BUG();
	}
}

static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
				    __u64 *addr)
{
	ssize_t ret;
	DECLARE_WAITQUEUE(wait, current);
	struct userfaultfd_wait_queue *uwq = NULL;

	/* always take the fd_wqh lock before the fault_wqh lock */
	spin_lock(&ctx->fd_wqh.lock);
	__add_wait_queue(&ctx->fd_wqh, &wait);
	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE);
		spin_lock(&ctx->fault_wqh.lock);
		if (find_userfault(ctx, &uwq)) {
			/*
			 * The fault_wqh.lock prevents the uwq from
			 * disappearing from under us.
			 */
			uwq->pending = false;
			/* careful to always initialize addr if ret == 0 */
			*addr = uwq->address;
			spin_unlock(&ctx->fault_wqh.lock);
			ret = 0;
			break;
		}
		spin_unlock(&ctx->fault_wqh.lock);
		if (signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}
		if (no_wait) {
			ret = -EAGAIN;
			break;
		}
		spin_unlock(&ctx->fd_wqh.lock);
		schedule();
		spin_lock(&ctx->fd_wqh.lock);
	}
	__remove_wait_queue(&ctx->fd_wqh, &wait);
	__set_current_state(TASK_RUNNING);
	spin_unlock(&ctx->fd_wqh.lock);

	return ret;
}

static ssize_t userfaultfd_read(struct file *file, char __user *buf,
				size_t count, loff_t *ppos)
{
	struct userfaultfd_ctx *ctx = file->private_data;
	ssize_t _ret, ret = 0;
	/* careful to always initialize addr if ret == 0 */
	__u64 uninitialized_var(addr);
	int no_wait = file->f_flags & O_NONBLOCK;

	if (ctx->state == UFFD_STATE_WAIT_API)
		return -EINVAL;
	BUG_ON(ctx->state != UFFD_STATE_RUNNING);

	for (;;) {
		if (count < sizeof(addr))
			return ret ? ret : -EINVAL;
		_ret = userfaultfd_ctx_read(ctx, no_wait, &addr);
		if (_ret < 0)
			return ret ? ret : _ret;
		if (put_user(addr, (__u64 __user *) buf))
			return ret ? ret : -EFAULT;
		ret += sizeof(addr);
		buf += sizeof(addr);
		count -= sizeof(addr);
		/*
		 * Allow reading more than one fault at a time but only
		 * block if waiting for the very first one.
		 */
		no_wait = O_NONBLOCK;
	}
}

static void __wake_userfault(struct userfaultfd_ctx *ctx,
			     struct userfaultfd_wake_range *range)
{
	unsigned long start, end;

	start = range->start;
	end = range->start + range->len;

	spin_lock(&ctx->fault_wqh.lock);
	/* wake all in the range and autoremove */
	__wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, 0, range);
	spin_unlock(&ctx->fault_wqh.lock);
}

static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
					   struct userfaultfd_wake_range *range)
{
	/*
	 * To be sure waitqueue_active() is not reordered by the CPU
	 * before the pagetable update, use an explicit SMP memory
	 * barrier here. PT lock release or up_read(mmap_sem) still
	 * have release semantics that can allow the
	 * waitqueue_active() to be reordered before the pte update.
	 */
	smp_mb();

	/*
	 * Use waitqueue_active because it's very frequent to
	 * change the address space atomically even if there are no
	 * userfaults yet. So we take the spinlock only when we're
	 * sure we have userfaults to wake.
	 */
	if (waitqueue_active(&ctx->fault_wqh))
		__wake_userfault(ctx, range);
}

static __always_inline int validate_range(struct mm_struct *mm,
					  __u64 start, __u64 len)
{
	__u64 task_size = mm->task_size;

	if (start & ~PAGE_MASK)
		return -EINVAL;
	if (len & ~PAGE_MASK)
		return -EINVAL;
	if (!len)
		return -EINVAL;
	if (start < mmap_min_addr)
		return -EINVAL;
	if (start >= task_size)
		return -EINVAL;
	if (len > task_size - start)
		return -EINVAL;
	return 0;
}

static int userfaultfd_register(struct userfaultfd_ctx *ctx,
				unsigned long arg)
{
	struct mm_struct *mm = ctx->mm;
	struct vm_area_struct *vma, *prev, *cur;
	int ret;
	struct uffdio_register uffdio_register;
	struct uffdio_register __user *user_uffdio_register;
	unsigned long vm_flags, new_flags;
	bool found;
	unsigned long start, end, vma_end;

	user_uffdio_register = (struct uffdio_register __user *) arg;

	ret = -EFAULT;
	if (copy_from_user(&uffdio_register, user_uffdio_register,
			   sizeof(uffdio_register)-sizeof(__u64)))
		goto out;

	ret = -EINVAL;
	if (!uffdio_register.mode)
		goto out;
	if (uffdio_register.mode & ~(UFFDIO_REGISTER_MODE_MISSING|
				     UFFDIO_REGISTER_MODE_WP))
		goto out;
	vm_flags = 0;
	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING)
		vm_flags |= VM_UFFD_MISSING;
	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) {
		vm_flags |= VM_UFFD_WP;
		/*
		 * FIXME: remove the below error constraint by
		 * implementing the wprotect tracking mode.
		 */
		ret = -EINVAL;
		goto out;
	}

	ret = validate_range(mm, uffdio_register.range.start,
			     uffdio_register.range.len);
	if (ret)
		goto out;

	start = uffdio_register.range.start;
	end = start + uffdio_register.range.len;

	down_write(&mm->mmap_sem);
	vma = find_vma_prev(mm, start, &prev);

	ret = -ENOMEM;
	if (!vma)
		goto out_unlock;

	/* check that there's at least one vma in the range */
	ret = -EINVAL;
	if (vma->vm_start >= end)
		goto out_unlock;

	/*
	 * Search for incompatible vmas.
	 *
	 * FIXME: this shall be relaxed later so that it doesn't fail
	 * on tmpfs backed vmas (in addition to the current allowance
	 * on anonymous vmas).
	 */
	found = false;
	for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) {
		cond_resched();

		BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
		       !!(cur->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP)));

		/* check for incompatible vmas */
		ret = -EINVAL;
		if (cur->vm_ops)
			goto out_unlock;

		/*
		 * Check that this vma isn't already owned by a
		 * different userfaultfd. We can't allow more than one
		 * userfaultfd to own a single vma simultaneously or we
		 * wouldn't know which one to deliver the userfaults to.
		 */
		ret = -EBUSY;
		if (cur->vm_userfaultfd_ctx.ctx &&
		    cur->vm_userfaultfd_ctx.ctx != ctx)
			goto out_unlock;

		found = true;
	}
	BUG_ON(!found);

	if (vma->vm_start < start)
		prev = vma;

	ret = 0;
	do {
		cond_resched();

		BUG_ON(vma->vm_ops);
		BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
		       vma->vm_userfaultfd_ctx.ctx != ctx);

		/*
		 * Nothing to do: this vma is already registered into this
		 * userfaultfd and with the right tracking mode too.
		 */
		if (vma->vm_userfaultfd_ctx.ctx == ctx &&
		    (vma->vm_flags & vm_flags) == vm_flags)
			goto skip;

		if (vma->vm_start > start)
			start = vma->vm_start;
		vma_end = min(end, vma->vm_end);

		new_flags = (vma->vm_flags & ~vm_flags) | vm_flags;
		prev = vma_merge(mm, prev, start, vma_end, new_flags,
				 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
				 vma_policy(vma),
				 ((struct vm_userfaultfd_ctx){ ctx }));
		if (prev) {
			vma = prev;
			goto next;
		}
		if (vma->vm_start < start) {
			ret = split_vma(mm, vma, start, 1);
			if (ret)
				break;
		}
		if (vma->vm_end > end) {
			ret = split_vma(mm, vma, end, 0);
			if (ret)
				break;
		}
	next:
		/*
		 * In the vma_merge() successful mprotect-like case 8:
		 * the next vma was merged into the current one and
		 * the current one has not been updated yet.
		 */
		vma->vm_flags = new_flags;
		vma->vm_userfaultfd_ctx.ctx = ctx;

	skip:
		prev = vma;
		start = vma->vm_end;
		vma = vma->vm_next;
	} while (vma && vma->vm_start < end);
out_unlock:
	up_write(&mm->mmap_sem);
	if (!ret) {
		/*
		 * Now that we scanned all vmas we can already tell
		 * userland which ioctl methods are guaranteed to
		 * succeed on this range.
		 */
		if (put_user(UFFD_API_RANGE_IOCTLS,
			     &user_uffdio_register->ioctls))
			ret = -EFAULT;
	}
out:
	return ret;
}

static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
				  unsigned long arg)
{
	struct mm_struct *mm = ctx->mm;
	struct vm_area_struct *vma, *prev, *cur;
	int ret;
	struct uffdio_range uffdio_unregister;
	unsigned long new_flags;
	bool found;
	unsigned long start, end, vma_end;
	const void __user *buf = (void __user *)arg;

	ret = -EFAULT;
	if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
		goto out;

	ret = validate_range(mm, uffdio_unregister.start,
			     uffdio_unregister.len);
	if (ret)
		goto out;

	start = uffdio_unregister.start;
	end = start + uffdio_unregister.len;

	down_write(&mm->mmap_sem);
	vma = find_vma_prev(mm, start, &prev);

	ret = -ENOMEM;
	if (!vma)
		goto out_unlock;

	/* check that there's at least one vma in the range */
	ret = -EINVAL;
	if (vma->vm_start >= end)
		goto out_unlock;

	/*
	 * Search for incompatible vmas.
	 *
	 * FIXME: this shall be relaxed later so that it doesn't fail
	 * on tmpfs backed vmas (in addition to the current allowance
	 * on anonymous vmas).
	 */
	found = false;
	ret = -EINVAL;
	for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) {
		cond_resched();

		BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
		       !!(cur->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP)));

		/*
		 * Check for incompatible vmas; not strictly required
		 * here, as incompatible vmas cannot have a
		 * userfaultfd_ctx registered on them, but this
		 * provides stricter behavior to notice
		 * unregistration errors.
		 */
		if (cur->vm_ops)
			goto out_unlock;

		found = true;
	}
	BUG_ON(!found);

	if (vma->vm_start < start)
		prev = vma;

	ret = 0;
	do {
		cond_resched();

		BUG_ON(vma->vm_ops);

		/*
		 * Nothing to do: this vma is not registered with any
		 * userfaultfd, so there is nothing to unregister.
		 */
		if (!vma->vm_userfaultfd_ctx.ctx)
			goto skip;

		if (vma->vm_start > start)
			start = vma->vm_start;
		vma_end = min(end, vma->vm_end);

		new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP);
		prev = vma_merge(mm, prev, start, vma_end, new_flags,
				 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
				 vma_policy(vma),
				 NULL_VM_UFFD_CTX);
		if (prev) {
			vma = prev;
			goto next;
		}
		if (vma->vm_start < start) {
			ret = split_vma(mm, vma, start, 1);
			if (ret)
				break;
		}
		if (vma->vm_end > end) {
			ret = split_vma(mm, vma, end, 0);
			if (ret)
				break;
		}
	next:
		/*
		 * In the vma_merge() successful mprotect-like case 8:
		 * the next vma was merged into the current one and
		 * the current one has not been updated yet.
		 */
		vma->vm_flags = new_flags;
		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;

	skip:
		prev = vma;
		start = vma->vm_end;
		vma = vma->vm_next;
	} while (vma && vma->vm_start < end);
out_unlock:
	up_write(&mm->mmap_sem);
out:
	return ret;
}

/*
 * This is mostly needed to re-wakeup those userfaults that were still
 * pending when userland woke them up the first time. We don't wake
 * the pending ones, to avoid blocking reads getting stuck, or
 * non-blocking reads returning -EAGAIN if used with POLLIN, which
 * would make userland doubt why POLLIN wasn't reliable.
 */
static int userfaultfd_wake(struct userfaultfd_ctx *ctx,
			    unsigned long arg)
{
	int ret;
	struct uffdio_range uffdio_wake;
	struct userfaultfd_wake_range range;
	const void __user *buf = (void __user *)arg;

	ret = -EFAULT;
	if (copy_from_user(&uffdio_wake, buf, sizeof(uffdio_wake)))
		goto out;

	ret = validate_range(ctx->mm, uffdio_wake.start, uffdio_wake.len);
	if (ret)
		goto out;

	range.start = uffdio_wake.start;
	range.len = uffdio_wake.len;

	/*
	 * len == 0 means wake all and we don't want to wake all here,
	 * so check it again to be sure.
	 */
	VM_BUG_ON(!range.len);

	wake_userfault(ctx, &range);
	ret = 0;

out:
	return ret;
}

/*
 * userland asks for a certain API version and we return which bits
 * and ioctl commands are implemented in this kernel for that API
 * version, or -EINVAL if unknown.
 */
static int userfaultfd_api(struct userfaultfd_ctx *ctx,
			   unsigned long arg)
{
	struct uffdio_api uffdio_api;
	void __user *buf = (void __user *)arg;
	int ret;

	ret = -EINVAL;
	if (ctx->state != UFFD_STATE_WAIT_API)
		goto out;
	ret = -EFAULT;
	if (copy_from_user(&uffdio_api, buf, sizeof(__u64)))
		goto out;
	if (uffdio_api.api != UFFD_API) {
		/* careful not to leak info, we only read the first 8 bytes */
		memset(&uffdio_api, 0, sizeof(uffdio_api));
		if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
			goto out;
		ret = -EINVAL;
		goto out;
	}
	/* careful not to leak info, we only read the first 8 bytes */
	uffdio_api.bits = UFFD_API_BITS;
	uffdio_api.ioctls = UFFD_API_IOCTLS;
	ret = -EFAULT;
	if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
		goto out;
	ctx->state = UFFD_STATE_RUNNING;
	ret = 0;
out:
	return ret;
}

static long userfaultfd_ioctl(struct file *file, unsigned cmd,
			      unsigned long arg)
{
	int ret = -EINVAL;
	struct userfaultfd_ctx *ctx = file->private_data;

	switch (cmd) {
	case UFFDIO_API:
		ret = userfaultfd_api(ctx, arg);
		break;
	case UFFDIO_REGISTER:
		ret = userfaultfd_register(ctx, arg);
		break;
	case UFFDIO_UNREGISTER:
		ret = userfaultfd_unregister(ctx, arg);
		break;
	case UFFDIO_WAKE:
		ret = userfaultfd_wake(ctx, arg);
		break;
	}
	return ret;
}

#ifdef CONFIG_PROC_FS
static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f)
{
	struct userfaultfd_ctx *ctx = f->private_data;
	wait_queue_t *wq;
	struct userfaultfd_wait_queue *uwq;
	unsigned long pending = 0, total = 0;

	spin_lock(&ctx->fault_wqh.lock);
	list_for_each_entry(wq, &ctx->fault_wqh.task_list, task_list) {
		uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
		if (uwq->pending)
			pending++;
		total++;
	}
	spin_unlock(&ctx->fault_wqh.lock);

	/*
	 * If more protocols are added, they will all be shown,
	 * separated by a space. Like this:
	 *	protocols: aa:... bb:...
	 */
	seq_printf(m, "pending:\t%lu\ntotal:\t%lu\nAPI:\t%Lx:%x:%Lx\n",
		   pending, total, UFFD_API, UFFD_API_BITS,
		   UFFD_API_IOCTLS|UFFD_API_RANGE_IOCTLS);
}
#endif

static const struct file_operations userfaultfd_fops = {
#ifdef CONFIG_PROC_FS
	.show_fdinfo	= userfaultfd_show_fdinfo,
#endif
	.release	= userfaultfd_release,
	.poll		= userfaultfd_poll,
	.read		= userfaultfd_read,
	.unlocked_ioctl = userfaultfd_ioctl,
	.compat_ioctl	= userfaultfd_ioctl,
	.llseek		= noop_llseek,
};

/**
 * userfaultfd_file_create - Creates a userfaultfd file pointer.
 * @flags: Flags for the userfaultfd file.
 *
 * This function creates a userfaultfd file pointer, w/out installing
 * it into the fd table. This is useful when the userfaultfd file is
 * used during the initialization of data structures that require
 * extra setup after the userfaultfd creation. So the userfaultfd
 * creation is split into the file pointer creation phase, and the
 * file descriptor installation phase. In this way races with
 * userspace closing the newly installed file descriptor can be
 * avoided. Returns a userfaultfd file pointer, or a proper error
 * pointer.
 */
static struct file *userfaultfd_file_create(int flags)
{
	struct file *file;
	struct userfaultfd_ctx *ctx;

	BUG_ON(!current->mm);

	/* Check the UFFD_* constants for consistency. */
	BUILD_BUG_ON(UFFD_CLOEXEC != O_CLOEXEC);
	BUILD_BUG_ON(UFFD_NONBLOCK != O_NONBLOCK);

	file = ERR_PTR(-EINVAL);
	if (flags & ~UFFD_SHARED_FCNTL_FLAGS)
		goto out;

	file = ERR_PTR(-ENOMEM);
	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
	if (!ctx)
		goto out;

	atomic_set(&ctx->refcount, 1);
	init_waitqueue_head(&ctx->fault_wqh);
	init_waitqueue_head(&ctx->fd_wqh);
	ctx->flags = flags;
	ctx->state = UFFD_STATE_WAIT_API;
	ctx->released = false;
	ctx->mm = current->mm;
	/* prevent the mm struct from being freed */
	atomic_inc(&ctx->mm->mm_users);

	file = anon_inode_getfile("[userfaultfd]", &userfaultfd_fops, ctx,
				  O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS));
	if (IS_ERR(file))
		kfree(ctx);
out:
	return file;
}

SYSCALL_DEFINE1(userfaultfd, int, flags)
{
	int fd, error;
	struct file *file;

	error = get_unused_fd_flags(flags & UFFD_SHARED_FCNTL_FLAGS);
	if (error < 0)
		return error;
	fd = error;

	file = userfaultfd_file_create(flags);
	if (IS_ERR(file)) {
		error = PTR_ERR(file);
		goto err_put_unused_fd;
	}
	fd_install(fd, file);

	return fd;

err_put_unused_fd:
	put_unused_fd(fd);

	return error;
}