fs/userfaultfd.c at v6.4 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / fs / userfaultfd.c
at v6.4 2235 lines 59 kB view raw
wrap content
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 *  fs/userfaultfd.c
   4 *
   5 *  Copyright (C) 2007  Davide Libenzi <davidel@xmailserver.org>
   6 *  Copyright (C) 2008-2009 Red Hat, Inc.
   7 *  Copyright (C) 2015  Red Hat, Inc.
   8 *
   9 *  Some part derived from fs/eventfd.c (anon inode setup) and
  10 *  mm/ksm.c (mm hashing).
  11 */
  12
  13#include <linux/list.h>
  14#include <linux/hashtable.h>
  15#include <linux/sched/signal.h>
  16#include <linux/sched/mm.h>
  17#include <linux/mm.h>
  18#include <linux/mm_inline.h>
  19#include <linux/mmu_notifier.h>
  20#include <linux/poll.h>
  21#include <linux/slab.h>
  22#include <linux/seq_file.h>
  23#include <linux/file.h>
  24#include <linux/bug.h>
  25#include <linux/anon_inodes.h>
  26#include <linux/syscalls.h>
  27#include <linux/userfaultfd_k.h>
  28#include <linux/mempolicy.h>
  29#include <linux/ioctl.h>
  30#include <linux/security.h>
  31#include <linux/hugetlb.h>
  32#include <linux/swapops.h>
  33#include <linux/miscdevice.h>
  34
  /*
   * Integer behind the "unprivileged_userfaultfd" sysctl entry defined below.
   * The table bounds writes to the range [SYSCTL_ZERO, SYSCTL_ONE]; the code
   * that consults this value is not in this part of the file.
   */
  35static int sysctl_unprivileged_userfaultfd __read_mostly;
  36
  37#ifdef CONFIG_SYSCTL
  /*
   * sysctl table exposing the knob above.  The empty trailing entry terminates
   * the array.  Where the table gets registered is not visible here.
   */
  38static struct ctl_table vm_userfaultfd_table[] = {
  39	{
  40		.procname	= "unprivileged_userfaultfd",
  41		.data		= &sysctl_unprivileged_userfaultfd,
  42		.maxlen		= sizeof(sysctl_unprivileged_userfaultfd),
  43		.mode		= 0644,
  44		.proc_handler	= proc_dointvec_minmax,
  45		.extra1		= SYSCTL_ZERO,
  46		.extra2		= SYSCTL_ONE,
  47	},
  48	{ }
  49};
  50#endif
  51
  /*
   * Slab cache for struct userfaultfd_ctx objects: dup_userfaultfd() allocates
   * from it and userfaultfd_ctx_put() frees back into it.
   */
  52static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly;
  53
  54/*
 * Per-userfaultfd state.  It is reachable from the file (file->private_data)
 * and from every vma attached to it (vma->vm_userfaultfd_ctx.ctx).
 *
 * Start with fault_pending_wqh and fault_wqh so they're more likely
 * to be in the same cacheline.
 *
 * Locking order:
 *	fd_wqh.lock
 *		fault_pending_wqh.lock
 *			fault_wqh.lock
 *		event_wqh.lock
 *
 * To avoid deadlocks, IRQs must be disabled when taking any of the above locks,
 * since fd_wqh.lock is taken by aio_poll() while it's holding a lock that's
 * also taken in IRQ context.
 */
  68struct userfaultfd_ctx {
  69	/* waitqueue head for the pending (i.e. not read) userfaults */
  70	wait_queue_head_t fault_pending_wqh;
  71	/* waitqueue head for the userfaults */
  72	wait_queue_head_t fault_wqh;
  73	/* waitqueue head for the pseudo fd to wakeup poll/read */
  74	wait_queue_head_t fd_wqh;
  75	/* waitqueue head for events */
  76	wait_queue_head_t event_wqh;
  77	/* a refile sequence protected by fault_pending_wqh lock */
  78	seqcount_spinlock_t refile_seq;
  79	/* pseudo fd refcounting; userfaultfd_ctx_put() frees the ctx at zero */
  80	refcount_t refcount;
  81	/* userfaultfd syscall flags */
  82	unsigned int flags;
  83	/* features requested from the userspace */
  84	unsigned int features;
  85	/* set by userfaultfd_release(); read locklessly with READ_ONCE() */
  86	bool released;
  87	/*
	 * memory mappings are changing because of non-cooperative event:
	 * incremented when an event is prepared, decremented again at the
	 * end of userfaultfd_event_wait_completion()
	 */
  88	atomic_t mmap_changing;
  89	/*
	 * mm with one or more vmas attached to this userfaultfd_ctx; released
	 * with mmdrop() when the ctx is freed
	 */
  90	struct mm_struct *mm;
  91};
  92
  /*
   * Bookkeeping entry created by dup_userfaultfd(): ties the context found on
   * the vma being duplicated to the context newly allocated for it, so that
   * later vmas sharing the same original context reuse the same new one.
   */
  93struct userfaultfd_fork_ctx {
	/* context that was attached to the vma handed to dup_userfaultfd() */
  94	struct userfaultfd_ctx *orig;
	/* context allocated by dup_userfaultfd() as its counterpart */
  95	struct userfaultfd_ctx *new;
	/* link in the caller-provided list walked by dup_userfaultfd_complete() */
  96	struct list_head list;
  97};
  98
  /*
   * One pending UFFD_EVENT_UNMAP notification, queued by
   * userfaultfd_unmap_prep() and delivered by userfaultfd_unmap_complete().
   */
  99struct userfaultfd_unmap_ctx {
	/* context to notify; a reference and a mmap_changing count are held */
 100	struct userfaultfd_ctx *ctx;
	/* start of the range, as passed to userfaultfd_unmap_prep() */
 101	unsigned long start;
	/* end of the range, as passed to userfaultfd_unmap_prep() */
 102	unsigned long end;
	/* link in the caller-provided list of pending unmap notifications */
 103	struct list_head list;
 104};
 105
 /*
  * A waiter parked on one of the context waitqueues, together with the message
  * it carries.  The instances visible in this file live on the waiter's
  * kernel stack.
  */
 106struct userfaultfd_wait_queue {
	/* message describing the fault or event */
 107	struct uffd_msg msg;
	/* entry linked into fault_pending_wqh / fault_wqh / event_wqh */
 108	wait_queue_entry_t wq;
	/* context this waiter is queued on */
 109	struct userfaultfd_ctx *ctx;
	/* set to true by userfaultfd_wake_function() before it wakes the task */
 110	bool waken;
 111};
 112
 /*
  * Wake-up key handed to userfaultfd_wake_function(): only waiters whose fault
  * address lies in [start, start + len) are woken.  len == 0 wakes them all.
  */
 113struct userfaultfd_wake_range {
	/* first address covered by the wake-up */
 114	unsigned long start;
	/* number of bytes covered; 0 means "wake all" */
 115	unsigned long len;
 116};
 117
 118/* internal indication that UFFD_API ioctl was successfully executed */
 119#define UFFD_FEATURE_INITIALIZED		(1u << 31)
 120
 121static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx)
 122{
 123	return ctx->features & UFFD_FEATURE_INITIALIZED;
 124}
 125
 126/*
 127 * Whether WP_UNPOPULATED is enabled on the uffd context.  It is only
 128 * meaningful when userfaultfd_wp()==true on the vma and when it's
 129 * anonymous.
 130 */
 131bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma)
 132{
 133	struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;
 134
 135	if (!ctx)
 136		return false;
 137
 138	return ctx->features & UFFD_FEATURE_WP_UNPOPULATED;
 139}
 140
 141static void userfaultfd_set_vm_flags(struct vm_area_struct *vma,
 142				     vm_flags_t flags)
 143{
 144	const bool uffd_wp_changed = (vma->vm_flags ^ flags) & VM_UFFD_WP;
 145
 146	vm_flags_reset(vma, flags);
 147	/*
 148	 * For shared mappings, we want to enable writenotify while
 149	 * userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll simply
 150	 * recalculate vma->vm_page_prot whenever userfaultfd-wp changes.
 151	 */
 152	if ((vma->vm_flags & VM_SHARED) && uffd_wp_changed)
 153		vma_set_page_prot(vma);
 154}
 155
 156static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode,
 157				     int wake_flags, void *key)
 158{
 159	struct userfaultfd_wake_range *range = key;
 160	int ret;
 161	struct userfaultfd_wait_queue *uwq;
 162	unsigned long start, len;
 163
 164	uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
 165	ret = 0;
 166	/* len == 0 means wake all */
 167	start = range->start;
 168	len = range->len;
 169	if (len && (start > uwq->msg.arg.pagefault.address ||
 170		    start + len <= uwq->msg.arg.pagefault.address))
 171		goto out;
 172	WRITE_ONCE(uwq->waken, true);
 173	/*
 174	 * The Program-Order guarantees provided by the scheduler
 175	 * ensure uwq->waken is visible before the task is woken.
 176	 */
 177	ret = wake_up_state(wq->private, mode);
 178	if (ret) {
 179		/*
 180		 * Wake only once, autoremove behavior.
 181		 *
 182		 * After the effect of list_del_init is visible to the other
 183		 * CPUs, the waitqueue may disappear from under us, see the
 184		 * !list_empty_careful() in handle_userfault().
 185		 *
 186		 * try_to_wake_up() has an implicit smp_mb(), and the
 187		 * wq->private is read before calling the extern function
 188		 * "wake_up_state" (which in turns calls try_to_wake_up).
 189		 */
 190		list_del_init(&wq->entry);
 191	}
 192out:
 193	return ret;
 194}
 195
 196/**
 197 * userfaultfd_ctx_get - Acquires a reference to the internal userfaultfd
 198 * context.
 199 * @ctx: [in] Pointer to the userfaultfd context.
 200 */
 201static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx)
 202{
 203	refcount_inc(&ctx->refcount);
 204}
 205
 206/**
 207 * userfaultfd_ctx_put - Releases a reference to the internal userfaultfd
 208 * context.
 209 * @ctx: [in] Pointer to userfaultfd context.
 210 *
 211 * The userfaultfd context reference must have been previously acquired either
 212 * with userfaultfd_ctx_get() or userfaultfd_ctx_fdget().
 213 */
 214static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx)
 215{
 216	if (refcount_dec_and_test(&ctx->refcount)) {
 217		VM_BUG_ON(spin_is_locked(&ctx->fault_pending_wqh.lock));
 218		VM_BUG_ON(waitqueue_active(&ctx->fault_pending_wqh));
 219		VM_BUG_ON(spin_is_locked(&ctx->fault_wqh.lock));
 220		VM_BUG_ON(waitqueue_active(&ctx->fault_wqh));
 221		VM_BUG_ON(spin_is_locked(&ctx->event_wqh.lock));
 222		VM_BUG_ON(waitqueue_active(&ctx->event_wqh));
 223		VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock));
 224		VM_BUG_ON(waitqueue_active(&ctx->fd_wqh));
 225		mmdrop(ctx->mm);
 226		kmem_cache_free(userfaultfd_ctx_cachep, ctx);
 227	}
 228}
 229
 230static inline void msg_init(struct uffd_msg *msg)
 231{
 232	BUILD_BUG_ON(sizeof(struct uffd_msg) != 32);
 233	/*
 234	 * Must use memset to zero out the paddings or kernel data is
 235	 * leaked to userland.
 236	 */
 237	memset(msg, 0, sizeof(struct uffd_msg));
 238}
 239
 240static inline struct uffd_msg userfault_msg(unsigned long address,
 241					    unsigned long real_address,
 242					    unsigned int flags,
 243					    unsigned long reason,
 244					    unsigned int features)
 245{
 246	struct uffd_msg msg;
 247
 248	msg_init(&msg);
 249	msg.event = UFFD_EVENT_PAGEFAULT;
 250
 251	msg.arg.pagefault.address = (features & UFFD_FEATURE_EXACT_ADDRESS) ?
 252				    real_address : address;
 253
 254	/*
 255	 * These flags indicate why the userfault occurred:
 256	 * - UFFD_PAGEFAULT_FLAG_WP indicates a write protect fault.
 257	 * - UFFD_PAGEFAULT_FLAG_MINOR indicates a minor fault.
 258	 * - Neither of these flags being set indicates a MISSING fault.
 259	 *
 260	 * Separately, UFFD_PAGEFAULT_FLAG_WRITE indicates it was a write
 261	 * fault. Otherwise, it was a read fault.
 262	 */
 263	if (flags & FAULT_FLAG_WRITE)
 264		msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WRITE;
 265	if (reason & VM_UFFD_WP)
 266		msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP;
 267	if (reason & VM_UFFD_MINOR)
 268		msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_MINOR;
 269	if (features & UFFD_FEATURE_THREAD_ID)
 270		msg.arg.pagefault.feat.ptid = task_pid_vnr(current);
 271	return msg;
 272}
 273
 274#ifdef CONFIG_HUGETLB_PAGE
 275/*
 276 * Same functionality as userfaultfd_must_wait below with modifications for
 277 * hugepmd ranges.
 278 */
 279static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
 280					 struct vm_area_struct *vma,
 281					 unsigned long address,
 282					 unsigned long flags,
 283					 unsigned long reason)
 284{
 285	pte_t *ptep, pte;
 286	bool ret = true;
 287
 288	mmap_assert_locked(ctx->mm);
 289
 290	ptep = hugetlb_walk(vma, address, vma_mmu_pagesize(vma));
 291	if (!ptep)
 292		goto out;
 293
 294	ret = false;
 295	pte = huge_ptep_get(ptep);
 296
 297	/*
 298	 * Lockless access: we're in a wait_event so it's ok if it
 299	 * changes under us.  PTE markers should be handled the same as none
 300	 * ptes here.
 301	 */
 302	if (huge_pte_none_mostly(pte))
 303		ret = true;
 304	if (!huge_pte_write(pte) && (reason & VM_UFFD_WP))
 305		ret = true;
 306out:
 307	return ret;
 308}
 309#else
 310static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
 311					 struct vm_area_struct *vma,
 312					 unsigned long address,
 313					 unsigned long flags,
 314					 unsigned long reason)
 315{
 316	return false;	/* should never get here */
 317}
 318#endif /* CONFIG_HUGETLB_PAGE */
 319
 320/*
 321 * Verify the pagetables are still not ok after having reigstered into
 322 * the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any
 323 * userfault that has already been resolved, if userfaultfd_read and
 324 * UFFDIO_COPY|ZEROPAGE are being run simultaneously on two different
 325 * threads.
 326 */
 327static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
 328					 unsigned long address,
 329					 unsigned long flags,
 330					 unsigned long reason)
 331{
 332	struct mm_struct *mm = ctx->mm;
 333	pgd_t *pgd;
 334	p4d_t *p4d;
 335	pud_t *pud;
 336	pmd_t *pmd, _pmd;
 337	pte_t *pte;
 338	bool ret = true;
 339
 340	mmap_assert_locked(mm);
 341
 342	pgd = pgd_offset(mm, address);
 343	if (!pgd_present(*pgd))
 344		goto out;
 345	p4d = p4d_offset(pgd, address);
 346	if (!p4d_present(*p4d))
 347		goto out;
 348	pud = pud_offset(p4d, address);
 349	if (!pud_present(*pud))
 350		goto out;
 351	pmd = pmd_offset(pud, address);
 352	/*
 353	 * READ_ONCE must function as a barrier with narrower scope
 354	 * and it must be equivalent to:
 355	 *	_pmd = *pmd; barrier();
 356	 *
 357	 * This is to deal with the instability (as in
 358	 * pmd_trans_unstable) of the pmd.
 359	 */
 360	_pmd = READ_ONCE(*pmd);
 361	if (pmd_none(_pmd))
 362		goto out;
 363
 364	ret = false;
 365	if (!pmd_present(_pmd))
 366		goto out;
 367
 368	if (pmd_trans_huge(_pmd)) {
 369		if (!pmd_write(_pmd) && (reason & VM_UFFD_WP))
 370			ret = true;
 371		goto out;
 372	}
 373
 374	/*
 375	 * the pmd is stable (as in !pmd_trans_unstable) so we can re-read it
 376	 * and use the standard pte_offset_map() instead of parsing _pmd.
 377	 */
 378	pte = pte_offset_map(pmd, address);
 379	/*
 380	 * Lockless access: we're in a wait_event so it's ok if it
 381	 * changes under us.  PTE markers should be handled the same as none
 382	 * ptes here.
 383	 */
 384	if (pte_none_mostly(*pte))
 385		ret = true;
 386	if (!pte_write(*pte) && (reason & VM_UFFD_WP))
 387		ret = true;
 388	pte_unmap(pte);
 389
 390out:
 391	return ret;
 392}
 393
 394static inline unsigned int userfaultfd_get_blocking_state(unsigned int flags)
 395{
 396	if (flags & FAULT_FLAG_INTERRUPTIBLE)
 397		return TASK_INTERRUPTIBLE;
 398
 399	if (flags & FAULT_FLAG_KILLABLE)
 400		return TASK_KILLABLE;
 401
 402	return TASK_UNINTERRUPTIBLE;
 403}
 404
 405/*
 * handle_userfault - queue a page fault on its userfaultfd context and wait
 * for it to be resolved.
 * @vmf:    fault descriptor; vmf->vma must carry the userfaultfd context
 * @reason: exactly one VM_UFFD_* bit saying why the fault is a userfault
 *
 * Return:
 *  - VM_FAULT_SIGBUS if the task is exiting or dumping core, the vma has no
 *    context, the context has UFFD_FEATURE_SIGBUS, the fault is not a user
 *    fault on a UFFD_USER_MODE_ONLY context, or FAULT_FLAG_ALLOW_RETRY is
 *    missing;
 *  - VM_FAULT_NOPAGE if the context was already released;
 *  - VM_FAULT_RETRY otherwise: immediately for FAULT_FLAG_RETRY_NOWAIT, else
 *    after the mmap_lock has been dropped and the task has (possibly) slept.
 *
 * The locking rules involved in returning VM_FAULT_RETRY depending on
 * FAULT_FLAG_ALLOW_RETRY, FAULT_FLAG_RETRY_NOWAIT and
 * FAULT_FLAG_KILLABLE are not straightforward. The "Caution"
 * recommendation in __lock_page_or_retry is not an understatement.
 *
 * If FAULT_FLAG_ALLOW_RETRY is set, the mmap_lock must be released
 * before returning VM_FAULT_RETRY only if FAULT_FLAG_RETRY_NOWAIT is
 * not set.
 *
 * If FAULT_FLAG_ALLOW_RETRY is set but FAULT_FLAG_KILLABLE is not
 * set, VM_FAULT_RETRY can still be returned if and only if there are
 * fatal_signal_pending()s, and the mmap_lock must be released before
 * returning it.
 */
 420vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
 421{
 422	struct vm_area_struct *vma = vmf->vma;
 423	struct mm_struct *mm = vma->vm_mm;
 424	struct userfaultfd_ctx *ctx;
 425	struct userfaultfd_wait_queue uwq;
 426	vm_fault_t ret = VM_FAULT_SIGBUS;
 427	bool must_wait;
 428	unsigned int blocking_state;
 429
 430	/*
 431	 * We don't do userfault handling for the final child pid update.
 432	 *
 433	 * We also don't do userfault handling during
 434	 * coredumping. hugetlbfs has the special
 435	 * follow_hugetlb_page() to skip missing pages in the
 436	 * FOLL_DUMP case, anon memory also checks for FOLL_DUMP with
 437	 * the no_page_table() helper in follow_page_mask(), but the
 438	 * shmem_vm_ops->fault method is invoked even during
 439	 * coredumping without mmap_lock and it ends up here.
 440	 */
 441	if (current->flags & (PF_EXITING|PF_DUMPCORE))
 442		goto out;
 443
 444	/*
 445	 * Coredumping runs without mmap_lock so we can only check that
 446	 * the mmap_lock is held, if PF_DUMPCORE was not set.
 447	 */
 448	mmap_assert_locked(mm);
 449
 450	ctx = vma->vm_userfaultfd_ctx.ctx;
 451	if (!ctx)
 452		goto out;
 453
 454	BUG_ON(ctx->mm != mm);
 455
 456	/* Any unrecognized flag is a bug. */
 457	VM_BUG_ON(reason & ~__VM_UFFD_FLAGS);
 458	/* 0 or > 1 flags set is a bug; we expect exactly 1. */
 459	VM_BUG_ON(!reason || (reason & (reason - 1)));
 460
	/* Contexts with UFFD_FEATURE_SIGBUS never wait: report SIGBUS. */
 461	if (ctx->features & UFFD_FEATURE_SIGBUS)
 462		goto out;
	/* Faults without FAULT_FLAG_USER are refused on user-mode-only contexts. */
 463	if (!(vmf->flags & FAULT_FLAG_USER) && (ctx->flags & UFFD_USER_MODE_ONLY))
 464		goto out;
 465
 466	/*
 467	 * If it's already released don't get it. This avoids to loop
 468	 * in __get_user_pages if userfaultfd_release waits on the
 469	 * caller of handle_userfault to release the mmap_lock.
 470	 */
 471	if (unlikely(READ_ONCE(ctx->released))) {
 472		/*
 473		 * Don't return VM_FAULT_SIGBUS in this case, so a non
 474		 * cooperative manager can close the uffd after the
 475		 * last UFFDIO_COPY, without risking to trigger an
 476		 * involuntary SIGBUS if the process was starting the
 477		 * userfaultfd while the userfaultfd was still armed
 478		 * (but after the last UFFDIO_COPY). If the uffd
 479		 * wasn't already closed when the userfault reached
 480		 * this point, that would normally be solved by
 481		 * userfaultfd_must_wait returning 'false'.
 482		 *
 483		 * If we were to return VM_FAULT_SIGBUS here, the non
 484		 * cooperative manager would be instead forced to
 485		 * always call UFFDIO_UNREGISTER before it can safely
 486		 * close the uffd.
 487		 */
 488		ret = VM_FAULT_NOPAGE;
 489		goto out;
 490	}
 491
 492	/*
 493	 * Check that we can return VM_FAULT_RETRY.
 494	 *
 495	 * NOTE: it should become possible to return VM_FAULT_RETRY
 496	 * even if FAULT_FLAG_TRIED is set without leading to gup()
 497	 * -EBUSY failures, if the userfaultfd is to be extended for
 498	 * VM_UFFD_WP tracking and we intend to arm the userfault
 499	 * without first stopping userland access to the memory. For
 500	 * VM_UFFD_MISSING userfaults this is enough for now.
 501	 */
 502	if (unlikely(!(vmf->flags & FAULT_FLAG_ALLOW_RETRY))) {
 503		/*
 504		 * Validate the invariant that nowait must allow retry
 505		 * to be sure not to return SIGBUS erroneously on
 506		 * nowait invocations.
 507		 */
 508		BUG_ON(vmf->flags & FAULT_FLAG_RETRY_NOWAIT);
 509#ifdef CONFIG_DEBUG_VM
 510		if (printk_ratelimit()) {
 511			printk(KERN_WARNING
 512			       "FAULT_FLAG_ALLOW_RETRY missing %x\n",
 513			       vmf->flags);
 514			dump_stack();
 515		}
 516#endif
 517		goto out;
 518	}
 519
 520	/*
 521	 * Handle nowait, not much to do other than tell it to retry
 522	 * and wait.
 523	 */
 524	ret = VM_FAULT_RETRY;
 525	if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
 526		goto out;
 527
 528	/* take the reference before dropping the mmap_lock */
 529	userfaultfd_ctx_get(ctx);
 530
	/* The waiter lives on this kernel stack; see the removal logic below. */
 531	init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
 532	uwq.wq.private = current;
 533	uwq.msg = userfault_msg(vmf->address, vmf->real_address, vmf->flags,
 534				reason, ctx->features);
 535	uwq.ctx = ctx;
 536	uwq.waken = false;
 537
 538	blocking_state = userfaultfd_get_blocking_state(vmf->flags);
 539
 540        /*
 541         * Take the vma lock now, in order to safely call
 542         * userfaultfd_huge_must_wait() later. Since acquiring the
 543         * (sleepable) vma lock can modify the current task state, that
 544         * must be before explicitly calling set_current_state().
 545         */
 546	if (is_vm_hugetlb_page(vma))
 547		hugetlb_vma_lock_read(vma);
 548
 549	spin_lock_irq(&ctx->fault_pending_wqh.lock);
 550	/*
 551	 * After the __add_wait_queue the uwq is visible to userland
 552	 * through poll/read().
 553	 */
 554	__add_wait_queue(&ctx->fault_pending_wqh, &uwq.wq);
 555	/*
 556	 * The smp_mb() after __set_current_state prevents the reads
 557	 * following the spin_unlock to happen before the list_add in
 558	 * __add_wait_queue.
 559	 */
 560	set_current_state(blocking_state);
 561	spin_unlock_irq(&ctx->fault_pending_wqh.lock);
 562
	/* Re-check the page tables now that the waiter is queued. */
 563	if (!is_vm_hugetlb_page(vma))
 564		must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags,
 565						  reason);
 566	else
 567		must_wait = userfaultfd_huge_must_wait(ctx, vma,
 568						       vmf->address,
 569						       vmf->flags, reason);
 570	if (is_vm_hugetlb_page(vma))
 571		hugetlb_vma_unlock_read(vma);
 572	mmap_read_unlock(mm);
 573
	/* Notify pollers/readers of the fd, then sleep until woken. */
 574	if (likely(must_wait && !READ_ONCE(ctx->released))) {
 575		wake_up_poll(&ctx->fd_wqh, EPOLLIN);
 576		schedule();
 577	}
 578
 579	__set_current_state(TASK_RUNNING);
 580
 581	/*
 582	 * Here we race with the list_del; list_add in
 583	 * userfaultfd_ctx_read(), however because we don't ever run
 584	 * list_del_init() to refile across the two lists, the prev
 585	 * and next pointers will never point to self. list_add also
 586	 * would never let any of the two pointers to point to
 587	 * self. So list_empty_careful won't risk to see both pointers
 588	 * pointing to self at any time during the list refile. The
 589	 * only case where list_del_init() is called is the full
 590	 * removal in the wake function and there we don't re-list_add
 591	 * and it's fine not to block on the spinlock. The uwq on this
 592	 * kernel stack can be released after the list_del_init.
 593	 */
 594	if (!list_empty_careful(&uwq.wq.entry)) {
 595		spin_lock_irq(&ctx->fault_pending_wqh.lock);
 596		/*
 597		 * No need of list_del_init(), the uwq on the stack
 598		 * will be freed shortly anyway.
 599		 */
 600		list_del(&uwq.wq.entry);
 601		spin_unlock_irq(&ctx->fault_pending_wqh.lock);
 602	}
 603
 604	/*
 605	 * ctx may go away after this if the userfault pseudo fd is
 606	 * already released.
 607	 */
 608	userfaultfd_ctx_put(ctx);
 609
 610out:
 611	return ret;
 612}
 613
 /*
  * Queue the event message carried by @ewq on ctx->event_wqh and sleep in
  * TASK_KILLABLE state until one of the following happens:
  *  - the message is marked consumed, i.e. ewq->msg.event becomes 0 (see
  *    userfaultfd_event_complete());
  *  - the context is released;
  *  - a fatal signal is pending for the current task.
  *
  * In the last two cases the waiter removes itself from the queue.  If the
  * undelivered event was UFFD_EVENT_FORK, the context stored in
  * msg.arg.reserved.reserved1 is detached from every vma of its mm and its
  * reference is dropped.
  *
  * On every path, including the early PF_EXITING bail-out, the function
  * decrements ctx->mmap_changing and drops one reference on @ctx.  The caller
  * must therefore have taken both beforehand.
  */
 614static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
 615					      struct userfaultfd_wait_queue *ewq)
 616{
 617	struct userfaultfd_ctx *release_new_ctx;
 618
 619	if (WARN_ON_ONCE(current->flags & PF_EXITING))
 620		goto out;
 621
 622	ewq->ctx = ctx;
 623	init_waitqueue_entry(&ewq->wq, current);
 624	release_new_ctx = NULL;
 625
 626	spin_lock_irq(&ctx->event_wqh.lock);
 627	/*
 628	 * After the __add_wait_queue the uwq is visible to userland
 629	 * through poll/read().
 630	 */
 631	__add_wait_queue(&ctx->event_wqh, &ewq->wq);
 632	for (;;) {
 633		set_current_state(TASK_KILLABLE);
		/* event == 0: the message was consumed and already dequeued */
 634		if (ewq->msg.event == 0)
 635			break;
 636		if (READ_ONCE(ctx->released) ||
 637		    fatal_signal_pending(current)) {
 638			/*
 639			 * &ewq->wq may be queued in fork_event, but
 640			 * __remove_wait_queue ignores the head
 641			 * parameter. It would be a problem if it
 642			 * didn't.
 643			 */
 644			__remove_wait_queue(&ctx->event_wqh, &ewq->wq);
 645			if (ewq->msg.event == UFFD_EVENT_FORK) {
 646				struct userfaultfd_ctx *new;
 647
				/* dup_fctx() stored the new context pointer here */
 648				new = (struct userfaultfd_ctx *)
 649					(unsigned long)
 650					ewq->msg.arg.reserved.reserved1;
 651				release_new_ctx = new;
 652			}
 653			break;
 654		}
 655
		/* Drop the lock across the wake-up and the sleep. */
 656		spin_unlock_irq(&ctx->event_wqh.lock);
 657
 658		wake_up_poll(&ctx->fd_wqh, EPOLLIN);
 659		schedule();
 660
 661		spin_lock_irq(&ctx->event_wqh.lock);
 662	}
 663	__set_current_state(TASK_RUNNING);
 664	spin_unlock_irq(&ctx->event_wqh.lock);
 665
 666	if (release_new_ctx) {
 667		struct vm_area_struct *vma;
 668		struct mm_struct *mm = release_new_ctx->mm;
 669		VMA_ITERATOR(vmi, mm, 0);
 670
 671		/* the various vma->vm_userfaultfd_ctx still points to it */
 672		mmap_write_lock(mm);
 673		for_each_vma(vmi, vma) {
 674			if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) {
 675				vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
 676				userfaultfd_set_vm_flags(vma,
 677							 vma->vm_flags & ~__VM_UFFD_FLAGS);
 678			}
 679		}
 680		mmap_write_unlock(mm);
 681
 682		userfaultfd_ctx_put(release_new_ctx);
 683	}
 684
 685	/*
 686	 * ctx may go away after this if the userfault pseudo fd is
 687	 * already released.
 688	 */
 689out:
 690	atomic_dec(&ctx->mmap_changing);
 691	VM_BUG_ON(atomic_read(&ctx->mmap_changing) < 0);
 692	userfaultfd_ctx_put(ctx);
 693}
 694
 695static void userfaultfd_event_complete(struct userfaultfd_ctx *ctx,
 696				       struct userfaultfd_wait_queue *ewq)
 697{
 698	ewq->msg.event = 0;
 699	wake_up_locked(&ctx->event_wqh);
 700	__remove_wait_queue(&ctx->event_wqh, &ewq->wq);
 701}
 702
 703int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
 704{
 705	struct userfaultfd_ctx *ctx = NULL, *octx;
 706	struct userfaultfd_fork_ctx *fctx;
 707
 708	octx = vma->vm_userfaultfd_ctx.ctx;
 709	if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) {
 710		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
 711		userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS);
 712		return 0;
 713	}
 714
 715	list_for_each_entry(fctx, fcs, list)
 716		if (fctx->orig == octx) {
 717			ctx = fctx->new;
 718			break;
 719		}
 720
 721	if (!ctx) {
 722		fctx = kmalloc(sizeof(*fctx), GFP_KERNEL);
 723		if (!fctx)
 724			return -ENOMEM;
 725
 726		ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
 727		if (!ctx) {
 728			kfree(fctx);
 729			return -ENOMEM;
 730		}
 731
 732		refcount_set(&ctx->refcount, 1);
 733		ctx->flags = octx->flags;
 734		ctx->features = octx->features;
 735		ctx->released = false;
 736		atomic_set(&ctx->mmap_changing, 0);
 737		ctx->mm = vma->vm_mm;
 738		mmgrab(ctx->mm);
 739
 740		userfaultfd_ctx_get(octx);
 741		atomic_inc(&octx->mmap_changing);
 742		fctx->orig = octx;
 743		fctx->new = ctx;
 744		list_add_tail(&fctx->list, fcs);
 745	}
 746
 747	vma->vm_userfaultfd_ctx.ctx = ctx;
 748	return 0;
 749}
 750
 751static void dup_fctx(struct userfaultfd_fork_ctx *fctx)
 752{
 753	struct userfaultfd_ctx *ctx = fctx->orig;
 754	struct userfaultfd_wait_queue ewq;
 755
 756	msg_init(&ewq.msg);
 757
 758	ewq.msg.event = UFFD_EVENT_FORK;
 759	ewq.msg.arg.reserved.reserved1 = (unsigned long)fctx->new;
 760
 761	userfaultfd_event_wait_completion(ctx, &ewq);
 762}
 763
 764void dup_userfaultfd_complete(struct list_head *fcs)
 765{
 766	struct userfaultfd_fork_ctx *fctx, *n;
 767
 768	list_for_each_entry_safe(fctx, n, fcs, list) {
 769		dup_fctx(fctx);
 770		list_del(&fctx->list);
 771		kfree(fctx);
 772	}
 773}
 774
 775void mremap_userfaultfd_prep(struct vm_area_struct *vma,
 776			     struct vm_userfaultfd_ctx *vm_ctx)
 777{
 778	struct userfaultfd_ctx *ctx;
 779
 780	ctx = vma->vm_userfaultfd_ctx.ctx;
 781
 782	if (!ctx)
 783		return;
 784
 785	if (ctx->features & UFFD_FEATURE_EVENT_REMAP) {
 786		vm_ctx->ctx = ctx;
 787		userfaultfd_ctx_get(ctx);
 788		atomic_inc(&ctx->mmap_changing);
 789	} else {
 790		/* Drop uffd context if remap feature not enabled */
 791		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
 792		userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS);
 793	}
 794}
 795
 796void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx,
 797				 unsigned long from, unsigned long to,
 798				 unsigned long len)
 799{
 800	struct userfaultfd_ctx *ctx = vm_ctx->ctx;
 801	struct userfaultfd_wait_queue ewq;
 802
 803	if (!ctx)
 804		return;
 805
 806	if (to & ~PAGE_MASK) {
 807		userfaultfd_ctx_put(ctx);
 808		return;
 809	}
 810
 811	msg_init(&ewq.msg);
 812
 813	ewq.msg.event = UFFD_EVENT_REMAP;
 814	ewq.msg.arg.remap.from = from;
 815	ewq.msg.arg.remap.to = to;
 816	ewq.msg.arg.remap.len = len;
 817
 818	userfaultfd_event_wait_completion(ctx, &ewq);
 819}
 820
 821bool userfaultfd_remove(struct vm_area_struct *vma,
 822			unsigned long start, unsigned long end)
 823{
 824	struct mm_struct *mm = vma->vm_mm;
 825	struct userfaultfd_ctx *ctx;
 826	struct userfaultfd_wait_queue ewq;
 827
 828	ctx = vma->vm_userfaultfd_ctx.ctx;
 829	if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_REMOVE))
 830		return true;
 831
 832	userfaultfd_ctx_get(ctx);
 833	atomic_inc(&ctx->mmap_changing);
 834	mmap_read_unlock(mm);
 835
 836	msg_init(&ewq.msg);
 837
 838	ewq.msg.event = UFFD_EVENT_REMOVE;
 839	ewq.msg.arg.remove.start = start;
 840	ewq.msg.arg.remove.end = end;
 841
 842	userfaultfd_event_wait_completion(ctx, &ewq);
 843
 844	return false;
 845}
 846
 847static bool has_unmap_ctx(struct userfaultfd_ctx *ctx, struct list_head *unmaps,
 848			  unsigned long start, unsigned long end)
 849{
 850	struct userfaultfd_unmap_ctx *unmap_ctx;
 851
 852	list_for_each_entry(unmap_ctx, unmaps, list)
 853		if (unmap_ctx->ctx == ctx && unmap_ctx->start == start &&
 854		    unmap_ctx->end == end)
 855			return true;
 856
 857	return false;
 858}
 859
 860int userfaultfd_unmap_prep(struct mm_struct *mm, unsigned long start,
 861			   unsigned long end, struct list_head *unmaps)
 862{
 863	VMA_ITERATOR(vmi, mm, start);
 864	struct vm_area_struct *vma;
 865
 866	for_each_vma_range(vmi, vma, end) {
 867		struct userfaultfd_unmap_ctx *unmap_ctx;
 868		struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;
 869
 870		if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_UNMAP) ||
 871		    has_unmap_ctx(ctx, unmaps, start, end))
 872			continue;
 873
 874		unmap_ctx = kzalloc(sizeof(*unmap_ctx), GFP_KERNEL);
 875		if (!unmap_ctx)
 876			return -ENOMEM;
 877
 878		userfaultfd_ctx_get(ctx);
 879		atomic_inc(&ctx->mmap_changing);
 880		unmap_ctx->ctx = ctx;
 881		unmap_ctx->start = start;
 882		unmap_ctx->end = end;
 883		list_add_tail(&unmap_ctx->list, unmaps);
 884	}
 885
 886	return 0;
 887}
 888
 889void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf)
 890{
 891	struct userfaultfd_unmap_ctx *ctx, *n;
 892	struct userfaultfd_wait_queue ewq;
 893
 894	list_for_each_entry_safe(ctx, n, uf, list) {
 895		msg_init(&ewq.msg);
 896
 897		ewq.msg.event = UFFD_EVENT_UNMAP;
 898		ewq.msg.arg.remove.start = ctx->start;
 899		ewq.msg.arg.remove.end = ctx->end;
 900
 901		userfaultfd_event_wait_completion(ctx->ctx, &ewq);
 902
 903		list_del(&ctx->list);
 904		kfree(ctx);
 905	}
 906}
 907
 908static int userfaultfd_release(struct inode *inode, struct file *file)
 909{
 910	struct userfaultfd_ctx *ctx = file->private_data;
 911	struct mm_struct *mm = ctx->mm;
 912	struct vm_area_struct *vma, *prev;
 913	/* len == 0 means wake all */
 914	struct userfaultfd_wake_range range = { .len = 0, };
 915	unsigned long new_flags;
 916	VMA_ITERATOR(vmi, mm, 0);
 917
 918	WRITE_ONCE(ctx->released, true);
 919
 920	if (!mmget_not_zero(mm))
 921		goto wakeup;
 922
 923	/*
 924	 * Flush page faults out of all CPUs. NOTE: all page faults
 925	 * must be retried without returning VM_FAULT_SIGBUS if
 926	 * userfaultfd_ctx_get() succeeds but vma->vma_userfault_ctx
 927	 * changes while handle_userfault released the mmap_lock. So
 928	 * it's critical that released is set to true (above), before
 929	 * taking the mmap_lock for writing.
 930	 */
 931	mmap_write_lock(mm);
 932	prev = NULL;
 933	for_each_vma(vmi, vma) {
 934		cond_resched();
 935		BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^
 936		       !!(vma->vm_flags & __VM_UFFD_FLAGS));
 937		if (vma->vm_userfaultfd_ctx.ctx != ctx) {
 938			prev = vma;
 939			continue;
 940		}
 941		new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
 942		prev = vma_merge(&vmi, mm, prev, vma->vm_start, vma->vm_end,
 943				 new_flags, vma->anon_vma,
 944				 vma->vm_file, vma->vm_pgoff,
 945				 vma_policy(vma),
 946				 NULL_VM_UFFD_CTX, anon_vma_name(vma));
 947		if (prev) {
 948			vma = prev;
 949		} else {
 950			prev = vma;
 951		}
 952
 953		userfaultfd_set_vm_flags(vma, new_flags);
 954		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
 955	}
 956	mmap_write_unlock(mm);
 957	mmput(mm);
 958wakeup:
 959	/*
 960	 * After no new page faults can wait on this fault_*wqh, flush
 961	 * the last page faults that may have been already waiting on
 962	 * the fault_*wqh.
 963	 */
 964	spin_lock_irq(&ctx->fault_pending_wqh.lock);
 965	__wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, &range);
 966	__wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, &range);
 967	spin_unlock_irq(&ctx->fault_pending_wqh.lock);
 968
 969	/* Flush pending events that may still wait on event_wqh */
 970	wake_up_all(&ctx->event_wqh);
 971
 972	wake_up_poll(&ctx->fd_wqh, EPOLLHUP);
 973	userfaultfd_ctx_put(ctx);
 974	return 0;
 975}
 976
 977/* fault_pending_wqh.lock must be hold by the caller */
 978static inline struct userfaultfd_wait_queue *find_userfault_in(
 979		wait_queue_head_t *wqh)
 980{
 981	wait_queue_entry_t *wq;
 982	struct userfaultfd_wait_queue *uwq;
 983
 984	lockdep_assert_held(&wqh->lock);
 985
 986	uwq = NULL;
 987	if (!waitqueue_active(wqh))
 988		goto out;
 989	/* walk in reverse to provide FIFO behavior to read userfaults */
 990	wq = list_last_entry(&wqh->head, typeof(*wq), entry);
 991	uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
 992out:
 993	return uwq;
 994}
 995
 996static inline struct userfaultfd_wait_queue *find_userfault(
 997		struct userfaultfd_ctx *ctx)
 998{
 999	return find_userfault_in(&ctx->fault_pending_wqh);
1000}
1001
1002static inline struct userfaultfd_wait_queue *find_userfault_evt(
1003		struct userfaultfd_ctx *ctx)
1004{
1005	return find_userfault_in(&ctx->event_wqh);
1006}
1007
1008static __poll_t userfaultfd_poll(struct file *file, poll_table *wait)
1009{
1010	struct userfaultfd_ctx *ctx = file->private_data;
1011	__poll_t ret;
1012
1013	poll_wait(file, &ctx->fd_wqh, wait);
1014
1015	if (!userfaultfd_is_initialized(ctx))
1016		return EPOLLERR;
1017
1018	/*
1019	 * poll() never guarantees that read won't block.
1020	 * userfaults can be waken before they're read().
1021	 */
1022	if (unlikely(!(file->f_flags & O_NONBLOCK)))
1023		return EPOLLERR;
1024	/*
1025	 * lockless access to see if there are pending faults
1026	 * __pollwait last action is the add_wait_queue but
1027	 * the spin_unlock would allow the waitqueue_active to
1028	 * pass above the actual list_add inside
1029	 * add_wait_queue critical section. So use a full
1030	 * memory barrier to serialize the list_add write of
1031	 * add_wait_queue() with the waitqueue_active read
1032	 * below.
1033	 */
1034	ret = 0;
1035	smp_mb();
1036	if (waitqueue_active(&ctx->fault_pending_wqh))
1037		ret = EPOLLIN;
1038	else if (waitqueue_active(&ctx->event_wqh))
1039		ret = EPOLLIN;
1040
1041	return ret;
1042}
1043
/*
 * Forward declaration: resolve_userfault_fork() below takes the address
 * of the file operations before they are defined further down the file.
 */
static const struct file_operations userfaultfd_fops;
1045
1046static int resolve_userfault_fork(struct userfaultfd_ctx *new,
1047				  struct inode *inode,
1048				  struct uffd_msg *msg)
1049{
1050	int fd;
1051
1052	fd = anon_inode_getfd_secure("[userfaultfd]", &userfaultfd_fops, new,
1053			O_RDONLY | (new->flags & UFFD_SHARED_FCNTL_FLAGS), inode);
1054	if (fd < 0)
1055		return fd;
1056
1057	msg->arg.reserved.reserved1 = 0;
1058	msg->arg.fork.ufd = fd;
1059	return 0;
1060}
1061
/*
 * Dequeue a single message from @ctx into @msg.
 *
 * Pending page faults (fault_pending_wqh) are served first and refiled
 * into fault_wqh; if there are none, the entry at the tail of event_wqh
 * is returned instead. When nothing is queued the caller sleeps in
 * TASK_INTERRUPTIBLE on fd_wqh unless @no_wait is set.
 *
 * For UFFD_EVENT_FORK the new file descriptor is created through
 * resolve_userfault_fork() with @inode after all the spinlocks taken in
 * the loop have been dropped.
 *
 * Returns 0 with *msg filled in, -EAGAIN if @no_wait is set and nothing
 * is queued, -ERESTARTSYS if a signal is pending, or the error returned
 * by resolve_userfault_fork().
 */
static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
				    struct uffd_msg *msg, struct inode *inode)
{
	ssize_t ret;
	DECLARE_WAITQUEUE(wait, current);
	struct userfaultfd_wait_queue *uwq;
	/*
	 * Handling a fork event requires sleeping operations, so
	 * we drop the event_wqh lock, then do these ops, then
	 * lock it back and wake up the waiter. While the lock is
	 * dropped the ewq may go away so we keep track of it
	 * carefully.
	 */
	LIST_HEAD(fork_event);
	struct userfaultfd_ctx *fork_nctx = NULL;

	/* always take the fd_wqh lock before the fault_pending_wqh lock */
	spin_lock_irq(&ctx->fd_wqh.lock);
	__add_wait_queue(&ctx->fd_wqh, &wait);
	for (;;) {
		/* set the task state before checking the queues */
		set_current_state(TASK_INTERRUPTIBLE);
		spin_lock(&ctx->fault_pending_wqh.lock);
		uwq = find_userfault(ctx);
		if (uwq) {
			/*
			 * Use a seqcount to repeat the lockless check
			 * in wake_userfault() to avoid missing
			 * wakeups because during the refile both
			 * waitqueues could become empty if this is the
			 * only userfault.
			 */
			write_seqcount_begin(&ctx->refile_seq);

			/*
			 * The fault_pending_wqh.lock prevents the uwq
			 * from disappearing from under us.
			 *
			 * Refile this userfault from
			 * fault_pending_wqh to fault_wqh, it's not
			 * pending anymore after we read it.
			 *
			 * Use list_del() by hand (as
			 * userfaultfd_wake_function also uses
			 * list_del_init() by hand) to be sure nobody
			 * changes __remove_wait_queue() to use
			 * list_del_init() in turn breaking the
			 * !list_empty_careful() check in
			 * handle_userfault(). The uwq->wq.head list
			 * must never be empty at any time during the
			 * refile, or the waitqueue could disappear
			 * from under us. The "wait_queue_head_t"
			 * parameter of __remove_wait_queue() is unused
			 * anyway.
			 */
			list_del(&uwq->wq.entry);
			add_wait_queue(&ctx->fault_wqh, &uwq->wq);

			write_seqcount_end(&ctx->refile_seq);

			/* careful to always initialize msg if ret == 0 */
			*msg = uwq->msg;
			spin_unlock(&ctx->fault_pending_wqh.lock);
			ret = 0;
			break;
		}
		spin_unlock(&ctx->fault_pending_wqh.lock);

		/* no pending page fault: look for a queued event instead */
		spin_lock(&ctx->event_wqh.lock);
		uwq = find_userfault_evt(ctx);
		if (uwq) {
			*msg = uwq->msg;

			if (uwq->msg.event == UFFD_EVENT_FORK) {
				/* reserved1 carries the new context pointer */
				fork_nctx = (struct userfaultfd_ctx *)
					(unsigned long)
					uwq->msg.arg.reserved.reserved1;
				/* park the event on the local fork_event list */
				list_move(&uwq->wq.entry, &fork_event);
				/*
				 * fork_nctx can be freed as soon as
				 * we drop the lock, unless we take a
				 * reference on it.
				 */
				userfaultfd_ctx_get(fork_nctx);
				spin_unlock(&ctx->event_wqh.lock);
				ret = 0;
				break;
			}

			userfaultfd_event_complete(ctx, uwq);
			spin_unlock(&ctx->event_wqh.lock);
			ret = 0;
			break;
		}
		spin_unlock(&ctx->event_wqh.lock);

		/* nothing queued: bail out or sleep until woken on fd_wqh */
		if (signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}
		if (no_wait) {
			ret = -EAGAIN;
			break;
		}
		spin_unlock_irq(&ctx->fd_wqh.lock);
		schedule();
		spin_lock_irq(&ctx->fd_wqh.lock);
	}
	__remove_wait_queue(&ctx->fd_wqh, &wait);
	__set_current_state(TASK_RUNNING);
	spin_unlock_irq(&ctx->fd_wqh.lock);

	if (!ret && msg->event == UFFD_EVENT_FORK) {
		/* may sleep: called with no spinlock held */
		ret = resolve_userfault_fork(fork_nctx, inode, msg);
		spin_lock_irq(&ctx->event_wqh.lock);
		if (!list_empty(&fork_event)) {
			/*
			 * The fork thread didn't abort, so we can
			 * drop the temporary refcount.
			 */
			userfaultfd_ctx_put(fork_nctx);

			uwq = list_first_entry(&fork_event,
					       typeof(*uwq),
					       wq.entry);
			/*
			 * If fork_event list wasn't empty and in turn
			 * the event wasn't already released by fork
			 * (the event is allocated on fork kernel
			 * stack), put the event back to its place in
			 * the event_wq. fork_event head will be freed
			 * as soon as we return so the event cannot
			 * stay queued there no matter the current
			 * "ret" value.
			 */
			list_del(&uwq->wq.entry);
			__add_wait_queue(&ctx->event_wqh, &uwq->wq);

			/*
			 * Leave the event in the waitqueue and report
			 * error to userland if we failed to resolve
			 * the userfault fork.
			 */
			if (likely(!ret))
				userfaultfd_event_complete(ctx, uwq);
		} else {
			/*
			 * Here the fork thread aborted and the
			 * refcount from the fork thread on fork_nctx
			 * has already been released. We still hold
			 * the reference we took before releasing the
			 * lock above. If resolve_userfault_fork
			 * failed we have to drop it because the
			 * fork_nctx has to be freed in such case. If
			 * it succeeded we'll hold it because the new
			 * uffd references it.
			 */
			if (ret)
				userfaultfd_ctx_put(fork_nctx);
		}
		spin_unlock_irq(&ctx->event_wqh.lock);
	}

	return ret;
}
1226
1227static ssize_t userfaultfd_read(struct file *file, char __user *buf,
1228				size_t count, loff_t *ppos)
1229{
1230	struct userfaultfd_ctx *ctx = file->private_data;
1231	ssize_t _ret, ret = 0;
1232	struct uffd_msg msg;
1233	int no_wait = file->f_flags & O_NONBLOCK;
1234	struct inode *inode = file_inode(file);
1235
1236	if (!userfaultfd_is_initialized(ctx))
1237		return -EINVAL;
1238
1239	for (;;) {
1240		if (count < sizeof(msg))
1241			return ret ? ret : -EINVAL;
1242		_ret = userfaultfd_ctx_read(ctx, no_wait, &msg, inode);
1243		if (_ret < 0)
1244			return ret ? ret : _ret;
1245		if (copy_to_user((__u64 __user *) buf, &msg, sizeof(msg)))
1246			return ret ? ret : -EFAULT;
1247		ret += sizeof(msg);
1248		buf += sizeof(msg);
1249		count -= sizeof(msg);
1250		/*
1251		 * Allow to read more than one fault at time but only
1252		 * block if waiting for the very first one.
1253		 */
1254		no_wait = O_NONBLOCK;
1255	}
1256}
1257
1258static void __wake_userfault(struct userfaultfd_ctx *ctx,
1259			     struct userfaultfd_wake_range *range)
1260{
1261	spin_lock_irq(&ctx->fault_pending_wqh.lock);
1262	/* wake all in the range and autoremove */
1263	if (waitqueue_active(&ctx->fault_pending_wqh))
1264		__wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL,
1265				     range);
1266	if (waitqueue_active(&ctx->fault_wqh))
1267		__wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, range);
1268	spin_unlock_irq(&ctx->fault_pending_wqh.lock);
1269}
1270
1271static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
1272					   struct userfaultfd_wake_range *range)
1273{
1274	unsigned seq;
1275	bool need_wakeup;
1276
1277	/*
1278	 * To be sure waitqueue_active() is not reordered by the CPU
1279	 * before the pagetable update, use an explicit SMP memory
1280	 * barrier here. PT lock release or mmap_read_unlock(mm) still
1281	 * have release semantics that can allow the
1282	 * waitqueue_active() to be reordered before the pte update.
1283	 */
1284	smp_mb();
1285
1286	/*
1287	 * Use waitqueue_active because it's very frequent to
1288	 * change the address space atomically even if there are no
1289	 * userfaults yet. So we take the spinlock only when we're
1290	 * sure we've userfaults to wake.
1291	 */
1292	do {
1293		seq = read_seqcount_begin(&ctx->refile_seq);
1294		need_wakeup = waitqueue_active(&ctx->fault_pending_wqh) ||
1295			waitqueue_active(&ctx->fault_wqh);
1296		cond_resched();
1297	} while (read_seqcount_retry(&ctx->refile_seq, seq));
1298	if (need_wakeup)
1299		__wake_userfault(ctx, range);
1300}
1301
1302static __always_inline int validate_range(struct mm_struct *mm,
1303					  __u64 start, __u64 len)
1304{
1305	__u64 task_size = mm->task_size;
1306
1307	if (start & ~PAGE_MASK)
1308		return -EINVAL;
1309	if (len & ~PAGE_MASK)
1310		return -EINVAL;
1311	if (!len)
1312		return -EINVAL;
1313	if (start < mmap_min_addr)
1314		return -EINVAL;
1315	if (start >= task_size)
1316		return -EINVAL;
1317	if (len > task_size - start)
1318		return -EINVAL;
1319	return 0;
1320}
1321
/*
 * UFFDIO_REGISTER handler: start tracking the user supplied range with
 * this userfaultfd in the requested mode(s).
 *
 * The work is done under mmap_write_lock() in two passes over the vmas
 * intersecting the range: the first pass only validates every vma, the
 * second one merges/splits the vmas and sets the uffd flags and the
 * context on them. On success the ioctls usable on the range are
 * written back to the "ioctls" field of the user structure.
 *
 * Returns 0 on success, -EFAULT if the user structure cannot be
 * accessed, -EINVAL for a bad mode/range or an incompatible vma,
 * -ENOMEM if the mm is already going away (mmget_not_zero() failed),
 * -EPERM if a vma lacks VM_MAYWRITE, -EBUSY if a vma belongs to another
 * userfaultfd, or the error returned by split_vma().
 */
static int userfaultfd_register(struct userfaultfd_ctx *ctx,
				unsigned long arg)
{
	struct mm_struct *mm = ctx->mm;
	struct vm_area_struct *vma, *prev, *cur;
	int ret;
	struct uffdio_register uffdio_register;
	struct uffdio_register __user *user_uffdio_register;
	unsigned long vm_flags, new_flags;
	bool found;
	bool basic_ioctls;
	unsigned long start, end, vma_end;
	struct vma_iterator vmi;
	pgoff_t pgoff;

	user_uffdio_register = (struct uffdio_register __user *) arg;

	ret = -EFAULT;
	/*
	 * The trailing __u64 is not copied in; presumably that is the
	 * "ioctls" output field written by put_user() at the end of
	 * this function -- confirm against the uapi structure layout.
	 */
	if (copy_from_user(&uffdio_register, user_uffdio_register,
			   sizeof(uffdio_register)-sizeof(__u64)))
		goto out;

	ret = -EINVAL;
	if (!uffdio_register.mode)
		goto out;
	if (uffdio_register.mode & ~UFFD_API_REGISTER_MODES)
		goto out;
	/* translate the requested modes into VM_UFFD_* vma flags */
	vm_flags = 0;
	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING)
		vm_flags |= VM_UFFD_MISSING;
	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) {
#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP
		goto out;
#endif
		vm_flags |= VM_UFFD_WP;
	}
	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR) {
#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
		goto out;
#endif
		vm_flags |= VM_UFFD_MINOR;
	}

	ret = validate_range(mm, uffdio_register.range.start,
			     uffdio_register.range.len);
	if (ret)
		goto out;

	start = uffdio_register.range.start;
	end = start + uffdio_register.range.len;

	ret = -ENOMEM;
	if (!mmget_not_zero(mm))
		goto out;

	ret = -EINVAL;
	mmap_write_lock(mm);
	vma_iter_init(&vmi, mm, start);
	vma = vma_find(&vmi, end);
	if (!vma)
		goto out_unlock;

	/*
	 * If the first vma contains huge pages, make sure start address
	 * is aligned to huge page size.
	 */
	if (is_vm_hugetlb_page(vma)) {
		unsigned long vma_hpagesize = vma_kernel_pagesize(vma);

		if (start & (vma_hpagesize - 1))
			goto out_unlock;
	}

	/*
	 * First pass: search for incompatible vmas. Nothing is modified
	 * until every vma in the range has been checked.
	 */
	found = false;
	basic_ioctls = false;
	cur = vma;
	do {
		cond_resched();

		/* a vma has a uffd ctx if and only if it has uffd flags */
		BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
		       !!(cur->vm_flags & __VM_UFFD_FLAGS));

		/* check not compatible vmas */
		ret = -EINVAL;
		if (!vma_can_userfault(cur, vm_flags))
			goto out_unlock;

		/*
		 * UFFDIO_COPY will fill file holes even without
		 * PROT_WRITE. This check enforces that if this is a
		 * MAP_SHARED, the process has write permission to the backing
		 * file. If VM_MAYWRITE is set it also enforces that on a
		 * MAP_SHARED vma: there is no F_WRITE_SEAL and no further
		 * F_WRITE_SEAL can be taken until the vma is destroyed.
		 */
		ret = -EPERM;
		if (unlikely(!(cur->vm_flags & VM_MAYWRITE)))
			goto out_unlock;

		/*
		 * If this vma contains ending address, and huge pages
		 * check alignment.
		 */
		if (is_vm_hugetlb_page(cur) && end <= cur->vm_end &&
		    end > cur->vm_start) {
			unsigned long vma_hpagesize = vma_kernel_pagesize(cur);

			ret = -EINVAL;

			if (end & (vma_hpagesize - 1))
				goto out_unlock;
		}
		/*
		 * NOTE(review): this cannot trigger, a vma without
		 * VM_MAYWRITE already bailed out with -EPERM above.
		 */
		if ((vm_flags & VM_UFFD_WP) && !(cur->vm_flags & VM_MAYWRITE))
			goto out_unlock;

		/*
		 * Check that this vma isn't already owned by a
		 * different userfaultfd. We can't allow more than one
		 * userfaultfd to own a single vma simultaneously or we
		 * wouldn't know which one to deliver the userfaults to.
		 */
		ret = -EBUSY;
		if (cur->vm_userfaultfd_ctx.ctx &&
		    cur->vm_userfaultfd_ctx.ctx != ctx)
			goto out_unlock;

		/*
		 * Note vmas containing huge pages
		 */
		if (is_vm_hugetlb_page(cur))
			basic_ioctls = true;

		found = true;
	} for_each_vma_range(vmi, cur, end);
	BUG_ON(!found);

	/* rewind the iterator and find the vma preceding the range */
	vma_iter_set(&vmi, start);
	prev = vma_prev(&vmi);
	if (vma->vm_start < start)
		prev = vma;

	/* second pass: apply the new flags, merging/splitting vmas */
	ret = 0;
	for_each_vma_range(vmi, vma, end) {
		cond_resched();

		/* all of these were already verified by the first pass */
		BUG_ON(!vma_can_userfault(vma, vm_flags));
		BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
		       vma->vm_userfaultfd_ctx.ctx != ctx);
		WARN_ON(!(vma->vm_flags & VM_MAYWRITE));

		/*
		 * Nothing to do: this vma is already registered into this
		 * userfaultfd and with the right tracking mode too.
		 */
		if (vma->vm_userfaultfd_ctx.ctx == ctx &&
		    (vma->vm_flags & vm_flags) == vm_flags)
			goto skip;

		/* clamp [start, vma_end) to the part of the range in this vma */
		if (vma->vm_start > start)
			start = vma->vm_start;
		vma_end = min(end, vma->vm_end);

		new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags;
		pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
		prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags,
				 vma->anon_vma, vma->vm_file, pgoff,
				 vma_policy(vma),
				 ((struct vm_userfaultfd_ctx){ ctx }),
				 anon_vma_name(vma));
		if (prev) {
			/* vma_merge() invalidated the mas */
			vma = prev;
			goto next;
		}
		/* no merge possible: split off the parts outside the range */
		if (vma->vm_start < start) {
			ret = split_vma(&vmi, vma, start, 1);
			if (ret)
				break;
		}
		if (vma->vm_end > end) {
			ret = split_vma(&vmi, vma, end, 0);
			if (ret)
				break;
		}
	next:
		/*
		 * In the vma_merge() successful mprotect-like case 8:
		 * the next vma was merged into the current one and
		 * the current one has not been updated yet.
		 */
		userfaultfd_set_vm_flags(vma, new_flags);
		vma->vm_userfaultfd_ctx.ctx = ctx;

		if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma))
			hugetlb_unshare_all_pmds(vma);

	skip:
		prev = vma;
		start = vma->vm_end;
	}

out_unlock:
	mmap_write_unlock(mm);
	mmput(mm);
	if (!ret) {
		__u64 ioctls_out;

		/* ranges containing hugetlb vmas only get the basic set */
		ioctls_out = basic_ioctls ? UFFD_API_RANGE_IOCTLS_BASIC :
		    UFFD_API_RANGE_IOCTLS;

		/*
		 * Declare the WP ioctl only if the WP mode is
		 * specified and all checks passed with the range
		 */
		if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_WP))
			ioctls_out &= ~((__u64)1 << _UFFDIO_WRITEPROTECT);

		/* CONTINUE ioctl is only supported for MINOR ranges. */
		if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR))
			ioctls_out &= ~((__u64)1 << _UFFDIO_CONTINUE);

		/*
		 * Now that we scanned all vmas we can already tell
		 * userland which ioctls methods are guaranteed to
		 * succeed on this range.
		 */
		if (put_user(ioctls_out, &user_uffdio_register->ioctls))
			ret = -EFAULT;
	}
out:
	return ret;
}
1557
1558static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
1559				  unsigned long arg)
1560{
1561	struct mm_struct *mm = ctx->mm;
1562	struct vm_area_struct *vma, *prev, *cur;
1563	int ret;
1564	struct uffdio_range uffdio_unregister;
1565	unsigned long new_flags;
1566	bool found;
1567	unsigned long start, end, vma_end;
1568	const void __user *buf = (void __user *)arg;
1569	struct vma_iterator vmi;
1570	pgoff_t pgoff;
1571
1572	ret = -EFAULT;
1573	if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
1574		goto out;
1575
1576	ret = validate_range(mm, uffdio_unregister.start,
1577			     uffdio_unregister.len);
1578	if (ret)
1579		goto out;
1580
1581	start = uffdio_unregister.start;
1582	end = start + uffdio_unregister.len;
1583
1584	ret = -ENOMEM;
1585	if (!mmget_not_zero(mm))
1586		goto out;
1587
1588	mmap_write_lock(mm);
1589	ret = -EINVAL;
1590	vma_iter_init(&vmi, mm, start);
1591	vma = vma_find(&vmi, end);
1592	if (!vma)
1593		goto out_unlock;
1594
1595	/*
1596	 * If the first vma contains huge pages, make sure start address
1597	 * is aligned to huge page size.
1598	 */
1599	if (is_vm_hugetlb_page(vma)) {
1600		unsigned long vma_hpagesize = vma_kernel_pagesize(vma);
1601
1602		if (start & (vma_hpagesize - 1))
1603			goto out_unlock;
1604	}
1605
1606	/*
1607	 * Search for not compatible vmas.
1608	 */
1609	found = false;
1610	cur = vma;
1611	do {
1612		cond_resched();
1613
1614		BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
1615		       !!(cur->vm_flags & __VM_UFFD_FLAGS));
1616
1617		/*
1618		 * Check not compatible vmas, not strictly required
1619		 * here as not compatible vmas cannot have an
1620		 * userfaultfd_ctx registered on them, but this
1621		 * provides for more strict behavior to notice
1622		 * unregistration errors.
1623		 */
1624		if (!vma_can_userfault(cur, cur->vm_flags))
1625			goto out_unlock;
1626
1627		found = true;
1628	} for_each_vma_range(vmi, cur, end);
1629	BUG_ON(!found);
1630
1631	vma_iter_set(&vmi, start);
1632	prev = vma_prev(&vmi);
1633	if (vma->vm_start < start)
1634		prev = vma;
1635
1636	ret = 0;
1637	for_each_vma_range(vmi, vma, end) {
1638		cond_resched();
1639
1640		BUG_ON(!vma_can_userfault(vma, vma->vm_flags));
1641
1642		/*
1643		 * Nothing to do: this vma is already registered into this
1644		 * userfaultfd and with the right tracking mode too.
1645		 */
1646		if (!vma->vm_userfaultfd_ctx.ctx)
1647			goto skip;
1648
1649		WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
1650
1651		if (vma->vm_start > start)
1652			start = vma->vm_start;
1653		vma_end = min(end, vma->vm_end);
1654
1655		if (userfaultfd_missing(vma)) {
1656			/*
1657			 * Wake any concurrent pending userfault while
1658			 * we unregister, so they will not hang
1659			 * permanently and it avoids userland to call
1660			 * UFFDIO_WAKE explicitly.
1661			 */
1662			struct userfaultfd_wake_range range;
1663			range.start = start;
1664			range.len = vma_end - start;
1665			wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range);
1666		}
1667
1668		/* Reset ptes for the whole vma range if wr-protected */
1669		if (userfaultfd_wp(vma))
1670			uffd_wp_range(vma, start, vma_end - start, false);
1671
1672		new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
1673		pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
1674		prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags,
1675				 vma->anon_vma, vma->vm_file, pgoff,
1676				 vma_policy(vma),
1677				 NULL_VM_UFFD_CTX, anon_vma_name(vma));
1678		if (prev) {
1679			vma = prev;
1680			goto next;
1681		}
1682		if (vma->vm_start < start) {
1683			ret = split_vma(&vmi, vma, start, 1);
1684			if (ret)
1685				break;
1686		}
1687		if (vma->vm_end > end) {
1688			ret = split_vma(&vmi, vma, end, 0);
1689			if (ret)
1690				break;
1691		}
1692	next:
1693		/*
1694		 * In the vma_merge() successful mprotect-like case 8:
1695		 * the next vma was merged into the current one and
1696		 * the current one has not been updated yet.
1697		 */
1698		userfaultfd_set_vm_flags(vma, new_flags);
1699		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
1700
1701	skip:
1702		prev = vma;
1703		start = vma->vm_end;
1704	}
1705
1706out_unlock:
1707	mmap_write_unlock(mm);
1708	mmput(mm);
1709out:
1710	return ret;
1711}
1712
1713/*
1714 * userfaultfd_wake may be used in combination with the
1715 * UFFDIO_*_MODE_DONTWAKE to wakeup userfaults in batches.
1716 */
1717static int userfaultfd_wake(struct userfaultfd_ctx *ctx,
1718			    unsigned long arg)
1719{
1720	int ret;
1721	struct uffdio_range uffdio_wake;
1722	struct userfaultfd_wake_range range;
1723	const void __user *buf = (void __user *)arg;
1724
1725	ret = -EFAULT;
1726	if (copy_from_user(&uffdio_wake, buf, sizeof(uffdio_wake)))
1727		goto out;
1728
1729	ret = validate_range(ctx->mm, uffdio_wake.start, uffdio_wake.len);
1730	if (ret)
1731		goto out;
1732
1733	range.start = uffdio_wake.start;
1734	range.len = uffdio_wake.len;
1735
1736	/*
1737	 * len == 0 means wake all and we don't want to wake all here,
1738	 * so check it again to be sure.
1739	 */
1740	VM_BUG_ON(!range.len);
1741
1742	wake_userfault(ctx, &range);
1743	ret = 0;
1744
1745out:
1746	return ret;
1747}
1748
1749static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
1750			    unsigned long arg)
1751{
1752	__s64 ret;
1753	struct uffdio_copy uffdio_copy;
1754	struct uffdio_copy __user *user_uffdio_copy;
1755	struct userfaultfd_wake_range range;
1756	uffd_flags_t flags = 0;
1757
1758	user_uffdio_copy = (struct uffdio_copy __user *) arg;
1759
1760	ret = -EAGAIN;
1761	if (atomic_read(&ctx->mmap_changing))
1762		goto out;
1763
1764	ret = -EFAULT;
1765	if (copy_from_user(&uffdio_copy, user_uffdio_copy,
1766			   /* don't copy "copy" last field */
1767			   sizeof(uffdio_copy)-sizeof(__s64)))
1768		goto out;
1769
1770	ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len);
1771	if (ret)
1772		goto out;
1773	/*
1774	 * double check for wraparound just in case. copy_from_user()
1775	 * will later check uffdio_copy.src + uffdio_copy.len to fit
1776	 * in the userland range.
1777	 */
1778	ret = -EINVAL;
1779	if (uffdio_copy.src + uffdio_copy.len <= uffdio_copy.src)
1780		goto out;
1781	if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP))
1782		goto out;
1783	if (uffdio_copy.mode & UFFDIO_COPY_MODE_WP)
1784		flags |= MFILL_ATOMIC_WP;
1785	if (mmget_not_zero(ctx->mm)) {
1786		ret = mfill_atomic_copy(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
1787					uffdio_copy.len, &ctx->mmap_changing,
1788					flags);
1789		mmput(ctx->mm);
1790	} else {
1791		return -ESRCH;
1792	}
1793	if (unlikely(put_user(ret, &user_uffdio_copy->copy)))
1794		return -EFAULT;
1795	if (ret < 0)
1796		goto out;
1797	BUG_ON(!ret);
1798	/* len == 0 would wake all */
1799	range.len = ret;
1800	if (!(uffdio_copy.mode & UFFDIO_COPY_MODE_DONTWAKE)) {
1801		range.start = uffdio_copy.dst;
1802		wake_userfault(ctx, &range);
1803	}
1804	ret = range.len == uffdio_copy.len ? 0 : -EAGAIN;
1805out:
1806	return ret;
1807}
1808
1809static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
1810				unsigned long arg)
1811{
1812	__s64 ret;
1813	struct uffdio_zeropage uffdio_zeropage;
1814	struct uffdio_zeropage __user *user_uffdio_zeropage;
1815	struct userfaultfd_wake_range range;
1816
1817	user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg;
1818
1819	ret = -EAGAIN;
1820	if (atomic_read(&ctx->mmap_changing))
1821		goto out;
1822
1823	ret = -EFAULT;
1824	if (copy_from_user(&uffdio_zeropage, user_uffdio_zeropage,
1825			   /* don't copy "zeropage" last field */
1826			   sizeof(uffdio_zeropage)-sizeof(__s64)))
1827		goto out;
1828
1829	ret = validate_range(ctx->mm, uffdio_zeropage.range.start,
1830			     uffdio_zeropage.range.len);
1831	if (ret)
1832		goto out;
1833	ret = -EINVAL;
1834	if (uffdio_zeropage.mode & ~UFFDIO_ZEROPAGE_MODE_DONTWAKE)
1835		goto out;
1836
1837	if (mmget_not_zero(ctx->mm)) {
1838		ret = mfill_atomic_zeropage(ctx->mm, uffdio_zeropage.range.start,
1839					   uffdio_zeropage.range.len,
1840					   &ctx->mmap_changing);
1841		mmput(ctx->mm);
1842	} else {
1843		return -ESRCH;
1844	}
1845	if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage)))
1846		return -EFAULT;
1847	if (ret < 0)
1848		goto out;
1849	/* len == 0 would wake all */
1850	BUG_ON(!ret);
1851	range.len = ret;
1852	if (!(uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_DONTWAKE)) {
1853		range.start = uffdio_zeropage.range.start;
1854		wake_userfault(ctx, &range);
1855	}
1856	ret = range.len == uffdio_zeropage.range.len ? 0 : -EAGAIN;
1857out:
1858	return ret;
1859}
1860
1861static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
1862				    unsigned long arg)
1863{
1864	int ret;
1865	struct uffdio_writeprotect uffdio_wp;
1866	struct uffdio_writeprotect __user *user_uffdio_wp;
1867	struct userfaultfd_wake_range range;
1868	bool mode_wp, mode_dontwake;
1869
1870	if (atomic_read(&ctx->mmap_changing))
1871		return -EAGAIN;
1872
1873	user_uffdio_wp = (struct uffdio_writeprotect __user *) arg;
1874
1875	if (copy_from_user(&uffdio_wp, user_uffdio_wp,
1876			   sizeof(struct uffdio_writeprotect)))
1877		return -EFAULT;
1878
1879	ret = validate_range(ctx->mm, uffdio_wp.range.start,
1880			     uffdio_wp.range.len);
1881	if (ret)
1882		return ret;
1883
1884	if (uffdio_wp.mode & ~(UFFDIO_WRITEPROTECT_MODE_DONTWAKE |
1885			       UFFDIO_WRITEPROTECT_MODE_WP))
1886		return -EINVAL;
1887
1888	mode_wp = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP;
1889	mode_dontwake = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE;
1890
1891	if (mode_wp && mode_dontwake)
1892		return -EINVAL;
1893
1894	if (mmget_not_zero(ctx->mm)) {
1895		ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start,
1896					  uffdio_wp.range.len, mode_wp,
1897					  &ctx->mmap_changing);
1898		mmput(ctx->mm);
1899	} else {
1900		return -ESRCH;
1901	}
1902
1903	if (ret)
1904		return ret;
1905
1906	if (!mode_wp && !mode_dontwake) {
1907		range.start = uffdio_wp.range.start;
1908		range.len = uffdio_wp.range.len;
1909		wake_userfault(ctx, &range);
1910	}
1911	return ret;
1912}
1913
1914static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
1915{
1916	__s64 ret;
1917	struct uffdio_continue uffdio_continue;
1918	struct uffdio_continue __user *user_uffdio_continue;
1919	struct userfaultfd_wake_range range;
1920	uffd_flags_t flags = 0;
1921
1922	user_uffdio_continue = (struct uffdio_continue __user *)arg;
1923
1924	ret = -EAGAIN;
1925	if (atomic_read(&ctx->mmap_changing))
1926		goto out;
1927
1928	ret = -EFAULT;
1929	if (copy_from_user(&uffdio_continue, user_uffdio_continue,
1930			   /* don't copy the output fields */
1931			   sizeof(uffdio_continue) - (sizeof(__s64))))
1932		goto out;
1933
1934	ret = validate_range(ctx->mm, uffdio_continue.range.start,
1935			     uffdio_continue.range.len);
1936	if (ret)
1937		goto out;
1938
1939	ret = -EINVAL;
1940	/* double check for wraparound just in case. */
1941	if (uffdio_continue.range.start + uffdio_continue.range.len <=
1942	    uffdio_continue.range.start) {
1943		goto out;
1944	}
1945	if (uffdio_continue.mode & ~(UFFDIO_CONTINUE_MODE_DONTWAKE |
1946				     UFFDIO_CONTINUE_MODE_WP))
1947		goto out;
1948	if (uffdio_continue.mode & UFFDIO_CONTINUE_MODE_WP)
1949		flags |= MFILL_ATOMIC_WP;
1950
1951	if (mmget_not_zero(ctx->mm)) {
1952		ret = mfill_atomic_continue(ctx->mm, uffdio_continue.range.start,
1953					    uffdio_continue.range.len,
1954					    &ctx->mmap_changing, flags);
1955		mmput(ctx->mm);
1956	} else {
1957		return -ESRCH;
1958	}
1959
1960	if (unlikely(put_user(ret, &user_uffdio_continue->mapped)))
1961		return -EFAULT;
1962	if (ret < 0)
1963		goto out;
1964
1965	/* len == 0 would wake all */
1966	BUG_ON(!ret);
1967	range.len = ret;
1968	if (!(uffdio_continue.mode & UFFDIO_CONTINUE_MODE_DONTWAKE)) {
1969		range.start = uffdio_continue.range.start;
1970		wake_userfault(ctx, &range);
1971	}
1972	ret = range.len == uffdio_continue.range.len ? 0 : -EAGAIN;
1973
1974out:
1975	return ret;
1976}
1977
1978static inline unsigned int uffd_ctx_features(__u64 user_features)
1979{
1980	/*
1981	 * For the current set of features the bits just coincide. Set
1982	 * UFFD_FEATURE_INITIALIZED to mark the features as enabled.
1983	 */
1984	return (unsigned int)user_features | UFFD_FEATURE_INITIALIZED;
1985}
1986
1987/*
1988 * userland asks for a certain API version and we return which bits
1989 * and ioctl commands are implemented in this kernel for such API
1990 * version or -EINVAL if unknown.
1991 */
1992static int userfaultfd_api(struct userfaultfd_ctx *ctx,
1993			   unsigned long arg)
1994{
1995	struct uffdio_api uffdio_api;
1996	void __user *buf = (void __user *)arg;
1997	unsigned int ctx_features;
1998	int ret;
1999	__u64 features;
2000
2001	ret = -EFAULT;
2002	if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api)))
2003		goto out;
2004	features = uffdio_api.features;
2005	ret = -EINVAL;
2006	if (uffdio_api.api != UFFD_API || (features & ~UFFD_API_FEATURES))
2007		goto err_out;
2008	ret = -EPERM;
2009	if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE))
2010		goto err_out;
2011	/* report all available features and ioctls to userland */
2012	uffdio_api.features = UFFD_API_FEATURES;
2013#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
2014	uffdio_api.features &=
2015		~(UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM);
2016#endif
2017#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP
2018	uffdio_api.features &= ~UFFD_FEATURE_PAGEFAULT_FLAG_WP;
2019#endif
2020#ifndef CONFIG_PTE_MARKER_UFFD_WP
2021	uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM;
2022	uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED;
2023#endif
2024	uffdio_api.ioctls = UFFD_API_IOCTLS;
2025	ret = -EFAULT;
2026	if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
2027		goto out;
2028
2029	/* only enable the requested features for this uffd context */
2030	ctx_features = uffd_ctx_features(features);
2031	ret = -EINVAL;
2032	if (cmpxchg(&ctx->features, 0, ctx_features) != 0)
2033		goto err_out;
2034
2035	ret = 0;
2036out:
2037	return ret;
2038err_out:
2039	memset(&uffdio_api, 0, sizeof(uffdio_api));
2040	if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
2041		ret = -EFAULT;
2042	goto out;
2043}
2044
2045static long userfaultfd_ioctl(struct file *file, unsigned cmd,
2046			      unsigned long arg)
2047{
2048	int ret = -EINVAL;
2049	struct userfaultfd_ctx *ctx = file->private_data;
2050
2051	if (cmd != UFFDIO_API && !userfaultfd_is_initialized(ctx))
2052		return -EINVAL;
2053
2054	switch(cmd) {
2055	case UFFDIO_API:
2056		ret = userfaultfd_api(ctx, arg);
2057		break;
2058	case UFFDIO_REGISTER:
2059		ret = userfaultfd_register(ctx, arg);
2060		break;
2061	case UFFDIO_UNREGISTER:
2062		ret = userfaultfd_unregister(ctx, arg);
2063		break;
2064	case UFFDIO_WAKE:
2065		ret = userfaultfd_wake(ctx, arg);
2066		break;
2067	case UFFDIO_COPY:
2068		ret = userfaultfd_copy(ctx, arg);
2069		break;
2070	case UFFDIO_ZEROPAGE:
2071		ret = userfaultfd_zeropage(ctx, arg);
2072		break;
2073	case UFFDIO_WRITEPROTECT:
2074		ret = userfaultfd_writeprotect(ctx, arg);
2075		break;
2076	case UFFDIO_CONTINUE:
2077		ret = userfaultfd_continue(ctx, arg);
2078		break;
2079	}
2080	return ret;
2081}
2082
#ifdef CONFIG_PROC_FS
/*
 * fdinfo handler: print the number of pending userfaults, the total
 * number of userfaults queued on both fault waitqueues, and the API
 * version, context features and ioctls.
 */
static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f)
{
	struct userfaultfd_ctx *ctx = f->private_data;
	unsigned long pending = 0, refiled = 0;
	wait_queue_entry_t *wq;

	/* both lists are walked under fault_pending_wqh.lock */
	spin_lock_irq(&ctx->fault_pending_wqh.lock);
	list_for_each_entry(wq, &ctx->fault_pending_wqh.head, entry)
		pending++;
	list_for_each_entry(wq, &ctx->fault_wqh.head, entry)
		refiled++;
	spin_unlock_irq(&ctx->fault_pending_wqh.lock);

	/*
	 * If more protocols are added, they will all be shown
	 * separated by a space. Like this:
	 *	protocols: aa:... bb:...
	 */
	seq_printf(m, "pending:\t%lu\ntotal:\t%lu\nAPI:\t%Lx:%x:%Lx\n",
		   pending, pending + refiled, UFFD_API, ctx->features,
		   UFFD_API_IOCTLS|UFFD_API_RANGE_IOCTLS);
}
#endif
2110
2111static const struct file_operations userfaultfd_fops = {
2112#ifdef CONFIG_PROC_FS
2113	.show_fdinfo	= userfaultfd_show_fdinfo,
2114#endif
2115	.release	= userfaultfd_release,
2116	.poll		= userfaultfd_poll,
2117	.read		= userfaultfd_read,
2118	.unlocked_ioctl = userfaultfd_ioctl,
2119	.compat_ioctl	= compat_ptr_ioctl,
2120	.llseek		= noop_llseek,
2121};
2122
2123static void init_once_userfaultfd_ctx(void *mem)
2124{
2125	struct userfaultfd_ctx *ctx = (struct userfaultfd_ctx *) mem;
2126
2127	init_waitqueue_head(&ctx->fault_pending_wqh);
2128	init_waitqueue_head(&ctx->fault_wqh);
2129	init_waitqueue_head(&ctx->event_wqh);
2130	init_waitqueue_head(&ctx->fd_wqh);
2131	seqcount_spinlock_init(&ctx->refile_seq, &ctx->fault_pending_wqh.lock);
2132}
2133
2134static int new_userfaultfd(int flags)
2135{
2136	struct userfaultfd_ctx *ctx;
2137	int fd;
2138
2139	BUG_ON(!current->mm);
2140
2141	/* Check the UFFD_* constants for consistency.  */
2142	BUILD_BUG_ON(UFFD_USER_MODE_ONLY & UFFD_SHARED_FCNTL_FLAGS);
2143	BUILD_BUG_ON(UFFD_CLOEXEC != O_CLOEXEC);
2144	BUILD_BUG_ON(UFFD_NONBLOCK != O_NONBLOCK);
2145
2146	if (flags & ~(UFFD_SHARED_FCNTL_FLAGS | UFFD_USER_MODE_ONLY))
2147		return -EINVAL;
2148
2149	ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
2150	if (!ctx)
2151		return -ENOMEM;
2152
2153	refcount_set(&ctx->refcount, 1);
2154	ctx->flags = flags;
2155	ctx->features = 0;
2156	ctx->released = false;
2157	atomic_set(&ctx->mmap_changing, 0);
2158	ctx->mm = current->mm;
2159	/* prevent the mm struct to be freed */
2160	mmgrab(ctx->mm);
2161
2162	fd = anon_inode_getfd_secure("[userfaultfd]", &userfaultfd_fops, ctx,
2163			O_RDONLY | (flags & UFFD_SHARED_FCNTL_FLAGS), NULL);
2164	if (fd < 0) {
2165		mmdrop(ctx->mm);
2166		kmem_cache_free(userfaultfd_ctx_cachep, ctx);
2167	}
2168	return fd;
2169}
2170
2171static inline bool userfaultfd_syscall_allowed(int flags)
2172{
2173	/* Userspace-only page faults are always allowed */
2174	if (flags & UFFD_USER_MODE_ONLY)
2175		return true;
2176
2177	/*
2178	 * The user is requesting a userfaultfd which can handle kernel faults.
2179	 * Privileged users are always allowed to do this.
2180	 */
2181	if (capable(CAP_SYS_PTRACE))
2182		return true;
2183
2184	/* Otherwise, access to kernel fault handling is sysctl controlled. */
2185	return sysctl_unprivileged_userfaultfd;
2186}
2187
2188SYSCALL_DEFINE1(userfaultfd, int, flags)
2189{
2190	if (!userfaultfd_syscall_allowed(flags))
2191		return -EPERM;
2192
2193	return new_userfaultfd(flags);
2194}
2195
2196static long userfaultfd_dev_ioctl(struct file *file, unsigned int cmd, unsigned long flags)
2197{
2198	if (cmd != USERFAULTFD_IOC_NEW)
2199		return -EINVAL;
2200
2201	return new_userfaultfd(flags);
2202}
2203
2204static const struct file_operations userfaultfd_dev_fops = {
2205	.unlocked_ioctl = userfaultfd_dev_ioctl,
2206	.compat_ioctl = userfaultfd_dev_ioctl,
2207	.owner = THIS_MODULE,
2208	.llseek = noop_llseek,
2209};
2210
2211static struct miscdevice userfaultfd_misc = {
2212	.minor = MISC_DYNAMIC_MINOR,
2213	.name = "userfaultfd",
2214	.fops = &userfaultfd_dev_fops
2215};
2216
2217static int __init userfaultfd_init(void)
2218{
2219	int ret;
2220
2221	ret = misc_register(&userfaultfd_misc);
2222	if (ret)
2223		return ret;
2224
2225	userfaultfd_ctx_cachep = kmem_cache_create("userfaultfd_ctx_cache",
2226						sizeof(struct userfaultfd_ctx),
2227						0,
2228						SLAB_HWCACHE_ALIGN|SLAB_PANIC,
2229						init_once_userfaultfd_ctx);
2230#ifdef CONFIG_SYSCTL
2231	register_sysctl_init("vm", vm_userfaultfd_table);
2232#endif
2233	return 0;
2234}
2235__initcall(userfaultfd_init);